Skip to content

Commit 3e905a6

Browse files
authored
Add skipmissing argument to levels (#46)
Currently if one wants to get levels in their appropriate order, but add `missing` if present, the only solution is to do something like `union(levels(x), unique(x))`, which is inefficient. Support `skipmissing=false` to allow doing this in a single pass over the data. Use `@inline` to ensure that the return type can be inferred when the value of `skipmissing` is known statically. Also fix a type instability which existed for ranges. Use a complex strategy to ensure that the new argument is supported even when a custom type defined `levels` without supporting it.
1 parent 1b6c589 commit 3e905a6

File tree

2 files changed

+130
-23
lines changed

2 files changed

+130
-23
lines changed

src/DataAPI.jl

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -105,29 +105,67 @@ definition.
105105
"""
106106
function describe end
107107

108+
# Sentinel type needed to make `levels` inferrable
109+
struct _Default end
110+
108111
"""
109-
levels(x)
112+
levels(x; skipmissing=true)
113+
114+
Return a vector of unique values which occur or could occur in collection `x`.
115+
`missing` values are skipped unless `skipmissing=false` is passed.
110116
111-
Return a vector of unique values which occur or could occur in collection `x`,
112-
omitting `missing` even if present. Values are returned in the preferred order
113-
for the collection, with the result of [`sort`](@ref) as a default.
117+
Values are returned in the preferred order for the collection,
118+
with the result of [`sort`](@ref) as a default.
114119
If the collection is not sortable then the order of levels is unspecified.
115120
116121
Contrary to [`unique`](@ref), this function may return values which do not
117122
actually occur in the data, and does not preserve their order of appearance in `x`.
118123
"""
119-
function levels(x)
120-
T = Base.nonmissingtype(eltype(x))
121-
u = unique(x)
122-
# unique returns its input with copying for ranges
123-
# (and possibly for other types guaranteed to hold unique values)
124-
nmu = (u === x || Base.mightalias(u, x)) ? filter(!ismissing, u) : filter!(!ismissing, u)
125-
levs = convert(AbstractArray{T}, nmu)
126-
try
127-
sort!(levs)
128-
catch
124+
@inline levels(x; skipmissing::Union{Bool, _Default}=_Default()) =
125+
skipmissing isa _Default || skipmissing ?
126+
_levels_skipmissing(x) : _levels_missing(x)
127+
128+
# The `which` check is here for backward compatibility:
129+
# if a type implements a custom `levels` method but does not support
130+
# keyword arguments, `levels(x, skipmissing=true/false)` will dispatch
131+
# to the fallback methods here, and we take care of calling that custom method
132+
function _levels_skipmissing(x)
133+
if which(DataAPI.levels, Tuple{typeof(x)}) === which(DataAPI.levels, Tuple{Any})
134+
T = Base.nonmissingtype(eltype(x))
135+
u = unique(x)
136+
# unique returns its input with copying for ranges
137+
# (and possibly for other types guaranteed to hold unique values)
138+
nmu = (u isa AbstractRange || u === x || Base.mightalias(u, x)) ?
139+
filter(!ismissing, u) : filter!(!ismissing, u)
140+
levs = convert(AbstractArray{T}, nmu)
141+
try
142+
sort!(levs)
143+
catch
144+
end
145+
return levs
146+
else
147+
return levels(x)
148+
end
149+
end
150+
151+
function _levels_missing(x)
152+
if which(DataAPI.levels, Tuple{typeof(x)}) === which(DataAPI.levels, Tuple{Any})
153+
u = convert(AbstractArray{eltype(x)}, unique(x))
154+
# unique returns its input with copying for ranges
155+
# (and possibly for other types guaranteed to hold unique values)
156+
levs = (x isa AbstractRange || u === x || Base.mightalias(u, x)) ?
157+
Base.copymutable(u) : u
158+
try
159+
sort!(levs)
160+
catch
161+
end
162+
return levs
163+
# This is a suboptimal fallback since it does a second pass over the data
164+
elseif any(ismissing, x)
165+
return [levels(x); missing]
166+
else
167+
return convert(AbstractArray{eltype(x)}, levels(x))
129168
end
130-
levs
131169
end
132170

133171
"""

test/runtests.jl

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
using Test, DataAPI
22

3+
const = isequal
4+
5+
# For `levels` tests
6+
struct TestArray{T} <: AbstractVector{T}
7+
x::Vector{T}
8+
end
9+
Base.size(x::TestArray) = size(x.x)
10+
Base.getindex(x::TestArray, i) = x.x[i]
11+
DataAPI.levels(x::TestArray) = reverse(DataAPI.levels(x.x))
12+
313
@testset "DataAPI" begin
414

515
@testset "defaultarray" begin
@@ -29,15 +39,15 @@ end
2939

3040
@testset "levels" begin
3141

32-
@test DataAPI.levels(1:1) ==
33-
DataAPI.levels([1]) ==
34-
DataAPI.levels([1, missing]) ==
35-
DataAPI.levels([missing, 1]) ==
42+
@test @inferred(DataAPI.levels(1:1)) ==
43+
@inferred(DataAPI.levels([1])) ==
44+
@inferred(DataAPI.levels([1, missing])) ==
45+
@inferred(DataAPI.levels([missing, 1])) ==
3646
[1]
37-
@test DataAPI.levels(2:-1:1) ==
38-
DataAPI.levels([2, 1]) ==
39-
DataAPI.levels(Any[2, 1]) ==
40-
DataAPI.levels([2, missing, 1]) ==
47+
@test @inferred(DataAPI.levels(2:-1:1)) ==
48+
@inferred(DataAPI.levels([2, 1])) ==
49+
@inferred(DataAPI.levels(Any[2, 1])) ==
50+
@inferred(DataAPI.levels([2, missing, 1])) ==
4151
[1, 2]
4252
@test DataAPI.levels([missing, "a", "c", missing, "b"]) == ["a", "b", "c"]
4353
@test DataAPI.levels([Complex(0, 1), Complex(1, 0), missing]) ==
@@ -55,6 +65,65 @@ end
5565
@test isempty(DataAPI.levels([missing]))
5666
@test isempty(DataAPI.levels([]))
5767

68+
levels_missing(x) = DataAPI.levels(x, skipmissing=false)
69+
@test @inferred(levels_missing(1:1))
70+
@inferred(levels_missing([1]))
71+
[1]
72+
if VERSION >= v"1.6.0"
73+
@test @inferred(levels_missing([1, missing]))
74+
@inferred(levels_missing([missing, 1]))
75+
[1, missing]
76+
else
77+
@test levels_missing([1, missing])
78+
levels_missing([missing, 1])
79+
[1, missing]
80+
end
81+
@test @inferred(levels_missing(2:-1:1))
82+
@inferred(levels_missing([2, 1]))
83+
[1, 2]
84+
@test @inferred(levels_missing(Any[2, 1]))
85+
[1, 2]
86+
@test DataAPI.levels([2, missing, 1], skipmissing=false)
87+
[1, 2, missing]
88+
@test DataAPI.levels([missing, "a", "c", missing, "b"], skipmissing=false)
89+
["a", "b", "c", missing]
90+
@test DataAPI.levels([Complex(0, 1), Complex(1, 0), missing], skipmissing=false)
91+
[Complex(0, 1), Complex(1, 0), missing]
92+
@test typeof(DataAPI.levels([1], skipmissing=false)) === Vector{Int}
93+
@test typeof(DataAPI.levels([1, missing], skipmissing=false)) ===
94+
Vector{Union{Int, Missing}}
95+
@test typeof(DataAPI.levels(["a"], skipmissing=false)) === Vector{String}
96+
@test typeof(DataAPI.levels(["a", missing], skipmissing=false)) ===
97+
Vector{Union{String, Missing}}
98+
@test typeof(DataAPI.levels(Real[1], skipmissing=false)) === Vector{Real}
99+
@test typeof(DataAPI.levels(Union{Real,Missing}[1, missing], skipmissing=false)) ===
100+
Vector{Union{Real, Missing}}
101+
@test typeof(DataAPI.levels(trues(1), skipmissing=false)) === Vector{Bool}
102+
@test DataAPI.levels([missing], skipmissing=false) [missing]
103+
@test DataAPI.levels([missing], skipmissing=false) isa Vector{Missing}
104+
@test typeof(DataAPI.levels(Union{Int,Missing}[missing], skipmissing=false)) ===
105+
Vector{Union{Int,Missing}}
106+
@test isempty(DataAPI.levels([], skipmissing=false))
107+
@test typeof(DataAPI.levels(Int[], skipmissing=false)) === Vector{Int}
108+
109+
# Backward compatibility test:
110+
# check that an array type which implements a `levels` method
111+
# which does not accept keyword arguments works thanks to fallbacks
112+
@test DataAPI.levels(TestArray([1, 2])) ==
113+
DataAPI.levels(TestArray([1, 2]), skipmissing=true) ==
114+
DataAPI.levels(TestArray([1, 2]), skipmissing=false) == [2, 1]
115+
@test DataAPI.levels(TestArray([1, 2])) isa Vector{Int}
116+
@test DataAPI.levels(TestArray([1, 2]), skipmissing=true) isa Vector{Int}
117+
@test DataAPI.levels(TestArray([1, 2]), skipmissing=false) isa Vector{Int}
118+
@test DataAPI.levels(TestArray([missing, 1, 2])) ==
119+
DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=true) == [2, 1]
120+
@test DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=false)
121+
[2, 1, missing]
122+
@test DataAPI.levels(TestArray([missing, 1, 2])) isa Vector{Int}
123+
@test DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=true) isa
124+
Vector{Int}
125+
@test DataAPI.levels(TestArray([missing, 1, 2]), skipmissing=false) isa
126+
Vector{Union{Int, Missing}}
58127
end
59128

60129
@testset "Between" begin

0 commit comments

Comments
 (0)