diff --git a/Project.toml b/Project.toml index ba6aab3..513241e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NearestNeighborModels" uuid = "636a865e-7cf4-491e-846c-de09b730eb36" authors = ["Anthony D. Blaom ", "Sebastian Vollmer ", "Thibaut Lienart ", "Okon Samuel "] -version = "0.2.3" +version = "0.2.2" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" @@ -13,15 +13,17 @@ NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" [compat] Distances = "^0.9, ^0.10" -FillArrays = "^0.9, ^0.10, ^0.11, 0.12, 0.13, 1.0" +FillArrays = "^0.9, ^0.10, ^0.11, 0.12, 0.13, 1" MLJModelInterface = "1.4" NearestNeighbors = "^0.4" -StatsBase = "0.33, 0.34" +OrderedCollections = "1.1" +StatsBase = "^0.33, 0.34" Tables = "^1.2" -julia = "1.6" +julia = "1.3" [extras] MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" diff --git a/src/NearestNeighborModels.jl b/src/NearestNeighborModels.jl index 8fa0549..5331f9d 100644 --- a/src/NearestNeighborModels.jl +++ b/src/NearestNeighborModels.jl @@ -14,6 +14,7 @@ using Distances using FillArrays using LinearAlgebra using Statistics +using OrderedCollections # ============================================================================================== ## EXPORTS @@ -36,8 +37,8 @@ const Vec{T} = AbstractVector{T} const Mat{T} = AbstractMatrix{T} const Arr{T, N} = AbstractArray{T, N} const ColumnTable = Tables.ColumnTable -const DictTable = Tables.DictColumns -const MultiUnivariateFinite = Union{DictTable, ColumnTable} +const DictColumnTable = Tables.DictColumnTable +const MultiUnivariateFinite = Union{DictColumnTable, ColumnTable} # Define constants for easy referencing of packages const MMI = MLJModelInterface diff --git a/src/models.jl b/src/models.jl index f3e8256..71ee2ed 100644 --- a/src/models.jl +++ b/src/models.jl @@ -19,19 +19,21 @@ end function dict_preds(::Val{:columnaccess}, func, target_table, idxsvec, weights) cols = Tables.columns(target_table) colnames = Tables.columnnames(cols) - dict_table = Dict( + dict = OrderedDict{Symbol, AbstractVector}( nm => func(weights, Tables.getcolumn(cols, nm), idxsvec) for nm in colnames ) + dict_table = Tables.DictColumnTable(Tables.Schema(colnames, eltype.(values(dict))), dict) return dict_table end function dict_preds(::Val{:noncolumnaccess}, func, target_table, idxsvec, weights) - cols = Tables.dictcolumntable(target_table) - colnames = Tables.columnnames(cols) - dict_table = Dict( - nm => func(weights, Tables.getcolumn(cols, nm), idxsvec) for nm in colnames - ) - return dict_table + cols = Tables.dictcolumntable(target_table) + colnames = Tables.columnnames(cols) + dict = OrderedDict{Symbol, AbstractVector}( + nm => func(weights, Tables.getcolumn(cols, nm), idxsvec) for nm in colnames + ) + dict_table = Tables.DictColumnTable(Tables.Schema(colnames, eltype.(values(dict))), dict) + return dict_table end function dict_preds(func::F, target_table, idxsvec, weights) where {F<:Function} @@ -71,7 +73,7 @@ function ntuple_preds(func::F, target_table, idxsvec, weights) where {F <: Funct end function univariate_table(::Type{T}, weights, target_table, idxsvec) where {T} - table = if T <: DictTable + table = if T <: DictColumnTable dict_preds(_predict_knnclassifier, target_table, idxsvec, weights) else ntuple_preds(_predict_knnclassifier, target_table, idxsvec, weights) @@ -80,7 +82,7 @@ function univariate_table(::Type{T}, weights, target_table, idxsvec) where {T} end function categorical_table(::Type{T}, weights, target_table, idxsvec) where {T} - table = if T <: DictTable + table = if T <: DictColumnTable dict_preds(_predictmode_knnclassifier, target_table, idxsvec, weights) else ntuple_preds(_predictmode_knnclassifier, target_table, idxsvec, weights) @@ -311,7 +313,7 @@ function MultitargetKNNClassifier(; leafsize::Int = (algorithm == :brutetree) ? 0 : 10, reorder::Bool = algorithm != :brutetree, weights::KNNKernel=Uniform(), - output_type::Type{<:MultiUnivariateFinite} = DictTable + output_type::Type{<:MultiUnivariateFinite} = DictColumnTable ) model = MultitargetKNNClassifier( K, algorithm, metric, leafsize, reorder, weights, output_type @@ -623,7 +625,7 @@ Here: - `X` is any table of input features (eg, a `DataFrame`) whose columns are of scitype `Continuous`; check column scitypes with `schema(X)`. -- y` is the target, which can be any table of responses whose element scitype is either +- `y` is the target, which can be any table of responses whose element scitype is either `<:Finite` (`<:Multiclass` or `<:OrderedFactor` will do); check the columns scitypes with `schema(y)`. Each column of `y` is assumed to belong to a common categorical pool. @@ -637,16 +639,16 @@ Train the machine using `fit!(mach, rows=...)`. $KNNFIELDS -* `output_type::Type{<:MultiUnivariateFinite}=DictTable` : One of - (`ColumnTable`, `DictTable`). The type of table type to use for predictions. +* `output_type::Type{<:MultiUnivariateFinite}=DictColumnTable` : One of + (`ColumnTable`, `DictColumnTable`). The type of table type to use for predictions. Setting to `ColumnTable` might improve performance for narrow tables while setting to - `DictTable` improves performance for wide tables. + `DictColumnTable` improves performance for wide tables. # Operations - `predict(mach, Xnew)`: Return predictions of the target given features `Xnew`, which should have same scitype as `X` above. Predictions are either a `ColumnTable` or - `DictTable` of `UnivariateFiniteVector` columns depending on the value set for the + `DictColumnTable` of `UnivariateFiniteVector` columns depending on the value set for the `output_type` parameter discussed above. The probabilistic predictions are uncalibrated. - `predict_mode(mach, Xnew)`: Return the modes of each column of the table of probabilistic