Merge pull request #54 from JuliaAI/dev
For a 0.3.0 release
ablaom authored Feb 28, 2023
2 parents 16c7814 + 9d9f4dc commit 075403e
Showing 9 changed files with 158 additions and 56 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
@@ -2,8 +2,6 @@ name: CI

env:
PYTHON: Conda
# remove next line (and others marked below) when #42 properly resolved
LD_LIBRARY_PATH: /home/runner/.julia/conda/3/lib

on:
pull_request:
@@ -46,12 +44,14 @@ jobs:
${{ runner.os }}-test-
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@v1
# remove next four lines (and one other marked above) when #42 properly resolved
- name: "Install Conda"
run: julia -e 'using Pkg; Pkg.add("Conda");'
- name: "Install Scikit-learn"
run: julia -e 'using Conda; Conda.add("scikit-learn");'
# The following is needed for Julia <= 1.8.3 on Linux OS
# due to the old version of libstdc++ bundled with Julia
- name: "Export LD_LIBRARY_PATH environment variable"
if: ${{matrix.version == '1.6'}}
run: echo "LD_LIBRARY_PATH=/home/runner/.julia/conda/3/x86_64/lib" >> $GITHUB_ENV
- uses: julia-actions/julia-runtest@v1
env:
LD_LIBRARY_PATH: /home/runner/.julia/conda/3/x86_64/lib
- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v1
with:
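The removed steps had installed python's scikit-learn through Conda.jl before the tests ran. A rough local equivalent of those steps, assuming Conda.jl's standard API (not part of this commit):

    using Pkg
    Pkg.add("Conda")            # Julia wrapper around a private conda installation
    using Conda
    Conda.add("scikit-learn")   # installs python's scikit-learn into ~/.julia/conda/3

The LD_LIBRARY_PATH export that replaces them points Julia 1.6 jobs at Conda's newer libstdc++, since the copy bundled with older Julia builds is too old for recent conda packages.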
7 changes: 4 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJScikitLearnInterface"
uuid = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
authors = ["Thibaut Lienart, Anthony Blaom"]
version = "0.2.0"
version = "0.3.0"

[deps]
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
@@ -11,13 +11,14 @@ ScikitLearn = "3646fa90-6ef7-5e7e-9f22-8aca16db6324"
[compat]
MLJModelInterface = "1.4"
PyCall = "1"
ScikitLearn = "0.5,0.6"
ScikitLearn = "0.7"
julia = "1.6"

[extras]
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJTestInterface = "72560011-54dd-4dc2-94f3-c5de45b75ecd"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["StableRNGs", "Test", "MLJBase"]
test = ["StableRNGs", "MLJTestInterface", "Test", "MLJBase"]
3 changes: 2 additions & 1 deletion src/models/clustering.jl
@@ -186,7 +186,8 @@ const KMeans_ = skcl(:KMeans)
verbose::Int = 0::(_ ≥ 0)
random_state::Any = nothing
copy_x::Bool = true
algorithm::String = "auto"::(_ in ("auto", "full", "elkane"))
## TODO: Remove the "auto" and "full" options when python sklearn releases v1.3
algorithm::String = "lloyd"::(_ in ("auto", "full", "elkane", "lloyd"))
# long
init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
end
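Since "lloyd" is what the deprecated "auto"/"full" options resolve to in recent sklearn, existing code that never set `algorithm` behaves the same. A usage sketch, assuming the MLJ registry name and an `n_clusters` field as in sklearn:

    using MLJ
    KMeans = @load KMeans pkg=MLJScikitLearnInterface
    X, _ = make_blobs(100, 2)                         # synthetic clustering data
    mach = machine(KMeans(n_clusters=3), X) |> fit!   # algorithm = "lloyd" by default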
44 changes: 25 additions & 19 deletions src/models/ensemble.jl
@@ -1,6 +1,6 @@
const AdaBoostRegressor_ = sken(:AdaBoostRegressor)
@sk_reg mutable struct AdaBoostRegressor <: MMI.Deterministic
base_estimator::Any = nothing
estimator::Any = nothing
n_estimators::Int = 50::(_ > 0)
learning_rate::Float64 = 1.0::(_ > 0)
loss::String = "linear"::(_ in ("linear","square","exponential"))
@@ -17,7 +17,7 @@ add_human_name_trait(AdaBoostRegressor, "AdaBoost ensemble regression")
# ----------------------------------------------------------------------------
const AdaBoostClassifier_ = sken(:AdaBoostClassifier)
@sk_clf mutable struct AdaBoostClassifier <: MMI.Probabilistic
base_estimator::Any = nothing
estimator::Any = nothing
n_estimators::Int = 50::(_ > 0)
learning_rate::Float64 = 1.0::(_ > 0)
algorithm::String = "SAMME.R"::(_ in ("SAMME", "SAMME.R"))
@@ -39,7 +39,7 @@ meta(AdaBoostClassifier,
# ============================================================================
const BaggingRegressor_ = sken(:BaggingRegressor)
@sk_reg mutable struct BaggingRegressor <: MMI.Deterministic
base_estimator::Any = nothing
estimator::Any = nothing
n_estimators::Int = 10::(_>0)
max_samples::Union{Int,Float64} = 1.0::(_>0)
max_features::Union{Int,Float64} = 1.0::(_>0)
@@ -63,7 +63,7 @@ add_human_name_trait(BaggingRegressor, "bagging ensemble regressor")
# ----------------------------------------------------------------------------
const BaggingClassifier_ = sken(:BaggingClassifier)
@sk_clf mutable struct BaggingClassifier <: MMI.Probabilistic
base_estimator::Any = nothing
estimator::Any = nothing
n_estimators::Int = 10::(_>0)
max_samples::Union{Int,Float64} = 1.0::(_>0)
max_features::Union{Int,Float64} = 1.0::(_>0)
@@ -95,11 +95,11 @@ meta(BaggingClassifier,
# ============================================================================
const GradientBoostingRegressor_ = sken(:GradientBoostingRegressor)
@sk_reg mutable struct GradientBoostingRegressor <: MMI.Deterministic
loss::String = "ls"::(_ in ("ls","lad","huber","quantile"))
loss::String = "squared_error"::(_ in ("squared_error","absolute_error","huber","quantile"))
learning_rate::Float64 = 0.1::(_>0)
n_estimators::Int = 100::(_>0)
subsample::Float64 = 1.0::(_>0)
criterion::String = "friedman_mse"::(_ in ("mse","mae","friedman_mse"))
criterion::String = "friedman_mse"::(_ in ("squared_error","friedman_mse"))
min_samples_split::Union{Int,Float64} = 2::(_>0)
min_samples_leaf::Union{Int,Float64} = 1::(_>0)
min_weight_fraction_leaf::Float64 = 0.0::(_≥0)
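The regressor's loss and criterion names now follow the sklearn 1.x spellings. A construction sketch with the new defaults:

    using MLJ
    GBR = @load GradientBoostingRegressor pkg=MLJScikitLearnInterface
    model = GBR(loss="squared_error", criterion="friedman_mse")   # was "ls"/"mse" in 0.2.0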
@@ -130,7 +130,8 @@ add_human_name_trait(GradientBoostingRegressor, "gradient boosting ensemble regressor")
# ----------------------------------------------------------------------------
const GradientBoostingClassifier_ = sken(:GradientBoostingClassifier)
@sk_clf mutable struct GradientBoostingClassifier <: MMI.Probabilistic
loss::String = "deviance"::(_ in ("deviance","exponential"))
# TODO: Remove "deviance" when python sklearn releases v1.3.0
loss::String = "log_loss"::(_ in ("deviance", "log_loss","exponential"))
learning_rate::Float64 = 0.1::(_>0)
n_estimators::Int = 100::(_>0)
subsample::Float64 = 1.0::(_>0)
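The classifier swaps the deprecated "deviance" for "log_loss" in the same way, keeping "deviance" in the validation list until sklearn 1.3 drops it. A sketch:

    using MLJ
    GBC = @load GradientBoostingClassifier pkg=MLJScikitLearnInterface
    model = GBC()   # loss = "log_loss"; the old "deviance" still validates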
@@ -155,6 +156,7 @@ MMI.fitted_params(m::GradientBoostingClassifier, (f, _, _)) = (
n_estimators = f.n_estimators_,
feature_importances = f.feature_importances_,
train_score = f.train_score_,
## TODO: Remove the `loss_` attribute when python sklearn releases v1.3
loss = f.loss_,
init = f.init_,
estimators = f.estimators_,
@@ -170,12 +172,13 @@ meta(GradientBoostingClassifier,
const RandomForestRegressor_ = sken(:RandomForestRegressor)
@sk_reg mutable struct RandomForestRegressor <: MMI.Deterministic
n_estimators::Int = 100::(_ > 0)
criterion::String = "mse"::(_ in ("mae", "mse"))
criterion::String = "squared_error"::(_ in ("squared_error","absolute_error", "friedman_mse", "poisson"))
max_depth::Option{Int} = nothing::(_ === nothing || _ > 0)
min_samples_split::Union{Int,Float64} = 2::(_ > 0)
min_samples_leaf::Union{Int,Float64} = 1::(_ > 0)
min_weight_fraction_leaf::Float64 = 0.0::(_ ≥ 0)
max_features::Union{Int,Float64,String,Nothing} = "auto"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
## TODO: Remove the "auto" option in python sklearn v1.3
max_features::Union{Int,Float64,String,Nothing} = "sqrt"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
max_leaf_nodes::Option{Int} = nothing::(_ === nothing || _ > 0)
min_impurity_decrease::Float64 = 0.0::(_ ≥ 0)
bootstrap::Bool = true
@@ -191,7 +194,7 @@ end
MMI.fitted_params(model::RandomForestRegressor, (f, _, _)) = (
estimators = f.estimators_,
feature_importances = f.feature_importances_,
n_features = f.n_features_,
n_features = f.n_features_in_,
n_outputs = f.n_outputs_,
oob_score = model.oob_score ? f.oob_score_ : nothing,
oob_prediction = model.oob_score ? f.oob_prediction_ : nothing
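Downstream code keeps reading `n_features` from `fitted_params`; only the underlying sklearn attribute changed to `n_features_in_`. A sketch using MLJ's standard accessors on synthetic data:

    using MLJ
    Forest = @load RandomForestRegressor pkg=MLJScikitLearnInterface
    X, y = make_regression(100, 4)           # 100 rows, 4 features
    mach = machine(Forest(), X, y) |> fit!
    fitted_params(mach).n_features           # == 4, via sklearn's n_features_in_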
@@ -206,12 +209,13 @@ meta(RandomForestRegressor,
const RandomForestClassifier_ = sken(:RandomForestClassifier)
@sk_clf mutable struct RandomForestClassifier <: MMI.Probabilistic
n_estimators::Int = 100::(_ > 0)
criterion::String = "gini"::(_ in ("gini","entropy"))
criterion::String = "gini"::(_ in ("gini","entropy", "log_loss"))
max_depth::Option{Int} = nothing::(_ === nothing || _ > 0)
min_samples_split::Union{Int,Float64} = 2::(_ > 0)
min_samples_leaf::Union{Int,Float64} = 1::(_ > 0)
min_weight_fraction_leaf::Float64 = 0.0::(_ ≥ 0)
max_features::Union{Int,Float64,String,Nothing} = "auto"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
## TODO: Remove the "auto" option in python sklearn v1.3
max_features::Union{Int,Float64,String,Nothing} = "sqrt"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
max_leaf_nodes::Option{Int} = nothing::(_ === nothing || _ > 0)
min_impurity_decrease::Float64 = 0.0::(_ ≥ 0)
bootstrap::Bool = true
@@ -229,7 +233,7 @@ MMI.fitted_params(m::RandomForestClassifier, (f, _, _)) = (
estimators = f.estimators_,
classes = f.classes_,
n_classes = f.n_classes_,
n_features = f.n_features_,
n_features = f.n_features_in_,
n_outputs = f.n_outputs_,
feature_importances = f.feature_importances_,
oob_score = m.oob_score ? f.oob_score_ : nothing,
@@ -250,12 +254,13 @@ MMI.target_scitype(::ENSEMBLE_REG) = AbstractVector{Continuous}
const ExtraTreesRegressor_ = sken(:ExtraTreesRegressor)
@sk_reg mutable struct ExtraTreesRegressor <: MMI.Deterministic
n_estimators::Int = 100::(_>0)
criterion::String = "mse"::(_ in ("mae", "mse"))
criterion::String = "squared_error"::(_ in ("squared_error","absolute_error", "friedman_mse", "poisson"))
max_depth::Option{Int} = nothing::(_ === nothing || _ > 0)
min_samples_split::Union{Int,Float64} = 2::(_ > 0)
min_samples_leaf::Union{Int,Float64} = 1::(_ > 0)
min_weight_fraction_leaf::Float64 = 0.0::(_ ≥ 0)
max_features::Union{Int,Float64,String,Nothing} = "auto"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
## TODO: Remove the "auto" option in python sklearn v1.3
max_features::Union{Int,Float64,String,Nothing} = "sqrt"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
max_leaf_nodes::Option{Int} = nothing::(_ === nothing || _ > 0)
min_impurity_decrease::Float64 = 0.0::(_ ≥ 0)
bootstrap::Bool = true
@@ -268,7 +273,7 @@ end
MMI.fitted_params(m::ExtraTreesRegressor, (f, _, _)) = (
estimators = f.estimators_,
feature_importances = f.feature_importances_,
n_features = f.n_features_,
n_features = f.n_features_in_,
n_outputs = f.n_outputs_,
oob_score = m.oob_score ? f.oob_score_ : nothing,
oob_prediction = m.oob_score ? f.oob_prediction_ : nothing,
@@ -293,12 +298,13 @@ ExtraTreesRegressor
const ExtraTreesClassifier_ = sken(:ExtraTreesClassifier)
@sk_clf mutable struct ExtraTreesClassifier <: MMI.Probabilistic
n_estimators::Int = 100::(_>0)
criterion::String = "gini"::(_ in ("gini", "entropy"))
criterion::String = "gini"::(_ in ("gini", "entropy", "log_loss"))
max_depth::Option{Int} = nothing::(_ === nothing || _ > 0)
min_samples_split::Union{Int,Float64} = 2::(_ > 0)
min_samples_leaf::Union{Int,Float64} = 1::(_ > 0)
min_weight_fraction_leaf::Float64 = 0.0::(_ ≥ 0)
max_features::Union{Int,Float64,String,Nothing} = "auto"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
## TODO: Remove the "auto" option in python sklearn v1.3
max_features::Union{Int,Float64,String,Nothing} = "sqrt"::(_ === nothing || (isa(_, String) && (_ in ("auto","sqrt","log2"))) || _ > 0)
max_leaf_nodes::Option{Int} = nothing::(_ === nothing || _ > 0)
min_impurity_decrease::Float64 = 0.0::(_ ≥ 0)
bootstrap::Bool = true
@@ -314,7 +320,7 @@ MMI.fitted_params(m::ExtraTreesClassifier, (f, _, _)) = (
classes = f.classes_,
n_classes = f.n_classes_,
feature_importances = f.feature_importances_,
n_features = f.n_features_,
n_features = f.n_features_in_,
n_outputs = f.n_outputs_,
oob_score = m.oob_score ? f.oob_score_ : nothing,
oob_decision_function = m.oob_score ? f.oob_decision_function_ : nothing,
8 changes: 4 additions & 4 deletions src/models/linear-classifiers.jl
@@ -131,7 +131,6 @@ const RidgeClassifier_ = sklm(:RidgeClassifier)
@sk_clf mutable struct RidgeClassifier <: MMI.Deterministic
alpha::Float64 = 1.0
fit_intercept::Bool = true
normalize::Bool = false
copy_X::Bool = true
max_iter::Option{Int} = nothing::(_ === nothing || _ > 0)
tol::Float64 = 1e-3::(arg>0)
@@ -155,7 +154,6 @@ const RidgeCVClassifier_ = sklm(:RidgeClassifierCV)
@sk_clf mutable struct RidgeCVClassifier <: MMI.Deterministic
alphas::AbstractArray{Float64} = [0.1,1.0,10.0]::(all(0 .≤ _))
fit_intercept::Bool = true
normalize::Bool = false
scoring::Any = nothing
cv::Int = 5
class_weight::Any = nothing
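With the upstream `normalize` keyword gone, scaling is the caller's job; a hedged sketch using MLJ's built-in Standardizer (note sklearn's `normalize` divided by the l2-norm, so results are not identical):

    using MLJ
    Ridge = @load RidgeClassifier pkg=MLJScikitLearnInterface
    pipe = Standardizer() |> Ridge(alpha=1.0)   # standardize features, then fit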
@@ -175,7 +173,8 @@ meta(RidgeCVClassifier,
# ============================================================================
const SGDClassifier_ = sklm(:SGDClassifier)
@sk_clf mutable struct SGDClassifier <: MMI.Deterministic
loss::String = "hinge"::(_ in ("hinge", "log", "modified_huber", "squared_hinge", "perceptron", "squared_loss", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"))
## TODO: remove the `log` option when python releases sklearn v1.3.
loss::String = "hinge"::(_ in ("hinge", "log_loss", "log", "modified_huber", "squared_hinge", "perceptron", "squared_error", "huber", "epsilon_insensitive", "squared_epsilon_insensitive"))
penalty::String = "l2"::(_ in ("l1", "l2", "elasticnet", "none"))
alpha::Float64 = 1e-4::(_ > 0)
l1_ratio::Float64 = 0.15::(0 ≤ _ ≤ 1)
Expand All @@ -199,7 +198,8 @@ const SGDClassifier_ = sklm(:SGDClassifier)
end
const ProbabilisticSGDClassifier_ = sklm(:SGDClassifier)
@sk_clf mutable struct ProbabilisticSGDClassifier <: MMI.Probabilistic
loss::String = "log"::(_ in ("log", "modified_huber")) # only those -> predict proba
## TODO: remove the `log` option when python releases sklearn v1.3.
loss::String = "log_loss"::(_ in ("log_loss", "log", "modified_huber")) # only those -> predict proba
penalty::String = "l2"::(_ in ("l1", "l2", "elasticnet", "none"))
alpha::Float64 = 1e-4::(_ > 0)
l1_ratio::Float64 = 0.15::(0 ≤ _ ≤ 1)
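Until the "log" alias is dropped, both spellings validate, but new code should prefer "log_loss". A sketch; only the log-loss and modified-Huber losses yield predicted probabilities, hence the Probabilistic subtype:

    using MLJ
    SGD = @load ProbabilisticSGDClassifier pkg=MLJScikitLearnInterface
    model = SGD(loss="log_loss")   # "log" still accepted until sklearn 1.3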
4 changes: 0 additions & 4 deletions src/models/linear-regressors-multi.jl
@@ -2,7 +2,6 @@ const MultiTaskLassoRegressor_ = sklm(:MultiTaskLasso)
@sk_reg mutable struct MultiTaskLassoRegressor <: MMI.Deterministic
alpha::Float64 = 1.0::(_ ≥ 0)
fit_intercept::Bool = true
normalize::Bool = false
max_iter::Int = 1_000::(_ > 0)
tol::Float64 = 1e-4::(_ > 0)
copy_X::Bool = true
@@ -22,7 +21,6 @@ const MultiTaskLassoCVRegressor_ = sklm(:MultiTaskLassoCV)
n_alphas::Int = 100::(_ > 0)
alphas::Any = nothing::(_ === nothing || all(0 .≤ _ .≤ 1))
fit_intercept::Bool = true
normalize::Bool = false
max_iter::Int = 300::(_ > 0)
tol::Float64 = 1e-4::(_ > 0)
copy_X::Bool = true
@@ -47,7 +45,6 @@ const MultiTaskElasticNetRegressor_ = sklm(:MultiTaskElasticNet)
alpha::Float64 = 1.0::(_ ≥ 0)
l1_ratio::Union{Float64, Vector{Float64}} = 0.5::(0 ≤ _ ≤ 1)
fit_intercept::Bool = true
normalize::Bool = true
copy_X::Bool = true
max_iter::Int = 1_000::(_ > 0)
tol::Float64 = 1e-4::(_ > 0)
@@ -69,7 +66,6 @@ const MultiTaskElasticNetCVRegressor_ = sklm(:MultiTaskElasticNetCV)
n_alphas::Int = 100::(_ > 0)
alphas::Any = nothing::(_ === nothing || all(0 .≤ _ .≤ 1))
fit_intercept::Bool = true
normalize::Bool = false
max_iter::Int = 1_000::(_ > 0)
tol::Float64 = 1e-4::(_ > 0)
cv::Any = 5
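The multi-task regressors lose their `normalize` field for the same upstream reason; the same Standardizer-pipeline pattern applies. A sketch:

    using MLJ
    MTLasso = @load MultiTaskLassoRegressor pkg=MLJScikitLearnInterface
    pipe = Standardizer() |> MTLasso()   # explicit scaling replaces normalize=true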