From d775efb86eb864b56fdb27b2af3e7f01649f3e54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment?= <mail@clementpinard.fr>
Date: Wed, 18 Sep 2024 22:57:09 +0200
Subject: [PATCH 1/4] Add sklearn preprocessing overloads to handle sparse
 matrices

---
 stubs/sklearn/preprocessing/_data.pyi | 128 +++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 1 deletion(-)

diff --git a/stubs/sklearn/preprocessing/_data.pyi b/stubs/sklearn/preprocessing/_data.pyi
index 3b358849..0fdbc3ac 100644
--- a/stubs/sklearn/preprocessing/_data.pyi
+++ b/stubs/sklearn/preprocessing/_data.pyi
@@ -1,5 +1,5 @@
 from numbers import Integral as Integral, Real as Real
-from typing import Any, ClassVar, Literal, TypeVar
+from typing import Any, ClassVar, Literal, TypeVar, overload
 
 from numpy import ndarray
 from numpy.random import RandomState
@@ -142,7 +142,15 @@ class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
         y: Series | None | ndarray | list[int] = None,
         sample_weight: None | ArrayLike = None,
     ) -> StandardScaler_Self: ...
+    @overload
+    def transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ...
+    @overload
+    def transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ...
     def transform(self, X: MatrixLike, copy: None | bool = None) -> ndarray | spmatrix: ...
+    @overload
+    def inverse_transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ...
+    @overload
+    def inverse_transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ...
     def inverse_transform(self, X: MatrixLike | ArrayLike, copy: None | bool = None) -> ndarray | spmatrix: ...
 
 class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
@@ -157,7 +165,15 @@ class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     def __init__(self, *, copy: bool = True) -> None: ...
     def fit(self: MaxAbsScaler_Self, X: MatrixLike | ArrayLike, y=None) -> MaxAbsScaler_Self: ...
     def partial_fit(self: MaxAbsScaler_Self, X: MatrixLike | ArrayLike, y=None) -> MaxAbsScaler_Self: ...
+    @overload
+    def transform(self, X: spmatrix) -> spmatrix: ...
+    @overload
+    def transform(self, X: ArrayLike) -> ndarray: ...
     def transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ...
+    @overload
+    def inverse_transform(self, X: spmatrix) -> spmatrix: ...
+    @overload
+    def inverse_transform(self, X: ArrayLike) -> ndarray: ...
     def inverse_transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ...
 
 def maxabs_scale(X: MatrixLike | ArrayLike, *, axis: Int = 0, copy: bool = True): ...
@@ -180,9 +196,39 @@ class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
         unit_variance: bool = False,
     ) -> None: ...
     def fit(self: RobustScaler_Self, X: MatrixLike | ArrayLike, y: Any = None) -> RobustScaler_Self: ...
+    @overload
+    def transform(self, X: spmatrix) -> spmatrix: ...
+    @overload
+    def transform(self, X: ArrayLike) -> ndarray: ...
     def transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ...
+    @overload
+    def inverse_transform(self, X: spmatrix) -> spmatrix: ...
+    @overload
+    def inverse_transform(self, X: ArrayLike) -> ndarray: ...
     def inverse_transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ...
 
+@overload
+def robust_scale(
+    X: spmatrix,
+    *,
+    axis: Int = 0,
+    with_centering: bool = True,
+    with_scaling: bool = True,
+    quantile_range: tuple[float, float] = ...,
+    copy: bool = True,
+    unit_variance: bool = False,
+) -> spmatrix: ...
+@overload
+def robust_scale(
+    X: ArrayLike,
+    *,
+    axis: Int = 0,
+    with_centering: bool = True,
+    with_scaling: bool = True,
+    quantile_range: tuple[float, float] = ...,
+    copy: bool = True,
+    unit_variance: bool = False,
+) -> ndarray: ...
 def robust_scale(
     X: MatrixLike,
     *,
@@ -193,6 +239,42 @@ def robust_scale(
     copy: bool = True,
     unit_variance: bool = False,
 ) -> ndarray | spmatrix: ...
+@overload
+def normalize(
+    X: spmatrix,
+    norm: Literal["l1", "l2", "max", "l2"] = "l2",
+    *,
+    axis: int = 1,
+    copy: bool = True,
+    return_norm: Literal["True"] = ...,
+) -> tuple[csr_matrix, ndarray]: ...
+@overload
+def normalize(
+    X: spmatrix,
+    norm: Literal["l1", "l2", "max", "l2"] = "l2",
+    *,
+    axis: int = 1,
+    copy: bool = True,
+    return_norm: Literal["False"] = ...,
+) -> csr_matrix: ...
+@overload
+def normalize(
+    X: ArrayLike,
+    norm: Literal["l1", "l2", "max", "l2"] = "l2",
+    *,
+    axis: int = 1,
+    copy: bool = True,
+    return_norm: Literal["True"] = ...,
+) -> tuple[ndarray, ndarray]: ...
+@overload
+def normalize(
+    X: ArrayLike,
+    norm: Literal["l1", "l2", "max", "l2"] = "l2",
+    *,
+    axis: int = 1,
+    copy: bool = True,
+    return_norm: Literal["False"] = ...,
+) -> ndarray: ...
 def normalize(
     X: MatrixLike | ArrayLike,
     norm: Literal["l1", "l2", "max", "l2"] = "l2",
@@ -210,6 +292,10 @@ class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 
     def __init__(self, norm: Literal["l1", "l2", "max", "l2"] = "l2", *, copy: bool = True) -> None: ...
     def fit(self: Normalizer_Self, X: MatrixLike | ArrayLike, y: Any = None) -> Normalizer_Self: ...
+    @overload
+    def transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ...
+    @overload
+    def transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ...
     def transform(self, X: MatrixLike | ArrayLike, copy: None | bool = None) -> ndarray | spmatrix: ...
 
 def binarize(X: MatrixLike | ArrayLike, *, threshold: Float = 0.0, copy: bool = True) -> ndarray | spmatrix: ...
@@ -222,6 +308,10 @@ class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 
     def __init__(self, *, threshold: Float = 0.0, copy: bool = True) -> None: ...
     def fit(self: Binarizer_Self, X: MatrixLike | ArrayLike, y=None) -> Binarizer_Self: ...
+    @overload
+    def transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ...
+    @overload
+    def transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ...
     def transform(self, X: MatrixLike | ArrayLike, copy: None | bool = None) -> ndarray | spmatrix: ...
 
 class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
@@ -234,6 +324,10 @@ class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEsti
     def fit(self: KernelCenterer_Self, K: MatrixLike, y=None) -> KernelCenterer_Self: ...
     def transform(self, K: MatrixLike, copy: bool = True) -> ndarray: ...
 
+@overload
+def add_dummy_feature(X: spmatrix, value: Float = 1.0) -> spmatrix: ...
+@overload
+def add_dummy_feature(X: ArrayLike, value: Float = 1.0) -> ndarray: ...
 def add_dummy_feature(X: MatrixLike | ArrayLike, value: Float = 1.0) -> ndarray | spmatrix: ...
 
 class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
@@ -260,9 +354,41 @@ class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator)
         X: MatrixLike | ArrayLike,
         y: Series | None = None,
     ) -> QuantileTransformer_Self: ...
+    @overload
+    def transform(self, X: spmatrix) -> spmatrix: ...
+    @overload
+    def transform(self, X: ArrayLike) -> ndarray: ...
     def transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ...
+    @overload
+    def inverse_transform(self, X: spmatrix) -> spmatrix: ...
+    @overload
+    def inverse_transform(self, X: ArrayLike) -> ndarray: ...
     def inverse_transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ...
 
+@overload
+def quantile_transform(
+    X: spmatrix,
+    *,
+    axis: Int = 0,
+    n_quantiles: Int = 1000,
+    output_distribution: Literal["uniform", "normal", "uniform"] = "uniform",
+    ignore_implicit_zeros: bool = False,
+    subsample: Int = ...,
+    random_state: RandomState | None | Int = None,
+    copy: bool = True,
+) -> spmatrix: ...
+@overload
+def quantile_transform(
+    X: ArrayLike,
+    *,
+    axis: Int = 0,
+    n_quantiles: Int = 1000,
+    output_distribution: Literal["uniform", "normal", "uniform"] = "uniform",
+    ignore_implicit_zeros: bool = False,
+    subsample: Int = ...,
+    random_state: RandomState | None | Int = None,
+    copy: bool = True,
+) -> ndarray: ...
 def quantile_transform(
     X: MatrixLike | ArrayLike,
     *,

From 4a44c09ad85ad1a56587c52213bd51e2fb966a38 Mon Sep 17 00:00:00 2001
From: Erik De Bonte <erikd@microsoft.com>
Date: Thu, 19 Sep 2024 16:00:43 -0700
Subject: [PATCH 2/4] Fix return_norm on normalize overloads

---
 stubs/sklearn/preprocessing/_data.pyi | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/stubs/sklearn/preprocessing/_data.pyi b/stubs/sklearn/preprocessing/_data.pyi
index 0fdbc3ac..48b688b4 100644
--- a/stubs/sklearn/preprocessing/_data.pyi
+++ b/stubs/sklearn/preprocessing/_data.pyi
@@ -246,7 +246,7 @@ def normalize(
     *,
     axis: int = 1,
     copy: bool = True,
-    return_norm: Literal["True"] = ...,
+    return_norm: Literal[True],
 ) -> tuple[csr_matrix, ndarray]: ...
 @overload
 def normalize(
@@ -255,7 +255,7 @@ def normalize(
     *,
     axis: int = 1,
     copy: bool = True,
-    return_norm: Literal["False"] = ...,
+    return_norm: Literal[False] = ...,
 ) -> csr_matrix: ...
 @overload
 def normalize(
@@ -264,7 +264,7 @@ def normalize(
     *,
     axis: int = 1,
     copy: bool = True,
-    return_norm: Literal["True"] = ...,
+    return_norm: Literal[True],
 ) -> tuple[ndarray, ndarray]: ...
 @overload
 def normalize(
@@ -273,7 +273,7 @@ def normalize(
     *,
     axis: int = 1,
     copy: bool = True,
-    return_norm: Literal["False"] = ...,
+    return_norm: Literal[False] = ...,
 ) -> ndarray: ...
 def normalize(
     X: MatrixLike | ArrayLike,

From 93f655740cb9a3f97120cb8c140171ca5a3ce896 Mon Sep 17 00:00:00 2001
From: Erik De Bonte <erikd@microsoft.com>
Date: Thu, 19 Sep 2024 16:01:15 -0700
Subject: [PATCH 3/4] Add tests for sklearn normalize overloads

---
 tests/requirements.txt               |  6 ++++--
 tests/sklearn/preprocessing_tests.py | 32 ++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)
 create mode 100644 tests/sklearn/preprocessing_tests.py

diff --git a/tests/requirements.txt b/tests/requirements.txt
index a3df2390..6d71f9aa 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,5 +1,7 @@
-pyright
 matplotlib
-pytest
 mypy==0.950
+pyright
+pytest
+scikit-learn
+scipy
 typing_extensions==4.2.0
diff --git a/tests/sklearn/preprocessing_tests.py b/tests/sklearn/preprocessing_tests.py
new file mode 100644
index 00000000..0cd3d1b9
--- /dev/null
+++ b/tests/sklearn/preprocessing_tests.py
@@ -0,0 +1,32 @@
+# pyright: reportUnknownVariableType=false
+# pyright: reportMissingTypeStubs=false
+
+from typing import Any, assert_type
+from numpy import ndarray
+from sklearn.preprocessing import normalize
+
+from scipy.sparse._matrix import spmatrix
+from scipy.sparse._csr import csr_matrix
+
+
+# normalize with matrix
+matrix: spmatrix = spmatrix()
+result = normalize(matrix)
+assert_type(result, csr_matrix)
+
+result = normalize(matrix, return_norm=False)
+assert_type(result, csr_matrix)
+
+result = normalize(matrix, return_norm=True)
+assert_type(result, tuple[csr_matrix, ndarray[Any, Any]])
+
+# normalize with array
+array_like = [1]
+result = normalize(array_like)
+assert_type(result, ndarray[Any, Any])
+
+result = normalize(array_like, return_norm=False)
+assert_type(result, ndarray[Any, Any])
+
+result = normalize(array_like, return_norm=True)
+assert_type(result, tuple[ndarray[Any, Any], ndarray[Any, Any]])

From b68fd529487e7da89ecf5ef31437b14bef34d139 Mon Sep 17 00:00:00 2001
From: Erik De Bonte <erikd@microsoft.com>
Date: Thu, 19 Sep 2024 16:05:52 -0700
Subject: [PATCH 4/4] Hygiene: sort imports in sklearn preprocessing tests

---
 tests/sklearn/preprocessing_tests.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/sklearn/preprocessing_tests.py b/tests/sklearn/preprocessing_tests.py
index 0cd3d1b9..2fbd376b 100644
--- a/tests/sklearn/preprocessing_tests.py
+++ b/tests/sklearn/preprocessing_tests.py
@@ -2,12 +2,11 @@
 # pyright: reportMissingTypeStubs=false
 
 from typing import Any, assert_type
-from numpy import ndarray
-from sklearn.preprocessing import normalize
 
-from scipy.sparse._matrix import spmatrix
+from numpy import ndarray
 from scipy.sparse._csr import csr_matrix
-
+from scipy.sparse._matrix import spmatrix
+from sklearn.preprocessing import normalize
 
 # normalize with matrix
 matrix: spmatrix = spmatrix()