From d775efb86eb864b56fdb27b2af3e7f01649f3e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment?= Date: Wed, 18 Sep 2024 22:57:09 +0200 Subject: [PATCH 1/4] Add overload function for sklearn to deal with sparse matrices --- stubs/sklearn/preprocessing/_data.pyi | 128 +++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 1 deletion(-) diff --git a/stubs/sklearn/preprocessing/_data.pyi b/stubs/sklearn/preprocessing/_data.pyi index 3b358849..0fdbc3ac 100644 --- a/stubs/sklearn/preprocessing/_data.pyi +++ b/stubs/sklearn/preprocessing/_data.pyi @@ -1,5 +1,5 @@ from numbers import Integral as Integral, Real as Real -from typing import Any, ClassVar, Literal, TypeVar +from typing import Any, ClassVar, Literal, TypeVar, overload from numpy import ndarray from numpy.random import RandomState @@ -142,7 +142,15 @@ class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): y: Series | None | ndarray | list[int] = None, sample_weight: None | ArrayLike = None, ) -> StandardScaler_Self: ... + @overload + def transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ... + @overload + def transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ... def transform(self, X: MatrixLike, copy: None | bool = None) -> ndarray | spmatrix: ... + @overload + def inverse_transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ... + @overload + def inverse_transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ... def inverse_transform(self, X: MatrixLike | ArrayLike, copy: None | bool = None) -> ndarray | spmatrix: ... class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): @@ -157,7 +165,15 @@ class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): def __init__(self, *, copy: bool = True) -> None: ... def fit(self: MaxAbsScaler_Self, X: MatrixLike | ArrayLike, y=None) -> MaxAbsScaler_Self: ... def partial_fit(self: MaxAbsScaler_Self, X: MatrixLike | ArrayLike, y=None) -> MaxAbsScaler_Self: ... + @overload + def transform(self, X: spmatrix) -> spmatrix: ... + @overload + def transform(self, X: ArrayLike) -> ndarray: ... def transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ... + @overload + def inverse_transform(self, X: spmatrix) -> spmatrix: ... + @overload + def inverse_transform(self, X: ArrayLike) -> ndarray: ... def inverse_transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ... def maxabs_scale(X: MatrixLike | ArrayLike, *, axis: Int = 0, copy: bool = True): ... @@ -180,9 +196,39 @@ class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): unit_variance: bool = False, ) -> None: ... def fit(self: RobustScaler_Self, X: MatrixLike | ArrayLike, y: Any = None) -> RobustScaler_Self: ... + @overload + def transform(self, X: spmatrix) -> spmatrix: ... + @overload + def transform(self, X: ArrayLike) -> ndarray: ... def transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ... + @overload + def inverse_transform(self, X: spmatrix) -> spmatrix: ... + @overload + def inverse_transform(self, X: ArrayLike) -> ndarray: ... def inverse_transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ... +@overload +def robust_scale( + X: spmatrix, + *, + axis: Int = 0, + with_centering: bool = True, + with_scaling: bool = True, + quantile_range: tuple[float, float] = ..., + copy: bool = True, + unit_variance: bool = False, +) -> spmatrix: ... +@overload +def robust_scale( + X: ndarray, + *, + axis: Int = 0, + with_centering: bool = True, + with_scaling: bool = True, + quantile_range: tuple[float, float] = ..., + copy: bool = True, + unit_variance: bool = False, +) -> ndarray: ... def robust_scale( X: MatrixLike, *, @@ -193,6 +239,42 @@ def robust_scale( copy: bool = True, unit_variance: bool = False, ) -> ndarray | spmatrix: ... +@overload +def normalize( + X: spmatrix, + norm: Literal["l1", "l2", "max", "l2"] = "l2", + *, + axis: int = 1, + copy: bool = True, + return_norm: Literal["True"] = ..., +) -> tuple[csr_matrix, ndarray]: ... +@overload +def normalize( + X: spmatrix, + norm: Literal["l1", "l2", "max", "l2"] = "l2", + *, + axis: int = 1, + copy: bool = True, + return_norm: Literal["False"] = ..., +) -> csr_matrix: ... +@overload +def normalize( + X: ArrayLike, + norm: Literal["l1", "l2", "max", "l2"] = "l2", + *, + axis: int = 1, + copy: bool = True, + return_norm: Literal["True"] = ..., +) -> tuple[ndarray, ndarray]: ... +@overload +def normalize( + X: ArrayLike, + norm: Literal["l1", "l2", "max", "l2"] = "l2", + *, + axis: int = 1, + copy: bool = True, + return_norm: Literal["False"] = ..., +) -> ndarray: ... def normalize( X: MatrixLike | ArrayLike, norm: Literal["l1", "l2", "max", "l2"] = "l2", @@ -210,6 +292,10 @@ class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): def __init__(self, norm: Literal["l1", "l2", "max", "l2"] = "l2", *, copy: bool = True) -> None: ... def fit(self: Normalizer_Self, X: MatrixLike | ArrayLike, y: Any = None) -> Normalizer_Self: ... + @overload + def transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ... + @overload + def transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ... def transform(self, X: MatrixLike | ArrayLike, copy: None | bool = None) -> ndarray | spmatrix: ... def binarize(X: MatrixLike | ArrayLike, *, threshold: Float = 0.0, copy: bool = True) -> ndarray | spmatrix: ... @@ -222,6 +308,10 @@ class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): def __init__(self, *, threshold: Float = 0.0, copy: bool = True) -> None: ... def fit(self: Binarizer_Self, X: MatrixLike | ArrayLike, y=None) -> Binarizer_Self: ... + @overload + def transform(self, X: spmatrix, copy: None | bool = None) -> spmatrix: ... + @overload + def transform(self, X: ArrayLike, copy: None | bool = None) -> ndarray: ... def transform(self, X: MatrixLike | ArrayLike, copy: None | bool = None) -> ndarray | spmatrix: ... class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): @@ -234,6 +324,10 @@ class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEsti def fit(self: KernelCenterer_Self, K: MatrixLike, y=None) -> KernelCenterer_Self: ... def transform(self, K: MatrixLike, copy: bool = True) -> ndarray: ... +@overload +def add_dummy_feature(X: spmatrix, value: Float = 1.0) -> spmatrix: ... +@overload +def add_dummy_feature(X: ArrayLike, value: Float = 1.0) -> ndarray: ... def add_dummy_feature(X: MatrixLike | ArrayLike, value: Float = 1.0) -> ndarray | spmatrix: ... class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): @@ -260,9 +354,41 @@ class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator) X: MatrixLike | ArrayLike, y: Series | None = None, ) -> QuantileTransformer_Self: ... + @overload + def transform(self, X: spmatrix) -> spmatrix: ... + @overload + def transform(self, X: ArrayLike) -> ndarray: ... def transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ... + @overload + def inverse_transform(self, X: spmatrix) -> spmatrix: ... + @overload + def inverse_transform(self, X: ArrayLike) -> ndarray: ... def inverse_transform(self, X: MatrixLike | ArrayLike) -> ndarray | spmatrix: ... +@overload +def quantile_transform( + X: spmatrix, + *, + axis: Int = 0, + n_quantiles: Int = 1000, + output_distribution: Literal["uniform", "normal", "uniform"] = "uniform", + ignore_implicit_zeros: bool = False, + subsample: Int = ..., + random_state: RandomState | None | Int = None, + copy: bool = True, +) -> spmatrix: ... +@overload +def quantile_transform( + X: ArrayLike, + *, + axis: Int = 0, + n_quantiles: Int = 1000, + output_distribution: Literal["uniform", "normal", "uniform"] = "uniform", + ignore_implicit_zeros: bool = False, + subsample: Int = ..., + random_state: RandomState | None | Int = None, + copy: bool = True, +) -> ndarray: ... def quantile_transform( X: MatrixLike | ArrayLike, *, From 4a44c09ad85ad1a56587c52213bd51e2fb966a38 Mon Sep 17 00:00:00 2001 From: Erik De Bonte Date: Thu, 19 Sep 2024 16:00:43 -0700 Subject: [PATCH 2/4] Fix return_norm on normalize overloads --- stubs/sklearn/preprocessing/_data.pyi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stubs/sklearn/preprocessing/_data.pyi b/stubs/sklearn/preprocessing/_data.pyi index 0fdbc3ac..48b688b4 100644 --- a/stubs/sklearn/preprocessing/_data.pyi +++ b/stubs/sklearn/preprocessing/_data.pyi @@ -246,7 +246,7 @@ def normalize( *, axis: int = 1, copy: bool = True, - return_norm: Literal["True"] = ..., + return_norm: Literal[True], ) -> tuple[csr_matrix, ndarray]: ... @overload def normalize( @@ -255,7 +255,7 @@ def normalize( *, axis: int = 1, copy: bool = True, - return_norm: Literal["False"] = ..., + return_norm: Literal[False] = ..., ) -> csr_matrix: ... @overload def normalize( @@ -264,7 +264,7 @@ def normalize( *, axis: int = 1, copy: bool = True, - return_norm: Literal["True"] = ..., + return_norm: Literal[True], ) -> tuple[ndarray, ndarray]: ... @overload def normalize( @@ -273,7 +273,7 @@ def normalize( *, axis: int = 1, copy: bool = True, - return_norm: Literal["False"] = ..., + return_norm: Literal[False] = ..., ) -> ndarray: ... def normalize( X: MatrixLike | ArrayLike, From 93f655740cb9a3f97120cb8c140171ca5a3ce896 Mon Sep 17 00:00:00 2001 From: Erik De Bonte Date: Thu, 19 Sep 2024 16:01:15 -0700 Subject: [PATCH 3/4] Add tests for sklearn normalize overloads --- tests/requirements.txt | 6 ++++-- tests/sklearn/preprocessing_tests.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/sklearn/preprocessing_tests.py diff --git a/tests/requirements.txt b/tests/requirements.txt index a3df2390..6d71f9aa 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,7 @@ -pyright matplotlib -pytest mypy==0.950 +pyright +pytest +scikit-learn +scipy typing_extensions==4.2.0 diff --git a/tests/sklearn/preprocessing_tests.py b/tests/sklearn/preprocessing_tests.py new file mode 100644 index 00000000..0cd3d1b9 --- /dev/null +++ b/tests/sklearn/preprocessing_tests.py @@ -0,0 +1,32 @@ +# pyright: reportUnknownVariableType=false +# pyright: reportMissingTypeStubs=false + +from typing import Any, assert_type +from numpy import ndarray +from sklearn.preprocessing import normalize + +from scipy.sparse._matrix import spmatrix +from scipy.sparse._csr import csr_matrix + + +# normalize with matrix +matrix: spmatrix = spmatrix() +result = normalize(matrix) +assert_type(result, csr_matrix) + +result = normalize(matrix, return_norm=False) +assert_type(result, csr_matrix) + +result = normalize(matrix, return_norm=True) +assert_type(result, tuple[csr_matrix, ndarray[Any, Any]]) + +# normalize with array +array_like = [1] +result = normalize(array_like) +assert_type(result, ndarray[Any, Any]) + +result = normalize(array_like, return_norm=False) +assert_type(result, ndarray[Any, Any]) + +result = normalize(array_like, return_norm=True) +assert_type(result, tuple[ndarray[Any, Any], ndarray[Any, Any]]) From b68fd529487e7da89ecf5ef31437b14bef34d139 Mon Sep 17 00:00:00 2001 From: Erik De Bonte Date: Thu, 19 Sep 2024 16:05:52 -0700 Subject: [PATCH 4/4] Hygiene --- tests/sklearn/preprocessing_tests.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/sklearn/preprocessing_tests.py b/tests/sklearn/preprocessing_tests.py index 0cd3d1b9..2fbd376b 100644 --- a/tests/sklearn/preprocessing_tests.py +++ b/tests/sklearn/preprocessing_tests.py @@ -2,12 +2,11 @@ # pyright: reportMissingTypeStubs=false from typing import Any, assert_type -from numpy import ndarray -from sklearn.preprocessing import normalize -from scipy.sparse._matrix import spmatrix +from numpy import ndarray from scipy.sparse._csr import csr_matrix - +from scipy.sparse._matrix import spmatrix +from sklearn.preprocessing import normalize # normalize with matrix matrix: spmatrix = spmatrix()