Merge branch 'main' of https://github.com/virchan/scikit-learn

virchan · Feb 28, 2024 · a188d7e · a188d7e
2 parents 1fb3a20 + 9dee432
commit a188d7e
Show file tree

Hide file tree

Showing 8 changed files with 755 additions and 669 deletions.
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
diff --git a/sklearn/cluster/_hdbscan/hdbscan.py b/sklearn/cluster/_hdbscan/hdbscan.py
@@ -594,6 +594,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
     OPTICS : Ordering Points To Identify the Clustering Structure.
     Birch : Memory-efficient, online-learning algorithm.
 
+    Notes
+    -----
+    The `min_samples` parameter includes the point itself, whereas the implementation in
+    `scikit-learn-contrib/hdbscan <https://github.com/scikit-learn-contrib/hdbscan>`_
+    does not. To get the same results in both versions, the value of `min_samples` here
+    must be 1 greater than the value used in `scikit-learn-contrib/hdbscan
+    <https://github.com/scikit-learn-contrib/hdbscan>`_.
+
     References
     ----------
 

diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py
@@ -199,7 +199,7 @@ def fetch_covtype(
         with TemporaryDirectory(dir=covtype_dir) as temp_dir:
             logger.info(f"Downloading {ARCHIVE.url}")
             archive_path = _fetch_remote(
-                ARCHIVE, dirname=temp_dir, _retries=n_retries, delay=delay
+                ARCHIVE, dirname=temp_dir, n_retries=n_retries, delay=delay
             )
             Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",")
 

diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
@@ -21,8 +21,8 @@
     _safe_indexing,
     check_random_state,
     column_or_1d,
-    indices_to_mask,
 )
+from ..utils._mask import indices_to_mask
 from ..utils._param_validation import HasMethods, Interval, RealNotInt
 from ..utils._tags import _safe_tags
 from ..utils.metadata_routing import (

diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py
@@ -7,7 +7,7 @@
 from scipy import optimize
 
 from ..base import BaseEstimator, RegressorMixin, _fit_context
-from ..utils import axis0_safe_slice
+from ..utils._mask import axis0_safe_slice
 from ..utils._param_validation import Interval
 from ..utils.extmath import safe_sparse_dot
 from ..utils.optimize import _check_optimize_result

diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py
@@ -1151,7 +1151,7 @@ def fit_transform(self, X, y=None):
             Embedding of the training data in low-dimensional space.
         """
         # TODO(1.7): remove
-        # Also make sure to change `max_iter` default back to 1 and deprecate None
+        # Also make sure to change `max_iter` default back to 1000 and deprecate None
         if self.n_iter != "deprecated":
             if self.max_iter is not None:
                 raise ValueError(

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
@@ -19,6 +19,7 @@
 from ._bunch import Bunch
 from ._chunking import gen_batches, gen_even_slices
 from ._estimator_html_repr import estimator_html_repr
+from ._mask import safe_mask
 from ._param_validation import Interval, validate_params
 from .class_weight import compute_class_weight, compute_sample_weight
 from .deprecation import deprecated
@@ -64,7 +65,6 @@
     "check_scalar",
     "indexable",
     "check_symmetric",
-    "indices_to_mask",
     "deprecated",
     "parallel_backend",
     "register_parallel_backend",
@@ -76,6 +76,7 @@
     "Bunch",
     "metadata_routing",
     "safe_sqr",
+    "safe_mask",
     "gen_batches",
     "gen_even_slices",
 ]
@@ -85,88 +86,6 @@
 _IS_WASM = platform.machine() in ["wasm32", "wasm64"]
 
 
-@validate_params(
-    {
-        "X": ["array-like", "sparse matrix"],
-        "mask": ["array-like"],
-    },
-    prefer_skip_nested_validation=True,
-)
-def safe_mask(X, mask):
-    """Return a mask which is safe to use on X.
-
-    Parameters
-    ----------
-    X : {array-like, sparse matrix}
-        Data on which to apply mask.
-
-    mask : array-like
-        Mask to be used on X.
-
-    Returns
-    -------
-    mask : ndarray
-        Array that is safe to use on X.
-
-    Examples
-    --------
-    >>> from sklearn.utils import safe_mask
-    >>> from scipy.sparse import csr_matrix
-    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
-    >>> condition = [False, True, True, False, True]
-    >>> mask = safe_mask(data, condition)
-    >>> data[mask].toarray()
-    array([[2],
-           [3],
-           [5]])
-    """
-    mask = np.asarray(mask)
-    if np.issubdtype(mask.dtype, np.signedinteger):
-        return mask
-
-    if hasattr(X, "toarray"):
-        ind = np.arange(mask.shape[0])
-        mask = ind[mask]
-    return mask
-
-
-def axis0_safe_slice(X, mask, len_mask):
-    """Return a mask which is safer to use on X than safe_mask.
-
-    This mask is safer than safe_mask since it returns an
-    empty array, when a sparse matrix is sliced with a boolean mask
-    with all False, instead of raising an unhelpful error in older
-    versions of SciPy.
-
-    See: https://github.com/scipy/scipy/issues/5361
-
-    Also note that we can avoid doing the dot product by checking if
-    the len_mask is not zero in _huber_loss_and_gradient but this
-    is not going to be the bottleneck, since the number of outliers
-    and non_outliers are typically non-zero and it makes the code
-    tougher to follow.
-
-    Parameters
-    ----------
-    X : {array-like, sparse matrix}
-        Data on which to apply mask.
-
-    mask : ndarray
-        Mask to be used on X.
-
-    len_mask : int
-        The length of the mask.
-
-    Returns
-    -------
-    mask : ndarray
-        Array that is safe to use on X.
-    """
-    if len_mask != 0:
-        return X[safe_mask(X, mask), :]
-    return np.zeros(shape=(0, X.shape[1]))
-
-
 def _array_indexing(array, key, key_dtype, axis):
     """Index an array or scipy.sparse consistently across NumPy version."""
     if issparse(array) and key_dtype == "bool":
@@ -806,38 +725,6 @@ def _to_object_array(sequence):
     return out
 
 
-def indices_to_mask(indices, mask_length):
-    """Convert list of indices to boolean mask.
-
-    Parameters
-    ----------
-    indices : list-like
-        List of integers treated as indices.
-    mask_length : int
-        Length of boolean mask to be generated.
-        This parameter must be greater than max(indices).
-
-    Returns
-    -------
-    mask : 1d boolean nd-array
-        Boolean array that is True where indices are present, else False.
-
-    Examples
-    --------
-    >>> from sklearn.utils import indices_to_mask
-    >>> indices = [1, 2 , 3, 4]
-    >>> indices_to_mask(indices, 5)
-    array([False,  True,  True,  True,  True])
-    """
-    if mask_length <= np.max(indices):
-        raise ValueError("mask_length must be greater than max(indices)")
-
-    mask = np.zeros(mask_length, dtype=bool)
-    mask[indices] = True
-
-    return mask
-
-
 def _message_with_time(source, message, time):
     """Create one line message for logging purposes.
 

diff --git a/sklearn/utils/_mask.py b/sklearn/utils/_mask.py
@@ -4,6 +4,7 @@
 from scipy import sparse as sp
 
 from ._missing import is_scalar_nan
+from ._param_validation import validate_params
 from .fixes import _object_dtype_isnan
 
 
@@ -61,3 +62,117 @@ def _get_mask(X, value_to_mask):
     )
 
     return Xt_sparse
+
+
+@validate_params(
+    {
+        "X": ["array-like", "sparse matrix"],
+        "mask": ["array-like"],
+    },
+    prefer_skip_nested_validation=True,
+)
+def safe_mask(X, mask):
+    """Return a version of `mask` that can safely index `X`.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}
+        Container the mask will be applied to.
+
+    mask : array-like
+        Mask (e.g. boolean) or signed-integer indices to apply to `X`.
+
+    Returns
+    -------
+    mask : ndarray
+        `mask` as an ndarray, converted to positional indices when it is not
+        signed-integer typed and `X` has a `toarray` attribute.
+
+    Examples
+    --------
+    >>> from sklearn.utils import safe_mask
+    >>> from scipy.sparse import csr_matrix
+    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
+    >>> condition = [False, True, True, False, True]
+    >>> mask = safe_mask(data, condition)
+    >>> data[mask].toarray()
+    array([[2],
+           [3],
+           [5]])
+    """
+    mask_array = np.asarray(mask)
+    is_signed_int = np.issubdtype(mask_array.dtype, np.signedinteger)
+    if is_signed_int or not hasattr(X, "toarray"):
+        return mask_array
+
+    # Objects exposing `toarray` get positional indices instead of the raw mask.
+    return np.arange(mask_array.shape[0])[mask_array]
+
+
+def axis0_safe_slice(X, mask, len_mask):
+    """Return the rows of X selected by mask, safely handling empty masks.
+
+    Unlike indexing with `safe_mask` directly, this returns an empty array
+    when `len_mask` is zero (e.g. a boolean mask that is all False), instead
+    of letting a sparse matrix raise an unhelpful error in older versions
+    of SciPy.
+
+    See: https://github.com/scipy/scipy/issues/5361
+
+    Also note that we can avoid doing the dot product by checking if
+    the len_mask is not zero in _huber_loss_and_gradient but this
+    is not going to be the bottleneck, since the number of outliers
+    and non_outliers are typically non-zero and it makes the code
+    tougher to follow.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}
+        Data on which to apply mask.
+
+    mask : ndarray
+        Mask to be used on X.
+
+    len_mask : int
+        Number of selected rows, as passed by the caller; 0 skips the slicing.
+
+    Returns
+    -------
+    X_subset : ndarray or sparse matrix
+        `X[safe_mask(X, mask), :]`, or a `(0, X.shape[1])` array of zeros.
+    """
+    if len_mask == 0:
+        return np.zeros(shape=(0, X.shape[1]))
+    return X[safe_mask(X, mask), :]
+
+
+def indices_to_mask(indices, mask_length):
+    """Convert list of indices to boolean mask.
+
+    Parameters
+    ----------
+    indices : list-like
+        List of integers treated as indices. May be empty (all-False mask).
+    mask_length : int
+        Length of boolean mask to be generated. A ValueError is raised
+        unless this is greater than max(indices).
+
+    Returns
+    -------
+    mask : 1d boolean nd-array
+        Boolean array that is True where indices are present, else False.
+
+    Examples
+    --------
+    >>> from sklearn.utils._mask import indices_to_mask
+    >>> indices = [1, 2, 3, 4]
+    >>> indices_to_mask(indices, 5)
+    array([False,  True,  True,  True,  True])
+    """
+    if np.size(indices) == 0:  # np.max would fail on empty input
+        return np.zeros(mask_length, dtype=bool)
+    if mask_length <= np.max(indices):
+        raise ValueError("mask_length must be greater than max(indices)")
+    mask = np.zeros(mask_length, dtype=bool)
+    mask[indices] = True
+    return mask