Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
virchan committed Feb 28, 2024
2 parents 1fb3a20 + 9dee432 commit a188d7e
Show file tree
Hide file tree
Showing 8 changed files with 755 additions and 669 deletions.
1,176 changes: 626 additions & 550 deletions doc/modules/clustering.rst

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions sklearn/cluster/_hdbscan/hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,14 @@ class HDBSCAN(ClusterMixin, BaseEstimator):
OPTICS : Ordering Points To Identify the Clustering Structure.
Birch : Memory-efficient, online-learning algorithm.
Notes
-----
The `min_samples` parameter includes the point itself, whereas the implementation in
`scikit-learn-contrib/hdbscan <https://github.com/scikit-learn-contrib/hdbscan>`_
does not. To get the same results in both versions, the value of `min_samples` here
must be 1 greater than the value used in `scikit-learn-contrib/hdbscan
<https://github.com/scikit-learn-contrib/hdbscan>`_.
References
----------
Expand Down
2 changes: 1 addition & 1 deletion sklearn/datasets/_covtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def fetch_covtype(
with TemporaryDirectory(dir=covtype_dir) as temp_dir:
logger.info(f"Downloading {ARCHIVE.url}")
archive_path = _fetch_remote(
ARCHIVE, dirname=temp_dir, _retries=n_retries, delay=delay
ARCHIVE, dirname=temp_dir, n_retries=n_retries, delay=delay
)
Xy = np.genfromtxt(GzipFile(filename=archive_path), delimiter=",")

Expand Down
2 changes: 1 addition & 1 deletion sklearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
_safe_indexing,
check_random_state,
column_or_1d,
indices_to_mask,
)
from ..utils._mask import indices_to_mask
from ..utils._param_validation import HasMethods, Interval, RealNotInt
from ..utils._tags import _safe_tags
from ..utils.metadata_routing import (
Expand Down
2 changes: 1 addition & 1 deletion sklearn/linear_model/_huber.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from scipy import optimize

from ..base import BaseEstimator, RegressorMixin, _fit_context
from ..utils import axis0_safe_slice
from ..utils._mask import axis0_safe_slice
from ..utils._param_validation import Interval
from ..utils.extmath import safe_sparse_dot
from ..utils.optimize import _check_optimize_result
Expand Down
2 changes: 1 addition & 1 deletion sklearn/manifold/_t_sne.py
Original file line number Diff line number Diff line change
Expand Up @@ -1151,7 +1151,7 @@ def fit_transform(self, X, y=None):
Embedding of the training data in low-dimensional space.
"""
# TODO(1.7): remove
# Also make sure to change `max_iter` default back to 1 and deprecate None
# Also make sure to change `max_iter` default back to 1000 and deprecate None
if self.n_iter != "deprecated":
if self.max_iter is not None:
raise ValueError(
Expand Down
117 changes: 2 additions & 115 deletions sklearn/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ._bunch import Bunch
from ._chunking import gen_batches, gen_even_slices
from ._estimator_html_repr import estimator_html_repr
from ._mask import safe_mask
from ._param_validation import Interval, validate_params
from .class_weight import compute_class_weight, compute_sample_weight
from .deprecation import deprecated
Expand Down Expand Up @@ -64,7 +65,6 @@
"check_scalar",
"indexable",
"check_symmetric",
"indices_to_mask",
"deprecated",
"parallel_backend",
"register_parallel_backend",
Expand All @@ -76,6 +76,7 @@
"Bunch",
"metadata_routing",
"safe_sqr",
"safe_mask",
"gen_batches",
"gen_even_slices",
]
Expand All @@ -85,88 +86,6 @@
_IS_WASM = platform.machine() in ["wasm32", "wasm64"]


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "mask": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
    """Convert `mask` into a form that can safely be used to index `X`.

    A mask with a signed-integer dtype is treated as a set of indices and is
    returned untouched. Any other mask is returned as-is for containers
    without a ``toarray`` attribute, and is translated into integer positions
    when `X` exposes ``toarray`` (as scipy sparse containers do).

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array-like
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        Array that is safe to use on X.

    Examples
    --------
    >>> from sklearn.utils import safe_mask
    >>> from scipy.sparse import csr_matrix
    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
    >>> condition = [False, True, True, False, True]
    >>> mask = safe_mask(data, condition)
    >>> data[mask].toarray()
    array([[2],
           [3],
           [5]])
    """
    mask = np.asarray(mask)
    already_indices = np.issubdtype(mask.dtype, np.signedinteger)
    if not already_indices and hasattr(X, "toarray"):
        # Translate the mask into explicit integer positions before it is
        # used on an object exposing ``toarray``.
        positions = np.arange(mask.shape[0])
        mask = positions[mask]
    return mask


def axis0_safe_slice(X, mask, len_mask):
    """Select rows of `X` with `mask`, tolerating an empty selection.

    This is safer than indexing with the result of `safe_mask` directly: when
    `len_mask` is zero an empty array is returned instead of slicing, which
    avoids an unhelpful error raised by older versions of SciPy when a sparse
    matrix is sliced with an all-False boolean mask.

    See: https://github.com/scipy/scipy/issues/5361

    Also note that the dot product could be avoided by checking whether
    `len_mask` is non-zero in `_huber_loss_and_gradient`, but this is not
    going to be the bottleneck, since the number of outliers and non-outliers
    is typically non-zero, and it would make the code tougher to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : ndarray
        Mask to be used on X.

    len_mask : int
        The length of the mask. When it is 0, no slicing is attempted.

    Returns
    -------
    X_sliced : {ndarray, sparse matrix}
        ``X[safe_mask(X, mask), :]`` when `len_mask` is non-zero, otherwise
        an array of zeros with shape ``(0, X.shape[1])``.
    """
    if len_mask == 0:
        # Nothing selected: build the empty result without touching X's
        # indexing machinery.
        return np.zeros(shape=(0, X.shape[1]))

    row_selector = safe_mask(X, mask)
    return X[row_selector, :]


def _array_indexing(array, key, key_dtype, axis):
"""Index an array or scipy.sparse consistently across NumPy version."""
if issparse(array) and key_dtype == "bool":
Expand Down Expand Up @@ -806,38 +725,6 @@ def _to_object_array(sequence):
return out


def indices_to_mask(indices, mask_length):
    """Convert list of indices to boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices. May be empty, in which case the
        returned mask is all False.

    mask_length : int
        Length of boolean mask to be generated.
        This parameter must be greater than max(indices).

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Raises
    ------
    ValueError
        If `indices` is non-empty and `mask_length` is not greater than
        ``max(indices)``.

    Examples
    --------
    >>> from sklearn.utils import indices_to_mask
    >>> indices = [1, 2 , 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
    >>> indices_to_mask([], 3)
    array([False, False, False])
    """
    indices = np.asarray(indices)
    # ``np.max`` raises an unrelated "zero-size array" error on empty input,
    # so the bound check and the assignment only run when there are indices.
    has_indices = indices.size > 0

    if has_indices and mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    mask = np.zeros(mask_length, dtype=bool)
    if has_indices:
        mask[indices] = True

    return mask


def _message_with_time(source, message, time):
"""Create one line message for logging purposes.
Expand Down
115 changes: 115 additions & 0 deletions sklearn/utils/_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scipy import sparse as sp

from ._missing import is_scalar_nan
from ._param_validation import validate_params
from .fixes import _object_dtype_isnan


Expand Down Expand Up @@ -61,3 +62,117 @@ def _get_mask(X, value_to_mask):
)

return Xt_sparse


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "mask": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
    """Convert `mask` into a form that can safely be used to index `X`.

    A mask with a signed-integer dtype is treated as a set of indices and is
    returned untouched. Any other mask is returned as-is for containers
    without a ``toarray`` attribute, and is translated into integer positions
    when `X` exposes ``toarray`` (as scipy sparse containers do).

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array-like
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        Array that is safe to use on X.

    Examples
    --------
    >>> from sklearn.utils import safe_mask
    >>> from scipy.sparse import csr_matrix
    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
    >>> condition = [False, True, True, False, True]
    >>> mask = safe_mask(data, condition)
    >>> data[mask].toarray()
    array([[2],
           [3],
           [5]])
    """
    mask = np.asarray(mask)
    already_indices = np.issubdtype(mask.dtype, np.signedinteger)
    if not already_indices and hasattr(X, "toarray"):
        # Translate the mask into explicit integer positions before it is
        # used on an object exposing ``toarray``.
        positions = np.arange(mask.shape[0])
        mask = positions[mask]
    return mask


def axis0_safe_slice(X, mask, len_mask):
    """Select rows of `X` with `mask`, tolerating an empty selection.

    This is safer than indexing with the result of `safe_mask` directly: when
    `len_mask` is zero an empty array is returned instead of slicing, which
    avoids an unhelpful error raised by older versions of SciPy when a sparse
    matrix is sliced with an all-False boolean mask.

    See: https://github.com/scipy/scipy/issues/5361

    Also note that the dot product could be avoided by checking whether
    `len_mask` is non-zero in `_huber_loss_and_gradient`, but this is not
    going to be the bottleneck, since the number of outliers and non-outliers
    is typically non-zero, and it would make the code tougher to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : ndarray
        Mask to be used on X.

    len_mask : int
        The length of the mask. When it is 0, no slicing is attempted.

    Returns
    -------
    X_sliced : {ndarray, sparse matrix}
        ``X[safe_mask(X, mask), :]`` when `len_mask` is non-zero, otherwise
        an array of zeros with shape ``(0, X.shape[1])``.
    """
    if len_mask == 0:
        # Nothing selected: build the empty result without touching X's
        # indexing machinery.
        return np.zeros(shape=(0, X.shape[1]))

    row_selector = safe_mask(X, mask)
    return X[row_selector, :]


def indices_to_mask(indices, mask_length):
    """Convert list of indices to boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices. May be empty, in which case the
        returned mask is all False.

    mask_length : int
        Length of boolean mask to be generated.
        This parameter must be greater than max(indices).

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Raises
    ------
    ValueError
        If `indices` is non-empty and `mask_length` is not greater than
        ``max(indices)``.

    Examples
    --------
    >>> from sklearn.utils._mask import indices_to_mask
    >>> indices = [1, 2 , 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
    >>> indices_to_mask([], 3)
    array([False, False, False])
    """
    indices = np.asarray(indices)
    # ``np.max`` raises an unrelated "zero-size array" error on empty input,
    # so the bound check and the assignment only run when there are indices.
    has_indices = indices.size > 0

    if has_indices and mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    mask = np.zeros(mask_length, dtype=bool)
    if has_indices:
        mask[indices] = True

    return mask

0 comments on commit a188d7e

Please sign in to comment.