Commit d241040

Make automatic centering in PCA methods optional (fixes dask#734)

hristog committed Apr 20, 2021
1 parent 0ea276d commit d241040

Showing 4 changed files with 342 additions and 23 deletions.
7 changes: 7 additions & 0 deletions dask_ml/decomposition/incremental_pca.py
@@ -132,15 +132,21 @@ def __init__(
        svd_solver="auto",
        iterated_power=0,
        random_state=None,
+        center=True,
    ):
        self.n_components = n_components
        self.whiten = whiten
+        self.center = center
        self.copy = copy
        self.batch_size = batch_size
        self.svd_solver = svd_solver
        self.iterated_power = iterated_power
        self.random_state = random_state

+    def _check_params(self):
+        if self.center is False:
+            raise ValueError("IncrementalPCA with center=False is not supported.")
+
    def _fit(self, X, y=None):
        """Fit the model with X, using minibatches of size batch_size.
@@ -238,6 +244,7 @@ def partial_fit(self, X, y=None, check_input=True):
        self : object
            Returns the instance itself.
        """
+        self._check_params()
        if check_input:
            if sparse.issparse(X):
                raise TypeError(
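The guard follows the scikit-learn convention of validating parameters at fit time rather than in `__init__`: constructing the estimator succeeds, and the error surfaces on the first fitting call (`partial_fit`, `fit`, and `fit_transform` all reach `_check_params`, as the new test below exercises). A minimal sketch of the resulting behavior, with illustrative shapes and chunking:

import dask.array as da
import numpy as np

from dask_ml.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X = da.from_array(rng.randn(100, 5), chunks=(20, -1))

# Construction succeeds; the ValueError is raised on the first fitting call.
pca = IncrementalPCA(n_components=2, center=False)
try:
    pca.fit(X)
except ValueError as err:
    print(err)  # IncrementalPCA with center=False is not supported.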
82 changes: 65 additions & 17 deletions dask_ml/decomposition/pca.py
@@ -86,6 +86,21 @@ class PCA(sklearn.decomposition.PCA):
        If None, the random number generator is the RandomState instance used
        by `da.random`. Used when ``svd_solver`` == 'randomized'.
+    center : bool, optional (default True)
+        When True (the default), the underlying data are centered at zero
+        by subtracting their per-feature mean.
+
+        PCA is performed on centered data because it is a regression model
+        without an intercept; its principal components therefore originate
+        at the origin of the transformed space.
+
+        ``center=False`` may be used when performing PCA on data that have
+        already been centered.
+
+        Since centering is a required step of whitening, combining
+        ``center=False`` with ``whiten=True`` may produce unexpected
+        results if the data have not been centered beforehand.
Attributes
----------
components_ : array, shape (n_components, n_features)
@@ -152,18 +167,27 @@ class PCA(sklearn.decomposition.PCA):
    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.99244289 0.00755711]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061232 0.54980396]
    >>> pca = PCA(n_components=2, svd_solver='full')
    >>> pca.fit(dX)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='full', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
-    [ 0.99244...  0.00755...]
+    [0.99244289 0.00755711]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
-    [ 6.30061...  0.54980...]
+    [6.30061232 0.54980396]
+    >>> dX_mean_0 = dX - dX.mean(axis=0)
+    >>> pca = PCA(n_components=2, svd_solver='full', center=False)
+    >>> pca.fit(dX_mean_0)
+    PCA(center=False, n_components=2, svd_solver='full')
+    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
+    [0.99244289 0.00755711]
+    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
+    [6.30061232 0.54980396]
Notes
-----
@@ -175,6 +199,10 @@
``dask.linalg.svd_compressed``.
* n_components : ``n_components='mle'`` is not allowed.
Fractional ``n_components`` between 0 and 1 is not allowed.
+    * center : if ``True`` (the default), automatically center the input data
+      before performing PCA.
+      Set this parameter to ``False`` if the input data have already been
+      centered before running ``fit()``.
"""

def __init__(
@@ -186,10 +214,12 @@ def __init__(
        tol=0.0,
        iterated_power=0,
        random_state=None,
+        center=True,
    ):
        self.n_components = n_components
        self.copy = copy
        self.whiten = whiten
+        self.center = center
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
@@ -198,6 +228,7 @@ def fit(self, X, y=None):
    def fit(self, X, y=None):
        if not dask.is_dask_collection(X):
            raise TypeError(_TYPE_MSG.format(type(X)))
+
        self._fit(X)
        self.n_features_in_ = X.shape[1]
        return self
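Note that `fit` still rejects eager (non-dask) input before any fitting work happens; the centering option does not change the input contract. A quick illustrative sketch:

import numpy as np

from dask_ml.decomposition import PCA

X = np.random.RandomState(0).randn(10, 3)
try:
    PCA(n_components=2).fit(X)  # plain NumPy arrays are rejected
except TypeError as err:
    print(err)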
@@ -266,8 +297,10 @@ def _fit(self, X):

        solver = self._get_solver(X, n_components)

-        self.mean_ = X.mean(0)
-        X -= self.mean_
+        self.mean_ = X.mean(axis=0)
+
+        if self.center:
+            X -= self.mean_

        if solver in {"full", "tsqr"}:
            U, S, V = da.linalg.svd(X)
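The mean is now always recorded (and `X.mean(0)` becomes the more explicit `X.mean(axis=0)`), but it is only subtracted when ``center=True``; the effect of the flag is simply which matrix reaches the SVD. A NumPy sketch of that distinction (the real code factorizes dask arrays with ``da.linalg.svd`` or a compressed/randomized variant, depending on the solver):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(6, 3)
mean_ = X.mean(axis=0)  # recorded in both cases, as above

# center=True: factorize the mean-centered data -- classical PCA.
U, S, Vt = np.linalg.svd(X - mean_, full_matrices=False)

# center=False: factorize the data as given. This matches PCA only if
# the caller has already centered X.
U0, S0, Vt0 = np.linalg.svd(X, full_matrices=False)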
@@ -370,14 +403,20 @@ def transform(self, X):
        X_new : array-like, shape (n_samples, n_components)
        """
-        check_is_fitted(self, ["mean_", "components_"])
+        check_is_fitted(self, "components_")
+
+        if self.whiten:
+            check_is_fitted(self, "explained_variance_")
+
+        if self.center:
+            check_is_fitted(self, "mean_")
+            if self.mean_ is not None:
+                X -= self.mean_

-        # X = check_array(X)
-        if self.mean_ is not None:
-            X = X - self.mean_
        X_transformed = da.dot(X, self.components_.T)
        if self.whiten:
            X_transformed /= np.sqrt(self.explained_variance_)

        return X_transformed

    def fit_transform(self, X, y=None):
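With ``center=False``, ``transform`` no longer subtracts ``mean_``, which is where the whitening caveat from the docstring bites: whitened outputs have unit variance only if the data reaching the projection are centered. A sketch of the caveat on illustrative data (``svd_solver='full'`` is pinned so the example is deterministic):

import dask.array as da
import numpy as np

from dask_ml.decomposition import PCA

rng = np.random.RandomState(0)
X = da.from_array(rng.randn(200, 3) + 10.0, chunks=(50, -1))  # far from the origin

ok = PCA(n_components=2, whiten=True, svd_solver="full").fit(X)
risky = PCA(n_components=2, whiten=True, center=False, svd_solver="full").fit(X)

print(ok.transform(X).std(axis=0).compute())     # ~[1.0, 1.0]
print(risky.transform(X).std(axis=0).compute())  # generally far from 1 on uncentered X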
@@ -396,7 +435,6 @@ def fit_transform(self, X, y=None):
        X_new : array-like, shape (n_samples, n_components)
        """
-        # X = check_array(X)
        if not dask.is_dask_collection(X):
            raise TypeError(_TYPE_MSG.format(type(X)))
        U, S, V = self._fit(X)
@@ -431,18 +469,25 @@ def inverse_transform(self, X):
        If whitening is enabled, inverse_transform does not compute the
        exact inverse operation of transform.
        """
-        check_is_fitted(self, "mean_")
+        check_is_fitted(self, "components_")
+
+        if self.center:
+            check_is_fitted(self, "mean_")
+            offset = self.mean_
+        else:
+            offset = 0

        if self.whiten:
            check_is_fitted(self, "explained_variance_")
            return (
                da.dot(
                    X,
                    np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_,
                )
-                + self.mean_
+                + offset
            )
-        else:
-            return da.dot(X, self.components_) + self.mean_
+
+        return da.dot(X, self.components_) + offset
def score_samples(self, X):
"""Return the log-likelihood of each sample.
@@ -463,8 +508,11 @@ def score_samples(self, X):
"""
check_is_fitted(self, "mean_")

# X = check_array(X)
Xr = X - self.mean_
if self.center:
Xr = X - self.mean_
else:
Xr = X

n_features = X.shape[1]
precision = self.get_precision() # [n_features, n_features]
log_like = -0.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
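Together, ``transform`` and ``inverse_transform`` now form a consistent pair under ``center=False``: no mean is subtracted on the way in, and none is added back on the way out. A sketch of the round trip on pre-centered data (illustrative shapes; full-rank, so reconstruction is exact up to floating-point error):

import dask.array as da
import numpy as np

from dask_ml.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
dX = da.from_array(X - X.mean(axis=0), chunks=(25, -1))  # pre-centered

pca = PCA(n_components=4, svd_solver="full", center=False).fit(dX)
Z = pca.transform(dX)              # offset-free projection
X_back = pca.inverse_transform(Z)  # adds offset 0 rather than mean_

print(da.allclose(X_back, dX).compute())  # True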
17 changes: 17 additions & 0 deletions tests/test_incremental_pca.py
@@ -475,3 +475,20 @@ def test_incremental_pca_partial_fit_float_division():
    np.testing.assert_allclose(
        singular_vals_float_samples_seen, singular_vals_int_samples_seen
    )
+
+
+def test_incremental_pca_no_centering_not_supported():
+    rng = np.random.RandomState(0)
+    A = rng.randn(5, 3) + 2
+    A = da.from_array(A, chunks=[3, -1])
+
+    pca = IncrementalPCA(n_components=2, center=False)
+
+    with pytest.raises(ValueError, match="not supported"):
+        pca.partial_fit(A)
+
+    with pytest.raises(ValueError, match="not supported"):
+        pca.fit(A)
+
+    with pytest.raises(ValueError, match="not supported"):
+        pca.fit_transform(A)
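Only the unsupported IncrementalPCA path is tested in this file; the fourth changed file is not shown in this view. For completeness, a sketch (not part of the diff) of the equivalence the new docstring example relies on, assuming ``from dask_ml.decomposition import PCA`` alongside this module's existing imports:

def test_pca_centering_equivalence():
    # Sketch: PCA(center=True) on raw data should match
    # PCA(center=False) on the same data pre-centered by hand.
    rng = np.random.RandomState(0)
    X = rng.randn(50, 3) + 5
    dX = da.from_array(X, chunks=(10, -1))
    dX0 = dX - dX.mean(axis=0)

    pca = PCA(n_components=2, svd_solver="full").fit(dX)
    pca0 = PCA(n_components=2, svd_solver="full", center=False).fit(dX0)

    np.testing.assert_allclose(pca.singular_values_, pca0.singular_values_)
    np.testing.assert_allclose(
        pca.explained_variance_ratio_, pca0.explained_variance_ratio_
    )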