Skip to content

Commit

Permalink
warning elimination and test coverage increase
Browse files Browse the repository at this point in the history
  • Loading branch information
gykovacs committed Dec 18, 2023
1 parent cfd32a4 commit 8a8350f
Show file tree
Hide file tree
Showing 12 changed files with 49 additions and 29 deletions.
21 changes: 11 additions & 10 deletions smote_variants/base/_metrictensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,16 +639,17 @@ def tensor(self, X, y):
]

self.metric_tensor = psd_mean(matrices)
elif self.metric_learning_method == "n_unique":
n_uniques = np.array(
[len(np.unique(X_mod[:, idx])) for idx in range(X.shape[1])]
)
self.metric_tensor = np.diag(np.sqrt(n_uniques))
elif self.metric_learning_method == "n_unique_inv":
n_uniques = np.array(
[len(np.unique(X_mod[:, idx])) for idx in range(X.shape[1])]
)
self.metric_tensor = np.diag(np.sqrt(1.0 / n_uniques))

#elif self.metric_learning_method == "n_unique":
# n_uniques = np.array(
# [len(np.unique(X_mod[:, idx])) for idx in range(X.shape[1])]
# )
# self.metric_tensor = np.diag(np.sqrt(n_uniques))
#elif self.metric_learning_method == "n_unique_inv":
# n_uniques = np.array(
# [len(np.unique(X_mod[:, idx])) for idx in range(X.shape[1])]
# )
# self.metric_tensor = np.diag(np.sqrt(1.0 / n_uniques))

return self.metric_tensor

Expand Down
25 changes: 12 additions & 13 deletions smote_variants/base/_simplexsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module implements all simplex sampling related functionalities.
"""

import math
import itertools

import numpy as np
Expand Down Expand Up @@ -105,7 +106,7 @@ def simplex_volume(simplex):
simplex_mod = simplex[:-1] - simplex[-1]
gram = np.dot(simplex_mod, simplex_mod.T)
det = np.linalg.det(gram)
return np.sqrt(det) / np.math.factorial(simplex.shape[0] - 1)
return np.sqrt(det) / math.factorial(simplex.shape[0] - 1)


def simplex_volumes(simplices):
Expand Down Expand Up @@ -394,8 +395,6 @@ def determine_simplex_distribution(self, X, simplices):
return np.repeat(1.0 / len(simplices), len(simplices))
if self.simplex_sampling == "volume":
return simplex_volumes(X[simplices])
if self.simplex_sampling == "volume_inv":
return 1.0 / (simplex_volumes(X[simplices]) + 0.001)
raise ValueError(
f"simplex sampling with weighting "
f"{self.simplex_sampling} not implemented yet"
Expand Down Expand Up @@ -544,18 +543,18 @@ def add_gaussian_noise(self, samples):
"""

if "sigma" in self.gaussian_component:
if "fraction" not in self.gaussian_component:
sigma = self.gaussian_component["sigma"]
return samples + self.random_state.normal(size=samples.shape) * sigma
#if "fraction" not in self.gaussian_component:
sigma = self.gaussian_component["sigma"]
return samples + self.random_state.normal(size=samples.shape) * sigma

# the else branch
sigma = self.gaussian_component["sigma"]
fraction = self.gaussian_component["fraction"]
return samples + self.random_state.normal(
size=samples.shape
) * sigma * self.random_state.choice(
[0, 1], p=[1.0 - fraction, fraction], size=samples.shape
)
#sigma = self.gaussian_component["sigma"]
#fraction = self.gaussian_component["fraction"]
#return samples + self.random_state.normal(
# size=samples.shape
#) * sigma * self.random_state.choice(
# [0, 1], p=[1.0 - fraction, fraction], size=samples.shape
#)
if "sigmas" in self.gaussian_component:
sigmas = self.gaussian_component["sigmas"]
return samples + self.random_state.normal(size=samples.shape) * sigmas
Expand Down
2 changes: 1 addition & 1 deletion smote_variants/oversampling/_adg.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def xmeans(X, rng=(1, 10), random_state=None):
for n_clusters in range(rng[0], min([rng[1], len(X)])):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto').fit(X)

bic = bic_score(kmeans, X)
if bic < best_bic:
Expand Down
2 changes: 1 addition & 1 deletion smote_variants/oversampling/_ahc.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def sample_majority(self, X, n_clusters):
Returns:
np.ndarray: downsampled vectors
"""
kmeans = KMeans(n_clusters=n_clusters, random_state=self._random_state_init)
kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=self._random_state_init)

with warnings.catch_warnings():
warnings.simplefilter("ignore")
Expand Down
2 changes: 1 addition & 1 deletion smote_variants/oversampling/_ce_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def do_the_clustering(self, X):
)
features = self.random_state.choice(n_dim, n_features, replace=False)
n_clusters = min([len(X), self.k])
kmeans = KMeans(n_clusters=n_clusters, random_state=self._random_state_init)
kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=self._random_state_init)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
kmeans.fit(X[:, features])
Expand Down
1 change: 1 addition & 0 deletions smote_variants/oversampling/_cluster_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ def sampling_algorithm(self, X, y):

kmeans = KMeans(
n_clusters=min([len(X_min), self.n_clusters]),
n_init='auto',
random_state=self._random_state_init,
)
with warnings.catch_warnings():
Expand Down
1 change: 1 addition & 0 deletions smote_variants/oversampling/_de_oversampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ def cleansing(self, X, y):
# cleansing based on clustering
kmeans = KMeans(
n_clusters=min([len(X), self.n_clusters]),
n_init='auto',
random_state=self._random_state_init,
)

Expand Down
2 changes: 1 addition & 1 deletion smote_variants/oversampling/_kmeans_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def do_the_clustering(self, X, y):
"""
# applying kmeans clustering to all data
n_clusters = min([self.n_clusters, len(X)])
kmeans = KMeans(n_clusters=n_clusters, random_state=self._random_state_init)
kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=self._random_state_init)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
kmeans.fit(X)
Expand Down
2 changes: 1 addition & 1 deletion smote_variants/oversampling/_lvq_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def sampling_algorithm(self, X, y):

# clustering X_min to extract codebook
n_clusters = min([len(X_min), self.n_clusters])
kmeans = KMeans(n_clusters=n_clusters, random_state=self._random_state_init)
kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=self._random_state_init)
with warnings.catch_warnings():
if suppress_external_warnings():
warnings.simplefilter("ignore")
Expand Down
1 change: 1 addition & 0 deletions smote_variants/oversampling/_mwmote.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ def sampling_algorithm(self, X, y):
_logger.info("%s: do clustering", self.__class__.__name__)
kmeans = KMeans(
n_clusters=np.min([len(np.unique(X_min, axis=1)), self.params["M"]]),
n_init='auto',
random_state=self._random_state_init,
)
with warnings.catch_warnings():
Expand Down
2 changes: 1 addition & 1 deletion smote_variants/oversampling/_sso.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def calculate_vectors(self, X, y, X_min, nn_params):
"""
# applying kmeans clustering to find the hidden neurons
h = min([self.h, len(X_min)]) # pylint: disable=invalid-name
kmeans = KMeans(n_clusters=h, random_state=self._random_state_init)
kmeans = KMeans(n_clusters=h, n_init='auto', random_state=self._random_state_init)
kmeans.fit(X)

# extracting the hidden center elements
Expand Down
17 changes: 17 additions & 0 deletions tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
fix_density,
cov,
scipy_mode,
shuffle_training_set
)


Expand Down Expand Up @@ -295,3 +296,19 @@ def test_parameter_combinations():
assert (
len(ParametersMixin.generate_parameter_combinations(params_base, raw=True)) == 2
)

def test_shuffle_training_set():
    """
    Testing the shuffling of the training set, both without and with
    a random state being passed
    """
    features = np.array([[1, 2], [2, 3], [3, 4]])
    labels = np.array([0, 0, 1])

    # no random state supplied: the very same array object is expected back
    features_out, _ = shuffle_training_set(features, labels)
    assert features is features_out

    # random state supplied: only the shape of the returned array is checked
    seeded_state = np.random.RandomState(5)
    features_out, _ = shuffle_training_set(features, labels, seeded_state)
    assert features.shape == features_out.shape

0 comments on commit 8a8350f

Please sign in to comment.