-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSMOTEN_oversampling.py
93 lines (73 loc) · 4.08 KB
/
SMOTEN_oversampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
'''
SMOTEN oversampling (imbalanced-learn package)
Link: https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTEN.html
: Performs oversampling, saves the result as a CSV file, and visualizes the distribution of the clusters and of each component
: All component data are treated as categorical
'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTEN
from collections import Counter
import os
# Section for defining the variables
# Input CSV with the cluster meta data. The script reads a 'cluster' column from it and
# drops 'battery_file_id', 'Discharge100Capacity' and 'DischargeCapacityRetention_min'.
df = pd.read_csv('cluster_with_meta_data_0627.csv')
# True: use every cluster id found in the data (cluster_ids below is then overwritten).
use_all_clusters = False
# Cluster ids of interest; only used when use_all_clusters is False.
cluster_ids = [1, 2, 5, 7, 11, 13, 15, 17, 22, 29]
# True: pass sampling_strategy='all' to SMOTEN (oversample based on the majority group)
# instead of the fixed per-cluster target below.
oversample_based_on_majority = False
# Target number of samples for each cluster; only used when oversample_based_on_majority is False.
oversample_numbers = 300
# Passed to SMOTEN as k_neighbors (5 is the library default). Lower it only if at least
# one cluster has too few data points for 5 neighbours.
k_neighbors = 5
# Filter data: drop the columns that are not used for oversampling, then keep
# only the rows that belong to the clusters of interest.
# `columns=` already names the axis, so no separate `axis` argument is needed.
df = df.drop(columns=['battery_file_id', 'Discharge100Capacity', 'DischargeCapacityRetention_min'])
if use_all_clusters:
    # One entry per distinct cluster id (not one per row), as plain Python values.
    cluster_ids = df['cluster'].unique().tolist()
# With use_all_clusters this keeps every row whose cluster id is present in the data.
df = df[df['cluster'].isin(cluster_ids)]
# Plot how many samples each cluster has in the original (filtered) data and save the figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='cluster', data=df, color='green', ax=ax)
# Bar annotations: absolute count plus percentage share, ordered by cluster id.
# NOTE(review): relies on countplot drawing the bars in the same (sorted) order - confirm.
cluster_counts = df['cluster'].value_counts().sort_index()
cluster_shares = cluster_counts / cluster_counts.sum() * 100
lbls = [
    f'{count} ({share:.0f}%)'
    for count, share in zip(cluster_counts.values, cluster_shares.values)
]
ax.bar_label(container=ax.containers[0], labels=lbls)
ax.set_title('Distribution of clusters')
# plt.show()
fig.savefig('Distribution of clusters_original.png')
# Perform SMOTEN
# Split the filtered data into the feature columns and the class label ('cluster').
# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional argument.
Features = df.drop(columns='cluster')
Target = df['cluster']
print(f'Original dataset shape: {Features.shape}')
print(f'Original dataset samples per class: {Counter(Target)}')
if oversample_based_on_majority:
    # 'all': let imblearn resample every class instead of using explicit per-cluster targets.
    sampling_strategy = 'all'
else:
    # Explicit target: every selected cluster id is oversampled to `oversample_numbers` samples.
    sampling_strategy = dict.fromkeys(cluster_ids, oversample_numbers)
# Fixed random_state keeps the generated samples reproducible between runs.
sampler = SMOTEN(random_state=42, k_neighbors=k_neighbors, sampling_strategy=sampling_strategy)
# The variable names (including their spelling) are kept because later sections use them.
ovsersampled_features, ovsersampled_target = sampler.fit_resample(Features, Target)
print(f'Resampled dataset samples per class: {Counter(ovsersampled_target)}')
# Plot how many samples each cluster has after oversampling and save the figure.
# The label is attached to the feature frame so both can be plotted (and later saved) together.
ovsersampled_features['cluster'] = ovsersampled_target
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(x='cluster', data=ovsersampled_features, color='orange', ax=ax)
# Bar annotations: absolute count plus percentage share, ordered by cluster id.
# NOTE(review): relies on countplot drawing the bars in the same (sorted) order - confirm.
cluster_counts = ovsersampled_features['cluster'].value_counts().sort_index()
cluster_shares = cluster_counts / cluster_counts.sum() * 100
lbls = [
    f'{count} ({share:.0f}%)'
    for count, share in zip(cluster_counts.values, cluster_shares.values)
]
ax.bar_label(container=ax.containers[0], labels=lbls)
ax.set_title('Distribution of clusters after oversampling')
# plt.show()
fig.savefig('Distribution of clusters_oversampled.png')
# Visualize and save compared distribution plots (original vs oversampled) for each component
path_parent = os.path.abspath(os.getcwd())
# Folder for the component distribution bar graphs. exist_ok lets the script be re-run
# without failing on the folder created by a previous run.
component_dir = os.path.join(path_parent, 'Component Distribution')
os.makedirs(component_dir, exist_ok=True)
# Columns at positions 1..27 are plotted as components; slicing the column index
# also works when the frame has fewer than 28 columns.
for component in df.columns[1:28]:
    fig, axes = plt.subplots(1, 2, sharex=True, figsize=(10, 5))
    fig.suptitle(f'{component}_total')
    df[component].value_counts().plot(ax=axes[0], kind='bar', color='green')
    axes[0].set_title('original data')
    ovsersampled_features[component].value_counts().plot(ax=axes[1], kind='bar', color='orange')
    axes[1].set_title('after oversampling')
    # Save via an explicit path instead of changing the working directory, so the
    # process stays in `path_parent` even if a plot fails part-way through.
    fig.savefig(os.path.join(component_dir, f'{component}.png'))
    # Release the figure; otherwise one open figure per component accumulates.
    plt.close(fig)
# Save the oversampled data (features plus the 'cluster' column assigned earlier in the
# script) as a CSV file in the current working directory. pandas also writes the row
# index as the first column by default.
ovsersampled_features.to_csv('smoten_oversampling_result.csv')