-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsmoothie_denoising.py
186 lines (132 loc) · 5.14 KB
/
tsmoothie_denoising.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
'''
Denoising script built on the tsmoothie Python package (vectorized time-series
smoothing and outlier detection).
Link: https://github.com/cerlymarco/tsmoothie
Uses a LowessSmoother (locally weighted scatterplot smoothing for time series):
a non-parametric regression that fits a separate linear regression for every
data point, using nearby data points to estimate the slope and intercept.
'''
# import required packages
import pandas as pd
from tsmoothie.smoother import LowessSmoother
import matplotlib.pyplot as plt
# user defined parameters
# Input table; only the three columns used by this script are loaded.
r_df = pd.read_csv(
    '20220325_battery_files_important_ingredients.csv',
    usecols=['battery_file_id', 'DischargeCapacityRetention', 'Cyc_'],
)
# LowessSmoother settings, passed to denoise() for every battery file.
smooth_fraction = 0.1
iterations = 5
# denoise on "discharge capacity retention"
def denoise(id_number, sf, iter):
    """Smooth the discharge-capacity-retention series of one battery file.

    Selects the rows of the module-level ``r_df`` whose 'battery_file_id'
    equals ``id_number``, orders them by 'Cyc_', and runs a LowessSmoother
    over their 'DischargeCapacityRetention' values.

    Args:
        id_number: value matched against the 'battery_file_id' column.
        sf: smooth_fraction, the smoothing span; a larger value results in a
            smoother curve.
        iter: number of residual-based reweightings to perform (the original
            author notes the maximum can be 6). The name shadows the builtin
            ``iter`` but is kept so existing callers keep working.

    Returns:
        A copy of the selected, sorted rows with an added 'Smoothed' column.
    """
    battery_rows = r_df[r_df['battery_file_id'] == id_number].sort_values(by='Cyc_')
    retention = battery_rows['DischargeCapacityRetention']

    lowess = LowessSmoother(smooth_fraction=sf, iterations=iter)
    # The series is handed over as a one-row frame: a single time series
    # laid out along the columns.
    lowess.smooth(retention.to_frame().T)

    denoised = battery_rows.copy()
    # Row 0 of smooth_data is the smoothed version of the single input series.
    denoised['Smoothed'] = lowess.smooth_data[0]
    return denoised
# denoise full data
id_numbers = r_df['battery_file_id'].unique()
# Smooth every battery file on its own, then stack the per-file results into
# one table with a fresh 0..n-1 index.
appended_data = pd.concat(
    [denoise(battery_id, smooth_fraction, iterations) for battery_id in id_numbers],
    ignore_index=True,
)
# save the full data
appended_data.to_csv('denoise_full_data.csv')
print(appended_data)
# classify accepted & dropped
# A case is dropped when its first smoothed value is below MIN_RETENTION or
# when any smoothed value exceeds MAX_RETENTION. Accepted cases are truncated
# at the first smoothed value that is not >= MIN_RETENTION.
MIN_RETENTION = 0.7
MAX_RETENTION = 1.05


def _percent(part, total):
    """Return part/total as a percentage rounded to 2 decimals (0.0 if total is 0)."""
    return round((part / total) * 100, 2) if total else 0.0


def _leading_valid_rows(case):
    """Return a copy of the leading rows of `case` with 'Smoothed' >= MIN_RETENTION."""
    is_valid = case['Smoothed'].values >= MIN_RETENTION
    # Position of the first failing row, or the full length if none fails.
    n_keep = len(is_valid) if is_valid.all() else int((~is_valid).argmax())
    return case.iloc[:n_keep].copy()


# 'DischargeCapacityRetention' is loaded as well so that it is carried into
# the accepted/dropped CSV files, where the plotting code reads it back.
cols2 = ['battery_file_id', 'Cyc_', 'DischargeCapacityRetention', 'Smoothed']
full_df = pd.read_csv('denoise_full_data.csv', usecols=cols2)
gb = full_df.groupby('battery_file_id')
g = [gb.get_group(x) for x in gb.groups]

# "filter out" dropped cases
accepted_cases = []
dropped_cases = []
for case in g:
    capacity = case['Smoothed'].values
    if capacity[0] < MIN_RETENTION or max(capacity) > MAX_RETENTION:
        dropped_cases.append(case)
    else:
        accepted_cases.append(_leading_valid_rows(case))

# count the number of each cases
cnt = len(g)
cnt_dropped = len(dropped_cases)
cnt_accepted = len(accepted_cases)
print('Total cases: {}'.format(cnt))
print('Dropped cases: {} ({}%)'.format(cnt_dropped, _percent(cnt_dropped, cnt)))
print('Accepted cases: {} ({}%)'.format(cnt_accepted, _percent(cnt_accepted, cnt)))

# create dataframes
# pd.concat raises on an empty list, so fall back to an empty table that
# keeps the column layout when a class has no members.
accepted = (pd.concat(accepted_cases, ignore_index=True)
            if accepted_cases else full_df.iloc[0:0].copy())
dropped = (pd.concat(dropped_cases, ignore_index=True)
           if dropped_cases else full_df.iloc[0:0].copy())
accepted.to_csv('denoise_accepted.csv')
dropped.to_csv('denoise_droopped.csv')
def _plot_case_grid(cases, num_cols, title, title_size, out_path, extra_height=0):
    """Draw one small subplot per battery file, save the grid, then show it.

    Args:
        cases: DataFrame with 'battery_file_id', 'Cyc_' and 'Smoothed'
            columns; 'DischargeCapacityRetention' is plotted too when present.
        num_cols: number of subplots per row.
        title: figure-level title.
        title_size: font size of the figure-level title.
        out_path: file the figure is saved to.
        extra_height: extra inches added to the figure height.

    Returns:
        The created matplotlib Figure.
    """
    file_ids = cases['battery_file_id'].unique()
    # Ceiling division; at least one row so an empty table still gives a figure.
    num_rows = max(1, -(-len(file_ids) // num_cols))
    # squeeze=False keeps the axes array 2-D even when there is a single row.
    fig, axes = plt.subplots(
        num_rows,
        num_cols,
        figsize=(num_cols + 1, num_rows + extra_height),
        squeeze=False,
    )
    fig.subplots_adjust(hspace=2, wspace=0.8)
    has_original = 'DischargeCapacityRetention' in cases.columns

    # axes.flat walks the grid row by row, left to right.
    for file_id, ax in zip(file_ids, axes.flat):
        case = cases[cases['battery_file_id'] == file_id].sort_values(by='Cyc_')
        cycles = case['Cyc_'].values
        if has_original:
            # original retention data
            ax.plot(cycles, case['DischargeCapacityRetention'].values,
                    color='cornflowerblue', label='Discharge')
        # smoothed retention data
        ax.plot(cycles, case['Smoothed'].values, color='red', label='Discharge')
        ax.set_title('fileid={}'.format(file_id), fontsize=10)

    fig.suptitle(title, fontsize=title_size)
    # Save before show(): once an interactive window is closed the figure is
    # gone and a later savefig would write an empty image.
    fig.savefig(out_path)
    plt.show()
    return fig


# visualize dropped cases; blue=original data, red=smoothed data
print("Visualizing dropped cases")
d_df = pd.read_csv('denoise_droopped.csv')
_plot_case_grid(d_df, 10, 'Dropped Cases', 15, 'dropped_ids.png', extra_height=1)

# visualize accepted cases; blue=original data, red=smoothed data
print("Visualizing accepted cases")
a_df = pd.read_csv('denoise_accepted.csv')
_plot_case_grid(a_df, 30, 'Accepted Cases', 30, 'accepted_ids.png')