-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanData.py
118 lines (96 loc) · 3.12 KB
/
cleanData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd
import os
from fileHelper import *
import re
def saveData(filepath, data):
    ''' Write data next to the raw file as "<name>_clean.csv"

    Parameters
    ----------
    filepath : path of the original raw data file; its extension is
               stripped and "_clean.csv" is appended to build the
               output path
    data : object to be saved; must provide ``to_csv`` (presumably the
           cleaned pandas dataframe -- confirm against callers)
    '''
    stem, _extension = os.path.splitext(filepath)
    target = stem + "_clean.csv"
    # index = False keeps the row index out of the written file.
    data.to_csv(target, index=False)
def refineData(col_list, num, col_unit, data, isVer7):
    ''' Clean the raw dataframe, then baseline correct it

    Runs remove_string followed by baseline_correct.

    Parameters
    ----------
    col_list : channel names, used to build the column headers
    num : number of channels
    col_unit : unit of channels, appended to every column header
    data : raw dataframe
    isVer7 : passed through to remove_string, where it selects which
             surplus columns are discarded (leading ones when true,
             trailing ones otherwise)

    Returns
    -------
    cleaned and baseline corrected dataframe.
    '''
    stripped = remove_string(col_list, num, col_unit, data, isVer7)
    return baseline_correct(stripped)
def extract(string):
    ''' Extra cleaning function to deal with negative entries

    If the string starts with any number of digits, then any number of
    dots, then any number of carets, followed by one or more minus
    signs, that whole prefix is replaced by a single '-'.  Only the
    start of the string is examined; a string without such a prefix is
    returned unchanged.

    Parameters
    ----------
    string : string to clean

    Returns
    -------
    the string with the leading prefix collapsed to one '-'
    '''
    # Raw string: the pattern contains backslash escapes that are not valid
    # in a normal string literal and trigger a warning on recent Pythons.
    return re.sub(r'^\d*\.*\^*-+', '-', string)
def remove_string(col_list, col_num, col_unit, df, isVer7):
    ''' Turn the raw single-column log into a table of channel strings

    Supports version 7.1 (7.0 is optional) of SLIC.  Only the first
    column of df is read, and df itself is not modified.  Steps:

    1. If 'OK System running' occurs more than once, keep only the rows
       from its last occurrence onwards (extra starts are noise).
    2. Remove all ASCII letters, then every character that is not a
       word character, whitespace, '^', '.' or '-'; underscores are
       removed as well.
    3. Collapse a leading run of digits / dots / carets that ends in
       minus signs into a single '-' (same pattern as extract, applied
       to the whole column so missing values pass through).
    4. Split on whitespace into one column per token (space-separated
       raw data, 6 or more channels).
    5. Keep col_num columns: the last ones if isVer7 is true, the first
       ones otherwise.
    6. Drop rows whose last kept column is missing, i.e. rows with too
       few tokens.
    7. Drop the first 3 remaining rows (warming up SLIC).
    8. Rename the column headers and reset the index.

    Parameters
    ----------
    col_list : channel names; the first col_num entries are used.
    col_num : number of channels to keep.
    col_unit : unit of channel, stripped and appended to each name.
    df : raw dataframe.
    isVer7 : true to discard surplus leading columns, false to discard
             surplus trailing columns.

    Returns
    -------
    df : cleaned dataframe of strings with headers "<name> <unit>".

    Raises
    ------
    KeyError : if no row splits into at least col_num tokens.
    '''
    # Work on a positionally indexed copy of the raw column: the start
    # positions found below are then valid for iloc whatever index the
    # caller's frame carries, and the caller's frame is left untouched.
    raw = df.iloc[:, 0].reset_index(drop=True)

    start_list = raw[raw == 'OK System running'].index.tolist()
    if len(start_list) > 1:
        print("Warning: More than one 'System start' detected\n")
        last_start = start_list[-1]
        raw = raw.iloc[last_start:]
        print(last_start, "rows have been removed from the start\n")
    else:
        print("No multiple start detected\n")

    # regex=True is required: recent pandas treats the pattern as a
    # literal string by default, which would make these calls no-ops.
    raw = raw.str.replace(r"[a-zA-Z]", '', regex=True)
    raw = raw.str.replace(r"[^\w\s^.^-]|_", '', regex=True)
    raw = raw.str.replace(r"^\d*\.*\^*-+", '-', regex=True)

    table = raw.str.split(expand=True)

    surplus = len(table.columns) - col_num
    if surplus < 0:
        # KeyError matches what the original label lookup raised here.
        raise KeyError(
            "expected at least %d whitespace-separated fields per row, "
            "found at most %d" % (col_num, len(table.columns))
        )
    if surplus > 0:
        if isVer7:
            table = table.iloc[:, surplus:]
        else:
            table = table.iloc[:, :col_num]

    # Short rows are padded on the right by split(), so a missing value in
    # the last kept column marks a row with too few tokens.  Checked by
    # position because the kept labels do not start at 0 when leading
    # columns were discarded.
    table = table[table.iloc[:, -1].notna()]
    table = table.iloc[3:]

    unit = col_unit.strip()
    table.columns = [str(col_list[i]) + ' ' + unit for i in range(col_num)]
    return table.reset_index(drop=True)
def baseline_correct(df):
    ''' Perform baseline correction

    Every column is converted with pd.to_numeric and the first row is
    then subtracted from all rows, so the first row of the result is
    all zeros.

    Parameters
    ----------
    df : dataframe whose columns can be parsed as numbers

    Returns
    -------
    df : new numeric dataframe relative to its first row.  An empty
         input is returned as an unchanged copy, since there is no
         first row to use as baseline.
    '''
    # Without this guard df.iloc[0] raises IndexError on a frame with no
    # rows.
    if df.empty:
        return df.copy()
    numeric = df.apply(pd.to_numeric)
    return numeric - numeric.iloc[0]