import csv
import functools # reduce()
import numpy as np
import pickle
# N.B. entries that are commented out are not included in this dataset
# subset, but they are provided in the original dataset.
coarselabel_map = {
    # 'null' : 0,
    'still' : 1,
    'walk' : 2,
    'run' : 3,
    'bike' : 4,
    'car' : 5,
    'bus' : 6,
    'train' : 7,
    'subway': 8,
}
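# For decoding label values back to activity names, the mapping can be
# inverted, e.g. (illustrative snippet, not used elsewhere in this script):
#   label_to_activity = {v: k for k, v in coarselabel_map.items()}
#   label_to_activity[3]  # -> 'run'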
# channels corresponding to the columns of <position>_motion.txt files
# ordered according to the SHL dataset documentation.
channels_basic = {
    # [...]
    2: 'Acc_x',
    3: 'Acc_y',
    4: 'Acc_z',
    5: 'Gyr_x',
    6: 'Gyr_y',
    7: 'Gyr_z',
    8: 'Mag_x',
    9: 'Mag_y',
    10: 'Mag_z',
    # 11: 'Ori_w',
    # 12: 'Ori_x',
    # 13: 'Ori_y',
    # 14: 'Ori_z',
    # 15: 'Gra_x',
    # 16: 'Gra_y',
    # 17: 'Gra_z',
    # 18: 'LAcc_x',
    # 19: 'LAcc_y',
    # 20: 'LAcc_z',
    # 21: 'Pressure'
    # [...]
}
def size_of_index(index:list) -> int:
    """Return the total number of segments covered by an index, i.e. a list
    of inclusive (start, end) portions."""
    # portions are inclusive on both ends, hence the +1 per portion
    size = functools.reduce(lambda acc, portion: (portion[1] - portion[0] + 1) + acc, index, 0)
    return size
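# Worked example (hypothetical portions), with inclusive (start, end) tuples:
#   size_of_index([(0, 4), (10, 12)])  # -> (4-0+1) + (12-10+1) = 8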
def sample(indexes:dict, student_id:int) -> list:
    """
    Given the index associated with each activity, this function generates
    a single index by selecting portions of each activity. This function
    ensures that:
    - each activity has a sufficient number of representatives;
    - the portions have a sufficient number of adjacent segments;
    - the number of segments of each activity is balanced.
    """
    np.random.seed(student_id)
    # 1. For each activity:
    #    a. filter portions according to their size, e.g. we do not want
    #       portions containing only 5 segments;
    #    b. compute the average size of the portions;
    min_portion_size = 10
    filtered_indexes = {}
    average_size = {}
    for activity in indexes:
        print('filtering the index of {} ...'.format(activity))
        filtered_indexes[activity] = list(filter(lambda portion: portion[1]-portion[0] > min_portion_size, indexes[activity]))
        average_size[activity] = size_of_index(filtered_indexes[activity])/len(filtered_indexes[activity])
        print(' - average size of {} filtered portions: {}'.format(activity, average_size[activity]))
    # 2. determine how many portions to sample for each activity;
    # 3. sample using np.random.choice;
    sample = []
    num_segment_per_activity = 2000
    for activity in indexes:
        # number of portions needed to reach ~num_segment_per_activity segments
        num_portions = int(num_segment_per_activity/average_size[activity])
        ret = np.random.choice(len(filtered_indexes[activity]), size=num_portions, replace=False)
        print(' - selected portions for {}: {}'.format(activity, ret))
        idx = [filtered_indexes[activity][i] for i in ret]
        print(' - size (# of segments) of this index: {}'.format(size_of_index(idx)))
        for por in idx:
            sample.append(por)
    return sample
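# Minimal usage sketch for sample(), with hypothetical portion boundaries:
#   indexes = {'walk': [(0, 250), (900, 1800)], 'run': [(300, 620)]}
#   sample_idx = sample(indexes, student_id=42)
# sample_idx is then a flat list of (start, end) tuples drawn, per activity,
# from the portions that survived the size filter.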
def extract_examples(sample_index:list, channel:str) -> None:
    """
    Given a sample index in the form of a list of tuples (start, end)
    specifying the starting and ending points (inclusive) of a portion of
    examples, this function extracts the corresponding examples to a new
    file under ./generated/sample/ (e.g. tmp/train/Torso/Acc_x.txt ->
    sample/train/Torso/Acc_x.txt).
    """
    # sort the list in ascending order of starting point
    sample_index_sorted = sorted(sample_index, key=lambda tup: tup[0])
    with open('./generated/tmp/train/Torso/'+channel+'.txt', 'r') as input_:
        with open('./generated/sample/train/Torso/'+channel+'.txt', 'w') as output_:
            reader = csv.reader(input_)
            writer = csv.writer(output_)
            try:
                # the indexes are sorted in ascending order
                sample_index_iter = iter(sample_index_sorted)
                idx = next(sample_index_iter)
                for i, row in enumerate(reader):
                    if i > idx[1]:
                        idx = next(sample_index_iter)
                    # portions are inclusive on both ends
                    if idx[0] <= i <= idx[1]:
                        writer.writerow(row)
            except StopIteration:
                pass
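# Example (hypothetical portions): with sample_index = [(10, 12)] and
# channel = 'Acc_x', rows 10, 11 and 12 of ./generated/tmp/train/Torso/Acc_x.txt
# are copied verbatim to ./generated/sample/train/Torso/Acc_x.txt.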
def index_of_activity(activity:str) -> list:
    """
    Example:
    ```python
    idxs = index_of_activity('run')
    print('Portions of activity {}:'.format('run'))
    for i, index in enumerate(idxs):
        print('{}: [{}; {}]'.format(i, index[0], index[1]))
    ```
    """
    print('Constructing the index of {} ...'.format(activity))
    idxs = []
    label_of_activity = coarselabel_map[activity]
    with open('./generated/tmp/train/Torso/Label.txt') as labels:
        reader = csv.reader(labels, delimiter=' ')
        start = -1
        for i, row in enumerate(reader):
            if (int(row[0]) == label_of_activity) and (start == -1):
                # we found the starting point of a portion
                start = i
            if (int(row[0]) != label_of_activity) and (start != -1):
                # we found the end of a portion
                # print('portion found [{};{}]'.format(start, i-1))
                idxs.append((start, i-1))
                start = -1
        # if the file ends inside a portion, close that portion as well
        if start != -1:
            idxs.append((start, i))
    return idxs
def build_sample(student_id:int) -> dict:
    idxs = {}
    for activity in coarselabel_map:
        idxs[activity] = index_of_activity(activity)
    sample_idx = sample(idxs, student_id)
    print(sample_idx)
    print('##############################')
    print('Size of the sample: {}'.format(size_of_index(sample_idx)))
    print('##############################')
    for _, channel in channels_basic.items():
        extract_examples(sample_idx, channel)
    extract_examples(sample_idx, 'Label')
    # return the used index
    return idxs
def save_index(idxs:dict, path:str) -> None:
    with open(path, 'wb') as handle:
        pickle.dump(idxs, handle, protocol=pickle.HIGHEST_PROTOCOL)
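# Illustrative counterpart of save_index(), not part of the original
# pipeline: reads back an index pickled with save_index().
def load_index(path:str) -> dict:
    with open(path, 'rb') as handle:
        return pickle.load(handle)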
if __name__ == '__main__':
    # build_sample() requires the student id used to seed the sampling;
    # replace 0 with your own student id.
    build_sample(0)