-
Notifications
You must be signed in to change notification settings - Fork 1
/
cluster_similar_device.py
35 lines (32 loc) · 1.36 KB
/
cluster_similar_device.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def trim_mean(x, trim=10):
    """Return the mean of the values of *x* lying between two percentiles.

    Values below the ``trim``-th percentile or above the ``(100 - trim)``-th
    percentile are discarded; values equal to either bound are kept.

    Args:
        x: Array-like of numbers (list, tuple, NumPy array, pandas Series).
        trim: Percentile (0-100) cut from each tail before averaging.

    Returns:
        The mean of the retained values, or NaN when nothing is retained
        (empty input, NaN in the input, or ``trim`` greater than 50).
    """
    # Coerce first so plain Python sequences support boolean-mask indexing.
    values = np.asarray(x)
    if values.size == 0:
        return np.nan

    lower_bound, upper_bound = np.percentile(values, [trim, 100 - trim])
    kept = values[(values >= lower_bound) & (values <= upper_bound)]

    # Return NaN explicitly instead of letting np.mean warn on an empty slice.
    if kept.size == 0:
        return np.nan
    return np.mean(kept)
def getSimilarDevices(tol):
    """Map each device in 'train.csv' to the devices nearest in sample rate.

    The per-device sample rate is taken to be the trimmed mean (see
    ``trim_mean``) of the differences between consecutive 'T' values of that
    device. Devices are then ordered by this rate, and every device is paired
    with its neighbours in that ordering.

    Args:
        tol: Number of neighbours to take on each side. With ``tol=3`` a
            device gets up to 3 devices with a slightly lower rate and up to
            3 with a slightly higher rate (fewer near the ends of the
            ordering).

    Returns:
        dict mapping each 'Device' value to a list of neighbouring 'Device'
        values: lower-rate neighbours first, then higher-rate ones. The
        device itself is never included in its own list.
    """
    data = pd.read_csv('train.csv')
    print("Getting sample rate...")

    # Step between consecutive 'T' values within each device. diff() keeps
    # the original row index, unlike groupby.apply, which may return a
    # (Device, row) MultiIndex that cannot be aligned back to the frame.
    # The first row of each device has no predecessor and is filled with 207
    # (NOTE(review): presumably a typical step size -- confirm).
    steps = data.groupby('Device')['T'].diff().fillna(207)

    # Trimmed-mean step per device, sorted so similar devices are adjacent.
    # sort_values() replaces the removed in-place Series.sort().
    rates = steps.groupby(data['Device']).apply(trim_mean).sort_values()

    devices = rates.index.values
    count = len(devices)
    similars = {}
    for i, dev in enumerate(devices):
        # Clamp the neighbour window to the valid index range.
        begin = max(i - tol, 0)
        end = min(i + tol + 1, count)
        similars[dev] = list(devices[begin:i]) + list(devices[i + 1:end])
    return similars
if __name__ == "__main__":
    # getSimilarDevices takes only `tol` and loads 'train.csv' itself, so the
    # data is not read here and no DataFrame is passed (passing one raised a
    # TypeError because of the extra positional argument).
    print("Reading the training data...")
    getSimilarDevices(3)