-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathsampler.py
70 lines (60 loc) · 2.28 KB
/
sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from torch.utils.data.sampler import Sampler
import itertools
import numpy as np
def samples(df):
    """Group the row indices of ``df`` into runs of equal ``'target'`` values.

    Rows are visited in frame order. Every maximal run of consecutive rows
    sharing the same ``'target'`` value becomes one list of index labels.
    The position of a group in the result is the order in which its run
    appears, not the label value itself.

    Args:
        df: DataFrame with a ``'target'`` column.

    Returns:
        A list of lists of index labels, one inner list per run. The final
        run is included, and an empty frame yields an empty list.
    """
    label_to_samples = []
    current = []
    label = None
    # Only the index and the target column are needed, so pair them directly
    # instead of materialising a Series per row with iterrows().
    for index, target in zip(df.index, df['target']):
        if not current:
            # First row: detected by emptiness, so the index need not start at 0.
            label = target
        elif target != label:
            # Label changed: close the current run and start a new one.
            label_to_samples.append(current)
            current = []
            label = target
        current.append(index)
    # Flush the trailing run, which has no following label change to close it.
    if current:
        label_to_samples.append(current)
    return label_to_samples
class PKSampler(Sampler):
    """Sampler that emits indices in blocks of ``p`` random labels x ``k`` samples.

    ``data_source`` must support ``len()`` and expose ``label_to_samples``,
    a sequence whose entry ``l`` holds the sample indices belonging to
    label ``l``.
    """

    def __init__(self, data_source, p=15, k=20):
        # NOTE(review): some PyTorch releases warn that the ``data_source``
        # argument of Sampler.__init__ is unused -- confirm against the
        # pinned torch version.
        super().__init__(data_source)
        self.data_source = data_source
        self.p = p
        self.k = k

    def __len__(self):
        # Dataset size rounded up to a whole number of p*k blocks.
        block_size = self.p * self.k
        block_total = (len(self.data_source) - 1) // block_size + 1
        return block_total * block_size

    def __iter__(self):
        block_total = len(self) // (self.p * self.k)
        blocks_done = 0
        while blocks_done < block_total:
            groups = self.data_source.label_to_samples
            # p distinct labels per block; raises if fewer than p labels exist.
            chosen = np.random.choice(np.arange(len(groups)), self.p, replace=False)
            for label in chosen:
                members = groups[label]
                # Sample with replacement only when the label has fewer than k samples.
                oversample = len(members) < self.k
                yield from np.random.choice(members, self.k, replace=oversample)
            blocks_done += 1
def grouper(iterable, n):
    """Yield lists of ``n`` consecutive items from ``iterable``, wrapping around.

    Exactly ``(len(iterable) - 1) // n + 1`` lists are produced. Items are
    drawn from an endless cycle over ``iterable``, so when its length is not
    a multiple of ``n`` the last list is completed with items from the
    start. ``iterable`` must support ``len()``.
    """
    endless = itertools.cycle(iter(iterable))
    chunk_total = (len(iterable) - 1) // n + 1
    emitted = 0
    while emitted < chunk_total:
        yield list(itertools.islice(endless, n))
        emitted += 1
# full label coverage per 'epoch'
class PKSampler2(Sampler):
    """Sampler that visits every label once per pass, ``p`` labels x ``k`` samples at a time.

    Labels are shuffled and split into groups of ``p`` by ``grouper``, which
    wraps around to fill the last group. ``data_source`` must expose
    ``label_to_samples``, a sequence whose entry ``l`` holds the sample
    indices belonging to label ``l``.
    """

    def __init__(self, data_source, p=15, k=20):
        # NOTE(review): some PyTorch releases warn that the ``data_source``
        # argument of Sampler.__init__ is unused -- confirm against the
        # pinned torch version.
        super().__init__(data_source)
        self.data_source = data_source
        self.p = p
        self.k = k

    def __len__(self):
        # Label count rounded up to whole groups of p, times k samples per label.
        label_total = len(self.data_source.label_to_samples)
        group_total = (label_total - 1) // self.p + 1
        return group_total * self.p * self.k

    def __iter__(self):
        label_total = len(self.data_source.label_to_samples)
        shuffled = np.random.permutation(np.arange(label_total))
        for group in grouper(shuffled, self.p):
            for label in group:
                members = self.data_source.label_to_samples[label]
                # Sample with replacement only when the label has fewer than k samples.
                oversample = len(members) < self.k
                yield from np.random.choice(members, self.k, replace=oversample)