-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_test_file.py
executable file
·153 lines (145 loc) · 7.39 KB
/
generate_test_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
from dateutil import parser
import random
import copy
import datetime
import pytz
import argparse
# Default generation settings. Every value can be overridden on the command
# line; the variables below end up holding the effective setting.
seed = 1                      # Random generator seed
num_pairs = 10                # Number of switched user pairs
min_similarity = 0.1          # Minimum similarity for switching
max_similarity = 0.6          # Maximum similarity for switching
min_days_before_change = 25   # Minimum active days of a user before switching
min_days_after_change = 2     # Minimum active days of a user after switching
min_cnt = 100                 # Minimum total number of events by a switched user
min_unique_events = 4         # Minimum unique events by a switched user

# argparse performs the conversion (type=) and supplies the fallback
# (default=), so a malformed value is reported as a usage error instead of
# surfacing as an uncaught ValueError from a manual int()/float() call.
argparser = argparse.ArgumentParser(description='Generation.')
argparser.add_argument('-s', '--seed', type=int, default=seed,
                       help='Random generator seed.')
argparser.add_argument('-p', '--pairs', type=int, default=num_pairs,
                       help='Number of switched users.')
argparser.add_argument('-m', '--min_sim', type=float, default=min_similarity,
                       help='Minimum similarity for switching.')
argparser.add_argument('-n', '--max_sim', type=float, default=max_similarity,
                       help='Maximum similarity for switching.')
argparser.add_argument('-b', '--days_before', type=int, default=min_days_before_change,
                       help='Minimum active days of user before switchting.')
argparser.add_argument('-a', '--days_after', type=int, default=min_days_after_change,
                       help='Minimum active days of user after switching.')
argparser.add_argument('-c', '--min_count', type=int, default=min_cnt,
                       help='Minimum total number of events by switched user.')
argparser.add_argument('-u', '--min_unique', type=int, default=min_unique_events,
                       help='Minimum unique events by switched user.')
args = vars(argparser.parse_args())

# Publish the effective settings under the names the rest of the script uses.
seed = args["seed"]
num_pairs = args["pairs"]
min_similarity = args["min_sim"]
max_similarity = args["max_sim"]
min_days_before_change = args["days_before"]
min_days_after_change = args["days_after"]
min_cnt = args["min_count"]
min_unique_events = args["min_unique"]
# Set seed for random generator so that the selection of user pairs and
# switching points is reproducible for a given seed.
random.seed(seed)

# Load user similarity info generated by get_user_sim.py.
# The context manager makes sure the file handle is closed once parsing is
# done instead of leaving it open until interpreter shutdown.
with open('user_info.txt') as user_info_file:
    users_data = json.load(user_info_file)
users_list = users_data['user_list']
users = users_data['user_info']

for user_record in users.values():
    # Replace similarity list with dictionary for easier handling: entry i of
    # the list is the similarity to users_list[i]. Positional indexing (rather
    # than zip) keeps a too-short similarity list a loud IndexError.
    sim_list = user_record['similarities']
    user_record['similarities'] = {
        uid: sim_list[index] for index, uid in enumerate(users_list)
    }
    # Later steps index into day_list by position, so keep it in ascending order.
    user_record['day_list'] = sorted(user_record['day_list'])
# Randomly select pairs of similar users for switching
pairs = {}         # uid -> uid of the partner it is switched with (stored in both directions)
change_times = {}  # uid -> first entry of the user's day_list at or after the drawn switching point


def _is_eligible(uid):
    """Return True if the user strictly exceeds every activity threshold."""
    info = users[uid]
    return (
        info['days'] > min_days_before_change + min_days_after_change
        and info['cnt'] > min_cnt
        and len(info['actions']) > min_unique_events
    )


def _first_day_at_or_after(day_list, point):
    """Return the first entry of the ascending day_list that is >= point, or None."""
    for day_element in day_list:
        if point <= day_element:
            return day_element
    return None


def _report_switched_user(uid, change_time):
    """Print the switching time and the original activity figures of one user."""
    info = users[uid]
    print(' * User ' + str(uid)
          + ' changed at ' + str(datetime.datetime.fromtimestamp(change_time, pytz.utc))
          + ' (user originally carried out ' + str(info['cnt'])
          + ' total events and ' + str(len(info['actions']))
          + ' unique events during ' + str(info['days']) + ' active days).')


choices = list(users.keys())
while len(pairs) / 2 < num_pairs:
    if not choices:
        print('Could not find user pair, try to extend allowed similarity range. Aborting...')
        # SystemExit is always available (exit() is only injected by the site
        # module); the non-zero status tells calling scripts the run failed.
        raise SystemExit(1)
    # Randomly select a first user for potential switching
    first_user = random.choice(choices)
    choices.remove(first_user)
    if not _is_eligible(first_user):
        # First user does not fulfill requirements - skip
        continue
    # Try to find a second user to be switched with first user. A shallow copy
    # is enough: only the list itself is modified, never its elements.
    choices_inner = list(choices)
    found = False
    second_user = None
    change_time_first = None
    change_time_second = None
    # An exhausted candidate list means no second user was found; the outer
    # loop then goes back and selects another first user.
    while choices_inner and not found:
        user_inner = random.choice(choices_inner)
        choices_inner.remove(user_inner)
        if not _is_eligible(user_inner):
            # Second user does not fulfill requirements - skip
            continue
        similarity = users[first_user]['similarities'][user_inner]
        if min_similarity <= similarity <= max_similarity and first_user != user_inner:
            # Potential user pair for switching found; check if sufficient overlap exists for switching them
            first_days = users[first_user]['day_list']
            second_days = users[user_inner]['day_list']
            # NOTE(review): indexing by the thresholds presumes 'days' matches len(day_list) - confirm.
            earliest_change = max(first_days[min_days_before_change], second_days[min_days_before_change])
            latest_change = min(first_days[-min_days_after_change], second_days[-min_days_after_change])
            if earliest_change < latest_change:
                # Switching is possible; randomly select switching point
                ctt = random.randint(earliest_change, latest_change)
                # Find first active day of switched user behavior for ground truth
                change_time_first = _first_day_at_or_after(first_days, ctt)
                change_time_second = _first_day_at_or_after(second_days, ctt)
                second_user = user_inner
                found = True
    if found:
        print('Changing user pair with similarity ' + str(round(users[first_user]['similarities'][second_user], 2)) + ':')
        _report_switched_user(first_user, change_time_first)
        _report_switched_user(second_user, change_time_second)
        # A switched user must not be selected again for another pair.
        choices.remove(second_user)
        pairs[first_user] = second_user
        pairs[second_user] = first_user
        change_times[first_user] = change_time_first
        change_times[second_user] = change_time_second
# Create new file with injected anomalies: events of a selected user that
# happen at or after that user's change time are attributed to the partner.
write_every = 1000000  # Write file in batches for performance
total_lines = 50522931  # Line count used only to scale the progress output
progress_step = total_lines / 20  # Report progress about every 5 % of total_lines
pending = []  # Serialized events not yet written to the output file
cnt = 0
with open('clue.json') as f, open('clue_anomaly.json', 'w+') as out, open('labels.txt', 'w+') as labels:
    for line in f:
        cnt += 1
        if int(cnt % progress_step) == 0:
            print(str(int(cnt*100/total_lines)) + '%', end=' ', flush=True)
        j = json.loads(line)
        uid = j['uid']
        # Check if uid is in one of the selected user pairs and switch accordingly
        if uid in change_times:
            ts = parser.isoparse(j['time'])
            if ts.timestamp() >= change_times[uid]:
                j['uid'] = pairs[uid]
                if 'user' in j['params'] and j['params']['user'] == uid:
                    # Also switch parameter user when it occurs like uid
                    j['params']['user'] = pairs[uid]
        # Collect updated events in a list and join once per batch; growing one
        # string with += relies on an interpreter-specific optimisation and is
        # quadratic without it.
        pending.append(json.dumps(j) + '\n')
        if len(pending) >= write_every:
            out.write(''.join(pending))
            pending = []
    out.write(''.join(pending))  # Write remaining output after final loop
    for switched_uid, change_time in change_times.items():
        # Need to use pairs[switched_uid] instead of switched_uid since we want
        # to detect the first activity of the replacement-user
        labels.write(str(pairs[switched_uid]) + ',' + str(change_time) + '\n')
print('')