-
Notifications
You must be signed in to change notification settings - Fork 0
/
event_projection.py
166 lines (145 loc) · 5.48 KB
/
event_projection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import json
import logging
from collections import defaultdict
from concurrent.futures.process import ProcessPoolExecutor
from pathlib import Path
from scipy.optimize import minimize
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from models_manager import ModelsManager
from word2vec_wiki2vec_model import Word2VecWiki2VecModel
logging.basicConfig(
format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO
)
def calc_distances(x, locations, distance_metric):
return [distance_metric(x, loc) for loc in locations]
def calc_mse(x, locations, distances, distance_metric):
target_distances = calc_distances(x, locations, distance_metric=distance_metric)
mse = mean_squared_error(distances, target_distances)
return mse
def find_anchor_words(target_model, source_model, word, anchors_num):
"""
Find anchor words that belong to the target model using an exponential backoff-like method
"""
anchor_words = []
coefficient = max(200, anchors_num * 15)
i = 0
while len(anchor_words) < anchors_num and i < 5:
anchor_words = [
w
for w, score in source_model.similar_by_word(
word, anchors_num + coefficient
)
if score != 1.0 and w in target_model
]
coefficient *= 2
i += 1
if not anchor_words:
logging.info(f'found no anchors for "{word}"')
return None
anchor_words.sort(
key=lambda w: source_model.wv.vocab.get(source_model.get_key(w)).index
)
anchor_words = anchor_words[:anchors_num]
return anchor_words
def project_word(target_model, source_model, word, anchors_num, distance_metric):
if word not in source_model:
# logging.info(f'"{word}" not found in the source model')
return word, None
# find anchor words that belong to the target model
anchor_words = find_anchor_words(target_model, source_model, word, anchors_num)
if not anchor_words:
logging.info(f'found no anchors for "{word}"')
return word, None
anchor_words.sort(
key=lambda w: source_model.wv.vocab.get(source_model.get_key(w)).index
)
anchor_words = anchor_words[:anchors_num]
source_anchor_locations = [source_model.word_vec(anchor) for anchor in anchor_words]
source_distances = calc_distances(
source_model.word_vec(word, use_norm=True),
source_anchor_locations,
distance_metric,
)
target_anchor_locations = [
target_model.word_vec(anchor, use_norm=True) for anchor in anchor_words
]
initial_location = target_anchor_locations[0]
result = minimize(
calc_mse,
initial_location,
args=(target_anchor_locations, source_distances, distance_metric),
method='L-BFGS-B',
)
key = source_model.get_key(word)
return key, result.x
def project_words(target_model, source_model, words, anchors_num, distance_metric):
if not words:
return target_model
args = [
(target_model, source_model, word, anchors_num, distance_metric)
for word in words
if word in source_model
]
with ProcessPoolExecutor() as executor:
source_model.init_sims() # so that we won't have to do that repeatedly while multi-processing
new_words = dict(tqdm(executor.map(project_word, *zip(*args)), total=len(args)))
new_words = {
word: vector for word, vector in new_words.items() if vector is not None
}
target_model.wv.add(list(new_words.keys()), list(new_words.values()))
target_model.init_sims(replace=True)
return target_model
def project_to_temporal_models(
models_dir,
min_year,
max_year,
global_model,
event_year,
distance_metric,
enrich_only_events_of_same_year=False,
):
models_manager = ModelsManager(models_dir, from_year=min_year, to_year=max_year)
models_manager.load_models()
if enrich_only_events_of_same_year:
year_to_event = defaultdict(list)
for event, year in event_year.items():
if models_manager.from_year <= year <= models_manager.to_year:
year_to_event[year].append(event)
new_dir = models_dir / 'enriched'
new_dir.mkdir(parents=True, exist_ok=True)
for year in range(min_year, max_year + 1):
model = models_manager[year]
events_to_enrich = (
year_to_event[year] if enrich_only_events_of_same_year else events
)
new_model = project_words(
model, global_model, events_to_enrich, anchors_num, distance_metric
)
logging.info(
f'model of {year} was enriched with {len(events_to_enrich)} events'
)
file_name = f'word2vec_nyt_{year}.kv'
new_model.wv.save(str(new_dir / file_name))
if __name__ == '__main__':
anchors_num = 30
distance_metric = distance.cosine
events = json.load(open('data/events_since1980.json', encoding='utf-8'))
global_model = Word2VecWiki2VecModel('path-to-wikipedia2vec-model')
min_year = 1981
max_year = 2018
project_only_events_of_same_year = True
logging.info(
f'enriching models from {min_year}-{max_year} using maximum {anchors_num} anchors'
)
models_dir = Path('path-to-temporal-models')
event_year = json.load(open('data/event_year_since1980.json', encoding='utf-8'))
project_to_temporal_models(
models_dir,
min_year,
max_year,
global_model,
project_only_events_of_same_year,
distance_metric,
)