# commands.py
from typing import Dict, List, Any, Tuple
from utils import load_word_lists_for_language, get_classification_datasets, setup_probe, get_metrics, \
setup_metrics
import pprint
import json
from tqdm import tqdm
import os
import wandb
from trainers.poisson_trainer import PoissonTrainer
from trainers.conditional_poisson_trainer import ConditionalPoissonTrainer
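
# This module implements the three probing subcommands of the CLI: `manual` probes a fixed,
# user-specified set of dimensions; `file` probes every dimension subset listed in a JSON
# experiments file; and `greedy` grows a subset one dimension at a time, picking whichever
# candidate maximizes the selection criterion on the dev set.
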
def manual(args):
    if args.wandb:
        tags = [args.language, args.trainer, args.attribute, "manual"]
        if args.wandb_tag is not None:
            tags.append(args.wandb_tag)

        config = vars(args)
        config.pop("func", None)

        run = wandb.init(project="flexible-probing", config=config, tags=tags)
        run.name = f"{args.attribute}-{args.language} ({args.trainer}) [{wandb.run.id}]"
        run.save()

    # ::::: SETUP :::::
    print(f"Setting up datasets ({args.language}, {args.attribute}) and probes...")
    word_lists = load_word_lists_for_language(args.language)
    datasets = get_classification_datasets(args, word_lists, args.attribute)
    dataset_train, dataset_dev, dataset_test = datasets["train"], datasets["dev"], datasets["test"]  # noqa
    setup = setup_probe(args, dataset_train)
    trainer, probe_family = setup["trainer"], setup["probe_family"]

    print("Using validation dataset...")
    eval_metrics = setup_metrics(dataset_dev)

    # ::::: PRE-TRAINING :::::
    # If possible, this pre-trains our model, e.g., using our sampling-based procedure, or by just
    # training the probe once and then zeroing out the dimensions we want to ignore. The trainer
    # modifies the neural_probe_model object in place.
    print("Pre-training probe (if possible)...")
    trainer.train()

    # ::::: EVALUATING THE PROBE :::::
    # We figure out what the parameters (the "specification") of the probe should be for the selected
    # subset of dimensions.
    select_dimensions = args.dimensions
    print("Probing...")
    print(f"\tSelected dimensions: {select_dimensions or 'ALL'}")

    # The dimensions we want to select
    metrics = get_metrics(trainer=trainer, probe_family=probe_family, metrics=eval_metrics,
                          select_dimensions=select_dimensions)

    if args.wandb:
        wandb.log(metrics)

    # Log to console
    pp = pprint.PrettyPrinter(indent=4)
    print("Results:")
    pp.pprint(metrics)

    # Save to file
    if args.output_file:
        with open(args.output_file, "w") as h:
            json.dump(metrics, h)
        print(f"Saved output to '{args.output_file}'.")
def file(args):
    # ::::: SETUP :::::
    print(f"Setting up datasets ({args.language}, {args.attribute}) and probes...")
    word_lists = load_word_lists_for_language(args.language)
    datasets = get_classification_datasets(args, word_lists, args.attribute)
    dataset_train, dataset_dev, dataset_test = datasets["train"], datasets["dev"], datasets["test"]  # noqa
    setup = setup_probe(args, dataset_train)
    trainer, probe_family = setup["trainer"], setup["probe_family"]

    print("Using validation dataset...")
    eval_metrics = setup_metrics(dataset_dev)

    # ::::: PRE-TRAINING :::::
    # If possible, this pre-trains our model, e.g., using our sampling-based procedure, or by just
    # training the probe once and then zeroing out the dimensions we want to ignore. The trainer
    # modifies the neural_probe_model object in place.
    print("Pre-training probe (if possible)...")
    trainer.train()

    variational_weights = None
    if isinstance(trainer, (PoissonTrainer, ConditionalPoissonTrainer)):
        variational_weights = trainer.variational_family.weights.detach().cpu().tolist()

    # ::::: EVALUATING THE PROBE :::::
    # We figure out what the parameters (the "specification") of the probe should be for the current
    # subset of dimensions.
    print("Probing...")
    with open(args.file, "r") as h:
        experiments = json.load(h)

    results: List[Dict[str, Any]] = []
    for idx, select_dimensions in tqdm(experiments.items()):
        # The dimensions we want to select
        metrics = get_metrics(trainer=trainer, probe_family=probe_family, metrics=eval_metrics,
                              select_dimensions=select_dimensions)

        # Log to console
        tqdm.write(f"\tExperiment {idx} results: {metrics}")

        results.append({
            "idx": idx,
            "dimensions": select_dimensions,
            "metrics": metrics,
            "variational_weights": variational_weights,
        })

    # Save to file
    if args.output_file:
        # `or "."` guards against a bare filename with no directory component.
        os.makedirs(os.path.dirname(args.output_file) or ".", exist_ok=True)
        with open(args.output_file, "w") as h:
            json.dump(results, h)
        print(f"Saved experiment outputs to '{args.output_file}'.")
def greedy(args):
    if args.wandb:
        tags = [args.language, args.trainer, args.attribute, "sweep"]
        if args.wandb_tag is not None:
            tags.append(args.wandb_tag)

        config = vars(args)
        config.pop("func", None)

        run = wandb.init(project="flexible-probing", config=config, tags=tags)
        run.name = (f"{args.attribute}-{args.language} ({args.trainer}) "
                    f"({args.selection_criterion}) [{wandb.run.id}]")
        run.save()

    # ::::: SETUP :::::
    print(f"Setting up datasets ({args.language}, {args.attribute}) and probes...")
    word_lists = load_word_lists_for_language(args.language)
    datasets = get_classification_datasets(args, word_lists, args.attribute)
    dataset_train, dataset_dev, dataset_test = datasets["train"], datasets["dev"], datasets["test"]
    embedding_dim = dataset_train.get_dimensionality()
    print(f"Embedding dimensionality: {embedding_dim}")

    setup = setup_probe(args, dataset_train, report_progress=True)
    trainer, probe_family = setup["trainer"], setup["probe_family"]

    # Dimensions are selected on the dev set; final metrics are reported on the test set.
    dimension_selection_metrics = setup_metrics(dataset_dev)
    eval_metrics = setup_metrics(dataset_test)

    # ::::: PRE-TRAINING :::::
    # If possible, this pre-trains our model, e.g., using our sampling-based procedure, or by just
    # training the probe once and then zeroing out the dimensions we want to ignore. The trainer
    # modifies the neural_probe_model object in place.
    print("Pre-training probe (if possible)...")
    trainer.train()

    variational_weights = None
    if isinstance(trainer, (PoissonTrainer, ConditionalPoissonTrainer)):
        print(trainer.variational_family.weights.cpu().topk(10))
        variational_weights = trainer.variational_family.weights.detach().cpu().tolist()

    # ::::: EVALUATING THE PROBE :::::
    # We figure out what the parameters (the "specification") of the probe should be for the current
    # subset of dimensions.
    print("Probing through greedy dimension selection...")
    results: List[Dict[str, Any]] = []
    picked_dimensions: List[int] = []
    for iteration in range(args.selection_size):
        print(f"Iteration #{iteration}")
        print(f"\tDimensions selected so far: {picked_dimensions}")
        print("\tPicking next dimension...")

        # Score every not-yet-picked dimension on the dev set, in combination with the dimensions
        # picked so far, and keep whichever maximizes the selection criterion.
        selection_results: List[Tuple[int, float]] = []
        for candidate_dim in tqdm(list(set(range(embedding_dim)) - set(picked_dimensions))):
            metrics_dev = get_metrics(trainer=trainer, probe_family=probe_family,
                                      metrics=dimension_selection_metrics,
                                      select_dimensions=[candidate_dim] + picked_dimensions)
            selection_results.append((candidate_dim, metrics_dev[args.selection_criterion]))

        best_dim = max(selection_results, key=lambda x: x[1])[0]
        picked_dimensions.append(best_dim)
        print(f"\tSelected dimension: {best_dim}")

        metrics_test = get_metrics(trainer=trainer, probe_family=probe_family,
                                   metrics=eval_metrics, select_dimensions=picked_dimensions)

        # Log to console
        print(f"\tIteration results: {metrics_test}")
        results.append({
            "iteration_dimension": best_dim,
            "dimensions": list(picked_dimensions),
            "metrics": metrics_test,
        })
        print()

        if args.wandb:
            wandb.log(metrics_test)

    # Add whole-vector results & log to wandb
    metrics_test = get_metrics(trainer=trainer, probe_family=probe_family, metrics=eval_metrics,
                               select_dimensions=None)
    results.append({
        "dimensions": "ALL",
        "metrics": metrics_test,
        "variational_weights": variational_weights,
    })
    if args.wandb:
        wandb.run.summary.update({f"{k}_ALL": v for k, v in metrics_test.items()})

    # Save to file
    if args.output_file:
        with open(args.output_file, "w") as h:
            json.dump(results, h)
        print(f"Saved experiment outputs to '{args.output_file}'.")