make_stage.py
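"""Build a TF-IDF content-word vocabulary from the training split and
extract those vocabulary words from every example in every split.

Usage, via the Google Fire entry point at the bottom of the file (the
0.5 below is only an illustrative coverage rate):

    python make_stage.py --rate 0.5 --dataset wp
"""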
import pickle

import fire
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
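
# Expected on-disk layout (inferred from get_texts below, not specified
# anywhere else): data/<dataset>/{train,dev,test}.pickle, each a list of
# dicts with 'condition' and 'text' keys, where 'text' separates
# paragraphs with '\n'.
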
def get_texts(dataset):
    """Load the pickled splits and collect each example's condition and text."""
    conds, texts = {}, {}
    for split in ['train', 'dev', 'test']:
        print(f'loading {split} set...')
        with open(f'data/{dataset}/{split}.pickle', 'rb') as f:
            examples = pickle.load(f)
        conds[split], texts[split] = [], []
        for example in examples:
            conds[split].append(example['condition'])
            texts[split].append(example['text'])
    return conds, texts
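
# get_vocab scores every word by its mean nonzero TF-IDF value across
# the training texts, then keeps the highest-scoring words until their
# occurrences cover `rate` of all analyzed training tokens.
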
def get_vocab(texts, rate):
    vectorizer = TfidfVectorizer(min_df=5, stop_words='english')
    features = vectorizer.fit_transform(texts).tocsc()
    # get_feature_names() was removed in scikit-learn 1.2;
    # get_feature_names_out() is its replacement.
    vocab = vectorizer.get_feature_names_out()
    analyzer = vectorizer.build_analyzer()
    # Invert sklearn's smoothed idf = ln((1 + n) / (1 + df)) + 1 to
    # recover each word's document frequency df.
    df = (len(texts) + 1) / np.exp(vectorizer.idf_ - 1) - 1
    word_value_list = []
    for i, word in enumerate(vocab):
        # Sanity check: the recovered df must equal the number of texts
        # with a nonzero TF-IDF entry for this word.
        assert len(features[:, i].data) == int(round(df[i]))
        word_value_list.append(
            [word, np.mean(features[:, i].data), len(features[:, i].data)])
    word_value_list.sort(key=lambda t: t[1], reverse=True)
    total = sum(len(analyzer(text)) for text in texts)
    word_counter = {word: 0 for word in vocab}
    for text in texts:
        for word in analyzer(text):
            if word in word_counter:
                word_counter[word] += 1
    cnt = 0
    result_list = []
    for i, (word, _, _) in enumerate(word_value_list):
        result_list.append(word)
        cnt += word_counter[word]
        if cnt / total > rate:
            print(f'{i + 1} words cover {cnt / total:.4f} of the tokens.')
            break
    return result_list, analyzer
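
# main builds the vocabulary on the training split only, then writes,
# per split, a pickle of extracted examples plus a human-readable .txt
# log, alongside the vocabulary pickle itself.
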
def main(rate, dataset='wp'):
    assert dataset in ['cnn', 'wp']
    conds, texts = get_texts(dataset)
    vocab, analyzer = get_vocab(texts['train'], rate=rate)
    with open(f'data/{dataset}/vocab_{rate}.pickle', 'wb') as f:
        pickle.dump(vocab, f)
    vocab_dict = {word: 1 for word in vocab}  # dict used as a membership set
    for split in ['train', 'dev', 'test']:
        print(f'extracting {split} set...')
        examples = []
        for cond, text in zip(conds[split], texts[split]):
            # Keep only vocabulary words, paragraph by paragraph.
            extracted_paras = []
            for para in text.split('\n'):
                extracted_paras.append(' '.join(
                    word for word in analyzer(para) if word in vocab_dict))
            examples.append({
                'condition': cond,
                'extracted_text': '\n'.join(extracted_paras),
                'original_text': text,
            })
        with open(f'data/{dataset}/extracted_{split}_{rate}words.pickle',
                  'wb') as f:
            pickle.dump(examples, f)
        # Also write a human-readable log of the extracted examples.
        with open(f'data/{dataset}/extracted_{split}_{rate}words.txt',
                  'w') as log_file:
            for example in examples:
                print('CONDITION:{}\n\nEXTRACTED:\n{}\n\nORIGINAL TEXT:\n{}'.format(
                    example['condition'],
                    example['extracted_text'], example['original_text']),
                    file=log_file)
                print('=' * 100, '\n\n', file=log_file)

if __name__ == '__main__':
    fire.Fire(main)