-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmost_similars.py.save
86 lines (77 loc) · 4.08 KB
/
most_similars.py.save
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from gensim.models import Doc2Vec as d2v
from gensim.models import doc2vec
import gensim
import os
import collections
import smart_open
import random
import logging
from ast import literal_eval
from argparse import ArgumentParser as ap
from pdb import set_trace as st
def read_corpus(fname, tokens_only=False, pp=False):
    """Yield one document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as ISO-8859-1.

    Args:
        fname: Path of a text file holding one sentence/document per line.
        tokens_only: When true, yield bare token lists produced by
            ``gensim.utils.simple_preprocess`` (no tags attached).
        pp: Only consulted when ``tokens_only`` is false.  When true, the
            line is tokenized with ``simple_preprocess``; otherwise it is
            split on whitespace only, keeping case and punctuation.

    Yields:
        Either a list of tokens (``tokens_only``) or a
        ``doc2vec.TaggedDocument`` whose single tag is the zero-based line
        number.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            elif pp:
                # Training data, preprocessed tokens, tagged by line number.
                yield doc2vec.TaggedDocument(
                    gensim.utils.simple_preprocess(line), [i])
            else:
                # Training data, "raw" text.  ``words`` must be a sequence of
                # tokens: handing over the line string itself makes every
                # single character count as a word, which does not match the
                # whitespace-split tokens this script feeds to infer_vector.
                yield doc2vec.TaggedDocument(line.split(), [i])
# Send INFO-level (and above) log records, timestamped, to the default
# logging handler for the whole program.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


def _parse_train_flag(value):
    """Interpret the value given to ``--train`` as a boolean.

    Explicit negatives ("false", "0", "no", "off", "none", empty) disable
    training; any other value enables it, as every value used to.
    """
    return str(value).strip().lower() not in (
        "", "0", "false", "f", "no", "n", "off", "none")


# Command-line interface: exactly one of --indexes / --inlines selects the
# queries; --corpus and --model name the training text and the model file.
parser = ap(description="This script gives a ranking for ach sentence of a given list (list of 'indexes') or for each string inside a file ('inlines').")
parser.add_argument("--indexes", help="A list of indexes separed by comma.", metavar="indexes", default=None)
parser.add_argument("--corpus", help="A file containing train text a [sentence|document] per line.", metavar="corpus", default=None)
parser.add_argument("--inlines", help="A file containing a sentence by row.", metavar="inlines", default=None)
parser.add_argument("--model", help="A file containing the sentence embeddings model (gensim).", metavar="model", default=None)
parser.add_argument("--dims", help="Dimension of word embeddings.", metavar="dims", default='300', type=int)
parser.add_argument("--top", help="number of sentences returned for each ranking.", metavar="top", default=10, type=int)
# Without a converter, "--train False" produced the truthy string "False"
# and still triggered training.  The flag may now also be given bare
# ("--train"), which means True.
parser.add_argument("--train", help="Toggles whether a new model must be trained (unspecified=False).", metavar="train", default=False,
                    nargs="?", const=True, type=_parse_train_flag)
args = parser.parse_args()
# Run settings, partly taken from the command line.
train = False  # NOTE(review): never read in the visible script; args.train decides
train_file = args.corpus  # e.g. "/almac/ignacio/data/GUsDany/corpus/GUs_literature.txt"
dims = args.dims
top = args.top
win = 8
arch = 1  # "cbow" && "skipgram" = 1; "skipgram"=0
model_file = args.model  # e.g. "/almac/ignacio/data/GUsDany/d2v_raw_GUs-literature-wiki_H%d_W%d_A%s.model" % (dims, win, arch)

# Exactly one query source must be supplied.
if args.indexes and not args.inlines:
    # Indexes arrive 1-based on the command line; store them 0-based.
    centers = [int(i) - 1 for i in args.indexes.split(",")]
elif args.inlines and not args.indexes:
    import re
    import string

    # Delete every punctuation character except '-', then split on
    # whitespace.  A compiled regex is used because the two-argument
    # str.translate(None, chars) form only exists on Python 2 and raises
    # TypeError on Python 3.
    _punct_re = re.compile(
        "[%s]" % re.escape(string.punctuation.replace("-", "")))
    with open(args.inlines) as f:
        sents = [_punct_re.sub("", s).strip().split() for s in f]
else:
    print("Lines or list, not both...")
    # Misuse is an error: leave with a non-zero status, and do not depend on
    # the interactive-only exit() helper injected by the site module.
    raise SystemExit(1)
# Optionally train a fresh Doc2Vec model on the corpus and save it to
# model_file; in every case the model used below is (re)loaded from that file.
if args.train:
    train_corpus = list(read_corpus(train_file))
    model = d2v(size=dims, window=win, dbow_words=arch, min_count=1)
    model.build_vocab(train_corpus)
    # gensim 2.x/3.x refuse to train without an explicit epoch count; reuse
    # the one the model was configured with.
    # NOTE(review): `iter` matches the pre-4.0 API implied by `size=` above;
    # newer releases call it `epochs` -- confirm against the installed gensim.
    model.train(train_corpus, total_examples=model.corpus_count,
                epochs=model.iter)
    model.save(model_file)

m = d2v.load(model_file)
def _rank_filename(text):
    """Return "rank_" + the first 15 characters of ``text``, whitespace removed."""
    return "rank_%s" % "".join(text[0:15].strip().split())


def _write_ranking(path, query_text, similarities, corpus_lines):
    """Write one ranking file.

    Args:
        path: Output file name.
        query_text: Text of the query; at most 80 characters are shown.
        similarities: Iterable of ``(line_index, score)`` pairs.
        corpus_lines: Corpus lines, indexed by ``line_index``.
    """
    with open(path, "w") as out:
        out.write("# The most similars to this GU: \n# '%s'\n\n"
                  % query_text[0:80])
        for doc_id, score in similarities:
            out.write("%d\t%.5f\t%s\n"
                      % (doc_id, score, corpus_lines[doc_id][0:80]))


# Corpus lines are looked up by the indexes most_similar returns.  The line
# terminator is dropped so that a short line does not inject an extra newline
# into the header or into a ranking row.
with open(train_file) as f0:
    GUs = [row.rstrip("\r\n") for row in f0]

if args.indexes:
    # Queries are corpus lines selected by index.
    for gu_idx in centers:
        gu_similarities = m.docvecs.most_similar(gu_idx, topn=top)
        line = GUs[gu_idx]
        _write_ranking(_rank_filename(line), line, gu_similarities, GUs)
elif args.inlines:
    # Queries are token lists read from the --inlines file; a vector is
    # inferred for each one before ranking.
    for gu in sents:
        if not gu:
            # A blank input line carries no query.
            continue
        gu_similarities = m.docvecs.most_similar([m.infer_vector(gu)],
                                                 topn=top)
        # gu is a list of tokens: join it first so that both the file name
        # and the 80-character preview are cut from text, not from the list.
        text = " ".join(gu)
        _write_ranking(_rank_filename(text), text, gu_similarities, GUs)