-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
58 lines (39 loc) · 1.28 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
from subprocess import check_output
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
def data_dir(name=None):
    """Return the path of the ``data`` directory beside this file, or of an entry in it.

    Args:
        name: Optional entry name appended below ``data``. A falsy value
            (``None`` or ``""``) is joined as an empty component, so the
            returned directory path ends with a path separator.

    Returns:
        The joined path as a string. This file's own location is resolved
        with ``os.path.realpath`` before its directory is taken.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    leaf = name if name else ""
    return os.path.join(here, "data", leaf)
def path_to_key(path):
    """Derive a document key from a path.

    The key is the final path component with its last extension removed,
    converted to lower case (``"/a/b/Foo.TXT"`` -> ``"foo"``).
    """
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem.lower()
def load_txt(path):
    """Load a plain-text file as a ``TaggedDocument``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding; ``load_pdf`` decodes its text as UTF-8
    as well.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``TaggedDocument`` whose ``words`` are the NLTK word tokens of the
        file content and whose single tag is ``path_to_key(path)``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    # Tokenize after the file is closed; only the read needs the handle.
    return TaggedDocument(
        words=nltk.tokenize.word_tokenize(text),
        tags=[path_to_key(path)],
    )
def load_pdf(path):
    """Load a PDF as a ``TaggedDocument`` by running the ``pdftotext`` program.

    Args:
        path: Path of the PDF file, passed to ``pdftotext`` as its input.

    Returns:
        A ``TaggedDocument`` whose ``words`` are the NLTK word tokens of the
        captured program output and whose single tag is ``path_to_key(path)``.

    Raises:
        subprocess.CalledProcessError: If ``pdftotext`` exits with a non-zero
            status (raised by ``check_output``).
        UnicodeDecodeError: If the captured output is not valid UTF-8.
    """
    # "-" is given as the output argument; check_output captures whatever the
    # program writes to standard output.
    raw = check_output(["pdftotext", path, "-"])
    tokens = nltk.tokenize.word_tokenize(raw.decode("utf-8"))
    key = path_to_key(path)
    return TaggedDocument(words=tokens, tags=[key])
# Dispatch table: file extension (leading dot included, compared exactly as
# written, so matching is case-sensitive) -> loader function for that format.
EXT_MAP = {
    ".txt": load_txt,
    ".pdf": load_pdf,
}
def ext_of(p):
    """Return the last extension of path ``p``, including the leading dot.

    The extension is returned with its original letter case; a path without
    an extension yields the empty string.
    """
    _root, suffix = os.path.splitext(p)
    return suffix
def load_file(path):
    """Load a file with the loader registered for its extension.

    Args:
        path: Path of the file to load.

    Returns:
        Whatever the selected loader in ``EXT_MAP`` returns for ``path``.

    Raises:
        KeyError: If the file's extension has no entry in ``EXT_MAP``.
    """
    return EXT_MAP[ext_of(path)](path)
def data_from_dir(path):
    """Collect the loadable documents found directly inside a directory.

    Args:
        path: Directory to list. Only its immediate entries are considered.

    Returns:
        A ``(documents, count)`` tuple. ``documents`` is a generator that
        loads each entry whose extension is a key of ``EXT_MAP`` when it is
        iterated; ``count`` is the number of such entries.
    """
    supported = [entry for entry in os.listdir(path) if ext_of(entry) in EXT_MAP]
    # Loading stays lazy: nothing is read until the generator is consumed.
    documents = (load_file(os.path.join(path, entry)) for entry in supported)
    return documents, len(supported)
def model_from_dir(path):
    """Build a ``Doc2Vec`` model from the supported documents in a directory.

    Args:
        path: Directory whose documents are loaded via ``data_from_dir``.

    Returns:
        The ``Doc2Vec`` instance constructed from those documents with
        ``vector_size=5``, ``window=2`` and ``min_count=1``.
    """
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to a single worker instead of passing None through.
    worker_count = os.cpu_count() or 1
    documents, _count = data_from_dir(path)
    # A generator can be consumed only once, so hand Doc2Vec a list
    # (gensim is expected to pass over the corpus more than once).
    corpus = list(documents)
    return Doc2Vec(
        corpus,
        vector_size=5,
        window=2,
        min_count=1,
        workers=worker_count,
    )