# go over all the CVPR2019 paper PDFs in content/, get all the words from each,
# discard stop words, count the frequencies of the remaining words, retain the
# top 100 for each PDF, and dump a pickle of the results into topwords.p
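# usage sketch (assumed setup, not spelled out in the script itself): a
# stopwords.txt file sits next to this script, the paper PDFs live under
# content/, and the pdftotext CLI (e.g. from poppler-utils) is on the PATH:
#   python pdftowordcloud.py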
import os
from string import punctuation
from operator import itemgetter
import re
import pickle

N = 100  # how many top words to retain

# load the stop words (i.e. common, boring words that we will ignore)
with open("stopwords.txt", "r") as f:
    stopwords = f.read().split()
stopwords = [x.strip(punctuation) for x in stopwords if len(x) > 2]
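# stopwords.txt is assumed to be plain text with whitespace-separated words;
# strip(punctuation) cleans leading/trailing punctuation, e.g. "can't," -> "can't"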

# get the list of all PDFs
relpath = "content/"
allFiles = os.listdir(relpath)
pdfs = [x for x in allFiles if x.endswith(".pdf")]

# go over every PDF, use pdftotext to get all its words, discard boring ones, and count frequencies
topdict = {}  # dict of paperid -> [(word, frequency), ...]
for i, f in enumerate(pdfs):
    # filenames are assumed to end in the 20-char suffix "_CVPR_2019_paper.pdf";
    # stripping it leaves the paper id
    paperid = f[:-20]
    fullpath = relpath + f
    print("processing %s, %d/%d" % (paperid, i + 1, len(pdfs)))
    # convert the PDF to a plain text file with the pdftotext CLI
    cmd = 'pdftotext "%s" %s' % (fullpath, "out.txt")
    print("EXEC: " + cmd)
    os.system(cmd)
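    # a hedged alternative (a sketch, not the original approach): subprocess
    # avoids shell quoting issues entirely and fails loudly on a bad PDF:
    #   import subprocess
    #   subprocess.run(["pdftotext", fullpath, "out.txt"], check=True)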
    with open('out.txt', 'r', errors='ignore') as f:
        txtlst = f.read().split()  # all whitespace-separated tokens in one giant list
    words = [x.lower() for x in txtlst if re.match(r'^[\w-]+$', x)]  # keep only word-like tokens (letters, digits, _, -)
    words = [x for x in words if len(x) > 2 and x not in stopwords]  # remove short words and stop words
    # count up the frequencies of all words
    wcount = {}
    for w in words:
        wcount[w] = wcount.get(w, 0) + 1
    top = sorted(wcount.items(), key=itemgetter(1), reverse=True)[:N]  # sort by count and take the top N
    topdict[paperid] = top  # save to our dict
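    # equivalently (a sketch): collections.Counter(words).most_common(N) would
    # produce the same top-N list, modulo the ordering of tied counts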

# dump the results to a pickle
with open("topwords.p", "wb") as f:
    pickle.dump(topdict, f)
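
# to inspect the results later (a sketch; "some_paperid" is a hypothetical key):
#   topwords = pickle.load(open("topwords.p", "rb"))
#   print(topwords[some_paperid])  # list of (word, count) pairs, most frequent first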