-
Notifications
You must be signed in to change notification settings - Fork 0
/
FindKeyWords.py
25 lines (18 loc) · 1002 Bytes
/
FindKeyWords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from PDFExtract import pdf_to_text
# Extract candidate keywords as single words (not multi-word phrases).
def extract_candidate_words(text, good_tags=frozenset(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])):
    """Return the lowercased candidate keywords found in *text*.

    The text is split into sentences, each sentence is tokenized and
    POS-tagged with NLTK, and a token is kept when all of these hold:

    * its POS tag is in ``good_tags``,
    * its lowercase form is not an English stop word,
    * it does not consist entirely of punctuation characters.

    Args:
        text: The raw text to scan. Empty or ``None`` input yields ``[]``.
        good_tags: Collection of POS tags to keep; only membership tests
            are performed on it. The default covers the adjective
            (``JJ*``) and noun (``NN*``) tags.

    Returns:
        A list of lowercased words in document order. Duplicates are kept;
        wrap the result in ``set()`` to deduplicate.

    Note:
        Non-empty input requires the NLTK data used by ``sent_tokenize``,
        ``word_tokenize``, ``pos_tag_sents`` and the ``stopwords`` corpus
        to be installed.
    """
    # Nothing to tokenize: return before NLTK and its data are needed.
    if not text:
        return []

    import itertools
    import logging
    import string

    import nltk

    # Candidates that are stop words or entirely punctuation are excluded.
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Tokenize and POS-tag sentence by sentence, then flatten the
    # per-sentence lists into one stream of (word, tag) pairs.
    tagged_words = itertools.chain.from_iterable(
        nltk.pos_tag_sents(
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)
        )
    )

    # Filter on POS tag and lowercase every kept word (lowercased once).
    candidates = []
    for word, tag in tagged_words:
        if tag not in good_tags:
            continue
        lowered = word.lower()
        if lowered in stop_words:
            continue
        if all(char in punct for char in word):
            continue
        candidates.append(lowered)

    # Debug output goes through logging instead of printing to stdout.
    logging.getLogger(__name__).debug("candidate words: %s", candidates)
    return candidates
# Example usage:
# text = pdf_to_text('pdf/Database_02_lecture_2.pdf')
# keywords = set(extract_candidate_words(text))