-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassifier.py
64 lines (53 loc) · 1.69 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A sentence classifier based on word similarity."""
__author__ = 'Chong Guo <armourcy@gmail.com>'
__copyright__ = 'Copyright 2018, Chong Guo'
__license__ = 'MIT'
import nltk
from nltk.corpus import wordnet
from rake_nltk import Rake
def classifier(sentence, category):
    """Score each category by WordNet similarity to the nouns in a sentence.

    Keyword phrases are extracted from ``sentence`` with RAKE, tokenized and
    POS-tagged; every token whose tag starts with ``'NN'`` is kept as a
    keyword. For each keyword that has at least one noun synset, the WordNet
    path similarity between its first noun synset and the first noun synset
    of every category key is added to that category's score.

    Args:
        sentence: The text to classify.
        category: Dict mapping a category keyword to its accumulated score
            so far. It is updated in place, so existing values are kept and
            added to.

    Returns:
        The same ``category`` dict object, with the similarity scores for
        this sentence added. Category keys that have no noun synset in
        WordNet are left at their incoming score.

    Note:
        Calls ``nltk.download`` for the required resources on every
        invocation.
    """
    # download data
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')

    # init category
    # Pair each category key directly with its first noun synset. Keying a
    # lookup table by synset name instead would let two keys that share a
    # synset overwrite each other, crediting one of them twice and the other
    # never. Keys without any noun sense are skipped rather than raising
    # IndexError on the empty synset list.
    category_synsets = []
    for key in category:
        senses = wordnet.synsets(key, pos=wordnet.NOUN)
        if senses:
            category_synsets.append((key, senses[0]))

    # extract keywords
    rake = Rake()
    rake.extract_keywords_from_text(sentence)
    phrases = rake.get_ranked_phrases_with_scores()

    # compute keywords score
    # NOTE: the accumulated RAKE scores are only used to collect the set of
    # noun keywords; they are not applied as weights below.
    keywords = {}
    for score, phrase in phrases:
        for token, tag in nltk.pos_tag(nltk.word_tokenize(phrase)):
            if tag.startswith('NN'):
                keywords[token] = keywords.get(token, 0) + score

    # compute similarity
    res = category  # intentionally the caller's dict: scores accumulate in place
    for keyword in keywords:
        senses = wordnet.synsets(keyword, pos=wordnet.NOUN)
        if not senses:
            continue
        for key, synset in category_synsets:
            similarity = wordnet.path_similarity(synset, senses[0])
            # path_similarity yields None when no connecting path exists;
            # adding None to a number would raise TypeError.
            if similarity is not None:
                res[key] = res.get(key, 0) + similarity

    # return accumulative probability for this category
    return res
# Test
# Each category keyword starts with an accumulative probability of 0.
category = dict.fromkeys(
    ('sports', 'food', 'movie', 'technique', 'travel'),
    0,
)
category = classifier('What sports do you like the best and why?', category)
print(category)