-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_keywords.py
128 lines (97 loc) · 3.15 KB
/
find_keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import re
import nlpnet
import logging
# Directory one level above the directory that contains this file.
pln_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Point nlpnet at the 'pos-pt' directory under pln_dir. This is done before
# the tagger is built; presumably the tagger loads its model from there, and
# 'pos-pt' looks like a Portuguese POS model -- TODO confirm.
nlpnet.set_data_dir(os.path.join(pln_dir, u'pos-pt'))
# Single module-level tagger, created at import time and used by tag_text().
tagger = nlpnet.POSTagger()
def tag_text(text):
    """Add part-of-speech tags to the passed text.

    Every token is rendered as ``token/TAG`` and the tokens of all sentences
    returned by the tagger are joined with single spaces.

    Args:
        text (str): Text to be tagged.

    Return:
        str: Tagged text, or an empty string when ``text`` is empty,
            ``None``, whitespace-only, or when tagging fails (the failure
            is logged with its traceback).

    Example:
        >>> tag_text('Bom dia')
        'Bom/ADJ dia/N'
    """
    # Nothing to tag: do not bother the tagger with blank input.
    if not text or not text.strip():
        return ''
    try:
        # nlpnet's tagger is documented to return one list of (token, tag)
        # pairs per sentence; keep every sentence rather than only the first
        # so that multi-sentence input is not silently truncated.
        sentences = tagger.tag(text)
        return ' '.join(
            '{}/{}'.format(token, tag)
            for sentence in sentences
            for token, tag in sentence
        )
    except Exception:
        # Best effort by design: callers treat '' as "no tags available".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        logging.exception(u'Error tagging text: "%s"', text)
        return ''
def has_tags(word, tags):
    """Tell whether the tag of a ``token/TAG`` word is contained in ``tags``.

    The tag is whatever follows the last ``/`` in ``word``; a word without
    any ``/`` is compared as a whole.
    """
    _, _, suffix = word.rpartition('/')
    return suffix in tags
def flatten_list(items):
    """Flatten one nesting level: return the sub-items of every item, in order."""
    return [element for group in items for element in group]
def is_substring(substring, strings):
    """Return True when ``substring`` occurs inside a *different* string.

    A string equal to ``substring`` does not count; only strings that
    contain it and differ from it do.
    """
    return any(
        substring != candidate and substring in candidate
        for candidate in strings
    )
def find_entities(text):
    """Extract candidate entities from a text.

    The text is tagged with ``tag_text`` and the resulting ``token/TAG``
    string is scanned for noun runs (optionally followed by adjectives),
    proper-noun runs and participles, each optionally preceded by an
    adjective. Every non-empty capture group is a candidate; tags are
    removed and candidates that are proper substrings of another candidate
    are dropped.

    Args:
        text (str): Raw text to analyse.

    Return:
        list: Entities without tags, unique, in order of first appearance.
            Empty when nothing matches (including when tagging yields '').
    """
    tagged_text = tag_text(text)
    # The noun alternative requires a space after the last ``/N`` token, so a
    # noun that ends the tagged text could never match. Padding the input
    # with one trailing space lets it match; fragments are stripped below,
    # so the padding never leaks into the result.
    tagged_entities = re.findall(
        r'(\w+/ADJ (\w+/PREP)?)?'
        r'('
        r'((\w+/N ?)+ (\w+/ADJ ?)*)|'
        r'((\w+/NPROP ?)+)|'
        r'(\w+/PCP)'
        r')',
        tagged_text + ' '
    )
    if not tagged_entities:
        return []

    # findall() returns one tuple of groups per match; walk them in order and
    # de-duplicate by hand (instead of set()) so the output is deterministic.
    entities = []
    for fragment in flatten_list(tagged_entities):
        entity = re.sub(r'/\w+', '', fragment.strip())
        if entity and entity not in entities:
            entities.append(entity)

    # Inner groups repeat pieces of the larger matches: keep only the
    # candidates that are not contained in another candidate.
    return [ent for ent in entities if not is_substring(ent, entities)]
def get_closer(tagged_words):
    """Find the first verb (``word/V``) in a sequence of tagged words.

    Args:
        tagged_words (iterable): ``token/TAG`` strings, ordered from the
            nearest to the farthest position.

    Return:
        tuple: ``(distance, word)`` where ``distance`` is the 1-based
            position of the first item ending in ``/V`` and ``word`` is
            that item without the tag. ``(1000, None)`` when no item
            qualifies; 1000 acts as an "infinitely far" sentinel.
    """
    for distance, tagged_word in enumerate(tagged_words, 1):
        # Strip once and use the stripped form for both the test and the
        # slice; slicing the raw string would cut the wrong characters when
        # the item carries surrounding whitespace.
        candidate = tagged_word.strip()
        if candidate.endswith('/V'):
            return distance, candidate[:-2]
    return 1000, None
def _verb_slots(tagged_text):
    """Return one item per tagged token, reducing verb tokens to ``word/V``.

    A token counts as a verb when it contains ``\\w+/V`` (no end anchor, so
    any tag beginning with ``V`` qualifies). Other tokens are kept unchanged
    so that list positions reflect real token distances.
    """
    slots = []
    for token in tagged_text.split():
        verb = re.search(r'\w+/V', token)
        slots.append(verb.group(0) if verb else token)
    return slots


def find_intention(text, entities):
    """Find, for every entity, the verb closest to it in the text.

    The text is split around the entity; the part before its first
    occurrence and the part after its last occurrence are tagged
    separately, and the nearest verb on either side (measured in tokens)
    is taken as the intention. On equal distance the following verb wins.

    Args:
        text (str): Original text.
        entities (iterable): Entity strings, e.g. from ``find_entities``.

    Return:
        list: Verbs without tags, unique, in order of first appearance.
    """
    intentions = []
    for entity in entities:
        if not entity:
            # str.split('') raises ValueError, and an empty entity has no
            # surrounding context to inspect anyway.
            continue
        splited = text.split(entity)
        pre = tag_text(splited[0])
        pos = tag_text(splited[-1]) if len(splited) > 1 and splited[-1] else ''

        # Keep ALL tokens (not only the verbs) so get_closer() measures the
        # real token distance from the entity. With verb-only lists both
        # distances were always 1 and the preceding verb could never win.
        tagged_pre = _verb_slots(pre)
        tagged_pre.reverse()  # nearest token to the entity comes first
        tagged_pos = _verb_slots(pos)

        dist_pre, closer_pre = get_closer(tagged_pre)
        dist_pos, closer_pos = get_closer(tagged_pos)

        if closer_pre and closer_pos:
            intention = closer_pre if dist_pre < dist_pos else closer_pos
        else:
            # At most one side has a verb; None when neither does.
            intention = closer_pre or closer_pos or None

        # Ordered de-duplication keeps the output deterministic.
        if intention is not None and intention not in intentions:
            intentions.append(intention)
    return intentions
if __name__ == '__main__':
    # Small manual demo: extract the entities of a sample question, then the
    # intentions attached to them, and print both results.
    # Alternative sample questions:
    #   'Como verificar minha conta bancária'
    #   'Como faço uma venda por boleto bancário?'
    sample = 'Posso pagar minhas compras com pontos e reais no mesmo pedido?'
    found_entities = find_entities(sample)
    found_intentions = find_intention(sample, found_entities)
    for result in (found_entities, found_intentions):
        print(result)