'''
@author: aiman.najjar
Functions that are commonly used across the project
'''
import operator
import constants
import sys
import logging
import re
from HTMLParser import HTMLParser
from PorterStemmer import PorterStemmer
'''
MLStripper:
An implementation of the HTMLParser class that returns only useful terms and discards other markup.
The initial skeleton of this implementation was taken from the following StackOverflow question and modified for our needs:
http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
'''
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
        self.currentTag = ""
        self.currentAttrs = []

    def handle_starttag(self, tag, attrs):
        self.currentTag = tag
        self.currentAttrs = attrs

    def handle_endtag(self, tag):
        self.currentTag = ""
        self.currentAttrs = []

    def handle_data(self, d):
        # Keep text only if it is not inside an ignored tag and does not
        # contain a URL-like token
        if self.currentTag not in constants.IGNORE_TAGS:
            res = re.match(r"(.*http.*)", d.lower())
            if not res:
                self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)
# Convenient helper to quickly invoke our special HTML parser
def strip_tags(html):
    s = MLStripper()
    try:
        html = html.decode('UTF-8')
    except UnicodeDecodeError:
        # Fall back to the raw string if it cannot be decoded
        pass
    s.feed(html)
    return s.get_data()
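
# Usage sketch (illustrative only -- the exact output depends on which tags are
# listed in constants.IGNORE_TAGS; text chunks containing "http" are dropped):
#
#   strip_tags("<html><body><p>relevance <b>feedback</b></p></body></html>")
#   # -> "relevance feedback"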
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
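
# e.g. is_number("42") -> True, is_number("3.14") -> True, is_number("abc") -> False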
'''
getTopTerms:
Given the current query and the new query vector, return the highest-scoring terms (default: 2 terms).
The current query is used to ensure that the returned terms are actually new.
'''
def getTopTerms(currentQuery, weightsMap, topX=2):
    p = PorterStemmer()
    current_terms = []
    # for term in currentQuery.split():
    #     term = p.stem(term.lower(), 0, len(term) - 1)
    #     current_terms.append(term)
    i = 0
    terms = []
    # Walk the terms in descending weight order, skipping stop terms and
    # terms whose stem has already been selected
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        if term in constants.QUERY_SKIP_TERMS or p.stem(term.lower(), 0, len(term) - 1) in current_terms:
            continue
        terms.append(term)
        current_terms.append(p.stem(term.lower(), 0, len(term) - 1))
        i += 1
        if topX != 'ALL' and i >= topX:
            break
    return terms
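
# Usage sketch with hypothetical weights (assumes none of these terms appear in
# constants.QUERY_SKIP_TERMS):
#
#   weights = {"gates": 0.91, "microsoft": 0.77, "bill": 0.45}
#   getTopTerms("bill gates", weights, topX=2)
#   # -> ["gates", "microsoft"]   (the two highest-weighted terms)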
'''
printWeights:
Given the new query vector, print out the highest-scoring terms (default: 10 terms).
Used for debugging purposes only.
'''
def printWeights(weightsMap, topX=10):
    i = 0
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        if term in constants.STOP_WORDS_LIST:
            continue
        print "%-10s: %10f" % (term, weightsMap[term])
        i += 1
        if topX != 'ALL' and i >= topX:
            break
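
# Minimal manual check (illustrative sketch; assumes constants.py defines
# IGNORE_TAGS, QUERY_SKIP_TERMS and STOP_WORDS_LIST as used above, and that the
# sample terms below are not stop words):
if __name__ == '__main__':
    sample = "<html><body><h1>Search</h1><p>relevance feedback demo</p></body></html>"
    print strip_tags(sample)
    print is_number("3.14"), is_number("pi")
    printWeights({"relevance": 0.8, "feedback": 0.6, "demo": 0.1})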