-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathQueryParser.py
70 lines (56 loc) · 2.5 KB
/
QueryParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 20 17:30:16 2017
@author: kalifou, portelas
"""
from TextRepresenter import PorterStemmer
from ParserCACM import ParserCACM
import numpy as np
class Query(object):
    """Value holder for one query and the data attached to it.

    Attributes:
        query_id: identifier of the query.
        query_text: raw text of the query.
        query_tf: representation of the query text (presumably term
            frequencies, judging by the name -- confirm against the caller).
        relevant_docs: relevance information for the query, or None when not
            supplied. QueryParser.nextQuery passes a NumPy array whose rows
            are [doc_id, None, None].
    """

    def __init__(self, query_id, query_text, query_tf, relevant_docs=None):
        """Store the four fields unchanged; no validation or copying is done."""
        self.relevant_docs = relevant_docs
        self.query_tf = query_tf
        self.query_text = query_text
        self.query_id = query_id

    def getId(self):
        """Return the identifier of the query."""
        return self.query_id

    def getText(self):
        """Return the raw text of the query."""
        return self.query_text

    def getTf(self):
        """Return the stored representation of the query text."""
        return self.query_tf

    def getRelevantDocs(self):
        """Return the relevance information given at construction (may be None)."""
        return self.relevant_docs
class QueryParser(object):
    """Read queries from a file and attach their relevance judgements.

    Queries are pulled one at a time with nextQuery(). Each one is returned
    as a Query carrying its id, its text, the representation produced by the
    PorterStemmer, and the list of documents judged relevant to it.
    """

    def __init__(self, query_file, relevance_file):
        """Open the query source and load every relevance judgement.

        Args:
            query_file: path of the file holding the queries.
            relevance_file: path of a text file with one judgement per line,
                as whitespace-separated fields: the query id first, the
                relevant document id second. Extra fields are ignored.

        Raises:
            IOError/OSError: if either file cannot be opened.
            ValueError, IndexError: if a non-blank relevance line does not
                start with two integer fields.
        """
        # Handle on the query file; this class never reads from it, it is only
        # kept as an attribute and released by close().
        self.query = open(query_file, 'r')
        # Flag making close() idempotent.
        self.already_closed = False
        try:
            self.textRepresenter = PorterStemmer()
            # Create parser to read query_file.
            # WARNING: will only work on the CACM dataset. TODO: find a
            # dataset-agnostic solution.
            self.parser = ParserCACM()
            self.parser.initFile(query_file)
            # Dictionary {query_id: list of relevant documents}.
            self.relevant_docs = self._load_relevance(relevance_file)
        except Exception:
            # Do not leak the query handle when construction fails half-way.
            self.close()
            raise

    @staticmethod
    def _load_relevance(relevance_file):
        """Return {query_id: [[doc_id, None, None], ...]} read from relevance_file."""
        relevant_docs = {}
        with open(relevance_file, 'r') as f:
            for line in f:
                # split() with no argument copes with tabs, repeated spaces
                # and the trailing newline, unlike split(" ").
                fields = line.split()
                if not fields:
                    continue  # blank line
                query_id = int(fields[0])
                # A list is added per relevant doc for later use of the
                # couple (themes, score).
                relevant_docs.setdefault(query_id, []).append(
                    [int(fields[1]), None, None])
        return relevant_docs

    def close(self):
        """Close the query file handle; calling it more than once is harmless."""
        if not self.already_closed:
            self.query.close()
            self.already_closed = True

    def nextQuery(self):
        """Return the next Query object, or -1 once the queries are exhausted.

        The query file handle is closed the first time the end is reached.
        Queries without any relevance judgement get a single placeholder row
        [None, None, None].
        """
        query_data = self.parser.nextDocument()
        if query_data is None:
            self.close()
            return -1
        query_id = query_data.getId()
        query_text = query_data.getText()
        query_tf = self.textRepresenter.getTextRepresentation(query_text)
        # Relevance ids are stored as int keys, hence the conversion.
        relevant_docs_to_query = np.array(
            self.relevant_docs.get(int(query_id), [[None, None, None]]))
        return Query(query_id, query_text, query_tf, relevant_docs_to_query)