-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquery_expansion.py
98 lines (76 loc) · 3.48 KB
/
query_expansion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import abc
import json
from abc import ABC
from typing import List, Dict
from collections import defaultdict
class QueryExpander(abc.ABC):
    """
    Abstract interface for expanding a query with related alternative terms.

    Concrete subclasses decide how the term -> alternatives reference is stored;
    this class only fixes the three operations every expander must provide.
    """

    @abc.abstractmethod
    def add_alternatives(self, term: str, alternatives: List[str]) -> None:
        """
        Register a term and its alternatives in the internal reference so that
        later queries can use them.

        :param term: The term to register.
        :param alternatives: The alternative terms associated with ``term``.
        """

    @abc.abstractmethod
    def get_alternatives(self, term: str) -> List[str]:
        """
        Look up the alternative terms known for a single term.

        :param term: The term to look up.
        :return: A list of alternative terms.
        """

    @abc.abstractmethod
    def process_query(self, terms: List[str]) -> Dict[str, List[str]]:
        """
        Produce the alternatives for each query term that has any.

        :param terms: A list of term tokens to generate alternative terms from.
        :return: A dictionary mapping each original term to a list of its
            alternative terms.
        """
class TermAlternativesSource(abc.ABC):
    """
    Abstract reader that loads alternative terms for use in query expansion.

    Subclasses implement :meth:`read` for one particular kind of source.
    """

    @abc.abstractmethod
    def read(self) -> QueryExpander:
        """
        Load the alternative terms held by this source.

        :return: An instance of QueryExpander with all the data loaded in.
        """
class ThesaurusQueryExpander(QueryExpander):
    """
    A QueryExpander backed by a dictionary that maps each term to its list of
    alternatives.
    """

    def __init__(self, term_alternatives: Dict[str, List[str]]):
        """
        :param term_alternatives: The term -> alternatives mapping to serve
            look-ups from. It is stored as given (not copied).
        """
        self.term_alternatives = term_alternatives

    def add_alternatives(self, term: str, alternatives: List[str]) -> None:
        """
        Store the alternatives for a term, replacing any existing entry.

        The call is a no-op when ``alternatives`` is not a list or is empty.

        :param term: The term to register.
        :param alternatives: The alternative terms associated with ``term``.
        """
        # Guard clause: anything that is not a non-empty list is ignored.
        if not isinstance(alternatives, list) or len(alternatives) == 0:
            return
        self.term_alternatives[term] = alternatives

    def get_alternatives(self, term: str) -> List[str]:
        """
        Return the alternatives stored for a term.

        :param term: The term to look up.
        :return: The stored list of alternatives, or a new empty list when the
            term is unknown.
        """
        no_match: List[str] = []
        return self.term_alternatives.get(term, no_match)

    def process_query(self, query_terms: List[str]) -> Dict[str, List[str]]:
        """
        Map every query term that has stored alternatives to those alternatives.

        :param query_terms: The term tokens of the query.
        :return: A ``defaultdict(list)`` keyed by the query terms that have
            alternatives; terms without alternatives are not added to it.
        """
        expansions = defaultdict(list)
        # Only terms present in the thesaurus are copied over, in query order.
        expansions.update(
            (token, self.term_alternatives[token])
            for token in query_terms
            if token in self.term_alternatives
        )
        return expansions
class JsonlThesaurusTermAlternativesSource(TermAlternativesSource):
    """
    A source reader for loading in thesaurus data in JSONL format.

    Every non-blank line of the file is parsed as one JSON record, from which
    the ``'term'`` and ``'syns'`` entries are read.
    """

    def __init__(self, file_path: str):
        """
        :param file_path: The filename and path to read data from.
        """
        self.file_path = file_path

    def read(self) -> QueryExpander:
        """
        Load the thesaurus records from ``file_path`` into a query expander.

        Records whose ``'syns'`` value is empty are skipped. When the same term
        appears on several lines, the last record wins.

        :return: A ThesaurusQueryExpander holding all the loaded data.
        :raises OSError: If the file cannot be opened.
        :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
        :raises KeyError: If a record has no ``'syns'`` entry, or has synonyms
            but no ``'term'`` entry.
        """
        alternatives = defaultdict(list)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(self.file_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                # Tolerate blank / whitespace-only lines (e.g. a stray empty
                # line at the end of the file) instead of failing to parse them.
                if not line.strip():
                    continue
                record = json.loads(line)
                if record['syns']:  # Keep the record only if it has synonyms.
                    alternatives[record['term']] = record['syns']
        return ThesaurusQueryExpander(alternatives)