Merge pull request #3117 from lonvia/fix-assorted-search-errors
More improvements to the Python search algorithm
lonvia authored Jul 22, 2023
2 parents 9fc235d + 587698a commit 4a57863
Showing 5 changed files with 242 additions and 87 deletions.
78 changes: 40 additions & 38 deletions nominatim/api/search/db_search_builder.py
@@ -15,7 +15,6 @@
from nominatim.api.search.token_assignment import TokenAssignment
import nominatim.api.search.db_search_fields as dbf
import nominatim.api.search.db_searches as dbs
from nominatim.api.logging import log


def wrap_near_search(categories: List[Tuple[str, str]],
@@ -156,13 +155,22 @@ def build_housenumber_search(self, sdata: dbf.SearchData, hnrs: List[Token],
""" Build a simple address search for special entries where the
housenumber is the main name token.
"""
partial_tokens: List[int] = []
for trange in address:
partial_tokens.extend(t.token for t in self.query.get_partials_list(trange))
sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any')]

sdata.lookups = [dbf.FieldLookup('name_vector', [t.token for t in hnrs], 'lookup_any'),
dbf.FieldLookup('nameaddress_vector', partial_tokens, 'lookup_all')
]
partials = [t for trange in address
for t in self.query.get_partials_list(trange)]

if len(partials) != 1 or partials[0].count < 10000:
sdata.lookups.append(dbf.FieldLookup('nameaddress_vector',
[t.token for t in partials], 'lookup_all'))
else:
sdata.lookups.append(
dbf.FieldLookup('nameaddress_vector',
[t.token for t
in self.query.get_tokens(address[0], TokenType.WORD)],
'lookup_any'))

sdata.housenumbers = dbf.WeightedStrings([], [])
yield dbs.PlaceSearch(0.05, sdata, sum(t.count for t in hnrs))
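
The new branch guards against a degenerate case: when the address part boils down to a single partial term that occurs very often (count of 10000 or more), a 'lookup_all' over nameaddress_vector would scan a huge posting list, so the code falls back to the full-word tokens of that address part with 'lookup_any'. A minimal sketch of the decision, with a hypothetical Token stand-in and helper name (the real code works on query.Token objects):

from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class Token:        # hypothetical stand-in for the token class used above
    token: int      # id in the term index
    count: int      # estimated number of occurrences

def choose_addr_lookup(partials: List[Token],
                       full_words: List[Token]) -> Tuple[str, List[int], str]:
    # cheap case: several partials, or a single reasonably rare one
    if len(partials) != 1 or partials[0].count < 10000:
        return ('nameaddress_vector', [t.token for t in partials], 'lookup_all')
    # single very frequent partial: fall back to full words, any may match
    return ('nameaddress_vector', [t.token for t in full_words], 'lookup_any')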


@@ -187,69 +195,63 @@ def yield_lookups(self, name: TokenRange, address: List[TokenRange])\
be searched for. This takes into account how frequent the terms
are and tries to find a lookup that optimizes index use.
"""
penalty = 0.0 # extra penalty currently unused

penalty = 0.0 # extra penalty
name_partials = self.query.get_partials_list(name)
exp_name_count = min(t.count for t in name_partials)
addr_partials = []
for trange in address:
addr_partials.extend(self.query.get_partials_list(trange))
name_tokens = [t.token for t in name_partials]

addr_partials = [t for r in address for t in self.query.get_partials_list(r)]
addr_tokens = [t.token for t in addr_partials]

partials_indexed = all(t.is_indexed for t in name_partials) \
and all(t.is_indexed for t in addr_partials)
exp_count = min(t.count for t in name_partials)

if (len(name_partials) > 3 or exp_name_count < 1000) and partials_indexed:
# Lookup by name partials, use address partials to restrict results.
lookup = [dbf.FieldLookup('name_vector',
[t.token for t in name_partials], 'lookup_all')]
if addr_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
yield penalty, exp_name_count, lookup
if (len(name_partials) > 3 or exp_count < 1000) and partials_indexed:
yield penalty, exp_count, dbf.lookup_by_names(name_tokens, addr_tokens)
return

exp_addr_count = min(t.count for t in addr_partials) if addr_partials else exp_name_count
if exp_addr_count < 1000 and partials_indexed:
exp_count = min(exp_count, min(t.count for t in addr_partials)) \
if addr_partials else exp_count
if exp_count < 1000 and partials_indexed:
# Lookup by address partials and restrict results through name terms.
# Give this a small penalty because lookups in the address index are
# more expensive
yield penalty + exp_addr_count/5000, exp_addr_count,\
[dbf.FieldLookup('name_vector', [t.token for t in name_partials], 'restrict'),
dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all')]
yield penalty + exp_count/5000, exp_count,\
dbf.lookup_by_addr(name_tokens, addr_tokens)
return

# Partial term too frequent. Try looking up by rare full names first.
name_fulls = self.query.get_tokens(name, TokenType.WORD)
rare_names = list(filter(lambda t: t.count < 1000, name_fulls))
rare_names = list(filter(lambda t: t.count < 10000, name_fulls))
# At this point drop unindexed partials from the address.
# This might yield wrong results, nothing we can do about that.
if not partials_indexed:
addr_tokens = [t.token for t in addr_partials if t.is_indexed]
log().var_dump('before', penalty)
penalty += 1.2 * sum(t.penalty for t in addr_partials if not t.is_indexed)
log().var_dump('after', penalty)
if rare_names:
# Any of the full names applies with all of the partials from the address
lookup = [dbf.FieldLookup('name_vector', [t.token for t in rare_names], 'lookup_any')]
if addr_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))
yield penalty, sum(t.count for t in rare_names), lookup
yield penalty, sum(t.count for t in rare_names),\
dbf.lookup_by_any_name([t.token for t in rare_names], addr_tokens)

# To catch remaining results, look up by name and address
# We only do this if there is a reasonable number of results expected.
if min(exp_name_count, exp_addr_count) < 10000:
if exp_count < 10000:
if all(t.is_indexed for t in name_partials):
lookup = [dbf.FieldLookup('name_vector',
[t.token for t in name_partials], 'lookup_all')]
lookup = [dbf.FieldLookup('name_vector', name_tokens, 'lookup_all')]
else:
# we don't have the partials, try with the non-rare names
non_rare_names = [t.token for t in name_fulls if t.count >= 1000]
non_rare_names = [t.token for t in name_fulls if t.count >= 10000]
if not non_rare_names:
return
lookup = [dbf.FieldLookup('name_vector', non_rare_names, 'lookup_any')]
if addr_tokens:
lookup.append(dbf.FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all'))
yield penalty + 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens)),\
min(exp_name_count, exp_addr_count), lookup
penalty += 0.1 * max(0, 5 - len(name_partials) - len(addr_tokens))
if len(rare_names) == len(name_fulls):
# if there already was a search for all full tokens,
# avoid this if anything has been found
penalty += 0.25
yield penalty, exp_count, lookup


def get_name_ranking(self, trange: TokenRange) -> dbf.FieldRanking:
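
Taken together, yield_lookups now tries a fixed ladder of strategies, cheapest first, and returns as soon as an index-friendly lookup is possible. A condensed sketch of the control flow (thresholds as in the diff; lookup_ladder is a hypothetical name and the FieldLookup plumbing is elided):

from typing import List

def lookup_ladder(name_count: int, addr_count: int, n_name_partials: int,
                  partials_indexed: bool, n_rare_fulls: int) -> List[str]:
    # sketch only: names each strategy instead of building FieldLookups
    if (n_name_partials > 3 or name_count < 1000) and partials_indexed:
        return ['name partials via index, address restricts']
    exp_count = min(name_count, addr_count)
    if exp_count < 1000 and partials_indexed:
        return ['address partials via index, name restricts (small penalty)']
    plans = []
    if n_rare_fulls > 0:
        plans.append('rare full names via index, address restricts')
    if exp_count < 10000:
        plans.append('name partials (or frequent fulls) + address lookup_all')
    return plans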
31 changes: 31 additions & 0 deletions nominatim/api/search/db_search_fields.py
@@ -211,3 +211,34 @@ def set_ranking(self, rankings: List[FieldRanking]) -> None:
self.rankings.append(ranking)
else:
self.penalty += ranking.default


def lookup_by_names(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, 'lookup_all')]
if addr_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))

return lookup


def lookup_by_any_name(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where name tokens are looked up via index
and only one of the name tokens must be present.
Potential address tokens are used to restrict the search further.
"""
lookup = [FieldLookup('name_vector', name_tokens, 'lookup_any')]
if addr_tokens:
lookup.append(FieldLookup('nameaddress_vector', addr_tokens, 'restrict'))

return lookup


def lookup_by_addr(name_tokens: List[int], addr_tokens: List[int]) -> List[FieldLookup]:
""" Create a lookup list where address tokens are looked up via index
and the name tokens are only used to restrict the search further.
"""
return [FieldLookup('name_vector', name_tokens, 'restrict'),
FieldLookup('nameaddress_vector', addr_tokens, 'lookup_all')]
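
These three helpers replace lookup lists that db_search_builder.py previously assembled inline; each search strategy becomes a one-liner. Illustrative usage (the token ids are made up):

import nominatim.api.search.db_search_fields as dbf

name_tokens = [101, 102]   # illustrative token ids
addr_tokens = [201]

by_name = dbf.lookup_by_names(name_tokens, addr_tokens)     # all names must match
by_any  = dbf.lookup_by_any_name(name_tokens, addr_tokens)  # one rare name suffices
by_addr = dbf.lookup_by_addr(name_tokens, addr_tokens)      # scan the address index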
22 changes: 19 additions & 3 deletions nominatim/api/search/query.py
@@ -7,7 +7,7 @@
"""
Datastructures for a tokenized query.
"""
from typing import List, Tuple, Optional, NamedTuple, Iterator
from typing import List, Tuple, Optional, Iterator
from abc import ABC, abstractmethod
import dataclasses
import enum
@@ -107,13 +107,29 @@ def get_category(self) -> Tuple[str, str]:
category objects.
"""


class TokenRange(NamedTuple):
@dataclasses.dataclass
class TokenRange:
""" Indexes of query nodes over which a token spans.
"""
start: int
end: int

def __lt__(self, other: 'TokenRange') -> bool:
return self.end <= other.start


def __le__(self, other: 'TokenRange') -> bool:
return NotImplemented


def __gt__(self, other: 'TokenRange') -> bool:
return self.start >= other.end


def __ge__(self, other: 'TokenRange') -> bool:
return NotImplemented


def replace_start(self, new_start: int) -> 'TokenRange':
""" Return a new token range with the new start.
"""
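
The move from NamedTuple to dataclass replaces lexicographic tuple comparison with positional comparison: one range is smaller than another only when it ends at or before the other's start, overlapping ranges are unordered, and <= / >= are deliberately unsupported. For example:

from nominatim.api.search.query import TokenRange

a = TokenRange(0, 2)
b = TokenRange(2, 4)
c = TokenRange(1, 3)

assert a < b                        # a ends where b starts
assert b > a
assert not a < c and not a > c      # overlapping ranges have no order
# a <= b raises TypeError: __le__ and __ge__ return NotImplemented on purpose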
149 changes: 103 additions & 46 deletions nominatim/api/search/token_assignment.py
@@ -257,6 +257,97 @@ def recheck_sequence(self) -> bool:
return True


def _get_assignments_postcode(self, base: TokenAssignment,
query_len: int) -> Iterator[TokenAssignment]:
""" Yield possible assignments of Postcode searches with an
address component.
"""
assert base.postcode is not None

if (base.postcode.start == 0 and self.direction != -1)\
or (base.postcode.end == query_len and self.direction != 1):
log().comment('postcode search')
# <address>,<postcode> should give preference to address search
if base.postcode.start == 0:
penalty = self.penalty
self.direction = -1 # name searches are only possible backwards
else:
penalty = self.penalty + 0.1
self.direction = 1 # name searches are only possible forwards
yield dataclasses.replace(base, penalty=penalty)
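
The guard admits exactly two query shapes, postcode first or postcode last, and fixes the search direction accordingly. A compact restatement of the rule as a standalone sketch (hypothetical function; penalties as in the code above):

from typing import Optional, Tuple

def postcode_assignment(pc_start: int, pc_end: int, query_len: int,
                        direction: int, penalty: float) -> Optional[Tuple[float, int]]:
    # returns (penalty, new direction) or None if no assignment is possible
    if (pc_start == 0 and direction != -1) \
       or (pc_end == query_len and direction != 1):
        if pc_start == 0:
            return penalty, -1        # <postcode>,<address>: names read backwards
        return penalty + 0.1, 1       # <address>,<postcode>: prefer address search
    return None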


def _get_assignments_address_forward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
left-to-right reading.
"""
first = base.address[0]

log().comment('first word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=first, address=base.address[1:])

# To paraphrase:
# * if another name term comes after the first one and before the
# housenumber
# * a qualifier comes after the name
# * the containing phrase is strictly typed
if (base.housenumber and first.end < base.housenumber.start)\
or (base.qualifier and base.qualifier > first)\
or (query.nodes[first.start].ptype != qmod.PhraseType.NONE):
return

penalty = self.penalty

# Penalty for:
# * <name>, <street>, <housenumber> , ...
# * queries that are comma-separated
if (base.housenumber and base.housenumber > first) or len(query.source) > 1:
penalty += 0.25

for i in range(first.start + 1, first.end):
name, addr = first.split(i)
log().comment(f'split first word = name ({i - first.start})')
yield dataclasses.replace(base, name=name, address=[addr] + base.address[1:],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])


def _get_assignments_address_backward(self, base: TokenAssignment,
query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments of address searches with
right-to-left reading.
"""
last = base.address[-1]

if self.direction == -1 or len(base.address) > 1:
log().comment('last word = name')
yield dataclasses.replace(base, penalty=self.penalty,
name=last, address=base.address[:-1])

# To paraphrase:
# * if another name term comes before the last one and after the
# housenumber
# * a qualifier comes before the name
# * the containing phrase is strictly typed
if (base.housenumber and last.start > base.housenumber.end)\
or (base.qualifier and base.qualifier < last)\
or (query.nodes[last.start].ptype != qmod.PhraseType.NONE):
return

penalty = self.penalty
if base.housenumber and base.housenumber < last:
penalty += 0.4
if len(query.source) > 1:
penalty += 0.25

for i in range(last.start + 1, last.end):
addr, name = last.split(i)
log().comment(f'split last word = name ({i - last.start})')
yield dataclasses.replace(base, name=name, address=base.address[:-1] + [addr],
penalty=penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype])
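
Both helpers end with the same splitting loop: a multi-word first (or last) term is cut at every interior node, and TokenRange.split(i), judging by its use here, returns the sub-ranges left and right of node i. For a three-word leading phrase:

from nominatim.api.search.query import TokenRange

first = TokenRange(0, 3)             # three-word phrase at the query start
for i in range(first.start + 1, first.end):
    name, addr = first.split(i)      # left part becomes the name
    print(name, addr)                # (0,1)/(1,3), then (0,2)/(2,3)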


def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Yield possible assignments for the current sequence.
@@ -265,17 +356,13 @@ def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
"""
base = TokenAssignment.from_ranges(self.seq)

num_addr_tokens = sum(t.end - t.start for t in base.address)
if num_addr_tokens > 50:
return

# Postcode search (postcode-only search is covered in next case)
if base.postcode is not None and base.address:
if (base.postcode.start == 0 and self.direction != -1)\
or (base.postcode.end == query.num_token_slots() and self.direction != 1):
log().comment('postcode search')
# <address>,<postcode> should give preference to address search
if base.postcode.start == 0:
penalty = self.penalty
else:
penalty = self.penalty + 0.1
yield dataclasses.replace(base, penalty=penalty)
yield from self._get_assignments_postcode(base, query.num_token_slots())

# Postcode or country-only search
if not base.address:
@@ -286,49 +373,19 @@ def get_assignments(self, query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
# <postcode>,<address> should give preference to postcode search
if base.postcode and base.postcode.start == 0:
self.penalty += 0.1
# Use entire first word as name

# Left-to-right reading of the address
if self.direction != -1:
log().comment('first word = name')
yield dataclasses.replace(base, name=base.address[0],
penalty=self.penalty,
address=base.address[1:])

# Use entire last word as name
if self.direction == -1 or (self.direction == 0 and len(base.address) > 1):
log().comment('last word = name')
yield dataclasses.replace(base, name=base.address[-1],
penalty=self.penalty,
address=base.address[:-1])
yield from self._get_assignments_address_forward(base, query)

# Right-to-left reading of the address
if self.direction != 1:
yield from self._get_assignments_address_backward(base, query)

# variant for special housenumber searches
if base.housenumber:
yield dataclasses.replace(base, penalty=self.penalty)

# Use beginning of first word as name
if self.direction != -1:
first = base.address[0]
if (not base.housenumber or first.end >= base.housenumber.start)\
and (not base.qualifier or first.start >= base.qualifier.end):
for i in range(first.start + 1, first.end):
name, addr = first.split(i)
penalty = self.penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype]
log().comment(f'split first word = name ({i - first.start})')
yield dataclasses.replace(base, name=name, penalty=penalty,
address=[addr] + base.address[1:])

# Use end of last word as name
if self.direction != 1:
last = base.address[-1]
if (not base.housenumber or last.start <= base.housenumber.end)\
and (not base.qualifier or last.end <= base.qualifier.start):
for i in range(last.start + 1, last.end):
addr, name = last.split(i)
penalty = self.penalty + PENALTY_TOKENCHANGE[query.nodes[i].btype]
log().comment(f'split last word = name ({i - last.start})')
yield dataclasses.replace(base, name=name, penalty=penalty,
address=base.address[:-1] + [addr])



def yield_token_assignments(query: qmod.QueryStruct) -> Iterator[TokenAssignment]:
""" Return possible word type assignments to word positions.
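
End to end, nothing changes for callers: yield_token_assignments remains a generator over TokenAssignment objects, and a consumer ranks candidates by penalty. A hypothetical driver (query construction elided; ranked_assignments is not part of the module):

from nominatim.api.search.token_assignment import yield_token_assignments

def ranked_assignments(query, limit: int = 50):
    # collect the generated assignments and rank them, cheapest first
    return sorted(yield_token_assignments(query),
                  key=lambda assignment: assignment.penalty)[:limit]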
