Skip to content

Commit

Permalink
Merge pull request #3122 from miku0/sanitizer-final
Browse files Browse the repository at this point in the history
Adds sanitizer for Japanese addresses to correspond to block address
  • Loading branch information
lonvia authored Aug 1, 2023
2 parents c29ffc3 + 67e1c7d commit 252fe42
Show file tree
Hide file tree
Showing 6 changed files with 247 additions and 11 deletions.
8 changes: 8 additions & 0 deletions docs/customize/Tokenizers.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim.
rendering:
heading_level: 6

#### tag-japanese

::: nominatim.tokenizer.sanitizers.tag_japanese
selection:
members: False
rendering:
heading_level: 6

#### Token Analysis

Token analyzers take a full name and transform it into one or more normalized
Expand Down
117 changes: 117 additions & 0 deletions nominatim/tokenizer/sanitizers/tag_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2022 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This sanitizer maps OSM data to Japanese block addresses.
It replaces blocknumber and housenumber with housenumber,
and quarter and neighbourhood with place.
"""


from typing import Callable
from typing import List, Optional

from nominatim.tokenizer.sanitizers.base import ProcessInfo
from nominatim.tokenizer.sanitizers.config import SanitizerConfig
from nominatim.data.place_name import PlaceName

def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]:
"""Set up the sanitizer
"""
return tag_japanese

def reconbine_housenumber(
new_address: List[PlaceName],
tmp_housenumber: Optional[str],
tmp_blocknumber: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of housenumber by using housenumber and blocknumber
"""
if tmp_blocknumber and tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=f'{tmp_blocknumber}-{tmp_housenumber}',
suffix=''
)
)
elif tmp_blocknumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_blocknumber,
suffix=''
)
)
elif tmp_housenumber:
new_address.append(
PlaceName(
kind='housenumber',
name=tmp_housenumber,
suffix=''
)
)
return new_address

def reconbine_place(
new_address: List[PlaceName],
tmp_neighbourhood: Optional[str],
tmp_quarter: Optional[str]
) -> List[PlaceName]:
""" Recombine the tag of place by using neighbourhood and quarter
"""
if tmp_neighbourhood and tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=f'{tmp_quarter}{tmp_neighbourhood}',
suffix=''
)
)
elif tmp_neighbourhood:
new_address.append(
PlaceName(
kind='place',
name=tmp_neighbourhood,
suffix=''
)
)
elif tmp_quarter:
new_address.append(
PlaceName(
kind='place',
name=tmp_quarter,
suffix=''
)
)
return new_address
def tag_japanese(obj: ProcessInfo) -> None:
"""Recombine kind of address
"""
if obj.place.country_code != 'jp':
return
tmp_housenumber = None
tmp_blocknumber = None
tmp_neighbourhood = None
tmp_quarter = None

new_address = []
for item in obj.address:
if item.kind == 'housenumber':
tmp_housenumber = item.name
elif item.kind == 'block_number':
tmp_blocknumber = item.name
elif item.kind == 'neighbourhood':
tmp_neighbourhood = item.name
elif item.kind == 'quarter':
tmp_quarter = item.name
else:
new_address.append(item)

new_address = reconbine_housenumber(new_address, tmp_housenumber, tmp_blocknumber)
new_address = reconbine_place(new_address, tmp_neighbourhood, tmp_quarter)

obj.address = [item for item in new_address if item.name is not None]
22 changes: 11 additions & 11 deletions settings/icu-rules/unicode-digits-to-decimal.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰] > 0"
- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵] > 1"
- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶] > 2"
- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷] > 3"
- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸] > 4"
- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹] > 5"
- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺] > 6"
- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻] > 7"
- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼] > 8"
- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽] > 9"
- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'"
- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰] > 0"
- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵] > 1"
- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶] > 2"
- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷] > 3"
- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸] > 4"
- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹] > 5"
- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺] > 6"
- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻] > 7"
- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼] > 8"
- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽] > 9"
- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'"
- "[⑪⑾⒒⓫] > '11'"
- "[⑫⑿⒓⓬] > '12'"
- "[⑬⒀⒔⓭] > '13'"
Expand Down
1 change: 1 addition & 0 deletions settings/icu_tokenizer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ sanitizers:
whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi]
use-defaults: all
mode: append
- step: tag-japanese
token-analysis:
- analyzer: generic
- id: "@housenumber"
Expand Down
30 changes: 30 additions & 0 deletions test/bdd/db/query/japanese.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
@DB
Feature: Searches in Japan
Test specifically for searches of Japanese addresses and in Japanese language.
@fail-legacy
Scenario: A block house-number is parented to the neighbourhood
Given the grid with origin JP
| 1 | | | | 2 |
| | 3 | | | |
| | | 9 | | |
| | | | 6 | |
And the places
| osm | class | type | name | geometry |
| W1 | highway | residential | 雉子橋通り | 1,2 |
And the places
| osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry |
| N3 | amenity | restaurant | 2 | 6 | 2丁目 | 3 |
And the places
| osm | class | type | name | geometry |
| N9 | place | neighbourhood | 2丁目 | 9 |
And the places
| osm | class | type | name | geometry |
| N6 | place | quarter | 加瀬 | 6 |
When importing
Then placex contains
| object | parent_place_id |
| N3 | N9 |
When sending search query "2丁目 6-2"
Then results contain
| osm |
| N3 |
80 changes: 80 additions & 0 deletions test/python/tokenizer/sanitizers/test_tag_japanese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from nominatim.data.place_info import PlaceInfo
from nominatim.data.place_name import PlaceName
from nominatim.tokenizer.place_sanitizer import PlaceSanitizer
from typing import Mapping, Optional, List
import pytest

class TestTagJapanese:
@pytest.fixture(autouse=True)
def setup_country(self, def_config):
self.config = def_config

def run_sanitizer_on(self,type, **kwargs):
place = PlaceInfo({
'address': kwargs,
'country_code': 'jp'
})
sanitizer_args = {'step': 'tag-japanese'}
_, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place)
tmp_list = [(p.name,p.kind) for p in address]
return sorted(tmp_list)

def test_on_address(self):
res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz')
assert res == [('bar','ref'),('baz','ref_abc'),('foo','name')]

def test_housenumber(self):
res = self.run_sanitizer_on('address', housenumber='2')
assert res == [('2','housenumber')]

def test_blocknumber(self):
res = self.run_sanitizer_on('address', block_number='6')
assert res == [('6','housenumber')]

def test_neighbourhood(self):
res = self.run_sanitizer_on('address', neighbourhood='8')
assert res == [('8','place')]

def test_quarter(self):
res = self.run_sanitizer_on('address', quarter='kase')
assert res==[('kase','place')]

def test_housenumber_blocknumber(self):
res = self.run_sanitizer_on('address', housenumber='2', block_number='6')
assert res == [('6-2','housenumber')]

def test_quarter_neighbourhood(self):
res = self.run_sanitizer_on('address', quarter='kase', neighbourhood='8')
assert res == [('kase8','place')]

def test_blocknumber_housenumber_quarter(self):
res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase')
assert res == [('6-2','housenumber'),('kase','place')]

def test_blocknumber_housenumber_quarter_neighbourhood(self):
res = self.run_sanitizer_on('address', block_number='6', housenumber='2', neighbourhood='8')
assert res == [('6-2','housenumber'),('8','place')]

def test_blocknumber_quarter_neighbourhood(self):
res = self.run_sanitizer_on('address',block_number='6', quarter='kase', neighbourhood='8')
assert res == [('6','housenumber'),('kase8','place')]

def test_blocknumber_quarter(self):
res = self.run_sanitizer_on('address',block_number='6', quarter='kase')
assert res == [('6','housenumber'),('kase','place')]

def test_blocknumber_neighbourhood(self):
res = self.run_sanitizer_on('address',block_number='6', neighbourhood='8')
assert res == [('6','housenumber'),('8','place')]

def test_housenumber_quarter_neighbourhood(self):
res = self.run_sanitizer_on('address',housenumber='2', quarter='kase', neighbourhood='8')
assert res == [('2','housenumber'),('kase8','place')]

def test_housenumber_quarter(self):
res = self.run_sanitizer_on('address',housenumber='2', quarter='kase')
assert res == [('2','housenumber'),('kase','place')]

def test_housenumber_blocknumber_neighbourhood_quarter(self):
res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase', neighbourhood='8')
assert res == [('6-2','housenumber'),('kase8','place')]

0 comments on commit 252fe42

Please sign in to comment.