From 0722495434e039b7b716206974d0076db8813ca6 Mon Sep 17 00:00:00 2001 From: miku0 Date: Wed, 26 Jul 2023 07:54:58 +0000 Subject: [PATCH 1/7] add japanese sanitizer --- docs/customize/Tokenizers.md | 8 + .../tokenizer/sanitizers/tag_japanese.py | 150 ++++++++++++++++++ settings/icu_tokenizer.yaml | 1 + test/bdd/db/query/japanese.feature | 29 ++++ .../tokenizer/sanitizers/test_tag_japanese.py | 65 ++++++++ 5 files changed, 253 insertions(+) create mode 100644 nominatim/tokenizer/sanitizers/tag_japanese.py create mode 100644 test/bdd/db/query/japanese.feature create mode 100644 test/python/tokenizer/sanitizers/test_tag_japanese.py diff --git a/docs/customize/Tokenizers.md b/docs/customize/Tokenizers.md index 11c27e38b9..6199ea4252 100644 --- a/docs/customize/Tokenizers.md +++ b/docs/customize/Tokenizers.md @@ -229,6 +229,14 @@ The following is a list of sanitizers that are shipped with Nominatim. rendering: heading_level: 6 +#### tag-japanese + +::: nominatim.tokenizer.sanitizers.tag_japanese + selection: + members: False + rendering: + heading_level: 6 + #### Token Analysis Token analyzers take a full name and transform it into one or more normalized diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py new file mode 100644 index 0000000000..81d3d5b3a4 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/tag_japanese.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This sanitizer maps OSM data to Japanese block addresses. +It replaces blocknumber and housenumber with housenumber, +and quarter and neighbourhood with place. +""" + + +from typing import Callable +from typing import List + +from nominatim.tokenizer.sanitizers.base import ProcessInfo +from nominatim.tokenizer.sanitizers.config import SanitizerConfig +from nominatim.data.place_name import PlaceName + +def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: +#def create(config: SanitizerConfig) -> Callable[[ProcessInfo],None]: + """Set up the sanitizer + """ + return tag_japanese + #return tag_japanese(config) + +def convert_kanji_sequence_to_number(sequence: str) -> str: + """Converts Kanji numbers to Arabic numbers + """ + kanji_map = { + '零': '0', + '一': '1', + '二': '2', + '三': '3', + '四': '4', + '五': '5', + '六': '6', + '七': '7', + '八': '8', + '九': '9' + } + converted = '' + current_number = '' + for char in sequence: + if char in kanji_map: + current_number += kanji_map[char] + else: + converted += current_number + current_number = '' + converted += char + converted += current_number + return converted + +def reconbine_housenumber( + new_address: List[PlaceName], + tmp_housenumber: str | None, + tmp_blocknumber: str | None +) -> List[PlaceName]: + """ Recombine the tag of housenumber by using housenumber and blocknumber + """ + if tmp_blocknumber and tmp_housenumber: + new_address.append( + PlaceName( + kind='housenumber', + name=f'{tmp_blocknumber}-{tmp_housenumber}', + suffix='' + ) + ) + elif tmp_blocknumber: + new_address.append( + PlaceName( + kind='housenumber', + name=f'{tmp_blocknumber}', + suffix='' + ) + ) + elif tmp_housenumber: + new_address.append( + PlaceName( + kind='housenumber', + name=f'{tmp_housenumber}', + suffix='' + ) + ) + return new_address + +def reconbine_place( + new_address: List[PlaceName], + tmp_neighbourhood: str | None, + tmp_quarter: str | None +) -> List[PlaceName]: + """ Recombine the tag of place by using neighbourhood and quarter + """ + if tmp_neighbourhood and tmp_quarter: + new_address.append( + PlaceName( + kind='place', + name=f'{tmp_quarter}{tmp_neighbourhood}', + suffix='' + ) + ) + elif tmp_neighbourhood: + new_address.append( + PlaceName( + kind='place', + name=f'{tmp_neighbourhood}', + suffix='' + ) + ) + elif tmp_quarter: + new_address.append( + PlaceName( + kind='place', + name=f'{tmp_quarter}', + suffix='' + ) + ) + return new_address +def tag_japanese(obj: ProcessInfo) -> None: + """Recombine kind of address + """ + if obj.place.country_code != 'jp': + return + tmp_housenumber = None + tmp_blocknumber = None + tmp_neighbourhood = None + tmp_quarter = None + + new_address = [] + for item in obj.names: + item.name = convert_kanji_sequence_to_number(item.name) + + for item in obj.address: + item.name = convert_kanji_sequence_to_number(item.name) + if item.kind == 'housenumber': + tmp_housenumber = item.name + elif item.kind == 'block_number': + tmp_blocknumber = item.name + elif item.kind == 'neighbourhood': + tmp_neighbourhood = item.name + elif item.kind == 'quarter': + tmp_quarter = item.name + else: + new_address.append(item) + + new_address = reconbine_housenumber(new_address,tmp_housenumber,tmp_blocknumber) + new_address = reconbine_place(new_address,tmp_neighbourhood,tmp_quarter) + + obj.address = [item for item in new_address if item.name is not None] diff --git a/settings/icu_tokenizer.yaml b/settings/icu_tokenizer.yaml index 1fa467befe..c5a809c683 100644 --- a/settings/icu_tokenizer.yaml +++ b/settings/icu_tokenizer.yaml @@ -45,6 +45,7 @@ sanitizers: whitelist: [bg,ca,cs,da,de,el,en,es,et,eu,fi,fr,gl,hu,it,ja,mg,ms,nl,no,pl,pt,ro,ru,sk,sl,sv,tr,uk,vi] use-defaults: all mode: append + - step: tag-japanese token-analysis: - analyzer: generic - id: "@housenumber" diff --git a/test/bdd/db/query/japanese.feature b/test/bdd/db/query/japanese.feature new file mode 100644 index 0000000000..f21e0f5c9f --- /dev/null +++ b/test/bdd/db/query/japanese.feature @@ -0,0 +1,29 @@ +@DB +Feature: Searches in Japan + Test specifically for searches of Japanese addresses and in Japanese language. + Scenario: A block house-number is parented to the neighbourhood + Given the grid with origin JP + | 1 | | | | 2 | + | | 3 | | | | + | | | 9 | | | + | | | | 6 | | + And the places + | osm | class | type | name | geometry | + | W1 | highway | residential | 雉子橋通り | 1,2 | + And the places + | osm | class | type | housenr | addr+block_number | addr+neighbourhood | geometry | + | N3 | amenity | restaurant | 2 | 6 | 2丁目 | 3 | + And the places + | osm | class | type | name | geometry | + | N9 | place | neighbourhood | 2丁目 | 9 | + And the places + | osm | class | type | name | geometry | + | N6 | place | quarter | 加瀬 | 6 | + When importing + Then placex contains + | object | parent_place_id | + | N3 | N9 | + When sending search query "2丁目 6-2" + Then results contain + | osm | + | N3 | diff --git a/test/python/tokenizer/sanitizers/test_tag_japanese.py b/test/python/tokenizer/sanitizers/test_tag_japanese.py new file mode 100644 index 0000000000..c82c4261b0 --- /dev/null +++ b/test/python/tokenizer/sanitizers/test_tag_japanese.py @@ -0,0 +1,65 @@ +from nominatim.data.place_info import PlaceInfo +from nominatim.data.place_name import PlaceName +from nominatim.tokenizer.place_sanitizer import PlaceSanitizer +from typing import Mapping, Optional, List +import pytest + +class TestTagJapanese: + @pytest.fixture(autouse=True) + def setup_country(self, def_config): + self.config = def_config + + def run_sanitizer_on(self,type, **kwargs): + place = PlaceInfo({ + 'address': kwargs, + 'country_code': 'jp' + }) + sanitizer_args = {'step': 'tag-japanese'} + _, address = PlaceSanitizer([sanitizer_args], self.config).process_names(place) + tmp_list = [(p.name,p.kind) for p in address] + return sorted(tmp_list) + + def test_on_address(self): + res = self.run_sanitizer_on('address', name='foo', ref='bar', ref_abc='baz') + assert res == [('bar','ref'),('baz','ref_abc'),('foo','name')] + + def test_housenumber(self): + res = self.run_sanitizer_on('address', housenumber='2') + assert res == [('2','housenumber')] + + def test_blocknumber(self): + res = self.run_sanitizer_on('address', block_number='6') + assert res == [('6','housenumber')] + + #def test_neighbourhood(self): + # res = self.run_sanitizer_on('address',neighbourhood='8丁目') + # assert res == [('8','place')] + def test_neighbourhood(self): + res = self.run_sanitizer_on('address', neighbourhood='8') + assert res == [('8','place')] + def test_quarter(self): + res = self.run_sanitizer_on('address', quarter='kase') + assert res==[('kase','place')] + + def test_housenumber_blocknumber(self): + res = self.run_sanitizer_on('address', housenumber='2', block_number='6') + assert res == [('6-2','housenumber')] + + def test_housenumber_blocknumber(self): + res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8') + assert res == [('2','housenumber'),('8','place')] + + def test_housenumber_blocknumber(self): + res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8') + assert res == [('6','housenumber'),('8','place')] + + def test_housenumber_blocknumber_neighbourhood(self): + res = self.run_sanitizer_on('address', housenumber='2', block_number='6', neighbourhood='8') + assert res == [('6-2','housenumber'),('8','place')] + + def test_housenumber_blocknumber_neighbourhood_quarter(self): + res = self.run_sanitizer_on('address', housenumber='2', block_number='6', neighbourhood='8',quarter='kase') + assert res == [('6-2','housenumber'),('kase8','place')] + def test_neighbourhood_quarter(self): + res = self.run_sanitizer_on('address', neighbourhood='8',quarter='kase') + assert res == [('kase8','place')] From 848e5ac5de510c1a1ae1a01107453e61a0895b97 Mon Sep 17 00:00:00 2001 From: miku0 Date: Wed, 26 Jul 2023 09:50:25 +0000 Subject: [PATCH 2/7] Correction to PR's comment --- nominatim/tokenizer/sanitizers/kanji_utils.py | 36 ++++++++++++++++++ .../tokenizer/sanitizers/tag_japanese.py | 38 +++---------------- 2 files changed, 42 insertions(+), 32 deletions(-) create mode 100644 nominatim/tokenizer/sanitizers/kanji_utils.py diff --git a/nominatim/tokenizer/sanitizers/kanji_utils.py b/nominatim/tokenizer/sanitizers/kanji_utils.py new file mode 100644 index 0000000000..6956152366 --- /dev/null +++ b/nominatim/tokenizer/sanitizers/kanji_utils.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2022 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This is a file for a function that converts Kanji (Japanese) numerals to Arabic numerals. +""" + +def convert_kanji_sequence_to_number(sequence: str) -> str: + """Converts Kanji numbers to Arabic numbers + """ + kanji_map = { + '零': '0', + '一': '1', + '二': '2', + '三': '3', + '四': '4', + '五': '5', + '六': '6', + '七': '7', + '八': '8', + '九': '9' + } + converted = '' + current_number = '' + for char in sequence: + if char in kanji_map: + current_number += kanji_map[char] + else: + converted += current_number + current_number = '' + converted += char + converted += current_number + return converted diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py index 81d3d5b3a4..dffd95595d 100644 --- a/nominatim/tokenizer/sanitizers/tag_japanese.py +++ b/nominatim/tokenizer/sanitizers/tag_japanese.py @@ -12,11 +12,12 @@ from typing import Callable -from typing import List +from typing import List, Optional from nominatim.tokenizer.sanitizers.base import ProcessInfo from nominatim.tokenizer.sanitizers.config import SanitizerConfig from nominatim.data.place_name import PlaceName +from nominatim.tokenizer.sanitizers.kanji_utils import convert_kanji_sequence_to_number def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: #def create(config: SanitizerConfig) -> Callable[[ProcessInfo],None]: @@ -25,37 +26,10 @@ def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: return tag_japanese #return tag_japanese(config) -def convert_kanji_sequence_to_number(sequence: str) -> str: - """Converts Kanji numbers to Arabic numbers - """ - kanji_map = { - '零': '0', - '一': '1', - '二': '2', - '三': '3', - '四': '4', - '五': '5', - '六': '6', - '七': '7', - '八': '8', - '九': '9' - } - converted = '' - current_number = '' - for char in sequence: - if char in kanji_map: - current_number += kanji_map[char] - else: - converted += current_number - current_number = '' - converted += char - converted += current_number - return converted - def reconbine_housenumber( new_address: List[PlaceName], - tmp_housenumber: str | None, - tmp_blocknumber: str | None + tmp_housenumber: Optional[str], + tmp_blocknumber: Optional[str] ) -> List[PlaceName]: """ Recombine the tag of housenumber by using housenumber and blocknumber """ @@ -87,8 +61,8 @@ def reconbine_housenumber( def reconbine_place( new_address: List[PlaceName], - tmp_neighbourhood: str | None, - tmp_quarter: str | None + tmp_neighbourhood: Optional[str], + tmp_quarter: Optional[str] ) -> List[PlaceName]: """ Recombine the tag of place by using neighbourhood and quarter """ From fac8c32cda12d6684ad2232ac255324bd9b85e43 Mon Sep 17 00:00:00 2001 From: miku0 Date: Wed, 26 Jul 2023 21:43:22 +0000 Subject: [PATCH 3/7] Moved KANJI_MAP to global variable --- nominatim/tokenizer/sanitizers/kanji_utils.py | 36 ------------------- .../tokenizer/sanitizers/tag_japanese.py | 29 ++++++++++++++- 2 files changed, 28 insertions(+), 37 deletions(-) delete mode 100644 nominatim/tokenizer/sanitizers/kanji_utils.py diff --git a/nominatim/tokenizer/sanitizers/kanji_utils.py b/nominatim/tokenizer/sanitizers/kanji_utils.py deleted file mode 100644 index 6956152366..0000000000 --- a/nominatim/tokenizer/sanitizers/kanji_utils.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later -# -# This file is part of Nominatim. (https://nominatim.org) -# -# Copyright (C) 2022 by the Nominatim developer community. -# For a full list of authors see the git log. -""" -This is a file for a function that converts Kanji (Japanese) numerals to Arabic numerals. -""" - -def convert_kanji_sequence_to_number(sequence: str) -> str: - """Converts Kanji numbers to Arabic numbers - """ - kanji_map = { - '零': '0', - '一': '1', - '二': '2', - '三': '3', - '四': '4', - '五': '5', - '六': '6', - '七': '7', - '八': '8', - '九': '9' - } - converted = '' - current_number = '' - for char in sequence: - if char in kanji_map: - current_number += kanji_map[char] - else: - converted += current_number - current_number = '' - converted += char - converted += current_number - return converted diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py index dffd95595d..723a6dfaa2 100644 --- a/nominatim/tokenizer/sanitizers/tag_japanese.py +++ b/nominatim/tokenizer/sanitizers/tag_japanese.py @@ -17,7 +17,34 @@ from nominatim.tokenizer.sanitizers.base import ProcessInfo from nominatim.tokenizer.sanitizers.config import SanitizerConfig from nominatim.data.place_name import PlaceName -from nominatim.tokenizer.sanitizers.kanji_utils import convert_kanji_sequence_to_number + +KANJI_MAP = { + '零': '0', + '一': '1', + '二': '2', + '三': '3', + '四': '4', + '五': '5', + '六': '6', + '七': '7', + '八': '8', + '九': '9' + } + +def convert_kanji_sequence_to_number(sequence: str) -> str: + """Converts Kanji numbers to Arabic numbers + """ + converted = '' + current_number = '' + for char in sequence: + if char in KANJI_MAP: + current_number += KANJI_MAP[char] + else: + converted += current_number + current_number = '' + converted += char + converted += current_number + return converted def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: #def create(config: SanitizerConfig) -> Callable[[ProcessInfo],None]: From 67706cec4ebb448fd74093f2e443db3e64d41310 Mon Sep 17 00:00:00 2001 From: miku0 Date: Thu, 27 Jul 2023 07:33:53 +0000 Subject: [PATCH 4/7] add @fail-legacy --- test/bdd/db/query/japanese.feature | 1 + 1 file changed, 1 insertion(+) diff --git a/test/bdd/db/query/japanese.feature b/test/bdd/db/query/japanese.feature index f21e0f5c9f..4960c50b4f 100644 --- a/test/bdd/db/query/japanese.feature +++ b/test/bdd/db/query/japanese.feature @@ -1,6 +1,7 @@ @DB Feature: Searches in Japan Test specifically for searches of Japanese addresses and in Japanese language. + @fail-legacy Scenario: A block house-number is parented to the neighbourhood Given the grid with origin JP | 1 | | | | 2 | From 23500181061bb31c463f54a17467ecffbbe9ef9a Mon Sep 17 00:00:00 2001 From: miku0 Date: Mon, 31 Jul 2023 02:39:04 +0000 Subject: [PATCH 5/7] Fixed cosmetic issues --- .../tokenizer/sanitizers/tag_japanese.py | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py index 723a6dfaa2..46296456a9 100644 --- a/nominatim/tokenizer/sanitizers/tag_japanese.py +++ b/nominatim/tokenizer/sanitizers/tag_japanese.py @@ -19,39 +19,28 @@ from nominatim.data.place_name import PlaceName KANJI_MAP = { - '零': '0', - '一': '1', - '二': '2', - '三': '3', - '四': '4', - '五': '5', - '六': '6', - '七': '7', - '八': '8', - '九': '9' + ord('零'): '0', + ord('一'): '1', + ord('二'): '2', + ord('三'): '3', + ord('四'): '4', + ord('五'): '5', + ord('六'): '6', + ord('七'): '7', + ord('八'): '8', + ord('九'): '9' } def convert_kanji_sequence_to_number(sequence: str) -> str: """Converts Kanji numbers to Arabic numbers """ - converted = '' - current_number = '' - for char in sequence: - if char in KANJI_MAP: - current_number += KANJI_MAP[char] - else: - converted += current_number - current_number = '' - converted += char - converted += current_number + converted = sequence.translate(KANJI_MAP) return converted def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: -#def create(config: SanitizerConfig) -> Callable[[ProcessInfo],None]: """Set up the sanitizer """ return tag_japanese - #return tag_japanese(config) def reconbine_housenumber( new_address: List[PlaceName], @@ -60,6 +49,11 @@ def reconbine_housenumber( ) -> List[PlaceName]: """ Recombine the tag of housenumber by using housenumber and blocknumber """ + if tmp_blocknumber: + tmp_blocknumber = convert_kanji_sequence_to_number(tmp_blocknumber) + if tmp_housenumber: + tmp_housenumber = convert_kanji_sequence_to_number(tmp_housenumber) + if tmp_blocknumber and tmp_housenumber: new_address.append( PlaceName( @@ -72,7 +66,7 @@ def reconbine_housenumber( new_address.append( PlaceName( kind='housenumber', - name=f'{tmp_blocknumber}', + name=tmp_blocknumber, suffix='' ) ) @@ -80,7 +74,7 @@ def reconbine_housenumber( new_address.append( PlaceName( kind='housenumber', - name=f'{tmp_housenumber}', + name=tmp_housenumber, suffix='' ) ) @@ -93,6 +87,11 @@ def reconbine_place( ) -> List[PlaceName]: """ Recombine the tag of place by using neighbourhood and quarter """ + if tmp_neighbourhood: + tmp_neighbourhood = convert_kanji_sequence_to_number(tmp_neighbourhood) + if tmp_quarter: + tmp_quarter = convert_kanji_sequence_to_number(tmp_quarter) + if tmp_neighbourhood and tmp_quarter: new_address.append( PlaceName( @@ -105,7 +104,7 @@ def reconbine_place( new_address.append( PlaceName( kind='place', - name=f'{tmp_neighbourhood}', + name=tmp_neighbourhood, suffix='' ) ) @@ -113,7 +112,7 @@ def reconbine_place( new_address.append( PlaceName( kind='place', - name=f'{tmp_quarter}', + name=tmp_quarter, suffix='' ) ) @@ -129,11 +128,7 @@ def tag_japanese(obj: ProcessInfo) -> None: tmp_quarter = None new_address = [] - for item in obj.names: - item.name = convert_kanji_sequence_to_number(item.name) - for item in obj.address: - item.name = convert_kanji_sequence_to_number(item.name) if item.kind == 'housenumber': tmp_housenumber = item.name elif item.kind == 'block_number': @@ -145,7 +140,7 @@ def tag_japanese(obj: ProcessInfo) -> None: else: new_address.append(item) - new_address = reconbine_housenumber(new_address,tmp_housenumber,tmp_blocknumber) - new_address = reconbine_place(new_address,tmp_neighbourhood,tmp_quarter) + new_address = reconbine_housenumber(new_address, tmp_housenumber, tmp_blocknumber) + new_address = reconbine_place(new_address, tmp_neighbourhood, tmp_quarter) obj.address = [item for item in new_address if item.name is not None] From 4d61cc87cff9ecf5c90741a94c4b93da2f12c5ad Mon Sep 17 00:00:00 2001 From: miku0 Date: Mon, 31 Jul 2023 02:39:56 +0000 Subject: [PATCH 6/7] Add the test of reconbine_place --- .../tokenizer/sanitizers/test_tag_japanese.py | 49 +++++++++++++------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/test/python/tokenizer/sanitizers/test_tag_japanese.py b/test/python/tokenizer/sanitizers/test_tag_japanese.py index c82c4261b0..2f5e442cd2 100644 --- a/test/python/tokenizer/sanitizers/test_tag_japanese.py +++ b/test/python/tokenizer/sanitizers/test_tag_japanese.py @@ -31,12 +31,10 @@ def test_blocknumber(self): res = self.run_sanitizer_on('address', block_number='6') assert res == [('6','housenumber')] - #def test_neighbourhood(self): - # res = self.run_sanitizer_on('address',neighbourhood='8丁目') - # assert res == [('8','place')] def test_neighbourhood(self): res = self.run_sanitizer_on('address', neighbourhood='8') assert res == [('8','place')] + def test_quarter(self): res = self.run_sanitizer_on('address', quarter='kase') assert res==[('kase','place')] @@ -45,21 +43,42 @@ def test_housenumber_blocknumber(self): res = self.run_sanitizer_on('address', housenumber='2', block_number='6') assert res == [('6-2','housenumber')] - def test_housenumber_blocknumber(self): - res = self.run_sanitizer_on('address', housenumber='2', neighbourhood='8') - assert res == [('2','housenumber'),('8','place')] + def test_quarter_neighbourhood(self): + res = self.run_sanitizer_on('address', quarter='kase', neighbourhood='8') + assert res == [('kase8','place')] - def test_housenumber_blocknumber(self): - res = self.run_sanitizer_on('address', block_number='6', neighbourhood='8') - assert res == [('6','housenumber'),('8','place')] + def test_blocknumber_housenumber_quarter(self): + res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase') + assert res == [('6-2','housenumber'),('kase','place')] - def test_housenumber_blocknumber_neighbourhood(self): - res = self.run_sanitizer_on('address', housenumber='2', block_number='6', neighbourhood='8') + def test_blocknumber_housenumber_quarter_neighbourhood(self): + res = self.run_sanitizer_on('address', block_number='6', housenumber='2', neighbourhood='8') assert res == [('6-2','housenumber'),('8','place')] + def test_blocknumber_quarter_neighbourhood(self): + res = self.run_sanitizer_on('address',block_number='6', quarter='kase', neighbourhood='8') + assert res == [('6','housenumber'),('kase8','place')] + + def test_blocknumber_quarter(self): + res = self.run_sanitizer_on('address',block_number='6', quarter='kase') + assert res == [('6','housenumber'),('kase','place')] + + def test_blocknumber_neighbourhood(self): + res = self.run_sanitizer_on('address',block_number='6', neighbourhood='8') + assert res == [('6','housenumber'),('8','place')] + + def test_housenumber_quarter_neighbourhood(self): + res = self.run_sanitizer_on('address',housenumber='2', quarter='kase', neighbourhood='8') + assert res == [('2','housenumber'),('kase8','place')] + + def test_housenumber_quarter(self): + res = self.run_sanitizer_on('address',housenumber='2', quarter='kase') + assert res == [('2','housenumber'),('kase','place')] + def test_housenumber_blocknumber_neighbourhood_quarter(self): - res = self.run_sanitizer_on('address', housenumber='2', block_number='6', neighbourhood='8',quarter='kase') + res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase', neighbourhood='8') + assert res == [('6-2','housenumber'),('kase8','place')] + + def test_KANJI_MAP(self): + res = self.run_sanitizer_on('address', block_number='六', housenumber='二', quarter='kase', neighbourhood='八') assert res == [('6-2','housenumber'),('kase8','place')] - def test_neighbourhood_quarter(self): - res = self.run_sanitizer_on('address', neighbourhood='8',quarter='kase') - assert res == [('kase8','place')] From 67e1c7dc7205c80957b58c4f9ee644d130dc8ac6 Mon Sep 17 00:00:00 2001 From: miku0 Date: Mon, 31 Jul 2023 11:57:49 +0000 Subject: [PATCH 7/7] Moved KANJI_MAP to icu-rules --- .../tokenizer/sanitizers/tag_japanese.py | 29 ------------------- .../icu-rules/unicode-digits-to-decimal.yaml | 22 +++++++------- .../tokenizer/sanitizers/test_tag_japanese.py | 4 --- 3 files changed, 11 insertions(+), 44 deletions(-) diff --git a/nominatim/tokenizer/sanitizers/tag_japanese.py b/nominatim/tokenizer/sanitizers/tag_japanese.py index 46296456a9..3663860af9 100644 --- a/nominatim/tokenizer/sanitizers/tag_japanese.py +++ b/nominatim/tokenizer/sanitizers/tag_japanese.py @@ -18,25 +18,6 @@ from nominatim.tokenizer.sanitizers.config import SanitizerConfig from nominatim.data.place_name import PlaceName -KANJI_MAP = { - ord('零'): '0', - ord('一'): '1', - ord('二'): '2', - ord('三'): '3', - ord('四'): '4', - ord('五'): '5', - ord('六'): '6', - ord('七'): '7', - ord('八'): '8', - ord('九'): '9' - } - -def convert_kanji_sequence_to_number(sequence: str) -> str: - """Converts Kanji numbers to Arabic numbers - """ - converted = sequence.translate(KANJI_MAP) - return converted - def create(_: SanitizerConfig) -> Callable[[ProcessInfo], None]: """Set up the sanitizer """ @@ -49,11 +30,6 @@ def reconbine_housenumber( ) -> List[PlaceName]: """ Recombine the tag of housenumber by using housenumber and blocknumber """ - if tmp_blocknumber: - tmp_blocknumber = convert_kanji_sequence_to_number(tmp_blocknumber) - if tmp_housenumber: - tmp_housenumber = convert_kanji_sequence_to_number(tmp_housenumber) - if tmp_blocknumber and tmp_housenumber: new_address.append( PlaceName( @@ -87,11 +63,6 @@ def reconbine_place( ) -> List[PlaceName]: """ Recombine the tag of place by using neighbourhood and quarter """ - if tmp_neighbourhood: - tmp_neighbourhood = convert_kanji_sequence_to_number(tmp_neighbourhood) - if tmp_quarter: - tmp_quarter = convert_kanji_sequence_to_number(tmp_quarter) - if tmp_neighbourhood and tmp_quarter: new_address.append( PlaceName( diff --git a/settings/icu-rules/unicode-digits-to-decimal.yaml b/settings/icu-rules/unicode-digits-to-decimal.yaml index 55b3274a3a..db6c842086 100644 --- a/settings/icu-rules/unicode-digits-to-decimal.yaml +++ b/settings/icu-rules/unicode-digits-to-decimal.yaml @@ -1,14 +1,14 @@ -- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰] > 0" -- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵] > 1" -- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶] > 2" -- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷] > 3" -- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸] > 4" -- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹] > 5" -- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺] > 6" -- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻] > 7" -- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼] > 8" -- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽] > 9" -- "[𑜺⑩⑽⒑❿➉➓⓾] > '10'" +- "[𞥐𐒠߀𖭐꤀𖩠𑓐𑑐𑋰𑄶꩐꘠᱀᭐᮰᠐០᥆༠໐꧰႐᪐᪀᧐𑵐꯰᱐𑱐𑜰𑛀𑙐𑇐꧐꣐෦𑁦0𝟶𝟘𝟬𝟎𝟢₀⓿⓪⁰零] > 0" +- "[𞥑𐒡߁𖭑꤁𖩡𑓑𑑑𑋱𑄷꩑꘡᱁᭑᮱᠑១᥇༡໑꧱႑᪑᪁᧑𑵑꯱᱑𑱑𑜱𑛁𑙑𑇑꧑꣑෧𑁧1𝟷𝟙𝟭𝟏𝟣₁¹①⑴⒈❶➀➊⓵一] > 1" +- "[𞥒𐒢߂𖭒꤂𖩢𑓒𑑒𑋲𑄸꩒꘢᱂᭒᮲᠒២᥈༢໒꧲႒᪒᪂᧒𑵒꯲᱒𑱒𑜲𑛂𑙒𑇒꧒꣒෨𑁨2𝟸𝟚𝟮𝟐𝟤₂²②⑵⒉❷➁➋⓶二] > 2" +- "[𞥓𐒣߃𖭓꤃𖩣𑓓𑑓𑋳𑄹꩓꘣᱃᭓᮳᠓៣᥉༣໓꧳႓᪓᪃᧓𑵓꯳᱓𑱓𑜳𑛃𑙓𑇓꧓꣓෩𑁩3𝟹𝟛𝟯𝟑𝟥₃³③⑶⒊❸➂➌⓷三] > 3" +- "[𞥔𐒤߄𖭔꤄𖩤𑓔𑑔𑋴𑄺꩔꘤᱄᭔᮴᠔៤᥊༤໔꧴႔᪔᪄᧔𑵔꯴᱔𑱔𑜴𑛄𑙔𑇔꧔꣔෪𑁪4𝟺𝟜𝟰𝟒𝟦₄⁴④⑷⒋❹➃➍⓸四] > 4" +- "[𞥕𐒥߅𖭕꤅𖩥𑓕𑑕𑋵𑄻꩕꘥᱅᭕᮵᠕៥᥋༥໕꧵႕᪕᪅᧕𑵕꯵᱕𑱕𑜵𑛅𑙕𑇕꧕꣕෫𑁫5𝟻𝟝𝟱𝟓𝟧₅⁵⑤⑸⒌❺➄➎⓹五] > 5" +- "[𞥖𐒦߆𖭖꤆𖩦𑓖𑑖𑋶𑄼꩖꘦᱆᭖᮶᠖៦᥌༦໖꧶႖᪖᪆᧖𑵖꯶᱖𑱖𑜶𑛆𑙖𑇖꧖꣖෬𑁬6𝟼𝟞𝟲𝟔𝟨₆⁶⑥⑹⒍❻➅➏⓺六] > 6" +- "[𞥗𐒧߇𖭗꤇𖩧𑓗𑑗𑋷𑄽꩗꘧᱇᭗᮷᠗៧᥍༧໗꧷႗᪗᪇᧗𑵗꯷᱗𑱗𑜷𑛇𑙗𑇗꧗꣗෭𑁭7𝟽𝟟𝟳𝟕𝟩₇⁷⑦⑺⒎❼➆➐⓻七] > 7" +- "[𞥘𐒨߈𖭘꤈𖩨𑓘𑑘𑋸𑄾꩘꘨᱈᭘᮸᠘៨᥎༨໘꧸႘᪘᪈᧘𑵘꯸᱘𑱘𑜸𑛈𑙘𑇘꧘꣘෮𑁮8𝟾𝟠𝟴𝟖𝟪₈⁸⑧⑻⒏❽➇➑⓼八] > 8" +- "[𞥙𐒩߉𖭙꤉𖩩𑓙𑑙𑋹𑄿꩙꘩᱉᭙᮹᠙៩᥏༩໙꧹႙᪙᪉᧙𑵙꯹᱙𑱙𑜹𑛉𑙙𑇙꧙꣙෯𑁯9𝟿𝟡𝟵𝟗𝟫₉⁹⑨⑼⒐❾➈➒⓽九] > 9" +- "[𑜺⑩⑽⒑❿➉➓⓾十] > '10'" - "[⑪⑾⒒⓫] > '11'" - "[⑫⑿⒓⓬] > '12'" - "[⑬⒀⒔⓭] > '13'" diff --git a/test/python/tokenizer/sanitizers/test_tag_japanese.py b/test/python/tokenizer/sanitizers/test_tag_japanese.py index 2f5e442cd2..946f137ce7 100644 --- a/test/python/tokenizer/sanitizers/test_tag_japanese.py +++ b/test/python/tokenizer/sanitizers/test_tag_japanese.py @@ -78,7 +78,3 @@ def test_housenumber_quarter(self): def test_housenumber_blocknumber_neighbourhood_quarter(self): res = self.run_sanitizer_on('address', block_number='6', housenumber='2', quarter='kase', neighbourhood='8') assert res == [('6-2','housenumber'),('kase8','place')] - - def test_KANJI_MAP(self): - res = self.run_sanitizer_on('address', block_number='六', housenumber='二', quarter='kase', neighbourhood='八') - assert res == [('6-2','housenumber'),('kase8','place')]