diff --git a/MANIFEST.in b/MANIFEST.in index 6870176..2c4f726 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,3 +4,4 @@ include LICENSE include requirements*.txt recursive-include jamdict/data/ *.sql recursive-include jamdict/data/ *.json +recursive-include jamdict/data/ *.gz diff --git a/README.md b/README.md index 758632d..a5ec644 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,5 @@ result = jmd.lookup('食べる') ... [id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on ``` ->>> for c in result.chars: -... print(repr(c)) See `jamdict_demo.py` and `jamdict/tools.py` for more information. diff --git a/jamdict/__version__.py b/jamdict/__version__.py index 37b4db5..4144c82 100644 --- a/jamdict/__version__.py +++ b/jamdict/__version__.py @@ -10,6 +10,6 @@ __url__ = "https://github.com/neocl/jamdict" __maintainer__ = "Le Tuan Anh" __version_major__ = "0.1" -__version__ = "{}a5".format(__version_major__) +__version__ = "{}a6".format(__version_major__) __version_long__ = "{} - Alpha".format(__version_major__) __status__ = "Prototype" diff --git a/jamdict/data/kradfile-u.gz b/jamdict/data/kradfile-u.gz new file mode 100644 index 0000000..92193f8 Binary files /dev/null and b/jamdict/data/kradfile-u.gz differ diff --git a/jamdict/data/radkfile.gz b/jamdict/data/radkfile.gz new file mode 100644 index 0000000..292bb2f Binary files /dev/null and b/jamdict/data/radkfile.gz differ diff --git a/jamdict/kanjidic2.py b/jamdict/kanjidic2.py index 0f100be..8e2f978 100644 --- a/jamdict/kanjidic2.py +++ b/jamdict/kanjidic2.py @@ -52,12 +52,17 @@ from lxml import etree from chirptext import chio +from chirptext.sino import Radical as KangxiRadical +from .krad import KRad # ------------------------------------------------------------------------------ # Configuration # ------------------------------------------------------------------------------ +krad = KRad() + + def getLogger(): return logging.getLogger(__name__) 
@@ -112,6 +117,7 @@ def __init__(self): self.literal = '' # The character itself in UTF8 coding. self.codepoints = [] # self.radicals = [] # + self.__canon_radical = None self.stroke_count = None # first stroke_count in misc self.grade = None # / self.stroke_miscounts = [] # /stroke_count[1:] @@ -135,6 +141,21 @@ def __repr__(self): def __str__(self): return self.literal + @property + def components(self): + if self.literal in krad.krad: + return krad.krad[self.literal] + else: + return [] + + @property + def radical(self): + if self.__canon_radical is None: + for rad in self.radicals: + if rad.rad_type == 'classical': + self.__canon_radical = KangxiRadical.kangxi()[rad.value] + return self.__canon_radical + def to_json(self): return {'literal': self.literal, 'codepoints': [cp.to_json() for cp in self.codepoints], diff --git a/jamdict/krad.py b/jamdict/krad.py new file mode 100644 index 0000000..ade09fa --- /dev/null +++ b/jamdict/krad.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +''' +Module for retrieving kanji components (i.e. radicals) +Latest version can be found at https://github.com/neocl/jamdict + +This package uses the RADKFILE/KRADFILE[1] file. +These files are the property of the [Electronic Dictionary Research and Development Group][2], and are used in conformance with the Group's [licence][3]. 
+ +[1]: http://www.edrdg.org/krad/kradinf.html +[2]: http://www.edrdg.org/ +[3]: http://www.edrdg.org/edrdg/licence.html + +References: + JMDict website: + http://www.csse.monash.edu.au/~jwb/edict.html + Python documentation: + https://docs.python.org/ + PEP 257 - Python Docstring Conventions: + https://www.python.org/dev/peps/pep-0257/ + +@author: Le Tuan Anh +@license: MIT +''' + +# Copyright (c) 2016, Le Tuan Anh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
class KRad:
    """Lazy access to the kanji-to-radical and radical-to-kanji lookup maps.

    The maps are built from ``KRADFILE`` the first time ``krad`` or ``radk``
    is read and are then kept for the lifetime of the instance.
    """

    def __init__(self, **kwargs):
        """Create a manager with no data loaded yet.

        Args:
            **kwargs: Accepted but not used by this class.
        """
        self.__krad_map = None
        self.__radk_map = None
        self.__rads = {}
        # Serialises the one-time build so the file is parsed only once.
        self.lock = threading.Lock()

    @staticmethod
    def _parse_krad_lines(lines):
        """Parse ``<kanji> : <radical> <radical> ...`` lines into lookup maps.

        Lines starting with ``#`` and lines without a ``:`` separator are
        ignored.

        Args:
            lines: Iterable of text lines.

        Returns:
            A ``(krad_map, radk_map)`` tuple: ``krad_map`` maps a kanji to the
            list of its radicals in file order; ``radk_map`` is a
            ``defaultdict(set)`` mapping a radical to the kanji containing it.
        """
        krad_map = {}
        radk_map = dd(set)
        for line in lines:
            if line.startswith("#"):
                continue
            parts = line.split(':', maxsplit=1)
            if len(parts) != 2:
                continue
            char_literal = parts[0].strip()
            # split() without arguments already discards surrounding whitespace.
            rads = parts[1].split()
            krad_map[char_literal] = rads
            for rad in rads:
                radk_map[rad].add(char_literal)
        return krad_map, radk_map

    def _build_krad_map(self):
        """Read ``KRADFILE`` and populate both maps exactly once."""
        with self.lock:
            # Another thread may have finished the build while we waited.
            if self.__krad_map is not None and self.__radk_map is not None:
                return
            # NOTE(review): relies on chio.read_file returning the decoded text
            # of the .gz file when mode='rt' -- confirm against chirptext.
            lines = chio.read_file(KRADFILE, mode='rt').splitlines()
            krad_map, radk_map = self._parse_krad_lines(lines)
            # Publish only fully built maps: the properties test for None
            # without holding the lock, so a half-filled dict must never be
            # visible through these attributes.
            self.__radk_map = radk_map
            self.__krad_map = krad_map

    @property
    def radk(self):
        """Map from a radical to the set of kanji that contain it."""
        if self.__radk_map is None:
            self._build_krad_map()
        return self.__radk_map

    @property
    def krad(self):
        """Map from a kanji to the list of its radicals."""
        if self.__krad_map is None:
            self._build_krad_map()
        return self.__krad_map
-55,7 +55,7 @@ def read(*filenames, **kwargs): long_description=long_description, long_description_content_type='text/markdown', packages=['jamdict'], - package_data={'jamdict': ['data/*.sql', 'data/*.json']}, + package_data={'jamdict': ['data/*.sql', 'data/*.json', 'data/*.gz']}, include_package_data=True, platforms='any', test_suite='test', diff --git a/test/test_krad.py b/test/test_krad.py new file mode 100644 index 0000000..1e087f7 --- /dev/null +++ b/test/test_krad.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +''' +Script for testing KRad module library +Latest version can be found at https://github.com/neocl/jamdict + +References: + Python documentation: + https://docs.python.org/ + Python unittest + https://docs.python.org/3/library/unittest.html + -- + argparse module: + https://docs.python.org/3/howto/argparse.html + PEP 257 - Python Docstring Conventions: + https://www.python.org/dev/peps/pep-0257/ + +@author: Le Tuan Anh +''' + +# Copyright (c) 2016, Le Tuan Anh +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

__author__ = "Le Tuan Anh "
__copyright__ = "Copyright 2016, jamdict"
__license__ = "MIT"

########################################################################

import os
import logging
import unittest
from jamdict import config
from jamdict.jmdict import JMDictXMLParser
from jamdict.kanjidic2 import Kanjidic2XMLParser
from jamdict.krad import KRad

########################################################################

MY_DIR = os.path.abspath(os.path.dirname(__file__))
TEST_DATA = os.path.join(MY_DIR, 'data')
MINI_JMD = os.path.join(TEST_DATA, 'JMdict_mini.xml')
MINI_KD2 = os.path.join(TEST_DATA, 'kanjidic2_mini.xml')
TEST_DB = os.path.join(TEST_DATA, 'jamdict_test.db')


def getLogger():
    """Return the logger named after this test module."""
    return logging.getLogger(__name__)


class TestConfig(unittest.TestCase):
    """Checks on the jamdict configuration."""

    def test_config(self):
        """The configuration exposes KD2_XML and a value can be fetched for it."""
        settings = config.read_config()
        self.assertIn('KD2_XML', settings)
        self.assertTrue(config.get_file('KD2_XML'))
        location = config._get_config_manager().locate_config()
        getLogger().info("jamdict log file location: {}".format(location))


class TestModels(unittest.TestCase):
    """Checks on the kanji/radical maps provided by KRad."""

    def test_read_krad(self):
        """Known kanji map to their radicals and a radical maps back to its kanji."""
        kr = KRad()
        expected_components = (
            ('㘅', ['亅', '二', '口', '彳', '金']),
            ('𪚲', ['乙', '勹', '月', '田', '亀']),
        )
        for literal, radicals in expected_components:
            self.assertEqual(kr.krad[literal], radicals)
        expected_kanji = {'籥', '鸙', '龢', '龠', '龡', '籲', '瀹', '龥', '禴', '鑰', '爚', '龣'}
        self.assertEqual(kr.radk['龠'], expected_kanji)


########################################################################

if __name__ == "__main__":
    logging.getLogger('jamdict').setLevel(logging.DEBUG)
    unittest.main()