Skip to content

Commit

Permalink
Merge pull request #7 from neocl/dev
Browse files Browse the repository at this point in the history
Version 0.1a3 is ready
  • Loading branch information
letuananh authored Apr 16, 2018
2 parents cf7cce7 + a5f5c70 commit 8b30d90
Show file tree
Hide file tree
Showing 27 changed files with 673 additions and 313 deletions.
3 changes: 2 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
include README.rst
include CHANGES.md
include LICENSE
recursive-include jamdict/scripts/ *.sql
recursive-include jamdict/data/ *.sql
recursive-include jamdict/data/ *.json
47 changes: 41 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,67 @@ Python library for manipulating Jim Breen's JMdict & KanjiDic2

# Installation

Homepage: [https://github.com/neocl/jamdict](https://github.com/neocl/jamdict)

```bash
pip install jamdict
# pip script sometimes doesn't work properly, so you may want to try this instead
python3 -m pip install jamdict

# initial setup (this command will create ~/.jamdict for you;
# it will also tell you where to copy the data files)
python3 -m jamdict.tools info

# to look up a word using command line
python3 -m jamdict.tools lookup たべる
========================================
Found entries
========================================
Entry: 1358280 | Kj: 食べる, 喰べる | Kn: たべる
--------------------
1. to eat ((Ichidan verb|transitive verb))
2. to live on (e.g. a salary)/to live off/to subsist on

========================================
Found characters
========================================
Char: 食 | Strokes: 9
--------------------
Readings: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む
Meanings: eat, food
Char: 喰 | Strokes: 12
--------------------
Readings: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう
Meanings: eat, drink, receive (a blow), (kokuji)
```

## Data
XML files (JMdict_e.xml, kanjidic2.xml) must be downloaded from the JMdict home page and copied into `~/local/jamdict/data`
XML files (JMdict_e.xml, kanjidic2.xml) must be downloaded and copied into `~/.jamdict/data`

I have mirrored these files on Google Drive, so you can download them from there too:
[https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk](https://drive.google.com/drive/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk)

Official websites:
- JMdict: [http://edrdg.org/jmdict/edict_doc.html](http://edrdg.org/jmdict/edict_doc.html)
- kanjidic2: [http://www.edrdg.org/kanjidic/kanjd2index.html](http://www.edrdg.org/kanjidic/kanjd2index.html)
- KRADFILE: [http://www.edrdg.org/krad/kradinf.html](http://www.edrdg.org/krad/kradinf.html)

Read more about JMdict here: http://www.csse.monash.edu.au/~jwb/edict.html

# Sample code

```python
>>> from jamdict import Jamdict
>>> jmd = Jamdict("/home/tuananh/local/jamdict/data/jamdict.db")
>>> jmd = Jamdict()
>>> jmd.lookup('食べる')
<jamdict.util.LookupResult object at 0x7fc70775a710>
'Entries: たべる(食べる):1. to eat2. to live on (e.g. a salary)/to live off/to subsist on | Chars: 食, 喰'
>>> result = jmd.lookup('食べる')
>>> print(result.entries)
[ID:1358280|たべる|食べる|1. to eat ((Ichidan verb|transitive verb))|2. to live on (e.g. a salary)/to live off/to subsist on]
[たべる (食べる) : 1. to eat 2. to live on (e.g. a salary)/to live off/to subsist on]
>>> for c in result.chars:
... print(c, c.rm_groups)
...
喰 [R: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)]
食 [R: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む | M: eat, food, manger, nourriture, alimento, comida, eclipse, comer, comer, comida, alimento]
喰 [R: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)]
```

See `jamdict_demo.py` and `jamdict/tools.py` for more information.
2 changes: 1 addition & 1 deletion data/README.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Copy JMDict dictionary file (JMdict_e.xml) here
Copy dictionary files (JMdict_e.xml, kanjidic2.xml, kradfile, etc.) here
24 changes: 6 additions & 18 deletions jamdict/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,26 +44,14 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


__author__ = "Le Tuan Anh"
__email__ = "tuananh.ke@gmail.com"
__copyright__ = "Copyright 2016, jamdict"
__credits__ = []
__license__ = "MIT License"
__description__ = "Python library for manipulating Jim Breen's JMdict & KanjiDic2"
__url__ = "https://github.com/neocl/jamdict"
__maintainer__ = "Le Tuan Anh"
__version_major__ = "0.1"
__version__ = "{}a1".format(__version_major__)
__version_long__ = "{} - Alpha".format(__version_major__)
__status__ = "Prototype"

########################################################################

from .__version__ import __author__, __email__, __copyright__, __maintainer__
from .__version__ import __credits__, __license__, __description__, __url__
from .__version__ import __version_major__, __version_long__, __version__, __status__

from .jmdict_sqlite import JMDictSQLite
from .kanjidic2_sqlite import KanjiDic2SQLite
from .util import Jamdict, JMDictXML, KanjiDic2XML

########################################################################

__all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML']
__all__ = ['Jamdict', 'JMDictSQLite', 'JMDictXML', 'KanjiDic2SQLite', 'KanjiDic2XML',
"__version__", "__author__", "__description__", "__copyright__"]
15 changes: 15 additions & 0 deletions jamdict/__version__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

# jamdict's package version information
# These names are imported by jamdict/__init__.py so that the package
# metadata lives in one module.
__author__ = "Le Tuan Anh"
__email__ = "tuananh.ke@gmail.com"
__copyright__ = "Copyright (c) 2016, Le Tuan Anh"
__credits__ = []
__license__ = "MIT License"
__description__ = "Python library for manipulating Jim Breen's JMdict & KanjiDic2"
__url__ = "https://github.com/neocl/jamdict"
__maintainer__ = "Le Tuan Anh"
# Major version string; the two values below are derived from it.
__version_major__ = "0.1"
# Full release version: the major version plus the "a3" suffix, i.e. "0.1a3".
__version__ = "{}a3".format(__version_major__)
# Human-readable version label, i.e. "0.1 - Alpha".
__version_long__ = "{} - Alpha".format(__version_major__)
__status__ = "Prototype"
91 changes: 91 additions & 0 deletions jamdict/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

'''
Jamdict configuration management
Latest version can be found at https://github.com/neocl/jamdict
@author: Le Tuan Anh <tuananh.ke@gmail.com>
@license: MIT
'''

# Copyright (c) 2016, Le Tuan Anh <tuananh.ke@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

########################################################################

import os
import logging

from chirptext import AppConfig
from chirptext.io import read_file, write_file

# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------

# Directory that contains this module; used to locate files bundled with the package.
MY_DIR = os.path.dirname(__file__)
# Bundled JSON template that read_config() copies out when no configuration is found.
CONFIG_TEMPLATE = os.path.join(MY_DIR, 'data', 'config_template.json')
# The JAMDICT_HOME environment variable, when set, replaces the package
# directory as the working directory handed to AppConfig.
__jamdict_home = os.environ.get('JAMDICT_HOME', MY_DIR)
# Module-wide configuration manager, created in JSON mode.
# NOTE(review): how AppConfig uses working_dir when searching for configuration
# files is defined in chirptext, not here -- confirm against that library.
__app_config = AppConfig('jamdict', mode=AppConfig.JSON, working_dir=__jamdict_home)


def getLogger():
    '''Return the logger registered under this module's name.'''
    module_logger = logging.getLogger(__name__)
    return module_logger


def _get_config_manager():
    '''Expose the module-level application config manager object.

    Internal helper: do not use it directly, call read_config() instead.
    '''
    manager = __app_config
    return manager


def read_config():
    '''Return the Jamdict configuration, generating a default file if needed.

    When the config manager has no configuration loaded and cannot locate a
    configuration file, the bundled template (CONFIG_TEMPLATE) is copied to
    ``~/.jamdict/config.json`` and a warning is logged. The configuration is
    then read from the config manager.

    Returns:
        The value of ``__app_config.config``.
        NOTE(review): presumably a dict-like object, since the sibling
        helpers call ``.get()`` on it -- confirm against chirptext's
        AppConfig.
    '''
    if not __app_config.config and not __app_config.locate_config():
        # need to create a config
        config_dir = os.path.expanduser('~/.jamdict/')
        # exist_ok=True replaces the former exists()-then-makedirs() pair,
        # which could fail if the directory appeared between the two calls.
        os.makedirs(config_dir, exist_ok=True)
        cfg_loc = os.path.join(config_dir, 'config.json')
        default_config = read_file(CONFIG_TEMPLATE)
        getLogger().warning("Jamdict configuration file could not be found. A new configuration file will be generated at {}".format(cfg_loc))
        getLogger().debug("Default config: {}".format(default_config))
        write_file(cfg_loc, default_config)
    # read config
    config = __app_config.config
    return config


def home_dir():
    '''Return the JAMDICT_HOME setting from the configuration.

    Falls back to '.' when the setting is absent. The value is returned
    exactly as stored in the configuration.
    '''
    return read_config().get('JAMDICT_HOME', '.')


def data_dir():
    '''Return the data directory setting with {JAMDICT_HOME} filled in.

    Uses the JAMDICT_DATA setting, or '{JAMDICT_HOME}/data' when it is
    absent, and substitutes the value of home_dir() into the placeholder.
    '''
    template = read_config().get('JAMDICT_DATA', '{JAMDICT_HOME}/data')
    return template.format(JAMDICT_HOME=home_dir())


def get_file(file_key):
    '''Return the configured path for file_key with {JAMDICT_DATA} filled in.

    The setting stored under file_key is looked up without a default, and the
    value of data_dir() is substituted into its {JAMDICT_DATA} placeholder.
    NOTE(review): if the key is missing and the lookup yields None, the
    ``.format`` call fails with AttributeError -- confirm callers only pass
    keys that exist in the configuration.
    '''
    settings = read_config()
    base_dir = data_dir()
    path_template = settings.get(file_key)
    return path_template.format(JAMDICT_DATA=base_dir)
8 changes: 8 additions & 0 deletions jamdict/data/config_template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"JAMDICT_HOME": "~/.jamdict",
"JAMDICT_DATA": "{JAMDICT_HOME}/data",
"JAMDICT_DB": "{JAMDICT_DATA}/jamdict.db",
"JMDICT_XML": "{JAMDICT_DATA}/JMdict_e.gz",
"KD2_XML": "{JAMDICT_DATA}/kanjidic2.xml.gz",
"KRADFILE": "{JAMDICT_DATA}/kradfile-u.gz"
}
File renamed without changes.
File renamed without changes.
77 changes: 53 additions & 24 deletions jamdict/jmdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
import logging
from lxml import etree

from chirptext import io as chio

logger = logging.getLogger(__name__)

########################################################################
Expand All @@ -64,8 +66,8 @@ class JMDEntry(object):
def __init__(self, idseq=''):
# A unique numeric sequence number for each entry
self.idseq = idseq # ent_seq
self.kanji_forms = [] # k_ele* => KanjiReading[]
self.kana_forms = [] # r_ele+ => KanaReading[]
self.kanji_forms = [] # k_ele* => KanjiForm[]
self.kana_forms = [] # r_ele+ => KanaForm[]
self.info = None # info? => EntryInfo
self.senses = [] # sense+

Expand All @@ -80,18 +82,28 @@ def set_info(self, info):
logging.warning("WARNING: multiple info tag")
self.info = info

def __repr__(self):
tmp = ['ID:%s' % self.idseq]
def text(self, compact=True, separator=' '):
tmp = []
if not compact:
tmp.append('[id#%s]' % self.idseq)
if self.kana_forms:
tmp.append(self.kana_forms[0].text)
if self.kanji_forms:
tmp.append(self.kanji_forms[0].text)
for sense, idx in zip(self.senses, range(len(self.senses))):
tmp.append('{i}. {s}'.format(i=idx + 1, s=sense))
return '|'.join(tmp)
tmp.append("({})".format(self.kanji_forms[0].text))
if self.senses:
tmp.append(':')
if len(self.senses) == 1:
tmp.append(self.senses[0].text(compact=compact))
else:
for sense, idx in zip(self.senses, range(len(self.senses))):
tmp.append('{i}. {s}'.format(i=idx + 1, s=sense.text(compact=compact)))
return separator.join(tmp)

def __repr__(self):
return self.text(compact=True)

def __str__(self):
return repr(self)
return self.text(compact=False)

def to_json(self):
ed = {'idseq': self.idseq,
Expand All @@ -103,7 +115,7 @@ def to_json(self):
return ed


class KanjiReading(object):
class KanjiForm(object):
''' The kanji element, or in its absence, the reading element, is
the defining component of each entry.
The overwhelming majority of entries will have a single kanji
Expand Down Expand Up @@ -174,8 +186,14 @@ def to_json(self):
kjd['pri'] = self.pri
return kjd

def __repr__(self):
return str(self)

def __str__(self):
return self.text

class KanaReading(object):

class KanaForm(object):
'''<!ELEMENT r_ele (reb, re_nokanji?, re_restr*, re_inf*, re_pri*)>
The reading element typically contains the valid readings
of the word(s) in the kanji element using modern kanadzukai.
Expand Down Expand Up @@ -228,6 +246,12 @@ def to_json(self):
knd['pri'] = self.pri
return knd

def __repr__(self):
return str(self)

def __str__(self):
return self.text


class EntryInfo(object):
'''General coded information relating to the entry as a whole.
Expand Down Expand Up @@ -375,8 +399,11 @@ def __repr__(self):
return str(self)

def __str__(self):
return self.text(compact=False)

def text(self, compact=True):
tmp = [str(x) for x in self.gloss]
if self.pos:
if not compact and self.pos:
return '{gloss} ({pos})'.format(gloss='/'.join(tmp), pos=('(%s)' % '|'.join(self.pos)))
else:
return '/'.join(tmp)
Expand Down Expand Up @@ -513,16 +540,18 @@ def __init__(self):
def parse_file(self, jmdict_file_path):
''' Parse JMDict_e.xml file and return a list of JMDEntry objects
'''
logger.debug('Loading data from file: %s' % (os.path.abspath(jmdict_file_path)))

tree = etree.iterparse(jmdict_file_path)
entries = []
for event, element in tree:
if event == 'end' and element.tag == 'entry':
entries.append(self.parse_entry_tag(element))
# and then we can clear the element to save memory
element.clear()
return entries
actual_path = os.path.abspath(os.path.expanduser(jmdict_file_path))
logger.debug('Loading data from file: {}'.format(actual_path))

with chio.open(actual_path, mode='rb') as jmfile:
tree = etree.iterparse(jmfile)
entries = []
for event, element in tree:
if event == 'end' and element.tag == 'entry':
entries.append(self.parse_entry_tag(element))
# and then we can clear the element to save memory
element.clear()
return entries

def parse_entry_tag(self, etag):
'''Parse a lxml XML Node and generate a JMDEntry entry'''
Expand Down Expand Up @@ -559,7 +588,7 @@ def get_single(self, tag_name, a_tag):
return children[0]

def parse_k_ele(self, k_ele, entry):
kr = KanjiReading()
kr = KanjiForm()
for child in k_ele:
if child.tag == 'keb':
kr.set_text(child.text)
Expand All @@ -574,7 +603,7 @@ def parse_k_ele(self, k_ele, entry):
return kr

def parse_r_ele(self, r_ele, entry):
kr = KanaReading()
kr = KanaForm()
for child in r_ele:
if child.tag == 'reb':
kr.set_text(child.text)
Expand Down
Loading

0 comments on commit 8b30d90

Please sign in to comment.