Skip to content

Commit

Permalink
Merge pull request #12 from neocl/dev
Browse files Browse the repository at this point in the history
Improve search() performance:

    use == instead of LIKE when query does not contain any wildcard character
    Reuse SQLite execution context by default
  • Loading branch information
letuananh authored Jul 8, 2019
2 parents 06ccaae + 66ff215 commit e70124c
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 17 deletions.
44 changes: 36 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,44 @@ Official website
```python
>>> from jamdict import Jamdict
>>> jmd = Jamdict()
>>> jmd.lookup('食べる')
'Entries: たべる(食べる):1. to eat2. to live on (e.g. a salary)/to live off/to subsist on | Chars: 食, 喰'
>>> result = jmd.lookup('食べる')
>>> print(result.entries)
[たべる (食べる) : 1. to eat 2. to live on (e.g. a salary)/to live off/to subsist on]
# use wildcard matching to find anything that starts with 食べ and ends with る
>>> result = jmd.lookup('食べ%る')
# print all found word entries
>>> for entry in result.entries:
... print(entry)
...
[id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on
[id#1358300] たべすぎる (食べ過ぎる) : to overeat ((Ichidan verb|transitive verb))
[id#1852290] たべつける (食べ付ける) : to be used to eating ((Ichidan verb|transitive verb))
[id#2145280] たべはじめる (食べ始める) : to start eating ((Ichidan verb))
[id#2449430] たべかける (食べ掛ける) : to start eating ((Ichidan verb))
[id#2671010] たべなれる (食べ慣れる) : to be used to eating/to become used to eating/to be accustomed to eating/to acquire a taste for ((Ichidan verb))
[id#2765050] たべられる (食べられる) : 1. to be able to eat ((Ichidan verb|intransitive verb)) 2. to be edible/to be good to eat ((pre-noun adjectival (rentaishi)))
[id#2795790] たべくらべる (食べ比べる) : to taste and compare several dishes (or foods) of the same type ((Ichidan verb|transitive verb))
[id#2807470] たべあわせる (食べ合わせる) : to eat together (various foods) ((Ichidan verb))
# print all related characters
>>> for c in result.chars:
... print(c, c.rm_groups)
... print(repr(c))
...
食 [R: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む | M: eat, food, manger, nourriture, alimento, comida, eclipse, comer, comer, comida, alimento]
喰 [R: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)]
食:9:eat,food
喰:12:eat,drink,receive (a blow),(kokuji)
過:12:overdo,exceed,go beyond,error
付:5:adhere,attach,refer to,append
始:8:commence,begin
掛:11:hang,suspend,depend,arrive at,tax,pour
慣:14:accustomed,get used to,become experienced
比:4:compare,race,ratio,Philippines
合:6:fit,suit,join,0.1

# use exact matching to increase searching speed (thanks to @reem-codes)
>>> result = jmd.lookup('食べる')

>>> for entry in result.entries:
... print(entry)
...
[id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on
```
>>> for c in result.chars:
... print(repr(c))

See `jamdict_demo.py` and `jamdict/tools.py` for more information.
2 changes: 1 addition & 1 deletion jamdict/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
__url__ = "https://github.com/neocl/jamdict"
__maintainer__ = "Le Tuan Anh"
__version_major__ = "0.1"
__version__ = "{}a4".format(__version_major__)
__version__ = "{}a5".format(__version_major__)
__version_long__ = "{} - Alpha".format(__version_major__)
__status__ = "Prototype"
11 changes: 8 additions & 3 deletions jamdict/jmdict_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,17 @@ def update_meta(self, version, url, ctx=None):
ju.value = url
ctx.meta.save(ju)

def search(self, query, ctx=None):
def search(self, query, ctx=None, **kwargs):
# ensure context
if ctx is None:
with self.ctx() as ctx:
return self.search(query, ctx=ctx)
where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?)"
_is_wildcard_search = '_' in query or '@' in query or '%' in query
if _is_wildcard_search:
where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?)"
else:
where = "idseq IN (SELECT idseq FROM Kanji WHERE text == ?) OR idseq IN (SELECT idseq FROM Kana WHERE text == ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text == ?)"
getLogger().debug(where)
params = [query, query, query]
try:
if query.startswith('id#'):
Expand All @@ -155,7 +160,7 @@ def search(self, query, ctx=None):
print("Searching by ID: {}".format(query_int))
where = "idseq = ?"
params = [query_int]
except:
except Exception:
pass
# else (a context is provided)
eids = self.Entry.select(where, params, ctx=ctx)
Expand Down
43 changes: 40 additions & 3 deletions jamdict/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,14 @@ def __init__(self, data_source, setup_script=None, setup_file=None, *args, **kwa

class Jamdict(object):

def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True, auto_expand=True):
def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True, auto_expand=True, reuse_ctx=True, **kwargs):
# file paths configuration
self.auto_expand = auto_expand
self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') if auto_config else None
self.kd2_file = kd2_file if kd2_file else config.get_file('JAMDICT_DB') if auto_config else None
if not self.db_file or not os.path.isfile(self.db_file):
getLogger().warning("JAMDICT_DB could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first")
if not self.kd2_file or os.path.isfile(self.kd2_file):
if not self.kd2_file or not os.path.isfile(self.kd2_file):
getLogger().warning("Kanjidic2 database could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first")
self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') if auto_config else None
self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') if auto_config else None
Expand All @@ -127,6 +127,21 @@ def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=
self._kd2_sqlite = None
self._jmd_xml = None
self._kd2_xml = None
self.reuse_ctx = reuse_ctx
self.__jm_ctx = None
try:
if self.reuse_ctx and self.db_file and os.path.isfile(self.db_file):
self.__jm_ctx = self.jmdict.ctx()
except Exception:
getLogger().warning("JMdict data could not be accessed.")

def __del__(self):
    """Best-effort close of the default SQLite context on finalization.

    Python may call ``__del__`` on an object whose ``__init__`` raised before
    the context attribute was ever assigned, so the attribute is read
    defensively. Any error raised while closing is deliberately ignored:
    a finalizer has no caller to report to.
    """
    try:
        jm_ctx = self.__jm_ctx
    except AttributeError:
        # __init__ never got far enough to create the attribute
        return
    if jm_ctx is None:
        return
    try:
        # try to close default SQLite context if needed
        jm_ctx.close()
    except Exception:
        pass

@property
def db_file(self):
Expand All @@ -139,6 +154,17 @@ def db_file(self, value):
else:
self.__db_file = None

@property
def kd2_file(self):
    """Path of the Kanjidic2 database file, or None when no path is set."""
    return self.__kd2_file

@kd2_file.setter
def kd2_file(self, value):
    """Store the Kanjidic2 database path.

    An empty or None ``value`` clears the path. Otherwise, when
    ``self.auto_expand`` is true the path is user-expanded (``~``) and made
    absolute; when it is false the path is stored exactly as given.
    """
    if not value:
        self.__kd2_file = None
    elif self.auto_expand:
        self.__kd2_file = os.path.abspath(os.path.expanduser(value))
    else:
        # auto_expand disabled: keep the caller's path untouched rather
        # than silently discarding it
        self.__kd2_file = value

@property
def jmdict(self):
if not self._db_sqlite and self.db_file:
Expand Down Expand Up @@ -208,11 +234,22 @@ def get_entry(self, idseq):
else:
raise LookupError("There is no backend data available")

def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None):
def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, **kwargs):
''' Search words and characters and return a LookupResult object.
Keyword arguments:
query --- Text to query, may contains wildcard characters
exact_match --- use exact SQLite matching (==) instead of wildcard matching (LIKE)
strict_lookup --- Only look up the Kanji characters in query (i.e. discard characters from variants)
lookup_chars --- set lookup_chars to False to disable character lookup
ctx --- Database access context, can be reused for better performance
'''
if not self.is_available():
raise LookupError("There is no backend data available")
elif not query:
raise ValueError("Query cannot be empty")
if ctx is None and self.reuse_ctx and self.__jm_ctx is not None:
ctx = self.__jm_ctx
# Lookup words
entries = []
chars = []
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def read(*filenames, **kwargs):
author_email=pkg_info['__email__'],
description=pkg_info['__description__'],
long_description=long_description,
long_description_content_type='text/markdown',
packages=['jamdict'],
package_data={'jamdict': ['data/*.sql', 'data/*.json']},
include_package_data=True,
Expand Down
4 changes: 2 additions & 2 deletions test/test_jmdict_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,11 @@ def test_search(self):
self.assertEqual(len(es), 2)
getLogger().info('あの: {}'.format('|'.join([str(x) for x in es])))
# Search by kanji
es = self.db.search('%子%', ctx)
es = self.db.search('%子%', ctx, exact_match=False)
self.assertEqual(len(es), 4)
getLogger().info('%子%: {}'.format('|'.join([str(x) for x in es])))
# search by meaning
es = self.db.search('%confections%', ctx)
es = self.db.search('%confections%', ctx, exact_match=False)
self.assertTrue(es)
getLogger().info('%confections%: {}'.format('|'.join([str(x) for x in es])))

Expand Down

0 comments on commit e70124c

Please sign in to comment.