Merge pull request #12 from neocl/dev

Improve search() performance: use == instead of LIKE when query does not contain any wildcard character. Reuse SQLite execution context by default.
neocl · Jul 8, 2019 · e70124c · e70124c
2 parents 06ccaae + 66ff215
commit e70124c
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -60,16 +60,44 @@ Official website
 ```python
 >>> from jamdict import Jamdict
 >>> jmd = Jamdict()
->>> jmd.lookup('食べる')
-'Entries: たべる(食べる):1. to eat2. to live on (e.g. a salary)/to live off/to subsist on | Chars: 食, 喰'
->>> result = jmd.lookup('食べる')
->>> print(result.entries)
-[たべる (食べる) : 1. to eat 2. to live on (e.g. a salary)/to live off/to subsist on]
+# use wildcard matching to find anything that starts with 食べ and ends with る
+>>> result = jmd.lookup('食べ%る')
+# print all found word entries
+>>> for entry in result.entries:
+...     print(entry)
+...
+[id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on
+[id#1358300] たべすぎる (食べ過ぎる) : to overeat ((Ichidan verb|transitive verb))
+[id#1852290] たべつける (食べ付ける) : to be used to eating ((Ichidan verb|transitive verb))
+[id#2145280] たべはじめる (食べ始める) : to start eating ((Ichidan verb))
+[id#2449430] たべかける (食べ掛ける) : to start eating ((Ichidan verb))
+[id#2671010] たべなれる (食べ慣れる) : to be used to eating/to become used to eating/to be accustomed to eating/to acquire a taste for ((Ichidan verb))
+[id#2765050] たべられる (食べられる) : 1. to be able to eat ((Ichidan verb|intransitive verb)) 2. to be edible/to be good to eat ((pre-noun adjectival (rentaishi)))
+[id#2795790] たべくらべる (食べ比べる) : to taste and compare several dishes (or foods) of the same type ((Ichidan verb|transitive verb))
+[id#2807470] たべあわせる (食べ合わせる) : to eat together (various foods) ((Ichidan verb))
+# print all related characters
 >>> for c in result.chars:
-...     print(c, c.rm_groups)
+...     print(repr(c))
 ... 
-食 [R: shi2, si4, sig, sa, 식, 사, Thực, Tự, ショク, ジキ, く.う, く.らう, た.べる, は.む | M: eat, food, manger, nourriture, alimento, comida, eclipse, comer, comer, comida, alimento]
-喰 [R: shi2, si4, sig, 식, Thặc, Thực, Tự, く.う, く.らう | M: eat, drink, receive (a blow), (kokuji)]
+食:9:eat,food
+喰:12:eat,drink,receive (a blow),(kokuji)
+過:12:overdo,exceed,go beyond,error
+付:5:adhere,attach,refer to,append
+始:8:commence,begin
+掛:11:hang,suspend,depend,arrive at,tax,pour
+慣:14:accustomed,get used to,become experienced
+比:4:compare,race,ratio,Philippines
+合:6:fit,suit,join,0.1
+
+# use exact matching to increase searching speed (thanks to @reem-codes)
+>>> result = jmd.lookup('食べる')
+
+>>> for entry in result.entries:
+...     print(entry)
+... 
+[id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on
+>>> for c in result.chars:
+...     print(repr(c))
 ```
 
 See `jamdict_demo.py` and `jamdict/tools.py` for more information.
diff --git a/jamdict/__version__.py b/jamdict/__version__.py
@@ -10,6 +10,6 @@
 __url__ = "https://github.com/neocl/jamdict"
 __maintainer__ = "Le Tuan Anh"
 __version_major__ = "0.1"
-__version__ = "{}a4".format(__version_major__)
+__version__ = "{}a5".format(__version_major__)
 __version_long__ = "{} - Alpha".format(__version_major__)
 __status__ = "Prototype"
diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py
@@ -141,12 +141,17 @@ def update_meta(self, version, url, ctx=None):
             ju.value = url
             ctx.meta.save(ju)
 
-    def search(self, query, ctx=None):
+    def search(self, query, ctx=None, **kwargs):
         # ensure context
         if ctx is None:
             with self.ctx() as ctx:
                 return self.search(query, ctx=ctx)
-        where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?)"
+        _is_wildcard_search = '_' in query or '@' in query or '%' in query
+        if _is_wildcard_search:
+            where = "idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?)"
+        else:
+            where = "idseq IN (SELECT idseq FROM Kanji WHERE text == ?) OR idseq IN (SELECT idseq FROM Kana WHERE text == ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text == ?)"
+        getLogger().debug(where)
         params = [query, query, query]
         try:
             if query.startswith('id#'):
@@ -155,7 +160,7 @@ def search(self, query, ctx=None):
                     print("Searching by ID: {}".format(query_int))
                     where = "idseq = ?"
                     params = [query_int]
-        except:
+        except Exception:
             pass
         # else (a context is provided)
         eids = self.Entry.select(where, params, ctx=ctx)

diff --git a/jamdict/util.py b/jamdict/util.py
@@ -111,14 +111,14 @@ def __init__(self, data_source, setup_script=None, setup_file=None, *args, **kwa
 
 class Jamdict(object):
 
-    def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True, auto_expand=True):
+    def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True, auto_expand=True, reuse_ctx=True, **kwargs):
         # file paths configuration
         self.auto_expand = auto_expand
         self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') if auto_config else None
         self.kd2_file = kd2_file if kd2_file else config.get_file('JAMDICT_DB') if auto_config else None
         if not self.db_file or not os.path.isfile(self.db_file):
             getLogger().warning("JAMDICT_DB could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first")
-        if not self.kd2_file or os.path.isfile(self.kd2_file):
+        if not self.kd2_file or not os.path.isfile(self.kd2_file):
             getLogger().warning("Kanjidic2 database could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict.tools import` first")
         self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') if auto_config else None
         self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') if auto_config else None
@@ -127,6 +127,21 @@ def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=
         self._kd2_sqlite = None
         self._jmd_xml = None
         self._kd2_xml = None
+        self.reuse_ctx = reuse_ctx
+        self.__jm_ctx = None
+        try:
+            if self.reuse_ctx and self.db_file and os.path.isfile(self.db_file):
+                self.__jm_ctx = self.jmdict.ctx()
+        except Exception:
+            getLogger().warning("JMdict data could not be accessed.")
+
+    def __del__(self):
+        if self.__jm_ctx is not None:
+            try:
+                # try to close default SQLite context if needed
+                self.__jm_ctx.close()
+            except Exception:
+                pass
 
     @property
     def db_file(self):
@@ -139,6 +154,17 @@ def db_file(self, value):
         else:
             self.__db_file = None
 
+    @property
+    def kd2_file(self):
+        return self.__kd2_file
+
+    @kd2_file.setter
+    def kd2_file(self, value):
+        if self.auto_expand and value:
+            self.__kd2_file = os.path.abspath(os.path.expanduser(value))
+        else:
+            self.__kd2_file = None
+
     @property
     def jmdict(self):
         if not self._db_sqlite and self.db_file:
@@ -208,11 +234,22 @@ def get_entry(self, idseq):
         else:
             raise LookupError("There is no backend data available")
 
-    def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None):
+    def lookup(self, query, strict_lookup=False, lookup_chars=True, ctx=None, **kwargs):
+        ''' Search words and characters and return a LookupResult object.
+
+        Keyword arguments:
+        query --- Text to query, may contain wildcard characters
+        exact_match --- use exact SQLite matching (==) instead of wildcard matching (LIKE)
+        strict_lookup --- Only look up the Kanji characters in query (i.e. discard characters from variants)
+        lookup_chars --- set lookup_chars to False to disable character lookup
+        ctx --- Database access context, can be reused for better performance
+        '''
         if not self.is_available():
             raise LookupError("There is no backend data available")
         elif not query:
             raise ValueError("Query cannot be empty")
+        if ctx is None and self.reuse_ctx and self.__jm_ctx is not None:
+            ctx = self.__jm_ctx
         # Lookup words
         entries = []
         chars = []

diff --git a/setup.py b/setup.py
@@ -53,6 +53,7 @@ def read(*filenames, **kwargs):
     author_email=pkg_info['__email__'],
     description=pkg_info['__description__'],
     long_description=long_description,
+    long_description_content_type='text/markdown',
     packages=['jamdict'],
     package_data={'jamdict': ['data/*.sql', 'data/*.json']},
     include_package_data=True,

diff --git a/test/test_jmdict_sqlite.py b/test/test_jmdict_sqlite.py
@@ -125,11 +125,11 @@ def test_search(self):
             self.assertEqual(len(es), 2)
             getLogger().info('あの: {}'.format('|'.join([str(x) for x in es])))
             # Search by kanji
-            es = self.db.search('%子%', ctx)
+            es = self.db.search('%子%', ctx, exact_match=False)
             self.assertEqual(len(es), 4)
             getLogger().info('%子%: {}'.format('|'.join([str(x) for x in es])))
             # search by meaning
-            es = self.db.search('%confections%', ctx)
+            es = self.db.search('%confections%', ctx, exact_match=False)
             self.assertTrue(es)
             getLogger().info('%confections%: {}'.format('|'.join([str(x) for x in es])))