Improve speed of FastTextKeyedVectors.__contains__ (#1499)

* Improve speed of FastTextKeyedVectors __contains__

  The current implementation of `__contains__` in FastTextKeyedVectors is `O(n*m)`, where `n` is the number of character ngrams in the query word and `m` is the size of the vocabulary. This is very slow for large corpora. The new implementation is `O(n)`.

* any() was unnecessary.

* Update variable name and docstring to improve clarity.
piskvorky · Sep 11, 2017 · 224566c · 224566c
1 parent db9e230
commit 224566c
Showing 1 changed file with 4 additions and 8 deletions.
diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py
@@ -121,18 +121,14 @@ def init_sims(self, replace=False):
 
     def __contains__(self, word):
         """
-        Check if word is present in the vocabulary, or if any word ngrams are present. A vector for the word is
-        guaranteed to exist if `__contains__` returns True.
-
+        Check if `word` or any character ngrams in `word` are present in the vocabulary.
+        A vector for the word is guaranteed to exist if `__contains__` returns True.
         """
         if word in self.vocab:
             return True
         else:
-            word_ngrams = set(FastText.compute_ngrams(word, self.min_n, self.max_n))
-            if len(word_ngrams & set(self.ngrams.keys())):
-                return True
-            else:
-                return False
+            char_ngrams = FastText.compute_ngrams(word, self.min_n, self.max_n)
+            return any(ng in self.ngrams for ng in char_ngrams)
 
 
 class FastText(Word2Vec):