Add doc2idx method for gensim.corpora.Dictionary. Fix piskvorky#1634 (piskvorky#1720)

* define doc2idx to convert a document to a vector of indexes per the dictionary
* update documentation
* changes to textcorpus to add a mode for index vector format output. adding test case for the changes
* fixing doc string
* fix doc string
* fix doc string
* removing trailing white spaces
* removing trailing white spaces
* changes as per review
* change as per review. reverting changes to TextCorpus as discussed
VaiyeBe · Nov 26, 2017 · 76afe9e · 76afe9e
1 parent 02d848f
commit 76afe9e
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 1 deletion.
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
@@ -173,6 +173,46 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
         else:
             return result
 
+    def doc2idx(self, document, unknown_word_index=-1):
+        """Convert `document` (a list of words) into the list of their `token_id` indexes.
+
+        Each word is assumed to be a **tokenized and normalized** string (either unicode or utf8-encoded).
+        No further preprocessing is done on the words in `document`; apply tokenization, stemming etc. before calling
+        this method.
+
+        Replace all unknown words, i.e. words not in the dictionary, with the index set via `unknown_word_index`
+        (defaults to -1).
+
+        Notes
+        -----
+        This function is `const`, aka read-only: it only looks words up in `token2id` and never adds new ones.
+
+        Parameters
+        ----------
+        document : list of str
+            Tokenized, normalized and preprocessed words. A single string is rejected with `TypeError`.
+        unknown_word_index : int, optional
+            Index returned for words that are not in the dictionary.
+
+        Returns
+        -------
+        list of int
+            Indexes in the dictionary for the words in `document`, preserving the order of words.
+
+        Examples
+        --------
+        >>> dictionary_obj = Dictionary()
+        >>> dictionary_obj.token2id = {'computer': 0, 'human': 1, 'response': 2, 'survey': 3}
+        >>> dictionary_obj.doc2idx(document=['human', 'computer', 'interface'], unknown_word_index=-1)
+        [1, 0, -1]
+
+        """
+        if isinstance(document, string_types):
+            raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")
+
+        document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
+        return [self.token2id.get(word, unknown_word_index) for word in document]
+
     def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
         """
         Filter out tokens that appear in

diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
@@ -112,7 +112,8 @@ class TextCorpus(interfaces.CorpusABC):
     6.  remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords
 
     """
-    def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None):
+    def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None,
+                 token_filters=None):
         """
         Args:
             input (str): path to top-level directory to traverse for corpus documents.