Skip to content

Commit

Permalink
Add doc2idx method for gensim.corpora.Dictionary. Fix piskvorky#1634 (p…
Browse files Browse the repository at this point in the history
…iskvorky#1720)

* define doc2idx to convert a document to a vector of indexes per the dictionary

* update documentation

* change TextCorpus to add a mode for index-vector format output, and add a test case for the change

* fixing doc string

* fix doc string

* fix doc string

* removing trailing white spaces

* removing trailing white spaces

* changes as per review

* change as per review.

reverting changes to TextCorpus as discussed
  • Loading branch information
roopalgarg authored and KMarie1 committed Nov 26, 2017
1 parent 02d848f commit 76afe9e
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 1 deletion.
40 changes: 40 additions & 0 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,46 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
else:
return result

def doc2idx(self, document, unknown_word_index=-1):
    """Translate a tokenized `document` into the list of its `token_id` values.

    Every word is expected to be an already **tokenized and normalized** string
    (either unicode or utf8-encoded). The words are looked up as given, so any
    tokenization, stemming etc. has to happen before this method is called.
    Words that are not present in the dictionary are mapped to
    `unknown_word_index`.

    Notes
    -----
    This method only reads from the dictionary; it does not modify it.

    Parameters
    ----------
    document : list of str
        Tokenized, normalized and preprocessed words.
    unknown_word_index : int, optional
        Index reported for words that are not in the dictionary, -1 by default.

    Returns
    -------
    list of int
        One dictionary index per word of `document`, in the original word order.

    Raises
    ------
    TypeError
        If `document` is a single string instead of a sequence of tokens.

    Examples
    --------
    >>> dictionary_obj = Dictionary()
    >>> dictionary_obj.token2id = {'computer': 0, 'human': 1, 'response': 2, 'survey': 3}
    >>> dictionary_obj.doc2idx(document=['human', 'computer', 'interface'], unknown_word_index=-1)
    [1, 0, -1]

    """
    # Iterating over a bare string would silently yield single characters, so refuse it outright.
    if isinstance(document, string_types):
        raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")

    # First pass: make sure every token is unicode, decoding utf8-encoded input where needed.
    tokens = []
    for token in document:
        if not isinstance(token, unicode):
            token = unicode(token, 'utf-8')
        tokens.append(token)

    # Second pass: look each token up, falling back to the caller-supplied index for unknown words.
    indexes = []
    for token in tokens:
        indexes.append(self.token2id.get(token, unknown_word_index))
    return indexes

def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
"""
Filter out tokens that appear in
Expand Down
3 changes: 2 additions & 1 deletion gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ class TextCorpus(interfaces.CorpusABC):
6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords
"""
def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None):
def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None,
token_filters=None):
"""
Args:
input (str): path to top-level directory to traverse for corpus documents.
Expand Down

0 comments on commit 76afe9e

Please sign in to comment.