diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 9986c780f3..b3f388a04d 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -10,6 +10,7 @@ from __future__ import with_statement import logging +from collections import Counter from gensim import utils from gensim.corpora import IndexedCorpus @@ -159,25 +160,24 @@ def line2doc(self, line): words = self.line2words(line) if self.use_wordids: - # get all distinct terms in this document, ignore unknown words - uniq_words = set(words).intersection(iterkeys(self.word2id)) - # the following creates a unique list of words *in the same order* # as they were in the input. when iterating over the documents, # the (word, count) pairs will appear in the same order as they # were in the input (bar duplicates), which looks better. # if this was not needed, we might as well have used useWords = set(words) - use_words, marker = [], set() + use_words, counts = [], Counter() for word in words: - if (word in uniq_words) and (word not in marker): + if word not in self.word2id: + continue + if word not in counts: use_words.append(word) - marker.add(word) + counts[word] += 1 # construct a list of (wordIndex, wordFrequency) 2-tuples - doc = [(self.word2id.get(w), words.count(w)) for w in use_words] + doc = [(self.word2id[w], counts[w]) for w in use_words] else: - uniq_words = set(words) - # construct a list of (word, wordFrequency) 2-tuples - doc = [(w, words.count(w)) for w in uniq_words] + word_freqs = Counter(words) + # construct a list of (word, wordFrequency) 2-tuples + doc = list(word_freqs.items()) # return the document, then forget it and move on to the next one # note that this way, only one doc is stored in memory at a time, not the whole corpus diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index db28b9e632..d1149ee585 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -126,10 +126,10 @@ def line2doc(self, line): [(3, 1), (4, 1)] """ - splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] - docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] + split_line = utils.to_unicode(line).strip().split(maxsplit=2) + docid, doclang, words = split_line[0], split_line[1], split_line[2] - doc = super(MalletCorpus, self).line2doc(' '.join(words)) + doc = super(MalletCorpus, self).line2doc(words) if self.metadata: return doc, (docid, doclang)