Refactor the line2doc methods of LowCorpus and MalletCorpus to be more efficient

piskvorky · Nov 17, 2018 · 5ba87a5 · 5ba87a5
1 parent 7e4965e
commit 5ba87a5
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 13 deletions.
diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
@@ -10,6 +10,7 @@
 from __future__ import with_statement
 
 import logging
+from collections import Counter
 
 from gensim import utils
 from gensim.corpora import IndexedCorpus
@@ -159,25 +160,24 @@ def line2doc(self, line):
         words = self.line2words(line)
 
         if self.use_wordids:
-            # get all distinct terms in this document, ignore unknown words
-            uniq_words = set(words).intersection(iterkeys(self.word2id))
-
             # the following creates a unique list of words *in the same order*
             # as they were in the input. when iterating over the documents,
             # the (word, count) pairs will appear in the same order as they
             # were in the input (bar duplicates), which looks better.
             # if this was not needed, we might as well have used useWords = set(words)
-            use_words, marker = [], set()
+            use_words, counts = [], Counter()
             for word in words:
-                if (word in uniq_words) and (word not in marker):
+                if word not in self.word2id:
+                    continue
+                if word not in counts:
                     use_words.append(word)
-                    marker.add(word)
+                counts[word] += 1
             # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
+            doc = [(self.word2id[w], counts[w]) for w in use_words]
         else:
-            uniq_words = set(words)
-            # construct a list of (word, wordFrequency) 2-tuples
-            doc = [(w, words.count(w)) for w in uniq_words]
+            word_freqs = Counter(words)
+            # construct a list of (word, wordFrequency) 2-tuples    
+            doc = list(word_freqs.items())
 
         # return the document, then forget it and move on to the next one
         # note that this way, only one doc is stored in memory at a time, not the whole corpus

diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
@@ -126,10 +126,10 @@ def line2doc(self, line):
             [(3, 1), (4, 1)]
 
         """
-        splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
-        docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
+        split_line = utils.to_unicode(line).strip().split(maxsplit=2)
+        docid, doclang, words = split_line[0], split_line[1], split_line[2]
 
-        doc = super(MalletCorpus, self).line2doc(' '.join(words))
+        doc = super(MalletCorpus, self).line2doc(words)
 
         if self.metadata:
             return doc, (docid, doclang)