From fcc9bc1c2a4fad8d369945e05366e0c6d6200fa1 Mon Sep 17 00:00:00 2001
From: horpto <__singleton__@hackerdom.ru>
Date: Sat, 17 Nov 2018 04:53:27 +0500
Subject: [PATCH 1/4] Optimize the line2doc method of LowCorpus and
 MalletCorpus

---
 gensim/corpora/lowcorpus.py    | 19 +++++++++----------
 gensim/corpora/malletcorpus.py |  7 ++++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
index 9986c780f3..ae7acb6377 100644
--- a/gensim/corpora/lowcorpus.py
+++ b/gensim/corpora/lowcorpus.py
@@ -10,10 +10,10 @@
 from __future__ import with_statement
 
 import logging
+from collections import Counter
 
 from gensim import utils
 from gensim.corpora import IndexedCorpus
-from six import iterkeys
 from six.moves import xrange, zip as izip
 
 
@@ -159,25 +159,24 @@ def line2doc(self, line):
         words = self.line2words(line)
 
         if self.use_wordids:
-            # get all distinct terms in this document, ignore unknown words
-            uniq_words = set(words).intersection(iterkeys(self.word2id))
-
             # the following creates a unique list of words *in the same order*
             # as they were in the input. when iterating over the documents,
             # the (word, count) pairs will appear in the same order as they
             # were in the input (bar duplicates), which looks better.
             # if this was not needed, we might as well have used useWords = set(words)
-            use_words, marker = [], set()
+            use_words, counts = [], Counter()
             for word in words:
-                if (word in uniq_words) and (word not in marker):
+                if word not in self.word2id:
+                    continue
+                if word not in counts:
                     use_words.append(word)
-                    marker.add(word)
+                counts[word] += 1
             # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
+            doc = [(self.word2id[w], counts[w]) for w in use_words]
         else:
-            uniq_words = set(words)
+            word_freqs = Counter(words)
             # construct a list of (word, wordFrequency) 2-tuples
-            doc = [(w, words.count(w)) for w in uniq_words]
+            doc = list(word_freqs.items())
 
         # return the document, then forget it and move on to the next one
         # note that this way, only one doc is stored in memory at a time, not the whole corpus
diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
index db28b9e632..b8922f3cda 100644
--- a/gensim/corpora/malletcorpus.py
+++ b/gensim/corpora/malletcorpus.py
@@ -126,10 +126,11 @@ def line2doc(self, line):
             [(3, 1), (4, 1)]
 
         """
-        splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
-        docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
+        split_line = utils.to_unicode(line).strip().split(None, 2)
+        docid, doclang = split_line[0], split_line[1]
+        words = split_line[2] if len(split_line) >= 3 else ''
 
-        doc = super(MalletCorpus, self).line2doc(' '.join(words))
+        doc = super(MalletCorpus, self).line2doc(words)
 
         if self.metadata:
             return doc, (docid, doclang)

From 8165920c369bb9745854a6c3cc6fbff36f232271 Mon Sep 17 00:00:00 2001
From: horpto <__singleton__@hackerdom.ru>
Date: Thu, 20 Dec 2018 06:15:30 +0500
Subject: [PATCH 2/4] Fix build: remove leftover iterkeys import

---
 gensim/corpora/lowcorpus.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
index e01e30b9d0..c67c34b700 100644
--- a/gensim/corpora/lowcorpus.py
+++ b/gensim/corpora/lowcorpus.py
@@ -14,7 +14,6 @@
 
 from gensim import utils
 from gensim.corpora import IndexedCorpus
-from six import iterkeys
 from six.moves import zip, range
 
 

From f05da54a07e7e0a190861587ac7b85cb9066cd22 Mon Sep 17 00:00:00 2001
From: horpto <__singleton__@hackerdom.ru>
Date: Thu, 10 Jan 2019 23:40:14 +0500
Subject: [PATCH 3/4] Add line2doc tests for LowCorpus and MalletCorpus

---
 gensim/test/test_corpora.py | 50 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 5efc8f2b3f..7f86e8dde7 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -420,11 +420,30 @@ def test_save_format_for_dtm(self):
 
 class TestLowCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = 'mom  wash  window window was washed'
 
     def setUp(self):
         self.corpus_class = lowcorpus.LowCorpus
         self.file_extension = '.low'
 
+    def test_line2doc(self):
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [(1, 1), (2, 2),])
+
 
 class TestUciCorpus(CorpusTestCase):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
@@ -438,8 +457,9 @@ def test_serialize_compressed(self):
         pass
 
 
-class TestMalletCorpus(CorpusTestCase):
+class TestMalletCorpus(TestLowCorpus):
     TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = '#3  lang mom  wash  window window was washed'
 
     def setUp(self):
         self.corpus_class = malletcorpus.MalletCorpus
@@ -459,6 +479,34 @@ def test_load_with_metadata(self):
             self.assertEqual(metadata[0], str(i + 1))
             self.assertEqual(metadata[1], 'en')
 
+    def test_line2doc(self):
+        # case with metadata=False (by default)
+        super(TestMalletCorpus, self).test_line2doc()
+
+        # case with metadata=True
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word, metadata=True)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [(1, 1), (2, 2),])
 
 class TestTextCorpus(CorpusTestCase):
 

From 83a6ec3e1f8a59946fd0ef48fa3503f1326359be Mon Sep 17 00:00:00 2001
From: horpto <__singleton__@hackerdom.ru>
Date: Fri, 11 Jan 2019 00:27:51 +0500
Subject: [PATCH 4/4] Fix build: remove trailing commas, add blank line in tests

---
 gensim/test/test_corpora.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 7f86e8dde7..0b8c3c97bd 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -436,13 +436,13 @@ def test_line2doc(self):
         corpus.use_wordids = False
         self.assertEqual(
             sorted(corpus.line2doc(self.CORPUS_LINE)),
-            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),])
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
 
         # should return words in word2id
         corpus.use_wordids = True
         self.assertEqual(
             sorted(corpus.line2doc(self.CORPUS_LINE)),
-            [(1, 1), (2, 2),])
+            [(1, 1), (2, 2)])
 
 
 class TestUciCorpus(CorpusTestCase):
@@ -496,7 +496,7 @@ def test_line2doc(self):
         self.assertEqual(doclang, 'lang')
         self.assertEqual(
             sorted(doc),
-            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),])
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
 
         # should return words in word2id
         corpus.use_wordids = True
@@ -506,7 +506,8 @@ def test_line2doc(self):
         self.assertEqual(doclang, 'lang')
         self.assertEqual(
             sorted(doc),
-            [(1, 1), (2, 2),])
+            [(1, 1), (2, 2)])
+
 
 class TestTextCorpus(CorpusTestCase):