From fcc9bc1c2a4fad8d369945e05366e0c6d6200fa1 Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Sat, 17 Nov 2018 04:53:27 +0500 Subject: [PATCH 1/4] Refactor to more optimal line2doc method of LowCorpus and MalletCorpus --- gensim/corpora/lowcorpus.py | 19 +++++++++---------- gensim/corpora/malletcorpus.py | 7 ++++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 9986c780f3..ae7acb6377 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -10,10 +10,10 @@ from __future__ import with_statement import logging +from collections import Counter from gensim import utils from gensim.corpora import IndexedCorpus -from six import iterkeys from six.moves import xrange, zip as izip @@ -159,25 +159,24 @@ def line2doc(self, line): words = self.line2words(line) if self.use_wordids: - # get all distinct terms in this document, ignore unknown words - uniq_words = set(words).intersection(iterkeys(self.word2id)) - # the following creates a unique list of words *in the same order* # as they were in the input. when iterating over the documents, # the (word, count) pairs will appear in the same order as they # were in the input (bar duplicates), which looks better. # if this was not needed, we might as well have used useWords = set(words) - use_words, marker = [], set() + use_words, counts = [], Counter() for word in words: - if (word in uniq_words) and (word not in marker): + if word not in self.word2id: + continue + if word not in counts: use_words.append(word) - marker.add(word) + counts[word] += 1 # construct a list of (wordIndex, wordFrequency) 2-tuples - doc = [(self.word2id.get(w), words.count(w)) for w in use_words] + doc = [(self.word2id[w], counts[w]) for w in use_words] else: - uniq_words = set(words) + word_freqs = Counter(words) # construct a list of (word, wordFrequency) 2-tuples - doc = [(w, words.count(w)) for w in uniq_words] + doc = list(word_freqs.items()) # return the document, then forget it and move on to the next one # note that this way, only one doc is stored in memory at a time, not the whole corpus diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index db28b9e632..b8922f3cda 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -126,10 +126,11 @@ def line2doc(self, line): [(3, 1), (4, 1)] """ - splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] - docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] + split_line = utils.to_unicode(line).strip().split(None, 2) + docid, doclang = split_line[0], split_line[1] + words = split_line[2] if len(split_line) >= 3 else '' - doc = super(MalletCorpus, self).line2doc(' '.join(words)) + doc = super(MalletCorpus, self).line2doc(words) if self.metadata: return doc, (docid, doclang) From 8165920c369bb9745854a6c3cc6fbff36f232271 Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Thu, 20 Dec 2018 06:15:30 +0500 Subject: [PATCH 2/4] fix build --- gensim/corpora/lowcorpus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index e01e30b9d0..c67c34b700 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -14,7 +14,6 @@ from gensim import utils from gensim.corpora import IndexedCorpus -from six import iterkeys from six.moves import zip, range From f05da54a07e7e0a190861587ac7b85cb9066cd22 Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Thu, 10 Jan 2019 23:40:14 +0500 Subject: [PATCH 3/4] add tests --- gensim/test/test_corpora.py | 50 ++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 5efc8f2b3f..7f86e8dde7 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -420,11 +420,30 @@ def test_save_format_for_dtm(self): class TestLowCorpus(CorpusTestCase): TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []] + CORPUS_LINE = 'mom wash window window was washed' def setUp(self): self.corpus_class = lowcorpus.LowCorpus self.file_extension = '.low' + def test_line2doc(self): + fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) + id2word = {1: 'mom', 2: 'window'} + + corpus = self.corpus_class(fname, id2word=id2word) + + # should return all words in doc + corpus.use_wordids = False + self.assertEqual( + sorted(corpus.line2doc(self.CORPUS_LINE)), + [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),]) + + # should return words in word2id + corpus.use_wordids = True + self.assertEqual( + sorted(corpus.line2doc(self.CORPUS_LINE)), + [(1, 1), (2, 2),]) + class TestUciCorpus(CorpusTestCase): TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []] @@ -438,8 +457,9 @@ def test_serialize_compressed(self): pass -class TestMalletCorpus(CorpusTestCase): +class TestMalletCorpus(TestLowCorpus): TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []] + CORPUS_LINE = '#3 lang mom wash window window was washed' def setUp(self): self.corpus_class = malletcorpus.MalletCorpus @@ -459,6 +479,34 @@ def test_load_with_metadata(self): self.assertEqual(metadata[0], str(i + 1)) self.assertEqual(metadata[1], 'en') + def test_line2doc(self): + # case with metadata=False (by default) + super(TestMalletCorpus, self).test_line2doc() + + # case with metadata=True + fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) + id2word = {1: 'mom', 2: 'window'} + + corpus = self.corpus_class(fname, id2word=id2word, metadata=True) + + # should return all words in doc + corpus.use_wordids = False + doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE) + self.assertEqual(docid, '#3') + self.assertEqual(doclang, 'lang') + self.assertEqual( + sorted(doc), + [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),]) + + # should return words in word2id + corpus.use_wordids = True + doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE) + + self.assertEqual(docid, '#3') + self.assertEqual(doclang, 'lang') + self.assertEqual( + sorted(doc), + [(1, 1), (2, 2),]) class TestTextCorpus(CorpusTestCase): From 83a6ec3e1f8a59946fd0ef48fa3503f1326359be Mon Sep 17 00:00:00 2001 From: horpto <__singleton__@hackerdom.ru> Date: Fri, 11 Jan 2019 00:27:51 +0500 Subject: [PATCH 4/4] fix build --- gensim/test/test_corpora.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 7f86e8dde7..0b8c3c97bd 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -436,13 +436,13 @@ def test_line2doc(self): corpus.use_wordids = False self.assertEqual( sorted(corpus.line2doc(self.CORPUS_LINE)), - [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),]) + [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)]) # should return words in word2id corpus.use_wordids = True self.assertEqual( sorted(corpus.line2doc(self.CORPUS_LINE)), - [(1, 1), (2, 2),]) + [(1, 1), (2, 2)]) class TestUciCorpus(CorpusTestCase): @@ -496,7 +496,7 @@ def test_line2doc(self): self.assertEqual(doclang, 'lang') self.assertEqual( sorted(doc), - [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2),]) + [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)]) # should return words in word2id corpus.use_wordids = True @@ -506,7 +506,8 @@ def test_line2doc(self): self.assertEqual(doclang, 'lang') self.assertEqual( sorted(doc), - [(1, 1), (2, 2),]) + [(1, 1), (2, 2)]) + class TestTextCorpus(CorpusTestCase):