Refactor line2doc methods of LowCorpus and MalletCorpus #2269

Merged: 6 commits, Jan 11, 2019
19 changes: 9 additions & 10 deletions gensim/corpora/lowcorpus.py
@@ -10,10 +10,10 @@
from __future__ import with_statement

import logging
+from collections import Counter

from gensim import utils
from gensim.corpora import IndexedCorpus
-from six import iterkeys
from six.moves import zip, range


@@ -159,25 +159,24 @@ def line2doc(self, line):
        words = self.line2words(line)

        if self.use_wordids:
-            # get all distinct terms in this document, ignore unknown words
-            uniq_words = set(words).intersection(iterkeys(self.word2id))
-
            # the following creates a unique list of words *in the same order*
            # as they were in the input. when iterating over the documents,
            # the (word, count) pairs will appear in the same order as they
            # were in the input (bar duplicates), which looks better.
            # if this was not needed, we might as well have used useWords = set(words)
-            use_words, marker = [], set()
+            use_words, counts = [], Counter()
            for word in words:
-                if (word in uniq_words) and (word not in marker):
+                if word not in self.word2id:
+                    continue
+                if word not in counts:
                    use_words.append(word)
-                    marker.add(word)
+                counts[word] += 1
            # construct a list of (wordIndex, wordFrequency) 2-tuples
-            doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
+            doc = [(self.word2id[w], counts[w]) for w in use_words]
        else:
-            uniq_words = set(words)
+            word_freqs = Counter(words)
            # construct a list of (word, wordFrequency) 2-tuples
-            doc = [(w, words.count(w)) for w in uniq_words]
+            doc = list(word_freqs.items())

        # return the document, then forget it and move on to the next one
        # note that this way, only one doc is stored in memory at a time, not the whole corpus
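
For reference, a minimal standalone sketch of the counting logic used in the refactored use_wordids branch, run on the same sample vocabulary and line that the new tests below use (the sketch is an illustration of the approach, not code from the patch):

from collections import Counter

# toy vocabulary and input line, mirroring the new test data
word2id = {'mom': 1, 'window': 2}
words = 'mom wash window window was washed'.split()

# keep known words in first-seen order, counting every occurrence
use_words, counts = [], Counter()
for word in words:
    if word not in word2id:
        continue  # skip out-of-vocabulary words
    if word not in counts:
        use_words.append(word)
    counts[word] += 1

doc = [(word2id[w], counts[w]) for w in use_words]
print(doc)  # [(1, 1), (2, 2)] -- one pass over the input instead of repeated words.count() calls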
7 changes: 4 additions & 3 deletions gensim/corpora/malletcorpus.py
@@ -125,10 +125,11 @@ def line2doc(self, line):
            [(3, 1), (4, 1)]

        """
-        splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
-        docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
+        split_line = utils.to_unicode(line).strip().split(None, 2)
+        docid, doclang = split_line[0], split_line[1]
+        words = split_line[2] if len(split_line) >= 3 else ''
Contributor: Why `>=`, not `==`? I ask because "If maxsplit is given, at most maxsplit splits are done".

Contributor (author): Just a habit of writing more flexible code, in case of future changes.
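
To illustrate the point under discussion, a quick standalone sketch (the sample lines below are shortened, made-up examples in the MALLET "docid lang text" format): str.split(None, 2) splits on any whitespace and performs at most two splits, so a well-formed line yields three fields, while a line with no body text yields only two, which is what the length check guards against.

# A line with a body: docid, language, and the remaining text as a single field.
print(u'#3 lang mom wash window'.split(None, 2))
# -> ['#3', 'lang', 'mom wash window']

# A line with no body text: only two fields come back,
# hence the check before indexing split_line[2].
print(u'#3 lang'.split(None, 2))
# -> ['#3', 'lang']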


-        doc = super(MalletCorpus, self).line2doc(' '.join(words))
+        doc = super(MalletCorpus, self).line2doc(words)

        if self.metadata:
            return doc, (docid, doclang)
51 changes: 50 additions & 1 deletion gensim/test/test_corpora.py
@@ -420,11 +420,30 @@ def test_save_format_for_dtm(self):

class TestLowCorpus(CorpusTestCase):
    TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = 'mom wash window window was washed'

    def setUp(self):
        self.corpus_class = lowcorpus.LowCorpus
        self.file_extension = '.low'

+    def test_line2doc(self):
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        self.assertEqual(
+            sorted(corpus.line2doc(self.CORPUS_LINE)),
+            [(1, 1), (2, 2)])


class TestUciCorpus(CorpusTestCase):
    TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]

@@ -438,8 +457,9 @@ def test_serialize_compressed(self):
        pass


-class TestMalletCorpus(CorpusTestCase):
+class TestMalletCorpus(TestLowCorpus):
    TEST_CORPUS = [[(1, 1)], [], [(0, 2), (2, 1)], []]
+    CORPUS_LINE = '#3 lang mom wash window window was washed'

    def setUp(self):
        self.corpus_class = malletcorpus.MalletCorpus

@@ -459,6 +479,35 @@ def test_load_with_metadata(self):
            self.assertEqual(metadata[0], str(i + 1))
            self.assertEqual(metadata[1], 'en')

+    def test_line2doc(self):
+        # case with metadata=False (by default)
+        super(TestMalletCorpus, self).test_line2doc()
+
+        # case with metadata=True
+        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
+        id2word = {1: 'mom', 2: 'window'}
+
+        corpus = self.corpus_class(fname, id2word=id2word, metadata=True)
+
+        # should return all words in doc
+        corpus.use_wordids = False
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [('mom', 1), ('was', 1), ('wash', 1), ('washed', 1), ('window', 2)])
+
+        # should return words in word2id
+        corpus.use_wordids = True
+        doc, (docid, doclang) = corpus.line2doc(self.CORPUS_LINE)
+
+        self.assertEqual(docid, '#3')
+        self.assertEqual(doclang, 'lang')
+        self.assertEqual(
+            sorted(doc),
+            [(1, 1), (2, 2)])


class TestTextCorpus(CorpusTestCase):
