Skip to content

Commit

Permalink
Refactor the line2doc methods of LowCorpus and MalletCorpus to be more efficient
Browse files Browse the repository at this point in the history
  • Loading branch information
horpto committed Nov 17, 2018
1 parent 7e4965e commit 5ba87a5
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 13 deletions.
20 changes: 10 additions & 10 deletions gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from __future__ import with_statement

import logging
from collections import Counter

from gensim import utils
from gensim.corpora import IndexedCorpus
Expand Down Expand Up @@ -159,25 +160,24 @@ def line2doc(self, line):
words = self.line2words(line)

if self.use_wordids:
# get all distinct terms in this document, ignore unknown words
uniq_words = set(words).intersection(iterkeys(self.word2id))

# the following creates a unique list of words *in the same order*
# as they were in the input. when iterating over the documents,
# the (word, count) pairs will appear in the same order as they
# were in the input (bar duplicates), which looks better.
# If this ordering was not needed, we might as well have used use_words = set(words).
use_words, marker = [], set()
use_words, counts = [], Counter()
for word in words:
if (word in uniq_words) and (word not in marker):
if word not in self.word2id:
continue
if word not in counts:
use_words.append(word)
marker.add(word)
counts[word] += 1
# construct a list of (wordIndex, wordFrequency) 2-tuples
doc = [(self.word2id.get(w), words.count(w)) for w in use_words]
doc = [(self.word2id[w], counts[w]) for w in use_words]
else:
uniq_words = set(words)
# construct a list of (word, wordFrequency) 2-tuples
doc = [(w, words.count(w)) for w in uniq_words]
word_freqs = Counter(words)
# construct a list of (word, wordFrequency) 2-tuples
doc = list(word_freqs.items())

# return the document, then forget it and move on to the next one
# note that this way, only one doc is stored in memory at a time, not the whole corpus
Expand Down
6 changes: 3 additions & 3 deletions gensim/corpora/malletcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ def line2doc(self, line):
[(3, 1), (4, 1)]
"""
splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
split_line = utils.to_unicode(line).strip().split(maxsplit=2)
docid, doclang, words = split_line[0], split_line[1], split_line[2]

doc = super(MalletCorpus, self).line2doc(' '.join(words))
doc = super(MalletCorpus, self).line2doc(words)

if self.metadata:
return doc, (docid, doclang)
Expand Down

0 comments on commit 5ba87a5

Please sign in to comment.