[WIP] Cythonizing phrases module #1385
gensim/models/phrases.py

@@ -70,6 +70,10 @@
from gensim import utils, interfaces

logger = logging.getLogger(__name__)
#from gensim.models.phrases_inner import learn_vocab



def _is_single(obj):
@@ -105,6 +109,44 @@ class Phrases(interfaces.TransformationABC):
    and `phrases[corpus]` syntax.

    """
    #from gensim.models.phrases_inner import learn_vocab
    try:
        from gensim.models.phrases_inner import learn_vocab
        logger.info("Cython file loaded")
    except ImportError:
        logger.info("Cython file not loaded")
        #failed... fall back to plain numpy (20-80x slower training than the above)

        def learn_vocab(self, sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
            #Collect unigram/bigram counts from the `sentences` iterable.
            sentence_no = -1
            total_words = 0
            logger.info("collecting all words and their counts")
            vocab = defaultdict(int)
            min_reduce = 1
            for sentence_no, sentence in enumerate(sentences):
                if sentence_no % progress_per == 0:
                    logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                                (sentence_no, total_words, len(vocab)))
                #sentence = [utils.any2utf8(w) for w in sentence]
Review comment on the commented-out `utils.any2utf8` line: @piskvorky was there any particular reason behind creating the vocab for UTF8-encoded words?

Reply: Yes, saving memory. Up to Python 3.3 (and including all Python 2.x), unicode strings take up 2-4x as much memory compared to UTF8 byte strings, for normal text. Since memory is more critical than speed here, we went with UTF8.
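A rough, illustrative way to check that memory argument (this snippet is not part of the PR; exact sizes depend on the interpreter):

import sys

word = u"dinosaur" * 100            # plain-ASCII unicode text
utf8_word = word.encode("utf8")     # the same text as UTF8 bytes

print(sys.getsizeof(word), sys.getsizeof(utf8_word))
# On Python 2.x wide builds and Python <= 3.2 every code point costs 2 or 4
# bytes, so the unicode object is roughly 2-4x larger than its UTF8 encoding
# for ordinary text; Python 3.3+ compact strings close most of that gap for
# ASCII-only input.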
                for bigram in zip(sentence, sentence[1:]):
                    vocab[bigram[0]] += 1
                    vocab[delimiter.join(bigram)] += 1
                    total_words += 1

                if sentence:  # add last word skipped by previous loop
                    word = sentence[-1]
                    vocab[word] += 1

                if len(vocab) > max_vocab_size:
                    utils.prune_vocab(vocab, min_reduce)
                    min_reduce += 1

            logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                        (len(vocab), total_words, sentence_no + 1))
            return min_reduce, vocab

    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
        """
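For context, `gensim.models.phrases_inner` itself is not part of the hunks shown in this diff. A minimal sketch of what such a Cython module could look like, assuming it follows the same first-pass pattern as `word2vec_inner` (typed counters around otherwise unchanged Python logic), might be the following; everything here is an assumption, not code from this PR:

# phrases_inner.pyx -- hypothetical sketch only, not taken from this PR.
# It mirrors the pure-Python fallback: count unigrams and bigrams,
# pruning the dict whenever it grows past max_vocab_size.
import logging
from collections import defaultdict

from gensim import utils

logger = logging.getLogger(__name__)


def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
    cdef long long sentence_no = -1
    cdef long long total_words = 0
    cdef int min_reduce = 1
    vocab = defaultdict(int)
    logger.info("collecting all words and their counts")
    for sentence_no, sentence in enumerate(sentences):
        if sentence_no % progress_per == 0:
            logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types",
                        sentence_no, total_words, len(vocab))
        sentence = [utils.any2utf8(w) for w in sentence]
        for bigram in zip(sentence, sentence[1:]):
            vocab[bigram[0]] += 1
            vocab[delimiter.join(bigram)] += 1
            total_words += 1
        if sentence:  # count the last word, skipped by the bigram loop
            vocab[sentence[-1]] += 1
        if len(vocab) > max_vocab_size:
            utils.prune_vocab(vocab, min_reduce)
            min_reduce += 1
    logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
                len(vocab), total_words, sentence_no + 1)
    return min_reduce, vocab

Since the inner work in such a direct port is still dominated by Python dict updates, the speed-up is likely modest, which is relevant to the question about the 20-80x comment raised in the review below.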
@@ -157,35 +199,7 @@ def __str__(self):
            self.__class__.__name__, len(self.vocab), self.min_count,
            self.threshold, self.max_vocab_size)

    @staticmethod
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
        """Collect unigram/bigram counts from the `sentences` iterable."""
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % progress_per == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                            (sentence_no, total_words, len(vocab)))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                    (len(vocab), total_words, sentence_no + 1))
        return min_reduce, vocab


    def add_vocab(self, sentences):
        """
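To make the counting behaviour concrete, here is a tiny illustrative run against the staticmethod removed in the hunk above (i.e. the `Phrases.learn_vocab` API as released at the time of this PR); the corpus and expected counts are an example, not output from this branch:

from gensim.models.phrases import Phrases

sentences = [
    [u"new", u"york", u"city"],
    [u"new", u"york", u"times"],
]
min_reduce, vocab = Phrases.learn_vocab(sentences, max_vocab_size=40000000)

# Words are stored as UTF8 bytes (see the review discussion above);
# bigrams are joined with the b'_' delimiter.
print(vocab[b"new"])       # 2 -- unigram count
print(vocab[b"new_york"])  # 2 -- bigram count
print(vocab[b"city"])      # 1 -- last word of the first sentence
print(min_reduce)          # 1 -- no pruning was needed for this tiny corpus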
Review comment: Is the 20-80x figure correct? If not, it would be better to remove the stale comments and start the development with a clean slate.