diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index be735b865a..33390fc08e 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -64,6 +64,8 @@ import warnings from collections import defaultdict import itertools as it +from functools import partial +from math import log from six import iteritems, string_types, next @@ -106,7 +108,8 @@ class Phrases(interfaces.TransformationABC): """ def __init__(self, sentences=None, min_count=5, threshold=10.0, - max_vocab_size=40000000, delimiter=b'_', progress_per=10000): + max_vocab_size=40000000, delimiter=b'_', progress_per=10000, + scoring='default'): """ Initialize the model from an iterable of `sentences`. Each sentence must be a list of words (unicode strings) that will be used for training. @@ -120,10 +123,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, `min_count` ignore all words and bigrams with total collected count lower than this. - `threshold` represents a threshold for forming the phrases (higher means - fewer phrases). A phrase of words `a` and `b` is accepted if - `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the - total vocabulary size. + `threshold` represents a score threshold for forming the phrases (higher means + fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the + phrase is greater than threshold. See the `scoring` setting. `max_vocab_size` is the maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control. The default @@ -133,12 +135,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, `delimiter` is the glue character used to join collocation tokens, and should be a byte string (e.g. b'_'). + `scoring` specifies how potential phrases are scored for comparison to the `threshold` + setting. Two settings are available: + 'default': from "Efficient Estimation of Word Representations in Vector Space" by + Mikolov, et 
al.: + (count(worda followed by wordb) - min_count) * N / + (count(worda) * count(wordb)) > threshold, where `N` is the total vocabulary size. + 'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual + Information in Collocation Extraction" by Gerlof Bouma: + ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / + - ln(prop(worda followed by wordb)) + where prop(n) is the count of n / the count of everything in the entire corpus + 'npmi' is more robust when dealing with common words that form part of common bigrams, and + ranges from -1 to 1, but is slower to calculate than the default. + """ if min_count <= 0: raise ValueError("min_count should be at least 1") - if threshold <= 0: - raise ValueError("threshold should be positive") + if threshold <= 0 and scoring == 'default': + raise ValueError("threshold should be positive for default scoring") + if scoring == 'npmi' and (threshold < -1 or threshold > 1): + raise ValueError("threshold should be between -1 and 1 for npmi scoring") + + if not (scoring == 'default' or scoring == 'npmi'): + raise ValueError('unknown scoring function "' + scoring + '" specified') self.min_count = min_count self.threshold = threshold @@ -147,6 +168,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per + self.scoring = scoring + self.corpus_word_count = 0 if sentences is not None: self.add_vocab(sentences) @@ -178,6 +201,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence: # add last word skipped by previous loop word = sentence[-1] vocab[word] += 1 + total_words += 1 if len(vocab) > max_vocab_size: utils.prune_vocab(vocab, min_reduce) @@ -185,7 +209,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): logger.info("collected %i word types from a corpus of %i words (unigram + 
bigrams) and %i sentences" % (len(vocab), total_words, sentence_no + 1)) - return min_reduce, vocab + return min_reduce, vocab, total_words def add_vocab(self, sentences): """ @@ -197,8 +221,10 @@ def add_vocab(self, sentences): # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accummulated # counts collected in previous learn_vocab runs. - min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + min_reduce, vocab, total_words = \ + self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + self.corpus_word_count += total_words if len(self.vocab) > 0: logger.info("merging %i counts into %s", len(vocab), self) self.min_reduce = max(self.min_reduce, min_reduce) @@ -226,31 +252,47 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): then you can debug the threshold with generated tsv """ + + vocab = self.vocab + threshold = self.threshold + delimiter = self.delimiter # delimiter used for lookup + min_count = self.min_count + scoring = self.scoring + corpus_word_count = self.corpus_word_count + + if scoring == 'default': + scoring_function = \ + partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) + elif scoring == 'npmi': + scoring_function = \ + partial(self.npmi_scorer, corpus_word_count=corpus_word_count) + # no else here to catch unknown scoring function, check is done in Phrases.__init__ + for sentence in sentences: s = [utils.any2utf8(w) for w in sentence] last_bigram = False - vocab = self.vocab - threshold = self.threshold - delimiter = self.delimiter # delimiter used for lookup - min_count = self.min_count + for word_a, word_b in zip(s, s[1:]): - if word_a in vocab and word_b in vocab: + # last bigram check was moved here to save a few CPU cycles + if word_a in vocab and word_b in vocab and not last_bigram: bigram_word = delimiter.join((word_a, 
word_b)) - if bigram_word in vocab and not last_bigram: - pa = float(vocab[word_a]) - pb = float(vocab[word_b]) - pab = float(vocab[bigram_word]) - score = (pab - min_count) / pa / pb * len(vocab) + if bigram_word in vocab: + count_a = float(vocab[word_a]) + count_b = float(vocab[word_b]) + count_ab = float(vocab[bigram_word]) + score = scoring_function(count_a, count_b, count_ab) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - if score > threshold: + # added mincount check because if the scorer doesn't contain min_count + # it would not be enforced otherwise + if score > threshold and count_ab >= min_count: if as_tuples: yield ((word_a, word_b), score) else: yield (out_delimiter.join((word_a, word_b)), score) last_bigram = True continue - last_bigram = False + last_bigram = False def __getitem__(self, sentence): """ @@ -311,6 +353,20 @@ def __getitem__(self, sentence): return [utils.to_unicode(w) for w in new_s] + # calculation of score based on original mikolov word2vec paper + # len_vocab and min_count set so functools.partial works + @staticmethod + def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0): + return (bigram_count - min_count) / worda_count / wordb_count * len_vocab + + # normalized PMI, requires corpus size + @staticmethod + def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0): + pa = worda_count / corpus_word_count + pb = wordb_count / corpus_word_count + pab = bigram_count / corpus_word_count + return log(pab / (pa * pb)) / -log(pab) + def pseudocorpus(source_vocab, sep): """Feeds source_vocab's compound keys back to it, to discover phrases""" @@ -329,8 +385,8 @@ class Phraser(interfaces.TransformationABC): After the one-time initialization, a Phraser will be much smaller and somewhat faster than using the full Phrases model. 
- Reflects the results of the source model's `min_count` and `threshold` - settings. (You can tamper with those & create a new Phraser to try + Reflects the results of the source model's `min_count`, `threshold`, and + `scoring` settings. (You can tamper with those & create a new Phraser to try other values.) """ @@ -338,6 +394,7 @@ def __init__(self, phrases_model): self.threshold = phrases_model.threshold self.min_count = phrases_model.min_count self.delimiter = phrases_model.delimiter + self.scoring = phrases_model.scoring self.phrasegrams = {} corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter) logger.info('source_vocab length %i', len(phrases_model.vocab)) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index ba2cfc7192..688f92dbd0 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -138,7 +138,7 @@ def testExportPhrases(self): b'human interface' ]) - def test_multiple_bigrams_single_entry(self): + def testMultipleBigramsSingleEntry(self): """ a single entry should produce multiple bigrams. 
""" bigram = Phrases(sentences, min_count=1, threshold=1) @@ -153,6 +153,36 @@ def test_multiple_bigrams_single_entry(self): b'human interface' ]) + def testScoringDefault(self): + """ test the default scoring, from the mikolov word2vec paper """ + bigram = Phrases(sentences, min_count=1, threshold=1) + + seen_scores = set() + + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) + + def testScoringNpmi(self): + """ test normalized pointwise mutual information scoring """ + bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi') + + seen_scores = set() + + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + .882, # score for graph minors + .714 # score for human interface + ]) + def testBadParameters(self): """Test the phrases module with bad parameters.""" # should fail with something less or equal than 0