[MRG] Skip common English words in phrases #2979

Merged
merged 12 commits on Oct 14, 2020
168 changes: 82 additions & 86 deletions gensim/models/phrases.py
@@ -21,7 +21,7 @@

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator).
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
@@ -31,7 +31,7 @@
['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
>>>
>>> # Train a toy phrase model on our training corpus.
>>> phrase_model = Phrases(sentences, delimiter='_', min_count=1, threshold=1)
>>> phrase_model = Phrases(sentences, min_count=1, threshold=1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> # Apply the trained phrases model to a new, unseen sentence.
>>> new_sentence = ['trees', 'graph', 'minors']
@@ -61,8 +61,6 @@

"""

import sys
import os
import logging
from collections import defaultdict
import itertools
@@ -77,6 +75,17 @@

NEGATIVE_INFINITY = float('-inf')

#: Set of common English prepositions and articles. Tokens from this set
# are "ignored" during phrase detection:
# 1) Phrases may not start or end with these words.
# 2) Phrases may include any number of these words inside.
ENGLISH_COMMON_TERMS = frozenset(
" a an the " # articles; we never care about these in MWEs
" for of with without at from to in on by " # prepositions; incomplete on purpose, to minimize false negatives
" and or " # conjunctions; incomplete on purpose, to minimize false negatives
.split()
)
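
For illustration, a quick look at how this set is meant to be used (a toy
sketch; the two-sentence corpus below is made up, see the module docstring
for a fuller example):

.. sourcecode:: pycon

    >>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
    >>>
    >>> 'of' in ENGLISH_COMMON_TERMS, 'bank' in ENGLISH_COMMON_TERMS
    (True, False)
    >>> # With common_terms set, detected phrases may span these words,
    >>> # e.g. 'bank_of_america':
    >>> sentences = [['bank', 'of', 'america'], ['bank', 'of', 'america', 'branch']]
    >>> phrases = Phrases(sentences, min_count=1, threshold=0.1, common_terms=ENGLISH_COMMON_TERMS)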


def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
r"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
@@ -100,7 +109,7 @@ def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count
Returns
-------
float
Score for given bi-gram, greater than or equal to 0.
Score for given phrase. Can be negative.

Notes
-----
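
For intuition, a worked call of this scorer (a sketch: it assumes the default
PMI-like formula ``(bigram_count - min_count) / worda_count / wordb_count * len_vocab``;
the counts below are invented for illustration):

.. sourcecode:: pycon

    >>> from gensim.models.phrases import original_scorer
    >>>
    >>> # Toy counts: the bigram occurs 100 times, its two parts 300 and 120
    >>> # times, over a vocabulary of 10000 unique tokens, with min_count=5.
    >>> # Note that corpus_word_count is accepted but unused by this scorer.
    >>> score = original_scorer(worda_count=300, wordb_count=120, bigram_count=100,
    ...                         len_vocab=10000, min_count=5, corpus_word_count=10 ** 6)
    >>> round(score, 2)  # (100 - 5) / 300 / 120 * 10000
    26.39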
@@ -225,7 +234,7 @@ def analyze_sentence(self, sentence):

Yields
------
(str, score)
(str, {float, None})
Iterate through the input sentence tokens and yield 2-tuples of:
- ``(concatenated_phrase_tokens, score)`` for token sequences that form a phrase.
- ``(word, None)`` if the token is not a part of a phrase.
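
For example, the yielded stream for a short sentence might look like this
(hypothetical model and score, for illustration only):

.. sourcecode:: pycon

    >>> # Assuming a trained model that detects the phrase 'machine learning':
    >>> list(phrase_model.analyze_sentence(['machine', 'learning', 'is', 'fun']))
    [('machine_learning', 123.4), ('is', None), ('fun', None)]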
@@ -266,7 +275,7 @@ def analyze_sentence(self, sentence):
yield w, None

def __getitem__(self, sentence):
"""Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent
"""Convert the input sequence of tokens ``sentence`` into a sequence of tokens where adjacent
tokens are replaced by a single token if they form a bigram collocation.

If `sentence` is an entire corpus (iterable of sentences rather than a single
@@ -281,10 +290,10 @@ def __getitem__(self, sentence):
Return
------
{list of str, iterable of list of str}
Sentence with phrase tokens joined by `self.delimiter` character, if input was a single sentence.
A generator of such joined sentences if input was a corpus.
Sentence with phrase tokens joined by ``self.delimiter``, if input was a single sentence.
A generator of such sentences if input was a corpus.

"""
s """
is_single, sentence = _is_single(sentence)
if not is_single:
# If the input is an entire corpus (rather than a single sentence),
@@ -293,7 +302,7 @@ def __getitem__(self, sentence):

return [token for token, _ in self.analyze_sentence(sentence)]
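
Side by side, the two input modes behave like this (reusing the toy model and
outputs from the module docstring above):

.. sourcecode:: pycon

    >>> phrase_model[['trees', 'graph', 'minors']]  # single sentence => list of tokens
    [u'trees_graph', u'minors']
    >>> for sentence in phrase_model[[['trees', 'graph', 'minors'], ['graph', 'minors']]]:
    ...     print(sentence)  # corpus => iterable of transformed sentences
    [u'trees_graph', u'minors']
    [u'graph_minors']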

def export_phrases(self, sentences):
def find_phrases(self, sentences):
"""Get all unique phrases (multi-word expressions) that appear in ``sentences``, and their scores.

Parameters
@@ -304,20 +313,20 @@ def export_phrases(self, sentences):
Returns
-------
dict(str, float)
Unique phrases mapped to their scores.
Unique phrases found in ``sentences``, mapped to their scores.

Example
-------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=0.1)
>>> phrases = Phrases(sentences, min_count=1, threshold=0.1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> for phrase, score in phrases.export_phrases(sentences).items():
>>> for phrase, score in phrases.find_phrases(sentences).items():
... print(phrase, score)
"""
result = {}
@@ -446,40 +455,26 @@ def __init__(
#. "default" - :func:`~gensim.models.phrases.original_scorer`.
#. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
common_terms : set of str, optional
List of "stop words" that won't affect frequency count of expressions containing them.
Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".

Notes
-----
'npmi' is more robust when dealing with common words that form part of common bigrams, and
ranges from -1 to 1, but is slower to calculate than the default. The default is the PMI-like scoring
as described by `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
<https://arxiv.org/abs/1310.4546>`_.

To use a custom scoring function, pass in a function with the following signature:

* worda_count - number of corpus occurrences in `sentences` of the first token in the bigram being scored
* wordb_count - number of corpus occurrences in `sentences` of the second token in the bigram being scored
* bigram_count - number of occurrences in `sentences` of the whole bigram
* len_vocab - the number of unique tokens in `sentences`
* min_count - the `min_count` setting of the Phrases class
* corpus_word_count - the total number of tokens (non-unique) in `sentences`
Set of "stop words" that may be included within a phrase, without affecting its scoring.

The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
**If your texts are in English, set ``common_terms=phrases.ENGLISH_COMMON_TERMS``.**
This allows detected phrases to span common English articles and prepositions,
as in `bank_of_america` or `eye_of_the_beholder`.

The scoring function **must be pickleable**.
For other languages or specific application domains, use a custom ``common_terms`` set
that makes sense there: ``common_terms=frozenset("der die das".split())`` for German, etc.

Examples
----------
--------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Load corpus and train a model.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> phrases = Phrases(sentences, min_count=1, threshold=1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> # Use the model to detect phrases in a new sentence.
>>> sent = [u'trees', u'graph', u'minors']
@@ -488,7 +483,7 @@ def __init__(
>>>
>>> # Or transform multiple sentences at once.
>>> sents = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
>>> for phrase in frozen_phrases[sents]:
>>> for phrase in phrases[sents]:
... print(phrase)
[u'trees_graph', u'minors']
[u'graph_minors']
@@ -498,6 +493,27 @@ def __init__(
>>> print(frozen_phrases[sent])
[u'trees_graph', u'minors']

Notes
-----

The ``scoring="npmi"`` option is more robust when dealing with common words that form part of common
bigrams, and ranges from -1 to 1, but is slower to calculate than the default ``scoring="default"``.
The default is the PMI-like scoring as described in `Mikolov, et al.: "Distributed
Representations of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.

To use your own custom ``scoring`` function, pass in a function with the following signature:

* ``worda_count`` - number of corpus occurrences in `sentences` of the first token in the bigram being scored
* ``wordb_count`` - number of corpus occurrences in `sentences` of the second token in the bigram being scored
* ``bigram_count`` - number of occurrences in `sentences` of the whole bigram
* ``len_vocab`` - the number of unique tokens in `sentences`
* ``min_count`` - the `min_count` setting of the Phrases class
* ``corpus_word_count`` - the total number of tokens (non-unique) in `sentences`

The scoring function must accept all these parameters, even if it doesn't use them in its scoring.

The scoring function **must be pickleable**.
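
For instance, a minimal custom scorer with the required signature might look
like this (a sketch; the name ``frequency_scorer`` and its logic are
illustrative, not part of gensim):

.. sourcecode:: pycon

    >>> # In real code, define the scorer at module level, so it stays pickleable.
    >>> def frequency_scorer(worda_count, wordb_count, bigram_count,
    ...                      len_vocab, min_count, corpus_word_count):
    ...     # Score a bigram purely by its relative corpus frequency; the
    ...     # remaining parameters are accepted, as required, but unused.
    ...     return bigram_count / corpus_word_count
    >>>
    >>> phrases = Phrases(sentences, min_count=1, threshold=1e-4, scoring=frequency_scorer)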

"""
super().__init__(common_terms=common_terms)
if min_count <= 0:
@@ -621,24 +637,24 @@ def add_vocab(self, sentences):
Parameters
----------
sentences : iterable of list of str
Text corpus.
Text corpus to update this model's parameters from.

Example
-------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Train a phrase detector from a text corpus.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences) # train model
>>> phrases = Phrases(sentences, common_terms=ENGLISH_COMMON_TERMS) # train model
>>> assert len(phrases.vocab) == 37
>>>
>>> more_sentences = [
... [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
... [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes']
... [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes'],
... ]
>>>
>>> phrases.add_vocab(more_sentences) # add new sentences to model
@@ -711,9 +727,28 @@ def freeze(self):
"""
return FrozenPhrases(self)

def export_phrases(self):
"""Extract all found phrases.

Returns
-------
dict(str, float)
Mapping between phrases and their scores.

"""
result, source_vocab = {}, self.vocab
for token in source_vocab:
unigrams = token.split(self.delimiter)
if len(unigrams) < 2:
continue # no phrases here
phrase, score = self.score_candidate(unigrams[0], unigrams[-1], unigrams[1:-1])
if score is not None:
result[phrase] = score
return result
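
Usage sketch (assuming a trained ``phrases`` model as in the examples above;
actual phrases and scores depend on the training corpus):

.. sourcecode:: pycon

    >>> for phrase, score in phrases.export_phrases().items():
    ...     print(phrase, score)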


class FrozenPhrases(_PhrasesTransformation):
"""Minimal state & functionality exported from :class:`~gensim.models.phrases.Phrases`.
"""Minimal state & functionality exported from a trained :class:`~gensim.models.phrases.Phrases` model.

The goal of this class is to cut down memory consumption of `Phrases`, by discarding model state
not strictly needed for the phrase detection task.
@@ -741,11 +776,11 @@ def __init__(self, phrases_model):

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Load corpus and train a model.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> phrases = Phrases(sentences, min_count=1, threshold=1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> # Export a FrozenPhrases object that is more efficient but doesn't allow further training.
>>> frozen_phrases = phrases.freeze()
@@ -759,33 +794,14 @@ def __init__(self, phrases_model):
self.scoring = phrases_model.scoring
self.common_terms = phrases_model.common_terms
logger.info('exporting phrases from %s', phrases_model)
self.phrasegrams = self._import_phrases(phrases_model)
self.phrasegrams = phrases_model.export_phrases()
logger.info('exported %s', self)

def __str__(self):
return "%s<%i phrases, min_count=%s, threshold=%s>" % (
self.__class__.__name__, len(self.phrasegrams), self.min_count, self.threshold,
)

def _import_phrases(self, phrases_model):
"""Extract all phrases that pass the threshold out of `phrases_model`.

Returns
------
dict[str, float]
Mapping between phrases and their scores.

"""
result, source_vocab = {}, phrases_model.vocab
for token in source_vocab:
unigrams = token.split(self.delimiter)
if len(unigrams) < 2:
continue # no phrases here
phrase, score = phrases_model.score_candidate(unigrams[0], unigrams[-1], unigrams[1:-1])
if score is not None:
result[phrase] = score
return result

def score_candidate(self, word_a, word_b, in_between):
phrase = self.delimiter.join([word_a] + in_between + [word_b])
score = self.phrasegrams.get(phrase, NEGATIVE_INFINITY)
@@ -795,23 +811,3 @@ def score_candidate(self, word_a, word_b, in_between):


Phraser = FrozenPhrases # alias for backward compatibility


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)
infile = sys.argv[1]

from gensim.models import Phrases # noqa:F811 for pickle
from gensim.models.word2vec import Text8Corpus
sentences = Text8Corpus(infile)

bigram = Phrases(sentences, min_count=5, threshold=100)
for s in bigram[sentences]:
print(u' '.join(s))