[MRG] Skip common English words in phrases #2979

Merged
merged 12 commits on Oct 14, 2020
168 changes: 82 additions & 86 deletions gensim/models/phrases.py
@@ -21,7 +21,7 @@

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator).
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
@@ -31,7 +31,7 @@
['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
>>>
>>> # Train a toy phrase model on our training corpus.
>>> phrase_model = Phrases(sentences, delimiter='_', min_count=1, threshold=1)
>>> phrase_model = Phrases(sentences, min_count=1, threshold=1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> # Apply the trained phrases model to a new, unseen sentence.
>>> new_sentence = ['trees', 'graph', 'minors']
@@ -61,8 +61,6 @@

"""

import sys
import os
import logging
from collections import defaultdict
import itertools
@@ -77,6 +75,17 @@

NEGATIVE_INFINITY = float('-inf')

#: Set of common English prepositions and articles. Tokens from this set
# are "ignored" during phrase detection:
# 1) Phrases may not start or end with these words.
# 2) Phrases may include any number of these words inside.
ENGLISH_COMMON_TERMS = frozenset(
" a an the " # articles; we never care about these in MWEs
" for of with without at from to in on by " # prepositions; incomplete on purpose, to minimize false negatives
" and or " # conjunctions; incomplete on purpose, to minimize false negatives
.split()
)
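
For illustration, a quick look at how this set is meant to be used (a toy
sketch; the two-sentence corpus below is made up, see the module docstring
for a fuller example):

.. sourcecode:: pycon

    >>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
    >>>
    >>> 'of' in ENGLISH_COMMON_TERMS, 'bank' in ENGLISH_COMMON_TERMS
    (True, False)
    >>> # With common_terms set, detected phrases may span these words,
    >>> # e.g. 'bank_of_america':
    >>> sentences = [['bank', 'of', 'america'], ['bank', 'of', 'america', 'branch']]
    >>> phrases = Phrases(sentences, min_count=1, threshold=0.1, common_terms=ENGLISH_COMMON_TERMS)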


def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
r"""Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations
@@ -100,7 +109,7 @@ def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count
Returns
-------
float
Score for given bi-gram, greater than or equal to 0.
Score for given phrase. Can be negative.

Notes
-----
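
For intuition, a worked call of this scorer (a sketch: it assumes the default
PMI-like formula ``(bigram_count - min_count) / worda_count / wordb_count * len_vocab``;
the counts below are invented for illustration):

.. sourcecode:: pycon

    >>> from gensim.models.phrases import original_scorer
    >>>
    >>> # Toy counts: the bigram occurs 100 times, its two parts 300 and 120
    >>> # times, over a vocabulary of 10000 unique tokens, with min_count=5.
    >>> # Note that corpus_word_count is accepted but unused by this scorer.
    >>> score = original_scorer(worda_count=300, wordb_count=120, bigram_count=100,
    ...                         len_vocab=10000, min_count=5, corpus_word_count=10 ** 6)
    >>> round(score, 2)  # (100 - 5) / 300 / 120 * 10000
    26.39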
@@ -225,7 +234,7 @@ def analyze_sentence(self, sentence):

Yields
------
(str, score)
(str, {float, None})
Iterate through the input sentence tokens and yield 2-tuples of:
- ``(concatenated_phrase_tokens, score)`` for token sequences that form a phrase.
- ``(word, None)`` if the token is not a part of a phrase.
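
For example, the yielded stream for a short sentence might look like this
(hypothetical model and score, for illustration only):

.. sourcecode:: pycon

    >>> # Assuming a trained model that detects the phrase 'machine learning':
    >>> list(phrase_model.analyze_sentence(['machine', 'learning', 'is', 'fun']))
    [('machine_learning', 123.4), ('is', None), ('fun', None)]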
@@ -266,7 +275,7 @@ def analyze_sentence(self, sentence):
yield w, None

def __getitem__(self, sentence):
"""Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent
"""Convert the input sequence of tokens ``sentence`` into a sequence of tokens where adjacent
tokens are replaced by a single token if they form a bigram collocation.

If `sentence` is an entire corpus (iterable of sentences rather than a single
@@ -281,10 +290,10 @@ def __getitem__(self, sentence):
Return
------
{list of str, iterable of list of str}
Sentence with phrase tokens joined by `self.delimiter` character, if input was a single sentence.
A generator of such joined sentences if input was a corpus.
Sentence with phrase tokens joined by ``self.delimiter``, if input was a single sentence.
A generator of such sentences if input was a corpus.

"""
s """
is_single, sentence = _is_single(sentence)
if not is_single:
# If the input is an entire corpus (rather than a single sentence),
@@ -293,7 +302,7 @@ def __getitem__(self, sentence):

return [token for token, _ in self.analyze_sentence(sentence)]
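
Side by side, the two input modes behave like this (reusing the toy model and
outputs from the module docstring above):

.. sourcecode:: pycon

    >>> phrase_model[['trees', 'graph', 'minors']]  # single sentence => list of tokens
    [u'trees_graph', u'minors']
    >>> for sentence in phrase_model[[['trees', 'graph', 'minors'], ['graph', 'minors']]]:
    ...     print(sentence)  # corpus => iterable of transformed sentences
    [u'trees_graph', u'minors']
    [u'graph_minors']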

def export_phrases(self, sentences):
def find_phrases(self, sentences):
"""Get all unique phrases (multi-word expressions) that appear in ``sentences``, and their scores.

Parameters
@@ -304,20 +313,20 @@ def export_phrases(self, sentences):
Returns
-------
dict(str, float)
Unique phrases mapped to their scores.
Unique phrases found in ``sentences``, mapped to their scores.

Example
-------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=0.1)
>>> phrases = Phrases(sentences, min_count=1, threshold=0.1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> for phrase, score in phrases.export_phrases(sentences).items():
>>> for phrase, score in phrases.find_phrases(sentences).items():
... print(phrase, score)
"""
result = {}
@@ -446,40 +455,26 @@ def __init__(
#. "default" - :func:`~gensim.models.phrases.original_scorer`.
#. "npmi" - :func:`~gensim.models.phrases.npmi_scorer`.
common_terms : set of str, optional
List of "stop words" that won't affect frequency count of expressions containing them.
Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".

Notes
-----
'npmi' is more robust when dealing with common words that form part of common bigrams, and
ranges from -1 to 1, but is slower to calculate than the default. The default is the PMI-like scoring
as described by `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality"
<https://arxiv.org/abs/1310.4546>`_.

To use a custom scoring function, pass in a function with the following signature:

* worda_count - number of corpus occurrences in `sentences` of the first token in the bigram being scored
* wordb_count - number of corpus occurrences in `sentences` of the second token in the bigram being scored
* bigram_count - number of occurrences in `sentences` of the whole bigram
* len_vocab - the number of unique tokens in `sentences`
* min_count - the `min_count` setting of the Phrases class
* corpus_word_count - the total number of tokens (non-unique) in `sentences`
Set of "stop words" that may be included within a phrase, without affecting its scoring.

The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring.
**If your texts are in English, set ``common_terms=phrases.ENGLISH_COMMON_TERMS``.**
This allows detected phrases to span common English articles and prepositions,
as in `bank_of_america` or `eye_of_the_beholder`.

The scoring function **must be pickleable**.
For other languages or specific application domains, use a custom ``common_terms`` set
that makes sense there: ``common_terms=frozenset("der die das".split())`` for German, etc.

Examples
----------
--------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Load corpus and train a model.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> phrases = Phrases(sentences, min_count=1, threshold=1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> # Use the model to detect phrases in a new sentence.
>>> sent = [u'trees', u'graph', u'minors']
@@ -488,7 +483,7 @@ def __init__(
>>>
>>> # Or transform multiple sentences at once.
>>> sents = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
>>> for phrase in frozen_phrases[sents]:
>>> for phrase in phrases[sents]:
... print(phrase)
[u'trees_graph', u'minors']
[u'graph_minors']
@@ -498,6 +493,27 @@ def __init__(
>>> print(frozen_phrases[sent])
[u'trees_graph', u'minors']

Notes
-----

The ``scoring="npmi"`` option is more robust when dealing with common words that form part of common
bigrams, and ranges from -1 to 1, but is slower to calculate than the default ``scoring="default"``.
The default is the PMI-like scoring as described in `Mikolov, et al.: "Distributed
Representations of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.

To use your own custom ``scoring`` function, pass in a function with the following signature:

* ``worda_count`` - number of corpus occurrences in `sentences` of the first token in the bigram being scored
* ``wordb_count`` - number of corpus occurrences in `sentences` of the second token in the bigram being scored
* ``bigram_count`` - number of occurrences in `sentences` of the whole bigram
* ``len_vocab`` - the number of unique tokens in `sentences`
* ``min_count`` - the `min_count` setting of the Phrases class
* ``corpus_word_count`` - the total number of tokens (non-unique) in `sentences`

The scoring function must accept all these parameters, even if it doesn't use them in its scoring.

The scoring function **must be pickleable**.
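
For instance, a minimal custom scorer with the required signature might look
like this (a sketch; the name ``frequency_scorer`` and its logic are
illustrative, not part of gensim):

.. sourcecode:: pycon

    >>> # In real code, define the scorer at module level, so it stays pickleable.
    >>> def frequency_scorer(worda_count, wordb_count, bigram_count,
    ...                      len_vocab, min_count, corpus_word_count):
    ...     # Score a bigram purely by its relative corpus frequency; the
    ...     # remaining parameters are accepted, as required, but unused.
    ...     return bigram_count / corpus_word_count
    >>>
    >>> phrases = Phrases(sentences, min_count=1, threshold=1e-4, scoring=frequency_scorer)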

"""
super().__init__(common_terms=common_terms)
if min_count <= 0:
@@ -621,24 +637,24 @@ def add_vocab(self, sentences):
Parameters
----------
sentences : iterable of list of str
Text corpus.
Text corpus to update this model's parameters from.

Example
-------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Train a phrase detector from a text corpus.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences) # train model
>>> phrases = Phrases(sentences, common_terms=ENGLISH_COMMON_TERMS) # train model
>>> assert len(phrases.vocab) == 37
>>>
>>> more_sentences = [
... [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
... [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes']
... [u'machine', u'learning', u'can', u'be', u'new', u'york', u'sometimes'],
... ]
>>>
>>> phrases.add_vocab(more_sentences) # add new sentences to model
@@ -711,9 +727,28 @@ def freeze(self):
"""
return FrozenPhrases(self)

def export_phrases(self):
"""Extract all found phrases.

Returns
-------
dict(str, float)
Mapping between phrases and their scores.

"""
result, source_vocab = {}, self.vocab
for token in source_vocab:
unigrams = token.split(self.delimiter)
if len(unigrams) < 2:
continue # no phrases here
phrase, score = self.score_candidate(unigrams[0], unigrams[-1], unigrams[1:-1])
if score is not None:
result[phrase] = score
return result
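
Usage sketch (assuming a trained ``phrases`` model as in the examples above;
actual phrases and scores depend on the training corpus):

.. sourcecode:: pycon

    >>> for phrase, score in phrases.export_phrases().items():
    ...     print(phrase, score)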


class FrozenPhrases(_PhrasesTransformation):
"""Minimal state & functionality exported from :class:`~gensim.models.phrases.Phrases`.
"""Minimal state & functionality exported from a trained :class:`~gensim.models.phrases.Phrases` model.

The goal of this class is to cut down memory consumption of `Phrases`, by discarding model state
not strictly needed for the phrase detection task.
@@ -741,11 +776,11 @@ def __init__(self, phrases_model):

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases
>>> from gensim.models.phrases import Phrases, ENGLISH_COMMON_TERMS
>>>
>>> # Load corpus and train a model.
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> phrases = Phrases(sentences, min_count=1, threshold=1, common_terms=ENGLISH_COMMON_TERMS)
>>>
>>> # Export a FrozenPhrases object that is more efficient but doesn't allow further training.
>>> frozen_phrases = phrases.freeze()
@@ -759,33 +794,14 @@ def __init__(self, phrases_model):
self.scoring = phrases_model.scoring
self.common_terms = phrases_model.common_terms
logger.info('exporting phrases from %s', phrases_model)
self.phrasegrams = self._import_phrases(phrases_model)
self.phrasegrams = phrases_model.export_phrases()
logger.info('exported %s', self)

def __str__(self):
return "%s<%i phrases, min_count=%s, threshold=%s>" % (
self.__class__.__name__, len(self.phrasegrams), self.min_count, self.threshold,
)

def _import_phrases(self, phrases_model):
"""Extract all phrases that pass the threshold out of `phrases_model`.

Returns
------
dict[str, float]
Mapping between phrases and their scores.

"""
result, source_vocab = {}, phrases_model.vocab
for token in source_vocab:
unigrams = token.split(self.delimiter)
if len(unigrams) < 2:
continue # no phrases here
phrase, score = phrases_model.score_candidate(unigrams[0], unigrams[-1], unigrams[1:-1])
if score is not None:
result[phrase] = score
return result

def score_candidate(self, word_a, word_b, in_between):
phrase = self.delimiter.join([word_a] + in_between + [word_b])
score = self.phrasegrams.get(phrase, NEGATIVE_INFINITY)
@@ -795,23 +811,3 @@ def score_candidate(self, word_a, word_b, in_between):


Phraser = FrozenPhrases # alias for backward compatibility


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s", " ".join(sys.argv))

# check and process cmdline input
program = os.path.basename(sys.argv[0])
if len(sys.argv) < 2:
print(globals()['__doc__'] % locals())
sys.exit(1)
infile = sys.argv[1]

from gensim.models import Phrases # noqa:F811 for pickle
from gensim.models.word2vec import Text8Corpus
sentences = Text8Corpus(infile)

bigram = Phrases(sentences, min_count=5, threshold=100)
for s in bigram[sentences]:
print(u' '.join(s))