Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add common_terms parameter to sklearn_api.PhrasesTransformer #2074

Merged
merged 10 commits into from
Oct 4, 2018
19 changes: 14 additions & 5 deletions gensim/sklearn_api/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from sklearn.exceptions import NotFittedError

from gensim import models
from gensim.models.phrases import Phraser


class PhrasesTransformer(TransformerMixin, BaseEstimator):
Expand All @@ -41,7 +42,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator):

"""
def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
             delimiter=b'_', progress_per=10000, scoring='default', common_terms=frozenset()):
    """Initialize the transformer.

    All parameters are stored as-is and forwarded to
    :class:`gensim.models.Phrases` when :meth:`fit` / :meth:`partial_fit` is called.

    Parameters
    ----------
    min_count : int, optional
        Ignore all words and bigrams with total collected count lower than this.
    threshold : float, optional
        Score threshold for forming phrases; higher values mean fewer phrases.
    max_vocab_size : int, optional
        Maximum size of the vocabulary used for collocation statistics.
    delimiter : bytes, optional
        Glue character joining the tokens of a detected phrase.
    progress_per : int, optional
        Report training progress after this many words have been processed.
    scoring : str or function, optional
        Name of the scoring method ('default', 'npmi') or a custom scoring
        function. A custom function must accept the parameters expected by
        :class:`gensim.models.Phrases` and must be pickleable.
    common_terms : set of str, optional
        List of "stop words" that won't affect frequency count of expressions containing them.
        Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder".

    """
    # Both the raw Phrases model and its exported Phraser are created in fit().
    self.gensim_model = None
    self.phraser = None
    self.min_count = min_count
    self.threshold = threshold
    self.max_vocab_size = max_vocab_size
    self.delimiter = delimiter
    self.progress_per = progress_per
    self.scoring = scoring
    self.common_terms = common_terms

def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : iterable of list of str
        Sequence of sentences (each a list of tokens) to learn phrases from.
    y : object, optional
        Ignored; present only for scikit-learn API compatibility.

    Returns
    -------
    :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
        The trained model.

    """
    phrases_params = dict(
        sentences=X,
        min_count=self.min_count,
        threshold=self.threshold,
        max_vocab_size=self.max_vocab_size,
        delimiter=self.delimiter,
        progress_per=self.progress_per,
        scoring=self.scoring,
        common_terms=self.common_terms,
    )
    self.gensim_model = models.Phrases(**phrases_params)
    # Export a lightweight Phraser for fast transform() lookups.
    self.phraser = Phraser(self.gensim_model)
    return self

def transform(self, docs):
    """Transform the input documents into detected phrases.

    Parameters
    ----------
    docs : {iterable of list of str, list of str}
        Sequence of sentences to transform, or a single sentence.

    Returns
    -------
    list of list of str
        Phrase representation for each of the input sentences.

    Raises
    ------
    NotFittedError
        If the model was never fitted (no trained `gensim_model` available).

    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    # Models pickled before the `phraser` attribute was introduced load with
    # a fitted `gensim_model` but no `phraser` (or the attribute missing
    # entirely); rebuild it lazily instead of refusing to transform.
    # getattr guards against the attribute being absent on old pickles.
    if getattr(self, 'phraser', None) is None:
        self.phraser = Phraser(self.gensim_model)

    # input as python lists
    if isinstance(docs[0], string_types):
        docs = [docs]

    return [self.phraser[doc] for doc in docs]

def partial_fit(self, X):
    """Train model over a potentially incomplete set of sentences.

    This method can be used in two ways:
    1. On an unfitted model: the internal model is created and X is counted.
    2. On an already-fitted model: X's statistics are added to the existing vocabulary.

    Parameters
    ----------
    X : iterable of list of str
        Sequence of sentences (each a list of tokens).

    Returns
    -------
    :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
        The trained model.

    """
    if self.gensim_model is None:
        # Create an EMPTY Phrases model (sentences=None); the vocabulary is
        # added by the add_vocab() call below. Passing sentences=X here AND
        # calling add_vocab(X) would count every n-gram in X twice on the
        # first call, inflating frequencies against min_count/threshold.
        self.gensim_model = models.Phrases(
            sentences=None, min_count=self.min_count, threshold=self.threshold,
            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
            progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms
        )

    self.gensim_model.add_vocab(X)
    # Re-export the Phraser so transform() reflects the updated vocabulary.
    self.phraser = Phraser(self.gensim_model)
    return self
60 changes: 60 additions & 0 deletions gensim/test/test_sklearn_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,14 @@
['graph', 'minors', 'survey', 'human', 'interface']
]

# Fixtures for the common_terms tests: each sentence embeds stop words
# ("of", "the", ...) inside a multiword expression such as "bank of america",
# which plain bigram detection would miss but common_terms handling catches.
common_terms = ["of", "the", "was", "are"]
phrases_w_common_terms = [
    [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
    [u'the', u'mayor', u'of', u'new', u'orleans', u'was', u'there'],
    [u'the', u'bank', u'of', u'america', u'offices', u'are', u'open'],
    [u'the', u'bank', u'of', u'america', u'offices', u'are', u'closed']
]


class TestLdaWrapper(unittest.TestCase):
def setUp(self):
Expand Down Expand Up @@ -1151,6 +1159,58 @@ def testModelNotFitted(self):
self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])


class TestPhrasesTransformerCommonTerms(unittest.TestCase):
def setUp(self):
    # NOTE: no numpy.random.seed(0) here (unlike older test cases in this
    # file) — seeding would mutate numpy's GLOBAL random state and leak into
    # every other test case, and nothing in Phrases training needs it here.
    self.model = PhrasesTransformer(min_count=1, threshold=1, common_terms=common_terms)
    self.expected_transformations = [
        [u'the', u'mayor_of_new', u'york', u'was', u'there'],
        [u'the', u'mayor_of_new', u'orleans', u'was', u'there'],
        [u'the', u'bank_of_america', u'offices', u'are', u'open'],
        [u'the', u'bank_of_america', u'offices', u'are', u'closed']
    ]

def testFitAndTransform(self):
    """A separate fit() followed by transform() yields the expected phrases."""
    fitted_model = self.model.fit(phrases_w_common_terms)
    self.assertEqual(
        fitted_model.transform(phrases_w_common_terms),
        self.expected_transformations,
    )

def testFitTransform(self):
    """The combined fit_transform() call yields the expected phrases."""
    result = self.model.fit_transform(phrases_w_common_terms)
    self.assertEqual(result, self.expected_transformations)

def testPartialFit(self):
    """Incrementally extending the vocabulary via partial_fit() updates transform()."""
    # fit half of the sentences
    self.model.fit(phrases_w_common_terms[:2])

    expected_transformations_0 = [
        [u'the', u'mayor_of_new', u'york', u'was', u'there'],
        [u'the', u'mayor_of_new', u'orleans', u'was', u'there'],
        [u'the', u'bank', u'of', u'america', u'offices', u'are', u'open'],
        [u'the', u'bank', u'of', u'america', u'offices', u'are', u'closed']
    ]
    # transform all sentences, second half should be same as original
    transformed_0 = self.model.transform(phrases_w_common_terms)
    self.assertEqual(transformed_0, expected_transformations_0)

    # fit remaining sentences, result should be the same as in the other tests.
    # Must use transform(), NOT fit_transform(): fit_transform would refit
    # from scratch and silently bypass the partial_fit state under test.
    self.model.partial_fit(phrases_w_common_terms[2:])
    transformed_1 = self.model.transform(phrases_w_common_terms)
    self.assertEqual(transformed_1, self.expected_transformations)

    new_phrases = [[u'offices', u'are', u'open'], [u'offices', u'are', u'closed']]
    self.model.partial_fit(new_phrases)
    expected_transformations_2 = [
        [u'the', u'mayor_of_new', u'york', u'was', u'there'],
        [u'the', u'mayor_of_new', u'orleans', u'was', u'there'],
        [u'the', u'bank_of_america', u'offices_are_open'],
        [u'the', u'bank_of_america', u'offices_are_closed']
    ]
    transformed_2 = self.model.transform(phrases_w_common_terms)
    self.assertEqual(transformed_2, expected_transformations_2)


# specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter

# this is intentionally in main rather than a class method to support pickling
Expand Down