From 4728156f029967e3aed4b888c6b0edeeb2346217 Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 18:04:54 +0530 Subject: [PATCH 1/9] fix typo --- gensim/test/test_phrases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 650139fd2b..0475cc5bf4 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -510,9 +510,9 @@ def test__getitem__(self): assert phrased_sentence == ['data_and_graph', 'survey', 'for', 'human_interface'] -class TestFrozenPhrasesModelCompatibilty(unittest.TestCase): +class TestFrozenPhrasesModelCompatibility(unittest.TestCase): - def test_compatibilty(self): + def test_compatibility(self): phrases = Phrases.load(datapath("phrases-3.6.0.model")) phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model")) test_sentences = ['trees', 'graph', 'minors'] From ea0b1cb6e84cf6b0bf7f421059d7750be84a2e29 Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 18:28:17 +0530 Subject: [PATCH 2/9] fix test cases for test_export_phrases --- gensim/test/test_phrases.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 0475cc5bf4..aa2e82f02d 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -213,15 +213,23 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co class TestPhrasesModel(PhrasesCommon, unittest.TestCase): def test_export_phrases(self): - """Test Phrases bigram export phrases.""" + """Test Phrases bigram and trigram export phrases.""" bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ') - seen_bigrams = set(bigram.find_phrases(self.sentences).keys()) + trigram = Phrases(bigram[self.sentences], min_count=1, threshold=1, delimiter=' ') + seen_bigrams = set(bigram.export_phrases().keys()) + seen_trigrams = set(trigram.export_phrases().keys()) - assert seen_bigrams == { + assert seen_bigrams == set([ + 'human interface', 'response time', 'graph minors', + 'minors survey', + ]) + + assert seen_trigrams == set([ 'human interface', - } + 'graph minors survey', + ]) def test_multiple_bigrams_single_entry(self): """Test a single entry produces multiple bigrams.""" @@ -443,14 +451,19 @@ def test_multiple_bigrams_single_entry(self): def test_export_phrases(self): """Test Phrases bigram export phrases.""" - bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ') - seen_bigrams = set(bigram.find_phrases(self.sentences).keys()) + bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ') + trigram = Phrases(bigram[self.sentences], min_count=1, threshold=1, delimiter=' ') + seen_bigrams = set(bigram.export_phrases().keys()) assert seen_bigrams == set([ + 'and graph', + 'data and', + 'graph of', + 'graph survey', 'human interface', - 'graph of trees', - 'data and graph', - 'lack of interest', + 'lack of', + 'of interest', + 'of trees', ]) def test_scoring_default(self): From d82bb7d161de592ac3eb901259ae384636313f74 Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 18:39:23 +0530 Subject: [PATCH 3/9] add test cases for test_find_phrases --- gensim/test/test_phrases.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index aa2e82f02d..6f474f9c40 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -231,6 +231,17 @@ def test_export_phrases(self): 'graph minors survey', ]) + def test_find_phrases(self): + """Test Phrases bigram find phrases.""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ') + seen_bigrams = set(bigram.find_phrases(self.sentences).keys()) + + assert seen_bigrams == set([ + 'response time', + 'graph minors', + 'human interface', + ]) + def test_multiple_bigrams_single_entry(self): """Test a single entry produces multiple bigrams.""" bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ') @@ -449,6 +460,18 @@ def test_multiple_bigrams_single_entry(self): 'human interface', ]) + def test_find_phrases(self): + """Test Phrases bigram export phrases.""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ') + seen_bigrams = set(bigram.find_phrases(self.sentences).keys()) + + assert seen_bigrams == set([ + 'human interface', + 'graph of trees', + 'data and graph', + 'lack of interest', + ]) + def test_export_phrases(self): """Test Phrases bigram export phrases.""" bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ') From 856692cf6b4f08e5e18a917b437892ea12eaf935 Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 18:56:05 +0530 Subject: [PATCH 4/9] Fix #3031 Runtime error in phrases.py --- gensim/models/phrases.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index b6dc8d6970..9c0001e25f 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -62,7 +62,6 @@ """ import logging -from collections import defaultdict import itertools from math import log import pickle @@ -412,7 +411,7 @@ def load(cls, *args, **kwargs): if not isinstance(word, str): logger.info("old version of %s loaded, upgrading %i words in memory", cls.__name__, len(model.vocab)) logger.info("re-save the loaded model to avoid this upgrade in the future") - vocab = defaultdict(int) + vocab = dict() for key, value in model.vocab.items(): # needs lots of extra RAM temporarily! vocab[str(key, encoding='utf8')] = value model.vocab = vocab @@ -554,7 +553,7 @@ def __init__( self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size - self.vocab = defaultdict(int) # mapping between token => its count + self.vocab = dict() # mapping between token => its count self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per @@ -579,7 +578,7 @@ def __str__(self): def _learn_vocab(sentences, max_vocab_size, delimiter, connector_words, progress_per): """Collect unigram and bigram counts from the `sentences` iterable.""" sentence_no, total_words, min_reduce = -1, 0, 1 - vocab = defaultdict(int) + vocab = dict() logger.info("collecting all words and their counts") for sentence_no, sentence in enumerate(sentences): if sentence_no % progress_per == 0: @@ -590,10 +589,11 @@ def _learn_vocab(sentences, max_vocab_size, delimiter, connector_words, progress start_token, in_between = None, [] for word in sentence: if word not in connector_words: - vocab[word] += 1 + vocab[word] = vocab.get(word, 0) + 1 if start_token is not None: phrase_tokens = itertools.chain([start_token], in_between, [word]) - vocab[delimiter.join(phrase_tokens)] += 1 + joined_phrase_token = delimiter.join(phrase_tokens) + vocab[joined_phrase_token] = vocab.get(joined_phrase_token, 0) + 1 start_token, in_between = word, [] # treat word as both end of a phrase AND beginning of another elif start_token is not None: in_between.append(word) @@ -654,7 +654,7 @@ def add_vocab(self, sentences): logger.info("merging %i counts into %s", len(vocab), self) self.min_reduce = max(self.min_reduce, min_reduce) for word, count in vocab.items(): - self.vocab[word] += count + self.vocab[word] = self.vocab.get(word, 0) + count if len(self.vocab) > self.max_vocab_size: utils.prune_vocab(self.vocab, self.min_reduce) self.min_reduce += 1 @@ -666,17 +666,17 @@ def add_vocab(self, sentences): def score_candidate(self, word_a, word_b, in_between): # Micro optimization: check for quick early-out conditions, before the actual scoring. - word_a_cnt = self.vocab[word_a] + word_a_cnt = self.vocab.get(word_a, 0) if word_a_cnt <= 0: return None, None - word_b_cnt = self.vocab[word_b] + word_b_cnt = self.vocab.get(word_b, 0) if word_b_cnt <= 0: return None, None phrase = self.delimiter.join([word_a] + in_between + [word_b]) # XXX: Why do we care about *all* phrase tokens? Why not just score the start+end bigram? - phrase_cnt = self.vocab[phrase] + phrase_cnt = self.vocab.get(phrase, 0) if phrase_cnt <= 0: return None, None @@ -788,4 +788,4 @@ def score_candidate(self, word_a, word_b, in_between): return None, None -Phraser = FrozenPhrases # alias for backward compatibility +Phraser = FrozenPhrases # alias for backward compatibility \ No newline at end of file From 4fff8ddf3d9ed3e810bf8ac96ce6e0870e1a83c8 Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 19:52:49 +0530 Subject: [PATCH 5/9] remove unused variable reference --- gensim/test/test_phrases.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 6f474f9c40..bbfbfaad40 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -475,9 +475,7 @@ def test_find_phrases(self): def test_export_phrases(self): """Test Phrases bigram export phrases.""" bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ') - trigram = Phrases(bigram[self.sentences], min_count=1, threshold=1, delimiter=' ') seen_bigrams = set(bigram.export_phrases().keys()) - assert seen_bigrams == set([ 'and graph', 'data and', From 9ad8e0144ecd20d2231c1adaea53d03985360afc Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 20:03:06 +0530 Subject: [PATCH 6/9] fix newline to end of file --- gensim/models/phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9c0001e25f..30c775320a 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -788,4 +788,4 @@ def score_candidate(self, word_a, word_b, in_between): return None, None -Phraser = FrozenPhrases # alias for backward compatibility \ No newline at end of file +Phraser = FrozenPhrases # alias for backward compatibility From e76b948e455c1d1bb37b6e49306ab9cc07ceea08 Mon Sep 17 00:00:00 2001 From: Thalish Sajeed Date: Sun, 7 Feb 2021 21:03:34 +0530 Subject: [PATCH 7/9] fix formattingpy --- gensim/models/phrases.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 30c775320a..410fb9dced 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -411,7 +411,7 @@ def load(cls, *args, **kwargs): if not isinstance(word, str): logger.info("old version of %s loaded, upgrading %i words in memory", cls.__name__, len(model.vocab)) logger.info("re-save the loaded model to avoid this upgrade in the future") - vocab = dict() + vocab = {} for key, value in model.vocab.items(): # needs lots of extra RAM temporarily! vocab[str(key, encoding='utf8')] = value model.vocab = vocab @@ -553,7 +553,7 @@ def __init__( self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size - self.vocab = dict() # mapping between token => its count + self.vocab = {} # mapping between token => its count self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per @@ -578,7 +578,7 @@ def __str__(self): def _learn_vocab(sentences, max_vocab_size, delimiter, connector_words, progress_per): """Collect unigram and bigram counts from the `sentences` iterable.""" sentence_no, total_words, min_reduce = -1, 0, 1 - vocab = dict() + vocab = {} logger.info("collecting all words and their counts") for sentence_no, sentence in enumerate(sentences): if sentence_no % progress_per == 0: From 7f93a9dff4e2a8f6fcc6ec65c6ce96d5a32171f4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 13 Feb 2021 09:30:37 +0900 Subject: [PATCH 8/9] Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2800e0d34d..cd329becef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ Changes ======= +## Unreleased + +- fix RuntimeError in export_phrases (change defaultdict to dict) (PR [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041), [@thalishsajeed](https://github.com/thalishsajeed) + ## 4.0.0beta, 2020-10-31 **⚠️ Gensim 4.0 contains breaking API changes! See the [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) to update your existing Gensim 3.x code and models.** From ee97c7bb69b79b62c1bee099c4e7c2275956b29b Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 13 Feb 2021 09:30:49 +0900 Subject: [PATCH 9/9] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd329becef..6996ed2477 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ Changes ## Unreleased -- fix RuntimeError in export_phrases (change defaultdict to dict) (PR [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041), [@thalishsajeed](https://github.com/thalishsajeed) +- fix RuntimeError in export_phrases (change defaultdict to dict) (PR [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041), [@thalishsajeed](https://github.com/thalishsajeed)) ## 4.0.0beta, 2020-10-31