Skip to content

Commit

Permalink
Reduce Phraser memory usage (drop frequencies) (#2208)
Browse files Browse the repository at this point in the history
* fix phraser memory

* reduce phraser memory

* using isinstance

* update model when loaded

* update model when loaded

* update model when loaded

* updated changes

* updated changes

* update changes

* fix loading

* make test better
  • Loading branch information
jenishah authored and menshikh-iv committed Jan 11, 2019
1 parent a3dbdcc commit c5a8f73
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 3 deletions.
11 changes: 9 additions & 2 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,13 @@ def load(cls, *args, **kwargs):
"""
model = super(PhrasesTransformation, cls).load(*args, **kwargs)
# update older models
# if value in phrasegrams dict is a tuple, load only the scores.

for component, score in getattr(model, "phrasegrams", {}).items():
if isinstance(score, tuple):
frequency, score_val = score
model.phrasegrams[component] = score_val

# if no scoring parameter, use default scoring
if not hasattr(model, 'scoring'):
logger.info('older version of %s loaded without scoring function', cls.__name__)
Expand Down Expand Up @@ -814,7 +821,7 @@ def __init__(self, phrases_model):
for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True):
if bigram in self.phrasegrams:
logger.info('Phraser repeat %s', bigram)
self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score)
self.phrasegrams[bigram] = score
count += 1
if not count % 50000:
logger.info('Phraser added %i phrasegrams', count)
Expand Down Expand Up @@ -857,7 +864,7 @@ def score_item(self, worda, wordb, components, scorer):
"""
try:
return self.phrasegrams[tuple(components)][1]
return self.phrasegrams[tuple(components)]
except KeyError:
return -1

Expand Down
Binary file added gensim/test/test_data/phraser-3.6.0.model
Binary file not shown.
Binary file added gensim/test/test_data/phrases-3.6.0.model
Binary file not shown.
17 changes: 16 additions & 1 deletion gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import unittest

import six

import numpy as np

from gensim.utils import to_unicode
Expand Down Expand Up @@ -646,6 +645,22 @@ def testEncoding(self):
self.assertTrue(isinstance(transformed, six.text_type))


class TestPhraserModelCompatibilty(unittest.TestCase):
    """Backward-compatibility check against the stored ``*-3.6.0.model`` fixtures."""

    def testCompatibilty(self):
        """Load the saved Phraser and Phrases fixtures and verify both still merge the bigram."""
        legacy_phraser = Phraser.load(datapath("phraser-3.6.0.model"))
        legacy_phrases = Phrases.load(datapath("phrases-3.6.0.model"))

        # A single tokenized sentence; both models are expected to join the last two tokens.
        tokens = ['trees', 'graph', 'minors']
        expected = ['trees', 'graph_minors']

        phraser_result = legacy_phraser[tokens]
        phrases_result = legacy_phrases[tokens]

        self.assertEqual(phraser_result, expected)
        self.assertEqual(phrases_result, expected)


if __name__ == '__main__':
    # Emit debug-level log records while the test suite runs as a script.
    logging.basicConfig(level=logging.DEBUG, format='%(asctime)s : %(levelname)s : %(message)s')
    unittest.main()

0 comments on commit c5a8f73

Please sign in to comment.