
[WIP] Cythonizing phrases module #1385

Closed · wants to merge 18 commits
2 changes: 1 addition & 1 deletion continuous_integration/travis/flake8_diff.sh
@@ -133,6 +133,6 @@ check_files() {
if [[ "$MODIFIED_FILES" == "no_match" ]]; then
echo "No file has been modified"
else
check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,Dockerfile*"
check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,*.c,Dockerfile*"
fi
echo -e "No problem detected by flake8\n"
72 changes: 43 additions & 29 deletions gensim/models/phrases.py
@@ -70,6 +70,10 @@
 from gensim import utils, interfaces

 logger = logging.getLogger(__name__)
+#from gensim.models.phrases_inner import learn_vocab
+
+
+


 def _is_single(obj):
@@ -105,6 +109,44 @@ class Phrases(interfaces.TransformationABC):
     and `phrases[corpus]` syntax.

     """
+    #from gensim.models.phrases_inner import learn_vocab
+    try:
+        from gensim.models.phrases_inner import learn_vocab
+        logger.info("Cython file loaded")
+    except ImportError:
+        logger.info("Cython file not loaded")
+        #failed... fall back to plain numpy (20-80x slower training than the above)
@piskvorky (Owner) commented on Jun 3, 2017:
Is the 20-80x figure correct? If not, better remove the stale comments and start the development with a clean slate.



+    def learn_vocab(self, sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
+        # Collect unigram/bigram counts from the `sentences` iterable.
+        sentence_no = -1
+        total_words = 0
+        logger.info("collecting all words and their counts")
+        vocab = defaultdict(int)
+        min_reduce = 1
+        for sentence_no, sentence in enumerate(sentences):
+            if sentence_no % progress_per == 0:
+                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
+                            (sentence_no, total_words, len(vocab)))
+            #sentence = [utils.any2utf8(w) for w in sentence]
Contributor commented:
@piskvorky was there any particular reason behind creating the vocab for Phrases with utf8-encoded bytestrings, rather than unicode strings themselves?
Currently, according to profiling done by Prakhar, the utf8 conversion significantly affects performance due to overhead in the conversion and the fact that the conversion is done for every individual word.

@piskvorky (Owner) replied on Jun 15, 2017:
Yes, saving memory.
Up to Python 3.3 (and including all Python 2.x), unicode strings take up 2-4x as much memory, compared to UTF8 byte strings, for normal text.
Since memory is more critical than speed here, we went with UTF8.
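The 2-4x figure can be sanity-checked with sys.getsizeof — a minimal sketch, not part of this PR's diff, assuming text that mixes ASCII and non-ASCII characters (exact ratios vary by Python build and character set):

import sys

# Minimal sketch: compare a unicode string against its UTF8 encoding.
# The gap is largest on Python 2.x and Python < 3.3; numbers vary by build.
text = u"na\u00efve" * 10000               # u"naïve", mixed ASCII/non-ASCII
print(sys.getsizeof(text))                 # unicode representation
print(sys.getsizeof(text.encode("utf8")))  # UTF8 byte string, noticeably smaller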

+            for bigram in zip(sentence, sentence[1:]):
+                vocab[bigram[0]] += 1
+                vocab[delimiter.join(bigram)] += 1
+                total_words += 1
+
+            if sentence:  # add last word skipped by previous loop
+                word = sentence[-1]
+                vocab[word] += 1
+
+            if len(vocab) > max_vocab_size:
+                utils.prune_vocab(vocab, min_reduce)
+                min_reduce += 1
+
+        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
+                    (len(vocab), total_words, sentence_no + 1))
+        return min_reduce, vocab
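For readers new to the pruning step above: utils.prune_vocab trims low-count entries in place whenever the vocab outgrows max_vocab_size. A rough behavioral sketch — the real implementation lives in gensim/utils.py, and the function name and sample data here are illustrative:

from collections import defaultdict

def prune_vocab_sketch(vocab, min_reduce):
    # Roughly what gensim.utils.prune_vocab does: delete every entry whose
    # count is below min_reduce, shrinking the dict in place.
    for word in list(vocab):
        if vocab[word] < min_reduce:
            del vocab[word]

vocab = defaultdict(int, {b'new': 5, b'york': 4, b'new_york': 1})
prune_vocab_sketch(vocab, min_reduce=2)  # removes b'new_york'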

     def __init__(self, sentences=None, min_count=5, threshold=10.0,
                  max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
         """
@@ -157,35 +199,7 @@ def __str__(self):
             self.__class__.__name__, len(self.vocab), self.min_count,
             self.threshold, self.max_vocab_size)

-    @staticmethod
-    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
-        """Collect unigram/bigram counts from the `sentences` iterable."""
-        sentence_no = -1
-        total_words = 0
-        logger.info("collecting all words and their counts")
-        vocab = defaultdict(int)
-        min_reduce = 1
-        for sentence_no, sentence in enumerate(sentences):
-            if sentence_no % progress_per == 0:
-                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
-                            (sentence_no, total_words, len(vocab)))
-            sentence = [utils.any2utf8(w) for w in sentence]
-            for bigram in zip(sentence, sentence[1:]):
-                vocab[bigram[0]] += 1
-                vocab[delimiter.join(bigram)] += 1
-                total_words += 1
-
-            if sentence:  # add last word skipped by previous loop
-                word = sentence[-1]
-                vocab[word] += 1
-
-            if len(vocab) > max_vocab_size:
-                utils.prune_vocab(vocab, min_reduce)
-                min_reduce += 1
-
-        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
-                    (len(vocab), total_words, sentence_no + 1))
-        return min_reduce, vocab
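For orientation, the static method being removed above could be exercised roughly as follows — a usage sketch, not taken from the PR, with illustrative sample sentences (tokens are converted to UTF8 bytestrings internally, so counts are keyed by bytes):

from gensim.models.phrases import Phrases

# Usage sketch for the (pre-PR) static learn_vocab; sample data is illustrative.
sentences = [[u"new", u"york", u"city"], [u"new", u"york", u"times"]]
min_reduce, vocab = Phrases.learn_vocab(sentences, max_vocab_size=40000000)
print(vocab[b'new'])       # unigram count: 2
print(vocab[b'new_york'])  # bigram joined with delimiter b'_': 2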


     def add_vocab(self, sentences):
         """