
[WIP] Cythonizing phrases module #1385

Closed · wants to merge 18 commits
2 changes: 1 addition & 1 deletion continuous_integration/travis/flake8_diff.sh
@@ -133,6 +133,6 @@ check_files() {
if [[ "$MODIFIED_FILES" == "no_match" ]]; then
echo "No file has been modified"
else
check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,Dockerfile*"
check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,*.c,Dockerfile*"
fi
echo -e "No problem detected by flake8\n"
72 changes: 43 additions & 29 deletions gensim/models/phrases.py
@@ -70,6 +70,10 @@
 from gensim import utils, interfaces

 logger = logging.getLogger(__name__)
+#from gensim.models.phrases_inner import learn_vocab
+
+
+


 def _is_single(obj):
@@ -105,6 +109,44 @@ class Phrases(interfaces.TransformationABC):
     and `phrases[corpus]` syntax.

     """
+    #from gensim.models.phrases_inner import learn_vocab
+    try:
+        from gensim.models.phrases_inner import learn_vocab
+        logger.info("Cython file loaded")
+    except ImportError:
+        logger.info("Cython file not loaded")
+        #failed... fall back to plain numpy (20-80x slower training than the above)
@piskvorky (Owner) commented on Jun 3, 2017:
Is the 20-80x figure correct? If not, better remove the stale comments and start the development with a clean slate.



+    def learn_vocab(self, sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
+        # Collect unigram/bigram counts from the `sentences` iterable.
+        sentence_no = -1
+        total_words = 0
+        logger.info("collecting all words and their counts")
+        vocab = defaultdict(int)
+        min_reduce = 1
+        for sentence_no, sentence in enumerate(sentences):
+            if sentence_no % progress_per == 0:
+                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
+                            (sentence_no, total_words, len(vocab)))
+            #sentence = [utils.any2utf8(w) for w in sentence]
Contributor commented:
@piskvorky was there any particular reason behind creating the vocab for Phrases with utf8-encoded bytestrings, rather than unicode strings themselves?
Currently, according to profiling done by Prakhar, the utf8 conversion significantly affects performance due to overhead in the conversion and the fact that the conversion is done for every individual word.

@piskvorky (Owner) replied on Jun 15, 2017:
Yes, saving memory.
Up to Python 3.3 (and including all Python 2.x), unicode strings take up 2-4x as much memory, compared to UTF8 byte strings, for normal text.
Since memory is more critical than speed here, we went with UTF8.
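The 2-4x figure can be sanity-checked with sys.getsizeof — a minimal sketch, not part of this PR's diff, assuming text that mixes ASCII and non-ASCII characters (exact ratios vary by Python build and character set):

import sys

# Minimal sketch: compare a unicode string against its UTF8 encoding.
# The gap is largest on Python 2.x and Python < 3.3; numbers vary by build.
text = u"na\u00efve" * 10000               # u"naïve", mixed ASCII/non-ASCII
print(sys.getsizeof(text))                 # unicode representation
print(sys.getsizeof(text.encode("utf8")))  # UTF8 byte string, noticeably smaller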

+            for bigram in zip(sentence, sentence[1:]):
+                vocab[bigram[0]] += 1
+                vocab[delimiter.join(bigram)] += 1
+                total_words += 1
+
+            if sentence:  # add last word skipped by previous loop
+                word = sentence[-1]
+                vocab[word] += 1
+
+            if len(vocab) > max_vocab_size:
+                utils.prune_vocab(vocab, min_reduce)
+                min_reduce += 1
+
+        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
+                    (len(vocab), total_words, sentence_no + 1))
+        return min_reduce, vocab
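For readers new to the pruning step above: utils.prune_vocab trims low-count entries in place whenever the vocab outgrows max_vocab_size. A rough behavioral sketch — the real implementation lives in gensim/utils.py, and the function name and sample data here are illustrative:

from collections import defaultdict

def prune_vocab_sketch(vocab, min_reduce):
    # Roughly what gensim.utils.prune_vocab does: delete every entry whose
    # count is below min_reduce, shrinking the dict in place.
    for word in list(vocab):
        if vocab[word] < min_reduce:
            del vocab[word]

vocab = defaultdict(int, {b'new': 5, b'york': 4, b'new_york': 1})
prune_vocab_sketch(vocab, min_reduce=2)  # removes b'new_york'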

     def __init__(self, sentences=None, min_count=5, threshold=10.0,
                  max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
         """
@@ -157,35 +199,7 @@ def __str__(self):
             self.__class__.__name__, len(self.vocab), self.min_count,
             self.threshold, self.max_vocab_size)

-    @staticmethod
-    def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
-        """Collect unigram/bigram counts from the `sentences` iterable."""
-        sentence_no = -1
-        total_words = 0
-        logger.info("collecting all words and their counts")
-        vocab = defaultdict(int)
-        min_reduce = 1
-        for sentence_no, sentence in enumerate(sentences):
-            if sentence_no % progress_per == 0:
-                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
-                            (sentence_no, total_words, len(vocab)))
-            sentence = [utils.any2utf8(w) for w in sentence]
-            for bigram in zip(sentence, sentence[1:]):
-                vocab[bigram[0]] += 1
-                vocab[delimiter.join(bigram)] += 1
-                total_words += 1
-
-            if sentence:  # add last word skipped by previous loop
-                word = sentence[-1]
-                vocab[word] += 1
-
-            if len(vocab) > max_vocab_size:
-                utils.prune_vocab(vocab, min_reduce)
-                min_reduce += 1
-
-        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
-                    (len(vocab), total_words, sentence_no + 1))
-        return min_reduce, vocab
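For orientation, the static method being removed above could be exercised roughly as follows — a usage sketch, not taken from the PR, with illustrative sample sentences (tokens are converted to UTF8 bytestrings internally, so counts are keyed by bytes):

from gensim.models.phrases import Phrases

# Usage sketch for the (pre-PR) static learn_vocab; sample data is illustrative.
sentences = [[u"new", u"york", u"city"], [u"new", u"york", u"times"]]
min_reduce, vocab = Phrases.learn_vocab(sentences, max_vocab_size=40000000)
print(vocab[b'new'])       # unigram count: 2
print(vocab[b'new_york'])  # bigram joined with delimiter b'_': 2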


     def add_vocab(self, sentences):
         """