From 976a7de7e3be0f6f67d3a352fd4f1ee9b43ae559 Mon Sep 17 00:00:00 2001 From: Mack Date: Mon, 26 Jun 2017 11:12:00 -0400 Subject: [PATCH] Fix issues with `WordOccurenceAccumulator` on Windows. Fix #1441 (#1449) * #1441: Fix issues with `WordOccurenceAccumulator` on Windows. * #1441: Use pre-scipy0.17 version of `scipy.sparse.diags` function by passing explicit `offsets` parameter. --- gensim/topic_coherence/text_analysis.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index a44e57fb3e..1be0574d7b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -221,7 +221,6 @@ def __init__(self, *args): self._co_occurrences = sps.lil_matrix((self._vocab_size, self._vocab_size), dtype='uint32') self._uniq_words = np.zeros((self._vocab_size + 1,), dtype=bool) # add 1 for none token - self._mask = self._uniq_words[:-1] # to exclude none token self._counter = Counter() def __str__(self): @@ -251,9 +250,10 @@ def partial_accumulate(self, texts, window_size): def analyze_text(self, window, doc_num=None): self._slide_window(window, doc_num) - if self._mask.any(): - self._occurrences[self._mask] += 1 - self._counter.update(itertools.combinations(np.nonzero(self._mask)[0], 2)) + mask = self._uniq_words[:-1] # to exclude none token + if mask.any(): + self._occurrences[mask] += 1 + self._counter.update(itertools.combinations(np.nonzero(mask)[0], 2)) def _slide_window(self, window, doc_num): if doc_num != self._current_doc_num: @@ -273,7 +273,8 @@ def _symmetrize(self): """ co_occ = self._co_occurrences co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts - self._co_occurrences = co_occ + co_occ.T - sps.diags(co_occ.diagonal(), dtype='uint32') + self._co_occurrences = \ + co_occ + co_occ.T - sps.diags(co_occ.diagonal(), offsets=0, dtype='uint32') def _get_occurrences(self, word_id): return 
self._occurrences[word_id]