From 1ff7416d9fcb2b70ade33e8fd6dc993fa4b9e99e Mon Sep 17 00:00:00 2001 From: Takanori Hayashi Date: Sun, 6 Jun 2021 02:29:02 +0900 Subject: [PATCH 1/2] Faster WMD computation by removing a nested loop --- gensim/models/keyedvectors.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index daa5482184..b3f27cad2d 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -178,6 +178,7 @@ ) import numpy as np from scipy import stats +from scipy.spatial.distance import cdist from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.corpora.dictionary import Dictionary @@ -901,23 +902,16 @@ def wmdistance(self, document1, document2, norm=True): # Both documents are composed of a single unique token => zero distance. return 0.0 - # Sets for faster look-up. - docset1 = set(document1) - docset2 = set(document2) + doclist1 = list(set(document1)) + doclist2 = list(set(document2)) + v1 = np.array([self.get_vector(token, norm=norm) for token in doclist1]) + v2 = np.array([self.get_vector(token, norm=norm) for token in doclist2]) + doc1_indices = dictionary.doc2idx(doclist1) + doc2_indices = dictionary.doc2idx(doclist2) # Compute distance matrix. distance_matrix = zeros((vocab_len, vocab_len), dtype=double) - for i, t1 in dictionary.items(): - if t1 not in docset1: - continue - - for j, t2 in dictionary.items(): - if t2 not in docset2 or distance_matrix[i, j] != 0.0: - continue - - # Compute Euclidean distance between (potentially unit-normed) word vectors. - distance_matrix[i, j] = distance_matrix[j, i] = np.sqrt( - np_sum((self.get_vector(t1, norm=norm) - self.get_vector(t2, norm=norm))**2)) + distance_matrix[np.ix_(doc1_indices, doc2_indices)] = cdist(v1, v2) if abs(np_sum(distance_matrix)) < 1e-8: # `emd` gets stuck if the distance matrix contains only zeros. 
From 6ffedceb393c115dd6e6c9ec26093d477b182f75 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jun 2021 10:46:56 +0900 Subject: [PATCH 2/2] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 143f669e96..5b574083fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Changes * [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness) * [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128): Materialize and copy the corpus passed to SoftCosineSimilarity, by [@Witiko](https://github.com/Witiko) * [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to models/__init__.py, by [@properGrammar](https://github.com/properGrammar) +* [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize Word Mover's Distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0) ### :books: Documentation