[MRG] Poincare model keyedvectors #1700
Changes from 163 commits
```diff
@@ -73,6 +73,7 @@
     double, array, vstack, fromstring, sqrt, newaxis,\
     ndarray, sum as np_sum, prod, ascontiguousarray,\
     argmax
+import numpy as np

 from gensim import utils, matutils  # utility fnc for pickling, common scipy operations etc
 from gensim.corpora.dictionary import Dictionary

@@ -103,28 +104,19 @@ def __str__(self):
         return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))


-class KeyedVectors(utils.SaveLoad):
+class KeyedVectorsBase(utils.SaveLoad):
     """
-    Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
-    involved in training such as most_similar()
+    Base class to contain vectors and vocab for any set of vectors which are each associated with a key.

     """

     def __init__(self):
         self.syn0 = []
-        self.syn0norm = None
         self.vocab = {}
         self.index2word = []
         self.vector_size = None

-    @property
-    def wv(self):
-        return self
-
-    def save(self, *args, **kwargs):
-        # don't bother storing the cached normalized vectors
-        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
-        super(KeyedVectors, self).save(*args, **kwargs)
-
     def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
         """
         Store the input-hidden weight matrix in the same format used by the original
```
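The base class keeps three parallel structures: `syn0` holds one vector per row, `index2word` maps a row index back to its key, and `vocab` maps a key to metadata that includes its row index. A minimal sketch of how they line up, using made-up 2-d vectors and assuming the `Vocab` helper (whose `__str__` appears at the top of this hunk) is importable from the same module:

```python
import numpy as np

from gensim.models.keyedvectors import Vocab  # assumption: Vocab is defined in this module

words = ['office', 'products']
syn0 = np.array([[0.1, -0.2],
                 [0.3, 0.4]])          # one vector per row
index2word = list(words)               # row index -> key
vocab = {word: Vocab(index=i, count=1) for i, word in enumerate(words)}  # key -> metadata

# word_vec() resolves a key via vocab[...].index into syn0:
assert np.array_equal(syn0[vocab['office'].index], np.array([0.1, -0.2]))
```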
```diff
@@ -263,6 +255,121 @@ def add_word(word, weights):
         logger.info("loaded %s matrix from %s", result.syn0.shape, fname)
         return result

+    def similarity(self, word_1, word_2):
+        """
+        Compute similarity between vectors of two input words.
+        To be implemented by child class.
+        """
+        raise NotImplementedError
+
+    def distance(self, word_1, word_2):
+        """
+        Compute distance between vectors of two input words.
+        To be implemented by child class.
+        """
+        raise NotImplementedError
+
+    def word_vec(self, word):
+        """
+        Accept a single word as input.
+        Returns the word's representations in vector space, as a 1D numpy array.
+
+        Example::
+
+          >>> trained_model['office']
+          array([ -1.40128313e-02, ...])
+
+        """
+        if word in self.vocab:
+            result = self.syn0[self.vocab[word].index]
+            result.setflags(write=False)
+            return result
+        else:
+            raise KeyError("word '%s' not in vocabulary" % word)
+
+    def __getitem__(self, words):
+        """
+        Accept a single word or a list of words as input.
+
+        If a single word: returns the word's representations in vector space, as
+        a 1D numpy array.
+
+        Multiple words: return the words' representations in vector space, as a
+        2d numpy array: #words x #vector_size. Matrix rows are in the same order
+        as in input.
+
+        Example::
+
+          >>> trained_model['office']
+          array([ -1.40128313e-02, ...])
+
+          >>> trained_model[['office', 'products']]
+          array([ -1.40128313e-02, ...]
+                [ -1.70425311e-03, ...]
+                 ...)
+
+        """
+        if isinstance(words, string_types):
+            # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
+            return self.word_vec(words)
+
+        return vstack([self.word_vec(word) for word in words])
+
+    def __contains__(self, word):
+        return word in self.vocab
+
+    def most_similar(self, word, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words to the given word, sorted in increasing order of distance.
+        To be implemented by child classes.
+
+        """
+        raise NotImplementedError
+
+    def most_similar_to_given(self, w1, word_list):
+        """Return the word from word_list most similar to w1.
+
+        Args:
+            w1 (str): a word
+            word_list (list): list of words containing a word most similar to w1
+
+        Returns:
+            the word in word_list with the highest similarity to w1
+
+        Raises:
+            KeyError: If w1 or any word in word_list is not in the vocabulary
+
+        Example::
+
+          >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
+          'sound'
+
+          >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
+          'animal'
+
+        """
+        return word_list[argmax([self.similarity(w1, word) for word in word_list])]
+
+
+class EuclideanKeyedVectors(KeyedVectorsBase):
+    """
+    Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
+    involved in training such as most_similar()
+    """
+
+    def __init__(self):
+        super(EuclideanKeyedVectors, self).__init__()
+        self.syn0norm = None
+
+    @property
+    def wv(self):
+        return self
+
+    def save(self, *args, **kwargs):
+        # don't bother storing the cached normalized vectors
+        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
+        super(EuclideanKeyedVectors, self).save(*args, **kwargs)
+
     def word_vec(self, word, use_norm=False):
         """
         Accept a single word as input.
```

Review comment on `def similarity(self, word_1, word_2)`: The parameters are called …

Review comment on the `>>> trained_model['office']` example in `word_vec()`: Weird example. Should be …
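Because `similarity()`, `distance()` and `most_similar()` are declared as `NotImplementedError` hooks, any metric, Euclidean or hyperbolic, can plug in by subclassing. A minimal sketch, not part of this PR; the class name and dot-product scoring below are invented for illustration:

```python
import numpy as np

from gensim.models.keyedvectors import KeyedVectorsBase


class DotProductKeyedVectors(KeyedVectorsBase):
    """Hypothetical subclass scoring word pairs by raw dot product."""

    def similarity(self, word_1, word_2):
        # word_vec() is inherited from KeyedVectorsBase
        return float(np.dot(self.word_vec(word_1), self.word_vec(word_2)))

    def distance(self, word_1, word_2):
        # higher similarity -> smaller distance
        return -self.similarity(word_1, word_2)

    def most_similar(self, word, topn=10, restrict_vocab=None):
        candidates = self.index2word[:restrict_vocab] if restrict_vocab else self.index2word
        scored = [(w, self.similarity(word, w)) for w in candidates if w != word]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:topn]
```

Note that the inherited `__getitem__`, `__contains__` and `most_similar_to_given` then work unchanged, since they rely only on `word_vec()` and `similarity()`.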
```diff
@@ -356,6 +463,44 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None):
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]

+    def similar_by_word(self, word, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words.
+
+        If topn is False, similar_by_word returns the vector of similarity scores.
+
+        `restrict_vocab` is an optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_word('graph')
+          [('user', 0.9999163150787354), ...]
+
+        """
+        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
+
+    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words by vector.
+
+        If topn is False, similar_by_vector returns the vector of similarity scores.
+
+        `restrict_vocab` is an optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_vector([1,2])
+          [('survey', 0.9942699074745178), ...]
+
+        """
+        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
+
     def wmdistance(self, document1, document2):
         """
         Compute the Word Mover's Distance between two documents. When using this
```
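Since `similar_by_vector()` forwards any raw vector into `most_similar(positive=[vector], ...)`, it can answer queries that no single key expresses. A hypothetical usage sketch (assumes a trained model `kv` of this class containing the words used):

```python
# nearest words to the midpoint of two word vectors
midpoint = (kv['woman'] + kv['man']) / 2
print(kv.similar_by_vector(midpoint, topn=5))

# single-word form; just sugar for most_similar(positive=['woman'])
print(kv.similar_by_word('woman', topn=5))
```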
```diff
@@ -511,46 +656,6 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]

-    def similar_by_word(self, word, topn=10, restrict_vocab=None):
-        """
-        Find the top-N most similar words.
-
-        If topn is False, similar_by_word returns the vector of similarity scores.
-
-        `restrict_vocab` is an optional integer which limits the range of vectors which
-        are searched for most-similar values. For example, restrict_vocab=10000 would
-        only check the first 10000 word vectors in the vocabulary order. (This may be
-        meaningful if you've sorted the vocabulary by descending frequency.)
-
-        Example::
-
-          >>> trained_model.similar_by_word('graph')
-          [('user', 0.9999163150787354), ...]
-
-        """
-
-        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
-
-    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
-        """
-        Find the top-N most similar words by vector.
-
-        If topn is False, similar_by_vector returns the vector of similarity scores.
-
-        `restrict_vocab` is an optional integer which limits the range of vectors which
-        are searched for most-similar values. For example, restrict_vocab=10000 would
-        only check the first 10000 word vectors in the vocabulary order. (This may be
-        meaningful if you've sorted the vocabulary by descending frequency.)
-
-        Example::
-
-          >>> trained_model.similar_by_vector([1,2])
-          [('survey', 0.9942699074745178), ...]
-
-        """
-
-        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
-
     def doesnt_match(self, words):
         """
         Which word from the given list doesn't go with the others?
```
```diff
@@ -574,36 +679,47 @@ def doesnt_match(self, words):
         dists = dot(vectors, mean)
         return sorted(zip(dists, used_words))[0][1]

-    def __getitem__(self, words):
+    @staticmethod
+    def cosine_similarities(vector_1, vectors_all):
         """
-        Accept a single word or a list of words as input.
+        Return cosine similarities between one vector and a set of other vectors.

-        If a single word: returns the word's representations in vector space, as
-        a 1D numpy array.
+        Parameters
+        ----------
+        vector_1 : numpy.array
+            vector from which similarities are to be computed.
+            expected shape (dim,)
+        vectors_all : numpy.array
+            for each row in vectors_all, distance from vector_1 is computed.
+            expected shape (num_vectors, dim)

-        Multiple words: return the words' representations in vector space, as a
-        2d numpy array: #words x #vector_size. Matrix rows are in the same order
-        as in input.
+        Returns
+        -------
+        numpy.array
+            Contains cosine distance between vector_1 and each row in vectors_all.
+            shape (num_vectors,)

-        Example::
+        """
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities

-        >>> trained_model['office']
-        array([ -1.40128313e-02, ...])
+    def distance(self, w1, w2):
+        """
+        Compute cosine distance between two words.

-        >>> trained_model[['office', 'products']]
-        array([ -1.40128313e-02, ...]
-              [ -1.70425311e-03, ...]
-               ...)
+        Example::

-        """
-        if isinstance(words, string_types):
-            # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
-            return self.word_vec(words)
+          >>> trained_model.distance('woman', 'man')
+          0.34

-        return vstack([self.word_vec(word) for word in words])
+          >>> trained_model.distance('woman', 'woman')
+          0.0

-    def __contains__(self, word):
-        return word in self.vocab
+        """
+        return 1 - self.similarity(w1, w2)

     def similarity(self, w1, w2):
         """
```
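The arithmetic in `cosine_similarities()` is easy to verify standalone with plain numpy; no trained model is required:

```python
import numpy as np

vector_1 = np.array([1.0, 0.0])
vectors_all = np.array([[1.0, 0.0],    # identical direction
                        [0.0, 1.0],    # orthogonal
                        [1.0, 1.0]])   # 45 degrees away

# same computation as the method above: dot products over the product of norms
norm = np.linalg.norm(vector_1)
all_norms = np.linalg.norm(vectors_all, axis=1)
similarities = vectors_all.dot(vector_1) / (norm * all_norms)
print(similarities)  # [1.0, 0.0, 0.70710678]
```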
```diff
@@ -620,30 +736,6 @@ def similarity(self, w1, w2):
         """
         return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))

-    def most_similar_to_given(self, w1, word_list):
-        """Return the word from word_list most similar to w1.
-
-        Args:
-            w1 (str): a word
-            word_list (list): list of words containing a word most similar to w1
-
-        Returns:
-            the word in word_list with the highest similarity to w1
-
-        Raises:
-            KeyError: If w1 or any word in word_list is not in the vocabulary
-
-        Example::
-
-          >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
-          'sound'
-
-          >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
-          'animal'
-
-        """
-        return word_list[argmax([self.similarity(w1, word) for word in word_list])]
-
     def n_similarity(self, ws1, ws2):
         """
         Compute cosine similarity between two sets of words.
```
```diff
@@ -873,3 +965,6 @@ def get_keras_embedding(self, train_embeddings=False):
             weights=[weights], trainable=train_embeddings
         )
         return layer
+
+# For backward compatibility
+KeyedVectors = EuclideanKeyedVectors
```
Review comment on the closing lines: PEP8: no newline at the EOF.

Review comment: PEP8: too many blank lines.
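With the module-level alias in place, downstream imports of the old name keep working; only the class's identity changes. A quick sanity check (assuming this branch of gensim is installed):

```python
from gensim.models.keyedvectors import EuclideanKeyedVectors, KeyedVectors

# the old name now aliases the Euclidean subclass
assert KeyedVectors is EuclideanKeyedVectors
print(KeyedVectors.__name__)  # -> 'EuclideanKeyedVectors'
```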