[MRG] Poincare model keyedvectors #1700
Changes from 163 commits
```diff
@@ -73,6 +73,7 @@
     double, array, vstack, fromstring, sqrt, newaxis,\
     ndarray, sum as np_sum, prod, ascontiguousarray,\
     argmax
+import numpy as np

 from gensim import utils, matutils  # utility fnc for pickling, common scipy operations etc
 from gensim.corpora.dictionary import Dictionary

@@ -103,28 +104,19 @@ def __str__(self):
         return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))


-class KeyedVectors(utils.SaveLoad):
+class KeyedVectorsBase(utils.SaveLoad):
     """
-    Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
-    involved in training such as most_similar()
+    Base class to contain vectors and vocab for any set of vectors which are each associated with a key.

     """

     def __init__(self):
         self.syn0 = []
-        self.syn0norm = None
         self.vocab = {}
         self.index2word = []
         self.vector_size = None

-    @property
-    def wv(self):
-        return self
-
-    def save(self, *args, **kwargs):
-        # don't bother storing the cached normalized vectors
-        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
-        super(KeyedVectors, self).save(*args, **kwargs)
-
     def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
         """
         Store the input-hidden weight matrix in the same format used by the original
```
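The base class keeps three parallel structures: `syn0` holds one vector per row, `index2word` maps a row index back to its key, and `vocab` maps a key to metadata that includes its row index. A minimal sketch of how they line up, using made-up 2-d vectors and assuming the `Vocab` helper (whose `__str__` appears at the top of this hunk) is importable from the same module:

```python
import numpy as np

from gensim.models.keyedvectors import Vocab  # assumption: Vocab is defined in this module

words = ['office', 'products']
syn0 = np.array([[0.1, -0.2],
                 [0.3, 0.4]])          # one vector per row
index2word = list(words)               # row index -> key
vocab = {word: Vocab(index=i, count=1) for i, word in enumerate(words)}  # key -> metadata

# word_vec() resolves a key via vocab[...].index into syn0:
assert np.array_equal(syn0[vocab['office'].index], np.array([0.1, -0.2]))
```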
```diff
@@ -263,6 +255,121 @@ def add_word(word, weights):
         logger.info("loaded %s matrix from %s", result.syn0.shape, fname)
         return result

+    def similarity(self, word_1, word_2):
+        """
+        Compute similarity between vectors of two input words.
+        To be implemented by child class.
+        """
+        raise NotImplementedError
+
+    def distance(self, word_1, word_2):
+        """
+        Compute distance between vectors of two input words.
+        To be implemented by child class.
+        """
+        raise NotImplementedError
+
+    def word_vec(self, word):
+        """
+        Accept a single word as input.
+        Returns the word's representations in vector space, as a 1D numpy array.
+
+        Example::
+
+          >>> trained_model['office']
+          array([ -1.40128313e-02, ...])
+
+        """
+        if word in self.vocab:
+            result = self.syn0[self.vocab[word].index]
+            result.setflags(write=False)
+            return result
+        else:
+            raise KeyError("word '%s' not in vocabulary" % word)
+
+    def __getitem__(self, words):
+        """
+        Accept a single word or a list of words as input.
+
+        If a single word: returns the word's representations in vector space, as
+        a 1D numpy array.
+
+        Multiple words: return the words' representations in vector space, as a
+        2d numpy array: #words x #vector_size. Matrix rows are in the same order
+        as in input.
+
+        Example::
+
+          >>> trained_model['office']
+          array([ -1.40128313e-02, ...])
+
+          >>> trained_model[['office', 'products']]
+          array([ -1.40128313e-02, ...]
+                [ -1.70425311e-03, ...]
+                 ...)
+
+        """
+        if isinstance(words, string_types):
+            # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
+            return self.word_vec(words)
+
+        return vstack([self.word_vec(word) for word in words])
+
+    def __contains__(self, word):
+        return word in self.vocab
+
+    def most_similar(self, word, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words to the given word, sorted in increasing order of distance.
+        To be implemented by child classes.
+
+        """
+        raise NotImplementedError
+
+    def most_similar_to_given(self, w1, word_list):
+        """Return the word from word_list most similar to w1.
+
+        Args:
+            w1 (str): a word
+            word_list (list): list of words containing a word most similar to w1
+
+        Returns:
+            the word in word_list with the highest similarity to w1
+
+        Raises:
+            KeyError: If w1 or any word in word_list is not in the vocabulary
+
+        Example::
+
+          >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
+          'sound'
+
+          >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
+          'animal'
+
+        """
+        return word_list[argmax([self.similarity(w1, word) for word in word_list])]
+
+
+class EuclideanKeyedVectors(KeyedVectorsBase):
+    """
+    Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly
+    involved in training such as most_similar()
+    """
+
+    def __init__(self):
+        super(EuclideanKeyedVectors, self).__init__()
+        self.syn0norm = None
+
+    @property
+    def wv(self):
+        return self
+
+    def save(self, *args, **kwargs):
+        # don't bother storing the cached normalized vectors
+        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm'])
+        super(EuclideanKeyedVectors, self).save(*args, **kwargs)
+
     def word_vec(self, word, use_norm=False):
         """
         Accept a single word as input.
```

Review comment on `def similarity(self, word_1, word_2)`: The parameters are called …

Review comment on the `>>> trained_model['office']` example in `word_vec()`: Weird example. Should be …
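Because `similarity()`, `distance()` and `most_similar()` are declared as `NotImplementedError` hooks, any metric, Euclidean or hyperbolic, can plug in by subclassing. A minimal sketch, not part of this PR; the class name and dot-product scoring below are invented for illustration:

```python
import numpy as np

from gensim.models.keyedvectors import KeyedVectorsBase


class DotProductKeyedVectors(KeyedVectorsBase):
    """Hypothetical subclass scoring word pairs by raw dot product."""

    def similarity(self, word_1, word_2):
        # word_vec() is inherited from KeyedVectorsBase
        return float(np.dot(self.word_vec(word_1), self.word_vec(word_2)))

    def distance(self, word_1, word_2):
        # higher similarity -> smaller distance
        return -self.similarity(word_1, word_2)

    def most_similar(self, word, topn=10, restrict_vocab=None):
        candidates = self.index2word[:restrict_vocab] if restrict_vocab else self.index2word
        scored = [(w, self.similarity(word, w)) for w in candidates if w != word]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:topn]
```

Note that the inherited `__getitem__`, `__contains__` and `most_similar_to_given` then work unchanged, since they rely only on `word_vec()` and `similarity()`.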
```diff
@@ -356,6 +463,44 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None):
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]

+    def similar_by_word(self, word, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words.
+
+        If topn is False, similar_by_word returns the vector of similarity scores.
+
+        `restrict_vocab` is an optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_word('graph')
+          [('user', 0.9999163150787354), ...]
+
+        """
+        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
+
+    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
+        """
+        Find the top-N most similar words by vector.
+
+        If topn is False, similar_by_vector returns the vector of similarity scores.
+
+        `restrict_vocab` is an optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_vocab=10000 would
+        only check the first 10000 word vectors in the vocabulary order. (This may be
+        meaningful if you've sorted the vocabulary by descending frequency.)
+
+        Example::
+
+          >>> trained_model.similar_by_vector([1,2])
+          [('survey', 0.9942699074745178), ...]
+
+        """
+        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
+
     def wmdistance(self, document1, document2):
         """
         Compute the Word Mover's Distance between two documents. When using this
```
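Since `similar_by_vector()` forwards any raw vector into `most_similar(positive=[vector], ...)`, it can answer queries that no single key expresses. A hypothetical usage sketch (assumes a trained model `kv` of this class containing the words used):

```python
# nearest words to the midpoint of two word vectors
midpoint = (kv['woman'] + kv['man']) / 2
print(kv.similar_by_vector(midpoint, topn=5))

# single-word form; just sugar for most_similar(positive=['woman'])
print(kv.similar_by_word('woman', topn=5))
```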
```diff
@@ -511,46 +656,6 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
         result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
         return result[:topn]

-    def similar_by_word(self, word, topn=10, restrict_vocab=None):
-        """
-        Find the top-N most similar words.
-
-        If topn is False, similar_by_word returns the vector of similarity scores.
-
-        `restrict_vocab` is an optional integer which limits the range of vectors which
-        are searched for most-similar values. For example, restrict_vocab=10000 would
-        only check the first 10000 word vectors in the vocabulary order. (This may be
-        meaningful if you've sorted the vocabulary by descending frequency.)
-
-        Example::
-
-          >>> trained_model.similar_by_word('graph')
-          [('user', 0.9999163150787354), ...]
-
-        """
-
-        return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
-
-    def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
-        """
-        Find the top-N most similar words by vector.
-
-        If topn is False, similar_by_vector returns the vector of similarity scores.
-
-        `restrict_vocab` is an optional integer which limits the range of vectors which
-        are searched for most-similar values. For example, restrict_vocab=10000 would
-        only check the first 10000 word vectors in the vocabulary order. (This may be
-        meaningful if you've sorted the vocabulary by descending frequency.)
-
-        Example::
-
-          >>> trained_model.similar_by_vector([1,2])
-          [('survey', 0.9942699074745178), ...]
-
-        """
-
-        return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
-
     def doesnt_match(self, words):
         """
         Which word from the given list doesn't go with the others?
```
```diff
@@ -574,36 +679,47 @@ def doesnt_match(self, words):
         dists = dot(vectors, mean)
         return sorted(zip(dists, used_words))[0][1]

-    def __getitem__(self, words):
+    @staticmethod
+    def cosine_similarities(vector_1, vectors_all):
         """
-        Accept a single word or a list of words as input.
+        Return cosine similarities between one vector and a set of other vectors.

-        If a single word: returns the word's representations in vector space, as
-        a 1D numpy array.
+        Parameters
+        ----------
+        vector_1 : numpy.array
+            vector from which similarities are to be computed.
+            expected shape (dim,)
+        vectors_all : numpy.array
+            for each row in vectors_all, distance from vector_1 is computed.
+            expected shape (num_vectors, dim)

-        Multiple words: return the words' representations in vector space, as a
-        2d numpy array: #words x #vector_size. Matrix rows are in the same order
-        as in input.
+        Returns
+        -------
+        numpy.array
+            Contains cosine distance between vector_1 and each row in vectors_all.
+            shape (num_vectors,)

-        Example::
+        """
+        norm = np.linalg.norm(vector_1)
+        all_norms = np.linalg.norm(vectors_all, axis=1)
+        dot_products = dot(vectors_all, vector_1)
+        similarities = dot_products / (norm * all_norms)
+        return similarities

-        >>> trained_model['office']
-        array([ -1.40128313e-02, ...])
+    def distance(self, w1, w2):
+        """
+        Compute cosine distance between two words.

-        >>> trained_model[['office', 'products']]
-        array([ -1.40128313e-02, ...]
-              [ -1.70425311e-03, ...]
-               ...)
+        Example::

-        """
-        if isinstance(words, string_types):
-            # allow calls like trained_model['office'], as a shorthand for trained_model[['office']]
-            return self.word_vec(words)
+          >>> trained_model.distance('woman', 'man')
+          0.34

-        return vstack([self.word_vec(word) for word in words])
+          >>> trained_model.distance('woman', 'woman')
+          0.0

-    def __contains__(self, word):
-        return word in self.vocab
+        """
+        return 1 - self.similarity(w1, w2)

     def similarity(self, w1, w2):
         """
```
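The arithmetic in `cosine_similarities()` is easy to verify standalone with plain numpy; no trained model is required:

```python
import numpy as np

vector_1 = np.array([1.0, 0.0])
vectors_all = np.array([[1.0, 0.0],    # identical direction
                        [0.0, 1.0],    # orthogonal
                        [1.0, 1.0]])   # 45 degrees away

# same computation as the method above: dot products over the product of norms
norm = np.linalg.norm(vector_1)
all_norms = np.linalg.norm(vectors_all, axis=1)
similarities = vectors_all.dot(vector_1) / (norm * all_norms)
print(similarities)  # [1.0, 0.0, 0.70710678]
```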
```diff
@@ -620,30 +736,6 @@ def similarity(self, w1, w2):
         """
         return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2]))

-    def most_similar_to_given(self, w1, word_list):
-        """Return the word from word_list most similar to w1.
-
-        Args:
-            w1 (str): a word
-            word_list (list): list of words containing a word most similar to w1
-
-        Returns:
-            the word in word_list with the highest similarity to w1
-
-        Raises:
-            KeyError: If w1 or any word in word_list is not in the vocabulary
-
-        Example::
-
-          >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse'])
-          'sound'
-
-          >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone'])
-          'animal'
-
-        """
-        return word_list[argmax([self.similarity(w1, word) for word in word_list])]
-
     def n_similarity(self, ws1, ws2):
         """
         Compute cosine similarity between two sets of words.
```
```diff
@@ -873,3 +965,6 @@ def get_keras_embedding(self, train_embeddings=False):
             weights=[weights], trainable=train_embeddings
         )
         return layer
+
+# For backward compatibility
+KeyedVectors = EuclideanKeyedVectors
```
Review comment on the closing lines: PEP8: no newline at the EOF.

Review comment: PEP8: too many blank lines.
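With the module-level alias in place, downstream imports of the old name keep working; only the class's identity changes. A quick sanity check (assuming this branch of gensim is installed):

```python
from gensim.models.keyedvectors import EuclideanKeyedVectors, KeyedVectors

# the old name now aliases the Euclidean subclass
assert KeyedVectors is EuclideanKeyedVectors
print(KeyedVectors.__name__)  # -> 'EuclideanKeyedVectors'
```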