Add Sent2Vec model. Fix #1376 #1619
@@ -75,6 +75,25 @@ class ModelDictionary():

```python
        """

    def __init__(self, t, bucket, minn, maxn, max_vocab_size=30000000, max_line_size=1024):
        """
        Initialize a sent2vec dictionary.

        `t` = threshold for configuring which higher-frequency words are randomly downsampled;
        default is 1e-3, useful range is (0, 1e-5).

        `bucket` = number of hash buckets for vocabulary. Default is 2000000.

        `minn` = min length of char ngrams. Default is 3.

        `maxn` = max length of char ngrams. Default is 6.

        `max_vocab_size` = limit RAM during vocabulary building; if there are more unique
        words than this, then prune the infrequent ones. Every 10 million word types
        need about 1GB of RAM. Set to `None` for no limit; the default is 30000000.

        `max_line_size` = maximum number of characters in a sentence.
        """
        self.max_vocab_size = max_vocab_size
        self.max_line_size = max_line_size
        self.words = []
```

Review comment:
> Numpy-style please: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt and http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html (here and anywhere)

Author reply:
> Sorry, my bad! I used the word2vec code as a reference. I've updated the docstrings. Kindly verify in the latest commit.
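For reference, a minimal sketch of these same parameters in the numpy style the reviewer asks for (illustrative wording, not necessarily the PR's final docstring):

```python
def __init__(self, t, bucket, minn, maxn, max_vocab_size=30000000, max_line_size=1024):
    """Initialize a sent2vec dictionary.

    Parameters
    ----------
    t : float
        Threshold for downsampling higher-frequency words; useful range (0, 1e-5).
    bucket : int
        Number of hash buckets for the vocabulary.
    minn : int
        Minimum length of char ngrams.
    maxn : int
        Maximum length of char ngrams.
    max_vocab_size : int or None, optional
        Cap on unique words while building the vocabulary; infrequent words are
        pruned beyond it. Every 10 million word types need about 1GB of RAM.
    max_line_size : int, optional
        Maximum number of characters in a sentence.
    """
```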
@@ -90,6 +109,8 @@ def __init__(self, t, bucket, minn, maxn, max_vocab_size=30000000, max_line_size

```python
    def hash_(self, word):
        """
        Compute hash of given word.

        `word` is the actual vocabulary word.
        """
        h = 2166136261
```
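Only the first line of the hash body survives the hunk; the constant 2166136261 is the 32-bit FNV-1a offset basis, so the method is presumably the standard FNV-1a loop. A self-contained sketch of that scheme (the loop body is an assumption from the fastText reference, not the PR's exact code):

```python
def fnv1a_32(word):
    """32-bit FNV-1a hash of a unicode string."""
    h = 2166136261                       # FNV-1a offset basis, as in the diff
    for byte in word.encode('utf-8'):
        h ^= byte                        # fold in one byte
        h = (h * 16777619) & 0xffffffff  # multiply by the FNV prime, keep 32 bits
    return h

print(fnv1a_32(u'sentence'))  # stable across runs, unlike Python's built-in hash()
```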
@@ -101,6 +122,8 @@ def hash_(self, word):

```python
    def find(self, word):
        """
        Find hash of given word. The word may or may not be present in the vocabulary.

        `word` is the actual vocabulary word.
        """
        h = self.hash_(word) % self.max_vocab_size
```
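The hunk stops right after the modulo. In fastText-style dictionaries, `find` then probes linearly until it reaches an empty slot or the slot already holding the word. A standalone sketch under that assumption, with `word2int` mapping slots to word ids (-1 for empty); both container names are assumptions, not from the PR:

```python
def find_slot(word2int, vocab_words, word, hash_fn, max_vocab_size):
    """Return the table slot for `word`, probing past hash collisions."""
    h = hash_fn(word) % max_vocab_size
    while word2int[h] != -1 and vocab_words[word2int[h]] != word:
        h = (h + 1) % max_vocab_size  # linear probing: try the next slot
    return h

slots = [-1] * 8
print(find_slot(slots, [], 'cat', hash, 8))  # empty table: the first probe wins
```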
@@ -111,6 +134,8 @@ def find(self, word):

```python
    def add(self, word):
        """
        Add given word to vocabulary.

        `word` is the actual vocabulary word.
        """
        h = self.find(word)
```
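`add` likely either appends a fresh entry at the slot `find` returned or bumps the count of an existing one. A dict-free sketch of that pattern (all container names are illustrative, not from the PR):

```python
def add_word(word2int, vocab_words, counts, word, slot):
    """Record one occurrence of `word` at hash-table position `slot`."""
    if word2int[slot] == -1:
        word2int[slot] = len(vocab_words)  # assign the next vocabulary id
        vocab_words.append(word)
        counts.append(1)
    else:
        counts[word2int[slot]] += 1        # already known: bump its count
```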
@@ -128,7 +153,7 @@ def read(self, sentences, min_count):

```python
        Process all words present in sentences (where each sentence is a list of unicode strings).
        Initialize discard table to downsample higher frequency words according to given sampling threshold.
        Also initialize character ngrams for all words and threshold lower frequency words if their count
        is less than a given value (min_count).
        """
        min_threshold = 1
```
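The "discard table" mentioned in the docstring is word2vec-style subsampling: each word gets a keep-probability that shrinks as its frequency grows past the threshold `t`. The exact formula is not visible in this hunk; the sketch below uses the fastText reference formula sqrt(t/f) + t/f:

```python
def keep_probabilities(counts, t=1e-3):
    """Per-word probability of keeping a token during downsampling."""
    ntokens = float(sum(counts))
    probs = []
    for count in counts:
        f = count / ntokens                # relative frequency of the word
        probs.append((t / f) ** 0.5 + t / f)
    return probs

print(keep_probabilities([990, 9, 1]))  # values above 1 mean never discarded
```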
@@ -330,6 +355,10 @@ def __init__(self, vector_size=100, lr=0.2, lr_update_rate=100, epochs=5,

```python
        self.dropoutk = dropoutk

    def negative_sampling(self, target, lr):
        """
        Get loss using negative sampling.
        """
        loss = 0.0
        self.grad = np.zeros(self.vector_size)
        for i in range(self.neg + 1):
```
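The loop body is cut off by the hunk. In the standard scheme, iteration 0 treats `target` as the positive example and the remaining `neg` iterations score sampled negatives; a sketch of the likely continuation (an assumption based on the fastText reference, not the PR's exact lines):

```python
def negative_sampling(self, target, lr):
    """Sum binary-logistic losses over one positive and `neg` sampled negatives."""
    loss = 0.0
    self.grad = np.zeros(self.vector_size)
    for i in range(self.neg + 1):
        if i == 0:
            loss += self.binary_logistic(target, True, lr)  # the true target
        else:
            # a word id drawn from the pre-built negatives table
            loss += self.binary_logistic(self.get_negative(target), False, lr)
    return loss
```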
@@ -340,9 +369,17 @@ def negative_sampling(self, target, lr):

```python
        return loss

    def sigmoid(self, val):
        """
        Compute sigmoid of a particular value.
        """
        return 1.0 / (1.0 + np.exp(-val))

    def binary_logistic(self, target, label, lr):
        """
        Compute loss for given target, label and learning rate using binary logistic regression.
        """
        score = self.sigmoid(np.dot(self.wo[target], self.hidden))
        alpha = lr * (float(label) - score)
        self.grad += self.wo[target] * alpha
```
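As a sanity check on what these two methods compute: a positive example costs -log(sigmoid(score)) and a negative one -log(1 - sigmoid(score)), so an uninformative score of 0 costs ln 2 ≈ 0.693 either way:

```python
import numpy as np

def sigmoid(val):
    return 1.0 / (1.0 + np.exp(-val))

score = sigmoid(0.0)          # 0.5: the model is indifferent
print(-np.log(score))         # positive-example loss: ~0.693
print(-np.log(1.0 - score))   # negative-example loss: ~0.693
```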
@@ -353,6 +390,12 @@ def binary_logistic(self, target, label, lr):

```python
        return -np.log(1.0 - score)

    def init_table_negatives(self, counts):
        """
        Initialise table of negatives for negative sampling.

        `counts` is a list of counts of all words in the vocabulary.
        """
        z = 0.0
        for i in range(len(counts)):
            z += counts[i] ** 0.5
```
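The normalizer `z` sums count ** 0.5 over the vocabulary, so the table being built repeats each word id in proportion to the square root of its count. The filling step below is an assumption (the diff shows only the normalizer and the shuffle), and the table size is illustrative:

```python
import random

def build_negatives_table(counts, table_size=1000):
    """Word ids repeated in proportion to count ** 0.5, then shuffled."""
    z = sum(count ** 0.5 for count in counts)
    negatives = []
    for i, count in enumerate(counts):
        negatives.extend([i] * int(count ** 0.5 / z * table_size))
    random.shuffle(negatives)
    return negatives

table = build_negatives_table([100, 10, 1])
print(table.count(0), table.count(1), table.count(2))  # ~706, ~223, ~70
```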
@@ -363,6 +406,10 @@ def init_table_negatives(self, counts):

```python
        random.shuffle(self.negatives)

    def get_negative(self, target):
        """
        Get a negative from the list of negatives for calculating negative sampling loss.
        """
        while True:
            negative = self.negatives[self.negpos]
            self.negpos = (self.negpos + 1) % len(self.negatives)
```

Review comment:
> please hide all non-public methods with `_`
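The exit condition of the `while True` loop is not visible in the hunk; presumably it keeps advancing the cursor until it draws something other than the target. A sketch under that assumption:

```python
def get_negative(self, target):
    """Return the next sampled word id that differs from the target."""
    while True:
        negative = self.negatives[self.negpos]
        self.negpos = (self.negpos + 1) % len(self.negatives)  # circular cursor
        if negative != target:
            return negative
```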
@@ -371,6 +418,10 @@ def get_negative(self, target):

```python
            return negative

    def update(self, input_, target, lr):
        """
        Update model's neural weights for given context, target word and learning rate.
        """
        assert(target >= 0)
        assert(target < self.dict.size)
        if len(input_) == 0:
```
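The hunk ends at the empty-input guard. In CBOW-style training, `update` would next average the input vectors into the hidden layer, run negative sampling, and spread the accumulated gradient back over the inputs; the sketch below assumes that flow and an input embedding matrix `self.wi` (only `self.wo` is confirmed by the diff):

```python
def update(self, input_, target, lr):
    """One training step over a single (context, target) pair."""
    assert(target >= 0)
    assert(target < self.dict.size)
    if len(input_) == 0:
        return
    # hidden layer = average of the input word/ngram vectors
    self.hidden = np.mean(self.wi[input_], axis=0)
    loss = self.negative_sampling(target, lr)
    # spread the accumulated output-side gradient evenly over the inputs
    self.grad *= 1.0 / len(input_)
    for i in input_:
        self.wi[i] += self.grad
    return loss
```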
Review comment:
> Please add docstrings everywhere (with parameter description + types)

Author reply:
> Done. Kindly verify in the current commit.