Add demo for ntm_addressing_mechanism #56

Open · wants to merge 2 commits into dev-static
56 changes: 56 additions & 0 deletions ntm_addressing_mechanism/generate.py
@@ -0,0 +1,56 @@
import paddle.v2 as paddle
from ntm_conf import gru_encoder_decoder
import gzip
import wmt14


def main():
    paddle.init(use_gpu=False, trainer_count=1)
    dict_size = 30000

    is_hybrid_addressing = True
    gen_creator = wmt14.gen(dict_size, src_seq_zero=is_hybrid_addressing)
    gen_data = []
    gen_num = 3

    for item in gen_creator():
        gen_data.append((item[0], item[1]))
        if len(gen_data) == gen_num:
            break

    beam_gen = gru_encoder_decoder(
        src_dict_dim=dict_size,
        trg_dict_dim=dict_size,
        is_generating=True,
        is_hybrid_addressing=is_hybrid_addressing)

    with gzip.open('./models/model_pass_00000.tar.gz') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=gen_data,
        field=['prob', 'id'])

    src_dict, trg_dict = wmt14.get_dict(dict_size)
    seq_list = []
    seq = []
    for w in beam_result[1]:
        if w != -1:
            seq.append(w)
        else:
            # each candidate ends with -1; drop the leading <s> before joining
            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
            seq = []

    prob = beam_result[0]
    beam_size = 3
    for i in xrange(gen_num):
        print "\n*******************************************************\n"
        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
        for j in xrange(beam_size):
            # the j-th beam candidate of the i-th source sentence
            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]


if __name__ == '__main__':
    main()
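Note on the decoding loop above: with field=['prob', 'id'], paddle.infer returns the per-candidate probabilities plus one flat id stream in which every beam candidate is terminated by -1, and the candidates for the i-th source sentence occupy beam_size consecutive slots, hence the i * beam_size + j indexing. A Paddle-free sketch of that bookkeeping (the beam_result value below is fabricated purely for illustration):

# Standalone sketch of how generate.py splits the flat beam-search output.
# `beam_result` is made up: field 0 holds per-candidate probabilities,
# field 1 holds token ids with -1 terminating each candidate.
beam_result = (
    [[0.6, 0.3, 0.1]],  # probs: 1 source sentence x beam_size 3
    [0, 11, 12, -1, 0, 11, 13, -1, 0, 14, -1],
)
trg_dict = {0: '<s>', 11: 'hello', 12: 'world', 13: 'there', 14: 'hi'}

seq_list, seq = [], []
for w in beam_result[1]:
    if w != -1:
        seq.append(w)
    else:
        # drop the leading <s> and map ids back to words
        seq_list.append(' '.join([trg_dict[w] for w in seq[1:]]))
        seq = []

beam_size = 3
for i, probs in enumerate(beam_result[0]):
    for j in range(beam_size):
        print('prob = %f: %s' % (probs[j], seq_list[i * beam_size + j]))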
180 changes: 180 additions & 0 deletions ntm_addressing_mechanism/ntm_conf.py
@@ -0,0 +1,180 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import paddle.v2 as paddle
import sys
import math


def gru_encoder_decoder(src_dict_dim,
                        trg_dict_dim,
                        is_generating=False,
                        is_hybrid_addressing=True,
                        word_vec_dim=512,
                        latent_chain_dim=512,
                        beam_max_len=230,
                        beam_size=3):
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(src_dict_dim))

    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vec_dim,
        param_attr=paddle.attr.ParamAttr(
            name='_source_language_embedding',
            initial_std=1. / math.sqrt(word_vec_dim)))
    # use a bi-directional GRU as the encoder
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=latent_chain_dim)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=latent_chain_dim, reverse=True)
    encoder_vector = paddle.layer.concat(input=[src_forward, src_backward])
    with paddle.layer.mixed(
            size=latent_chain_dim, bias_attr=False,
            act=paddle.activation.Linear()) as encoder_projected:
        encoder_projected += paddle.layer.full_matrix_projection(
            input=encoder_vector)

    if is_hybrid_addressing:
        attention_memory_init = paddle.layer.data(
            name='init_attention_weights',
            type=paddle.data_type.dense_vector(1))
        # expand the dense vector to a sequence aligned with the source words
        expand_attention_memory_init = paddle.layer.expand(
            input=attention_memory_init,
            expand_as=src_word_id,
            bias_attr=False)

    # build the decoder with/without the addressing mechanism
    def gru_decoder_with_attention(encoder_projected, current_word):
        decoder_state_memory = paddle.layer.memory(
            name='gru_decoder', size=latent_chain_dim, is_seq=False)

        # compute content-based attention weights over the encoder states
        with paddle.layer.mixed(
                size=latent_chain_dim,
                act=paddle.activation.Linear(),
                bias_attr=False) as decoder_state_projected:
            decoder_state_projected += paddle.layer.full_matrix_projection(
                input=decoder_state_memory)
        expand_decoder_state_projected = paddle.layer.expand(
            input=decoder_state_projected,
            expand_as=encoder_projected,
            bias_attr=False)
        with paddle.layer.mixed(
                size=latent_chain_dim,
                act=paddle.activation.Tanh(),
                bias_attr=False) as attention_vecs:
            attention_vecs += paddle.layer.identity_projection(
                input=expand_decoder_state_projected)
            attention_vecs += paddle.layer.identity_projection(
                input=encoder_projected)
        with paddle.layer.mixed(
                name='attention_weights',
                size=1,
                act=paddle.activation.SequenceSoftmax(),
                bias_attr=False) as attention_weights:
            attention_weights += paddle.layer.full_matrix_projection(
                input=attention_vecs)

        if not is_hybrid_addressing:
            context_vectors = paddle.layer.scaling(
                input=encoder_projected, weight=attention_weights)
        else:
            # remember the attention weights of the previous step
            attention_weight_memory = paddle.layer.memory(
                name='attention_weights',
                size=1,
                is_seq=True,
                boot_layer=expand_attention_memory_init)

            # interpolation gate
            with paddle.layer.mixed(
                    size=1,
                    act=paddle.activation.Sigmoid(),
                    bias_attr=False) as addressing_gate:
                addressing_gate += paddle.layer.full_matrix_projection(
                    input=current_word)
            expand_addressing_gate = paddle.layer.expand(
                input=addressing_gate,
                expand_as=encoder_projected,
                bias_attr=False)
            weight_interpolation = paddle.layer.interpolation(
                input=[attention_weights, attention_weight_memory],
                weight=expand_addressing_gate)

            # convolutional shift
            with paddle.layer.mixed(
                    size=3,
                    act=paddle.activation.Softmax(),
                    bias_attr=paddle.attr.Param(
                        initial_std=0)) as shifting_weights:
                shifting_weights += paddle.layer.full_matrix_projection(
                    input=current_word)
            convolutional_shift = paddle.layer.conv_shift(
                a=weight_interpolation, b=shifting_weights)
            context_vectors = paddle.layer.scaling(
                input=encoder_projected, weight=convolutional_shift)

        # sum over the source positions to get the context vector
        context = paddle.layer.pooling(
            input=context_vectors, pooling_type=paddle.pooling.Sum())

        with paddle.layer.mixed(
                size=latent_chain_dim * 3,
                layer_attr=paddle.attr.ExtraAttr(
                    error_clipping_threshold=100.0)) as decoder_step_input:
            decoder_step_input += paddle.layer.full_matrix_projection(
                input=context)
            decoder_step_input += paddle.layer.full_matrix_projection(
                input=current_word)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_step_input,
            output_mem=decoder_state_memory,
            size=latent_chain_dim)

        with paddle.layer.mixed(
                size=trg_dict_dim,
                act=paddle.activation.Softmax(),
                bias_attr=paddle.attr.Param(initial_std=0)) as out:
            out += paddle.layer.full_matrix_projection(input=gru_step)

        return out

    decoder_group_name = 'decoder_group'
    group_inputs = [
        paddle.layer.StaticInputV2(input=encoder_projected, is_seq=True)
    ]

    if not is_generating:
        trg_embedding = paddle.layer.embedding(
            input=paddle.layer.data(
                name='target_language_word',
                type=paddle.data_type.integer_value_sequence(trg_dict_dim)),
            size=word_vec_dim,
            param_attr=paddle.attr.ParamAttr(
                name='_target_language_embedding'))
        group_inputs.append(trg_embedding)
        decoder = paddle.layer.recurrent_group(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)
        lbl = paddle.layer.data(
            name='target_language_next_word',
            type=paddle.data_type.integer_value_sequence(trg_dict_dim))
        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
        return cost
    else:
        trg_embedding = paddle.layer.GeneratedInputV2(
            size=trg_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vec_dim)
        group_inputs.append(trg_embedding)
        beam_gen = paddle.layer.beam_search(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=beam_max_len)
        return beam_gen
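The hybrid-addressing branch above follows the two addressing steps of the Neural Turing Machine: a sigmoid gate interpolates between the current content-based weights and the weights remembered from the previous step, and a learned softmax-normalized 3-tap kernel then shifts the interpolated weights by convolution. A NumPy sketch of that arithmetic with made-up values; the Paddle layers remain the source of truth, and conv_shift's boundary handling may differ from the circular convolution used here:

import numpy as np

w_content = np.array([0.1, 0.7, 0.1, 0.1])   # this step's attention_weights
w_prev = np.array([0.25, 0.25, 0.25, 0.25])  # attention_weight_memory
g = 0.8                                      # addressing_gate output in (0, 1)

# interpolation (cf. paddle.layer.interpolation)
w_interp = g * w_content + (1. - g) * w_prev

# convolutional shift (cf. paddle.layer.conv_shift): a 3-tap kernel
# covering shifts of -1, 0, +1, applied circularly as in the NTM paper
s = np.array([0.1, 0.8, 0.1])
n = len(w_interp)
w_shifted = np.array([
    sum(w_interp[(i - k) % n] * s[k + 1] for k in (-1, 0, 1))
    for i in range(n)
])
print(w_shifted.sum())  # the weights still sum to 1.0 (up to float error)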
84 changes: 84 additions & 0 deletions ntm_addressing_mechanism/train.py
@@ -0,0 +1,84 @@
import paddle.v2 as paddle
from ntm_conf import gru_encoder_decoder
import wmt14
import sys
import gzip


def main():
    paddle.init(use_gpu=False, trainer_count=1, log_error_clipping=True)
    dict_size = 30000

    is_hybrid_addressing = True
    cost = gru_encoder_decoder(
        src_dict_dim=dict_size,
        trg_dict_dim=dict_size,
        is_generating=False,
        is_hybrid_addressing=is_hybrid_addressing)

    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-4,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=2500),
        learning_rate_decay_a=0.0,
        learning_rate_decay_b=0.0,
        gradient_clipping_threshold=25)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # define the data reader
    wmt14_reader = paddle.batch(
        paddle.reader.shuffle(
            wmt14.train(dict_size, src_seq_zero=is_hybrid_addressing),
            buf_size=8192),
        batch_size=5)

    def event_handler(event):
        if isinstance(event, paddle.event.EndPass):
            model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
            print('Save model to %s !' % model_name)
            with gzip.open(model_name, 'w') as f:
                parameters.to_tar(f)

        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print("\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

            if event.batch_id % 100 == 0:
                model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
                print('Save model to %s !' % model_name)
                with gzip.open(model_name, 'w') as f:
                    parameters.to_tar(f)

    if is_hybrid_addressing:
        feeding = {
            'source_language_word': 0,
            'init_attention_weights': 1,
            'target_language_word': 2,
            'target_language_next_word': 3
        }
    else:
        feeding = {
            'source_language_word': 0,
            'target_language_word': 1,
            'target_language_next_word': 2
        }

    # start training
    trainer.train(
        reader=wmt14_reader,
        event_handler=event_handler,
        num_passes=2,
        feeding=feeding)


if __name__ == '__main__':
    main()
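The feeding dict above maps each data layer name to a position in the tuples yielded by the reader, so with hybrid addressing enabled, wmt14.train(..., src_seq_zero=True) is expected to yield four fields per sample, the second being the scalar that boots the attention-weight memory. A toy reader sketching that assumed layout (values and helper are illustrative only; the real format is defined by the wmt14 reader):

def toy_reader():
    # field order must match the feeding dict when is_hybrid_addressing is True
    src_ids = [2, 45, 7, 1]    # source_language_word      -> index 0
    init_attn = [0.0]          # init_attention_weights    -> index 1, dense_vector(1)
    trg_ids = [0, 88, 9]       # target_language_word      -> index 2
    trg_next_ids = [88, 9, 1]  # target_language_next_word -> index 3
    yield (src_ids, init_attn, trg_ids, trg_next_ids)

for sample in toy_reader():
    assert len(sample) == 4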
13 changes: 6 additions & 7 deletions word_embedding/hsigmoid_predict.py
@@ -36,9 +36,8 @@ def decode_res(infer_res, dict_size):
     return predict_lbls


-def predict(batch_ins, idx_word_dict, dict_size, prediction_layer, parameters):
-    infer_res = paddle.infer(
-        output_layer=prediction_layer, parameters=parameters, input=batch_ins)
+def predict(batch_ins, idx_word_dict, dict_size, inferer):
+    infer_res = inferer.infer(input=batch_ins)

     predict_lbls = decode_res(infer_res, dict_size)
     predict_words = [idx_word_dict[lbl] for lbl in predict_lbls]  # map to word
@@ -62,6 +61,8 @@ def main():
     with gzip.open('./models/model_pass_00000.tar.gz') as f:
         parameters = paddle.parameters.Parameters.from_tar(f)

+    inferer = paddle.inference.Inference(
+        output_layer=prediction_layer, parameters=parameters)
     idx_word_dict = dict((v, k) for k, v in word_dict.items())
     batch_size = 64
     batch_ins = []
@@ -70,13 +71,11 @@ def main():
     for ins in ins_iter():
         batch_ins.append(ins[:-1])
         if len(batch_ins) == batch_size:
-            predict(batch_ins, idx_word_dict, dict_size, prediction_layer,
-                    parameters)
+            predict(batch_ins, idx_word_dict, dict_size, inferer)
             batch_ins = []

     if len(batch_ins) > 0:
-        predict(batch_ins, idx_word_dict, dict_size, prediction_layer,
-                parameters)
+        predict(batch_ins, idx_word_dict, dict_size, inferer)


 if __name__ == '__main__':
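The hsigmoid_predict.py change builds one paddle.inference.Inference object up front and hands it to predict(), instead of having predict() call paddle.infer, and so rebuild the inference machinery, once per batch. A Paddle-free sketch of the pattern; the class and names here are illustrative, not Paddle APIs:

class Inferer(object):
    def __init__(self, model_blob):
        # stand-in for the expensive one-time setup that
        # paddle.inference.Inference performs
        self.model = 'loaded:' + model_blob

    def infer(self, batch):
        return [(self.model, ins) for ins in batch]


def predict(batch, inferer):
    # mirrors the new predict() signature: reuse the prebuilt inferer
    return inferer.infer(batch)


inferer = Inferer('model_pass_00000')  # built once
for batch in ([1, 2], [3]):            # reused for every batch
    print(predict(batch, inferer))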