Unable to use with real data #12

Open · Bashima opened this issue May 31, 2019 · 1 comment

Comments

Bashima commented May 31, 2019

I am trying to use this attention code (monotonic) for Spanish-to-English translation (http://download.tensorflow.org/data/spa-eng.zip). I am getting this error:

Input 0 is incompatible with layer AttentionDecoder: expected ndim=3, found ndim=2

I would really appreciate it if you could help me out. I am quite new to RNNs and Keras.

The following is the code I am using.

```python
# Imports added for completeness; the AttentionDecoder import path below is an
# assumption (it is the layer provided by this repository).
import os
import re
import unicodedata

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

from attention_decoder import AttentionDecoder

path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', 
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

def unicode_to_ascii(s):
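    """Strip accents by NFD-normalizing and dropping combining marks."""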
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
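    """Lowercase, strip accents, space out punctuation, drop other symbols, and wrap with <start>/<end> tokens."""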
    w = unicode_to_ascii(w.lower().strip())
    
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    w = '<start> ' + w + ' <end>'
    return w

def create_dataset(path, num_examples):
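    """Read the tab-separated English/Spanish lines and preprocess both sentences in each pair."""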
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    
    return word_pairs

class LanguageIndex():
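  """Builds word->index and index->word lookups for a language; index 0 is reserved for <pad>."""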
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()
    
    self.create_index()
    
  def create_index(self):
    for phrase in self.lang:
      self.vocab.update(phrase.split(' '))
    
    self.vocab = sorted(self.vocab)
    
    self.word2idx['<pad>'] = 0
    for index, word in enumerate(self.vocab):
      self.word2idx[word] = index + 1
    
    for word, index in self.word2idx.items():
      self.idx2word[index] = word
    
def max_length(tensor):
    return max(len(t) for t in tensor)


def load_dataset(path, num_examples):
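    """Convert both languages to integer tensors and pad them to their maximum lengths."""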
    pairs = create_dataset(path, num_examples)
    inp_lang = LanguageIndex(sp for en, sp in pairs)
    targ_lang = LanguageIndex(en for en, sp in pairs)
    input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')] for en, sp in pairs]
    target_tensor = [[targ_lang.word2idx[s] for s in en.split(' ')] for en, sp in pairs]

    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                                                                 maxlen=max_length_inp,
                                                                 padding='post')
    
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                  maxlen=max_length_tar, 
                                                                  padding='post')
    
    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

num_examples = 30000
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
embedding_dim = 256
units = 1024 # unit is the dimension of the output
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

print(vocab_inp_size, vocab_tar_size)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))


def gru(units):
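    """Return a CuDNN-backed GRU when a GPU is available, otherwise a standard GRU; both return sequences and the final state."""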
    if tf.test.is_gpu_available():
        print("GPU")
        return tf.keras.layers.CuDNNGRU(units, 
                                        return_sequences=True, 
                                        return_state=True,
                                        recurrent_initializer='glorot_uniform')
    else:
        print("NO GPU")
        return tf.keras.layers.GRU(units, 
                               return_sequences=True, 
                               return_state=True, 
                               recurrent_activation='sigmoid', 
                               recurrent_initializer='glorot_uniform')

class Encoder(tf.keras.Model):
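    """Embedding + GRU encoder; call() returns the full output sequence and the final hidden state."""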
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

EPOCHS = 10
# Build the decoder model around the AttentionDecoder layer and compile it.
inputs = Input(shape=(None,), dtype='int64')
outp_true = Input(shape=(None,), dtype='int64')
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
attention_decoder = AttentionDecoder(units=units, alphabet_size=vocab_tar_size,
                                     embedding_dim=embedding_dim,
                                     is_monotonic=True,
                                     normalize_energy=True)

output = attention_decoder([inputs, outp_true])

model = Model(inputs=[inputs, outp_true], outputs=[output])
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adadelta',
    metrics=['accuracy'])
model.summary()

dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)  
hidden = encoder.initialize_hidden_state()
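# Training loop: encode each batch, then fit the decoder model on the encoder's final hidden state.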
for epoch in range(EPOCHS):
    for (batch, (inp, targ)) in enumerate(dataset):
        print(batch)
        enc_output, enc_hidden = encoder(inp, hidden)
        print(enc_hidden.shape)
        model.fit([enc_hidden, targ], targ,
                  epochs=1, batch_size=1,
                  validation_data=([input_tensor_val, np.squeeze(target_tensor_val, axis=-1)], target_tensor_val))
```
@Victor-Almeida

If you need 3 dimensions instead of 2, you can add a new axis with a single position. For example, if your array's shape is (m, n), you reshape it to (m, n, 1).
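
A minimal sketch of that reshape with NumPy/TensorFlow (the array name and shapes are placeholders; where the new axis goes depends on whether the layer expects `(batch, timesteps, features)`):

```python
import numpy as np
import tensorflow as tf

# Placeholder 2-D array, e.g. something shaped like an encoder state (m, n).
state = np.zeros((64, 1024), dtype=np.float32)

# Add a trailing axis of size 1 -> shape (64, 1024, 1), as suggested above.
state_3d = np.expand_dims(state, axis=-1)
state_3d = state.reshape(64, 1024, 1)        # equivalent

# If the layer expects (batch, timesteps, features), insert the axis in the middle instead.
state_3d = state[:, np.newaxis, :]           # shape (64, 1, 1024)

# The same operation on a TensorFlow tensor.
state_3d_tf = tf.expand_dims(tf.constant(state), axis=1)

print(state_3d.shape, state_3d_tf.shape)
```

If the tensor in question is a Keras symbolic tensor inside the model, a `Reshape` or `Lambda(lambda t: tf.expand_dims(t, axis=1))` layer does the same thing within the graph.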
