# research/lexnet_nc/lexnet_model.py

# Copyright 2017, 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""The integrated LexNET model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import lexnet_common
import numpy as np
import tensorflow as tf
from six.moves import xrange


class LexNETModel(object):
  """The LexNET model for classifying relationships between noun compounds.

  Depending on `hparams.input`, the classifier input is the concatenation of
  some of: the embeddings of the two relata (x and y), the embedding of the
  noun compound itself, and the count-weighted average of the embeddings of
  the paths observed for the pair. The concatenated vector is fed to an MLP
  with zero or one hidden layers.
  """

  # Values of `hparams.input` that enable each input component.
  _DIST_INPUTS = ('dist', 'dist-nc', 'integrated', 'integrated-nc')
  _PATH_INPUTS = ('path', 'integrated', 'integrated-nc')
  _NC_INPUTS = ('dist-nc', 'integrated-nc')

  @classmethod
  def default_hparams(cls):
    """Returns the default hyper-parameters as a `tf.contrib.training.HParams`."""
    return tf.contrib.training.HParams(
        batch_size=10,
        num_classes=37,
        num_epochs=30,
        input_keep_prob=0.9,
        input='integrated',  # dist/ dist-nc/ path/ integrated/ integrated-nc
        learn_relata=False,
        corpus='wiki_gigawords',
        random_seed=133,  # zero means no random seed
        relata_embeddings_file='glove/glove.6B.300d.bin',
        nc_embeddings_file='nc_glove/vecs.6B.300d.bin',
        path_embeddings_file='path_embeddings/tratz/fine_grained/wiki',
        hidden_layers=1,
        path_dim=60)

  def __init__(self, hparams, relata_embeddings, path_embeddings, nc_embeddings,
               path_to_index):
    """Initialize the LexNET classifier and build its graph.

    Args:
      hparams: the hyper-parameters.
      relata_embeddings: word embeddings for the distributional component.
      path_embeddings: embeddings for the paths.
      nc_embeddings: noun compound embeddings.
      path_to_index: a mapping from string path to an index in the path
        embeddings matrix. Only read when the input includes paths.
    """
    self.hparams = hparams

    self.path_embeddings = path_embeddings
    self.relata_embeddings = relata_embeddings
    self.nc_embeddings = nc_embeddings

    self.vocab_size, self.relata_dim = 0, 0
    self.path_to_index = None
    self.path_dim = 0

    # Set the random seed
    if hparams.random_seed > 0:
      tf.set_random_seed(hparams.random_seed)

    # Get the vocabulary size and relata dim
    if self.hparams.input in self._DIST_INPUTS:
      self.vocab_size, self.relata_dim = self.relata_embeddings.shape

    # Create the mapping from string path to an index in the embeddings matrix
    if self.hparams.input in self._PATH_INPUTS:
      # Materialize keys and values as lists: on Python 3, dict.keys() and
      # dict.values() are views, which tf.constant cannot convert to a tensor.
      paths = list(path_to_index.keys())
      indices = list(path_to_index.values())

      # Paths that are missing from the table are mapped to index 0.
      self.path_to_index = tf.contrib.lookup.HashTable(
          tf.contrib.lookup.KeyValueTensorInitializer(
              tf.constant(paths),
              tf.constant(indices),
              key_dtype=tf.string, value_dtype=tf.int32), 0)

      self.path_dim = self.path_embeddings.shape[1]

    # Create the network
    self.__create_computation_graph__()

  def __create_computation_graph__(self):
    """Initialize the model and define the graph.

    Creates the embedding lookup variables for the enabled input components,
    the MLP parameters, the placeholder for a batch of serialized examples and
    the ops that parse it, and finally the MLP itself.

    Raises:
      ValueError: if `hparams.hidden_layers` is neither 0 nor 1.
    """
    # Dimension of the concatenated MLP input; grows with every component.
    network_input = 0

    # Define the network inputs
    # Distributional x and y
    if self.hparams.input in self._DIST_INPUTS:
      network_input += 2 * self.relata_dim
      self.relata_lookup = tf.get_variable(
          'relata_lookup',
          initializer=self.relata_embeddings,
          dtype=tf.float32,
          trainable=self.hparams.learn_relata)

    # Path-based
    if self.hparams.input in self._PATH_INPUTS:
      network_input += self.path_dim

      # The path embeddings are not used as the variable's initializer:
      # callers load them by running `initialize_path_op` while feeding
      # `path_initial_value_t`.
      self.path_initial_value_t = tf.placeholder(tf.float32, None)

      self.path_lookup = tf.get_variable(
          name='path_lookup',
          dtype=tf.float32,
          trainable=False,
          shape=self.path_embeddings.shape)

      self.initialize_path_op = tf.assign(
          self.path_lookup, self.path_initial_value_t, validate_shape=False)

    # Distributional noun compound
    if self.hparams.input in self._NC_INPUTS:
      network_input += self.relata_dim

      # Same loading scheme as the path embeddings: run `initialize_nc_op`
      # while feeding `nc_initial_value_t`.
      self.nc_initial_value_t = tf.placeholder(tf.float32, None)

      self.nc_lookup = tf.get_variable(
          name='nc_lookup',
          dtype=tf.float32,
          trainable=False,
          shape=self.nc_embeddings.shape)

      self.initialize_nc_op = tf.assign(
          self.nc_lookup, self.nc_initial_value_t, validate_shape=False)

    hidden_dim = network_input // 2

    # Define the MLP
    if self.hparams.hidden_layers == 0:
      self.weights1 = tf.get_variable(
          'W1',
          shape=[network_input, self.hparams.num_classes],
          dtype=tf.float32)
      self.bias1 = tf.get_variable(
          'b1',
          shape=[self.hparams.num_classes],
          dtype=tf.float32)

    elif self.hparams.hidden_layers == 1:

      self.weights1 = tf.get_variable(
          'W1',
          shape=[network_input, hidden_dim],
          dtype=tf.float32)
      self.bias1 = tf.get_variable(
          'b1',
          shape=[hidden_dim],
          dtype=tf.float32)

      self.weights2 = tf.get_variable(
          'W2',
          shape=[hidden_dim, self.hparams.num_classes],
          dtype=tf.float32)
      self.bias2 = tf.get_variable(
          'b2',
          shape=[self.hparams.num_classes],
          dtype=tf.float32)

    else:
      raise ValueError('Only 0 or 1 hidden layers are supported')

    # Define the variables
    # A batch always holds exactly `batch_size` serialized examples.
    self.instances = tf.placeholder(dtype=tf.string,
                                    shape=[self.hparams.batch_size])

    (self.x_embedding_id,
     self.y_embedding_id,
     self.nc_embedding_id,
     self.path_embedding_id,
     self.path_counts,
     self.labels) = parse_tensorflow_examples(
         self.instances, self.hparams.batch_size, self.path_to_index)

    # Create the MLP
    self.__mlp__()

    # Ops for reading labels and word pairs from any number of instances.
    self.instances_to_load = tf.placeholder(dtype=tf.string, shape=[None])
    self.labels_to_load = lexnet_common.load_all_labels(self.instances_to_load)
    self.pairs_to_load = lexnet_common.load_all_pairs(self.instances_to_load)

  def load_labels(self, session, instances):
    """Loads the labels for these instances.

    Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the labels.

    Returns:
      the labels of these instances.
    """
    return session.run(self.labels_to_load,
                       feed_dict={self.instances_to_load: instances})

  def load_pairs(self, session, instances):
    """Loads the word pairs for these instances.

    Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the word pairs.

    Returns:
      the word pairs of these instances, each as the list obtained by
      splitting the stored pair string on '::'.
    """
    word_pairs = session.run(self.pairs_to_load,
                             feed_dict={self.instances_to_load: instances})

    pairs = []
    for pair in word_pairs:
      raw_pair = pair[0]
      # session.run returns string tensors as bytes. On Python 3 bytes cannot
      # be split with a str separator, so decode first. On Python 2 bytes is
      # str and the value is left untouched.
      if not isinstance(raw_pair, str):
        raw_pair = raw_pair.decode('utf-8')
      pairs.append(raw_pair.split('::'))
    return pairs

  def __train_single_batch__(self, session, batch_instances):
    """Train a single batch.

    Args:
      session: The current TensorFlow session.
      batch_instances: TensorFlow examples containing the training instances;
        must hold exactly `hparams.batch_size` items.

    Returns:
      The cost for the current batch.
    """
    cost, _ = session.run([self.cost, self.train_op],
                          feed_dict={self.instances: batch_instances})

    return cost

  def fit(self, session, inputs, on_epoch_completed, val_instances, val_labels,
          save_path):
    """Train the model.

    Args:
      session: The current TensorFlow session.
      inputs: The training instances (serialized TensorFlow examples), as a
        sequence indexable by integer position.
      on_epoch_completed: A method to call after each epoch. It receives
        (model, session, epoch, epoch_loss, val_instances, val_labels,
        save_path), where epoch is zero-based; a truthy return value stops
        the training. May be None.
      val_instances: The validation set instances (evaluation between epochs).
      val_labels: The validation set labels (for evaluation between epochs).
      save_path: Where to save the model.
    """
    batch_size = self.hparams.batch_size

    for epoch in range(self.hparams.num_epochs):

      losses = []
      epoch_indices = list(np.random.permutation(len(inputs)))

      # If the number of instances doesn't divide by batch_size, enlarge it
      # by duplicating randomly chosen training examples until the last batch
      # is full. Padding with only `mod` indices would leave the length
      # indivisible and silently drop the trailing examples.
      mod = len(epoch_indices) % batch_size
      if mod > 0:
        padding = np.random.randint(
            0, high=len(inputs), size=batch_size - mod)
        epoch_indices.extend(padding.tolist())

      # Define the batches
      n_batches = len(epoch_indices) // batch_size

      for minibatch in range(n_batches):
        batch_start = minibatch * batch_size
        batch_indices = epoch_indices[batch_start:batch_start + batch_size]
        batch_instances = [inputs[i] for i in batch_indices]

        loss = self.__train_single_batch__(session, batch_instances)
        losses.append(loss)

      # Without batches (empty training set) the loss is undefined.
      epoch_loss = np.nanmean(losses) if losses else float('nan')

      if on_epoch_completed:
        should_stop = on_epoch_completed(self, session, epoch, epoch_loss,
                                         val_instances, val_labels, save_path)
        if should_stop:
          # `epoch` is zero-based, so epoch + 1 epochs have been completed.
          print('Stopping training after %d epochs.' % (epoch + 1))
          return

  def predict(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test instances (serialized TensorFlow examples).

    Returns:
      A numpy array with the predicted class of every instance; empty when
      `inputs` is empty.
    """
    scored = self.predict_with_score(session, inputs)
    return np.array([prediction for prediction, _ in scored])

  def predict_with_score(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test instances (serialized TensorFlow examples).

    Returns:
      A list with one (prediction, score) tuple per instance, where score is
      the network output for the predicted class.
    """
    num_instances = len(inputs)
    batch_size = self.hparams.batch_size
    test_pred = [0] * num_instances

    for chunk in range(0, num_instances, batch_size):

      # The instances that are actually predicted in this batch
      real_indices = list(range(chunk, min(chunk + batch_size, num_instances)))

      # The graph needs exactly batch_size instances: if the batch is too
      # small, fill it up with copies of the first instance.
      padding = [0] * (batch_size - len(real_indices))
      batch_instances = [inputs[i] for i in real_indices + padding]

      # Dropout is a training-time regularizer; switch it off for prediction
      # so that the outputs are deterministic.
      predictions, scores = session.run(
          [self.predictions, self.scores],
          feed_dict={self.instances: batch_instances, self.keep_prob: 1.0})

      # Only keep the results of the real instances; the padding rows must
      # not overwrite the result stored for instance 0.
      for index_in_batch, index_in_dataset in enumerate(real_indices):
        prediction = predictions[index_in_batch]
        score = scores[index_in_batch][prediction]
        test_pred[index_in_dataset] = (prediction, score)

    return test_pred

  def __mlp__(self):
    """Defines the MLP, loss and training ops.

    Nothing is returned; the resulting tensors and ops are stored on the
    instance (among them `scores`, `predictions`, `cost`, `train_op` and
    `keep_prob`).
    """
    # Define the operations

    # Network input
    vec_inputs = []

    # Distributional component
    if self.hparams.input in self._DIST_INPUTS:
      for emb_id in [self.x_embedding_id, self.y_embedding_id]:
        vec_inputs.append(tf.nn.embedding_lookup(self.relata_lookup, emb_id))

    # Noun compound component
    if self.hparams.input in self._NC_INPUTS:
      vec = tf.nn.embedding_lookup(self.nc_lookup, self.nc_embedding_id)
      vec_inputs.append(vec)

    # Path-based component
    if self.hparams.input in self._PATH_INPUTS:

      # Get the current paths for each batch instance.
      # Note: from here on self.path_embeddings is the looked-up tensor and no
      # longer the embeddings matrix passed to the constructor.
      self.path_embeddings = tf.nn.embedding_lookup(self.path_lookup,
                                                    self.path_embedding_id)

      # self.path_embeddings is of shape
      # [batch_size, max_path_per_instance, output_dim]
      # We need to multiply it by path counts
      # ([batch_size, max_path_per_instance]).
      # Start by duplicating path_counts along the output_dim axis.
      self.path_freq = tf.tile(tf.expand_dims(self.path_counts, -1),
                               [1, 1, self.path_dim])

      # Compute the averaged path vector for each instance.
      # First, multiply the path embeddings and frequencies element-wise.
      self.weighted = tf.multiply(self.path_freq, self.path_embeddings)

      # Second, take the sum to get a tensor of shape [batch_size, output_dim].
      self.pair_path_embeddings = tf.reduce_sum(self.weighted, 1)

      # Finally, divide by the total number of paths.
      # The total is clipped to at least 1 so that instances without any
      # path do not cause a division by zero.
      self.num_paths = tf.clip_by_value(
          tf.reduce_sum(self.path_counts, 1), 1, np.inf)

      # The number of paths for each pair is expanded to [batch_size, 1] and
      # duplicated output_dim times along the second axis.
      self.num_paths = tf.tile(tf.expand_dims(self.num_paths, -1),
                               [1, self.path_dim])

      # Divide pair_path_embeddings by num_paths element-wise.
      self.pair_path_embeddings = tf.div(
          self.pair_path_embeddings, self.num_paths)
      vec_inputs.append(self.pair_path_embeddings)

    # Keep probability of the input dropout. It defaults to the training
    # value, so training code does not need to feed it; prediction code feeds
    # 1.0 to disable dropout.
    self.keep_prob = tf.placeholder_with_default(
        tf.constant(self.hparams.input_keep_prob, dtype=tf.float32),
        shape=[], name='input_keep_prob')

    # Concatenate the inputs and feed to the MLP
    self.input_vec = tf.nn.dropout(
        tf.concat(vec_inputs, 1),
        keep_prob=self.keep_prob)

    # NOTE(review): the bias variables (b1, b2) are created with the graph but
    # are not added here. Left as is, because adding them would change the
    # trained parameter set and likely break existing checkpoints - confirm
    # whether this is intended.
    h = tf.matmul(self.input_vec, self.weights1)
    self.output = h

    if self.hparams.hidden_layers == 1:
      self.output = tf.matmul(tf.nn.tanh(h), self.weights2)

    # The scores are the unnormalized network outputs (logits).
    self.scores = self.output
    self.predictions = tf.argmax(self.scores, axis=1)

    # Define the loss function and the optimization algorithm
    self.cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.scores, labels=self.labels)
    # The cost is the sum (not the mean) of the losses over the batch.
    self.cost = tf.reduce_sum(self.cross_entropies, name='cost')
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer()
    self.train_op = self.optimizer.minimize(
        self.cost, global_step=self.global_step)


def parse_tensorflow_examples(record, batch_size, path_to_index):
  """Parses a batch of serialized TensorFlow examples.

  Args:
    record: a string tensor holding the serialized TensorFlow examples.
    batch_size: the number of instances in a minibatch
    path_to_index: lookup table mapping a string path to its index in the
      path embeddings matrix, or None when paths are not used.

  Returns:
    A tuple (x_embedding_id, y_embedding_id, nc_embedding_id,
    path_embedding_id, path_counts, labels). The embedding IDs and the labels
    have their trailing axis of size 1 squeezed away, path_counts is a float
    tensor reshaped to [batch_size, -1], and path_embedding_id is None when
    `path_to_index` is None.
  """
  features = tf.parse_example(
      record, {
          'x_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'y_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'nc_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          # The paths and their counts are variable-length sequences.
          'reprs': tf.FixedLenSequenceFeature(
              shape=(), dtype=tf.string, allow_missing=True),
          'counts': tf.FixedLenSequenceFeature(
              shape=(), dtype=tf.int64, allow_missing=True),
          'rel_id': tf.FixedLenFeature([1], dtype=tf.int64)
      })

  # The scalar features are parsed with a trailing axis of size 1; drop it.
  x_embedding_id = tf.squeeze(features['x_embedding_id'], [-1])
  y_embedding_id = tf.squeeze(features['y_embedding_id'], [-1])
  nc_embedding_id = tf.squeeze(features['nc_embedding_id'], [-1])
  labels = tf.squeeze(features['rel_id'], [-1])

  # The counts are converted to float for the weighted path averaging.
  path_counts = tf.to_float(tf.reshape(features['counts'], [batch_size, -1]))

  # Translate the string paths to embedding indices. Compare against None
  # explicitly instead of relying on the truthiness of the table object.
  path_embedding_id = None
  if path_to_index is not None:
    path_embedding_id = path_to_index.lookup(features['reprs'])

  return (
      x_embedding_id, y_embedding_id, nc_embedding_id,
      path_embedding_id, path_counts, labels)