# research/struct2depth/nets.py


# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Depth and Ego-Motion networks."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
slim = tf.contrib.slim

# Names of the supported depth-network architectures. `disp_net` rejects any
# architecture string that is not listed in ARCHITECTURES.
SIMPLE = 'simple'
RESNET = 'resnet'
ARCHITECTURES = [SIMPLE, RESNET]

# Small multipliers applied to the raw outputs of the motion networks.
SCALE_TRANSLATION = 0.001
SCALE_ROTATION = 0.01

# The decoders compute disparity (inverse depth) as
# sigmoid(x) * DISP_SCALING + MIN_DISP, so values lie between MIN_DISP and
# DISP_SCALING + MIN_DISP. Note that effectively, this is undone if depth
# normalization is used, which scales the values to have a mean of 1.
DISP_SCALING = 10
MIN_DISP = 0.01
# Name of the graph collection that `_conv` adds its kernels to.
WEIGHT_DECAY_KEY = 'WEIGHT_DECAY'
# Number of components in a single predicted motion vector.
EGOMOTION_VEC_SIZE = 6


def egomotion_net(image_stack, disp_bottleneck_stack, joint_encoder, seq_length,
                  weight_reg):
  """Predict ego-motion vectors from a stack of frames or embeddings.

  Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
    disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
        seq_length * c_hidden] in order.
    joint_encoder: Determines if the same encoder is used for computing the
        bottleneck layer of both the egomotion and the depth prediction
        network. If enabled, disp_bottleneck_stack is used as input, and the
        encoding steps are skipped. If disabled, a separate encoder is defined
        on image_stack.
    seq_length: The sequence length used.
    weight_reg: The amount of weight regularization.

  Returns:
    Egomotion vectors with shape [B, seq_length - 1, 6]. Within each vector,
    components 0-2 are multiplied by SCALE_TRANSLATION and components 3-5 by
    SCALE_ROTATION.
  """
  num_egomotion_vecs = seq_length - 1
  with tf.variable_scope('pose_exp_net') as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        normalizer_fn=None,
                        weights_regularizer=slim.l2_regularizer(weight_reg),
                        normalizer_params=None,
                        activation_fn=tf.nn.relu,
                        outputs_collections=end_points_collection):
      if not joint_encoder:
        # Define separate encoder. If sharing, we can skip the encoding step,
        # as the bottleneck layer will already be passed as input.
        cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
        cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
        cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
        cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
        cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

      with tf.variable_scope('pose'):
        inputs = disp_bottleneck_stack if joint_encoder else cnv5
        cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
        cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
        pred_channels = EGOMOTION_VEC_SIZE * num_egomotion_vecs
        # Linear 1x1 prediction layer: no activation, no normalization.
        egomotion_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
                                     stride=1, normalizer_fn=None,
                                     activation_fn=None)
        # Average the per-location predictions over the spatial dimensions.
        egomotion_avg = tf.reduce_mean(egomotion_pred, [1, 2])
        egomotion_res = tf.reshape(
            egomotion_avg, [-1, num_egomotion_vecs, EGOMOTION_VEC_SIZE])
        # Tinghui found that scaling by a small constant facilitates training.
        # egomotion_res is [B, num_egomotion_vecs, 6], so the six motion
        # components live on the last axis. Slice and re-join on axis 2;
        # slicing axis 1 would pick whole motion vectors, not components.
        # NOTE(review): this changes the effective rotation scale compared to
        # slicing on axis 1 -- confirm before reusing previously trained
        # weights.
        egomotion_scaled = tf.concat(
            [egomotion_res[:, :, 0:3] * SCALE_TRANSLATION,
             egomotion_res[:, :, 3:6] * SCALE_ROTATION],
            axis=2)
    return egomotion_scaled


def objectmotion_net(image_stack, disp_bottleneck_stack, joint_encoder,
                     seq_length, weight_reg):
  """Predict object-motion vectors from a stack of frames or embeddings.

  The layer layout and scope names are the same as in `egomotion_net`.
  NOTE(review): the top-level scope is also named 'pose_exp_net'; presumably
  callers wrap this function in their own outer variable scope to keep the
  two networks' variables apart -- verify against the call site.

  Args:
    image_stack: Input tensor with shape [B, h, w, seq_length * 3] in order.
    disp_bottleneck_stack: Input tensor with shape [B, h_hidden, w_hidden,
        seq_length * c_hidden] in order.
    joint_encoder: Determines if the same encoder is used for computing the
        bottleneck layer of both the motion and the depth prediction
        network. If enabled, disp_bottleneck_stack is used as input, and the
        encoding steps are skipped. If disabled, a separate encoder is defined
        on image_stack.
    seq_length: The sequence length used.
    weight_reg: The amount of weight regularization.

  Returns:
    Object-motion vectors with shape [B, seq_length - 1, 6]. Within each
    vector, components 0-2 are multiplied by SCALE_TRANSLATION and components
    3-5 by SCALE_ROTATION.
  """
  num_motion_vecs = seq_length - 1
  with tf.variable_scope('pose_exp_net') as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        normalizer_fn=None,
                        weights_regularizer=slim.l2_regularizer(weight_reg),
                        normalizer_params=None,
                        activation_fn=tf.nn.relu,
                        outputs_collections=end_points_collection):
      if not joint_encoder:
        # Define separate encoder. If sharing, we can skip the encoding step,
        # as the bottleneck layer will already be passed as input.
        cnv1 = slim.conv2d(image_stack, 16, [7, 7], stride=2, scope='cnv1')
        cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
        cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
        cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
        cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

      with tf.variable_scope('pose'):
        inputs = disp_bottleneck_stack if joint_encoder else cnv5
        cnv6 = slim.conv2d(inputs, 256, [3, 3], stride=2, scope='cnv6')
        cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
        pred_channels = EGOMOTION_VEC_SIZE * num_motion_vecs
        # Linear 1x1 prediction layer: no activation, no normalization.
        motion_pred = slim.conv2d(cnv7, pred_channels, [1, 1], scope='pred',
                                  stride=1, normalizer_fn=None,
                                  activation_fn=None)
        # Average the per-location predictions over the spatial dimensions.
        motion_avg = tf.reduce_mean(motion_pred, [1, 2])
        motion_res = tf.reshape(
            motion_avg, [-1, num_motion_vecs, EGOMOTION_VEC_SIZE])
        # Tinghui found that scaling by a small constant facilitates training.
        # motion_res is [B, num_motion_vecs, 6], so the six motion components
        # live on the last axis. Slice and re-join on axis 2; slicing axis 1
        # would pick whole motion vectors, not components.
        # NOTE(review): this changes the effective rotation scale compared to
        # slicing on axis 1 -- confirm before reusing previously trained
        # weights.
        motion_scaled = tf.concat(
            [motion_res[:, :, 0:3] * SCALE_TRANSLATION,
             motion_res[:, :, 3:6] * SCALE_ROTATION],
            axis=2)
    return motion_scaled


def disp_net(architecture, image, use_skip, weight_reg, is_training):
  """Builds the depth network: an encoder followed by the matching decoder.

  Args:
    architecture: One of the names listed in ARCHITECTURES.
    image: Input image tensor; passed to the encoder and, as `target_image`,
        to the decoder.
    use_skip: Whether the decoder should use the encoder's skip connections.
    weight_reg: The amount of weight regularization.
    is_training: Whether the model is being trained; forwarded to the encoder.

  Returns:
    A pair (multiscale disparities as returned by the decoder, encoder
    bottleneck tensor).

  Raises:
    ValueError: If `architecture` is not in ARCHITECTURES.
  """
  if architecture not in ARCHITECTURES:
    raise ValueError('Unknown architecture.')
  encode_fn, decode_fn = encoder(architecture), decoder(architecture)

  # Image -> bottleneck features plus intermediate activations.
  bottleneck, skips = encode_fn(image, weight_reg, is_training)
  # Bottleneck -> disparity maps at several scales.
  disparities = decode_fn(target_image=image,
                          bottleneck=bottleneck,
                          weight_reg=weight_reg,
                          use_skip=use_skip,
                          skip_connections=skips)
  return disparities, bottleneck


def encoder(architecture):
  """Returns the encoder builder: ResNet for RESNET, the simple one otherwise."""
  if architecture == RESNET:
    return encoder_resnet
  return encoder_simple


def decoder(architecture):
  """Returns the decoder builder: ResNet for RESNET, the simple one otherwise."""
  if architecture == RESNET:
    return decoder_resnet
  return decoder_simple


def encoder_simple(target_image, weight_reg, is_training):
  """Defines the old encoding architecture.

  Seven stages are stacked; each stage is a stride-2 convolution followed by a
  stride-1 convolution with the same depth and kernel size.

  Args:
    target_image: Input tensor to encode.
    weight_reg: The amount of L2 weight regularization.
    is_training: Unused by this encoder.

  Returns:
    A pair (bottleneck, skips): the output of the last stage, and a tuple with
    the outputs of stages 6 down to 1 (deepest first).
  """
  del is_training  # Unused.
  # (scope of stride-2 conv, scope of stride-1 conv, depth, kernel size).
  stage_specs = (
      ('cnv1', 'cnv1b', 32, 7),
      ('cnv2', 'cnv2b', 64, 5),
      ('cnv3', 'cnv3b', 128, 3),
      ('cnv4', 'cnv4b', 256, 3),
      ('cnv5', 'cnv5b', 512, 3),
      ('cnv6', 'cnv6b', 512, 3),
      ('cnv7', 'cnv7b', 512, 3),
  )
  stage_outputs = []
  net = target_image
  with slim.arg_scope([slim.conv2d],
                      normalizer_fn=None,
                      normalizer_params=None,
                      weights_regularizer=slim.l2_regularizer(weight_reg),
                      activation_fn=tf.nn.relu):
    # Define (joint) encoder.
    for down_scope, refine_scope, depth, ksize in stage_specs:
      net = slim.conv2d(net, depth, [ksize, ksize], stride=2, scope=down_scope)
      net = slim.conv2d(net, depth, [ksize, ksize], stride=1,
                        scope=refine_scope)
      stage_outputs.append(net)
  bottleneck = stage_outputs[-1]
  skips = tuple(reversed(stage_outputs[:-1]))
  return bottleneck, skips


def decoder_simple(target_image, bottleneck, weight_reg, use_skip,
                   skip_connections):
  """Defines the old depth decoder architecture.

  Args:
    target_image: The original encoder input tensor. Only its static height
        and width are used, so both must be known at graph-construction time.
    bottleneck: Bottleneck layer to be decoded.
    weight_reg: The amount of L2 weight regularization.
    use_skip: Whether the encoder activations in `skip_connections` are
        concatenated into the decoder stages.
    skip_connections: Tuple (cnv6b, cnv5b, cnv4b, cnv3b, cnv2b, cnv1b) as
        returned by `encoder_simple`. The tensors are always used as size
        references for the upsampled features, even if `use_skip` is False.

  Returns:
    List [disp1, disp2, disp3, disp4] of disparity maps, ordered from the
    finest to the coarsest scale. Each is sigmoid output scaled by
    DISP_SCALING and offset by MIN_DISP.
  """
  h = target_image.get_shape()[1].value
  w = target_image.get_shape()[2].value
  (cnv6b, cnv5b, cnv4b, cnv3b, cnv2b, cnv1b) = skip_connections
  with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                      normalizer_fn=None,
                      normalizer_params=None,
                      weights_regularizer=slim.l2_regularizer(weight_reg),
                      activation_fn=tf.nn.relu):
    up7 = slim.conv2d_transpose(bottleneck, 512, [3, 3], stride=2,
                                scope='upcnv7')
    up7 = _resize_like(up7, cnv6b)
    if use_skip:
      i7_in = tf.concat([up7, cnv6b], axis=3)
    else:
      i7_in = up7
    icnv7 = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')

    up6 = slim.conv2d_transpose(icnv7, 512, [3, 3], stride=2, scope='upcnv6')
    up6 = _resize_like(up6, cnv5b)
    if use_skip:
      i6_in = tf.concat([up6, cnv5b], axis=3)
    else:
      i6_in = up6
    icnv6 = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')

    up5 = slim.conv2d_transpose(icnv6, 256, [3, 3], stride=2, scope='upcnv5')
    up5 = _resize_like(up5, cnv4b)
    if use_skip:
      i5_in = tf.concat([up5, cnv4b], axis=3)
    else:
      i5_in = up5
    icnv5 = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')

    up4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
    up4 = _resize_like(up4, cnv3b)
    if use_skip:
      i4_in = tf.concat([up4, cnv3b], axis=3)
    else:
      i4_in = up4
    icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
    disp4 = (slim.conv2d(icnv4, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                         normalizer_fn=None, scope='disp4')
             * DISP_SCALING + MIN_DISP)
    # Upsample the coarse disparity so it can be fed into the next stage.
    # Plain integer division replaces the removed `np.int` alias.
    disp4_up = tf.image.resize_bilinear(disp4, [h // 4, w // 4],
                                        align_corners=True)

    up3 = slim.conv2d_transpose(icnv4, 64, [3, 3], stride=2, scope='upcnv3')
    up3 = _resize_like(up3, cnv2b)
    if use_skip:
      i3_in = tf.concat([up3, cnv2b, disp4_up], axis=3)
    else:
      # tf.concat needs an explicit axis; join along channels as in the
      # skip-connection branch.
      i3_in = tf.concat([up3, disp4_up], axis=3)
    icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
    disp3 = (slim.conv2d(icnv3, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                         normalizer_fn=None, scope='disp3')
             * DISP_SCALING + MIN_DISP)
    disp3_up = tf.image.resize_bilinear(disp3, [h // 2, w // 2],
                                        align_corners=True)

    up2 = slim.conv2d_transpose(icnv3, 32, [3, 3], stride=2, scope='upcnv2')
    up2 = _resize_like(up2, cnv1b)
    if use_skip:
      i2_in = tf.concat([up2, cnv1b, disp3_up], axis=3)
    else:
      i2_in = tf.concat([up2, disp3_up], axis=3)
    icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
    disp2 = (slim.conv2d(icnv2, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                         normalizer_fn=None, scope='disp2')
             * DISP_SCALING + MIN_DISP)
    disp2_up = tf.image.resize_bilinear(disp2, [h, w], align_corners=True)

    # The last stage has no encoder skip connection; only the upsampled
    # disparity is concatenated.
    up1 = slim.conv2d_transpose(icnv2, 16, [3, 3], stride=2, scope='upcnv1')
    i1_in = tf.concat([up1, disp2_up], axis=3)
    icnv1 = slim.conv2d(i1_in, 16, [3, 3], stride=1, scope='icnv1')
    disp1 = (slim.conv2d(icnv1, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                         normalizer_fn=None, scope='disp1')
             * DISP_SCALING + MIN_DISP)
  return [disp1, disp2, disp3, disp4]


def encoder_resnet(target_image, weight_reg, is_training):
  """Defines a ResNet18-based encoding architecture.

  This implementation follows Juyong Kim's implementation of ResNet18 on GitHub:
  https://github.com/dalgu90/resnet-18-tensorflow

  Args:
    target_image: Input tensor with shape [B, h, w, 3] to encode.
    weight_reg: Parameter ignored.
    is_training: Whether the model is being trained or not.

  Returns:
    Tuple of tensors, with the first being the bottleneck layer as tensor of
    size [B, h_hid, w_hid, c_hid], and others being intermediate layers
    for building skip-connections.
  """
  del weight_reg  # Unused.
  filters_per_stage = [64, 64, 128, 256, 512]
  downsample = 2

  # Stem: 7x7 strided convolution, batch norm, ReLU, then 3x3 max pooling.
  with tf.variable_scope('conv1'):
    net = _conv(target_image, 7, filters_per_stage[0], downsample)
    net = _bn(net, is_train=is_training)
    econv1 = _relu(net)
    net = tf.nn.max_pool(econv1, [1, 3, 3, 1], [1, 2, 2, 1], 'SAME')

  # conv2_x keeps the resolution and channel count of the pooled stem output.
  net = _residual_block(net, is_training, name='conv2_1')
  econv2 = _residual_block(net, is_training, name='conv2_2')

  # conv3_x .. conv5_x: a strided block that changes the channel count,
  # followed by a plain residual block.
  stage_outputs = [econv1, econv2]
  strided_stages = (
      ('conv3_1', 'conv3_2', filters_per_stage[2]),
      ('conv4_1', 'conv4_2', filters_per_stage[3]),
      ('conv5_1', 'conv5_2', filters_per_stage[4]),
  )
  for first_name, second_name, depth in strided_stages:
    net = _residual_block_first(stage_outputs[-1], is_training, depth,
                                downsample, name=first_name)
    stage_outputs.append(_residual_block(net, is_training, name=second_name))

  econv1, econv2, econv3, econv4, econv5 = stage_outputs
  return econv5, (econv4, econv3, econv2, econv1)


def decoder_resnet(target_image, bottleneck, weight_reg, use_skip,
                   skip_connections):
  """Defines the depth decoder architecture.

  Every stage upsamples with a stride-2 transposed convolution, resizes the
  result to the matching reference tensor, optionally concatenates the skip
  connection, and refines with a reflect-padded 3x3 convolution. The last
  four stages also emit a disparity map.

  Args:
    target_image: The original encoder input tensor with shape [B, h, w, 3].
                  Just the shape information is used here.
    bottleneck: Bottleneck layer to be decoded.
    weight_reg: The amount of weight regularization. No regularizer is used
                unless this is greater than zero.
    use_skip: Whether the passed skip connections econv1, econv2, econv3 and
              econv4 should be used.
    skip_connections: Tensors for building skip-connections.

  Returns:
    Disparities at 4 different scales.
  """
  econv4, econv3, econv2, econv1 = skip_connections
  filters = [16, 32, 64, 128, 256]
  reflect_pad = tf.constant([[0, 0], [1, 1], [1, 1], [0, 0]])
  regularizer = None
  if weight_reg > 0.0:
    regularizer = slim.l2_regularizer(weight_reg)

  def _padded_conv(features, depth, scope, **conv_kwargs):
    # Reflect-pad by one pixel, then 3x3 'VALID' conv: spatial size is kept.
    padded = tf.pad(features, reflect_pad, mode='REFLECT')
    return slim.conv2d(padded, depth, [3, 3], stride=1, scope=scope,
                       padding='VALID', **conv_kwargs)

  def _up_stage(features, depth, size_ref, skip, up_scope, conv_scope):
    # Upsample, match the reference size, merge the skip, and refine.
    up = slim.conv2d_transpose(features, depth, [3, 3], stride=2,
                               scope=up_scope)
    up = _resize_like(up, size_ref)
    if use_skip and skip is not None:
      up = tf.concat([up, skip], axis=3)
    return _padded_conv(up, depth, conv_scope)

  def _disp_head(features, scope):
    # Sigmoid output mapped to the disparity range.
    return (_padded_conv(features, 1, scope, activation_fn=tf.sigmoid,
                         normalizer_fn=None)
            * DISP_SCALING + MIN_DISP)

  with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                      normalizer_fn=None,
                      normalizer_params=None,
                      activation_fn=tf.nn.relu,
                      weights_regularizer=regularizer):
    iconv5 = _up_stage(bottleneck, filters[4], econv4, econv4,
                       'upconv5', 'iconv5')

    iconv4 = _up_stage(iconv5, filters[3], econv3, econv3,
                       'upconv4', 'iconv4')
    disp4 = _disp_head(iconv4, 'disp4')

    iconv3 = _up_stage(iconv4, filters[2], econv2, econv2,
                       'upconv3', 'iconv3')
    disp3 = _disp_head(iconv3, 'disp3')

    iconv2 = _up_stage(iconv3, filters[1], econv1, econv1,
                       'upconv2', 'iconv2')
    disp2 = _disp_head(iconv2, 'disp2')

    # Final stage: sized like the input image, no skip connection.
    iconv1 = _up_stage(iconv2, filters[0], target_image, None,
                       'upconv1', 'iconv1')
    disp1 = _disp_head(iconv1, 'disp1')

  return [disp1, disp2, disp3, disp4]


def _residual_block_first(x, is_training, out_channel, strides, name='unit'):
  """Residual block that may change the channel count and/or resolution.

  The shortcut is a 1x1 strided convolution when the channel count changes;
  otherwise it is the identity (stride 1) or a max pooling (other strides).
  """
  channels_in = x.get_shape().as_list()[-1]
  with tf.variable_scope(name):
    # Shortcut branch.
    if channels_in != out_channel:
      shortcut = _conv(x, 1, out_channel, strides, name='shortcut')
    elif strides == 1:
      shortcut = tf.identity(x)
    else:
      pool_window = [1, strides, strides, 1]
      shortcut = tf.nn.max_pool(x, pool_window, [1, strides, strides, 1],
                                'VALID')

    # Residual branch: conv-bn-relu followed by conv-bn.
    residual = _conv(x, 3, out_channel, strides, name='conv_1')
    residual = _bn(residual, is_train=is_training, name='bn_1')
    residual = _relu(residual, name='relu_1')
    residual = _conv(residual, 3, out_channel, 1, name='conv_2')
    residual = _bn(residual, is_train=is_training, name='bn_2')

    # Merge both branches and apply the final activation.
    merged = _relu(residual + shortcut, name='relu_2')
  return merged


def _residual_block(x, is_training, input_q=None, output_q=None, name='unit'):
  """Residual block that keeps channel count and resolution unchanged.

  `input_q` and `output_q` are forwarded to `_conv`; the second convolution
  receives `output_q` for both of them.
  """
  channels = x.get_shape().as_list()[-1]
  with tf.variable_scope(name):
    identity = x  # Shortcut branch.

    # Residual branch: conv-bn-relu followed by conv-bn.
    residual = _conv(x, 3, channels, 1, input_q=input_q, output_q=output_q,
                     name='conv_1')
    residual = _bn(residual, is_train=is_training, name='bn_1')
    residual = _relu(residual, name='relu_1')
    residual = _conv(residual, 3, channels, 1, input_q=output_q,
                     output_q=output_q, name='conv_2')
    residual = _bn(residual, is_train=is_training, name='bn_2')

    # Merge both branches and apply the final activation.
    merged = _relu(residual + identity, name='relu_2')
  return merged


def _conv(x, filter_size, out_channel, stride, pad='SAME', input_q=None,
          output_q=None, name='conv'):
  """Bias-free 2D convolution with a square kernel for the ResNet encoder.

  The kernel variable is created on the CPU device and registered in the
  WEIGHT_DECAY_KEY collection if it is not already in it. `input_q` and
  `output_q` are only validated: either both or neither must be given.

  Raises:
    ValueError: If exactly one of `input_q` and `output_q` is None.
  """
  exactly_one_split_given = (input_q is None) != (output_q is None)
  if exactly_one_split_given:
    raise ValueError('Input/Output splits are not correctly given.')

  in_channel = x.get_shape()[3]
  kernel_shape = [filter_size, filter_size, in_channel, out_channel]
  init_stddev = np.sqrt(2.0 / filter_size / filter_size / out_channel)
  with tf.variable_scope(name):
    # Main operation: conv2d
    with tf.device('/CPU:0'):
      kernel = tf.get_variable(
          'kernel', kernel_shape, tf.float32,
          initializer=tf.random_normal_initializer(stddev=init_stddev))
    # Register the kernel for weight decay only once.
    if kernel not in tf.get_collection(WEIGHT_DECAY_KEY):
      tf.add_to_collection(WEIGHT_DECAY_KEY, kernel)
    return tf.nn.conv2d(x, kernel, [1, stride, stride, 1], pad)


def _bn(x, is_train, name='bn'):
  """Applies `tf.layers.batch_normalization` to x under the given name."""
  return tf.layers.batch_normalization(x, training=is_train, name=name)


def _relu(x, name=None, leakness=0.0):
  """Applies a ReLU, or a leaky ReLU if `leakness` is positive.

  Args:
    x: Input tensor.
    name: Optional op name. Defaults to 'lrelu' for the leaky variant and to
        'relu' otherwise.
    leakness: Slope used for negative inputs. Values <= 0 select a plain ReLU.

  Returns:
    The activated tensor.
  """
  if leakness > 0.0:
    name = 'lrelu' if name is None else name
    # Pass the resolved name through instead of a fixed literal so that a
    # caller-supplied name is honored.
    return tf.maximum(x, x * leakness, name=name)
  name = 'relu' if name is None else name
  return tf.nn.relu(x, name=name)


def _resize_like(inputs, ref):
  """Returns `inputs` resized to the static height and width of `ref`.

  If both tensors already have equal static height and width, `inputs` is
  returned unchanged; otherwise it is resized bilinearly.
  """
  in_shape = inputs.get_shape()
  ref_shape = ref.get_shape()
  if in_shape[1] == ref_shape[1] and in_shape[2] == ref_shape[2]:
    return inputs
  # TODO(casser): Other interpolation methods could be explored here.
  target_size = [ref_shape[1].value, ref_shape[2].value]
  return tf.image.resize_bilinear(inputs, target_size, align_corners=True)