// model.proto

// Copyright 2021 The Deeplab2 Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package deeplab2;

option java_multiple_files = true;

/********** Submessages used to configure model options **********/
// Options for the decoder, including its ASPP module.
message DecoderOptions {
  // Key identifying the high-level features, e.g. "res5".
  optional string feature_key = 1;
  // Number of filters used by each convolution of the decoder.
  optional int32 decoder_channels = 2 [default = 256];
  // Convolution type used in the decoder. Supported values are
  // "depthwise_separable_conv" and "standard_conv".
  optional string decoder_conv_type = 5 [default = "depthwise_separable_conv"];
  // Number of filters used by each convolution of the ASPP.
  optional int32 aspp_channels = 3 [default = 256];
  // Atrous rates used in the ASPP. Exactly three values are expected, one for
  // each of the three 3x3 atrous convolutions in the ASPP. Only effective when
  // `aspp_use_only_1x1_proj_conv` is false.
  repeated int32 atrous_rates = 4;
  // When true, the five ASPP branches (one 1x1 convolution, three 3x3 atrous
  // convolutions with the specified `atrous_rates`, and the global average
  // pooling) are turned off, and the ASPP module applies only one 1x1
  // projection convolution that reduces the feature map channels (obtained
  // from the encoder backbone) to the specified `aspp_channels`. Mainly set
  // to true when the encoder backbone is already able to efficiently capture
  // long-range information, e.g., by axial attention blocks (for reference,
  // see configs/cityscapes/axial_deeplab).
  optional bool aspp_use_only_1x1_proj_conv = 6 [default = false];
}

// Configure a low-level feature to use: which feature to pick and how many
// filters its 1x1 projection convolution has.
message LowLevelOptions {
  // Set the name of the low-level feature, e.g. 'res2'.
  optional string feature_key = 1;
  // Set the number of filters for the 1x1 projection convolution. No default
  // is declared in this file.
  optional int32 channels_project = 2;
}

// Options for a single prediction head.
message HeadOptions {
  // Number of filters in the last convolution, e.g. 1 or NUM_CLASSES.
  optional int32 output_channels = 1;
  // Number of filters in the 5x5 convolution, e.g. 256 or 32.
  optional int32 head_channels = 2;
  // Convolution type used in the head. Supported values are
  // "depthwise_separable_conv" and "standard_conv".
  optional string head_conv_type = 3 [default = "depthwise_separable_conv"];
  // Maximum value after activation. Together with
  // `min_value_after_activation`, it is used to shift the head outputs to a
  // range, e.g., using sigmoid to shift the range for depth prediction.
  optional float max_value_after_activation = 4 [default = 0];
  // Minimum value after activation. Together with
  // `max_value_after_activation`, it is used to shift the head outputs to a
  // range, e.g., using sigmoid to shift the range for depth prediction.
  optional float min_value_after_activation = 5 [default = 0];
}

// Configure the instance branch.
message InstanceOptions {
  // Set whether to use the instance branch. Enabled by default.
  optional bool enable = 1 [default = true];

  // Set the low level options used in the instance branch. The list of
  // LowLevelOptions must be ordered from lower resolution to higher
  // resolution. Leaving it empty will use the same low level options as the
  // semantic branch.
  repeated LowLevelOptions low_level_override = 2;
  // Set the decoder options of the instance branch. Leaving it unset will use
  // the same decoder options as the semantic branch.
  optional DecoderOptions instance_decoder_override = 3;

  // Configure the instance center head.
  optional HeadOptions center_head = 4;
  // Configure the instance regression head.
  optional HeadOptions regression_head = 5;

  // Configure the next-frame instance regression head.
  optional HeadOptions next_regression_head = 6;
}

// Configure the model options: the backbone encoder, the shared decoder, the
// meta-architecture specific settings, and how the initial checkpoint is
// restored.
// Next ID: 12
message ModelOptions {
  // Configure model backbone.
  message BackboneOptions {
    // Set the name of the specific architecture of the family.
    optional string name = 1 [default = "resnet50"];
    // Set the output stride of the encoder.
    optional int32 output_stride = 2 [default = 32];
    // Set the path to the pretrained weights to load.
    optional string pretrained_weights = 3;
    // Set whether to use the squeeze-and-excite operation.
    optional bool use_squeeze_and_excite = 4 [default = false];
    // Set the drop path keep probability for training. The default of 1.0
    // means drop path is not used.
    optional float drop_path_keep_prob = 5 [default = 1.0];
    // Set the drop path schedule. Currently support (1) "constant": use the
    // same drop path keep probability for all blocks, and (2) "linear":
    // linearly decrease the drop path keep probability from 1.0 at the 0-th
    // stage (or STEM) to drop_path_keep_prob at the last block.
    optional string drop_path_schedule = 6 [default = "constant"];
    // Set the STEM width_multiplier, controlling STEM convolution channels.
    optional float stem_width_multiplier = 7 [default = 1.0];
    // Set the backbone (except STEM) width_multiplier, controlling backbone
    // (except STEM) convolution channels.
    optional float backbone_width_multiplier = 8 [default = 1.0];
    // Set the backbone (except STEM) layer_multiplier, controlling the number
    // of layers in the backbone (except STEM).
    optional float backbone_layer_multiplier = 9 [default = 1.0];
    // Use the Switchable Atrous Convolution (SAC) beyond the specified stride.
    // For example, if use_sac_beyond_stride = 16, SAC will be applied to the
    // network stage whose original output stride >= 16 (i.e., 16 and 32, or
    // the last two stages). Set to -1 to disable it (the default).
    optional int32 use_sac_beyond_stride = 10 [default = -1];
  }
  // Set the model option for the backbone encoder model.
  optional BackboneOptions backbone = 1;

  // Shared decoder settings across different meta architectures.
  optional DecoderOptions decoder = 2;

  // Meta-architecture specific settings.
  message DeeplabV3Options {
    // Set the number of classes for the last convolution to predict logits.
    optional int32 num_classes = 1;
  }

  message DeeplabV3PlusOptions {
    // Set the low level options used in this decoder. Unlike the other meta
    // architectures in this file, this is a singular field: at most one
    // LowLevelOptions can be given, so no ordering applies.
    optional LowLevelOptions low_level = 1;

    // Set the number of classes for the last convolution to predict logits.
    optional int32 num_classes = 2;
  }

  message PanopticDeeplabOptions {
    // Set the low level options used in this decoder. The list of
    // LowLevelOptions must be ordered from lower resolution to higher
    // resolution.
    repeated LowLevelOptions low_level = 1;
    // Set the model options for the instance branch.
    optional InstanceOptions instance = 2;
    // Set the model options of the semantic head.
    optional HeadOptions semantic_head = 3;
    // Set the model options of the depth head.
    optional HeadOptions depth_head = 4;
  }

  message MotionDeepLabOptions {
    // Set the low level options used in this decoder. The list of
    // LowLevelOptions must be ordered from lower resolution to higher
    // resolution.
    repeated LowLevelOptions low_level = 1;
    // Set the model options for the instance branch.
    optional InstanceOptions instance = 2;
    // Set the model options of the semantic head.
    optional HeadOptions semantic_head = 3;
    // Set the model options for the motion head.
    optional HeadOptions motion_head = 4;
  }

  message MaXDeepLabOptions {
    // Set the head options of the mask head.
    optional HeadOptions pixel_space_head = 1;
    // Set the low level options used in the semantic decoder. The list of
    // LowLevelOptions must be ordered from lower resolution to higher
    // resolution.
    repeated LowLevelOptions auxiliary_low_level = 2;
    // Set the head options of the semantic head.
    optional HeadOptions auxiliary_semantic_head = 3;
  }

  // Select the meta architecture. As a oneof, at most one of these fields can
  // be set at a time. Field numbers 6, 8 and 9 are not part of this oneof;
  // they belong to the checkpoint fields declared below.
  oneof meta_architecture {
    DeeplabV3Options deeplab_v3 = 3;
    DeeplabV3PlusOptions deeplab_v3_plus = 4;
    PanopticDeeplabOptions panoptic_deeplab = 5;
    MotionDeepLabOptions motion_deeplab = 7;
    MaXDeepLabOptions max_deeplab = 10;
    // Uses the same option message as `panoptic_deeplab`.
    PanopticDeeplabOptions vip_deeplab = 11;
  }
  // Set the checkpoint to load.
  optional string initial_checkpoint = 6;
  // Set whether to restore the last convolution of the semantic head when
  // loading from the initial checkpoint. Setting this flag to false is useful
  // when an initial checkpoint was trained on a dataset with different classes.
  optional bool restore_semantic_last_layer_from_initial_checkpoint = 8
      [default = true];
  // Set whether to restore the last convolution of the instance heads when
  // loading from the initial checkpoint. Depending on the meta architecture,
  // this includes center heatmap, center regression and motion regression.
  optional bool restore_instance_last_layer_from_initial_checkpoint = 9
      [default = true];
}