
Spatial Pyramid Pooling Layer #2177

Merged: 1 commit, May 15, 2015
66 changes: 66 additions & 0 deletions include/caffe/vision_layers.hpp
@@ -451,6 +451,72 @@ class CuDNNPoolingLayer : public PoolingLayer<Dtype> {
};
#endif

/**
 * @brief Does spatial pyramid pooling on the input image
 * by taking the max, average, etc. within regions,
 * so that the result vectors of differently sized
 * images are of the same size.
 */
template <typename Dtype>
class SPPLayer : public Layer<Dtype> {
public:
explicit SPPLayer(const LayerParameter& param)
: Layer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "SPP"; }
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int MinTopBlobs() const { return 1; }
// MAX POOL layers can output an extra top blob for the mask;
// others can only output the pooled inputs.
virtual inline int MaxTopBlobs() const {
return (this->layer_param_.pooling_param().pool() ==
PoolingParameter_PoolMethod_MAX) ? 2 : 1;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
// Calculates the kernel and stride dimensions for the pooling layer and
// returns a correctly configured LayerParameter for a PoolingLayer.
virtual LayerParameter GetPoolingParam(const int pyramid_level,
const int bottom_h, const int bottom_w, const SPPParameter spp_param);

int pyramid_height_;
int bottom_h_, bottom_w_;
int channels_;
int kernel_h_, kernel_w_;
int pad_h_, pad_w_;

/// the internal Split layer that feeds the pooling layers
shared_ptr<SplitLayer<Dtype> > split_layer_;
/// top vector holder used in call to the underlying SplitLayer::Forward
vector<Blob<Dtype>*> split_top_vec_;
/// bottom vector holder used in call to the underlying PoolingLayer::Forward
vector<vector<Blob<Dtype>*>*> pooling_bottom_vecs_;
/// the internal Pooling layers of different kernel sizes
vector<shared_ptr<PoolingLayer<Dtype> > > pooling_layers_;
/// top vector holders used in call to the underlying PoolingLayer::Forward
vector<vector<Blob<Dtype>*>*> pooling_top_vecs_;
/// pooling_outputs stores the outputs of the PoolingLayers
vector<Blob<Dtype>*> pooling_outputs_;
/// the internal Flatten layers that the Pooling layers feed into
vector<FlattenLayer<Dtype>*> flatten_layers_;
/// top vector holders used in call to the underlying FlattenLayer::Forward
vector<vector<Blob<Dtype>*>*> flatten_top_vecs_;
/// flatten_outputs stores the outputs of the FlattenLayers
vector<Blob<Dtype>*> flatten_outputs_;
/// bottom vector holder used in call to the underlying ConcatLayer::Forward
vector<Blob<Dtype>*> concat_bottom_vec_;
/// the internal Concat layers that the Flatten layers feed into
shared_ptr<ConcatLayer<Dtype> > concat_layer_;
};

} // namespace caffe

#endif // CAFFE_VISION_LAYERS_HPP_
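The class comment above is the crux of SPP: the concatenated output length depends only on the pyramid height and the channel count, never on the input's spatial size. A minimal sketch of that bookkeeping (not part of the PR; it assumes levels 0..P-1 with a 2^l x 2^l bin grid per level, matching GetPoolingParam below, and concatenation along the channel axis):

// Sketch (assumptions as above): number of output features produced by an
// SPP layer of pyramid_height P over a C-channel input, for any input size.
int SppOutputFeatures(int pyramid_height, int channels) {
  int bins = 0;
  for (int l = 0; l < pyramid_height; ++l) {
    bins += (1 << l) * (1 << l);  // a 2^l x 2^l pooling grid at level l
  }
  return channels * bins;
}

For example, pyramid_height = 3 over 256 channels yields 256 * (1 + 4 + 16) = 5376 features whether the image is 13 x 13 or 40 x 67.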
193 changes: 193 additions & 0 deletions src/caffe/layers/spp_layer.cpp
@@ -0,0 +1,193 @@
#include <algorithm>
#include <cfloat>
#include <vector>

#include "caffe/common.hpp"
#include "caffe/layer.hpp"
#include "caffe/syncedmem.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/vision_layers.hpp"

namespace caffe {

using std::min;
using std::max;

template <typename Dtype>
LayerParameter SPPLayer<Dtype>::GetPoolingParam(const int pyramid_level,
const int bottom_h, const int bottom_w, const SPPParameter spp_param) {
LayerParameter pooling_param;
int num_bins = pow(2, pyramid_level);

// find padding and kernel size so that the pooling is
// performed across the entire image
int kernel_h = ceil(bottom_h / static_cast<double>(num_bins));
// remainder_h is the minimum number of pixels that need to be padded before
// the entire image height is pooled over with the chosen kernel dimension
int remainder_h = kernel_h * num_bins - bottom_h;
// the pooling layer pads pad_h pixels on each of the top and bottom of
// the image, i.e. (2 * pad_h) pixels in total
int pad_h = (remainder_h + 1) / 2;

// similar logic for width
int kernel_w = ceil(bottom_w / static_cast<double>(num_bins));
int remainder_w = kernel_w * num_bins - bottom_w;
int pad_w = (remainder_w + 1) / 2;

pooling_param.mutable_pooling_param()->set_pad_h(pad_h);
pooling_param.mutable_pooling_param()->set_pad_w(pad_w);
pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h);
pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w);
pooling_param.mutable_pooling_param()->set_stride_h(kernel_h);
pooling_param.mutable_pooling_param()->set_stride_w(kernel_w);

switch (spp_param.pool()) {
case SPPParameter_PoolMethod_MAX:
pooling_param.mutable_pooling_param()->set_pool(
PoolingParameter_PoolMethod_MAX);
break;
case SPPParameter_PoolMethod_AVE:
pooling_param.mutable_pooling_param()->set_pool(
PoolingParameter_PoolMethod_AVE);
break;
case SPPParameter_PoolMethod_STOCHASTIC:
pooling_param.mutable_pooling_param()->set_pool(
PoolingParameter_PoolMethod_STOCHASTIC);
break;
default:
LOG(FATAL) << "Unknown pooling method.";
}

return pooling_param;
}
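To make the arithmetic above concrete, here is a small standalone sketch (not from the PR) reproducing the per-dimension kernel/stride/pad derivation:

#include <cmath>
#include <cstdio>

// Sketch of GetPoolingParam's arithmetic for one spatial dimension.
void SppBinArithmetic(int pyramid_level, int bottom_h) {
  int num_bins = 1 << pyramid_level;  // 2^pyramid_level bins along this axis
  int kernel_h = static_cast<int>(
      std::ceil(bottom_h / static_cast<double>(num_bins)));
  int remainder_h = kernel_h * num_bins - bottom_h;  // shortfall to pad
  int pad_h = (remainder_h + 1) / 2;  // split across the two borders
  std::printf("bins=%d kernel=%d stride=%d pad=%d\n",
              num_bins, kernel_h, kernel_h, pad_h);
}

With assumed inputs, SppBinArithmetic(2, 13) prints bins=4 kernel=4 stride=4 pad=2, and SppBinArithmetic(2, 40) prints bins=4 kernel=10 stride=10 pad=0: the bin count is fixed by the pyramid level, while kernel, stride, and pad track the input size, which is exactly what the Reshape() discussion below turns on.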

template <typename Dtype>
void SPPLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
SPPParameter spp_param = this->layer_param_.spp_param();

bottom_h_ = bottom[0]->height();
bottom_w_ = bottom[0]->width();
CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero.";
CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero.";

pyramid_height_ = spp_param.pyramid_height();
split_top_vec_.clear();
pooling_bottom_vecs_.clear();
pooling_layers_.clear();
pooling_top_vecs_.clear();
pooling_outputs_.clear();
flatten_layers_.clear();
flatten_top_vecs_.clear();
flatten_outputs_.clear();
concat_bottom_vec_.clear();

// split layer output holders setup
for (int i = 0; i < pyramid_height_; i++) {
split_top_vec_.push_back(new Blob<Dtype>());
}

// split layer setup
LayerParameter split_param;
split_layer_.reset(new SplitLayer<Dtype>(split_param));
split_layer_->SetUp(bottom, split_top_vec_);

for (int i = 0; i < pyramid_height_; i++) {
// pooling layer input holders setup
pooling_bottom_vecs_.push_back(new vector<Blob<Dtype>*>);
pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]);

// pooling layer output holders setup
pooling_outputs_.push_back(new Blob<Dtype>());
pooling_top_vecs_.push_back(new vector<Blob<Dtype>*>);
pooling_top_vecs_[i]->push_back(pooling_outputs_[i]);

// pooling layer setup
Review comment (Member):
The kernel size and stride logic need to be in Reshape(). The number of spatial pyramid pooling bins should stay constant but their dimensions will need to change for each input. Inputs can change shape with (1) reshaping data layers #1313 or (2) calls to net or blob reshape(). When this happens, the kernel size and stride need re-configuring.

Reply (Contributor, Author):
Is there a way to change parameters of a layer without having to set it up again? The only way I could figure out the re-configuring of the kernel size and stride height is by constructing a new LayerParameter, resetting the PoolingLayer with that LayerParameter, and calling the PoolingLayer's SetUp.

Reply (Contributor):
I don't think there is a way to change parameters without deleting and reinitializing the layer -- you could add a setter to Layer but I don't think it would really save anything since the constructor itself is probably basically free (SetUp is probably a little more expensive but you'd have to call that regardless). Do you know if it's an issue in practice?

LayerParameter pooling_param = GetPoolingParam(
i, bottom_h_, bottom_w_, spp_param);

pooling_layers_.push_back(shared_ptr<PoolingLayer<Dtype> > (
new PoolingLayer<Dtype>(pooling_param)));
pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);

// flatten layer output holders setup
flatten_outputs_.push_back(new Blob<Dtype>());
flatten_top_vecs_.push_back(new vector<Blob<Dtype>*>);
flatten_top_vecs_[i]->push_back(flatten_outputs_[i]);

// flatten layer setup
LayerParameter flatten_param;
flatten_layers_.push_back(new FlattenLayer<Dtype>(flatten_param));
flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]);

// concat layer input holders setup
concat_bottom_vec_.push_back(flatten_outputs_[i]);
}

// concat layer setup
LayerParameter concat_param;
concat_layer_.reset(new ConcatLayer<Dtype>(concat_param));
concat_layer_->SetUp(concat_bottom_vec_, top);
}

template <typename Dtype>
void SPPLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
<< "corresponding to (num, channels, height, width)";
channels_ = bottom[0]->channels();
bottom_h_ = bottom[0]->height();
bottom_w_ = bottom[0]->width();
SPPParameter spp_param = this->layer_param_.spp_param();
split_layer_->Reshape(bottom, split_top_vec_);
for (int i = 0; i < pyramid_height_; i++) {
LayerParameter pooling_param = GetPoolingParam(
i, bottom_h_, bottom_w_, spp_param);

pooling_layers_[i].reset(
new PoolingLayer<Dtype>(pooling_param));
pooling_layers_[i]->SetUp(
*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
pooling_layers_[i]->Reshape(
*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
flatten_layers_[i]->Reshape(
*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
}
concat_layer_->Reshape(concat_bottom_vec_, top);
}

template <typename Dtype>
void SPPLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
split_layer_->Forward(bottom, split_top_vec_);
for (int i = 0; i < pyramid_height_; i++) {
pooling_layers_[i]->Forward(
*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]);
flatten_layers_[i]->Forward(
*pooling_top_vecs_[i], *flatten_top_vecs_[i]);
}
concat_layer_->Forward(concat_bottom_vec_, top);
}
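For orientation, a hypothetical shape trace through this forward composition, assuming a 2 x 256 x 13 x 13 input, pyramid_height = 3, and that each Flatten layer collapses its pooled blob to one vector per example before the channel-axis concatenation:

// Hypothetical shape trace (illustrative values, not produced by the code):
//   bottom               : 2 x 256 x 13 x 13
//   split                : three copies of bottom, one per pyramid level
//   pool level 0 (k = 13): 2 x 256 x 1 x 1  -> flatten -> 2 x 256
//   pool level 1 (k = 7) : 2 x 256 x 2 x 2  -> flatten -> 2 x 1024
//   pool level 2 (k = 4) : 2 x 256 x 4 x 4  -> flatten -> 2 x 4096
//   concat               : 2 x 5376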

template <typename Dtype>
void SPPLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
if (!propagate_down[0]) {
return;
}
vector<bool> concat_propagate_down(pyramid_height_, true);
concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_);
for (int i = 0; i < pyramid_height_; i++) {
flatten_layers_[i]->Backward(
*flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]);
pooling_layers_[i]->Backward(
*pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]);
}
split_layer_->Backward(split_top_vec_, propagate_down, bottom);
}


INSTANTIATE_CLASS(SPPLayer);
REGISTER_LAYER_CLASS(SPP);

} // namespace caffe
20 changes: 19 additions & 1 deletion src/caffe/proto/caffe.proto
@@ -259,7 +259,7 @@ message ParamSpec {
// NOTE
// Update the next available ID when you add a new LayerParameter field.
//
// LayerParameter next available layer-specific ID: 132 (last added: prelu_param)
// LayerParameter next available layer-specific ID: 133 (last added: spp_param)
message LayerParameter {
optional string name = 1; // the layer name
optional string type = 2; // the layer type
@@ -328,6 +328,7 @@ message LayerParameter {
optional ReLUParameter relu_param = 123;
optional SigmoidParameter sigmoid_param = 124;
optional SoftmaxParameter softmax_param = 125;
optional SPPParameter spp_param = 132;
optional SliceParameter slice_param = 126;
optional TanHParameter tanh_param = 127;
optional ThresholdParameter threshold_param = 128;
@@ -768,6 +769,23 @@ message WindowDataParameter {
optional string root_folder = 13 [default = ""];
}

// Message that stores parameters used by SPPLayer
message SPPParameter {
enum PoolMethod {
MAX = 0;
AVE = 1;
STOCHASTIC = 2;
}
optional uint32 pyramid_height = 1;
optional PoolMethod pool = 2 [default = MAX]; // The pooling method
enum Engine {
DEFAULT = 0;
CAFFE = 1;
CUDNN = 2;
}
optional Engine engine = 6 [default = DEFAULT];
}
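Given this schema, configuring the layer from C++ through the generated protobuf API could look like the following sketch (the name and values are illustrative; "SPP" is the type string registered in spp_layer.cpp):

#include "caffe/proto/caffe.pb.h"  // generated from caffe.proto

// Sketch: a LayerParameter for a 3-level max-pooling pyramid.
caffe::LayerParameter MakeSppParam() {
  caffe::LayerParameter param;
  param.set_name("spp");  // illustrative layer name
  param.set_type("SPP");  // matches SPPLayer<Dtype>::type()
  param.mutable_spp_param()->set_pyramid_height(3);
  param.mutable_spp_param()->set_pool(caffe::SPPParameter_PoolMethod_MAX);
  return param;
}

An SPPLayer<float> constructed from this parameter would then go through the usual SetUp()/Reshape() calls like any other Caffe layer.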

// DEPRECATED: use LayerParameter.
message V1LayerParameter {
repeated string bottom = 2;