src/operator/correlation.cc - mxnet - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file correlation.cc
  * \brief correlation op
  * \author Xu Dong
  */
 #include "./correlation-inl.h"
 #include "./mshadow_op.h"

 namespace mshadow {
 template <typename Dtype>
 void AddPad(const Tensor<cpu, 4, Dtype>& original, const Tensor<cpu, 4, Dtype>& out, int pad_size) {
   for (index_t nbatch = 0; nbatch < original.size(0); nbatch++)
     for (index_t channel = 0; channel < original.size(1); channel++)
       for (index_t h = 0; h < original.size(2); h++)
         for (index_t w = 0; w < original.size(3); w++)
           out[nbatch][h + pad_size][w + pad_size][channel] = original[nbatch][channel][h][w];
 }
 template <typename Dtype>
 inline void CorrelationForward(const Tensor<cpu, 4, Dtype>& out,
                                const Tensor<cpu, 4, Dtype>& data1,
                                const Tensor<cpu, 4, Dtype>& data2,
                                const Tensor<cpu, 4, Dtype>& tmp1,
                                const Tensor<cpu, 4, Dtype>& tmp2,
                                int top_channels_,
                                int top_height_,
                                int top_width_,
                                int pad_size_,
                                bool is_multiply,
                                int max_displacement_,
                                int kernel_size_,
                                int neighborhood_grid_radius_,
                                int neighborhood_grid_width_,
                                int kernel_radius_,
                                int stride1_,
                                int stride2_) {
   const index_t bnum  = data1.size(0);
   const int bchannels = data1.size(1);
   const int sumelems  = kernel_size_ * kernel_size_ * bchannels;
   AddPad<Dtype>(data1, tmp1, pad_size_);
   index_t top_channels_unsigned_ = static_cast<index_t>(top_channels_);
   AddPad<Dtype>(data2, tmp2, pad_size_);
   for (index_t i = 0; i < static_cast<index_t>(top_height_); i++)
     for (index_t j = 0; j < static_cast<index_t>(top_width_); j++)
       for (index_t nbatch = 0; nbatch < bnum; nbatch++) {
         int x1 = j * stride1_ + max_displacement_;
         int y1 = i * stride1_ + max_displacement_;
         for (index_t top_channel = 0; top_channel < top_channels_unsigned_; top_channel++) {
           int s2o = (top_channel % neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
           int s2p = (top_channel / neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
           int x2  = x1 + s2o;
           int y2  = y1 + s2p;
           for (index_t h = 0; h < static_cast<index_t>(kernel_size_); h++)
             for (index_t w = 0; w < static_cast<index_t>(kernel_size_); w++)
               for (index_t channel = 0; channel < static_cast<index_t>(bchannels); channel++) {
                 if (is_multiply == true)
                   out[nbatch][top_channel][i][j] +=
                       tmp1[nbatch][y1 + h][x1 + w][channel] * tmp2[nbatch][y2 + h][x2 + w][channel];
                 else
                   out[nbatch][top_channel][i][j] += std::abs(tmp1[nbatch][y1 + h][x1 + w][channel] -
                                                              tmp2[nbatch][y2 + h][x2 + w][channel]);
               }
           out[nbatch][top_channel][i][j] /= sumelems;
         }
       }
 }
 template <typename Dtype>
 inline void CorrelationBackward(const Tensor<cpu, 4, Dtype>& out_grad,
                                 const Tensor<cpu, 4, Dtype>& in_grad1,
                                 const Tensor<cpu, 4, Dtype>& in_grad2,
                                 const Tensor<cpu, 4, Dtype>& tmp1,
                                 const Tensor<cpu, 4, Dtype>& tmp2,
                                 int top_channels_,
                                 int top_height_,
                                 int top_width_,
                                 int pad_size_,
                                 bool is_multiply,
                                 int max_displacement_,
                                 int kernel_size_,
                                 int neighborhood_grid_radius_,
                                 int neighborhood_grid_width_,
                                 int kernel_radius_,
                                 int stride1_,
                                 int stride2_,
                                 int num,
                                 int channels,
                                 int height,
                                 int width) {
   const float sumelems = kernel_size_ * kernel_size_ * channels;
   for (index_t i = 0; i < static_cast<index_t>(top_height_); i++)
     for (index_t j = 0; j < static_cast<index_t>(top_width_); j++)
       for (index_t nbatch = 0; nbatch < static_cast<index_t>(num); nbatch++) {
         int x1 = j * stride1_ + max_displacement_;
         int y1 = i * stride1_ + max_displacement_;
         for (int top_channel = 0; top_channel < top_channels_; top_channel++) {
           int s2o = (top_channel % neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
           int s2p = (top_channel / neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
           int x2  = x1 + s2o;
           int y2  = y1 + s2p;
           for (int h = 0; h < kernel_size_; h++)
             for (int w = 0; w < kernel_size_; w++)
               for (int channel = 0; channel < channels; channel++) {
                 if (is_multiply == true) {
                   if ((y1 + h - pad_size_ >= 0) && (x1 + w - pad_size_ >= 0) &&
                       (y1 + h < height + pad_size_) && (x1 + w < width + pad_size_)) {
                     in_grad1[nbatch][channel][y1 + h - pad_size_][x1 + w - pad_size_] +=
                         out_grad[nbatch][top_channel][i][j] *
                         tmp2[nbatch][y2 + h][x2 + w][channel] / sumelems;
                   }
                   if ((y2 + h - pad_size_ >= 0) && (x2 + w - pad_size_ >= 0) &&
                       (y2 + h < height + pad_size_) && (x2 + w < width + pad_size_)) {
                     in_grad2[nbatch][channel][y2 + h - pad_size_][x2 + w - pad_size_] +=
                         out_grad[nbatch][top_channel][i][j] *
                         tmp1[nbatch][y1 + h][x1 + w][channel] / sumelems;
                   }
                 } else {
                   if ((y1 + h - pad_size_ >= 0) && (x1 + w - pad_size_ >= 0) &&
                       (y1 + h < height + pad_size_) && (x1 + w < width + pad_size_)) {
                     Dtype sign = (tmp1[nbatch][y1 + h][x1 + w][channel] >=
                                   tmp2[nbatch][y2 + h][x2 + w][channel]) ?
                                      Dtype(1.0) :
                                      Dtype(-1.0);
                     in_grad1[nbatch][channel][y1 + h - pad_size_][x1 + w - pad_size_] +=
                         out_grad[nbatch][top_channel][i][j] * sign / sumelems;
                   }
                   if ((y2 + h - pad_size_ >= 0) && (x2 + w - pad_size_ >= 0) &&
                       (y2 + h < height + pad_size_) && (x2 + w < width + pad_size_)) {
                     Dtype sign = (tmp1[nbatch][y1 + h][x1 + w][channel] >=
                                   tmp2[nbatch][y2 + h][x2 + w][channel]) ?
                                      Dtype(-1.0) :
                                      Dtype(1.0);
                     in_grad2[nbatch][channel][y2 + h - pad_size_][x2 + w - pad_size_] +=
                         out_grad[nbatch][top_channel][i][j] * sign / sumelems;
                   }
                 }
               }
         }
       }
 }
 }  // namespace mshadow
 namespace mxnet {
 namespace op {
 /*!
  * \brief CPU specialisation of the operator factory.
  *
  * Allocates a CorrelationOp<cpu, DType> with new, where DType is the type name bound by
  * MSHADOW_REAL_TYPE_SWITCH for the given dtype.
  *
  * \param param operator parameters forwarded to the CorrelationOp constructor
  * \param dtype type flag handed to MSHADOW_REAL_TYPE_SWITCH
  * \return the newly allocated operator as a raw pointer; it is still nullptr if the
  *         switch never runs the assignment (what the macro does for an unhandled
  *         dtype is not visible here - TODO confirm)
  */
 template <>
 Operator* CreateOp<cpu>(CorrelationParam param, int dtype) {
   Operator* created = nullptr;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     created = new CorrelationOp<cpu, DType>(param);
   });
   return created;
 }
 /*!
  * \brief Builds the operator by handing CreateOp, param_ and the first entry of in_type
  *        to the DO_BIND_DISPATCH macro.
  *
  * \param ctx not named directly in this body; presumably consumed inside
  *            DO_BIND_DISPATCH to pick the device - TODO confirm
  * \param in_shape not referenced by name in this body
  * \param in_type input type list; only element 0 is read, through the bounds-checked at()
  * \return no explicit return statement is written here, so the result presumably comes
  *         from inside DO_BIND_DISPATCH - TODO confirm
  */
 Operator* CorrelationProp::CreateOperatorEx(Context ctx,
                                             mxnet::ShapeVector* in_shape,
                                             std::vector<int>* in_type) const {
   // at(0) throws std::out_of_range when in_type is empty instead of reading out of bounds.
   DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 // Register the CorrelationParam parameter struct through the DMLC macro.
 DMLC_REGISTER_PARAMETER(CorrelationParam);
 // Register CorrelationProp under the operator name "Correlation": two inputs (data1,
 // data2), the CorrelationParam fields, and the user-facing description below.
 // NOTE(review): the description only mentions multiplicative comparison, while
 // CorrelationForward in this file also has an absolute-difference branch that runs
 // when is_multiply is false - consider documenting it.
 MXNET_REGISTER_OP_PROPERTY(Correlation, CorrelationProp)
     .add_argument("data1", "NDArray-or-Symbol", "Input data1 to the correlation.")
     .add_argument("data2", "NDArray-or-Symbol", "Input data2 to the correlation.")
     .add_arguments(CorrelationParam::__FIELDS__())
     .describe(R"code(Applies correlation to inputs.

 The correlation layer performs multiplicative patch comparisons between two feature maps.

 Given two multi-channel feature maps :math:`f_{1}, f_{2}`, with :math:`w`, :math:`h`, and :math:`c` being their width, height, and number of channels,
 the correlation layer lets the network compare each patch from :math:`f_{1}` with each patch from :math:`f_{2}`.

 For now we consider only a single comparison of two patches. The 'correlation' of two patches centered at :math:`x_{1}` in the first map and
 :math:`x_{2}` in the second map is then defined as:

 .. math::

    c(x_{1}, x_{2}) = \sum_{o \in [-k,k] \times [-k,k]} <f_{1}(x_{1} + o), f_{2}(x_{2} + o)>

 for a square patch of size :math:`K:=2k+1`.

 Note that the equation above is identical to one step of a convolution in neural networks, but instead of convolving data with a filter, it convolves data with other
 data. For this reason, it has no training weights.

 Computing :math:`c(x_{1}, x_{2})` involves :math:`c * K^{2}` multiplications. Comparing all patch combinations involves :math:`w^{2}*h^{2}` such computations.

 Given a maximum displacement :math:`d`, for each location :math:`x_{1}` it computes correlations :math:`c(x_{1}, x_{2})` only in a neighborhood of size :math:`D:=2d+1`,
 by limiting the range of :math:`x_{2}`. We use strides :math:`s_{1}, s_{2}`, to quantize :math:`x_{1}` globally and to quantize :math:`x_{2}` within the neighborhood
 centered around :math:`x_{1}`.

 The final output is defined by the following expression:

 .. math::
   out[n, q, i, j] = c(x_{i, j}, x_{q})

 where :math:`i` and :math:`j` enumerate spatial locations in :math:`f_{1}`, and :math:`q` denotes the :math:`q^{th}` neighborhood of :math:`x_{i,j}`.
 )code" ADD_FILELINE);
 }  // namespace op
 }  // namespace mxnet
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file correlation.cc
	* \brief correlation op
	* \author Xu Dong
	*/
	#include "./correlation-inl.h"
	#include "./mshadow_op.h"

	namespace mshadow {
	template <typename Dtype>
	void AddPad(const Tensor<cpu, 4, Dtype>& original, const Tensor<cpu, 4, Dtype>& out, int pad_size) {
	for (index_t nbatch = 0; nbatch < original.size(0); nbatch++)
	for (index_t channel = 0; channel < original.size(1); channel++)
	for (index_t h = 0; h < original.size(2); h++)
	for (index_t w = 0; w < original.size(3); w++)
	out[nbatch][h + pad_size][w + pad_size][channel] = original[nbatch][channel][h][w];
	}
	template <typename Dtype>
	inline void CorrelationForward(const Tensor<cpu, 4, Dtype>& out,
	const Tensor<cpu, 4, Dtype>& data1,
	const Tensor<cpu, 4, Dtype>& data2,
	const Tensor<cpu, 4, Dtype>& tmp1,
	const Tensor<cpu, 4, Dtype>& tmp2,
	int top_channels_,
	int top_height_,
	int top_width_,
	int pad_size_,
	bool is_multiply,
	int max_displacement_,
	int kernel_size_,
	int neighborhood_grid_radius_,
	int neighborhood_grid_width_,
	int kernel_radius_,
	int stride1_,
	int stride2_) {
	const index_t bnum = data1.size(0);
	const int bchannels = data1.size(1);
	const int sumelems = kernel_size_ * kernel_size_ * bchannels;
	AddPad<Dtype>(data1, tmp1, pad_size_);
	index_t top_channels_unsigned_ = static_cast<index_t>(top_channels_);
	AddPad<Dtype>(data2, tmp2, pad_size_);
	for (index_t i = 0; i < static_cast<index_t>(top_height_); i++)
	for (index_t j = 0; j < static_cast<index_t>(top_width_); j++)
	for (index_t nbatch = 0; nbatch < bnum; nbatch++) {
	int x1 = j * stride1_ + max_displacement_;
	int y1 = i * stride1_ + max_displacement_;
	for (index_t top_channel = 0; top_channel < top_channels_unsigned_; top_channel++) {
	int s2o = (top_channel % neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
	int s2p = (top_channel / neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
	int x2 = x1 + s2o;
	int y2 = y1 + s2p;
	for (index_t h = 0; h < static_cast<index_t>(kernel_size_); h++)
	for (index_t w = 0; w < static_cast<index_t>(kernel_size_); w++)
	for (index_t channel = 0; channel < static_cast<index_t>(bchannels); channel++) {
	if (is_multiply == true)
	out[nbatch][top_channel][i][j] +=
	tmp1[nbatch][y1 + h][x1 + w][channel] * tmp2[nbatch][y2 + h][x2 + w][channel];
	else
	out[nbatch][top_channel][i][j] += std::abs(tmp1[nbatch][y1 + h][x1 + w][channel] -
	tmp2[nbatch][y2 + h][x2 + w][channel]);
	}
	out[nbatch][top_channel][i][j] /= sumelems;
	}
	}
	}
	template <typename Dtype>
	inline void CorrelationBackward(const Tensor<cpu, 4, Dtype>& out_grad,
	const Tensor<cpu, 4, Dtype>& in_grad1,
	const Tensor<cpu, 4, Dtype>& in_grad2,
	const Tensor<cpu, 4, Dtype>& tmp1,
	const Tensor<cpu, 4, Dtype>& tmp2,
	int top_channels_,
	int top_height_,
	int top_width_,
	int pad_size_,
	bool is_multiply,
	int max_displacement_,
	int kernel_size_,
	int neighborhood_grid_radius_,
	int neighborhood_grid_width_,
	int kernel_radius_,
	int stride1_,
	int stride2_,
	int num,
	int channels,
	int height,
	int width) {
	const float sumelems = kernel_size_ * kernel_size_ * channels;
	for (index_t i = 0; i < static_cast<index_t>(top_height_); i++)
	for (index_t j = 0; j < static_cast<index_t>(top_width_); j++)
	for (index_t nbatch = 0; nbatch < static_cast<index_t>(num); nbatch++) {
	int x1 = j * stride1_ + max_displacement_;
	int y1 = i * stride1_ + max_displacement_;
	for (int top_channel = 0; top_channel < top_channels_; top_channel++) {
	int s2o = (top_channel % neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
	int s2p = (top_channel / neighborhood_grid_width_ - neighborhood_grid_radius_) * stride2_;
	int x2 = x1 + s2o;
	int y2 = y1 + s2p;
	for (int h = 0; h < kernel_size_; h++)
	for (int w = 0; w < kernel_size_; w++)
	for (int channel = 0; channel < channels; channel++) {
	if (is_multiply == true) {
	if ((y1 + h - pad_size_ >= 0) && (x1 + w - pad_size_ >= 0) &&
	(y1 + h < height + pad_size_) && (x1 + w < width + pad_size_)) {
	in_grad1[nbatch][channel][y1 + h - pad_size_][x1 + w - pad_size_] +=
	out_grad[nbatch][top_channel][i][j] *
	tmp2[nbatch][y2 + h][x2 + w][channel] / sumelems;
	}
	if ((y2 + h - pad_size_ >= 0) && (x2 + w - pad_size_ >= 0) &&
	(y2 + h < height + pad_size_) && (x2 + w < width + pad_size_)) {
	in_grad2[nbatch][channel][y2 + h - pad_size_][x2 + w - pad_size_] +=
	out_grad[nbatch][top_channel][i][j] *
	tmp1[nbatch][y1 + h][x1 + w][channel] / sumelems;
	}
	} else {
	if ((y1 + h - pad_size_ >= 0) && (x1 + w - pad_size_ >= 0) &&
	(y1 + h < height + pad_size_) && (x1 + w < width + pad_size_)) {
	Dtype sign = (tmp1[nbatch][y1 + h][x1 + w][channel] >=
	tmp2[nbatch][y2 + h][x2 + w][channel]) ?
	Dtype(1.0) :
	Dtype(-1.0);
	in_grad1[nbatch][channel][y1 + h - pad_size_][x1 + w - pad_size_] +=
	out_grad[nbatch][top_channel][i][j] * sign / sumelems;
	}
	if ((y2 + h - pad_size_ >= 0) && (x2 + w - pad_size_ >= 0) &&
	(y2 + h < height + pad_size_) && (x2 + w < width + pad_size_)) {
	Dtype sign = (tmp1[nbatch][y1 + h][x1 + w][channel] >=
	tmp2[nbatch][y2 + h][x2 + w][channel]) ?
	Dtype(-1.0) :
	Dtype(1.0);
	in_grad2[nbatch][channel][y2 + h - pad_size_][x2 + w - pad_size_] +=
	out_grad[nbatch][top_channel][i][j] * sign / sumelems;
	}
	}
	}
	}
	}
	}
	} // namespace mshadow
	namespace mxnet {
	namespace op {
	/*!
	 * \brief CPU specialisation of the operator factory.
	 *
	 * Allocates a CorrelationOp<cpu, DType> with new, where DType is the type name bound by
	 * MSHADOW_REAL_TYPE_SWITCH for the given dtype.
	 *
	 * \param param operator parameters forwarded to the CorrelationOp constructor
	 * \param dtype type flag handed to MSHADOW_REAL_TYPE_SWITCH
	 * \return the newly allocated operator as a raw pointer; it is still nullptr if the
	 *         switch never runs the assignment (what the macro does for an unhandled
	 *         dtype is not visible here - TODO confirm)
	 */
	template <>
	Operator* CreateOp<cpu>(CorrelationParam param, int dtype) {
	  Operator* created = nullptr;
	  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
	    created = new CorrelationOp<cpu, DType>(param);
	  });
	  return created;
	}
	/*!
	 * \brief Builds the operator by handing CreateOp, param_ and the first entry of in_type
	 *        to the DO_BIND_DISPATCH macro.
	 *
	 * \param ctx not named directly in this body; presumably consumed inside
	 *            DO_BIND_DISPATCH to pick the device - TODO confirm
	 * \param in_shape not referenced by name in this body
	 * \param in_type input type list; only element 0 is read, through the bounds-checked at()
	 * \return no explicit return statement is written here, so the result presumably comes
	 *         from inside DO_BIND_DISPATCH - TODO confirm
	 */
	Operator* CorrelationProp::CreateOperatorEx(Context ctx,
	mxnet::ShapeVector* in_shape,
	std::vector<int>* in_type) const {
	// at(0) throws std::out_of_range when in_type is empty instead of reading out of bounds.
	DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
	}
	// Register the CorrelationParam parameter struct through the DMLC macro.
	DMLC_REGISTER_PARAMETER(CorrelationParam);
	// Register CorrelationProp under the operator name "Correlation": two inputs (data1,
	// data2), the CorrelationParam fields, and the user-facing description below.
	// NOTE(review): the description only mentions multiplicative comparison, while
	// CorrelationForward in this file also has an absolute-difference branch that runs
	// when is_multiply is false - consider documenting it.
	MXNET_REGISTER_OP_PROPERTY(Correlation, CorrelationProp)
	.add_argument("data1", "NDArray-or-Symbol", "Input data1 to the correlation.")
	.add_argument("data2", "NDArray-or-Symbol", "Input data2 to the correlation.")
	.add_arguments(CorrelationParam::__FIELDS__())
	.describe(R"code(Applies correlation to inputs.

	The correlation layer performs multiplicative patch comparisons between two feature maps.

	Given two multi-channel feature maps :math:`f_{1}, f_{2}`, with :math:`w`, :math:`h`, and :math:`c` being their width, height, and number of channels,
	the correlation layer lets the network compare each patch from :math:`f_{1}` with each patch from :math:`f_{2}`.

	For now we consider only a single comparison of two patches. The 'correlation' of two patches centered at :math:`x_{1}` in the first map and
	:math:`x_{2}` in the second map is then defined as:

	.. math::

	c(x_{1}, x_{2}) = \sum_{o \in [-k,k] \times [-k,k]} <f_{1}(x_{1} + o), f_{2}(x_{2} + o)>

	for a square patch of size :math:`K:=2k+1`.

	Note that the equation above is identical to one step of a convolution in neural networks, but instead of convolving data with a filter, it convolves data with other
	data. For this reason, it has no training weights.

	Computing :math:`c(x_{1}, x_{2})` involves :math:`c * K^{2}` multiplications. Comparing all patch combinations involves :math:`w^{2}*h^{2}` such computations.

	Given a maximum displacement :math:`d`, for each location :math:`x_{1}` it computes correlations :math:`c(x_{1}, x_{2})` only in a neighborhood of size :math:`D:=2d+1`,
	by limiting the range of :math:`x_{2}`. We use strides :math:`s_{1}, s_{2}`, to quantize :math:`x_{1}` globally and to quantize :math:`x_{2}` within the neighborhood
	centered around :math:`x_{1}`.

	The final output is defined by the following expression:

	.. math::
	out[n, q, i, j] = c(x_{i, j}, x_{q})

	where :math:`i` and :math:`j` enumerate spatial locations in :math:`f_{1}`, and :math:`q` denotes the :math:`q^{th}` neighborhood of :math:`x_{i,j}`.
	)code" ADD_FILELINE);
	} // namespace op
	} // namespace mxnet