| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * Copyright [2016] <Contributors> |
 * \file correlation.cu
| * \brief Correlation operator |
| * \author Xu Dong |
| */ |
| #include "./correlation-inl.h" |
| #include <mshadow/tensor.h> |
| #include <mshadow/cuda/reduce.cuh> |
| #include <algorithm> |
| #include <vector> |
| |
| #define ROUND_OFF 50000 |
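// ROUND_OFF enables floor/ceil-style integer division for possibly negative
// numerators: C++ integer division truncates toward zero, which differs from
// floor() for negative values. Adding a large positive multiple of the divisor
// before dividing (and subtracting it afterwards) keeps the numerator positive,
// so truncation equals floor. Worked example (illustrative values only),
// with stride = 2 and R = ROUND_OFF = 50000:
//   floor(-3 / 2) = -2, but plain C++ gives -3 / 2 == -1;
//   with the trick: (-3 + 2*R) / 2 - R = 99997 / 2 - 50000 = 49998 - 50000 = -2.
//   ceil(-3 / 2) = -1;
//   with the trick: (-3 + 2*R - 1) / 2 + 1 - R = 49998 + 1 - 50000 = -1.
// This holds as long as the numerator's magnitude stays below stride * ROUND_OFF.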
| #define WARPS_PER_BLOCK 1 |
| #define THREADS_PER_WARP 32 |
| #define CORRELATION_CUDA_CHECK(condition) \ |
| /* Code block avoids redefinition of cudaError_t error */ \ |
| do { \ |
| cudaError_t error = condition; \ |
| CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ |
| } while (0) |
| #define CUDA_KERNEL_LOOP(i, n) \ |
| for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ |
| i < (n); \ |
| i += blockDim.x * gridDim.x) |
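// CUDA_KERNEL_LOOP is the standard grid-stride loop: each thread starts at its
// global index and strides by the total thread count, so any grid size covers
// all n elements. Minimal usage sketch (illustrative only, not part of this
// operator):
//   __global__ void scale(const float *in, float *out, int n, float a) {
//     CUDA_KERNEL_LOOP(i, n) { out[i] = a * in[i]; }
//   }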
| namespace mshadow { |
| namespace cuda { |
| // == Correlation Kernel |
| template <typename Dtype> |
| __global__ void CorrelateData(const int nthreads, int num, int topwidth, |
| int topheight, int topchannels, int topcount, |
| int max_displacement, int neighborhood_grid_radius, |
| int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2, |
| int bottomwidth, int bottomheight, int bottomchannels, |
| const Dtype *bottom0, const Dtype *bottom1, Dtype *top) { |
| extern __shared__ char patch_data_char[]; |
| Dtype *patch_data = reinterpret_cast<Dtype *>(patch_data_char); |
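  // patch_data holds one kernel_size x kernel_size x bottomchannels patch of
  // image 1 in dynamic shared memory; the buffer is sized at launch by
  // Forward_gpu as (kernel_size_ * kernel_size_) * bchannels * sizeof(Dtype).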
  // Upper-left corner of the kernel patch at the current
  // neighborhood center position in image 1
| int x1 = blockIdx.x * stride1 + max_displacement; |
| int y1 = blockIdx.y * stride1 + max_displacement; |
| int item = blockIdx.z; |
| int ch_off = threadIdx.x; |
  // Load 3D patch into shared memory
| for (int j = 0; j < kernel_size; j++) { // HEIGHT |
| for (int i = 0; i < kernel_size; i++) { // WIDTH |
| int ji_off = ((j * kernel_size) + i) * bottomchannels; |
| for (int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK)) { |
| // CHANNELS |
| int idx1 = ((item * bottomheight + y1+j) * bottomwidth + x1+i) * bottomchannels + ch; |
| int idxPatchData = ji_off + ch; |
| patch_data[idxPatchData] = bottom0[idx1]; |
| } |
| } |
| } |
| __syncthreads(); |
| __shared__ Dtype sum[THREADS_PER_WARP * WARPS_PER_BLOCK]; |
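  // Each of the 32 threads in the block accumulates a partial dot product over
  // a stripe of channels; thread 0 then sums the 32 partials below.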
| // Compute correlation |
| for (int top_channel = 0; top_channel < topchannels; top_channel++) { |
| sum[ch_off] = 0; |
| int s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2; |
| int s2p = (top_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride2; |
| for (int j = 0; j < kernel_size; j++) { // HEIGHT |
| for (int i = 0; i < kernel_size; i++) { // WIDTH |
| int ji_off = ((j * kernel_size) + i) * bottomchannels; |
| for (int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK)) { |
| // CHANNELS |
| int x2 = x1 + s2o; |
| int y2 = y1 + s2p; |
| int idxPatchData = ji_off + ch; |
| int idx2 = ((item * bottomheight + y2 + j) * bottomwidth + x2 + i) * bottomchannels + ch; |
| sum[ch_off] += patch_data[idxPatchData] * bottom1[idx2]; |
| } |
| } |
| } |
| __syncthreads(); |
| if (ch_off == 0) { |
| Dtype total_sum = 0; |
| for (int idx = 0; idx < THREADS_PER_WARP * WARPS_PER_BLOCK; idx++) { |
| total_sum += sum[idx]; |
| } |
| const int sumelems = kernel_size * kernel_size * bottomchannels; |
| const int index = ((top_channel * topheight + blockIdx.y) * topwidth) + blockIdx.x; |
| top[index + item*topcount] = total_sum / static_cast<float>(sumelems); |
| } // Aggregate result of different threads |
| } |
| } |
// == Correlation Backward Pass Kernel (For Blob 0)
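// For a bottom pixel at padded position (l, m), the forward pass reads it for
// every top pixel x with x*stride1 + max_displacement <= l and
// l <= x*stride1 + max_displacement + 2*kernel_radius, i.e.
//   ceil((l - 2*kernel_radius - max_displacement) / stride1) <= x
//     <= floor((l - max_displacement) / stride1),
// which is exactly the [xmin, xmax] (and analogously [ymin, ymax]) range
// computed below.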
| template <typename Dtype> |
| __global__ void CorrelateDataBackward0(const int nthreads, int num, int item, |
| int topwidth, int topheight, int topchannels, |
| int max_displacement, int neighborhood_grid_radius, |
| int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, |
| int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, |
| int bottomchannels, int bottomcount, int pad_size, |
| Dtype *bottom0diff, const Dtype *bottom1, const Dtype *topdiff) { |
| CUDA_KERNEL_LOOP(index, nthreads) { |
| int n = index % bottomchannels; // channels |
| int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos |
| int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos |
| // Get X,Y ranges and clamp |
| // round_off is a trick to enable integer division with ceil, even for negative numbers |
    // We use a large offset so that the inner expression never becomes negative.
| const int round_off = ROUND_OFF; |
| const int round_off_s1 = stride1 * round_off; |
    // We add round_off_s1 before the int division and subtract round_off after it,
| // to ensure the formula matches ceil behavior: |
| int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ |
| / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 |
| int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ |
 / stride1 + 1 - round_off;  // ceil (m - 2*kernel_radius - max_displacement) / stride1
| // Same here: |
| int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; |
| // floor (l - max_displacement) / stride1 |
| int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off; |
| // floor (m - max_displacement) / stride1 |
| Dtype sum = 0; |
| if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth-1) && (ymin <= topheight-1)) { |
| xmin = max(0, xmin); |
| xmax = min(topwidth-1, xmax); |
| ymin = max(0, ymin); |
| ymax = min(topheight-1, ymax); |
| for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { |
| for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { |
| // Get bottom1 data: |
| int s2o = stride2 * o; |
| int s2p = stride2 * p; |
| int idxbot1 = ((item * pbottomheight + (m + s2p)) * pbottomwidth + (l + s2o))\ |
| * bottomchannels + n; |
| Dtype bot1tmp = bottom1[idxbot1]; // bottom1[l+s2o,m+s2p,n] |
| // Index offset for topdiff in following loops: |
| int op = (p+neighborhood_grid_radius) * neighborhood_grid_width\ |
| + (o + neighborhood_grid_radius); // index [o,p] |
| int idxopoffset = (item * topchannels + op); |
| for (int y = ymin; y <= ymax; y++) { |
| for (int x = xmin; x <= xmax; x++) { |
| int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] |
| sum += topdiff[idxtopdiff] * bot1tmp; |
| } |
| } |
| } |
| } |
| } |
    const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels;
| const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size); |
| bottom0diff[bot0index + item * bottomcount] = sum / static_cast<float>(sumelems); |
| } |
| } |
| // == Correlation Backward Pass Kernel (For Blob 1) |
| template <typename Dtype> |
| __global__ void CorrelateDataBackward1(const int nthreads, |
| int num, int item, int topwidth, int topheight, int topchannels, |
| int max_displacement, int neighborhood_grid_radius, |
| int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, |
| int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, |
| int bottomchannels, int bottomcount, int pad_size, |
| const Dtype *bottom0, Dtype *bottom1diff, const Dtype *topdiff) { |
| CUDA_KERNEL_LOOP(index, nthreads) { |
| int n = index % bottomchannels; // channels |
| int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos |
| int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos |
| // round_off is a trick to enable integer division with ceil, even for negative numbers |
    // We use a large offset so that the inner expression never becomes negative.
| const int round_off = ROUND_OFF; |
| const int round_off_s1 = stride1 * round_off; |
| Dtype sum = 0; |
| for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { |
| for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { |
| int s2o = stride2 * o; |
| int s2p = stride2 * p; |
| // Get X,Y ranges and clamp |
        // We add round_off_s1 before the int division and subtract round_off after it,
| // to ensure the formula matches ceil behavior: |
| int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1)\ |
| / stride1 + 1 - round_off; |
| // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 |
| int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1)\ |
| / stride1 + 1 - round_off; |
        // ceil (m - 2*kernel_radius - max_displacement - s2p) / stride1
| // Same here: |
| int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; |
| // floor (l - max_displacement - s2o) / stride1 |
| int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off; |
| // floor (m - max_displacement - s2p) / stride1 |
| if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth - 1) && (ymin <= topheight - 1)) { |
| xmin = max(0, xmin); |
| xmax = min(topwidth-1, xmax); |
| ymin = max(0, ymin); |
| ymax = min(topheight-1, ymax); |
| // Get bottom0 data: |
| int idxbot0 = ((item * pbottomheight + (m - s2p)) \ |
| * pbottomwidth + (l - s2o)) * bottomchannels + n; |
          Dtype bot0tmp = bottom0[idxbot0];  // bottom0[l-s2o,m-s2p,n]
| // Index offset for topdiff in following loops: |
| int op = (p+neighborhood_grid_radius) * \ |
| neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] |
| int idxOpOffset = (item * topchannels + op); |
| for (int y = ymin; y <= ymax; y++) { |
| for (int x = xmin; x <= xmax; x++) { |
| int idxtopdiff = (idxOpOffset * topheight + y)\ |
| * topwidth + x; // topdiff[x,y,o,p] |
| sum += topdiff[idxtopdiff] * bot0tmp; |
| } |
| } |
| } |
| } |
| } |
    const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels;
| const int bot1index = ((n * bottomheight) + (m - pad_size)) * bottomwidth + (l - pad_size); |
| bottom1diff[bot1index + item * bottomcount] = sum / static_cast<float>(sumelems); |
| } |
| } |
| // == Correlation Kernel Subtraction |
| template <typename Dtype> |
| __global__ void CorrelateDataSubtract(const int nthreads, int num, int item, |
| int topwidth, int topheight, int topchannels, int topcount, |
| int max_displacement, int neighborhood_grid_radius, |
| int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, |
| int bottomwidth, int bottomheight, int bottomchannels, |
| const Dtype *bottom0, const Dtype *bottom1, Dtype *top) { |
| CUDA_KERNEL_LOOP(index, nthreads) { |
| int x = index % topwidth; // w-pos |
| int y = (index / topwidth) % topheight; // h-pos |
| int c = (index / topwidth / topheight) % topchannels; // channels |
| // Offset of patch in image 2 |
| int s2o = (c % neighborhood_grid_width - neighborhood_grid_radius) * stride2; |
| int s2p = (c / neighborhood_grid_width - neighborhood_grid_radius) * stride2; |
    // Position of the kernel center for the current top pixel's neighborhood in image 1
| int x1 = x*stride1 + kernel_radius + max_displacement; |
| int y1 = y*stride1 + kernel_radius + max_displacement; |
| // Iterate through 3D patch |
| Dtype sum = 0; |
| for (int j = -kernel_radius; j <= kernel_radius; j++) { // HEIGHT |
| for (int i = -kernel_radius; i <= kernel_radius; i++) { // WIDTH |
| for (int l = 0; l < bottomchannels; l++) { // CHANNELS |
| // Calculate position in image 2 |
| int x2 = x1 + s2o; |
| int y2 = y1 + s2p; |
| // Indices in bottom data: (CH=l,W=x2,H=y2,N) |
| int idx1 = ((item * bottomheight + y1 + j) * bottomwidth + x1 + i) \ |
| * bottomchannels + l; |
| int idx2 = ((item * bottomheight + y2 + j) * bottomwidth + x2 + i) \ |
| * bottomchannels + l; |
| // Do the correlation: |
| sum += fabsf(bottom0[idx1] - bottom1[idx2]); |
| } |
| } |
| } |
| const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels; |
| top[index + item * topcount] = sum / static_cast<float>(sumelems); |
| } |
| } |
| // == Correlation Backward Pass Kernel (For Blob 0) |
| template <typename Dtype> |
| __global__ void CorrelateDataBackward0Subtract(const int nthreads, int num, |
| int item, int topwidth, int topheight, int topchannels, |
| int max_displacement, int neighborhood_grid_radius, |
| int neighborhood_grid_width, int kernel_radius, |
| int stride1, int stride2, int bottomwidth, int bottomheight, |
| int pbottomwidth, int pbottomheight, |
| int bottomchannels, int bottomcount, int pad_size, |
| Dtype *bottom0diff, const Dtype *bottom0, const Dtype *bottom1, const Dtype *topdiff) { |
| CUDA_KERNEL_LOOP(index, nthreads) { |
| int n = index % bottomchannels; // channels |
| int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos |
| int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos |
| // Get X,Y ranges and clamp |
| // round_off is a trick to enable integer division with ceil, even for negative numbers |
    // We use a large offset so that the inner expression never becomes negative.
| const int round_off = ROUND_OFF; |
| const int round_off_s1 = stride1 * round_off; |
| int idxbot0 = ((item * pbottomheight + m) * pbottomwidth + l)\ |
| * bottomchannels + n; |
    // We add round_off_s1 before the int division and subtract round_off after it,
| // to ensure the formula matches ceil behavior: |
| int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ |
| / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 |
| int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ |
 / stride1 + 1 - round_off;  // ceil (m - 2*kernel_radius - max_displacement) / stride1
| // Same here: |
| int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; |
| // floor (l - max_displacement) / stride1 |
| int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off; |
| // floor (m - max_displacement) / stride1 |
| Dtype sum = 0; |
| if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth-1) && (ymin <= topheight-1)) { |
| xmin = max(0, xmin); |
| xmax = min(topwidth-1, xmax); |
| ymin = max(0, ymin); |
| ymax = min(topheight-1, ymax); |
| for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { |
| for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { |
| // Get bottom1 data: |
| int s2o = stride2 * o; |
| int s2p = stride2 * p; |
| int idxbot1 = ((item * pbottomheight + (m+s2p)) * pbottomwidth\ |
| + (l+s2o)) * bottomchannels + n; |
| Dtype bot0tmp = bottom0[idxbot0]; |
| Dtype bot1tmp = bottom1[idxbot1]; |
| Dtype sign = (bot0tmp >= bot1tmp) ? Dtype(1.0) : Dtype(-1.0); |
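            // d/d(bottom0) |bottom0 - bottom1| = +1 if bottom0 >= bottom1,
            // else -1 (the subgradient at equality is taken as +1)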
| // Index offset for topdiff in following loops: |
| int op = (p+neighborhood_grid_radius) * neighborhood_grid_width\ |
| + (o + neighborhood_grid_radius); // index [o,p] |
| int idxopoffset = (item * topchannels + op); |
| for (int y = ymin; y <= ymax; y++) { |
| for (int x = xmin; x <= xmax; x++) { |
| int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] |
| sum += topdiff[idxtopdiff] * sign; |
| } |
| } |
| } |
| } |
| } |
    const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels;
| const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size); |
| bottom0diff[bot0index + item * bottomcount] = sum / static_cast<float>(sumelems); |
| } |
| } |
| // == Correlation Backward Pass Kernel (For Blob 1) |
| template <typename Dtype> |
| __global__ void CorrelateDataBackward1Subtract(const int nthreads, int num, |
| int item, int topwidth, int topheight, int topchannels, |
| int max_displacement, int neighborhood_grid_radius, |
| int neighborhood_grid_width, int kernel_radius, |
| int stride1, int stride2, int bottomwidth, int bottomheight, |
| int pbottomwidth, int pbottomheight, int bottomchannels, |
| int bottomcount, int pad_size, const Dtype *bottom0, |
| const Dtype *bottom1, Dtype *bottom1diff, const Dtype *topdiff) { |
| CUDA_KERNEL_LOOP(index, nthreads) { |
| int n = index % bottomchannels; // channels |
| int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos |
| int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos |
| // round_off is a trick to enable integer division with ceil, even for negative numbers |
    // We use a large offset so that the inner expression never becomes negative.
| const int round_off = ROUND_OFF; |
| const int round_off_s1 = stride1 * round_off; |
| Dtype sum = 0; |
| int idxbot1 = ((item * pbottomheight + m) * pbottomwidth + l)\ |
| * bottomchannels + n; |
| for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { |
| for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { |
| int s2o = stride2 * o; |
| int s2p = stride2 * p; |
| // Get X,Y ranges and clamp |
        // We add round_off_s1 before the int division and subtract round_off after it,
| // to ensure the formula matches ceil behavior: |
| int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1)\ |
| / stride1 + 1 - round_off; |
| // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 |
| int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1)\ |
| / stride1 + 1 - round_off; |
        // ceil (m - 2*kernel_radius - max_displacement - s2p) / stride1
| // Same here: |
| int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; |
| // floor (l - max_displacement - s2o) / stride1 |
| int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off; |
| // floor (m - max_displacement - s2p) / stride1 |
| if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth - 1) && (ymin <= topheight - 1)) { |
| xmin = max(0, xmin); |
| xmax = min(topwidth-1, xmax); |
| ymin = max(0, ymin); |
| ymax = min(topheight-1, ymax); |
| // Get bottom0 data: |
| int idxbot0 = ((item * pbottomheight + (m - s2p)) * pbottomwidth + (l - s2o))\ |
| * bottomchannels + n; |
          // bottom0[l-s2o,m-s2p,n]
| Dtype bot0tmp = bottom0[idxbot0]; |
| Dtype bot1tmp = bottom1[idxbot1]; |
| Dtype sign = (bot0tmp >= bot1tmp) ? Dtype(-1.0) : Dtype(1.0); |
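          // d/d(bottom1) |bottom0 - bottom1| = -1 if bottom0 >= bottom1,
          // else +1 (opposite sign of the bottom0 gradient)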
| // Index offset for topdiff in following loops: |
| int op = (p+neighborhood_grid_radius) * \ |
| neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] |
| int idxOpOffset = (item * topchannels + op); |
| for (int y = ymin; y <= ymax; y++) { |
| for (int x = xmin; x <= xmax; x++) { |
| int idxtopdiff = (idxOpOffset * topheight + y)\ |
| * topwidth + x; // topdiff[x,y,o,p] |
| sum += topdiff[idxtopdiff] * sign; |
| } |
| } |
| } |
| } |
| } |
    const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels;
| const int bot1index = ((n * bottomheight) + (m - pad_size)) * bottomwidth + (l - pad_size); |
| bottom1diff[bot1index + item * bottomcount] = sum / static_cast<float>(sumelems); |
| } |
| } |
| // == Forward |
| // == Dimension rearrangement Kernel |
| template <typename Dtype> |
| __global__ void blob_rearrange_kernel2(const Dtype* in, Dtype* out, int num, |
| int channels, int width, int height, int widthheight, int padding, int pwidthheight) { |
  // change shape from [batchsize,channel,y,x] to [batchsize,y,x,channel],
  // padding each spatial side by `padding`
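  // Illustrative index mapping (example values only): with width = 4 and
  // padding = 1, input pixel xy = 5 (x = 1, y = 1) maps to xpad = 2, ypad = 2,
  // i.e. xypad = 2 * (4 + 2) + 2 = 14 in the padded NHWC output.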
| int xy = blockIdx.x * blockDim.x + threadIdx.x; |
  if (xy >= widthheight)
| return; |
| int ch = blockIdx.y; |
| int n = blockIdx.z; |
| Dtype value = in[(n * channels + ch) * widthheight + xy]; |
| __syncthreads(); |
| int xpad = (xy % width + padding); |
| int ypad = (xy / width + padding); |
| int xypad = ypad * (width + 2 * padding) + xpad; |
| out[(n * pwidthheight + xypad) * channels + ch] = value; |
| } |
| template <typename Dtype> |
| void Forward_gpu( |
| const Tensor<gpu, 4, Dtype> &out, |
| const Tensor<gpu, 4, Dtype> &data1, |
| const Tensor<gpu, 4, Dtype> &data2, |
| const Tensor<gpu, 4, Dtype> &tmp1, |
| const Tensor<gpu, 4, Dtype> &tmp2, |
| int top_channels_, int top_height_, int top_width_, int pad_size_, |
| bool is_multiply, int max_displacement_, int kernel_size_, |
| int neighborhood_grid_radius_, int neighborhood_grid_width_, |
| int kernel_radius_, int stride1_, int stride2_, cudaStream_t stream, |
| cudaStream_t stream_tmp1, cudaStream_t stream_tmp2) { |
| const Dtype *bottom_data1 = data1.dptr_; |
| const Dtype *bottom_data2 = data2.dptr_; |
| Dtype *rbot1 = tmp1.dptr_; |
| Dtype *rbot2 = tmp2.dptr_; |
| Dtype *top = out.dptr_; |
| const int bnum = data1.size(0); |
| const int bchannels = data1.size(1); |
| const int bheight = data1.size(2); |
| const int bwidth = data1.size(3); |
| const int bwidthheight = bwidth * bheight; |
| const int topcount = top_width_ * top_height_ * top_channels_; |
| dim3 threadsPerBlock(THREADS_PER_WARP * WARPS_PER_BLOCK); |
| int threads_per_block = 16; |
| dim3 totalBlocksRearr((bwidthheight - 1) / threads_per_block + 1, bchannels, bnum); |
| const int pwidthheight = (bwidth + 2 * pad_size_) * (bheight + 2 * pad_size_); |
| blob_rearrange_kernel2<Dtype><<<totalBlocksRearr, threads_per_block, 0, stream_tmp1>>> |
| (bottom_data1, rbot1, bnum, bchannels, bwidth, bheight, bwidthheight, pad_size_, pwidthheight); |
| blob_rearrange_kernel2<Dtype><<<totalBlocksRearr, threads_per_block, 0, stream_tmp2>>> |
| (bottom_data2, rbot2, bnum, bchannels, bwidth, bheight, bwidthheight, pad_size_, pwidthheight); |
| const int num = bnum; |
| const int channels = bchannels; |
| const int height = bheight + 2 * pad_size_; |
| const int width = bwidth + 2 * pad_size_; |
| const int shared_memory_per_block = (kernel_size_ * kernel_size_) * bchannels; |
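  // In the multiply branch below, CorrelateData is launched with one 32-thread
  // block per top pixel (grid = top_width_ x top_height_ x num) and enough
  // dynamic shared memory for one image-1 patch (see patch_data in the kernel).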
  if (is_multiply) {
| // CorrelationLayer |
| int topThreadCount = topcount; |
| dim3 totalBlocksCorr(top_width_, top_height_, num); |
| CorrelateData<Dtype><<<totalBlocksCorr, threadsPerBlock, |
| shared_memory_per_block * sizeof(Dtype), stream>>>( |
| topThreadCount, |
| num, top_width_, top_height_, top_channels_, topcount, |
| max_displacement_, neighborhood_grid_radius_, |
| neighborhood_grid_width_, kernel_radius_, kernel_size_, |
| stride1_, stride2_, |
| width, height, channels, |
| rbot1, rbot2, top); |
| CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); |
| } else { |
| // CorrelationLayer |
| for (int n = 0; n < num; n++) { |
| int topThreadCount = topcount; |
| const int gridSize = (topThreadCount + kMaxThreadsPerBlock - 1)\ |
| / kMaxThreadsPerBlock; |
| CorrelateDataSubtract<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream>>>( |
| topThreadCount, |
| num, n, top_width_, top_height_, top_channels_, topcount, |
| max_displacement_, neighborhood_grid_radius_, |
| neighborhood_grid_width_, kernel_radius_, |
| stride1_, stride2_, width, height, channels, rbot1, rbot2, top); |
| CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); |
| } |
| } |
| } |
| template <typename Dtype> |
| void Backward_gpu( |
| const Tensor<gpu, 4, Dtype> &out_grad, |
| const Tensor<gpu, 4, Dtype> &in_grad1, |
| const Tensor<gpu, 4, Dtype> &in_grad2, |
| const Tensor<gpu, 4, Dtype> &tmp1, |
| const Tensor<gpu, 4, Dtype> &tmp2, |
| int top_channels_, int top_height_, |
| int top_width_, int pad_size_, bool is_multiply, |
| int max_displacement_, int kernel_size_, |
| int neighborhood_grid_radius_, int neighborhood_grid_width_, |
| int kernel_radius_, int stride1_, int stride2_, |
| cudaStream_t stream0, cudaStream_t stream1, |
| int num, int channels, int height, int width) { |
| // Get top diff, compute bottom diff |
| const Dtype* top_diff = out_grad.dptr_; |
| Dtype* bottom0_diff = in_grad1.dptr_; |
| Dtype* bottom1_diff = in_grad2.dptr_; |
| const Dtype* rbot1 = tmp1.dptr_; |
| const Dtype* rbot2 = tmp2.dptr_; |
| const int paddedheight = height + 2 * pad_size_; |
| const int paddedwidth = width + 2 * pad_size_; |
| const int bottomcount = channels * height * width; |
| int botThreadCount = bottomcount; |
| const int gridSize = (botThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; |
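  // gridSize is the ceil-division ceil(botThreadCount / kMaxThreadsPerBlock),
  // giving one thread per bottom element in the backward kernels below.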
| // CorrelationLayerBackward |
  if (is_multiply) {
    // == Run kernel Backward 0
| for (int n = 0; n < num; n++) { |
| CorrelateDataBackward0<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream0>>>( |
| botThreadCount, |
| num, n, top_width_, top_height_, top_channels_, |
| max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, |
| stride1_, stride2_, |
| width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, |
| bottom0_diff, rbot2, top_diff); |
| CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); |
| } |
| // == Run kernel Backward 1 |
| for (int n = 0; n < num; n++) { |
| CorrelateDataBackward1<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream1>>>( |
| botThreadCount, |
| num, n, top_width_, top_height_, top_channels_, |
| max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, |
| stride1_, stride2_, |
| width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, |
| rbot1, bottom1_diff, top_diff); |
| CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); |
| } |
| } else { |
| for (int n = 0; n < num; n++) { |
| // Bottom0: |
| CorrelateDataBackward0Subtract<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream0>>>( |
| botThreadCount, |
| num, n, top_width_, top_height_, top_channels_, |
| max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, |
| stride1_, stride2_, |
| width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, |
| bottom0_diff, rbot1, rbot2, top_diff); |
| CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); |
| } |
| for (int n = 0; n < num; n++) { |
| // Bottom1: |
| CorrelateDataBackward1Subtract<Dtype><<<gridSize, kMaxThreadsPerBlock, 0, stream1>>>( |
| botThreadCount, |
| num, n, top_width_, top_height_, top_channels_, |
| max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, |
| stride1_, stride2_, |
| width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, |
| rbot1, rbot2, bottom1_diff, top_diff); |
| CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); |
| } |
| } |
| } |
| } // namespace cuda |
| template<typename Dtype> |
| inline void CorrelationForward(const Tensor<gpu, 4, Dtype> &out, |
| const Tensor<gpu, 4, Dtype> &data1, |
| const Tensor<gpu, 4, Dtype> &data2, |
| const Tensor<gpu, 4, Dtype> &tmp1, |
| const Tensor<gpu, 4, Dtype> &tmp2, |
| int top_channels_, int top_height_, |
| int top_width_, int pad_size_, bool is_multiply, |
| int max_displacement_, int kernel_size_, |
| int neighborhood_grid_radius_, int neighborhood_grid_width_, |
| int kernel_radius_, int stride1_, int stride2_ |
| ) { |
| cudaStream_t stream = Stream<gpu>::GetStream(out.stream_); |
| cudaStream_t stream_tmp1 = Stream<gpu>::GetStream(tmp1.stream_); |
| cudaStream_t stream_tmp2 = Stream<gpu>::GetStream(tmp2.stream_); |
| cuda::Forward_gpu(out, data1, data2, tmp1, tmp2, top_channels_, top_height_, |
| top_width_, pad_size_, is_multiply, max_displacement_, kernel_size_, |
| neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, |
| stride1_, stride2_, stream, stream_tmp1, stream_tmp2); |
| } |
| |
| template<typename Dtype> |
| inline void CorrelationBackward(const Tensor<gpu, 4, Dtype> &out_grad, |
| const Tensor<gpu, 4, Dtype> &in_grad1, |
| const Tensor<gpu, 4, Dtype> &in_grad2, |
| const Tensor<gpu, 4, Dtype> &tmp1, |
| const Tensor<gpu, 4, Dtype> &tmp2, |
| int top_channels_, int top_height_, |
| int top_width_, int pad_size_, bool is_multiply, |
| int max_displacement_, int kernel_size_, |
| int neighborhood_grid_radius_, int neighborhood_grid_width_, |
| int kernel_radius_, int stride1_, |
| int stride2_, int num, int channels, int height, int width |
| ) { |
| cudaStream_t stream0 = Stream<gpu>::GetStream(in_grad1.stream_); |
| cudaStream_t stream1 = Stream<gpu>::GetStream(in_grad2.stream_); |
| cuda::Backward_gpu(out_grad, in_grad1, in_grad2, tmp1, tmp2, top_channels_, |
| top_height_, top_width_, pad_size_, is_multiply, |
| max_displacement_, kernel_size_, neighborhood_grid_radius_, |
| neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, |
| stream0, stream1, num, channels, height, width); |
| } |
| } // namespace mshadow |
| namespace mxnet { |
| namespace op { |
| template<> |
| Operator* CreateOp<gpu>(CorrelationParam param, int dtype) { |
| Operator* op = nullptr; |
| MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { |
| op = new CorrelationOp<gpu, DType>(param); |
| }); |
| return op; |
| } |
| } // namespace op |
| } // namespace mxnet |