/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file pdf_op.h
* \brief Operators for computing the pdf of probability distributions.
*/
#ifndef MXNET_OPERATOR_RANDOM_PDF_OP_H_
#define MXNET_OPERATOR_RANDOM_PDF_OP_H_
#include <mxnet/operator_util.h>
#include <vector>
#include <algorithm>
#include "../mshadow_op.h"
#include "../mxnet_op.h"
#include "../operator_common.h"
#include "../elemwise_op_common.h"
#include "../special_functions-inl.h"
#include "../tensor/broadcast_reduce_op.h"
namespace mxnet {
namespace op {
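/*! \brief Digamma helper; forwards to the Cephes psi implementation (float path for half precision). */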
template <typename DType>
MSHADOW_XINLINE static DType ceph_psi(DType val) {
return special_functions::cephes::psi(val);
}
template <>
MSHADOW_XINLINE mshadow::half::half_t ceph_psi(mshadow::half::half_t val) {
return special_functions::cephes::psi<float>(val);
}
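/*! \brief (Log-)density of the uniform distribution on [l, h]: pdf(x) = 1 / (h - l). */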
template <bool logpdf>
struct PDF_Uniform {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* lower,
IType2* upper) {
const index_t index(start / sample_size);
const DType l(lower[index]), h(upper[index]);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
// No check whether sample is in the support.
out[i] = logpdf ? -DType(log(h - l)) : DType(1.0) / (h - l);
}
}
};
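/*!
 * \brief Gradients of the uniform (log-)density: d/dl log pdf = 1 / (h - l),
 * d/dh log pdf = 1 / (l - h). The sample gradient is zero since the density
 * is constant on the support.
 */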
template <bool logpdf>
struct PDF_Uniform_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* lower,
IType2* upper,
DType* grad_out,
IType1* grad_sample,
IType2* grad_lower,
IType2* grad_upper) {
const index_t index(start / sample_size);
const DType l(lower[index]), h(upper[index]);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType scaling(grad_out[i] * (logpdf ? DType(1) : out[i]));
grad_lower[i] = scaling / (h - l);
grad_upper[i] = scaling / (l - h);
KERNEL_ASSIGN(grad_sample[i], req, 0);
}
}
};
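/*!
 * \brief (Log-)density of the normal distribution with mean u and standard deviation s:
 * pdf(x) = exp(-(x - u)^2 / (2 * s^2)) / (sqrt(2 * pi) * s).
 */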
template <bool logpdf>
struct PDF_Normal {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* loc,
IType2* scale) {
const index_t index(start / sample_size);
const DType u(loc[index]), s(scale[index]), sq(s * s);
const DType normalizer(sqrt(2.0 * mxnet_op::PI) * s);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType exponent((DType(-0.5) * (x - u) * (x - u)) / (sq));
out[i] = logpdf ? exponent - log(normalizer) : exp(exponent) / normalizer;
}
}
};
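/*!
 * \brief Gradients of the normal (log-)density: d/du log pdf = (x - u) / s^2,
 * d/ds log pdf = ((x - u)^2 - s^2) / s^3, d/dx log pdf = (u - x) / s^2.
 */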
template <bool logpdf>
struct PDF_Normal_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* loc,
IType2* scale,
DType* grad_out,
IType1* grad_sample,
IType2* grad_loc,
IType2* grad_scale) {
const index_t index(start / sample_size);
const DType u(loc[index]), s(scale[index]), s_squared(s * s), s_cubed(s_squared * s);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType scaling(grad_out[i] * (logpdf ? DType(1) : out[i]));
grad_loc[i] = scaling * (x - u) / s_squared;
grad_scale[i] = scaling * ((x - u) * (x - u) - s_squared) / s_cubed;
KERNEL_ASSIGN(grad_sample[i], req, scaling * (u - x) / s_squared);
}
}
};
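/*!
 * \brief (Log-)density of the gamma distribution with shape a and rate b:
 * log pdf(x) = a * log(b) + (a - 1) * log(x) - b * x - lgamma(a).
 */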
template <bool logpdf>
struct PDF_Gamma {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* alpha,
IType2* beta) {
const index_t index(start / sample_size);
const DType a(alpha[index]), b(beta[index]), lgamma_a(lgamma(a)), a_log_b(a * log(b));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType lpdf(a_log_b + (a - 1) * log(x) - b * x - lgamma_a);
out[i] = logpdf ? lpdf : DType(exp(lpdf));
}
}
};
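/*!
 * \brief Gradients of the gamma (log-)density: d/da log pdf = log(b) + log(x) - psi(a),
 * d/db log pdf = a / b - x, d/dx log pdf = (a - 1) / x - b.
 */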
template <bool logpdf>
struct PDF_Gamma_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* alpha,
IType2* beta,
DType* grad_out,
IType1* grad_sample,
IType2* grad_alpha,
IType2* grad_beta) {
const index_t index(start / sample_size);
const DType a(alpha[index]), b(beta[index]), log_b(log(b)), ceph_psi_a(ceph_psi(a));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType scaling(grad_out[i] * (logpdf ? DType(1) : out[i]));
grad_alpha[i] = scaling * (log_b + log(x) - ceph_psi_a);
grad_beta[i] = scaling * (a / b - x);
KERNEL_ASSIGN(grad_sample[i], req, scaling * ((a - 1) / x - b));
}
}
};
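/*! \brief (Log-)density of the exponential distribution with rate l: log pdf(x) = log(l) - l * x. */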
template <bool logpdf>
struct PDF_Exponential {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* lambda) {
const index_t index(start / sample_size);
const DType l(lambda[index]), log_l(log(l));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
out[i] = logpdf ? log_l - l * x : l * exp(-l * x);
}
}
};
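/*! \brief Gradients of the exponential (log-)density: d/dl log pdf = 1 / l - x, d/dx log pdf = -l. */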
template <bool logpdf>
struct PDF_Exponential_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* lambda,
DType* grad_out,
IType1* grad_sample,
IType2* grad_lambda) {
const index_t index(start / sample_size);
const DType l(lambda[index]);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType scaling(grad_out[i] * (logpdf ? DType(1) : out[i]));
grad_lambda[i] = scaling * (DType(1) / l - x);
KERNEL_ASSIGN(grad_sample[i], req, -scaling * l);
}
}
};
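/*! \brief (Log-)mass of the Poisson distribution with rate l: log pdf(x) = x * log(l) - lgamma(x + 1) - l. */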
template <bool logpdf>
struct PDF_Poisson {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* lambda) {
const index_t index(start / sample_size);
const DType l(lambda[index]), log_l(log(l));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType lpdf((x * log_l - lgamma(x + 1)) - l);
out[i] = logpdf ? lpdf : DType(exp(lpdf));
}
}
};
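/*! \brief Gradient of the Poisson (log-)mass: d/dl log pdf = x / l - 1; samples are discrete, so their gradient is zero. */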
template <bool logpdf>
struct PDF_Poisson_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* lambda,
DType* grad_out,
IType1* grad_sample,
IType2* grad_lambda) {
const index_t index(start / sample_size);
const DType l(lambda[index]);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType scaling(grad_out[i] * (logpdf ? DType(1) : out[i]));
grad_lambda[i] = scaling * (x / l - DType(1));
KERNEL_ASSIGN(grad_sample[i], req, 0);
}
}
};
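/*!
 * \brief (Log-)mass of the negative binomial distribution:
 * log pdf(x) = lgamma(x + l) - lgamma(x + 1) - lgamma(l) + l * log(p) + x * log(1 - p),
 * where p denotes the failure probability (see LPDF below).
 */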
template <bool logpdf>
struct PDF_NegativeBinomial {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* limit,
IType2* prob) {
const index_t index(start / sample_size);
const DType l(limit[index]), p(prob[index]), lgamma_l(lgamma(l));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType x(sample[i]);
const DType lpdf((lgamma(x + l) - lgamma(x + 1) - lgamma_l) + l * log(p) + x * log(1 - p));
out[i] = logpdf ? lpdf : DType(exp(lpdf));
}
}
template <typename DType>
MSHADOW_XINLINE static DType LPDF(DType l, DType p, DType x) {
// Note that "p" is the failure and not the success probability.
return (lgamma(x + l) - lgamma(x + 1) - lgamma(l)) + l * log(p) + x * log(1 - p);
}
};
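/*!
 * \brief Gradients of the negative binomial (log-)mass:
 * d/dl log pdf = psi(x + l) - psi(l) + log(p), d/dp log pdf = l / p - x / (1 - p).
 */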
template <bool logpdf>
struct PDF_NegativeBinomial_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* limit,
IType2* prob,
DType* grad_out,
IType1* grad_sample,
IType2* grad_limit,
IType2* grad_prob) {
const index_t index(start / sample_size);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
DType grad_l(0), grad_p(0);
LPDF_GRAD(DType(limit[index]),
DType(prob[index]),
DType(sample[i]),
out[i],
grad_out[i],
&grad_l,
&grad_p);
grad_limit[i] = grad_l;
grad_prob[i] = grad_p;
KERNEL_ASSIGN(grad_sample[i], req, 0);
}
}
template <typename DType>
MSHADOW_XINLINE static void
LPDF_GRAD(DType l, DType p, DType x, DType o, DType grad_o, DType* grad_l, DType* grad_p) {
const DType scaling(grad_o * (logpdf ? DType(1) : o));
*grad_l = scaling * ((ceph_psi(x + l) - ceph_psi(l)) + log(p));
*grad_p = scaling * (l / p - x / (1 - p));
}
};
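/*!
 * \brief (Log-)mass of the generalized negative binomial distribution, evaluated by
 * reparameterizing to a negative binomial with limit = 1 / alpha, prob = 1 / (mu * alpha + 1).
 */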
template <bool logpdf>
struct PDF_GeneralizedNegativeBinomial {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
DType* out,
IType1* sample,
IType2* mu,
IType2* alpha) {
const index_t index(start / sample_size);
// Reparameterize with limit = 1 / alpha, prob = 1 / (mu * alpha + 1)
const DType limit(1.0 / alpha[index]), prob(1.0 / (mu[index] * alpha[index] + 1.0));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const DType lpdf(PDF_NegativeBinomial<logpdf>::LPDF(limit, prob, DType(sample[i])));
out[i] = logpdf ? lpdf : DType(exp(lpdf));
}
}
};
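/*!
 * \brief Gradients of the generalized negative binomial (log-)mass, obtained from the
 * negative binomial gradients via the chain rule with d(limit)/d(alpha) = -1 / alpha^2,
 * d(prob)/d(mu) = -alpha / (mu * alpha + 1)^2 and d(prob)/d(alpha) = -mu / (mu * alpha + 1)^2.
 */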
template <bool logpdf>
struct PDF_GeneralizedNegativeBinomial_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
DType* out,
IType1* sample,
IType2* mu,
IType2* alpha,
DType* grad_out,
IType1* grad_sample,
IType2* grad_mu,
IType2* grad_alpha) {
const index_t index(start / sample_size);
const DType fmu(mu[index]), falpha(alpha[index]), den(fmu * falpha + 1.0);
// Reparameterize with limit = 1 / alpha, prob = 1 / (mu * alpha + 1)
const DType limit(1.0 / falpha), prob(1.0 / (fmu * falpha + 1.0));
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
// Grad returned as d_limit, d_prob
DType grad_l(0), grad_p(0);
PDF_NegativeBinomial_Grad<logpdf>::LPDF_GRAD(
limit, prob, DType(sample[i]), out[i], grad_out[i], &grad_l, &grad_p);
grad_mu[i] = -grad_p * falpha / (den * den);
grad_alpha[i] = -grad_l / (falpha * falpha) - grad_p * fmu / (den * den);
KERNEL_ASSIGN(grad_sample[i], req, 0);
}
}
};
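/*!
 * \brief (Log-)density of the Dirichlet distribution over k categories:
 * log pdf(x) = sum_j (a_j - 1) * log(x_j) + lgamma(sum_j a_j) - sum_j lgamma(a_j).
 */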
template <bool logpdf>
struct PDF_Dirichlet {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
index_t k,
DType* out,
IType1* sample,
IType2* alpha) {
const index_t index(start / sample_size);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
const IType1* cur_sample = sample + i * k;
const IType2* cur_alpha = alpha + index * k;
DType sum_alpha(0), sum_lgamma(0), sum_sample(0);
for (index_t j = 0; j < k; ++j) {
sum_alpha += cur_alpha[j];
sum_lgamma += lgamma(cur_alpha[j]);
sum_sample += (cur_alpha[j] - 1) * log(cur_sample[j]);
}
DType lpdf(sum_sample + (lgamma(sum_alpha) - sum_lgamma));
out[i] = logpdf ? lpdf : DType(exp(lpdf));
}
}
};
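/*!
 * \brief Gradients of the Dirichlet (log-)density:
 * d/da_j log pdf = log(x_j) + psi(sum_j a_j) - psi(a_j), d/dx_j log pdf = (a_j - 1) / x_j.
 */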
template <bool logpdf>
struct PDF_Dirichlet_Grad {
template <typename DType, typename IType1, typename IType2>
MSHADOW_XINLINE static void Map(index_t start,
index_t length,
index_t sample_size,
OpReqType req,
index_t k,
DType* out,
IType1* sample,
IType2* alpha,
DType* grad_out,
IType1* grad_sample,
IType2* grad_alpha) {
const index_t index(start / sample_size);
const index_t end = start + length;
for (index_t i = start; i < end; ++i) {
// The gradient w.r.t. alpha involves the digamma function (psi).
const IType1* cur_sample = sample + i * k;
const IType2* cur_alpha = alpha + index * k;
const DType scaling(grad_out[i] * (logpdf ? DType(1) : out[i]));
DType sum_alpha(0);
for (index_t j = 0; j < k; ++j) {
sum_alpha += cur_alpha[j];
}
const DType psi_sum(ceph_psi(sum_alpha));
for (index_t j = 0; j < k; ++j) {
size_t grad_alpha_index = i % sample_size + sample_size * (j + k * index);
size_t grad_sample_index = i * k + j;
// Lay out grad_alpha with the sample dimension innermost to allow an efficient
// reduction over samples at the end.
grad_alpha[grad_alpha_index] =
scaling * (log(cur_sample[j]) + (psi_sum - ceph_psi(cur_alpha[j])));
KERNEL_ASSIGN(
grad_sample[grad_sample_index], req, scaling * (cur_alpha[j] - 1) / cur_sample[j]);
}
}
}
};
struct PdfParam : public dmlc::Parameter<PdfParam> {
bool is_log;
DMLC_DECLARE_PARAMETER(PdfParam) {
DMLC_DECLARE_FIELD(is_log).set_default(false).describe(
"If set, compute the log-probability instead of the probability.");
}
};
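/*!
 * \brief Shape inference for the pdf operators. The first input holds the samples, all
 * further inputs hold distribution parameters of one common shape, which must match the
 * leftmost dimensions of the sample shape. With vparm = true (vector parameters, e.g.
 * Dirichlet), the trailing axis of the parameters is the event dimension. For example,
 * alpha of shape (2, 3) (two Dirichlet distributions over 3 categories) and samples of
 * shape (2, 5, 3) (5 samples per distribution) yield an output of shape (2, 5).
 */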
template <bool vparm = false>
inline bool PdfOpShape(const nnvm::NodeAttrs& attrs,
std::vector<TShape>* in_attrs,
std::vector<TShape>* out_attrs) {
CHECK_GT(in_attrs->size(), 1) << "pdf operator takes at least 2 arguments (" << in_attrs->size()
<< " given)";
CHECK_EQ(out_attrs->size(), 1);
// All inputs must be defined in order to infer output shape.
if (std::all_of(
(*in_attrs).begin(), (*in_attrs).end(), [](const TShape& s) { return s.ndim() > 0; })) {
// Tensors of distribution parameters must have same shape.
for (size_t i = 2; i < in_attrs->size(); ++i) {
SHAPE_ASSIGN_CHECK(*in_attrs, i, (*in_attrs)[i - 1]);
}
// Tensors of distribution parameters must match leftmost subshape of samples.
CHECK_LE((*in_attrs)[1].ndim(), (*in_attrs)[0].ndim())
<< "dimension of input samples (" << (*in_attrs)[0].ndim()
<< ") must be at least dimension of distribution parameters (" << (*in_attrs)[1].ndim()
<< ")";
TShape tshape((*in_attrs)[0].begin(), (*in_attrs)[0].begin() + (*in_attrs)[1].ndim());
if (vparm) {
*(tshape.end() - 1) = *((*in_attrs)[0].end() - 1);
}
for (size_t i = 1; i < in_attrs->size(); ++i) {
SHAPE_ASSIGN_CHECK(*in_attrs, i, tshape);
}
// The output shape equals the sample tensor's shape, except that the last dimension
// is dropped when the samples are themselves vectors.
if (vparm && ((*in_attrs)[0].ndim() == 1)) {
// Special case where we are dealing with a single vector sample.
SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape1(1));
} else {
TShape oshape((*in_attrs)[0].begin(), (*in_attrs)[0].end() - (vparm ? 1 : 0));
SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
}
return true;
}
return false;
}
template <typename OP>
struct LaunchExWrapper {
template <typename... Args>
MSHADOW_XINLINE static void Map(const index_t start,
const index_t length,
const index_t sample_size,
Args... args) {
// Apply the operator to the sample in strides of sample_size, so that
// the operators can assume that their distribution parameters are constant.
index_t i = start;
// First (partial) stride: align i with the next sample_size boundary.
const index_t align_step = sample_size - (i % sample_size);
const index_t first_stride = length > align_step ? align_step : length;
OP::Map(i, first_stride, sample_size, args...);
i += first_stride;
const index_t end = start + length - sample_size;
for (; i < end; i += sample_size) {
OP::Map(i, sample_size, sample_size, args...);
}
// Last stride might not be aligned either
const index_t last_stride = start + length - i;
if (last_stride > 0) { // Don't overstep even if length <= sample_size
OP::Map(i, last_stride, sample_size, args...);
}
}
};
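// For example, with start = 2, length = 10 and sample_size = 4, the wrapper above issues
// OP::Map(2, 2, 4), OP::Map(4, 4, 4) and OP::Map(8, 4, 4), so that no single call crosses
// a boundary between two parameter sets.
/*! \brief Launches a pdf kernel, specialized on the number of distribution parameters
 * (pnum) and on whether the parameters are vector-valued (vparm). */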
template <typename xpu, typename DType, typename pdf, int pnum, bool vparm = false>
struct PdfCaller;
template <typename xpu, typename DType, typename pdf>
struct PdfCaller<xpu, DType, pdf, 1, false> {
static void op(const std::vector<TBlob>& inputs,
const std::vector<TBlob>& outputs,
mshadow::Stream<xpu>* s) {
CHECK_EQ(inputs[0].Size() % inputs[1].Size(), 0);
CHECK_EQ(inputs[0].Size() % outputs[0].Size(), 0);
index_t num_samples(inputs[0].Size() / inputs[1].Size());
mxnet_op::Kernel<LaunchExWrapper<pdf>, xpu>::LaunchEx(s,
outputs[0].Size(),
num_samples,
outputs[0].dptr<DType>(),
inputs[0].dptr<DType>(),
inputs[1].dptr<DType>());
}
};
template <typename xpu, typename DType, typename pdf>
struct PdfCaller<xpu, DType, pdf, 1, true> {
static void op(const std::vector<TBlob>& inputs,
const std::vector<TBlob>& outputs,
mshadow::Stream<xpu>* s) {
CHECK_EQ(inputs[0].Size() % inputs[1].Size(), 0);
CHECK_EQ(inputs[0].Size() % outputs[0].Size(), 0);
index_t num_samples(inputs[0].Size() / inputs[1].Size());
index_t sample_size(inputs[0].Size() / outputs[0].Size());
// Covers distributions parametrized by a vector of parameters (Dirichlet distribution).
mxnet_op::Kernel<LaunchExWrapper<pdf>, xpu>::LaunchEx(s,
outputs[0].Size(),
num_samples,
sample_size,
outputs[0].dptr<DType>(),
inputs[0].dptr<DType>(),
inputs[1].dptr<DType>());
}
};
template <typename xpu, typename DType, typename pdf>
struct PdfCaller<xpu, DType, pdf, 2, false> {
static void op(const std::vector<TBlob>& inputs,
const std::vector<TBlob>& outputs,
mshadow::Stream<xpu>* s) {
CHECK_EQ(inputs[0].Size() % inputs[1].Size(), 0);
CHECK_EQ(inputs[0].Size(), outputs[0].Size());
index_t num_samples(inputs[0].Size() / inputs[1].Size());
mxnet_op::Kernel<LaunchExWrapper<pdf>, xpu>::LaunchEx(s,
outputs[0].Size(),
num_samples,
outputs[0].dptr<DType>(),
inputs[0].dptr<DType>(),
inputs[1].dptr<DType>(),
inputs[2].dptr<DType>());
}
};
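/*! \brief Forward pass: computes the (log-)pdf of the samples (inputs[0]) under the
 * distribution given by the parameter tensors (inputs[1..pnum]). */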
template <typename xpu, template <bool> class pdf, int pnum, bool vparm>
void PdfOpForward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_NE(req[0], kAddTo);
CHECK_EQ(inputs.size(), pnum + 1);
CHECK_EQ(outputs.size(), 1);
// Skip kernel launch for zero-size tensors
if (inputs[1].shape_.Size() == 0U || outputs[0].Size() == 0U) {
return;
}
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
const PdfParam& param = nnvm::get<PdfParam>(attrs.parsed);
MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
if (param.is_log) {
PdfCaller<xpu, DType, pdf<true>, pnum, vparm>::op(inputs, outputs, s);
} else {
PdfCaller<xpu, DType, pdf<false>, pnum, vparm>::op(inputs, outputs, s);
}
});
}
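/*! \brief Launches a gradient kernel; inputs are ordered as {output gradient, samples,
 * parameters..., forward pdf}, gradients as {sample gradient, parameter gradients...}. */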
template <typename xpu, typename DType, typename pdfgrad, int pnum, bool vparm = false>
struct PdfGradCaller;
template <typename xpu, typename DType, typename pdfgrad>
struct PdfGradCaller<xpu, DType, pdfgrad, 1, false> {
static void op(const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& grads,
mshadow::Stream<xpu>* s) {
index_t num_samples(inputs[1].Size() / inputs[2].Size());
mxnet_op::Kernel<LaunchExWrapper<pdfgrad>, xpu>::LaunchEx(s,
inputs[0].Size(),
num_samples,
req[0],
inputs[3].dptr<DType>(),
inputs[1].dptr<DType>(),
inputs[2].dptr<DType>(),
inputs[0].dptr<DType>(),
grads[0].dptr<DType>(),
grads[1].dptr<DType>());
}
};
template <typename xpu, typename DType, typename pdfgrad>
struct PdfGradCaller<xpu, DType, pdfgrad, 1, true> {
static void op(const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& grads,
mshadow::Stream<xpu>* s) {
index_t num_samples(inputs[1].Size() / inputs[2].Size());
index_t sample_size(inputs[1].Size() / inputs[0].Size());
mxnet_op::Kernel<LaunchExWrapper<pdfgrad>, xpu>::LaunchEx(s,
inputs[0].Size(),
num_samples,
req[0],
sample_size,
inputs[3].dptr<DType>(),
inputs[1].dptr<DType>(),
inputs[2].dptr<DType>(),
inputs[0].dptr<DType>(),
grads[0].dptr<DType>(),
grads[1].dptr<DType>());
}
};
template <typename xpu, typename DType, typename pdfgrad>
struct PdfGradCaller<xpu, DType, pdfgrad, 2, false> {
static void op(const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& grads,
mshadow::Stream<xpu>* s) {
index_t num_samples(inputs[1].Size() / inputs[2].Size());
mxnet_op::Kernel<LaunchExWrapper<pdfgrad>, xpu>::LaunchEx(s,
inputs[0].Size(),
num_samples,
req[0],
inputs[4].dptr<DType>(),
inputs[1].dptr<DType>(),
inputs[2].dptr<DType>(),
inputs[3].dptr<DType>(),
inputs[0].dptr<DType>(),
grads[0].dptr<DType>(),
grads[1].dptr<DType>(),
grads[2].dptr<DType>());
}
};
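/*!
 * \brief Backward pass: computes per-sample gradients into temporary workspace and then
 * sum-reduces the parameter gradients over the sample dimension, since each set of
 * distribution parameters is shared by all samples drawn from it.
 */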
template <typename xpu, template <bool> class pdfgrad, int pnum, bool vparm>
void PdfOpBackward(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
CHECK_EQ(inputs.size(), pnum + 3);
CHECK_EQ(outputs.size(), pnum + 1);
mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
const PdfParam& param = nnvm::get<PdfParam>(attrs.parsed);
const size_t N(outputs[1].Size());
const TShape src_shape(Shape2(N, outputs[0].Size() / N)), dst_shape(Shape2(N, 1));
const size_t red_work_size(broadcast::ReduceWorkspaceSize(s, dst_shape, kAddTo, src_shape));
#if !defined(__CUDACC__)
// Inputs to PdfOpBackward: grad, samples, parm1, parm2, pdf.
MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
const size_t tmp_size(outputs[0].Size() * pnum * sizeof(DType) + red_work_size);
Tensor<xpu, 1, char> tmp_space =
ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(tmp_size), s);
std::vector<TBlob> grads = {outputs[0]};
grads.push_back(TBlob(tmp_space.dptr_,
outputs[0].shape_,
outputs[1].dev_mask(),
outputs[1].type_flag_,
outputs[1].dev_id()));
if (pnum == 2) {
grads.push_back(TBlob(tmp_space.dptr_ + outputs[0].Size() * sizeof(DType),
outputs[0].shape_,
outputs[2].dev_mask(),
outputs[2].type_flag_,
outputs[2].dev_id()));
}
if (param.is_log) {
PdfGradCaller<xpu, DType, pdfgrad<true>, pnum, vparm>::op(inputs, req, grads, s);
} else {
PdfGradCaller<xpu, DType, pdfgrad<false>, pnum, vparm>::op(inputs, req, grads, s);
}
Tensor<xpu, 1, char> red_work(
tmp_space.dptr_ + pnum * outputs[0].Size() * sizeof(DType), Shape1(red_work_size), s);
broadcast::Reduce<red::sum, 2, DType, op::mshadow_op::identity>(
s, outputs[1].reshape(dst_shape), req[1], red_work, grads[1].reshape(src_shape));
if (pnum == 2) {
broadcast::Reduce<red::sum, 2, DType, op::mshadow_op::identity>(
s, outputs[2].reshape(dst_shape), req[2], red_work, grads[2].reshape(src_shape));
}
});
#else
// Inputs to PdfOpBackward: grad, samples, parm1, parm2, pdf.
MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
const size_t tmp_size(outputs[0].Size() * pnum * sizeof(DType) + red_work_size);
Tensor<xpu, 1, char> tmp_space =
ctx.requested[0].get_space_typed<xpu, 1, char>(Shape1(tmp_size), s);
std::vector<TBlob> grads = {outputs[0]};
grads.push_back(TBlob(tmp_space.dptr_,
outputs[0].shape_,
outputs[1].dev_mask(),
outputs[1].type_flag_,
outputs[1].dev_id()));
if (pnum == 2) {
grads.push_back(TBlob(tmp_space.dptr_ + outputs[0].Size() * sizeof(DType),
outputs[0].shape_,
outputs[2].dev_mask(),
outputs[2].type_flag_,
outputs[2].dev_id()));
}
if (param.is_log) {
PdfGradCaller<xpu, DType, pdfgrad<true>, pnum, vparm>::op(inputs, req, grads, s);
} else {
PdfGradCaller<xpu, DType, pdfgrad<false>, pnum, vparm>::op(inputs, req, grads, s);
}
Tensor<xpu, 1, char> red_work(
tmp_space.dptr_ + pnum * outputs[0].Size() * sizeof(DType), Shape1(red_work_size), s);
broadcast::RTCReduce(ctx,
outputs[1].reshape(dst_shape),
req[1],
red_work,
grads[1].reshape(src_shape),
"red::sum{}",
2,
"identity");
if (pnum == 2) {
broadcast::RTCReduce(ctx,
outputs[2].reshape(dst_shape),
req[2],
red_work,
grads[2].reshape(src_shape),
"red::sum{}",
2,
"identity");
}
});
#endif
}
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_RANDOM_PDF_OP_H_