/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file sampler.h
* \brief implementations of random sampling functors.
*/
#ifndef MXNET_OPERATOR_RANDOM_SAMPLER_H_
#define MXNET_OPERATOR_RANDOM_SAMPLER_H_

#include <algorithm>
#include <type_traits>

#include "../mxnet_op.h"
#include "../../common/random_generator.h"

using namespace mshadow;
using namespace mxnet::op::mxnet_op;
using namespace mxnet::common::random;

namespace mxnet {
namespace op {

/*!
 * \brief Launch a generic kernel with a parallel random generator.
 * \tparam OP kernel functor whose Map() is invoked once per thread
 * \tparam xpu device type (cpu or gpu)
 * \tparam GType data type of the random generator
 * \tparam Args varargs type to eventually pass to the OP::Map() function
 * \param s device stream
 * \param gen parallel random generator
 * \param N total number of samples to produce
 * \param args additional arguments forwarded to OP::Map()
 */
template<typename OP, typename xpu, typename GType, typename ...Args>
inline static void LaunchRNG(mshadow::Stream<xpu> *s,
                             common::random::RandGenerator<xpu, GType> *gen,
                             const index_t N, Args... args) {
  // Minimal check to avoid division by zero below.
  // If `N` is zero the map operation is a no-op in any case.
  if (N <= 0) {
    return;
  }
  const index_t nloop = (N + RandGenerator<xpu>::kMinNumRandomPerThread - 1) /
                        RandGenerator<xpu>::kMinNumRandomPerThread;
  const index_t nthread = std::min(nloop,
                                   static_cast<index_t>(RandGenerator<xpu>::kNumRandomStates));
  const index_t step = (N + nthread - 1) / nthread;
  Kernel<OP, xpu>::Launch(s, nthread, *gen, N, step, args...);
}

#define RNG_KERNEL_LOOP(xpu, GType, thread_id, gen, N, step, ...)     \
  const index_t start = thread_id * step;                             \
  const index_t end = start + step;                                   \
  typename RandGenerator<xpu, GType>::Impl genImpl(&gen, thread_id);  \
  for (index_t i = start; i < end && i < N; ++i) {                    \
    { __VA_ARGS__ }                                                   \
  }
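
/*!
 * A minimal illustrative sketch (hypothetical, not one of the kernels below) of
 * how LaunchRNG and RNG_KERNEL_LOOP fit together: Map() receives the thread id,
 * the shared generator, the total count N, the per-thread step and any user
 * arguments; the macro derives the index range [id * step, min((id + 1) * step, N))
 * and a thread-local generator `genImpl`.
 *
 *   template<typename xpu>
 *   struct FillUniformKernel {  // hypothetical example kernel
 *     template<typename OType>
 *     MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
 *                                     const index_t N, const index_t step,
 *                                     OType *out) {
 *       RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
 *         out[i] = genImpl.uniform();  // one independent draw per output element
 *       });
 *     }
 *   };
 *
 *   // Launched as: LaunchRNG<FillUniformKernel<xpu>, xpu>(s, &gen, N, out_ptr);
 */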

template<typename xpu>
struct SampleUniformKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *lower, const IType *upper, OType *out) {
    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      out[i] = OType(lower[i / nBatch] +
                     (upper[i / nBatch] - lower[i / nBatch]) * genImpl.uniform());
    });
  }
};
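
/*
 * Layout convention shared by all sampling kernels in this file: nSample output
 * values are drawn from nParm parameter entries. With nBatch = ceil(nSample / nParm)
 * (computed as 1 + (nSample - 1) / nParm), output element i uses the parameters at
 * index i / nBatch, so each parameter entry fills one contiguous block of the output.
 * For example, with nParm == 2 and nSample == 6, out[0..2] are drawn from
 * (lower[0], upper[0]) and out[3..5] from (lower[1], upper[1]).
 */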

template<typename xpu>
struct UniformSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lower,
                                   const Tensor<xpu, 1, IType>& upper,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    LaunchRNG<SampleUniformKernel<xpu>, xpu>(s, pgen, out.size(0), lower.size(0), out.size(0),
                                             lower.dptr_, upper.dptr_, out.dptr_);
  }
};

template<typename xpu>
struct SampleRandIntKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *lower, const IType *upper, OType *out) {
    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      if (sizeof(IType) == sizeof(int64_t)) {
        out[i] = OType(lower[i / nBatch] +
                       genImpl.rand_int64() % (upper[i / nBatch] - lower[i / nBatch]));
      } else {
        out[i] = OType(lower[i / nBatch] +
                       genImpl.rand() % (upper[i / nBatch] - lower[i / nBatch]));
      }
    });
  }
};

template<typename xpu>
struct RandIntSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lower,
                                   const Tensor<xpu, 1, IType>& upper,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    LaunchRNG<SampleRandIntKernel<xpu>, xpu>(s, pgen, out.size(0), lower.size(0), out.size(0),
                                             lower.dptr_, upper.dptr_, out.dptr_);
  }
};

template<typename xpu>
struct SampleNormalKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *mean, const IType *std, OType *out) {
    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      out[i] = OType(genImpl.normal() * std[i / nBatch] + mean[i / nBatch]);
    });
  }
};

template<typename xpu>
struct NormalSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& mean,
                                   const Tensor<xpu, 1, IType>& std,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    LaunchRNG<SampleNormalKernel<xpu>, xpu>(s, pgen, out.size(0), mean.size(0), out.size(0),
                                            mean.dptr_, std.dptr_, out.dptr_);
  }
};

template<typename xpu>
struct SampleExponentialKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, OType> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *lambda, OType *out) {
    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      // Inverse-CDF sampling: -log(1 - U) / lambda is Exponential(lambda) distributed.
      out[i] = OType(-log(1.0 - genImpl.uniform()) / lambda[i / nBatch]);
    });
  }
};

template<typename xpu>
struct ExponentialSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lambda,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    LaunchRNG<SampleExponentialKernel<xpu>, xpu>(s, pgen, out.size(0),
                                                 lambda.size(0), out.size(0),
                                                 lambda.dptr_, out.dptr_);
  }
};
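
/*!
 * \brief Generate one sample of the gamma distribution with shape a and scale b
 * using the transformation/rejection method of Marsaglia and Tsang: for a >= 1,
 * set d = a - 1/3 and c = 1 / sqrt(9 * d), draw Z ~ N(0, 1) until V = (1 + c * Z)^3
 * is positive and log(U) < 0.5 * Z^2 + d * (1 - V + log(V)), then return d * V * b;
 * for a < 1, sample from Gamma(a + 1, b) the same way and scale the result
 * by U^(1 / a).
 */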
template<typename xpu, typename IType, typename OType>
MSHADOW_XINLINE OType SampleGamma(IType a, IType b, typename RandGenerator<xpu, OType>::Impl *gen) {
  // Generate one sample of the gamma distribution.
  OType sample;
  OType d = a < 1 ? a + 2.0 / 3.0 : a - 1.0 / 3.0;
  OType k = sqrt(9.0 * d);
  OType c = 1.0 / k;
  while (1) {
    OType Z = gen->normal();
    if (Z > -k) {
      OType x = 1.0 + c * Z;
      OType V = x * x * x;
      if (log(1.0 - gen->uniform()) < 0.5 * Z * Z + d * (1.0 - V + log(V))) {
        sample = d * V * b;
        break;
      }
    }
  }
  return a < 1 ? sample * pow(gen->uniform(), OType(1.0 / a)) : sample;
}

template<typename xpu>
struct SampleGammaKernel {
  template<typename IType, typename OType, typename FType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, FType> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *alpha, const IType *beta, OType *out) {
    RNG_KERNEL_LOOP(xpu, FType, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      out[i] = OType(SampleGamma<xpu, IType, FType>(alpha[i / nBatch],
                                                    beta[i / nBatch], &genImpl));
    });
  }
};

template<typename xpu>
struct GammaSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& alpha,
                                   const Tensor<xpu, 1, IType>& beta,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    // Sampling is always done in floating point, even for integral output types.
    typedef typename std::conditional<std::is_floating_point<OType>::value,
                                      OType, float>::type FType;
    RandGenerator<xpu, FType> *gen = reinterpret_cast<RandGenerator<xpu, FType> *>(pgen);
    LaunchRNG<SampleGammaKernel<xpu>, xpu>(s, gen, out.size(0), alpha.size(0), out.size(0),
                                           alpha.dptr_, beta.dptr_, out.dptr_);
  }
};
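
/*!
 * \brief Generate one sample of the Poisson distribution with rate lambda.
 * For lambda < 12, uniform variates are multiplied together until the running
 * product drops below exp(-lambda); the sample is the number of draws minus one.
 * For larger lambda the rejection method from "Numerical Recipes in C" is used.
 */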
template<typename xpu>
MSHADOW_XINLINE int SamplePoisson(float lambda, typename RandGenerator<xpu, float>::Impl *gen) {
  // Generate one sample of the Poisson distribution. Intentionally uses a fixed
  // type (float) for the internal computation, which is sufficiently accurate here.
  if (lambda < 12.0) {
    float t = expf(-lambda);
    int x = 0;
    for (float prod = gen->uniform(); prod > t; prod *= gen->uniform()) { x += 1; }
    return x;
  } else {
    // Approximation for large lambda according to:
    // Numerical Recipes in C: The Art of Scientific Computing,
    // Cambridge University Press.
    const float pi(3.1415926);
    const float sq(sqrt(2.0 * lambda));
    const float loglambda(log(lambda));
    const float g(lambda * loglambda - lgammaf(lambda + 1.0));
    float em(0), t(0), y(0);
    do {
      do {
        y = tanf(pi * gen->uniform());
        em = sq * y + lambda;
      } while (em < 0.0);
      em = floorf(em);
      t = 0.9 * (1.0 + y * y) * expf(em * loglambda - lgammaf(em + 1.0) - g);
    } while (gen->uniform() > t);
    return static_cast<int>(em);
  }
}

template<typename xpu>
struct SamplePoissonKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, float> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *lambda, OType *out) {
    RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      out[i] = OType(SamplePoisson<xpu>(lambda[i / nBatch], &genImpl));
    });
  }
};

template<typename xpu>
struct PoissonSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lambda,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    RandGenerator<xpu, float> *gen = reinterpret_cast<RandGenerator<xpu, float> *>(pgen);
    LaunchRNG<SamplePoissonKernel<xpu>, xpu>(s, gen, out.size(0), lambda.size(0), out.size(0),
                                             lambda.dptr_, out.dptr_);
  }
};
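
/*!
 * \brief Kernel for sampling the negative binomial distribution NB(k, p) via its
 * gamma-Poisson mixture representation: draw lambda ~ Gamma(k, (1 - p) / p) and
 * return a Poisson(lambda) sample.
 */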
template<typename xpu>
struct SampleNegativeBinomialKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, float> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *k, const IType *p, OType *out) {
    RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      float alpha = k[i / nBatch];
      float prob = p[i / nBatch];
      float beta = (1.0 - prob) / prob;
      float lambda = SampleGamma<xpu, IType, float>(alpha, beta, &genImpl);
      out[i] = OType(SamplePoisson<xpu>(lambda, &genImpl));
    });
  }
};

template<typename xpu>
struct NegativeBinomialSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& k,
                                   const Tensor<xpu, 1, IType>& p,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    RandGenerator<xpu, float> *gen = reinterpret_cast<RandGenerator<xpu, float> *>(pgen);
    LaunchRNG<SampleNegativeBinomialKernel<xpu>, xpu>(s, gen, out.size(0), k.size(0), out.size(0),
                                                      k.dptr_, p.dptr_, out.dptr_);
  }
};
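
/*!
 * \brief Kernel for sampling the generalized negative binomial distribution with
 * mean mu and dispersion alpha: draw lambda ~ Gamma(1 / alpha, alpha * mu) and
 * return a Poisson(lambda) sample; for alpha == 0 this reduces to sampling
 * Poisson(mu) directly.
 */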
template<typename xpu>
struct SampleGeneralizedNegativeBinomialKernel {
  template<typename IType, typename OType>
  MSHADOW_XINLINE static void Map(index_t id, RandGenerator<xpu, float> gen,
                                  const index_t N, const index_t step,
                                  index_t nParm, index_t nSample,
                                  const IType *mu, const IType *alpha, OType *out) {
    RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
      index_t nBatch(1 + (nSample - 1) / nParm);
      float lambda = alpha[i / nBatch] == 0 ?
                     static_cast<float>(mu[i / nBatch]) :
                     SampleGamma<xpu, IType, float>(IType(1) / alpha[i / nBatch],
                                                    alpha[i / nBatch] * mu[i / nBatch], &genImpl);
      out[i] = OType(SamplePoisson<xpu>(lambda, &genImpl));
    });
  }
};

template<typename xpu>
struct GeneralizedNegativeBinomialSampler {
  template<typename IType, typename OType>
  MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& mu,
                                   const Tensor<xpu, 1, IType>& alpha,
                                   const Tensor<xpu, 1, OType>& out,
                                   RandGenerator<xpu, OType> *pgen,
                                   Stream<xpu> *s) {
    RandGenerator<xpu, float> *gen = reinterpret_cast<RandGenerator<xpu, float> *>(pgen);
    LaunchRNG<SampleGeneralizedNegativeBinomialKernel<xpu>, xpu>(s, gen, out.size(0),
                                                                 mu.size(0), out.size(0),
                                                                 mu.dptr_, alpha.dptr_, out.dptr_);
  }
};

} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_RANDOM_SAMPLER_H_