src/operator/nn/softmax-inl.h - mxnet-test - Git at Google

 /*!
  * Copyright (c) 2017 by Contributors
  * \file softmax-inl.h
  * \brief
 */
 #ifndef MXNET_OPERATOR_NN_SOFTMAX_INL_H_
 #define MXNET_OPERATOR_NN_SOFTMAX_INL_H_

 #include <vector>

 #include "../mxnet_op.h"
 #include "../operator_common.h"
 #include "../tensor/broadcast_reduce_op.h"

 namespace mxnet {
 namespace op {
 namespace mxnet_op {

 struct softmax_fwd {
   template<typename DType>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
     return DType(expf(a)/b);
   }
 };


 struct log_softmax_fwd {
   template<typename DType>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
     return DType(a - logf(b));
   }
 };


 template<typename OP, typename DType, int ndim>
 inline void Softmax(Stream<cpu> *s, DType *in, DType *out,
                     Shape<ndim> shape, int axis) {
   index_t M = shape[axis];
   index_t N = shape.Size()/M;
   Shape<ndim> stride = calc_stride(shape);
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
   index_t sa = stride[axis];

   #pragma omp parallel for
   for (int i = 0; i < N; ++i) {
     index_t base = unravel_dot(i, sshape, stride);

     DType mmax = in[base];
     for (index_t j = 1; j < M; ++j) {
       if (mmax < in[base + j*sa]) mmax = in[base + j*sa];
     }

     DType sum = DType(0);
     for (index_t j = 0; j < M; ++j) {
       sum += std::exp(in[base + j*sa] - mmax);
     }

     for (index_t j = 0; j < M; ++j) {
       out[base + j*sa] = OP::Map(in[base + j*sa] - mmax, sum);
     }
   }
 }


 struct softmax_bwd {
   template<typename DType>
   MSHADOW_XINLINE static DType Map(DType ograd, DType out, DType sum) {
     return DType(out * (ograd - sum));
   }
 };


 struct log_softmax_bwd {
   template<typename DType>
   MSHADOW_XINLINE static DType Map(DType ograd, DType out, DType sum) {
     return DType(ograd - expf(out)*sum);
   }
 };


 template<typename OP1, typename OP2, typename DType, int ndim>
 inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
                         DType *igrad, Shape<ndim> shape, int axis) {
   index_t M = shape[axis];
   index_t N = shape.Size()/M;
   Shape<ndim> stride = calc_stride(shape);
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
   index_t sa = stride[axis];

   #pragma omp parallel for
   for (int i = 0; i < N; ++i) {
     index_t base = unravel_dot(i, sshape, stride);

     DType sum = DType(0);
     for (index_t j = 0; j < M; ++j) {
       sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
     }

     for (index_t j = 0; j < M; ++j) {
       igrad[base + j*sa] = OP2::Map(ograd[base + j*sa], out[base + j*sa], sum);
     }
   }
 }


 #ifdef __CUDACC__
 template<int x_bits, typename OP, typename DType, int ndim>
 __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axis,
                                        Shape<ndim> sshape, Shape<ndim> stride) {
   const unsigned x_size = 1 << x_bits;
   __shared__ DType smem[x_size];
   index_t sa = stride[axis];
   index_t base = unravel_dot(blockIdx.x, sshape, stride);
   index_t x = threadIdx.x;

   red::maximum::SetInitValue(smem[x]);
   for (index_t i = x; i < M; i += x_size) {
     red::maximum::Reduce(smem[x], in[base + i*sa]);
   }
   __syncthreads();
   cuda::Reduce1D<red::maximum, x_bits>(smem);
   __syncthreads();
   DType smax = smem[0];
   __syncthreads();

   red::sum::SetInitValue(smem[x]);
   for (index_t i = x; i < M; i += x_size) {
     red::sum::Reduce(smem[x], static_cast<DType>(expf(in[base + i*sa] - smax)));
   }
   __syncthreads();
   cuda::Reduce1D<red::sum, x_bits>(smem);
   __syncthreads();
   DType ssum = smem[0];
   __syncthreads();

   for (index_t i = x; i < M; i += x_size) {
     out[base + i*sa] = OP::Map(in[base + i*sa] - smax, ssum);
   }
 }

 template<typename OP, typename DType, int ndim>
 inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
                     Shape<ndim> shape, int axis) {
   const int x_bits = 7;
   const int x_size = 1 << x_bits;
   index_t M = shape[axis];
   index_t N = shape.Size()/M;
   Shape<ndim> stride = calc_stride(shape);
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;

   softmax_compute_kernel<x_bits, OP, DType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       in, out, M, axis, sshape, stride);
 }


 template<int x_bits, typename OP1, typename OP2, typename DType, int ndim>
 __global__ void softmax_gradient_kernel(DType *out, DType *ograd, DType *igrad,
                                         index_t M, int axis, Shape<ndim> sshape,
                                         Shape<ndim> stride) {
   const unsigned x_size = 1 << x_bits;
   __shared__ DType smem[x_size];
   index_t sa = stride[axis];
   index_t base = unravel_dot(blockIdx.x, sshape, stride);
   index_t x = threadIdx.x;

   red::sum::SetInitValue(smem[x]);
   for (index_t i = x; i < M; i += x_size) {
     red::sum::Reduce(smem[x], OP1::Map(ograd[base + i*sa], out[base + i*sa]));
   }
   __syncthreads();
   cuda::Reduce1D<red::sum, x_bits>(smem);
   __syncthreads();
   DType ssum = smem[0];
   __syncthreads();

   for (index_t i = x; i < M; i += x_size) {
     igrad[base + i*sa] = OP2::Map(ograd[base + i*sa], out[base + i*sa], ssum);
   }
 }


 template<typename OP1, typename OP2, typename DType, int ndim>
 inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
                         DType *igrad, Shape<ndim> shape, int axis) {
   const int x_bits = 7;
   const int x_size = 1 << x_bits;
   index_t M = shape[axis];
   index_t N = shape.Size()/M;
   Shape<ndim> stride = calc_stride(shape);
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;

   softmax_gradient_kernel<x_bits, OP1, OP2, DType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       out, ograd, igrad, M, axis, sshape, stride);
 }
 #endif

 }  // namespace mxnet_op


 struct SoftmaxParam : public dmlc::Parameter<SoftmaxParam> {
   int axis;
   DMLC_DECLARE_PARAMETER(SoftmaxParam) {
     DMLC_DECLARE_FIELD(axis).set_default(-1)
       .describe("The axis along which to compute softmax.");
   }
 };

 template<typename xpu, typename OP>
 void SoftmaxCompute(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
                     const std::vector<TBlob>& inputs,
                     const std::vector<OpReqType>& req,
                     const std::vector<TBlob>& outputs) {
   using namespace mxnet_op;
   if (req[0] == kNullOp) return;
   CHECK_NE(req[0], kAddTo);
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   int axis = CheckAxis(param.axis, inputs[0].ndim());
   TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (shape.ndim() == 2) {
       Softmax<OP>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
               outputs[0].dptr<DType>(), shape.get<2>(), axis);
     } else {
       Softmax<OP>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
               outputs[0].dptr<DType>(), shape.get<3>(), axis);
     }
   });
 }


 template<typename xpu, typename OP1, typename OP2>
 void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs,
                         const OpContext& ctx,
                         const std::vector<TBlob>& inputs,
                         const std::vector<OpReqType>& req,
                         const std::vector<TBlob>& outputs) {
   using namespace mxnet_op;
   if (req[0] == kNullOp) return;
   CHECK_NE(req[0], kAddTo);
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   int axis = CheckAxis(param.axis, inputs[0].ndim());
   TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
   MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (shape.ndim() == 2) {
       SoftmaxGrad<OP1, OP2>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
                             inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
                             shape.get<2>(), axis);
     } else {
       SoftmaxGrad<OP1, OP2>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
                             inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
                             shape.get<3>(), axis);
     }
   });
 }

 }  // namespace op
 }  // namespace mxnet

 #endif  // MXNET_OPERATOR_NN_SOFTMAX_INL_H_
	/*!
	* Copyright (c) 2017 by Contributors
	* \file softmax-inl.h
	* \brief
	*/
	#ifndef MXNET_OPERATOR_NN_SOFTMAX_INL_H_
	#define MXNET_OPERATOR_NN_SOFTMAX_INL_H_

	#include <vector>

	#include "../mxnet_op.h"
	#include "../operator_common.h"
	#include "../tensor/broadcast_reduce_op.h"

	namespace mxnet {
	namespace op {
	namespace mxnet_op {

	struct softmax_fwd {
	template<typename DType>
	MSHADOW_XINLINE static DType Map(DType a, DType b) {
	return DType(expf(a)/b);
	}
	};


	struct log_softmax_fwd {
	template<typename DType>
	MSHADOW_XINLINE static DType Map(DType a, DType b) {
	return DType(a - logf(b));
	}
	};


	template<typename OP, typename DType, int ndim>
	inline void Softmax(Stream<cpu> s, DType in, DType *out,
	Shape<ndim> shape, int axis) {
	index_t M = shape[axis];
	index_t N = shape.Size()/M;
	Shape<ndim> stride = calc_stride(shape);
	Shape<ndim> sshape = shape;
	sshape[axis] = 1;
	index_t sa = stride[axis];

	#pragma omp parallel for
	for (int i = 0; i < N; ++i) {
	index_t base = unravel_dot(i, sshape, stride);

	DType mmax = in[base];
	for (index_t j = 1; j < M; ++j) {
	if (mmax < in[base + jsa]) mmax = in[base + jsa];
	}

	DType sum = DType(0);
	for (index_t j = 0; j < M; ++j) {
	sum += std::exp(in[base + j*sa] - mmax);
	}

	for (index_t j = 0; j < M; ++j) {
	out[base + jsa] = OP::Map(in[base + jsa] - mmax, sum);
	}
	}
	}


	struct softmax_bwd {
	template<typename DType>
	MSHADOW_XINLINE static DType Map(DType ograd, DType out, DType sum) {
	return DType(out * (ograd - sum));
	}
	};


	struct log_softmax_bwd {
	template<typename DType>
	MSHADOW_XINLINE static DType Map(DType ograd, DType out, DType sum) {
	return DType(ograd - expf(out)*sum);
	}
	};


	template<typename OP1, typename OP2, typename DType, int ndim>
	inline void SoftmaxGrad(Stream<cpu> s, DType out, DType *ograd,
	DType *igrad, Shape<ndim> shape, int axis) {
	index_t M = shape[axis];
	index_t N = shape.Size()/M;
	Shape<ndim> stride = calc_stride(shape);
	Shape<ndim> sshape = shape;
	sshape[axis] = 1;
	index_t sa = stride[axis];

	#pragma omp parallel for
	for (int i = 0; i < N; ++i) {
	index_t base = unravel_dot(i, sshape, stride);

	DType sum = DType(0);
	for (index_t j = 0; j < M; ++j) {
	sum += OP1::Map(ograd[base + jsa], out[base + jsa]);
	}

	for (index_t j = 0; j < M; ++j) {
	igrad[base + jsa] = OP2::Map(ograd[base + jsa], out[base + j*sa], sum);
	}
	}
	}


	#ifdef __CUDACC__
	template<int x_bits, typename OP, typename DType, int ndim>
	__global__ void softmax_compute_kernel(DType in, DType out, index_t M, int axis,
	Shape<ndim> sshape, Shape<ndim> stride) {
	const unsigned x_size = 1 << x_bits;
	__shared__ DType smem[x_size];
	index_t sa = stride[axis];
	index_t base = unravel_dot(blockIdx.x, sshape, stride);
	index_t x = threadIdx.x;

	red::maximum::SetInitValue(smem[x]);
	for (index_t i = x; i < M; i += x_size) {
	red::maximum::Reduce(smem[x], in[base + i*sa]);
	}
	__syncthreads();
	cuda::Reduce1D<red::maximum, x_bits>(smem);
	__syncthreads();
	DType smax = smem[0];
	__syncthreads();

	red::sum::SetInitValue(smem[x]);
	for (index_t i = x; i < M; i += x_size) {
	red::sum::Reduce(smem[x], static_cast<DType>(expf(in[base + i*sa] - smax)));
	}
	__syncthreads();
	cuda::Reduce1D<red::sum, x_bits>(smem);
	__syncthreads();
	DType ssum = smem[0];
	__syncthreads();

	for (index_t i = x; i < M; i += x_size) {
	out[base + isa] = OP::Map(in[base + isa] - smax, ssum);
	}
	}

	template<typename OP, typename DType, int ndim>
	inline void Softmax(Stream<gpu> s, DType in, DType *out,
	Shape<ndim> shape, int axis) {
	const int x_bits = 7;
	const int x_size = 1 << x_bits;
	index_t M = shape[axis];
	index_t N = shape.Size()/M;
	Shape<ndim> stride = calc_stride(shape);
	Shape<ndim> sshape = shape;
	sshape[axis] = 1;

	softmax_compute_kernel<x_bits, OP, DType, ndim>
	<<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
	in, out, M, axis, sshape, stride);
	}


	template<int x_bits, typename OP1, typename OP2, typename DType, int ndim>
	__global__ void softmax_gradient_kernel(DType out, DType ograd, DType *igrad,
	index_t M, int axis, Shape<ndim> sshape,
	Shape<ndim> stride) {
	const unsigned x_size = 1 << x_bits;
	__shared__ DType smem[x_size];
	index_t sa = stride[axis];
	index_t base = unravel_dot(blockIdx.x, sshape, stride);
	index_t x = threadIdx.x;

	red::sum::SetInitValue(smem[x]);
	for (index_t i = x; i < M; i += x_size) {
	red::sum::Reduce(smem[x], OP1::Map(ograd[base + isa], out[base + isa]));
	}
	__syncthreads();
	cuda::Reduce1D<red::sum, x_bits>(smem);
	__syncthreads();
	DType ssum = smem[0];
	__syncthreads();

	for (index_t i = x; i < M; i += x_size) {
	igrad[base + isa] = OP2::Map(ograd[base + isa], out[base + i*sa], ssum);
	}
	}


	template<typename OP1, typename OP2, typename DType, int ndim>
	inline void SoftmaxGrad(Stream<gpu> s, DType out, DType *ograd,
	DType *igrad, Shape<ndim> shape, int axis) {
	const int x_bits = 7;
	const int x_size = 1 << x_bits;
	index_t M = shape[axis];
	index_t N = shape.Size()/M;
	Shape<ndim> stride = calc_stride(shape);
	Shape<ndim> sshape = shape;
	sshape[axis] = 1;

	softmax_gradient_kernel<x_bits, OP1, OP2, DType, ndim>
	<<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
	out, ograd, igrad, M, axis, sshape, stride);
	}
	#endif

	} // namespace mxnet_op


	struct SoftmaxParam : public dmlc::Parameter<SoftmaxParam> {
	int axis;
	DMLC_DECLARE_PARAMETER(SoftmaxParam) {
	DMLC_DECLARE_FIELD(axis).set_default(-1)
	.describe("The axis along which to compute softmax.");
	}
	};

	template<typename xpu, typename OP>
	void SoftmaxCompute(const nnvm::NodeAttrs& attrs,
	const OpContext& ctx,
	const std::vector<TBlob>& inputs,
	const std::vector<OpReqType>& req,
	const std::vector<TBlob>& outputs) {
	using namespace mxnet_op;
	if (req[0] == kNullOp) return;
	CHECK_NE(req[0], kAddTo);
	const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
	int axis = CheckAxis(param.axis, inputs[0].ndim());
	TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
	MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
	if (shape.ndim() == 2) {
	Softmax<OP>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
	outputs[0].dptr<DType>(), shape.get<2>(), axis);
	} else {
	Softmax<OP>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
	outputs[0].dptr<DType>(), shape.get<3>(), axis);
	}
	});
	}


	template<typename xpu, typename OP1, typename OP2>
	void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs,
	const OpContext& ctx,
	const std::vector<TBlob>& inputs,
	const std::vector<OpReqType>& req,
	const std::vector<TBlob>& outputs) {
	using namespace mxnet_op;
	if (req[0] == kNullOp) return;
	CHECK_NE(req[0], kAddTo);
	const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
	int axis = CheckAxis(param.axis, inputs[0].ndim());
	TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
	MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
	if (shape.ndim() == 2) {
	SoftmaxGrad<OP1, OP2>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
	inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
	shape.get<2>(), axis);
	} else {
	SoftmaxGrad<OP1, OP2>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
	inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
	shape.get<3>(), axis);
	}
	});
	}

	} // namespace op
	} // namespace mxnet

	#endif // MXNET_OPERATOR_NN_SOFTMAX_INL_H_