/*!
 * Copyright (c) 2015 by Contributors
 * \file softmax_output.cc
 * \brief CPU registration of the SoftmaxOutput operator
 * \author Bing Xu
 */
#include "./softmax_output-inl.h"

namespace mxnet {
namespace op {
// Instantiate the CPU operator for the requested data type.
template<>
Operator *CreateOp<cpu>(SoftmaxOutputParam param, int dtype) {
  Operator *op = NULL;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
    op = new SoftmaxOutputOp<cpu, DType>(param);
  })
  return op;
}

// DO_BIND_DISPATCH comes from operator_common.h.
// Infer output types and shapes, then dispatch to the dtype-specialized
// CreateOp above.
Operator *SoftmaxOutputProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                              std::vector<int> *in_type) const {
  std::vector<TShape> out_shape, aux_shape;
  std::vector<int> out_type, aux_type;
  CHECK(InferType(in_type, &out_type, &aux_type));
  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
}

DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);

MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp)
.describe(R"code(Computes the gradient of cross entropy loss with respect to softmax output.

- This operator computes the gradient in two steps.
  The cross entropy loss does not actually need to be computed.

  - Applies softmax function on the input array.
  - Computes and returns the gradient of cross entropy loss w.r.t. the softmax output.

- The softmax function, cross entropy loss and gradient are given by:

  - Softmax Function:

    .. math:: \text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}

  - Cross Entropy Function:

    .. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)

  - The gradient of cross entropy loss w.r.t. softmax output:

    .. math:: \text{gradient} = \text{output} - \text{label}
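
  For example, with softmax output :math:`[0.25, 0.25, 0.25, 0.25]` and one-hot
  label :math:`[0, 1, 0, 0]`, the gradient is :math:`[0.25, -0.75, 0.25, 0.25]`.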

- During forward propagation, the softmax function is computed for each instance in the input array.

  For a general *N*-D input array with shape :math:`(d_1, d_2, ..., d_n)`, the size is
  :math:`s = d_1 \cdot d_2 \cdots d_n`. The parameters `preserve_shape` and `multi_output`
  specify the way to compute softmax (a shape sketch follows this list):

  - By default, `preserve_shape` is ``false``. This operator will reshape the input array
    into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})`, compute the softmax function for
    each row in the reshaped array, and afterwards reshape it back to the original shape
    :math:`(d_1, d_2, ..., d_n)`.
  - If `preserve_shape` is ``true``, the softmax function will be computed along
    the last axis (`axis` = ``-1``).
  - If `multi_output` is ``true``, the softmax function will be computed along
    the second axis (`axis` = ``1``).
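
  A minimal shape sketch (the :math:`(2, 3, 4)` input shape below is hypothetical,
  chosen only for illustration)::

    data.shape = (2, 3, 4)   # s = 2*3*4 = 24
    # preserve_shape=false (default): reshape to (2, 12), compute softmax
    # over each of the 2 rows, then reshape back to (2, 3, 4)
    # preserve_shape=true: softmax along the last axis, i.e. over each
    # of the 2*3 = 6 vectors of length 4
    # multi_output=true: softmax along axis 1, i.e. over each of the
    # 2*4 = 8 vectors of length 3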

- During backward propagation, the gradient of cross-entropy loss w.r.t. the softmax output array
  is computed. The provided label can be a one-hot label array or a probability label array.

  - If the parameter `use_ignore` is ``true``, `ignore_label` can specify input instances
    with a particular label to be ignored during backward propagation. **This has no effect when
    softmax `output` has the same shape as `label`**.

    Example::

      data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
      label = [1,0,2,3]
      ignore_label = 1
      SoftmaxOutput(data=data, label=label, multi_output=true,
                    use_ignore=true, ignore_label=ignore_label)
      ## forward softmax output
      [[ 0.0320586   0.08714432  0.23688284  0.64391428]
       [ 0.25        0.25        0.25        0.25      ]
       [ 0.25        0.25        0.25        0.25      ]
       [ 0.25        0.25        0.25        0.25      ]]
      ## backward gradient output
      [[ 0.    0.    0.    0.  ]
       [-0.75  0.25  0.25  0.25]
       [ 0.25  0.25 -0.75  0.25]
       [ 0.25  0.25  0.25 -0.75]]
      ## Notice that the first row is all 0 because label[0] is 1, which equals ignore_label.

- The parameter `grad_scale` can be used to rescale the gradient, which is often used to
  give each loss function different weights.
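
  For instance, a simple arithmetic sketch reusing the example above: with
  ``grad_scale=0.5`` every gradient entry is halved, so the second row of the
  backward output becomes ``[-0.375, 0.125, 0.125, 0.125]``.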

- This operator also supports various ways to normalize the gradient via the `normalization`
  parameter. Normalization is applied if the softmax output has a different shape than the
  labels. The `normalization` mode can be set to one of the following (worked numbers follow
  this list):

  - ``'null'``: do nothing.
  - ``'batch'``: divide the gradient by the batch size.
  - ``'valid'``: divide the gradient by the number of instances which are not ignored.
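
  Continuing the example above (4 instances, one of which is ignored because its label
  equals `ignore_label`): ``'batch'`` divides every gradient entry by 4, while ``'valid'``
  divides it by 3, so the second row becomes ``[-0.25, 0.0833, 0.0833, 0.0833]``.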

)code" ADD_FILELINE)
.add_argument("data", "NDArray-or-Symbol", "Input array.")
.add_argument("label", "NDArray-or-Symbol", "Ground truth label.")
.add_arguments(SoftmaxOutputParam::__FIELDS__());


MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp)
.describe(R"code(Please use `SoftmaxOutput`.

.. note::

  This operator has been renamed to `SoftmaxOutput`, which
  computes the gradient of cross-entropy loss w.r.t. softmax output.
  To just compute softmax output, use the `softmax` operator.

)code" ADD_FILELINE)
.add_argument("data", "NDArray-or-Symbol", "Input array.")
.add_arguments(SoftmaxOutputParam::__FIELDS__());

}  // namespace op
}  // namespace mxnet