/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file softmax_output.cc
* \brief SoftmaxOutput operator registration, shape/type inference, and CPU compute dispatch
* \author Bing Xu, Zhang Rong A
*/
#include "./softmax_output-inl.h"
#if MXNET_USE_ONEDNN == 1
#include "operator/nn/dnnl/dnnl_base-inl.h"
#include "operator/nn/dnnl/dnnl_softmax_output-inl.h"
#endif
namespace mxnet {
namespace op {
DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
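// Gradient construction for SoftmaxOutput. The backward node takes the forward
// softmax output and the label as its inputs and produces one gradient entry per
// forward input (data and label); the actual gradient, output - label, is computed
// by _backward_SoftmaxOutput.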
struct SoftmaxOutputGrad {
const char* op_name;
std::vector<nnvm::NodeEntry> operator()(const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>& ograds) const {
std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
for (uint32_t i = 0; i < out_data.size(); ++i) {
out_data[i] = nnvm::NodeEntry{n, i, 0};
}
std::vector<nnvm::NodeEntry> heads;
heads.push_back(out_data[softmaxout_enum::kOut]);
heads.push_back(n->inputs[softmaxout_enum::kLabel]);
nnvm::ObjectPtr gnode = nnvm::Node::Create();
gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
gnode->attrs = n->attrs;
gnode->attrs.op = nnvm::Op::Get("_backward_SoftmaxOutput");
gnode->attrs.name = n->attrs.name + "_backward";
std::vector<nnvm::NodeEntry> in_grad(2);
in_grad[0] = nnvm::NodeEntry{gnode, 0, 0};
in_grad[1] = nnvm::NodeEntry{gnode, 1, 0};
return in_grad;
}
};
static inline std::vector<std::string> ListArguments() {
return {"data", "label"};
}
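// Type inference: take the dtype from the data input when it is known, otherwise
// infer it backward from the output dtype, then propagate the resolved dtype to
// both inputs and the single output.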
static bool SoftmaxOutputType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_type,
std::vector<int>* out_type) {
CHECK_EQ(in_type->size(), 2U);
int dtype = (*in_type)[0];
if (type_is_none(dtype)) {
// Input type is undefined; try backward inference from the output type
if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
// Neither the input nor the output type is defined,
// so the type cannot be inferred for this op
return false;
} else {
// Input type is undefined but output type is: backward inference
dtype = (*out_type)[0];
}
} else {
// Input type is defined but output type is not: forward inference
out_type->clear();
out_type->push_back(dtype);
}
for (size_t i = 0; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype;
} else {
UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
}
}
return true;
}
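// Shape inference: the output always has the same shape as the data input. The
// label shape either matches the data shape (probability labels) or is a reduced
// shape derived from it, depending on `multi_output`.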
static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector* in_shape,
mxnet::ShapeVector* out_shape) {
using namespace mshadow;
const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
const mxnet::TShape& dshape = in_shape->at(0);
if (!mxnet::ndim_is_known(dshape))
return false;
// label.shape == data.shape: use probability as label
if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
if (param.multi_output) {
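// With multi_output=true, three label layouts are accepted for data of shape
// (d0, d1, d2, ..., dn):
//   lshape1: (d0, d2*...*dn)       - flattened labels
//   lshape2: (d0, d2, ..., dn)     - class axis (axis 1) removed
//   lshape3: (d0, 1, d2, ..., dn)  - class axis kept with size 1
// The empty else-if branches below simply accept labels that already match one
// of these shapes.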
mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size() / dshape[0] / dshape[1]);
mxnet::TShape lshape2(dshape.ndim() - 1, -1);
lshape2[0] = dshape[0];
for (int i = 2; i < dshape.ndim(); ++i)
lshape2[i - 1] = dshape[i];
mxnet::TShape lshape3 = dshape;
lshape3[1] = 1;
if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) {
in_shape->at(softmaxout_enum::kLabel) = lshape1;
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
} else {
std::ostringstream os;
os << "Expecting " << lshape1 << " or " << lshape2 << ". But got "
<< in_shape->at(softmaxout_enum::kLabel);
throw InferShapeError(os.str(), softmaxout_enum::kLabel);
}
} else {
mxnet::TShape label_shape(dshape.ndim() - 1, -1);
for (int i = 0; i + 1 < dshape.ndim(); ++i)
label_shape[i] = dshape[i];
SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
}
}
out_shape->clear();
out_shape->push_back(dshape);
return true;
}
#if MXNET_USE_ONEDNN == 1
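// Storage-type inference for the oneDNN build; defers to DNNLStorageType so that
// DNNL-compatible storage can be selected for the inputs and output.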
inline static bool SoftmaxOutputStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 2);
CHECK_EQ(out_attrs->size(), 1);
return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
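// FComputeEx for CPU: runs the oneDNN softmax-output kernel during inference when
// the parameters and input are supported; otherwise falls back to the default
// SoftmaxOutputCompute<cpu> implementation.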
void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 2U);
const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
if (SupportDNNLSoftmaxOutput(param, inputs[0]) && !ctx.is_train) {
DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
DNNLRun(DNNLSoftmaxOutputForward, attrs, ctx, inputs, req, outputs);
DNNL_OPCHECK_RUN(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
FallBackCompute(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
#endif
NNVM_REGISTER_OP(SoftmaxOutput)
.describe(R"code(Computes the gradient of cross entropy loss with respect to softmax output.
- This operator computes the gradient in two steps.
  The cross entropy loss itself does not actually need to be computed.
  - Applies the softmax function on the input array.
  - Computes and returns the gradient of the cross entropy loss w.r.t. the softmax output.
- The softmax function, cross entropy loss, and gradient are given by:
- Softmax Function:
.. math:: \text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
- Cross Entropy Function:
.. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)
- The gradient of the cross entropy loss w.r.t. the softmax output:
.. math:: \text{gradient} = \text{output} - \text{label}
- During forward propagation, the softmax function is computed for each instance in the input array.
  For a general *N*-D input array with shape :math:`(d_1, d_2, ..., d_n)`, the total size is
  :math:`s = d_1 \cdot d_2 \cdots d_n`. The parameters `preserve_shape` and `multi_output`
  specify how the softmax is computed:
- By default, `preserve_shape` is ``false``. This operator will reshape the input array
into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})` and then compute the softmax function for
each row in the reshaped array, and afterwards reshape it back to the original shape
:math:`(d_1, d_2, ..., d_n)`.
- If `preserve_shape` is ``true``, the softmax function will be computed along
the last axis (`axis` = ``-1``).
- If `multi_output` is ``true``, the softmax function will be computed along
the second axis (`axis` = ``1``).
- During backward propagation, the gradient of the cross-entropy loss w.r.t. the softmax output array is computed.
The provided label can be a one-hot label array or a probability label array.
- If the parameter `use_ignore` is ``true``, `ignore_label` can specify input instances
with a particular label to be ignored during backward propagation. **This has no effect when
softmax `output` has the same shape as `label`**.
Example::
data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
label = [1,0,2,3]
ignore_label = 1
SoftmaxOutput(data=data, label=label,\
multi_output=true, use_ignore=true,\
ignore_label=ignore_label)
## forward softmax output
[[ 0.0320586 0.08714432 0.23688284 0.64391428]
[ 0.25 0.25 0.25 0.25 ]
[ 0.25 0.25 0.25 0.25 ]
[ 0.25 0.25 0.25 0.25 ]]
## backward gradient output
[[ 0. 0. 0. 0. ]
[-0.75 0.25 0.25 0.25]
[ 0.25 0.25 -0.75 0.25]
[ 0.25 0.25 0.25 -0.75]]
## notice that the first row is all 0 because label[0] is 1, which is equal to ignore_label.
- The parameter `grad_scale` can be used to rescale the gradient, which is often useful
  when assigning a different weight to each loss function.
- This operator also supports several ways to normalize the gradient via the `normalization`
  parameter. Normalization is applied if the softmax output has a different shape than the labels.
  The `normalization` mode can be set to one of the following:
- ``'null'``: do nothing.
- ``'batch'``: divide the gradient by the batch size.
- ``'valid'``: divide the gradient by the number of instances which are not ignored.
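For example, with a batch of 4 instances of which 1 carries `ignore_label` (and `use_ignore` is
``true``), ``'batch'`` divides the gradient by 4 while ``'valid'`` divides it by 3.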
)code" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SoftmaxOutputParam>)
#if MXNET_USE_ONEDNN == 1
.set_attr<FInferStorageType>("FInferStorageType", SoftmaxOutputStorageType)
.set_attr<bool>("TIsDNNL", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxOutputComputeExCPU)
#endif
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"data", "label"};
})
.set_attr<nnvm::FListOutputNames>("FListOutputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"output"};
})
.set_attr<mxnet::FInferShape>("FInferShape", SoftmaxOutputShape)
.set_attr<nnvm::FInferType>("FInferType", SoftmaxOutputType)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputCompute<cpu>)
.set_attr<nnvm::FGradient>("FGradient", SoftmaxOutputGrad{"_backward_SoftmaxOutput"})
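// Allow the output to share memory with the data input (in-place computation).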
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
.add_argument("data", "NDArray-or-Symbol", "Input array.")
.add_argument("label", "NDArray-or-Symbol", "Ground truth label.")
.add_arguments(SoftmaxOutputParam::__FIELDS__());
// The Softmax symbol was renamed to SoftmaxOutput and has been deprecated since Dec 2015.
NNVM_REGISTER_OP(SoftmaxOutput).add_alias("Softmax");
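// Backward operator: consumes the forward softmax output and the label (see
// SoftmaxOutputGrad above) and produces the gradients w.r.t. data and label.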
NNVM_REGISTER_OP(_backward_SoftmaxOutput)
.set_num_inputs(2)
.set_num_outputs(2)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
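// The data gradient may be computed in place, reusing the buffer of the first input.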
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr_parser(ParamParser<SoftmaxOutputParam>)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputGradCompute<cpu>);
} // namespace op
} // namespace mxnet