/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file softmax_output.cc
* \brief SoftmaxOutput operator registration, shape/type inference, and CPU compute dispatch
* \author Bing Xu, Zhang Rong A
*/
#include "./softmax_output-inl.h"
#if MXNET_USE_ONEDNN == 1
#include "operator/nn/dnnl/dnnl_base-inl.h"
#include "operator/nn/dnnl/dnnl_softmax_output-inl.h"
#endif
namespace mxnet {
namespace op {
DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
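// Gradient construction for SoftmaxOutput. The backward node takes the forward
// softmax output and the label as its inputs and produces one gradient entry per
// forward input (data and label); the actual gradient, output - label, is computed
// by _backward_SoftmaxOutput.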
struct SoftmaxOutputGrad {
const char* op_name;
std::vector<nnvm::NodeEntry> operator()(const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>& ograds) const {
std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
for (uint32_t i = 0; i < out_data.size(); ++i) {
out_data[i] = nnvm::NodeEntry{n, i, 0};
}
std::vector<nnvm::NodeEntry> heads;
heads.push_back(out_data[softmaxout_enum::kOut]);
heads.push_back(n->inputs[softmaxout_enum::kLabel]);
nnvm::ObjectPtr gnode = nnvm::Node::Create();
gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
gnode->attrs = n->attrs;
gnode->attrs.op = nnvm::Op::Get("_backward_SoftmaxOutput");
gnode->attrs.name = n->attrs.name + "_backward";
std::vector<nnvm::NodeEntry> in_grad(2);
in_grad[0] = nnvm::NodeEntry{gnode, 0, 0};
in_grad[1] = nnvm::NodeEntry{gnode, 1, 0};
return in_grad;
}
};
static inline std::vector<std::string> ListArguments() {
return {"data", "label"};
}
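// Type inference: take the dtype from the data input when it is known, otherwise
// infer it backward from the output dtype, then propagate the resolved dtype to
// both inputs and the single output.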
static bool SoftmaxOutputType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_type,
std::vector<int>* out_type) {
CHECK_EQ(in_type->size(), 2U);
int dtype = (*in_type)[0];
if (type_is_none(dtype)) {
// Input type is undefined; try backward inference from the output type
if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
// Neither the input nor the output type is defined,
// so the type cannot be inferred for this op
return false;
} else {
// Input type is undefined but output type is: backward inference
dtype = (*out_type)[0];
}
} else {
// Input type is defined but output type is not: forward inference
out_type->clear();
out_type->push_back(dtype);
}
for (size_t i = 0; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype;
} else {
UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
}
}
return true;
}
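// Shape inference: the output always has the same shape as the data input. The
// label shape either matches the data shape (probability labels) or is a reduced
// shape derived from it, depending on `multi_output`.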
static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector* in_shape,
mxnet::ShapeVector* out_shape) {
using namespace mshadow;
const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
const mxnet::TShape& dshape = in_shape->at(0);
if (!mxnet::ndim_is_known(dshape))
return false;
// label.shape == data.shape: use probability as label
if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
if (param.multi_output) {
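// With multi_output=true, three label layouts are accepted for data of shape
// (d0, d1, d2, ..., dn):
//   lshape1: (d0, d2*...*dn)       - flattened labels
//   lshape2: (d0, d2, ..., dn)     - class axis (axis 1) removed
//   lshape3: (d0, 1, d2, ..., dn)  - class axis kept with size 1
// The empty else-if branches below simply accept labels that already match one
// of these shapes.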
mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size() / dshape[0] / dshape[1]);
mxnet::TShape lshape2(dshape.ndim() - 1, -1);
lshape2[0] = dshape[0];
for (int i = 2; i < dshape.ndim(); ++i)
lshape2[i - 1] = dshape[i];
mxnet::TShape lshape3 = dshape;
lshape3[1] = 1;
if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) {
in_shape->at(softmaxout_enum::kLabel) = lshape1;
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
} else {
std::ostringstream os;
os << "Expecting " << lshape1 << " or " << lshape2 << ". But got "
<< in_shape->at(softmaxout_enum::kLabel);
throw InferShapeError(os.str(), softmaxout_enum::kLabel);
}
} else {
mxnet::TShape label_shape(dshape.ndim() - 1, -1);
for (int i = 0; i + 1 < dshape.ndim(); ++i)
label_shape[i] = dshape[i];
SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
}
}
out_shape->clear();
out_shape->push_back(dshape);
return true;
}
#if MXNET_USE_ONEDNN == 1
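// Storage-type inference for the oneDNN build; defers to DNNLStorageType so that
// DNNL-compatible storage can be selected for the inputs and output.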
inline static bool SoftmaxOutputStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 2);
CHECK_EQ(out_attrs->size(), 1);
return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
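// FComputeEx for CPU: runs the oneDNN softmax-output kernel during inference when
// the parameters and input are supported; otherwise falls back to the default
// SoftmaxOutputCompute<cpu> implementation.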
void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 2U);
const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
if (SupportDNNLSoftmaxOutput(param, inputs[0]) && !ctx.is_train) {
DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
DNNLRun(DNNLSoftmaxOutputForward, attrs, ctx, inputs, req, outputs);
DNNL_OPCHECK_RUN(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
FallBackCompute(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
#endif
NNVM_REGISTER_OP(SoftmaxOutput)
.describe(R"code(Computes the gradient of cross entropy loss with respect to softmax output.
- This operator computes the gradient in two steps.
  The cross entropy loss itself does not actually need to be computed.
  - Applies the softmax function on the input array.
  - Computes and returns the gradient of the cross entropy loss w.r.t. the softmax output.
- The softmax function, cross entropy loss, and gradient are given by:
- Softmax Function:
.. math:: \text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
- Cross Entropy Function:
.. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)
- The gradient of the cross entropy loss w.r.t. the softmax output:
.. math:: \text{gradient} = \text{output} - \text{label}
- During forward propagation, the softmax function is computed for each instance in the input array.
  For a general *N*-D input array with shape :math:`(d_1, d_2, ..., d_n)`, the total size is
  :math:`s = d_1 \cdot d_2 \cdots d_n`. The parameters `preserve_shape` and `multi_output`
  specify how the softmax is computed:
- By default, `preserve_shape` is ``false``. This operator will reshape the input array
into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})` and then compute the softmax function for
each row in the reshaped array, and afterwards reshape it back to the original shape
:math:`(d_1, d_2, ..., d_n)`.
- If `preserve_shape` is ``true``, the softmax function will be computed along
the last axis (`axis` = ``-1``).
- If `multi_output` is ``true``, the softmax function will be computed along
the second axis (`axis` = ``1``).
- During backward propagation, the gradient of the cross-entropy loss w.r.t. the softmax output array is computed.
The provided label can be a one-hot label array or a probability label array.
- If the parameter `use_ignore` is ``true``, `ignore_label` can specify input instances
with a particular label to be ignored during backward propagation. **This has no effect when
softmax `output` has the same shape as `label`**.
Example::
data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
label = [1,0,2,3]
ignore_label = 1
SoftmaxOutput(data=data, label=label,\
multi_output=true, use_ignore=true,\
ignore_label=ignore_label)
## forward softmax output
[[ 0.0320586 0.08714432 0.23688284 0.64391428]
[ 0.25 0.25 0.25 0.25 ]
[ 0.25 0.25 0.25 0.25 ]
[ 0.25 0.25 0.25 0.25 ]]
## backward gradient output
[[ 0. 0. 0. 0. ]
[-0.75 0.25 0.25 0.25]
[ 0.25 0.25 -0.75 0.25]
[ 0.25 0.25 0.25 -0.75]]
## notice that the first row is all 0 because label[0] is 1, which is equal to ignore_label.
- The parameter `grad_scale` can be used to rescale the gradient, which is often useful
  when assigning a different weight to each loss function.
- This operator also supports several ways to normalize the gradient via the `normalization`
  parameter. Normalization is applied if the softmax output has a different shape than the labels.
  The `normalization` mode can be set to one of the following:
- ``'null'``: do nothing.
- ``'batch'``: divide the gradient by the batch size.
- ``'valid'``: divide the gradient by the number of instances which are not ignored.
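For example, with a batch of 4 instances of which 1 carries `ignore_label` (and `use_ignore` is
``true``), ``'batch'`` divides the gradient by 4 while ``'valid'`` divides it by 3.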
)code" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SoftmaxOutputParam>)
#if MXNET_USE_ONEDNN == 1
.set_attr<FInferStorageType>("FInferStorageType", SoftmaxOutputStorageType)
.set_attr<bool>("TIsDNNL", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxOutputComputeExCPU)
#endif
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"data", "label"};
})
.set_attr<nnvm::FListOutputNames>("FListOutputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"output"};
})
.set_attr<mxnet::FInferShape>("FInferShape", SoftmaxOutputShape)
.set_attr<nnvm::FInferType>("FInferType", SoftmaxOutputType)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputCompute<cpu>)
.set_attr<nnvm::FGradient>("FGradient", SoftmaxOutputGrad{"_backward_SoftmaxOutput"})
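// Allow the output to share memory with the data input (in-place computation).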
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
.add_argument("data", "NDArray-or-Symbol", "Input array.")
.add_argument("label", "NDArray-or-Symbol", "Ground truth label.")
.add_arguments(SoftmaxOutputParam::__FIELDS__());
// The Softmax symbol was renamed to SoftmaxOutput and has been deprecated since Dec 2015.
NNVM_REGISTER_OP(SoftmaxOutput).add_alias("Softmax");
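// Backward operator: consumes the forward softmax output and the label (see
// SoftmaxOutputGrad above) and produces the gradients w.r.t. data and label.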
NNVM_REGISTER_OP(_backward_SoftmaxOutput)
.set_num_inputs(2)
.set_num_outputs(2)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
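// The data gradient may be computed in place, reusing the buffer of the first input.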
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr_parser(ParamParser<SoftmaxOutputParam>)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputGradCompute<cpu>);
} // namespace op
} // namespace mxnet