/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "./softmax.h"
namespace singa {
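// Register this implementation under a generic key and per-device keys so
// the layer factory can construct a Softmax instance from its string
// identifier.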
RegisterLayerClass(singa_softmax, Softmax);
RegisterLayerClass(singacpp_softmax, Softmax);
RegisterLayerClass(singacuda_softmax, Softmax);
RegisterLayerClass(singacl_softmax, Softmax);
void Softmax::Setup(const Shape& in_sample, const LayerConf& conf) {
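  // Each input sample is a 1-D vector of logits; the output probability
  // vector has the same shape.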
  Layer::Setup(in_sample, conf);
  CHECK_EQ(in_sample.size(), 1u);
  out_sample_shape_ = in_sample;
}
const Tensor Softmax::Forward(int flag, const Tensor& input) {
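  // The input may be a single sample (1-D) or a batch (2-D, one sample per
  // row). During training the output is cached in buf_ for reuse in
  // Backward().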
  CHECK_LE(input.nDim(), 2u);
  Tensor output = SoftMax(input);
  if (flag & kTrain) buf_.push(output);
  return output;
}
const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
                                                          const Tensor& grad) {
  CHECK_LE(grad.nDim(), 2u);
  Tensor input_grad = grad.Clone();
  CHECK(!buf_.empty());
  Tensor y = buf_.top();
  buf_.pop();
  CHECK(y.shape() == input_grad.shape());
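  // Element-wise product grad_i * y_i; its per-row sum is the scalar
  // subtracted from grad below.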
  Tensor sigma = input_grad * y;
  size_t nrow = 1, ncol = grad.Size();
  if (grad.nDim() > 1) {
    nrow = grad.shape(0);
    ncol = grad.shape(1);
  } else {
    input_grad.Reshape({nrow, ncol});
    sigma.Reshape({nrow, ncol});
  }
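  // The 1-D case was reshaped above to a 1 x ncol matrix so the column-wise
  // reductions below apply uniformly; sum[r] = sum_j grad[r][j] * y[r][j].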
  Tensor sum(Shape{nrow}, grad.device(), grad.data_type());
  SumColumns(sigma, &sum);
  // dL / dy_i = grad_i
  // dy_j / dx_i = y_i - y_i^2, if i == j
  // dy_j / dx_i = -y_i * y_j,  if i != j
  // dL / dx_i = sum_j((dL / dy_j) * (dy_j / dx_i))
  //           = y_i * (grad_i - sum), where sum = sum_j(grad_j * y_j)
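  // Reference sketch of the same rule for one row, as naive loops over raw
  // float arrays (illustrative only, not the SINGA tensor API):
  //   float dot = 0;
  //   for (size_t j = 0; j < ncol; j++) dot += grad[j] * y[j];
  //   for (size_t i = 0; i < ncol; i++) dx[i] = y[i] * (grad[i] - dot);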
  SubColumn(sum, &input_grad);
  input_grad = input_grad * y;
  if (grad.nDim() == 1) input_grad.Reshape(Shape{ncol});
  // Mult(input_grad, y, &input_grad);
  vector<Tensor> param_grad;
  return std::make_pair(input_grad, param_grad);
}
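// Illustrative usage sketch (hypothetical shapes and variables; LayerConf
// construction varies by SINGA version):
//   Softmax softmax;
//   softmax.Setup(Shape{4}, conf);            // 4-class logits per sample
//   Tensor y = softmax.Forward(kTrain, x);    // x: batch_size x 4 logits
//   auto ret = softmax.Backward(kTrain, dy);  // ret.first: dL/dx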
} // namespace singa