/************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*************************************************************/
#include "singa/utils/updater.h"
#include "mshadow/cxxnet_op.h"
#include "mshadow/tensor.h"
#include "singa/utils/singleton.h"
#include "singa/utils/factory.h"
namespace singa {
using mshadow::cpu;
using mshadow::expr::F;
using mshadow::op::sqrtop;
using mshadow::op::square;
using mshadow::Shape;
using mshadow::Shape1;
using mshadow::Tensor;
using mshadow::TensorContainer;
// Namespace alias for mshadow's operator functors referenced below
// (op::square, op::sqrtop, op::abs, op::max).
namespace op = mshadow::op;
LRGenerator* LRGenerator::Create(const LRGenProto& proto) {
  auto factory = Singleton<Factory<LRGenerator>>::Instance();
  LRGenerator* gen = nullptr;
  if (proto.has_user_type())
    gen = factory->Create(proto.user_type());
  else
    gen = factory->Create(proto.type());
  gen->Init(proto);
  return gen;
}
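// FixedStepLRGen: piecewise-constant schedule. fixedstep_conf provides the
// parallel lists step(i) and step_lr(i); Get() returns step_lr(i) for the
// largest i whose step(i) does not exceed the current step.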
float FixedStepLRGen::Get(int step) {
  if (last_idx_ < proto_.fixedstep_conf().step_size() - 1
      && step >= proto_.fixedstep_conf().step(last_idx_ + 1)) {
    last_idx_++;
  }
  return proto_.fixedstep_conf().step_lr(last_idx_);
}
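// StepLRGen: exponential decay applied once every change_freq steps,
//   lr = base_lr * gamma^(step / change_freq)   (integer division)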
float StepLRGen::Get(int step) {
  // integer division is intentional: the lr only changes every change_freq steps
  int freq = proto_.step_conf().change_freq();
  float lr = proto_.base_lr() * pow(proto_.step_conf().gamma(), step / freq);
  // LOG_IF(INFO, step % freq == 0) << "Update learning rate to " << lr
  //     << " @ step " << step;
  return lr;
}
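// LinearLRGen: linear interpolation from base_lr towards final_lr with ratio
// step / change_freq, so the lr reaches final_lr at step == change_freq and
// keeps changing linearly afterwards.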
float LinearLRGen::Get(int step) {
  int freq = proto_.linear_conf().change_freq();
  float r = step * 1.0 / freq;
  return (1.0 - r) * proto_.base_lr() + r * proto_.linear_conf().final_lr();
}
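// ExpLRGen: halve the base lr every change_freq steps,
//   lr = base_lr / 2^(step / change_freq)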
float ExpLRGen::Get(int step) {
  int freq = proto_.exponential_conf().change_freq();
  return proto_.base_lr() / pow(2, step * 1. / freq);
}
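// InvLRGen: inverse polynomial decay,
//   lr = base_lr * (1 + gamma * step)^(-pow)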
float InvLRGen::Get(int step) {
  return proto_.base_lr() * pow(1.f + proto_.inverse_conf().gamma() * step,
                                -proto_.inverse_conf().pow());
}
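// InvTLRGen: inverse-time decay,
//   lr = base_lr / (1 + step / inverset_conf.final_lr())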
float InvTLRGen::Get(int step) {
  return proto_.base_lr() /
      (1 + step * 1. / proto_.inverset_conf().final_lr());
}
Updater* Updater::Create(const UpdaterProto& proto) {
  auto factory = Singleton<Factory<Updater>>::Instance();
  Updater* updater = nullptr;
  if (proto.has_user_type())
    updater = factory->Create(proto.user_type());
  else
    updater = factory->Create(proto.type());
  updater->Init(proto);
  return updater;
}
/**************** added for Python Binding ***************************/
Updater* Updater::CreateUpdater(const string str) {
  UpdaterProto conf;
  conf.ParseFromString(str);
  return Updater::Create(conf);
}
/***********************Python Binding end**************************/
/***********************SGD with momentum******************************/
void Updater::Init(const UpdaterProto& proto) {
  momentum_ = proto.momentum();
  weight_decay_ = proto.weight_decay();
  lr_gen_ = LRGenerator::Create(proto.learning_rate());
  clip_low_ = proto.clip_low();
  clip_high_ = proto.clip_high();
}
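// Clamp every gradient entry of param into [low, high]; called by the
// updaters below only when clip_high_ > clip_low_.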
void Updater::Clip(const float low, const float high, Param* param) {
  Blob<float>* grad = param->mutable_grad();
  float* ptr = grad->mutable_cpu_data();
  for (int i = 0; i < grad->count(); i++) {
    if (ptr[i] > high)
      ptr[i] = high;
    else if (ptr[i] < low)
      ptr[i] = low;
  }
}
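// Vanilla SGD with optional momentum. With g the scaled gradient plus weight
// decay, the update implemented below is
//   history = momentum * history - lr * g;  data += history   (momentum_ > 0)
//   data -= lr * g                                             (otherwise)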
void SGDUpdater::Update(int step, Param* param, float grad_scale) {
  if (clip_high_ > clip_low_)
    Clip(clip_low_, clip_high_, param);
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  float lr = lr_gen_->Get(step) * param->lr_scale();
  float wd = weight_decay_ * param->wd_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  if (momentum_ > 0) {
    Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
    history = history * momentum_ - lr * grad;
    data += history;
  } else {
    grad *= -lr;
    data += grad;
  }
}
/***********************Nesterov******************************/
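// Nesterov accelerated gradient, written in the "look-ahead" form:
//   v_new = momentum * v_old + lr * g
//   data -= (1 + momentum) * v_new - momentum * v_old
// where v is the history tensor and g the scaled gradient plus weight decay.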
void NesterovUpdater::Update(int step, Param* param, float grad_scale) {
  if (clip_high_ > clip_low_)
    Clip(clip_low_, clip_high_, param);
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
  TensorContainer<cpu, 1> tmp(s);
  float lr = lr_gen_->Get(step) * param->lr_scale();
  float wd = weight_decay_ * param->wd_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  Copy(tmp, history);  // tmp keeps the previous velocity
  history = history * momentum_ + lr * grad;
  tmp = history * (1 + momentum_) - tmp * momentum_;
  data -= tmp;
}
/***********************AdaGrad******************************/
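// AdaGrad: accumulate squared gradients in history and scale each update by
// the inverse root of that accumulator,
//   history += g^2;  data -= lr * g / sqrtop(history, delta)
// where delta is a small smoothing constant from the proto.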
void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
  if (clip_high_ > clip_low_)
    Clip(clip_low_, clip_high_, param);
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
  float lr = lr_gen_->Get(step) * param->lr_scale();
  float wd = weight_decay_ * param->wd_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  history += F<square>(grad);
  data -= lr * grad / (F<sqrtop>(history, proto_.delta()));
}
/***********************RMSProp******************************/
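// RMSProp: exponential moving average of squared gradients,
//   history = rho * history + (1 - rho) * g^2
//   data   -= lr * g / sqrtop(history, delta)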
void RMSPropUpdater::Init(const UpdaterProto& proto) {
  Updater::Init(proto);
  rho_ = proto.rmsprop_conf().rho();
  delta_ = proto.delta();
}
void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
  if (clip_high_ > clip_low_)
    Clip(clip_low_, clip_high_, param);
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
  float lr = lr_gen_->Get(step) * param->lr_scale();
  float wd = weight_decay_ * param->wd_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  history = history * rho_ + (1 - rho_) * F<square>(grad);
  data -= lr * grad / F<sqrtop>(history, delta_);
}
/***********************AdaDelta******************************/
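// AdaDelta: keep running averages of squared gradients (history) and squared
// parameter updates (update); each step rescales the gradient by the ratio of
// their root-mean-squares,
//   history = rho * history + (1 - rho) * g^2
//   step_t  = g * sqrtop(update, delta) / sqrtop(history, delta)
//   update  = rho * update + (1 - rho) * step_t^2
//   data   -= lr * step_t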
void AdaDeltaUpdater::Init(const UpdaterProto& proto) {
  Updater::Init(proto);
  delta_ = proto.delta();
  rho_ = proto.adadelta_conf().rho();
}
void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale) {
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
  TensorContainer<cpu, 1> tmp(s);
  float wd = weight_decay_ * param->wd_scale();
  float lr = lr_gen_->Get(step) * param->lr_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  history = history * rho_ + (1 - rho_) * F<op::square>(grad);
  tmp = grad * F<op::sqrtop>(update, delta_) / F<op::sqrtop>(history, delta_);
  update = rho_ * update + (1 - rho_) * F<op::square>(tmp);
  data -= lr * tmp;
}
/***********************Adam******************************/
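// Adam: first moment (history) and second moment (update) estimates of the
// gradient,
//   history = beta1 * history + (1 - beta1) * g
//   update  = beta2 * update  + (1 - beta2) * g^2
//   data   -= lr * history / sqrtop(update, delta)
// Note: this implementation applies no bias correction to the moments.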
void AdamUpdater::Init(const UpdaterProto &proto) {
  Updater::Init(proto);
  beta1_ = proto.adam_conf().beta1();
  beta2_ = proto.adam_conf().beta2();
  delta_ = proto.delta();
}
void AdamUpdater::Update(int step, Param* param, float grad_scale) {
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
  float wd = weight_decay_ * param->wd_scale();
  float lr = lr_gen_->Get(step) * param->lr_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  history = history * beta1_ + (1 - beta1_) * grad;
  update = update * beta2_ + (1 - beta2_) * F<op::square>(grad);
  data -= lr * history / F<op::sqrtop>(update, delta_);
}
/***********************AdamMax******************************/
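// AdamMax (a.k.a. AdaMax): variant of Adam that tracks the infinity norm of
// the gradients instead of their second moment,
//   history = beta1 * history + (1 - beta1) * g
//   update  = max(beta2 * update, |g|) + delta
//   data   -= lr * history / update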
void AdamMaxUpdater::Init(const UpdaterProto &proto) {
  Updater::Init(proto);
  beta1_ = proto.adammax_conf().beta1();
  beta2_ = proto.adammax_conf().beta2();
  delta_ = proto.delta();
}
void AdamMaxUpdater::Update(int step, Param* param, float grad_scale) {
  Shape<1> s = Shape1(param->size());
  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
  float wd = weight_decay_ * param->wd_scale();
  float lr = lr_gen_->Get(step) * param->lr_scale();
  grad *= grad_scale;
  if (wd > 0)  // L2 regularization, applied after scaling by grad_scale
    grad += data * wd;
  history = history * beta1_ + (1 - beta1_) * grad;
  update = update * beta2_;
  grad = F<op::abs>(grad);
  update = F<op::max>(update, grad) + delta_;
  data -= lr * history / update;
}
} // namespace singa