//! Gradient Descent
//!
//! Implementation of gradient descent algorithms. The structs in this
//! module are consumed by models implementing the `Optimizable` trait.
//!
//! Alongside standard batch gradient descent (`GradientDesc`), this
//! module provides the stochastic variants `StochasticGD` (with
//! momentum), `AdaGrad` and `RMSProp`. There is flexibility to
//! introduce new algorithms and fit them into the same scheme easily.
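//!
//! # Examples
//!
//! A minimal sketch of driving an optimizer directly. The `Quadratic`
//! model below is hypothetical and exists only to illustrate the
//! `Optimizable` contract; real models implement `compute_grad` over
//! their actual inputs and targets.
//!
//! ```
//! use rusty_machine::learning::optim::{Optimizable, OptimAlgorithm};
//! use rusty_machine::learning::optim::grad_desc::GradientDesc;
//!
//! // Toy model with cost f(x) = x^2 and gradient 2x; inputs and
//! // targets are unused.
//! struct Quadratic;
//!
//! impl Optimizable for Quadratic {
//!     type Inputs = ();
//!     type Targets = ();
//!
//!     fn compute_grad(&self, params: &[f64], _: &(), _: &()) -> (f64, Vec<f64>) {
//!         (params[0] * params[0], vec![2f64 * params[0]])
//!     }
//! }
//!
//! let gd = GradientDesc::new(0.1, 100);
//! // Starting from x = 1.0 the iterates contract towards the minimum at 0.
//! let optimum = gd.optimize(&Quadratic, &[1.0], &(), &());
//! assert!(optimum[0].abs() < 1e-3);
//! ```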
use learning::optim::{Optimizable, OptimAlgorithm};
use linalg::Vector;
use linalg::{Matrix, BaseMatrix};
use rulinalg::utils;
use learning::toolkit::rand_utils;
/// Convergence threshold: optimization stops early once the change in
/// cost between iterations falls below this value.
const LEARNING_EPS: f64 = 1e-20;
/// Batch Gradient Descent algorithm
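///
/// Each iteration performs the full-batch update implemented in
/// `optimize` below:
///
/// ```text
/// params = params - alpha * grad
/// ```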
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct GradientDesc {
/// The step-size for the gradient descent steps.
alpha: f64,
/// The number of iterations to run.
iters: usize,
}
/// The default gradient descent algorithm.
///
/// The defaults are:
///
/// - alpha = 0.3
/// - iters = 100
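///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::GradientDesc;
///
/// let gd = GradientDesc::default();
/// ```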
impl Default for GradientDesc {
fn default() -> GradientDesc {
GradientDesc {
alpha: 0.3,
iters: 100,
}
}
}
impl GradientDesc {
/// Construct a gradient descent algorithm.
///
/// Requires the step size and iteration count
/// to be specified.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::GradientDesc;
///
/// let gd = GradientDesc::new(0.3, 10000);
/// ```
pub fn new(alpha: f64, iters: usize) -> GradientDesc {
assert!(alpha > 0f64,
"The step size (alpha) must be greater than 0.");
GradientDesc {
alpha: alpha,
iters: iters,
}
}
}
impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Create the initial optimal parameters
let mut optimizing_val = Vector::new(start.to_vec());
// The cost at the start of each iteration
let mut start_iter_cost = 0f64;
for _ in 0..self.iters {
// Compute the cost and gradient for the current parameters
let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets);
// Early stopping
if (start_iter_cost - cost).abs() < LEARNING_EPS {
break;
} else {
// Update the optimal parameters using gradient descent
optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha;
// Update the latest cost
start_iter_cost = cost;
}
}
optimizing_val.into_vec()
}
}
/// Stochastic Gradient Descent algorithm.
///
/// Uses basic momentum to control the learning rate.
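///
/// For each training example the update performed in `optimize` is:
///
/// ```text
/// delta_w = mu * grad + alpha * delta_w
/// params  = params - mu * delta_w
/// ```
///
/// Note that `mu` enters twice, which is why it acts as the square
/// root of the raw learning rate.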
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct StochasticGD {
/// Controls the momentum of the descent
alpha: f64,
/// The step size. It is applied both when accumulating the momentum
/// term and when updating the parameters, so it acts as the square
/// root of the raw learning rate.
mu: f64,
/// The number of passes through the data.
iters: usize,
}
/// The default Stochastic GD algorithm.
///
/// The defaults are:
///
/// - alpha = 0.1
/// - mu = 0.1
/// - iters = 20
impl Default for StochasticGD {
fn default() -> StochasticGD {
StochasticGD {
alpha: 0.1,
mu: 0.1,
iters: 20,
}
}
}
impl StochasticGD {
/// Construct a stochastic gradient descent algorithm.
///
/// Requires the momentum rate (alpha), the step size (mu) and the
/// iteration count to be specified.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::StochasticGD;
///
/// let sgd = StochasticGD::new(0.1, 0.3, 5);
/// ```
pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
assert!(mu > 0f64, "The step size (mu) must be greater than 0.");
StochasticGD {
alpha: alpha,
mu: mu,
iters: iters,
}
}
}
impl<M> OptimAlgorithm<M> for StochasticGD
where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
{
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Create the initial optimal parameters
let mut optimizing_val = Vector::new(start.to_vec());
// Create the momentum based gradient distance
let mut delta_w = Vector::zeros(start.len());
// Set up the indices for permutation
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
// The cost at the start of each iteration
let mut start_iter_cost = 0f64;
for _ in 0..self.iters {
// The cost at the end of each stochastic gd pass
let mut end_cost = 0f64;
// Permute the indices
rand_utils::in_place_fisher_yates(&mut permutation);
for i in &permutation {
// Compute the cost and gradient for this data pair
let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
&inputs.select_rows(&[*i]),
&targets.select_rows(&[*i]));
// Update the momentum-weighted gradient step
delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
// Update the parameters
optimizing_val = &optimizing_val - &delta_w * self.mu;
// Set the end cost (this is only used after the last iteration)
end_cost += cost;
}
end_cost /= inputs.rows() as f64;
// Early stopping
if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
break;
} else {
// Update the cost
start_iter_cost = end_cost;
}
}
optimizing_val.into_vec()
}
}
/// Adaptive Gradient Descent
///
/// The adaptive gradient descent algorithm (Duchi et al. 2010).
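///
/// For each training example the update performed in `optimize` is
/// (all operations elementwise):
///
/// ```text
/// ada_s  = ada_s + grad * grad
/// params = params - alpha * grad / (tau + sqrt(ada_s))
/// ```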
#[derive(Debug, Serialize, Deserialize)]
pub struct AdaGrad {
/// The base step size for parameter updates.
alpha: f64,
/// Smoothing constant added to the root of the accumulated squared gradients.
tau: f64,
/// The number of passes through the data.
iters: usize,
}
impl AdaGrad {
/// Constructs a new AdaGrad algorithm.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::AdaGrad;
///
/// // Create a new AdaGrad algorithm with step size 0.5
/// // and adaptive scaling constant 1.0
/// let gd = AdaGrad::new(0.5, 1.0, 100);
/// ```
pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
assert!(alpha > 0f64,
"The step size (alpha) must be greater than 0.");
assert!(tau >= 0f64,
"The adaptive constant (tau) cannot be negative.");
AdaGrad {
alpha: alpha,
tau: tau,
iters: iters,
}
}
}
impl Default for AdaGrad {
fn default() -> AdaGrad {
AdaGrad {
alpha: 1f64,
tau: 3f64,
iters: 100,
}
}
}
impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Initialize the adaptive scaling
let mut ada_s = Vector::zeros(start.len());
// Initialize the optimal parameters
let mut optimizing_val = Vector::new(start.to_vec());
// Set up the indices for permutation
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
// The cost at the start of each iteration
let mut start_iter_cost = 0f64;
for _ in 0..self.iters {
// The cost at the end of each stochastic gd pass
let mut end_cost = 0f64;
// Permute the indices
rand_utils::in_place_fisher_yates(&mut permutation);
for i in &permutation {
// Compute the cost and gradient for this data pair
let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
&inputs.select_rows(&[*i]),
&targets.select_rows(&[*i]));
// Update the adaptive scaling by adding the gradient squared
utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y);
// Compute the change in gradient
utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| {
*x = self.alpha * (*x / (self.tau + (y).sqrt()))
});
// Update the parameters
optimizing_val = &optimizing_val - Vector::new(vec_data);
// Set the end cost (this is only used after the last iteration)
end_cost += cost;
}
end_cost /= inputs.rows() as f64;
// Early stopping
if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
break;
} else {
// Update the cost
start_iter_cost = end_cost;
}
}
optimizing_val.into_vec()
}
}
/// RMSProp
///
/// The RMSProp algorithm (Hinton et al. 2012).
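///
/// For each training example the update performed in `optimize` is
/// (all operations elementwise):
///
/// ```text
/// cache  = decay_rate * cache + (1 - decay_rate) * grad * grad
/// params = params - learning_rate * grad / sqrt(cache + epsilon)
/// ```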
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct RMSProp {
/// The base step size of gradient descent steps
learning_rate: f64,
/// Rate at which running total of average square gradients decays
decay_rate: f64,
/// Small value used to avoid divide by zero
epsilon: f64,
/// The number of passes through the data
iters: usize,
}
/// The default RMSProp configuration
///
/// The defaults are:
///
/// - learning_rate = 0.01
/// - decay_rate = 0.9
/// - epsilon = 1.0e-5
/// - iters = 50
impl Default for RMSProp {
fn default() -> RMSProp {
RMSProp {
learning_rate: 0.01,
decay_rate: 0.9,
epsilon: 1.0e-5,
iters: 50
}
}
}
impl RMSProp {
/// Construct an RMSProp algorithm.
///
/// Requires learning rate, decay rate, epsilon, and iteration count.
///
/// # Examples
///
/// ```
/// use rusty_machine::learning::optim::grad_desc::RMSProp;
///
/// let rms = RMSProp::new(0.01, 0.99, 1e-5, 20);
/// ```
pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
assert!(0f64 < learning_rate, "The learning rate must be positive");
assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
assert!(0f64 < epsilon, "Epsilon must be positive");
RMSProp {
decay_rate: decay_rate,
learning_rate: learning_rate,
epsilon: epsilon,
iters: iters
}
}
}
impl<M> OptimAlgorithm<M> for RMSProp
where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
{
fn optimize(&self,
model: &M,
start: &[f64],
inputs: &M::Inputs,
targets: &M::Targets)
-> Vec<f64> {
// Initial parameters
let mut params = Vector::new(start.to_vec());
// Running average of squared gradients
let mut rmsprop_cache = Vector::zeros(start.len());
// Set up indices for permutation
let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
// The cost from the previous iteration
let mut prev_cost = 0f64;
for _ in 0..self.iters {
// The cost at end of each pass
let mut end_cost = 0f64;
// Permute the indices
rand_utils::in_place_fisher_yates(&mut permutation);
for i in &permutation {
let (cost, grad) = model.compute_grad(params.data(),
&inputs.select_rows(&[*i]),
&targets.select_rows(&[*i]));
let mut grad = Vector::new(grad);
let grad_squared = grad.clone().apply(&|x| x*x);
// Update cached average of squared gradients
rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
// RMSProp update rule
utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| {
*x = *x * self.learning_rate / (y + self.epsilon).sqrt();
});
params = &params - &grad;
end_cost += cost;
}
end_cost /= inputs.rows() as f64;
// Early stopping
if (prev_cost - end_cost).abs() < LEARNING_EPS {
break;
} else {
prev_cost = end_cost;
}
}
params.into_vec()
}
}
#[cfg(test)]
mod tests {
use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};
#[test]
#[should_panic]
fn gd_neg_stepsize() {
let _ = GradientDesc::new(-0.5, 0);
}
#[test]
#[should_panic]
fn stochastic_gd_neg_momentum() {
let _ = StochasticGD::new(-0.5, 1f64, 0);
}
#[test]
#[should_panic]
fn stochastic_gd_neg_stepsize() {
let _ = StochasticGD::new(0.5, -1f64, 0);
}
#[test]
#[should_panic]
fn adagrad_neg_stepsize() {
let _ = AdaGrad::new(-0.5, 1f64, 0);
}
#[test]
#[should_panic]
fn adagrad_neg_adaptive_scale() {
let _ = AdaGrad::new(0.5, -1f64, 0);
}
#[test]
#[should_panic]
fn rmsprop_neg_learning_rate() {
let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
}
#[test]
#[should_panic]
fn rmsprop_neg_epsilon() {
let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
}
#[test]
#[should_panic]
fn rmsprop_neg_decay_rate() {
let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
}
}