third_party/rusty-machine/src/learning/optim/grad_desc.rs - incubator-teaclave-sgx-sdk - Git at Google

 //! Gradient Descent
 //!
 //! Implementation of gradient descent algorithm. Module contains
 //! the struct `GradientDesc` which is instantiated within models
 //! implementing the Optimizable trait.
 //!
 //! Currently standard batch gradient descent is the only implemented
 //! optimization algorithm but there is flexibility to introduce new
 //! algorithms and git them into the same scheme easily.
 use std::vec::*;
 use learning::optim::{Optimizable, OptimAlgorithm};
 use linalg::Vector;
 use linalg::{Matrix, BaseMatrix};
 use rulinalg::utils;

 use learning::toolkit::rand_utils;

 const LEARNING_EPS: f64 = 1e-20;

 /// Batch Gradient Descent algorithm
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
 pub struct GradientDesc {
     /// The step-size for the gradient descent steps.
     alpha: f64,
     /// The number of iterations to run.
     iters: usize,
 }

 /// The default gradient descent algorithm.
 ///
 /// The defaults are:
 ///
 /// - alpha = 0.3
 /// - iters = 100
 impl Default for GradientDesc {
     fn default() -> GradientDesc {
         GradientDesc {
             alpha: 0.3,
             iters: 100,
         }
     }
 }

 impl GradientDesc {
     /// Construct a gradient descent algorithm.
     ///
     /// Requires the step size and iteration count
     /// to be specified.
     ///
     /// # Examples
     ///
     /// ```
     /// use rusty_machine::learning::optim::grad_desc::GradientDesc;
     ///
     /// let gd = GradientDesc::new(0.3, 10000);
     /// ```
     pub fn new(alpha: f64, iters: usize) -> GradientDesc {
         assert!(alpha > 0f64,
                 "The step size (alpha) must be greater than 0.");

         GradientDesc {
             alpha: alpha,
             iters: iters,
         }
     }
 }

 impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
     fn optimize(&self,
                 model: &M,
                 start: &[f64],
                 inputs: &M::Inputs,
                 targets: &M::Targets)
                 -> Vec<f64> {

         // Create the initial optimal parameters
         let mut optimizing_val = Vector::new(start.to_vec());
         // The cost at the start of each iteration
         let mut start_iter_cost = 0f64;

         for _ in 0..self.iters {
             // Compute the cost and gradient for the current parameters
             let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets);

             // Early stopping
             if (start_iter_cost - cost).abs() < LEARNING_EPS {
                 break;
             } else {
                 // Update the optimal parameters using gradient descent
                 optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha;
                 // Update the latest cost
                 start_iter_cost = cost;
             }
         }
         optimizing_val.into_vec()
     }
 }

 /// Stochastic Gradient Descent algorithm.
 ///
 /// Uses basic momentum to control the learning rate.
 #[derive(Clone, Copy, Debug, Serialize, Deserialize)]
 pub struct StochasticGD {
     /// Controls the momentum of the descent
     alpha: f64,
     /// The square root of the raw learning rate.
     mu: f64,
     /// The number of passes through the data.
     iters: usize,
 }

 /// The default Stochastic GD algorithm.
 ///
 /// The defaults are:
 ///
 /// - alpha = 0.1
 /// - mu = 0.1
 /// - iters = 20
 impl Default for StochasticGD {
     fn default() -> StochasticGD {
         StochasticGD {
             alpha: 0.1,
             mu: 0.1,
             iters: 20,
         }
     }
 }

 impl StochasticGD {
     /// Construct a stochastic gradient descent algorithm.
     ///
     /// Requires the learning rate, momentum rate and iteration count
     /// to be specified.
     ///
     /// # Examples
     ///
     /// ```
     /// use rusty_machine::learning::optim::grad_desc::StochasticGD;
     ///
     /// let sgd = StochasticGD::new(0.1, 0.3, 5);
     /// ```
     pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
         assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
         assert!(mu > 0f64, "The step size (mu) must be greater than 0.");

         StochasticGD {
             alpha: alpha,
             mu: mu,
             iters: iters,
         }
     }
 }

 impl<M> OptimAlgorithm<M> for StochasticGD
     where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
 {
     fn optimize(&self,
                 model: &M,
                 start: &[f64],
                 inputs: &M::Inputs,
                 targets: &M::Targets)
                 -> Vec<f64> {

         // Create the initial optimal parameters
         let mut optimizing_val = Vector::new(start.to_vec());
         // Create the momentum based gradient distance
         let mut delta_w = Vector::zeros(start.len());

         // Set up the indices for permutation
         let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
         // The cost at the start of each iteration
         let mut start_iter_cost = 0f64;

         for _ in 0..self.iters {
             // The cost at the end of each stochastic gd pass
             let mut end_cost = 0f64;
             // Permute the indices
             rand_utils::in_place_fisher_yates(&mut permutation);
             for i in &permutation {
                 // Compute the cost and gradient for this data pair
                 let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
                                                           &inputs.select_rows(&[*i]),
                                                           &targets.select_rows(&[*i]));

                 // Compute the difference in gradient using momentum
                 delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
                 // Update the parameters
                 optimizing_val = &optimizing_val - &delta_w * self.mu;
                 // Set the end cost (this is only used after the last iteration)
                 end_cost += cost;
             }

             end_cost /= inputs.rows() as f64;

             // Early stopping
             if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
                 break;
             } else {
                 // Update the cost
                 start_iter_cost = end_cost;
             }
         }
         optimizing_val.into_vec()
     }
 }

 /// Adaptive Gradient Descent
 ///
 /// The adaptive gradient descent algorithm (Duchi et al. 2010).
 #[derive(Debug, Serialize, Deserialize)]
 pub struct AdaGrad {
     alpha: f64,
     tau: f64,
     iters: usize,
 }

 impl AdaGrad {
     /// Constructs a new AdaGrad algorithm.
     ///
     /// # Examples
     ///
     /// ```
     /// use rusty_machine::learning::optim::grad_desc::AdaGrad;
     ///
     /// // Create a new AdaGrad algorithm with step size 0.5
     /// // and adaptive scaling constant 1.0
     /// let gd = AdaGrad::new(0.5, 1.0, 100);
     /// ```
     pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
         assert!(alpha > 0f64,
                 "The step size (alpha) must be greater than 0.");
         assert!(tau >= 0f64,
                 "The adaptive constant (tau) cannot be negative.");
         AdaGrad {
             alpha: alpha,
             tau: tau,
             iters: iters,
         }
     }
 }

 impl Default for AdaGrad {
     fn default() -> AdaGrad {
         AdaGrad {
             alpha: 1f64,
             tau: 3f64,
             iters: 100,
         }
     }
 }

 impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
     fn optimize(&self,
                 model: &M,
                 start: &[f64],
                 inputs: &M::Inputs,
                 targets: &M::Targets)
                 -> Vec<f64> {

         // Initialize the adaptive scaling
         let mut ada_s = Vector::zeros(start.len());
         // Initialize the optimal parameters
         let mut optimizing_val = Vector::new(start.to_vec());

         // Set up the indices for permutation
         let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
         // The cost at the start of each iteration
         let mut start_iter_cost = 0f64;

         for _ in 0..self.iters {
             // The cost at the end of each stochastic gd pass
             let mut end_cost = 0f64;
             // Permute the indices
             rand_utils::in_place_fisher_yates(&mut permutation);
             for i in &permutation {
                 // Compute the cost and gradient for this data pair
                 let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
                                                               &inputs.select_rows(&[*i]),
                                                               &targets.select_rows(&[*i]));
                 // Update the adaptive scaling by adding the gradient squared
                 utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, |x, &y| *x += y * y);

                 // Compute the change in gradient
                 utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), |x, &y| {
                     *x = self.alpha * (*x / (self.tau + (y).sqrt()))
                 });
                 // Update the parameters
                 optimizing_val = &optimizing_val - Vector::new(vec_data);
                 // Set the end cost (this is only used after the last iteration)
                 end_cost += cost;
             }
             end_cost /= inputs.rows() as f64;

             // Early stopping
             if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
                 break;
             } else {
                 // Update the cost
                 start_iter_cost = end_cost;
             }
         }
         optimizing_val.into_vec()
     }
 }

 /// RMSProp
 ///
 /// The RMSProp algorithm (Hinton et al. 2012).
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct RMSProp {
     /// The base step size of gradient descent steps
     learning_rate: f64,
     /// Rate at which running total of average square gradients decays
     decay_rate: f64,
     /// Small value used to avoid divide by zero
     epsilon: f64,
     /// The number of passes through the data
     iters: usize,
 }

 /// The default RMSProp configuration
 ///
 /// The defaults are:
 ///
 /// - learning_rate = 0.01
 /// - decay_rate = 0.9
 /// - epsilon = 1.0e-5
 /// - iters = 50
 impl Default for RMSProp {
     fn default() -> RMSProp {
         RMSProp {
             learning_rate: 0.01,
             decay_rate: 0.9,
             epsilon: 1.0e-5,
             iters: 50
         }
     }
 }

 impl RMSProp {
     /// Construct an RMSProp algorithm.
     ///
     /// Requires learning rate, decay rate, epsilon, and iteration count.
     ///
     /// #Examples
     ///
     /// ```
     /// use rusty_machine::learning::optim::grad_desc::RMSProp;
     ///
     /// let rms = RMSProp::new(0.99, 0.01, 1e-5, 20);
     /// ```
     pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
         assert!(0f64 < learning_rate, "The learning rate must be positive");
         assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
         assert!(0f64 < epsilon, "Epsilon must be positive");

         RMSProp {
             decay_rate: decay_rate,
             learning_rate: learning_rate,
             epsilon: epsilon,
             iters: iters
         }
     }
 }

 impl<M> OptimAlgorithm<M> for RMSProp
     where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
     fn optimize(&self,
                 model: &M,
                 start: &[f64],
                 inputs: &M::Inputs,
                 targets: &M::Targets)
                 -> Vec<f64> {
         // Initial parameters
         let mut params = Vector::new(start.to_vec());
         // Running average of squared gradients
         let mut rmsprop_cache = Vector::zeros(start.len());

         // Set up indices for permutation
         let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
         // The cost from the previous iteration
         let mut prev_cost = 0f64;

         for _ in 0..self.iters {
             // The cost at end of each pass
             let mut end_cost = 0f64;
             // Permute the vertices
             rand_utils::in_place_fisher_yates(&mut permutation);
             for i in &permutation {
                 let (cost, grad) = model.compute_grad(params.data(),
                                                       &inputs.select_rows(&[*i]),
                                                       &targets.select_rows(&[*i]));

                 let mut grad = Vector::new(grad);
                 let grad_squared = grad.clone().apply(&|x| x*x);
                 // Update cached average of squared gradients
                 rmsprop_cache = &rmsprop_cache*self.decay_rate + &grad_squared*(1.0 - self.decay_rate);
                 // RMSProp update rule
                 utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), |x, &y| {
                     *x = *x * self.learning_rate / (y + self.epsilon).sqrt();
                 });
                 params = &params - &grad;

                 end_cost += cost;
             }
             end_cost /= inputs.rows() as f64;

             // Early stopping
             if (prev_cost - end_cost).abs() < LEARNING_EPS {
                 break;
             } else {
                 prev_cost = end_cost;
             }
         }
         params.into_vec()
     }
 }

 #[cfg(test)]
 mod tests {

     use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};

     #[test]
     #[should_panic]
     fn gd_neg_stepsize() {
         let _ = GradientDesc::new(-0.5, 0);
     }

     #[test]
     #[should_panic]
     fn stochastic_gd_neg_momentum() {
         let _ = StochasticGD::new(-0.5, 1f64, 0);
     }

     #[test]
     #[should_panic]
     fn stochastic_gd_neg_stepsize() {
         let _ = StochasticGD::new(0.5, -1f64, 0);
     }

     #[test]
     #[should_panic]
     fn adagrad_neg_stepsize() {
         let _ = AdaGrad::new(-0.5, 1f64, 0);
     }

     #[test]
     #[should_panic]
     fn adagrad_neg_adaptive_scale() {
         let _ = AdaGrad::new(0.5, -1f64, 0);
     }

     #[test]
     #[should_panic]
     fn rmsprop_neg_decay_rate() {
         let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
     }

     #[test]
     #[should_panic]
     fn rmsprop_neg_epsilon() {
         let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
     }

     #[test]
     #[should_panic]
     fn rmsprop_neg_learning_rate() {
         let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
     }
 }
	//! Gradient Descent
	//!
	//! Implementation of gradient descent algorithm. Module contains
	//! the struct `GradientDesc` which is instantiated within models
	//! implementing the Optimizable trait.
	//!
	//! Currently standard batch gradient descent is the only implemented
	//! optimization algorithm but there is flexibility to introduce new
	//! algorithms and git them into the same scheme easily.
	use std::vec::*;
	use learning::optim::{Optimizable, OptimAlgorithm};
	use linalg::Vector;
	use linalg::{Matrix, BaseMatrix};
	use rulinalg::utils;

	use learning::toolkit::rand_utils;

	const LEARNING_EPS: f64 = 1e-20;

	/// Batch Gradient Descent algorithm
	#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
	pub struct GradientDesc {
	/// The step-size for the gradient descent steps.
	alpha: f64,
	/// The number of iterations to run.
	iters: usize,
	}

	/// The default gradient descent algorithm.
	///
	/// The defaults are:
	///
	/// - alpha = 0.3
	/// - iters = 100
	impl Default for GradientDesc {
	fn default() -> GradientDesc {
	GradientDesc {
	alpha: 0.3,
	iters: 100,
	}
	}
	}

	impl GradientDesc {
	/// Construct a gradient descent algorithm.
	///
	/// Requires the step size and iteration count
	/// to be specified.
	///
	/// # Examples
	///
	/// ```
	/// use rusty_machine::learning::optim::grad_desc::GradientDesc;
	///
	/// let gd = GradientDesc::new(0.3, 10000);
	/// ```
	pub fn new(alpha: f64, iters: usize) -> GradientDesc {
	assert!(alpha > 0f64,
	"The step size (alpha) must be greater than 0.");

	GradientDesc {
	alpha: alpha,
	iters: iters,
	}
	}
	}

	impl<M: Optimizable> OptimAlgorithm<M> for GradientDesc {
	fn optimize(&self,
	model: &M,
	start: &[f64],
	inputs: &M::Inputs,
	targets: &M::Targets)
	-> Vec<f64> {

	// Create the initial optimal parameters
	let mut optimizing_val = Vector::new(start.to_vec());
	// The cost at the start of each iteration
	let mut start_iter_cost = 0f64;

	for _ in 0..self.iters {
	// Compute the cost and gradient for the current parameters
	let (cost, grad) = model.compute_grad(optimizing_val.data(), inputs, targets);

	// Early stopping
	if (start_iter_cost - cost).abs() < LEARNING_EPS {
	break;
	} else {
	// Update the optimal parameters using gradient descent
	optimizing_val = &optimizing_val - Vector::new(grad) * self.alpha;
	// Update the latest cost
	start_iter_cost = cost;
	}
	}
	optimizing_val.into_vec()
	}
	}

	/// Stochastic Gradient Descent algorithm.
	///
	/// Uses basic momentum to control the learning rate.
	#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
	pub struct StochasticGD {
	/// Controls the momentum of the descent
	alpha: f64,
	/// The square root of the raw learning rate.
	mu: f64,
	/// The number of passes through the data.
	iters: usize,
	}

	/// The default Stochastic GD algorithm.
	///
	/// The defaults are:
	///
	/// - alpha = 0.1
	/// - mu = 0.1
	/// - iters = 20
	impl Default for StochasticGD {
	fn default() -> StochasticGD {
	StochasticGD {
	alpha: 0.1,
	mu: 0.1,
	iters: 20,
	}
	}
	}

	impl StochasticGD {
	/// Construct a stochastic gradient descent algorithm.
	///
	/// Requires the learning rate, momentum rate and iteration count
	/// to be specified.
	///
	/// # Examples
	///
	/// ```
	/// use rusty_machine::learning::optim::grad_desc::StochasticGD;
	///
	/// let sgd = StochasticGD::new(0.1, 0.3, 5);
	/// ```
	pub fn new(alpha: f64, mu: f64, iters: usize) -> StochasticGD {
	assert!(alpha > 0f64, "The momentum (alpha) must be greater than 0.");
	assert!(mu > 0f64, "The step size (mu) must be greater than 0.");

	StochasticGD {
	alpha: alpha,
	mu: mu,
	iters: iters,
	}
	}
	}

	impl<M> OptimAlgorithm<M> for StochasticGD
	where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>
	{
	fn optimize(&self,
	model: &M,
	start: &[f64],
	inputs: &M::Inputs,
	targets: &M::Targets)
	-> Vec<f64> {

	// Create the initial optimal parameters
	let mut optimizing_val = Vector::new(start.to_vec());
	// Create the momentum based gradient distance
	let mut delta_w = Vector::zeros(start.len());

	// Set up the indices for permutation
	let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
	// The cost at the start of each iteration
	let mut start_iter_cost = 0f64;

	for _ in 0..self.iters {
	// The cost at the end of each stochastic gd pass
	let mut end_cost = 0f64;
	// Permute the indices
	rand_utils::in_place_fisher_yates(&mut permutation);
	for i in &permutation {
	// Compute the cost and gradient for this data pair
	let (cost, vec_data) = model.compute_grad(optimizing_val.data(),
	&inputs.select_rows(&[*i]),
	&targets.select_rows(&[*i]));

	// Compute the difference in gradient using momentum
	delta_w = Vector::new(vec_data) * self.mu + &delta_w * self.alpha;
	// Update the parameters
	optimizing_val = &optimizing_val - &delta_w * self.mu;
	// Set the end cost (this is only used after the last iteration)
	end_cost += cost;
	}

	end_cost /= inputs.rows() as f64;

	// Early stopping
	if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
	break;
	} else {
	// Update the cost
	start_iter_cost = end_cost;
	}
	}
	optimizing_val.into_vec()
	}
	}

	/// Adaptive Gradient Descent
	///
	/// The adaptive gradient descent algorithm (Duchi et al. 2010).
	#[derive(Debug, Serialize, Deserialize)]
	pub struct AdaGrad {
	alpha: f64,
	tau: f64,
	iters: usize,
	}

	impl AdaGrad {
	/// Constructs a new AdaGrad algorithm.
	///
	/// # Examples
	///
	/// ```
	/// use rusty_machine::learning::optim::grad_desc::AdaGrad;
	///
	/// // Create a new AdaGrad algorithm with step size 0.5
	/// // and adaptive scaling constant 1.0
	/// let gd = AdaGrad::new(0.5, 1.0, 100);
	/// ```
	pub fn new(alpha: f64, tau: f64, iters: usize) -> AdaGrad {
	assert!(alpha > 0f64,
	"The step size (alpha) must be greater than 0.");
	assert!(tau >= 0f64,
	"The adaptive constant (tau) cannot be negative.");
	AdaGrad {
	alpha: alpha,
	tau: tau,
	iters: iters,
	}
	}
	}

	impl Default for AdaGrad {
	fn default() -> AdaGrad {
	AdaGrad {
	alpha: 1f64,
	tau: 3f64,
	iters: 100,
	}
	}
	}

	impl<M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>>> OptimAlgorithm<M> for AdaGrad {
	fn optimize(&self,
	model: &M,
	start: &[f64],
	inputs: &M::Inputs,
	targets: &M::Targets)
	-> Vec<f64> {

	// Initialize the adaptive scaling
	let mut ada_s = Vector::zeros(start.len());
	// Initialize the optimal parameters
	let mut optimizing_val = Vector::new(start.to_vec());

	// Set up the indices for permutation
	let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
	// The cost at the start of each iteration
	let mut start_iter_cost = 0f64;

	for _ in 0..self.iters {
	// The cost at the end of each stochastic gd pass
	let mut end_cost = 0f64;
	// Permute the indices
	rand_utils::in_place_fisher_yates(&mut permutation);
	for i in &permutation {
	// Compute the cost and gradient for this data pair
	let (cost, mut vec_data) = model.compute_grad(optimizing_val.data(),
	&inputs.select_rows(&[*i]),
	&targets.select_rows(&[*i]));
	// Update the adaptive scaling by adding the gradient squared
	utils::in_place_vec_bin_op(ada_s.mut_data(), &vec_data, \|x, &y\| x += y y);

	// Compute the change in gradient
	utils::in_place_vec_bin_op(&mut vec_data, ada_s.data(), \|x, &y\| {
	x = self.alpha (*x / (self.tau + (y).sqrt()))
	});
	// Update the parameters
	optimizing_val = &optimizing_val - Vector::new(vec_data);
	// Set the end cost (this is only used after the last iteration)
	end_cost += cost;
	}
	end_cost /= inputs.rows() as f64;

	// Early stopping
	if (start_iter_cost - end_cost).abs() < LEARNING_EPS {
	break;
	} else {
	// Update the cost
	start_iter_cost = end_cost;
	}
	}
	optimizing_val.into_vec()
	}
	}

	/// RMSProp
	///
	/// The RMSProp algorithm (Hinton et al. 2012).
	#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
	pub struct RMSProp {
	/// The base step size of gradient descent steps
	learning_rate: f64,
	/// Rate at which running total of average square gradients decays
	decay_rate: f64,
	/// Small value used to avoid divide by zero
	epsilon: f64,
	/// The number of passes through the data
	iters: usize,
	}

	/// The default RMSProp configuration
	///
	/// The defaults are:
	///
	/// - learning_rate = 0.01
	/// - decay_rate = 0.9
	/// - epsilon = 1.0e-5
	/// - iters = 50
	impl Default for RMSProp {
	fn default() -> RMSProp {
	RMSProp {
	learning_rate: 0.01,
	decay_rate: 0.9,
	epsilon: 1.0e-5,
	iters: 50
	}
	}
	}

	impl RMSProp {
	/// Construct an RMSProp algorithm.
	///
	/// Requires learning rate, decay rate, epsilon, and iteration count.
	///
	/// #Examples
	///
	/// ```
	/// use rusty_machine::learning::optim::grad_desc::RMSProp;
	///
	/// let rms = RMSProp::new(0.99, 0.01, 1e-5, 20);
	/// ```
	pub fn new(learning_rate: f64, decay_rate: f64, epsilon: f64, iters: usize) -> RMSProp {
	assert!(0f64 < learning_rate, "The learning rate must be positive");
	assert!(0f64 < decay_rate && decay_rate < 1f64, "The decay rate must be between 0 and 1");
	assert!(0f64 < epsilon, "Epsilon must be positive");

	RMSProp {
	decay_rate: decay_rate,
	learning_rate: learning_rate,
	epsilon: epsilon,
	iters: iters
	}
	}
	}

	impl<M> OptimAlgorithm<M> for RMSProp
	where M: Optimizable<Inputs = Matrix<f64>, Targets = Matrix<f64>> {
	fn optimize(&self,
	model: &M,
	start: &[f64],
	inputs: &M::Inputs,
	targets: &M::Targets)
	-> Vec<f64> {
	// Initial parameters
	let mut params = Vector::new(start.to_vec());
	// Running average of squared gradients
	let mut rmsprop_cache = Vector::zeros(start.len());

	// Set up indices for permutation
	let mut permutation = (0..inputs.rows()).collect::<Vec<_>>();
	// The cost from the previous iteration
	let mut prev_cost = 0f64;

	for _ in 0..self.iters {
	// The cost at end of each pass
	let mut end_cost = 0f64;
	// Permute the vertices
	rand_utils::in_place_fisher_yates(&mut permutation);
	for i in &permutation {
	let (cost, grad) = model.compute_grad(params.data(),
	&inputs.select_rows(&[*i]),
	&targets.select_rows(&[*i]));

	let mut grad = Vector::new(grad);
	let grad_squared = grad.clone().apply(&\|x\| x*x);
	// Update cached average of squared gradients
	rmsprop_cache = &rmsprop_cacheself.decay_rate + &grad_squared(1.0 - self.decay_rate);
	// RMSProp update rule
	utils::in_place_vec_bin_op(grad.mut_data(), rmsprop_cache.data(), \|x, &y\| {
	x = x * self.learning_rate / (y + self.epsilon).sqrt();
	});
	params = &params - &grad;

	end_cost += cost;
	}
	end_cost /= inputs.rows() as f64;

	// Early stopping
	if (prev_cost - end_cost).abs() < LEARNING_EPS {
	break;
	} else {
	prev_cost = end_cost;
	}
	}
	params.into_vec()
	}
	}

	#[cfg(test)]
	mod tests {

	use super::{GradientDesc, StochasticGD, AdaGrad, RMSProp};

	#[test]
	#[should_panic]
	fn gd_neg_stepsize() {
	let _ = GradientDesc::new(-0.5, 0);
	}

	#[test]
	#[should_panic]
	fn stochastic_gd_neg_momentum() {
	let _ = StochasticGD::new(-0.5, 1f64, 0);
	}

	#[test]
	#[should_panic]
	fn stochastic_gd_neg_stepsize() {
	let _ = StochasticGD::new(0.5, -1f64, 0);
	}

	#[test]
	#[should_panic]
	fn adagrad_neg_stepsize() {
	let _ = AdaGrad::new(-0.5, 1f64, 0);
	}

	#[test]
	#[should_panic]
	fn adagrad_neg_adaptive_scale() {
	let _ = AdaGrad::new(0.5, -1f64, 0);
	}

	#[test]
	#[should_panic]
	fn rmsprop_neg_decay_rate() {
	let _ = RMSProp::new(-0.5, 0.005, 1.0e-5, 0);
	}

	#[test]
	#[should_panic]
	fn rmsprop_neg_epsilon() {
	let _ = RMSProp::new(0.5, 0.005, -1.0e-5, 0);
	}

	#[test]
	#[should_panic]
	fn rmsprop_neg_learning_rate() {
	let _ = RMSProp::new(0.5, -0.005, 1.0e-5, 0);
	}
	}