blob: 110551d68cd76a1f51ebcaac78b4a413b0a8b5cd [file] [log] [blame]
//! DBSCAN Clustering
//!
//! *Note: This module is likely to change dramatically in the future and
//! should be treated as experimental.*
//!
//! Provides an implementaton of DBSCAN clustering. The model
//! also implements a `predict` function which uses nearest neighbours
//! to classify the points. To utilize this function you must use
//! `self.set_predictive(true)` before training the model.
//!
//! The algorithm works by specifying `eps` and `min_points` parameters.
//! The `eps` parameter controls how close together points must be to be
//! placed in the same cluster. The `min_points` parameter controls how many
//! points must be within distance `eps` of eachother to be considered a cluster.
//!
//! If a point is not within distance `eps` of a cluster it will be classified
//! as noise. This means that it will be set to `None` in the clusters `Vector`.
//!
//! # Examples
//!
//! ```
//! use rusty_machine::learning::dbscan::DBSCAN;
//! use rusty_machine::learning::UnSupModel;
//! use rusty_machine::linalg::Matrix;
//!
//! let inputs = Matrix::new(6, 2, vec![1.0, 2.0,
//! 1.1, 2.2,
//! 0.9, 1.9,
//! 1.0, 2.1,
//! -2.0, 3.0,
//! -2.2, 3.1]);
//!
//! let mut model = DBSCAN::new(0.5, 2);
//! model.train(&inputs).unwrap();
//!
//! let clustering = model.clusters().unwrap();
//! ```
use std::vec::*;
use learning::{LearningResult, UnSupModel};
use learning::error::{Error, ErrorKind};
use linalg::{Matrix, Vector, BaseMatrix};
use rulinalg::utils;
use rulinalg::matrix::Row;
/// DBSCAN Model
///
/// Implements clustering using the DBSCAN algorithm
/// via the `UnSupModel` trait.
#[derive(Debug, Serialize, Deserialize)]
pub struct DBSCAN {
eps: f64,
min_points: usize,
clusters: Option<Vector<Option<usize>>>,
predictive: bool,
_visited: Vec<bool>,
_cluster_data: Option<Matrix<f64>>,
}
/// Constructs a non-predictive DBSCAN model with the
/// following parameters:
///
/// - `eps` : `0.5`
/// - `min_points` : `5`
impl Default for DBSCAN {
fn default() -> DBSCAN {
DBSCAN {
eps: 0.5,
min_points: 5,
clusters: None,
predictive: false,
_visited: Vec::new(),
_cluster_data: None,
}
}
}
impl UnSupModel<Matrix<f64>, Vector<Option<usize>>> for DBSCAN {
/// Train the classifier using input data.
fn train(&mut self, inputs: &Matrix<f64>) -> LearningResult<()> {
self.init_params(inputs.rows());
let mut cluster = 0;
for (idx, point) in inputs.row_iter().enumerate() {
let visited = self._visited[idx];
if !visited {
self._visited[idx] = true;
let neighbours = self.region_query(point, inputs);
if neighbours.len() >= self.min_points {
self.expand_cluster(inputs, idx, neighbours, cluster);
cluster += 1;
}
}
}
if self.predictive {
self._cluster_data = Some(inputs.clone());
}
Ok(())
}
fn predict(&self, inputs: &Matrix<f64>) -> LearningResult<Vector<Option<usize>>> {
if self.predictive {
if let (&Some(ref cluster_data), &Some(ref clusters)) = (&self._cluster_data,
&self.clusters) {
let mut classes = Vec::with_capacity(inputs.rows());
for input_point in inputs.row_iter() {
let mut distances = Vec::with_capacity(cluster_data.rows());
for cluster_point in cluster_data.row_iter() {
let point_distance =
utils::vec_bin_op(input_point.raw_slice(), cluster_point.raw_slice(), |x, y| x - y);
distances.push(utils::dot(&point_distance, &point_distance).sqrt());
}
let (closest_idx, closest_dist) = utils::argmin(&distances);
if closest_dist < self.eps {
classes.push(clusters[closest_idx]);
} else {
classes.push(None);
}
}
Ok(Vector::new(classes))
} else {
Err(Error::new_untrained())
}
} else {
Err(Error::new(ErrorKind::InvalidState,
"Model must be set to predictive. Use `self.set_predictive(true)`."))
}
}
}
impl DBSCAN {
/// Create a new DBSCAN model with a given
/// distance episilon and minimum points per cluster.
pub fn new(eps: f64, min_points: usize) -> DBSCAN {
assert!(eps > 0f64, "The model epsilon must be positive.");
DBSCAN {
eps: eps,
min_points: min_points,
clusters: None,
predictive: false,
_visited: Vec::new(),
_cluster_data: None,
}
}
/// Set predictive to true if the model is to be used
/// to classify future points.
///
/// If the model is set as predictive then the input data
/// will be cloned during training.
pub fn set_predictive(&mut self, predictive: bool) {
self.predictive = predictive;
}
/// Return an Option pointing to the model clusters.
pub fn clusters(&self) -> Option<&Vector<Option<usize>>> {
self.clusters.as_ref()
}
fn expand_cluster(&mut self,
inputs: &Matrix<f64>,
point_idx: usize,
neighbour_pts: Vec<usize>,
cluster: usize) {
debug_assert!(point_idx < inputs.rows(),
"Point index too large for inputs");
debug_assert!(neighbour_pts.iter().all(|x| *x < inputs.rows()),
"Neighbour indices too large for inputs");
self.clusters.as_mut().map(|x| x.mut_data()[point_idx] = Some(cluster));
for data_point_idx in &neighbour_pts {
let visited = self._visited[*data_point_idx];
if !visited {
self._visited[*data_point_idx] = true;
let data_point_row = unsafe { inputs.row_unchecked(*data_point_idx) };
let sub_neighbours = self.region_query(data_point_row, inputs);
if sub_neighbours.len() >= self.min_points {
self.expand_cluster(inputs, *data_point_idx, sub_neighbours, cluster);
}
}
}
}
fn region_query(&self, point: Row<f64>, inputs: &Matrix<f64>) -> Vec<usize> {
debug_assert!(point.cols() == inputs.cols(),
"point must be of same dimension as inputs");
let mut in_neighbourhood = Vec::new();
for (idx, data_point) in inputs.row_iter().enumerate() {
//TODO: Use `MatrixMetric` when rulinalg#154 is fixed.
let point_distance = utils::vec_bin_op(data_point.raw_slice(), point.raw_slice(), |x, y| x - y);
let dist = utils::dot(&point_distance, &point_distance).sqrt();
if dist < self.eps {
in_neighbourhood.push(idx);
}
}
in_neighbourhood
}
fn init_params(&mut self, total_points: usize) {
unsafe {
self._visited.reserve(total_points);
self._visited.set_len(total_points);
}
for i in 0..total_points {
self._visited[i] = false;
}
self.clusters = Some(Vector::new(vec![None; total_points]));
}
}
#[cfg(test)]
mod tests {
use super::DBSCAN;
use linalg::{Matrix, BaseMatrix};
#[test]
fn test_region_query() {
let model = DBSCAN::new(1.0, 3);
let inputs = Matrix::new(3, 2, vec![1.0, 1.0, 1.1, 1.9, 3.0, 3.0]);
let m = matrix![1.0, 1.0];
let row = m.row(0);
let neighbours = model.region_query(row, &inputs);
assert!(neighbours.len() == 2);
}
#[test]
fn test_region_query_small_eps() {
let model = DBSCAN::new(0.01, 3);
let inputs = Matrix::new(3, 2, vec![1.0, 1.0, 1.1, 1.9, 1.1, 1.1]);
let m = matrix![1.0, 1.0];
let row = m.row(0);
let neighbours = model.region_query(row, &inputs);
assert!(neighbours.len() == 1);
}
}