blob: 69c6887e6723cb77e47e35da354dee30593ef99c [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Implements the DBSCAN clustering algorithm using Euclidian distance matrix
#
# INPUT:
# -------------------------------------------------
# X The input Matrix to do DBSCAN on.
# eps Maximum distance between two points for one to be considered reachable for the other.
# minPts Number of points in a neighborhood for a point to be considered as a core point
# (includes the point itself).
# -------------------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------------------------
# clusterMembers clustering Matrix
# ------------------------------------------------------------------------------------------------
m_dbscan = function (Matrix[Double] X, Double eps = 0.5, Integer minPts = 5)
return (Matrix[Double] X, Matrix[Double] clusterModel, Double eps)
{
#check input parameter assertions
if(minPts < 0) { stop("DBSCAN: Stopping due to invalid inputs: minPts should be greater than 0"); }
if(eps < 0)
{
print("DBSCAN: Epsilon (eps) should be greater than 0. Setting eps = 0.5");
eps = 0.5
}
UNASSIGNED = 0;
num_records = nrow(X);
num_features = ncol(X);
neighbors = dist(X);
#find same pts and set their distance to the smallest double representation
neighbors = replace(target = neighbors, pattern = 0, replacement = 2.225e-307)
neighbors = neighbors - diag(diag(neighbors));
# neighbors within eps
withinEps = ((neighbors <= eps) * (0 < neighbors));
corePts = rowSums(withinEps) + 1 >= minPts;
clusterMembers = matrix(UNASSIGNED, num_records, 1);
clusterModel = matrix(UNASSIGNED, 0, num_features);
if (sum(corePts) != 0) {
# leave only density reachable pts
neighbors = (neighbors * corePts * withinEps) > 0;
# border pts of multiple clusters
border = neighbors * (t(corePts) == 0 & colSums(neighbors) > 1) * seq(num_records, 1);
border = (border - colMaxs(border)) == 0;
neighbors = neighbors * border;
adjacency = (neighbors + t(neighbors)) > 0;
clusterMembers = components(G=adjacency, verbose=FALSE);
# noise to 0
clusterMembers = clusterMembers * (rowSums(adjacency) > 0);
clusterModel = removeEmpty(target=X, margin="rows", select = (clusterMembers > 0))
X = clusterMembers
}
}