blob: 74d40405563d5dc60b634362f672b51e82e523a0 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# Implements the DBSCAN clustering algorithm using Euclidian distance matrix
#
# INPUT PARAMETERS:
# ----------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# X Matrix[Double] --- The input Matrix to do DBSCAN on.
# eps Double 0.5 Maximum distance between two points for one to be considered reachable for the other.
# minPts Int 5 Number of points in a neighborhood for a point to be considered as a core point (includes the point itself).
#
m_dbscan = function (Matrix[double] X, Double eps = 0.5, Integer minPts = 5)
return (Matrix[double] clusterMembers)
{
#check input parameter assertions
if(minPts < 0) { stop("DBSCAN: Stopping due to invalid inputs: minPts should be greater than 0"); }
if(eps < 0) { stop("DBSCAN: Stopping due to invalid inputs: Epsilon (eps) should be greater than 0"); }
UNASSIGNED = 0;
num_records = nrow(X);
num_features = ncol(X);
neighbors = dist(X);
#find same pts and set their distance to the smallest double representation
neighbors = replace(target = neighbors, pattern = 0, replacement = 2.225e-307)
neighbors = neighbors - diag(diag(neighbors));
# neighbors within eps
withinEps = ((neighbors <= eps) * (0 < neighbors));
corePts = rowSums(withinEps) + 1 >= minPts;
clusterMembers = matrix(UNASSIGNED, num_records, 1);
if (sum(corePts) != 0) {
# leave only density reachable pts
neighbors = (neighbors * corePts * withinEps) > 0;
# border pts of multiple clusters
border = neighbors * (t(corePts) == 0 & colSums(neighbors) > 1) * seq(num_records, 1);
border = (border - colMaxs(border)) == 0;
neighbors = neighbors * border;
adjacency = (neighbors + t(neighbors)) > 0;
clusterMembers = components(G=adjacency, verbose=FALSE);
# noise to 0
clusterMembers = clusterMembers * (rowSums(adjacency) > 0);
}
}