blob: 976d0653040cd094d49cf188e9ccbc6b72327b25 [file] [log] [blame]
/* ----------------------------------------------------------------------- */
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @file dbscan.sql_in
* @brief Partitions a set of observations into clusters of arbitrary shape based on the density of nearby neighbors.
* @date May 2020
*
*/
/* ----------------------------------------------------------------------- */
/**
@addtogroup grp_dbscan
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#cluster">Clustering Function</a></li>
<li class="level1"><a href="#assignment">Cluster Assignment</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
<li class="level1"><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Partitions a set of observations into clusters of arbitrary
shape based on the density of nearby neighbors.
Density-based spatial clustering of applications with noise (DBSCAN)
is a data clustering algorithm designed to discover clusters of arbitrary
shape [1]. It places minimum requirements on domain knowledge to determine
input parameters and has good efficiency on large databases.
Given a set of points, DBSCAN groups together points that are closely packed with many
nearby neighbors (core samples), and marks as outliers points that lie alone in
low-density regions with few nearby neighbors (non-core samples).
This method tends to be good for data which contains clusters
of similar density.
@anchor cluster
@par Clustering Function
<pre class="syntax">
dbscan( source_table,
output_table,
id_column,
expr_point,
eps,
min_samples,
metric,
algorithm,
algorithm_params
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the input data points.
</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the table containing the clustering results.
</dd>
<dt>id_column</dt>
<dd>TEXT. Name of the column containing a unique integer id for each training point.
</dd>
<dt>expr_point</dt>
<dd>TEXT. Name of the column with point coordinates in array form,
or an expression that evaluates to an array of point coordinates.
</dd>
<dt>eps</dt>
<dd>FLOAT8. Maximum distance between two samples for one to
be considered in the neighborhood of the other. (Note this is not a maximum
bound on the distances of points within a cluster.) This is an
important parameter to choose appropriately and should consider both the
nature of the data set and the distance function. Usually it is not left
at the default value.
</dd>
<dt>min_samples (optional)</dt>
<dd>INTEGER, default: 5. Number of samples in a neighborhood for a point
to be considered as a core point. This includes the point itself.
This parameter controls how tolerant the algorithm is towards
noise, so on noisy data sets it may be useful to increase
the magnitude of this parameter.
</dd>
@note
The parameters 'eps' and 'min_samples' together define the density of a cluster.
A core sample is a sample in the dataset such
that there exist 'min_samples' other samples within a distance of 'eps',
which are defined as neighbors of the core sample.
A higher value of 'min_samples' or a lower value of 'eps' indicate that higher density
is needed to form a cluster.
<dt>metric (optional)</dt>
<dd>TEXT, default: 'squared_dist_norm2'. The name of the function used
to calculate the distance between data points. The following distance
functions can be used:
<ul>
<li><b>\ref dist_norm1</b>: 1-norm/Manhattan (element-wise median).
MADlib does not provide a median aggregate function for
performance reasons.</li>
<li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li>
<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li>
<li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li>
<li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points)</li>
</dd>
<dt>algorithm TBD (optional)</dt>
<dd>TEXT, default: 'brute_force'. The name of the algorithm
used to compute clusters. The following options are supported:
<ul>
<li><b>\ref brute_force</b>: Produces an exact result by searching
all points in the search space. You can also use a short
form "b" or "brute" etc. to select brute force.</li>
<li><b>\ref xx_tree</b>: Produces an approximate result by searching
a subset of the search space, that is, only certain leaf nodes in the
xx-tree as specified by "algorithm_params" below.
You can also use a short
form "x" or "xx" etc. to select xx-tree.</li></ul></dd>
<dt>algorithm_params TBD (optional)</dt>
<dd>TEXT, default: 'depth=3, leaf_nodes=2'. These parameters apply to the
xx-tree algorithm only.
<ul>
<li><b>\ref depth</b>: Depth of the xx-tree. Increasing this value will
decrease run-time but reduce the accuracy.</li>
<li><b>\ref leaf_nodes</b>: Number of leaf nodes (regions) to search for each test point.
Inceasing this value will improve the accuracy but increase run-time.</li></ul>
@note
Please note that the xx-tree accuracy will be lower for datasets with a high
number of features. It is advised to use at least two leaf nodes.
Refer to the <a href="#background">Technical Background</a> for more information
on how the xx-tree is implemented.</dd>
</dl>
<<<<<<< Updated upstream
<b>Output</b>
<br>
The output table for DBSCAN module has the following columns:
<table class="output">
<tr>
<th>id_in</th>
<td>INTEGER. Test data point id.</td>
</tr>
<tr>
<th>cluster_id</th>
<td>INTEGER. Cluster id associated with the test data point.</td>
</tr>
<tr>
<th>is_core_point</th>
<td>BOOLEAN. Indicates if the test data point is core or non-core.</td>
</tr>
<tr>
<th>points</th>
<td>TEXT. Column or expression for the data point.</td>
</tr>
</table>
@anchor assignment
@par Cluster Assignment
After clustering, the cluster assignment for each data point can be computed:
<pre class="syntax">
dbscan_predict( dbscan_table,
new_point
)
</pre>
<b>Arguments</b>
<dl class="arglist">
<dt>dbscan_table</dt>
<dd>TEXT. Name of the table created by running DBSCAN.</dd>
<dt>x</dt>
<dd>DOUBLE PRECISION[]. New points to be assigned to clusters.</dd>
</dl>
<b>Output TBD</b>
<br>
The output table has the following columns:
<table class="output">
<tr>
<th>column_id</th>
<td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td>
</tr>
<tr>
<th>distance</th>
<td>DOUBLE PRECISION. Distance to the cluster centroid.</td>
</tr>
</table>
@anchor examples
@par Examples
@anchor literature
@literature
@anchor related
@par Related Topics
*/
m4_include(`SQLCommon.m4')
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER,
metric VARCHAR,
algorithm VARCHAR
) RETURNS VOID AS $$
PythonFunction(dbscan, dbscan, dbscan)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER,
metric VARCHAR
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, NULL, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
dbscan_table VARCHAR,
new_point DOUBLE PRECISION[]
) RETURNS INTEGER AS $$
PythonFunction(dbscan, dbscan, dbscan_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_predict_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_predict_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');