src/ports/postgres/modules/dbscan/dbscan.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- */
 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  * @file dbscan.sql_in
  * @brief Partitions a set of observations into clusters of arbitrary shape based on the density of nearby neighbors.
  * @date May 2020
  *
  */
 /* ----------------------------------------------------------------------- */


 /**
 @addtogroup grp_dbscan

 <div class="toc"><b>Contents</b>
 <ul>
 <li class="level1"><a href="#cluster">Clustering Function</a></li>
 <li class="level1"><a href="#assignment">Cluster Assignment</a></li>
 <li class="level1"><a href="#examples">Examples</a></li>
 <li class="level1"><a href="#literature">Literature</a></li>
 </ul>
 </div>

 @brief Partitions a set of observations into clusters of arbitrary
 shape based on the density of nearby neighbors.

 \warning <em> This MADlib method is still in early stage development.
 Interface and implementation are subject to change. </em>

 Density-based spatial clustering of applications with noise (DBSCAN)
 is a data clustering algorithm designed to discover clusters of arbitrary
 shape [1,2]. It places minimum requirements on domain knowledge to determine
 input parameters and has good efficiency on large databases.

 Given a set of points, DBSCAN groups together points that are closely packed with many
 nearby neighbors (\em core), and marks points that lie alone in
 low-density regions with few nearby neighbors (\em border).  Other points in the dataset
 that are not core or border are considered as \em noise.
 This method tends to be good for data which contains clusters
 of similar density.

 Currently only a brute force approach is implemented, suitable for small datasets.
 Other approaches for larger datasets will be implemented in
 a future release.

 @anchor cluster
 @par Clustering Function

 <pre class="syntax">
 dbscan( source_table,
         output_table,
         id_column,
         expr_point,
         eps,
         min_samples,
         metric,
         algorithm
       )
 </pre>

 \b Arguments
 <dl class="arglist">
 <dt>source_table</dt>
 <dd>TEXT. Name of the table containing the input data points.
 </dd>

 <dt>output_table</dt>
 <dd>TEXT. Name of the table containing the clustering results.
 </dd>

 <dt>id_column</dt>
 <dd>TEXT. Name of the column containing a unique integer id for each training point.
 </dd>

 <dt>expr_point</dt>
 <dd>TEXT. Name of the column with point coordinates in array form,
 or an expression that evaluates to an array of point coordinates.
 </dd>

 <dt>eps</dt>
 <dd>FLOAT8. Maximum distance between two samples for one to
 be considered in the neighborhood of the other. (Note this is not a maximum
 bound on the distances of points within a cluster.) This is an
 important parameter to choose appropriately and should consider both the
 nature of the data set and the distance function.
 </dd>

 <dt>min_samples (optional)</dt>
 <dd>INTEGER, default: 5.  Number of samples in a neighborhood for a point
 to be considered as a core point. This includes the point itself.
 This parameter controls how tolerant the algorithm is towards
 noise, so on noisy data sets it may be useful to increase
 the magnitude of this parameter.
 </dd>

 @note
 The parameters 'eps' and 'min_samples' together define the density of a cluster.
 A core point is where there exist 'min_samples' other points within a distance of 'eps',
 which are defined as neighbors of the core point.
 A higher value of 'min_samples' or a lower value of 'eps' indicate that higher density
 is needed to form a cluster.

 <dt>metric (optional)</dt>
 <dd>TEXT, default: 'squared_dist_norm2'. The name of the function used
 to calculate the distance between data points. The following distance
 functions can be used:
 <ul>
 <li><b>\ref dist_norm1</b>:  1-norm/Manhattan (element-wise median).
 MADlib does not provide a median aggregate function for
 performance reasons.</li>
 <li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li>
 <li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li>
 <li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li>
 <li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points)</li>
 </dd>

 <dt>algorithm (optional)</dt>
 <dd>TEXT, default: 'brute_force'. The name of the algorithm
 used to compute clusters. Currently only brute force is supported:
 <ul>
 <li><b>\ref brute_force</b>: Produces an exact result by searching
 all points in the search space.  Brute force can be slow and is intended
 to be used for small datasets only.  You can also use a short
 form "b" or "brute" etc. to select brute force.</li>
 </ul></dd>
 </dl>

 <b>Output</b>
 <br>
 The output table for DBSCAN module has the following columns:
 <table class="output">
     <tr>
         <th>id_column</th>
         <td>INTEGER. Test data point id.</td>
     </tr>
     <tr>
         <th>cluster_id</th>
         <td>INTEGER. Cluster id associated with the test data point.</td>
     </tr>
     <tr>
         <th>is_core_point</th>
         <td>BOOLEAN. Indicates if the test data point is core
         If it is not a core point and listed in the output table,
         it is a border point.  Noise points are not shown in this
         table.</td>
     </tr>
     <tr>
         <th>__points__</th>
         <td>TEXT. Column or expression for the data point.</td>
     </tr>
 </table>

 @anchor assignment
 @par Cluster Assignment

 After clustering, the cluster assignment for each data point can be computed:

 <pre class="syntax">
 dbscan_predict( dbscan_table,
                 source_table,
                 id_column,
                 expr_point,
                 output_table
                 )
 </pre>

 <b>Arguments</b>
 <dl class="arglist">

 <dt>dbscan_table</dt>
 <dd>TEXT. Name of the table created by running DBSCAN.</dd>

 <dt>source_table</dt>
 <dd>TEXT. Name of the table containing the input data points.
 </dd>

 <dt>id_column</dt>
 <dd>TEXT. Name of the column containing a unique integer id for each training point.
 </dd>

 <dt>expr_point</dt>
 <dd>TEXT. Name of the column with point coordinates in array form,
 or an expression that evaluates to an array of point coordinates.
 </dd>

 <dt>output_table</dt>
 <dd>TEXT. Name of the table containing the clustering results.
 </dd>
 </dl>

 <b>Output</b>
 <br>
 The output is a table with the following columns:
 <table class="output">
     <tr>
       <th>id_column</th>
       <td>INTEGER. ID column passed to the function.</td>
     </tr>
     <tr>
       <th>cluster_id</th>
       <td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td>
     </tr>
     <tr>
       <th>distance</th>
       <td>DOUBLE PRECISION. Distance to the cluster centroid.</td>
     </tr>
 </table>


 @anchor examples
 @par Examples

 -# Create input data:
 <pre class="example">
 DROP TABLE IF EXISTS dbscan_train_data;
 CREATE TABLE dbscan_train_data (pid int, points double precision[]);
 INSERT INTO dbscan_train_data VALUES
 (1,  '{1, 1}'),
 (2,  '{2, 1}'),
 (3,  '{1, 2}'),
 (4,  '{2, 2}'),
 (5,  '{3, 5}'),
 (6,  '{3, 9}'),
 (7,  '{3, 10}'),
 (8,  '{4, 10}'),
 (9,  '{4, 11}'),
 (10,  '{5, 10}'),
 (11,  '{7, 10}'),
 (12,  '{10, 9}'),
 (13,  '{10, 6}'),
 (14,  '{9, 5}'),
 (15,  '{10, 5}'),
 (16,  '{11, 5}'),
 (17,  '{9, 4}'),
 (18,  '{10, 4}'),
 (19,  '{11, 4}'),
 (20,  '{10, 3}');
 CREATE TABLE dbscan_test_data (pid int, points double precision[]);
 INSERT INTO dbscan_test_data VALUES
 (1,  '{1, 2}'),
 (2,  '{2, 2}'),
 (3,  '{1, 3}'),
 (4,  '{2, 2}'),
 (10,  '{5, 11}'),
 (11,  '{7, 10}'),
 (12,  '{10, 9}'),
 (13,  '{10, 6}'),
 (14,  '{9, 5}'),
 (15,  '{10, 6}');
 </pre>
 -#  Run DBSCAN using the brute force method with a Euclidean
 distance function:
 <pre class="example">
 DROP TABLE IF EXISTS dbscan_result, dbscan_result_summary;
 SELECT madlib.dbscan(
                 'dbscan_train_data',    -- source table
                 'dbscan_result',        -- output table
                 'pid',                  -- point id column
                 'points',               -- data point
                  1.75,                  -- epsilon
                  4,                     -- min samples
                 'dist_norm2',           -- metric
                 'brute_force');         -- algorithm
 SELECT * FROM dbscan_result ORDER BY pid;
 </pre>
 <pre class="result">
  pid | cluster_id | is_core_point | __points__
 -----+------------+---------------+------------
    1 |          0 | t             | {1,1}
    2 |          0 | t             | {2,1}
    3 |          0 | t             | {1,2}
    4 |          0 | t             | {2,2}
    6 |          1 | f             | {3,9}
    7 |          1 | t             | {3,10}
    8 |          1 | t             | {4,10}
    9 |          1 | t             | {4,11}
   10 |          1 | f             | {5,10}
   13 |          2 | t             | {10,6}
   14 |          2 | t             | {9,5}
   15 |          2 | t             | {10,5}
   16 |          2 | t             | {11,5}
   17 |          2 | t             | {9,4}
   18 |          2 | t             | {10,4}
   19 |          2 | t             | {11,4}
   20 |          2 | t             | {10,3}
 (17 rows)
 </pre>
 There are three clusters created.  All points are core points
 except for 6 and 10 which are border points.  The noise points
 do not show up in the output table. If you want to see the noise points
 you can use a query like:
 <pre class="example">
 SELECT l.* FROM dbscan_train_data l WHERE NOT EXISTS
     (SELECT NULL FROM dbscan_result r WHERE r.pid = l.pid)
     ORDER BY l.pid;
 </pre>
 <pre class="result">
  pid | points
 -----+--------
    5 | {3,5}
   11 | {7,10}
   12 | {10,9}
 </pre>
 The summary table lists the 'eps' value and the distance metric used:
 <pre class="example">
 SELECT * FROM dbscan_result_summary;
 </pre>
 <pre class="result">
  id_column | eps  |   metric
 -----------+------+------------
  pid       | 1.75 | dist_norm2
 </pre>
 -#  Find the cluster assignment for the test data points:
 <pre class="example">
 SELECT madlib.dbscan_predict(
                         'dbscan_result',        -- from DBSCAN run
                         'dbscan_test_data',     -- test dataset
                         'pid',                  -- point id column
                         'points',               -- data point
                         'dbscan_predict_out'    -- output table
                         );
 SELECT * FROM dbscan_predict_out ORDER BY pid;
 </pre>
 <pre class="result">
  pid | cluster_id | distance
 -----+------------+----------
    1 |          0 |        0
    2 |          0 |        0
    3 |          0 |        1
    4 |          0 |        0
   10 |          1 |        1
   13 |          2 |        0
   14 |          2 |        0
   15 |          2 |        0
 (8 rows)
 </pre>

 @anchor literature
 @literature

 [1] Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu,
 "A Density-Based Algorithm for Discovering Clusters
 in Large Spatial Databases with Noise", KDD-96 Proceedings,
 https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf

 [2] Erich Schubert, Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu,
 "DBSCAN Revisited, Revisited: Why and How You Should (Still) Use DBSCAN",
 ACM Transactions on Database Systems, July 2017, Article No. 19,
 https://dl.acm.org/doi/10.1145/3068335

 */


 m4_include(`SQLCommon.m4')

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
     source_table                VARCHAR,
     output_table                VARCHAR,
     id_column                   VARCHAR,
     expr_point                  VARCHAR,
     eps                         DOUBLE PRECISION,
     min_samples                 INTEGER,
     metric                      VARCHAR,
     algorithm                   VARCHAR
 ) RETURNS VOID AS $$
     PythonFunction(dbscan, dbscan, dbscan)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
     source_table                VARCHAR,
     output_table                VARCHAR,
     id_column                   VARCHAR,
     expr_point                  VARCHAR,
     eps                         DOUBLE PRECISION,
     min_samples                 INTEGER,
     metric                      VARCHAR
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
     source_table                VARCHAR,
     output_table                VARCHAR,
     id_column                   VARCHAR,
     expr_point                  VARCHAR,
     eps                         DOUBLE PRECISION,
     min_samples                 INTEGER
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
     source_table                VARCHAR,
     output_table                VARCHAR,
     id_column                   VARCHAR,
     expr_point                  VARCHAR,
     eps                         DOUBLE PRECISION
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
     dbscan_table                VARCHAR,
     source_table                VARCHAR,
     id_column                   VARCHAR,
     expr_point                  VARCHAR,
     output_table                VARCHAR
 ) RETURNS VOID AS $$
     PythonFunction(dbscan, dbscan, dbscan_predict)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');


 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
     message                VARCHAR
 ) RETURNS VARCHAR AS $$
     PythonFunction(dbscan, dbscan, dbscan_help)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
 ) RETURNS VARCHAR AS $$
     PythonFunction(dbscan, dbscan, dbscan_help)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
     message                VARCHAR
 ) RETURNS VARCHAR AS $$
     PythonFunction(dbscan, dbscan, dbscan_predict_help)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
 ) RETURNS VARCHAR AS $$
     PythonFunction(dbscan, dbscan, dbscan_predict_help)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
	/* ----------------------------------------------------------------------- */
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	* @file dbscan.sql_in
	* @brief Partitions a set of observations into clusters of arbitrary shape based on the density of nearby neighbors.
	* @date May 2020
	*
	*/
	/* ----------------------------------------------------------------------- */


	/**
	@addtogroup grp_dbscan

	<div class="toc"><b>Contents</b>
	<ul>
	<li class="level1"><a href="#cluster">Clustering Function</a></li>
	<li class="level1"><a href="#assignment">Cluster Assignment</a></li>
	<li class="level1"><a href="#examples">Examples</a></li>
	<li class="level1"><a href="#literature">Literature</a></li>
	</ul>
	</div>

	@brief Partitions a set of observations into clusters of arbitrary
	shape based on the density of nearby neighbors.

	\warning <em> This MADlib method is still in early stage development.
	Interface and implementation are subject to change. </em>

	Density-based spatial clustering of applications with noise (DBSCAN)
	is a data clustering algorithm designed to discover clusters of arbitrary
	shape [1,2]. It places minimum requirements on domain knowledge to determine
	input parameters and has good efficiency on large databases.

	Given a set of points, DBSCAN groups together points that are closely packed with many
	nearby neighbors (\em core), and marks points that lie alone in
	low-density regions with few nearby neighbors (\em border). Other points in the dataset
	that are not core or border are considered as \em noise.
	This method tends to be good for data which contains clusters
	of similar density.

	Currently only a brute force approach is implemented, suitable for small datasets.
	Other approaches for larger datasets will be implemented in
	a future release.

	@anchor cluster
	@par Clustering Function

	<pre class="syntax">
	dbscan( source_table,
	output_table,
	id_column,
	expr_point,
	eps,
	min_samples,
	metric,
	algorithm
	)
	</pre>

	\b Arguments
	<dl class="arglist">
	<dt>source_table</dt>
	<dd>TEXT. Name of the table containing the input data points.
	</dd>

	<dt>output_table</dt>
	<dd>TEXT. Name of the table containing the clustering results.
	</dd>

	<dt>id_column</dt>
	<dd>TEXT. Name of the column containing a unique integer id for each training point.
	</dd>

	<dt>expr_point</dt>
	<dd>TEXT. Name of the column with point coordinates in array form,
	or an expression that evaluates to an array of point coordinates.
	</dd>

	<dt>eps</dt>
	<dd>FLOAT8. Maximum distance between two samples for one to
	be considered in the neighborhood of the other. (Note this is not a maximum
	bound on the distances of points within a cluster.) This is an
	important parameter to choose appropriately and should consider both the
	nature of the data set and the distance function.
	</dd>

	<dt>min_samples (optional)</dt>
	<dd>INTEGER, default: 5. Number of samples in a neighborhood for a point
	to be considered as a core point. This includes the point itself.
	This parameter controls how tolerant the algorithm is towards
	noise, so on noisy data sets it may be useful to increase
	the magnitude of this parameter.
	</dd>

	@note
	The parameters 'eps' and 'min_samples' together define the density of a cluster.
	A core point is where there exist 'min_samples' other points within a distance of 'eps',
	which are defined as neighbors of the core point.
	A higher value of 'min_samples' or a lower value of 'eps' indicate that higher density
	is needed to form a cluster.

	<dt>metric (optional)</dt>
	<dd>TEXT, default: 'squared_dist_norm2'. The name of the function used
	to calculate the distance between data points. The following distance
	functions can be used:
	<ul>
	<li><b>\ref dist_norm1</b>: 1-norm/Manhattan (element-wise median).
	MADlib does not provide a median aggregate function for
	performance reasons.</li>
	<li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li>
	<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li>
	<li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li>
	<li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points)</li>
	</dd>

	<dt>algorithm (optional)</dt>
	<dd>TEXT, default: 'brute_force'. The name of the algorithm
	used to compute clusters. Currently only brute force is supported:
	<ul>
	<li><b>\ref brute_force</b>: Produces an exact result by searching
	all points in the search space. Brute force can be slow and is intended
	to be used for small datasets only. You can also use a short
	form "b" or "brute" etc. to select brute force.</li>
	</ul></dd>
	</dl>

	<b>Output</b>
	<br>
	The output table for DBSCAN module has the following columns:
	<table class="output">
	<tr>
	<th>id_column</th>
	<td>INTEGER. Test data point id.</td>
	</tr>
	<tr>
	<th>cluster_id</th>
	<td>INTEGER. Cluster id associated with the test data point.</td>
	</tr>
	<tr>
	<th>is_core_point</th>
	<td>BOOLEAN. Indicates if the test data point is core
	If it is not a core point and listed in the output table,
	it is a border point. Noise points are not shown in this
	table.</td>
	</tr>
	<tr>
	<th>__points__</th>
	<td>TEXT. Column or expression for the data point.</td>
	</tr>
	</table>

	@anchor assignment
	@par Cluster Assignment

	After clustering, the cluster assignment for each data point can be computed:

	<pre class="syntax">
	dbscan_predict( dbscan_table,
	source_table,
	id_column,
	expr_point,
	output_table
	)
	</pre>

	<b>Arguments</b>
	<dl class="arglist">

	<dt>dbscan_table</dt>
	<dd>TEXT. Name of the table created by running DBSCAN.</dd>

	<dt>source_table</dt>
	<dd>TEXT. Name of the table containing the input data points.
	</dd>

	<dt>id_column</dt>
	<dd>TEXT. Name of the column containing a unique integer id for each training point.
	</dd>

	<dt>expr_point</dt>
	<dd>TEXT. Name of the column with point coordinates in array form,
	or an expression that evaluates to an array of point coordinates.
	</dd>

	<dt>output_table</dt>
	<dd>TEXT. Name of the table containing the clustering results.
	</dd>
	</dl>

	<b>Output</b>
	<br>
	The output is a table with the following columns:
	<table class="output">
	<tr>
	<th>id_column</th>
	<td>INTEGER. ID column passed to the function.</td>
	</tr>
	<tr>
	<th>cluster_id</th>
	<td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td>
	</tr>
	<tr>
	<th>distance</th>
	<td>DOUBLE PRECISION. Distance to the cluster centroid.</td>
	</tr>
	</table>


	@anchor examples
	@par Examples

	-# Create input data:
	<pre class="example">
	DROP TABLE IF EXISTS dbscan_train_data;
	CREATE TABLE dbscan_train_data (pid int, points double precision[]);
	INSERT INTO dbscan_train_data VALUES
	(1, '{1, 1}'),
	(2, '{2, 1}'),
	(3, '{1, 2}'),
	(4, '{2, 2}'),
	(5, '{3, 5}'),
	(6, '{3, 9}'),
	(7, '{3, 10}'),
	(8, '{4, 10}'),
	(9, '{4, 11}'),
	(10, '{5, 10}'),
	(11, '{7, 10}'),
	(12, '{10, 9}'),
	(13, '{10, 6}'),
	(14, '{9, 5}'),
	(15, '{10, 5}'),
	(16, '{11, 5}'),
	(17, '{9, 4}'),
	(18, '{10, 4}'),
	(19, '{11, 4}'),
	(20, '{10, 3}');
	CREATE TABLE dbscan_test_data (pid int, points double precision[]);
	INSERT INTO dbscan_test_data VALUES
	(1, '{1, 2}'),
	(2, '{2, 2}'),
	(3, '{1, 3}'),
	(4, '{2, 2}'),
	(10, '{5, 11}'),
	(11, '{7, 10}'),
	(12, '{10, 9}'),
	(13, '{10, 6}'),
	(14, '{9, 5}'),
	(15, '{10, 6}');
	</pre>
	-# Run DBSCAN using the brute force method with a Euclidean
	distance function:
	<pre class="example">
	DROP TABLE IF EXISTS dbscan_result, dbscan_result_summary;
	SELECT madlib.dbscan(
	'dbscan_train_data', -- source table
	'dbscan_result', -- output table
	'pid', -- point id column
	'points', -- data point
	1.75, -- epsilon
	4, -- min samples
	'dist_norm2', -- metric
	'brute_force'); -- algorithm
	SELECT * FROM dbscan_result ORDER BY pid;
	</pre>
	<pre class="result">
	pid \| cluster_id \| is_core_point \| __points__
	-----+------------+---------------+------------
	1 \| 0 \| t \| {1,1}
	2 \| 0 \| t \| {2,1}
	3 \| 0 \| t \| {1,2}
	4 \| 0 \| t \| {2,2}
	6 \| 1 \| f \| {3,9}
	7 \| 1 \| t \| {3,10}
	8 \| 1 \| t \| {4,10}
	9 \| 1 \| t \| {4,11}
	10 \| 1 \| f \| {5,10}
	13 \| 2 \| t \| {10,6}
	14 \| 2 \| t \| {9,5}
	15 \| 2 \| t \| {10,5}
	16 \| 2 \| t \| {11,5}
	17 \| 2 \| t \| {9,4}
	18 \| 2 \| t \| {10,4}
	19 \| 2 \| t \| {11,4}
	20 \| 2 \| t \| {10,3}
	(17 rows)
	</pre>
	There are three clusters created. All points are core points
	except for 6 and 10 which are border points. The noise points
	do not show up in the output table. If you want to see the noise points
	you can use a query like:
	<pre class="example">
	SELECT l.* FROM dbscan_train_data l WHERE NOT EXISTS
	(SELECT NULL FROM dbscan_result r WHERE r.pid = l.pid)
	ORDER BY l.pid;
	</pre>
	<pre class="result">
	pid \| points
	-----+--------
	5 \| {3,5}
	11 \| {7,10}
	12 \| {10,9}
	</pre>
	The summary table lists the 'eps' value and the distance metric used:
	<pre class="example">
	SELECT * FROM dbscan_result_summary;
	</pre>
	<pre class="result">
	id_column \| eps \| metric
	-----------+------+------------
	pid \| 1.75 \| dist_norm2
	</pre>
	-# Find the cluster assignment for the test data points:
	<pre class="example">
	SELECT madlib.dbscan_predict(
	'dbscan_result', -- from DBSCAN run
	'dbscan_test_data', -- test dataset
	'pid', -- point id column
	'points', -- data point
	'dbscan_predict_out' -- output table
	);
	SELECT * FROM dbscan_predict_out ORDER BY pid;
	</pre>
	<pre class="result">
	pid \| cluster_id \| distance
	-----+------------+----------
	1 \| 0 \| 0
	2 \| 0 \| 0
	3 \| 0 \| 1
	4 \| 0 \| 0
	10 \| 1 \| 1
	13 \| 2 \| 0
	14 \| 2 \| 0
	15 \| 2 \| 0
	(8 rows)
	</pre>

	@anchor literature
	@literature

	[1] Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu,
	"A Density-Based Algorithm for Discovering Clusters
	in Large Spatial Databases with Noise", KDD-96 Proceedings,
	https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf

	[2] Erich Schubert, Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu,
	"DBSCAN Revisited, Revisited: Why and How You Should (Still) Use DBSCAN",
	ACM Transactions on Database Systems, July 2017, Article No. 19,
	https://dl.acm.org/doi/10.1145/3068335

	*/


	m4_include(`SQLCommon.m4')

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
	source_table VARCHAR,
	output_table VARCHAR,
	id_column VARCHAR,
	expr_point VARCHAR,
	eps DOUBLE PRECISION,
	min_samples INTEGER,
	metric VARCHAR,
	algorithm VARCHAR
	) RETURNS VOID AS $$
	PythonFunction(dbscan, dbscan, dbscan)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
	source_table VARCHAR,
	output_table VARCHAR,
	id_column VARCHAR,
	expr_point VARCHAR,
	eps DOUBLE PRECISION,
	min_samples INTEGER,
	metric VARCHAR
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, NULL);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
	source_table VARCHAR,
	output_table VARCHAR,
	id_column VARCHAR,
	expr_point VARCHAR,
	eps DOUBLE PRECISION,
	min_samples INTEGER
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, NULL, NULL);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
	source_table VARCHAR,
	output_table VARCHAR,
	id_column VARCHAR,
	expr_point VARCHAR,
	eps DOUBLE PRECISION
	) RETURNS VOID AS $$
	SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, NULL, NULL, NULL);
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
	dbscan_table VARCHAR,
	source_table VARCHAR,
	id_column VARCHAR,
	expr_point VARCHAR,
	output_table VARCHAR
	) RETURNS VOID AS $$
	PythonFunction(dbscan, dbscan, dbscan_predict)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');


	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
	message VARCHAR
	) RETURNS VARCHAR AS $$
	PythonFunction(dbscan, dbscan, dbscan_help)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
	) RETURNS VARCHAR AS $$
	PythonFunction(dbscan, dbscan, dbscan_help)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
	message VARCHAR
	) RETURNS VARCHAR AS $$
	PythonFunction(dbscan, dbscan, dbscan_predict_help)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
	) RETURNS VARCHAR AS $$
	PythonFunction(dbscan, dbscan, dbscan_predict_help)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');