| /* ----------------------------------------------------------------------- */ |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * @file dbscan.sql_in |
| * @brief Partitions a set of observations into clusters of arbitrary shape based on the density of nearby neighbors. |
| * @date May 2020 |
| * |
| */ |
| /* ----------------------------------------------------------------------- */ |
| |
| |
| /** |
| @addtogroup grp_dbscan |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#cluster">Clustering Function</a></li> |
| <li class="level1"><a href="#assignment">Cluster Assignment</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Partitions a set of observations into clusters of arbitrary |
| shape based on the density of nearby neighbors. |
| |
| \warning <em> This MADlib method is still in early stage development. |
| Interface and implementation are subject to change. </em> |
| |
| Density-based spatial clustering of applications with noise (DBSCAN) |
| is a data clustering algorithm designed to discover clusters of arbitrary |
| shape [1,2]. It places minimum requirements on domain knowledge to determine |
| input parameters and has good efficiency on large databases. |
| |
| Given a set of points, DBSCAN groups together points that are closely packed with many |
| nearby neighbors (\em core), and marks points that lie alone in |
| low-density regions with few nearby neighbors (\em border). Other points in the dataset |
| that are not core or border are considered as \em noise. |
| This method tends to be good for data which contains clusters |
| of similar density. |
| |
| Currently only a brute force approach is implemented, suitable for small datasets. |
| Other approaches for larger datasets will be implemented in |
| a future release. |
| |
| @anchor cluster |
| @par Clustering Function |
| |
| <pre class="syntax"> |
| dbscan( source_table, |
| output_table, |
| id_column, |
| expr_point, |
| eps, |
| min_samples, |
| metric, |
| algorithm |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the input data points. |
| </dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the table containing the clustering results. |
| </dd> |
| |
| <dt>id_column</dt> |
| <dd>TEXT. Name of the column containing a unique integer id for each training point. |
| </dd> |
| |
| <dt>expr_point</dt> |
| <dd>TEXT. Name of the column with point coordinates in array form, |
| or an expression that evaluates to an array of point coordinates. |
| </dd> |
| |
| <dt>eps</dt> |
| <dd>FLOAT8. Maximum distance between two samples for one to |
| be considered in the neighborhood of the other. (Note this is not a maximum |
| bound on the distances of points within a cluster.) This is an |
| important parameter to choose appropriately and should consider both the |
| nature of the data set and the distance function. |
| </dd> |
| |
| <dt>min_samples (optional)</dt> |
| <dd>INTEGER, default: 5. Number of samples in a neighborhood for a point |
| to be considered as a core point. This includes the point itself. |
| This parameter controls how tolerant the algorithm is towards |
| noise, so on noisy data sets it may be useful to increase |
| the magnitude of this parameter. |
| </dd> |
| |
| @note |
| The parameters 'eps' and 'min_samples' together define the density of a cluster. |
| A core point is where there exist 'min_samples' other points within a distance of 'eps', |
| which are defined as neighbors of the core point. |
| A higher value of 'min_samples' or a lower value of 'eps' indicate that higher density |
| is needed to form a cluster. |
| |
| <dt>metric (optional)</dt> |
| <dd>TEXT, default: 'squared_dist_norm2'. The name of the function used |
| to calculate the distance between data points. The following distance |
| functions can be used: |
| <ul> |
| <li><b>\ref dist_norm1</b>: 1-norm/Manhattan (element-wise median). |
| MADlib does not provide a median aggregate function for |
| performance reasons.</li> |
| <li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li> |
| <li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li> |
| <li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li> |
| <li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points)</li> |
| </dd> |
| |
| <dt>algorithm (optional)</dt> |
| <dd>TEXT, default: 'brute_force'. The name of the algorithm |
| used to compute clusters. Currently only brute force is supported: |
| <ul> |
| <li><b>\ref brute_force</b>: Produces an exact result by searching |
| all points in the search space. Brute force can be slow and is intended |
| to be used for small datasets only. You can also use a short |
| form "b" or "brute" etc. to select brute force.</li> |
| </ul></dd> |
| </dl> |
| |
| <b>Output</b> |
| <br> |
| The output table for DBSCAN module has the following columns: |
| <table class="output"> |
| <tr> |
| <th>id_column</th> |
| <td>INTEGER. Test data point id.</td> |
| </tr> |
| <tr> |
| <th>cluster_id</th> |
| <td>INTEGER. Cluster id associated with the test data point.</td> |
| </tr> |
| <tr> |
| <th>is_core_point</th> |
| <td>BOOLEAN. Indicates if the test data point is core |
| If it is not a core point and listed in the output table, |
| it is a border point. Noise points are not shown in this |
| table.</td> |
| </tr> |
| <tr> |
| <th>__points__</th> |
| <td>TEXT. Column or expression for the data point.</td> |
| </tr> |
| </table> |
| |
| @anchor assignment |
| @par Cluster Assignment |
| |
| After clustering, the cluster assignment for each data point can be computed: |
| |
| <pre class="syntax"> |
| dbscan_predict( dbscan_table, |
| source_table, |
| id_column, |
| expr_point, |
| output_table |
| ) |
| </pre> |
| |
| <b>Arguments</b> |
| <dl class="arglist"> |
| |
| <dt>dbscan_table</dt> |
| <dd>TEXT. Name of the table created by running DBSCAN.</dd> |
| |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the input data points. |
| </dd> |
| |
| <dt>id_column</dt> |
| <dd>TEXT. Name of the column containing a unique integer id for each training point. |
| </dd> |
| |
| <dt>expr_point</dt> |
| <dd>TEXT. Name of the column with point coordinates in array form, |
| or an expression that evaluates to an array of point coordinates. |
| </dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the table containing the clustering results. |
| </dd> |
| </dl> |
| |
| <b>Output</b> |
| <br> |
| The output is a table with the following columns: |
| <table class="output"> |
| <tr> |
| <th>id_column</th> |
| <td>INTEGER. ID column passed to the function.</td> |
| </tr> |
| <tr> |
| <th>cluster_id</th> |
| <td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td> |
| </tr> |
| <tr> |
| <th>distance</th> |
| <td>DOUBLE PRECISION. Distance to the cluster centroid.</td> |
| </tr> |
| </table> |
| |
| |
| @anchor examples |
| @par Examples |
| |
| -# Create input data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS dbscan_train_data; |
| CREATE TABLE dbscan_train_data (pid int, points double precision[]); |
| INSERT INTO dbscan_train_data VALUES |
| (1, '{1, 1}'), |
| (2, '{2, 1}'), |
| (3, '{1, 2}'), |
| (4, '{2, 2}'), |
| (5, '{3, 5}'), |
| (6, '{3, 9}'), |
| (7, '{3, 10}'), |
| (8, '{4, 10}'), |
| (9, '{4, 11}'), |
| (10, '{5, 10}'), |
| (11, '{7, 10}'), |
| (12, '{10, 9}'), |
| (13, '{10, 6}'), |
| (14, '{9, 5}'), |
| (15, '{10, 5}'), |
| (16, '{11, 5}'), |
| (17, '{9, 4}'), |
| (18, '{10, 4}'), |
| (19, '{11, 4}'), |
| (20, '{10, 3}'); |
| CREATE TABLE dbscan_test_data (pid int, points double precision[]); |
| INSERT INTO dbscan_test_data VALUES |
| (1, '{1, 2}'), |
| (2, '{2, 2}'), |
| (3, '{1, 3}'), |
| (4, '{2, 2}'), |
| (10, '{5, 11}'), |
| (11, '{7, 10}'), |
| (12, '{10, 9}'), |
| (13, '{10, 6}'), |
| (14, '{9, 5}'), |
| (15, '{10, 6}'); |
| </pre> |
| -# Run DBSCAN using the brute force method with a Euclidean |
| distance function: |
| <pre class="example"> |
| DROP TABLE IF EXISTS dbscan_result, dbscan_result_summary; |
| SELECT madlib.dbscan( |
| 'dbscan_train_data', -- source table |
| 'dbscan_result', -- output table |
| 'pid', -- point id column |
| 'points', -- data point |
| 1.75, -- epsilon |
| 4, -- min samples |
| 'dist_norm2', -- metric |
| 'brute_force'); -- algorithm |
| SELECT * FROM dbscan_result ORDER BY pid; |
| </pre> |
| <pre class="result"> |
| pid | cluster_id | is_core_point | __points__ |
| -----+------------+---------------+------------ |
| 1 | 0 | t | {1,1} |
| 2 | 0 | t | {2,1} |
| 3 | 0 | t | {1,2} |
| 4 | 0 | t | {2,2} |
| 6 | 1 | f | {3,9} |
| 7 | 1 | t | {3,10} |
| 8 | 1 | t | {4,10} |
| 9 | 1 | t | {4,11} |
| 10 | 1 | f | {5,10} |
| 13 | 2 | t | {10,6} |
| 14 | 2 | t | {9,5} |
| 15 | 2 | t | {10,5} |
| 16 | 2 | t | {11,5} |
| 17 | 2 | t | {9,4} |
| 18 | 2 | t | {10,4} |
| 19 | 2 | t | {11,4} |
| 20 | 2 | t | {10,3} |
| (17 rows) |
| </pre> |
| There are three clusters created. All points are core points |
| except for 6 and 10 which are border points. The noise points |
| do not show up in the output table. If you want to see the noise points |
| you can use a query like: |
| <pre class="example"> |
| SELECT l.* FROM dbscan_train_data l WHERE NOT EXISTS |
| (SELECT NULL FROM dbscan_result r WHERE r.pid = l.pid) |
| ORDER BY l.pid; |
| </pre> |
| <pre class="result"> |
| pid | points |
| -----+-------- |
| 5 | {3,5} |
| 11 | {7,10} |
| 12 | {10,9} |
| </pre> |
| The summary table lists the 'eps' value and the distance metric used: |
| <pre class="example"> |
| SELECT * FROM dbscan_result_summary; |
| </pre> |
| <pre class="result"> |
| id_column | eps | metric |
| -----------+------+------------ |
| pid | 1.75 | dist_norm2 |
| </pre> |
| -# Find the cluster assignment for the test data points: |
| <pre class="example"> |
| SELECT madlib.dbscan_predict( |
| 'dbscan_result', -- from DBSCAN run |
| 'dbscan_test_data', -- test dataset |
| 'pid', -- point id column |
| 'points', -- data point |
| 'dbscan_predict_out' -- output table |
| ); |
| SELECT * FROM dbscan_predict_out ORDER BY pid; |
| </pre> |
| <pre class="result"> |
| pid | cluster_id | distance |
| -----+------------+---------- |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 0 | 1 |
| 4 | 0 | 0 |
| 10 | 1 | 1 |
| 13 | 2 | 0 |
| 14 | 2 | 0 |
| 15 | 2 | 0 |
| (8 rows) |
| </pre> |
| |
| @anchor literature |
| @literature |
| |
| [1] Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu, |
| "A Density-Based Algorithm for Discovering Clusters |
| in Large Spatial Databases with Noise", KDD-96 Proceedings, |
| https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf |
| |
| [2] Erich Schubert, Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu, |
| "DBSCAN Revisited, Revisited: Why and How You Should (Still) Use DBSCAN", |
| ACM Transactions on Database Systems, July 2017, Article No. 19, |
| https://dl.acm.org/doi/10.1145/3068335 |
| |
| */ |
| |
| |
| m4_include(`SQLCommon.m4') |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| id_column VARCHAR, |
| expr_point VARCHAR, |
| eps DOUBLE PRECISION, |
| min_samples INTEGER, |
| metric VARCHAR, |
| algorithm VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(dbscan, dbscan, dbscan) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| id_column VARCHAR, |
| expr_point VARCHAR, |
| eps DOUBLE PRECISION, |
| min_samples INTEGER, |
| metric VARCHAR |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| id_column VARCHAR, |
| expr_point VARCHAR, |
| eps DOUBLE PRECISION, |
| min_samples INTEGER |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, NULL, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| id_column VARCHAR, |
| expr_point VARCHAR, |
| eps DOUBLE PRECISION |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, NULL, NULL, NULL); |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict( |
| dbscan_table VARCHAR, |
| source_table VARCHAR, |
| id_column VARCHAR, |
| expr_point VARCHAR, |
| output_table VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(dbscan, dbscan, dbscan_predict) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(dbscan, dbscan, dbscan_help) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan( |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(dbscan, dbscan, dbscan_help) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(dbscan, dbscan, dbscan_predict_help) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict( |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(dbscan, dbscan, dbscan_predict_help) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |