DBSCAN: Add new module DBSCAN

This commit adds the DBSCAN module. It includes the brute force
implementation of DBSCAN as well as a prediction function. In
addition, the associated tests and online documentation are
included.

Co-authored-by: Frank McQuillan <fmcquillan@pivotal.io>
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 189b948..f4906a6 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -241,6 +241,7 @@
     @details Methods for clustering data.
     @{
         @defgroup grp_kmeans k-Means Clustering
+        @defgroup grp_dbscan DBSCAN
     @}
 
     @defgroup grp_pca Dimensionality Reduction
diff --git a/src/config/Modules.yml b/src/config/Modules.yml
index f2da10f..cbf099f 100644
--- a/src/config/Modules.yml
+++ b/src/config/Modules.yml
@@ -11,6 +11,8 @@
     - name: convex
       depends: ['utilities']
     - name: crf
+    - name: dbscan
+      depends: ['utilities']
     - name: deep_learning
       depends: ['utilities']
     - name: elastic_net
diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index a7f639b..9cf05a1 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -54,7 +54,7 @@
 other fields.
 
 This type of data mining algorithm uses transactional data. Every transaction
-event has a unique identification, and each transaction consists of a set of
+event has a unique identifier, and each transaction consists of a set of
 items (or itemset). Purchases are considered binary (either it was purchased or
 not), and this implementation does not take into consideration the quantity of
 each item. For the MADlib association rules function, it is assumed that the
diff --git a/src/ports/postgres/modules/dbscan/__init__.py_in b/src/ports/postgres/modules/dbscan/__init__.py_in
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/ports/postgres/modules/dbscan/__init__.py_in
diff --git a/src/ports/postgres/modules/dbscan/dbscan.py_in b/src/ports/postgres/modules/dbscan/dbscan.py_in
new file mode 100644
index 0000000..7adc971
--- /dev/null
+++ b/src/ports/postgres/modules/dbscan/dbscan.py_in
@@ -0,0 +1,334 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import plpy
+
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import unique_string
+from utilities.utilities import add_postfix
+from utilities.utilities import NUMERIC, ONLY_ARRAY
+from utilities.utilities import is_valid_psql_type
+from utilities.utilities import is_platform_pg
+from utilities.validate_args import input_tbl_valid, output_tbl_valid
+from utilities.validate_args import is_var_valid
+from utilities.validate_args import cols_in_tbl_valid
+from utilities.validate_args import get_expr_type
+from utilities.validate_args import get_algorithm_name
+from graph.wcc import wcc
+
+BRUTE_FORCE = 'brute_force'
+KD_TREE = 'kd_tree'
+
+def dbscan(schema_madlib, source_table, output_table, id_column, expr_point, eps, min_samples, metric, algorithm, **kwargs):
+
+    with MinWarning("warning"):
+
+        min_samples = 5 if not min_samples else min_samples
+        metric = 'squared_dist_norm2' if not metric else metric
+        algorithm = 'brute' if not algorithm else algorithm
+
+        algorithm = get_algorithm_name(algorithm, BRUTE_FORCE,
+            [BRUTE_FORCE, KD_TREE], 'DBSCAN')
+
+        _validate_dbscan(schema_madlib, source_table, output_table, id_column,
+                         expr_point, eps, min_samples, metric, algorithm)
+
+        dist_src_sql = ''  if is_platform_pg() else 'DISTRIBUTED BY (__src__)'
+        dist_id_sql = ''  if is_platform_pg() else 'DISTRIBUTED BY ({0})'.format(id_column)
+        dist_reach_sql = ''  if is_platform_pg() else 'DISTRIBUTED BY (__reachable_id__)'
+
+        # Calculate pairwise distances
+        distance_table = unique_string(desp='distance_table')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(distance_table))
+
+        sql = """
+            CREATE TABLE {distance_table} AS
+            SELECT __src__, __dest__ FROM (
+                SELECT  __t1__.{id_column} AS __src__,
+                        __t2__.{id_column} AS __dest__,
+                        {schema_madlib}.{metric}(
+                            __t1__.{expr_point}, __t2__.{expr_point}) AS __dist__
+                FROM {source_table} AS __t1__, {source_table} AS __t2__
+                WHERE __t1__.{id_column} != __t2__.{id_column}) q1
+            WHERE __dist__ < {eps}
+            {dist_src_sql}
+            """.format(**locals())
+        plpy.execute(sql)
+
+        # Find core points
+        # We use __count__ + 1 because we have to add the point itself to the count
+        core_points_table = unique_string(desp='core_points_table')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(core_points_table))
+        sql = """
+            CREATE TABLE {core_points_table} AS
+            SELECT * FROM (SELECT __src__ AS {id_column}, count(*) AS __count__
+                           FROM {distance_table} GROUP BY __src__) q1
+            WHERE __count__ + 1 >= {min_samples}
+            {dist_id_sql}
+            """.format(**locals())
+        plpy.execute(sql)
+
+        # Find the connections between core points to form the clusters
+        core_edge_table = unique_string(desp='core_edge_table')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(core_edge_table))
+        sql = """
+            CREATE TABLE {core_edge_table} AS
+            SELECT __src__, __dest__
+            FROM {distance_table} AS __t1__, (SELECT array_agg({id_column}) AS arr
+                                              FROM {core_points_table}) __t2__
+            WHERE __t1__.__src__ = ANY(arr) AND __t1__.__dest__ = ANY(arr)
+            {dist_src_sql}
+        """.format(**locals())
+        plpy.execute(sql)
+
+        # Run wcc to get the min id for each cluster
+        wcc(schema_madlib, core_points_table, id_column, core_edge_table, 'src=__src__, dest=__dest__',
+            output_table, None)
+        plpy.execute("""
+            ALTER TABLE {0}
+            ADD COLUMN is_core_point BOOLEAN,
+            ADD COLUMN __points__ DOUBLE PRECISION[]
+            """.format(output_table))
+        plpy.execute("""
+            ALTER TABLE {0}
+            RENAME COLUMN component_id TO cluster_id
+            """.format(output_table))
+        plpy.execute("""
+            UPDATE {0}
+            SET is_core_point = TRUE
+        """.format(output_table))
+
+        # Find reachable points
+        reachable_points_table = unique_string(desp='reachable_points_table')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(reachable_points_table))
+        sql = """
+            CREATE TABLE {reachable_points_table} AS
+                SELECT array_agg(__src__) AS __src_list__,
+                       __dest__ AS __reachable_id__
+                FROM {distance_table} AS __t1__,
+                     (SELECT array_agg({id_column}) AS __arr__
+                      FROM {core_points_table}) __t2__
+                WHERE __src__ = ANY(__arr__) AND __dest__ != ALL(__arr__)
+                GROUP BY __dest__
+                {dist_reach_sql}
+            """.format(**locals())
+        plpy.execute(sql)
+
+        sql = """
+            INSERT INTO {output_table}
+            SELECT  __reachable_id__ as {id_column},
+                    cluster_id,
+                    FALSE AS is_core_point,
+                    NULL AS __points__
+            FROM {reachable_points_table} AS __t1__ INNER JOIN
+                 {output_table} AS __t2__
+                 ON (__src_list__[1] = {id_column})
+            """.format(**locals())
+        plpy.execute(sql)
+
+        # Add features of points to the output table to use them for prediction
+        sql = """
+            UPDATE {output_table} AS __t1__
+            SET __points__ = {expr_point}
+            FROM {source_table} AS __t2__
+            WHERE __t1__.{id_column} = __t2__.{id_column}
+        """.format(**locals())
+        plpy.execute(sql)
+
+        # Update the cluster ids to be consecutive
+        sql = """
+            UPDATE {output_table} AS __t1__
+            SET cluster_id = new_id-1
+            FROM (
+                SELECT cluster_id, row_number() OVER(ORDER BY cluster_id) AS new_id
+                FROM {output_table}
+                GROUP BY cluster_id) __t2__
+            WHERE __t1__.cluster_id = __t2__.cluster_id
+        """.format(**locals())
+        plpy.execute(sql)
+
+        output_summary_table = add_postfix(output_table, '_summary')
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(output_summary_table))
+
+        sql = """
+            CREATE TABLE {output_summary_table} AS
+            SELECT  '{id_column}'::VARCHAR AS id_column,
+                    {eps}::DOUBLE PRECISION AS eps,
+                    '{metric}'::VARCHAR AS metric
+            """.format(**locals())
+        plpy.execute(sql)
+
+        plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}".format(
+                     distance_table, core_points_table, core_edge_table,
+                     reachable_points_table))
+
+
+def dbscan_predict(schema_madlib, dbscan_table, new_point, **kwargs):
+
+    with MinWarning("warning"):
+
+        dbscan_summary_table = add_postfix(dbscan_table, '_summary')
+        summary = plpy.execute("SELECT * FROM {0}".format(dbscan_summary_table))[0]
+
+        eps = summary['eps']
+        metric = summary['metric']
+        sql = """
+            SELECT cluster_id,
+                   {schema_madlib}.{metric}(__points__, ARRAY{new_point}) as dist
+            FROM {dbscan_table}
+            WHERE is_core_point = TRUE
+            ORDER BY dist LIMIT 1
+            """.format(**locals())
+        result = plpy.execute(sql)[0]
+        dist = result['dist']
+        if dist < eps:
+            return result['cluster_id']
+        else:
+            return None
+
+def _validate_dbscan(schema_madlib, source_table, output_table, id_column, expr_point, eps, min_samples, metric, algorithm):
+
+    input_tbl_valid(source_table, 'dbscan')
+    output_tbl_valid(output_table, 'dbscan')
+    output_summary_table = add_postfix(output_table, '_summary')
+    output_tbl_valid(output_summary_table, 'dbscan')
+
+    cols_in_tbl_valid(source_table, [id_column], 'dbscan')
+
+    _assert(is_var_valid(source_table, expr_point),
+            "dbscan error: {0} is an invalid column name or "
+            "expression for expr_point param".format(expr_point))
+
+    point_col_type = get_expr_type(expr_point, source_table)
+    _assert(is_valid_psql_type(point_col_type, NUMERIC | ONLY_ARRAY),
+            "dbscan Error: Feature column or expression '{0}' in train table is not"
+            " a numeric array.".format(expr_point))
+
+    _assert(eps > 0, "dbscan Error: eps has to be a positive number")
+
+    _assert(min_samples > 0, "dbscan Error: min_samples has to be a positive number")
+
+    fn_dist_list = ['dist_norm1', 'dist_norm2', 'squared_dist_norm2', 'dist_angle', 'dist_tanimoto']
+    _assert(metric in fn_dist_list, "dbscan Error: metric has to be one of the madlib defined distance functions")
+
+
+def dbscan_help(schema_madlib, message=None, **kwargs):
+    """
+    Help function for dbscan
+
+    Args:
+        @param schema_madlib
+        @param message: string, Help message string
+        @param kwargs
+
+    Returns:
+        String. Help/usage information
+    """
+    if message is not None and \
+            message.lower() in ("usage", "help", "?"):
+        help_string = """
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.dbscan(
+    source_table,       -- Name of the training data table
+    output_table,       -- Name of the output table
+    id_column,          -- Name of id column in source_table
+    expr_point,         -- Column name or expression for data points
+    eps,                -- The minimum radius of a cluster
+    min_samples,        -- The minimum size of a cluster
+    metric,             -- The name of the function to use to calculate the
+                        -- distance
+    algorithm           -- The algorithm to use for dbscan.
+    );
+
+-----------------------------------------------------------------------
+                            OUTPUT
+-----------------------------------------------------------------------
+The output of the dbscan function is a table with the following columns:
+
+id_column           The ids of test data point
+cluster_id          The id of the points associated cluster
+is_core_point       Boolean column that indicates if the point is core or not
+points              The column or expression for the data point
+"""
+    else:
+        help_string = """
+----------------------------------------------------------------------------
+                                SUMMARY
+----------------------------------------------------------------------------
+DBSCAN is a density-based clustering algorithm. Given a set of points in
+some space, it groups together points that are closely packed together
+(points with many nearby neighbors), marking as outliers points that lie
+alone in low-density regions (whose nearest neighbors are too far away).
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.dbscan('usage');
+SELECT {schema_madlib}.dbscan_predict('usage');
+"""
+    return help_string.format(schema_madlib=schema_madlib)
+# ------------------------------------------------------------------------------
+
+def dbscan_predict_help(schema_madlib, message=None, **kwargs):
+    """
+    Help function for dbscan
+
+    Args:
+        @param schema_madlib
+        @param message: string, Help message string
+        @param kwargs
+
+    Returns:
+        String. Help/usage information
+    """
+    if message is not None and \
+            message.lower() in ("usage", "help", "?"):
+        help_string = """
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.dbscan_predict(
+    dbscan_table,       -- Name of the dbscan output table
+    new_point           -- Double precision array representing the point
+                        -- for prediction
+    );
+
+-----------------------------------------------------------------------
+                            OUTPUT
+-----------------------------------------------------------------------
+The output of the dbscan_predict is an integer indicating the cluster_id
+of given point
+"""
+    else:
+        help_string = """
+----------------------------------------------------------------------------
+                                SUMMARY
+----------------------------------------------------------------------------
+DBSCAN is a density-based clustering algorithm. Given a set of points in
+some space, it groups together points that are closely packed together
+(points with many nearby neighbors), marking as outliers points that lie
+alone in low-density regions (whose nearest neighbors are too far away).
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.dbscan('usage');
+SELECT {schema_madlib}.dbscan_predict('usage');
+"""
+    return help_string.format(schema_madlib=schema_madlib)
+# ------------------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/dbscan/dbscan.sql_in b/src/ports/postgres/modules/dbscan/dbscan.sql_in
new file mode 100644
index 0000000..976d065
--- /dev/null
+++ b/src/ports/postgres/modules/dbscan/dbscan.sql_in
@@ -0,0 +1,313 @@
+/* ----------------------------------------------------------------------- */
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * @file dbscan.sql_in
+ * @brief Partitions a set of observations into clusters of arbitrary shape based on the density of nearby neighbors.
+ * @date May 2020
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+
+/**
+@addtogroup grp_dbscan
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li class="level1"><a href="#cluster">Clustering Function</a></li>
+<li class="level1"><a href="#assignment">Cluster Assignment</a></li>
+<li class="level1"><a href="#examples">Examples</a></li>
+<li class="level1"><a href="#literature">Literature</a></li>
+<li class="level1"><a href="#related">Related Topics</a></li>
+</ul>
+</div>
+
+@brief Partitions a set of observations into clusters of arbitrary
+shape based on the density of nearby neighbors.
+
+Density-based spatial clustering of applications with noise (DBSCAN)
+is a data clustering algorithm designed to discover clusters of arbitrary
+shape [1]. It places minimum requirements on domain knowledge to determine
+input parameters and has good efficiency on large databases.
+
+Given a set of points, DBSCAN groups together points that are closely packed with many
+nearby neighbors (core samples), and marks as outliers points that lie alone in
+low-density regions with few nearby neighbors (non-core samples).
+This method tends to be good for data which contains clusters
+of similar density.
+
+@anchor cluster
+@par Clustering Function
+
+<pre class="syntax">
+dbscan( source_table,
+        output_table,
+        id_column,
+        expr_point,
+        eps,
+        min_samples,
+        metric,
+        algorithm,
+        algorithm_params
+      )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>source_table</dt>
+<dd>TEXT. Name of the table containing the input data points.
+</dd>
+
+<dt>output_table</dt>
+<dd>TEXT. Name of the table containing the clustering results.
+</dd>
+
+<dt>id_column</dt>
+<dd>TEXT. Name of the column containing a unique integer id for each training point.
+</dd>
+
+<dt>expr_point</dt>
+<dd>TEXT. Name of the column with point coordinates in array form,
+or an expression that evaluates to an array of point coordinates.
+</dd>
+
+<dt>eps</dt>
+<dd>FLOAT8. Maximum distance between two samples for one to
+be considered in the neighborhood of the other. (Note this is not a maximum
+bound on the distances of points within a cluster.) This is an
+important parameter to choose appropriately and should consider both the
+nature of the data set and the distance function. Usually it is not left
+at the default value.
+</dd>
+
+<dt>min_samples (optional)</dt>
+<dd>INTEGER, default: 5.  Number of samples in a neighborhood for a point
+to be considered as a core point. This includes the point itself.
+This parameter controls how tolerant the algorithm is towards
+noise, so on noisy data sets it may be useful to increase
+the magnitude of this parameter.
+</dd>
+
+@note
+The parameters 'eps' and 'min_samples' together define the density of a cluster.
+A core sample is a sample in the dataset such
+that there exist 'min_samples' other samples within a distance of 'eps',
+which are defined as neighbors of the core sample.
+A higher value of 'min_samples' or a lower value of 'eps' indicate that higher density
+is needed to form a cluster.
+
+<dt>metric (optional)</dt>
+<dd>TEXT, default: 'squared_dist_norm2'. The name of the function used
+to calculate the distance between data points. The following distance
+functions can be used:
+<ul>
+<li><b>\ref dist_norm1</b>:  1-norm/Manhattan (element-wise median).
+MADlib does not provide a median aggregate function for
+performance reasons.</li>
+<li><b>\ref dist_norm2</b>: 2-norm/Euclidean (element-wise mean)</li>
+<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance (element-wise mean)</li>
+<li><b>\ref dist_angle</b>: angle (element-wise mean of normalized points)</li>
+<li><b>\ref dist_tanimoto</b>: tanimoto (element-wise mean of normalized points)</li></ul>
+</dd>
+
+<dt>algorithm TBD (optional)</dt>
+<dd>TEXT, default: 'brute_force'. The name of the algorithm
+used to compute clusters. The following options are supported:
+<ul>
+<li><b>\ref brute_force</b>: Produces an exact result by searching
+all points in the search space.  You can also use a short
+form "b" or "brute" etc. to select brute force.</li>
+<li><b>\ref xx_tree</b>: Produces an approximate result by searching
+a subset of the search space, that is, only certain leaf nodes in the
+xx-tree as specified by "algorithm_params" below.
+You can also use a short
+form "x" or "xx" etc. to select xx-tree.</li></ul></dd>
+
+<dt>algorithm_params TBD (optional)</dt>
+<dd>TEXT, default: 'depth=3, leaf_nodes=2'. These parameters apply to the
+xx-tree algorithm only.
+<ul>
+<li><b>\ref depth</b>: Depth of the xx-tree. Increasing this value will
+decrease run-time but reduce the accuracy.</li>
+<li><b>\ref leaf_nodes</b>: Number of leaf nodes (regions) to search for each test point.
+Increasing this value will improve the accuracy but increase run-time.</li></ul>
+
+@note
+Please note that the xx-tree accuracy will be lower for datasets with a high
+number of features. It is advised to use at least two leaf nodes.
+Refer to the <a href="#background">Technical Background</a> for more information
+on how the xx-tree is implemented.</dd>
+
+</dl>
+
+
+<b>Output</b>
+<br>
+The output table for DBSCAN module has the following columns:
+<table class="output">
+    <tr>
+        <th>id_in</th>
+        <td>INTEGER. Test data point id.</td>
+    </tr>
+    <tr>
+        <th>cluster_id</th>
+        <td>INTEGER. Cluster id associated with the test data point.</td>
+    </tr>
+    <tr>
+        <th>is_core_point</th>
+        <td>BOOLEAN. Indicates if the test data point is core or non-core.</td>
+    </tr>
+    <tr>
+        <th>points</th>
+        <td>TEXT. Column or expression for the data point.</td>
+    </tr>
+</table>
+
+@anchor assignment
+@par Cluster Assignment
+
+After clustering, the cluster assignment for each data point can be computed:
+
+<pre class="syntax">
+dbscan_predict( dbscan_table,
+                new_point
+              )
+</pre>
+
+<b>Arguments</b>
+<dl class="arglist">
+<dt>dbscan_table</dt>
+<dd>TEXT. Name of the table created by running DBSCAN.</dd>
+
+<dt>new_point</dt>
+<dd>DOUBLE PRECISION[]. New points to be assigned to clusters.</dd>
+</dl>
+
+<b>Output TBD</b>
+<br>
+The output table has the following columns:
+<table class="output">
+    <tr>
+      <th>column_id</th>
+      <td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td>
+    </tr>
+    <tr>
+      <th>distance</th>
+      <td>DOUBLE PRECISION. Distance to the cluster centroid.</td>
+    </tr>
+</table>
+
+@anchor examples
+@par Examples
+
+@anchor literature
+@literature
+
+@anchor related
+@par Related Topics
+
+*/
+
+
+m4_include(`SQLCommon.m4')
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
+    source_table                VARCHAR,
+    output_table                VARCHAR,
+    id_column                   VARCHAR,
+    expr_point                  VARCHAR,
+    eps                         DOUBLE PRECISION,
+    min_samples                 INTEGER,
+    metric                      VARCHAR,
+    algorithm                   VARCHAR
+) RETURNS VOID AS $$
+    PythonFunction(dbscan, dbscan, dbscan)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
+    source_table                VARCHAR,
+    output_table                VARCHAR,
+    id_column                   VARCHAR,
+    expr_point                  VARCHAR,
+    eps                         DOUBLE PRECISION,
+    min_samples                 INTEGER,
+    metric                      VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
+    source_table                VARCHAR,
+    output_table                VARCHAR,
+    id_column                   VARCHAR,
+    expr_point                  VARCHAR,
+    eps                         DOUBLE PRECISION,
+    min_samples                 INTEGER
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
+    source_table                VARCHAR,
+    output_table                VARCHAR,
+    id_column                   VARCHAR,
+    expr_point                  VARCHAR,
+    eps                         DOUBLE PRECISION
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, NULL, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
+    dbscan_table                VARCHAR,
+    new_point                   DOUBLE PRECISION[]
+) RETURNS INTEGER AS $$
+    PythonFunction(dbscan, dbscan, dbscan_predict)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
+    message                VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(dbscan, dbscan, dbscan_help)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
+) RETURNS VARCHAR AS $$
+    PythonFunction(dbscan, dbscan, dbscan_help)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
+    message                VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(dbscan, dbscan, dbscan_predict_help)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
+) RETURNS VARCHAR AS $$
+    PythonFunction(dbscan, dbscan, dbscan_predict_help)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
diff --git a/src/ports/postgres/modules/dbscan/test/dbscan.sql_in b/src/ports/postgres/modules/dbscan/test/dbscan.sql_in
new file mode 100644
index 0000000..54fcddf
--- /dev/null
+++ b/src/ports/postgres/modules/dbscan/test/dbscan.sql_in
@@ -0,0 +1,91 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+
+DROP TABLE IF EXISTS dbscan_train_data;
+CREATE TABLE dbscan_train_data (
+id_in  integer,
+data    integer[]);
+copy dbscan_train_data (id_in, data) FROM stdin delimiter '|';
+1|{1,1}
+2|{2,2}
+3|{3,3}
+4|{4,4}
+5|{4,8}
+6|{17,8}
+7|{19,8}
+8|{19,9}
+9|{19,10}
+10|{3,111}
+11|{3,112}
+12|{3,113}
+13|{8,113}
+\.
+
+DROP TABLE IF EXISTS out1, out1_summary;
+SELECT dbscan('dbscan_train_data','out1','id_in','data',20,4,'squared_dist_norm2','brute');
+
+SELECT assert(count(DISTINCT id_in) = 5, 'Incorrect cluster 0') FROM out1 WHERE cluster_id = 0 and id_in=ANY(ARRAY[1,2,3,4,5]);
+
+SELECT assert(count(DISTINCT id_in) = 4, 'Incorrect cluster 1') FROM out1 WHERE cluster_id = 1 and id_in=ANY(ARRAY[6,7,8,9]);
+
+SELECT assert(dbscan_predict('out1', array[0,0]::double precision[]) = 0, 'Incorrect predict 0');
+SELECT assert(dbscan_predict('out1', array[9.1,10.8]::double precision[]) = 1, 'Incorrect predict 1');
+SELECT assert(dbscan_predict('out1', array[9,113]::double precision[]) IS NULL, 'Incorrect predict NULL');
+
+DROP TABLE IF EXISTS dbscan_train_data2;
+CREATE TABLE dbscan_train_data2 (pid int, points double precision[]);
+INSERT INTO dbscan_train_data2 VALUES
+(1,  '{1, 1}'),
+(2,  '{2, 1}'),
+(3,  '{1, 2}'),
+(4,  '{2, 2}'),
+(5,  '{3, 5}'),
+(6,  '{3, 9}'),
+(7,  '{3, 10}'),
+(8,  '{4, 10}'),
+(9,  '{4, 11}'),
+(10,  '{5, 10}'),
+(11,  '{7, 10}'),
+(12,  '{10, 9}'),
+(13,  '{10, 6}'),
+(14,  '{9, 5}'),
+(15,  '{10, 5}'),
+(16,  '{11, 5}'),
+(17,  '{9, 4}'),
+(18,  '{10, 4}'),
+(19,  '{11, 4}'),
+(20,  '{10, 3}');
+
+DROP TABLE IF EXISTS dbscan_result, dbscan_result_summary;
+SELECT dbscan(
+'dbscan_train_data2',    -- source table
+'dbscan_result',        -- output table
+'pid',                  -- point id column
+'points',             -- data point
+ 1.75,                  -- epsilon
+ 4,                     -- min samples
+'dist_norm2',           -- metric
+'brute_force');               -- algorithm
+
+SELECT * FROM dbscan_result ORDER BY pid;
+
+SELECT assert(count(DISTINCT cluster_id) = 3, 'Incorrect cluster count') FROM dbscan_result;
diff --git a/src/ports/postgres/modules/dbscan/test/unit_tests/plpy_mock.py_in b/src/ports/postgres/modules/dbscan/test/unit_tests/plpy_mock.py_in
new file mode 100644
index 0000000..dd18649
--- /dev/null
+++ b/src/ports/postgres/modules/dbscan/test/unit_tests/plpy_mock.py_in
@@ -0,0 +1,43 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`<!', `!>')
+def __init__(self):
+    pass
+
+def error(message):
+    raise PLPYException(message)
+
+def execute(query):
+    pass
+
+def warning(query):
+    pass
+
+def info(query):
+    print query
+
+
+class PLPYException(Exception):
+    def __init__(self, message):
+        super(PLPYException, self).__init__()
+        self.message = message
+
+    def __str__(self):
+        return repr(self.message)
diff --git a/src/ports/postgres/modules/dbscan/test/unit_tests/test_dbscan.py_in b/src/ports/postgres/modules/dbscan/test/unit_tests/test_dbscan.py_in
new file mode 100644
index 0000000..e04de94
--- /dev/null
+++ b/src/ports/postgres/modules/dbscan/test/unit_tests/test_dbscan.py_in
@@ -0,0 +1,79 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import numpy as np
+from os import path
+
+# Add modules to the pythonpath.
+sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))
+sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
+
+import unittest
+from mock import *
+import plpy_mock as plpy
+
+m4_changequote(`<!', `!>')
+
+class DBSCANTestCase(unittest.TestCase):
+
+    def setUp(self):
+        self.plpy_mock = Mock(spec='error')
+        patches = {
+            'plpy': plpy
+        }
+        # we need to use MagicMock() instead of Mock() for the plpy.execute mock
+        # to be able to iterate on the return value
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        self.module_patcher = patch.dict('sys.modules', patches)
+        self.module_patcher.start()
+
+        self.default_dict = {
+            'schema_madlib' : "madlib",
+            'source_table' : "source",
+            'output_table' : "output",
+            'id_column' : "id_in",
+            'expr_point' : "data",
+            'eps' : 20,
+            'min_samples' : 4,
+            'metric' : 'squared_dist_norm2',
+            'algorithm' : 'brute'
+        }
+        import dbscan.dbscan
+        self.module = dbscan.dbscan
+
+    def tearDown(self):
+        self.module_patcher.stop()
+
+    def test_validate_dbscan(self):
+
+        self.module.input_tbl_valid = Mock()
+        self.module.output_tbl_valid = Mock()
+        self.module.cols_in_tbl_valid = Mock()
+        self.module.is_var_valid = Mock(return_value = True)
+        self.module.get_expr_type = Mock(return_value = 'double precision[]')
+
+        self.module._validate_dbscan(**self.default_dict)
+
+if __name__ == '__main__':
+    unittest.main()
+
+# ---------------------------------------------------------------------