/* ----------------------------------------------------------------------- */
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
* @file dbscan.sql_in
* @brief Partitions a set of observations into clusters of arbitrary shape based on the density of nearby neighbors.
* @date May 2020
*
*/
/* ----------------------------------------------------------------------- */
/**
@addtogroup grp_dbscan
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#cluster">Clustering Function</a></li>
<li class="level1"><a href="#assignment">Cluster Assignment</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Partitions a set of observations into clusters of arbitrary
shape based on the density of nearby neighbors.
\warning <em> This MADlib method is still in early stage development.
Interface and implementation are subject to change. </em>
Density-based spatial clustering of applications with noise (DBSCAN)
is a data clustering algorithm designed to discover clusters of arbitrary
shape [1,2]. It places minimal requirements on domain knowledge for choosing
input parameters and is efficient on large databases.
Given a set of points, DBSCAN groups together points that are closely packed
with many nearby neighbors (\em core points), marks as \em border points those
that are not core points themselves but lie within distance 'eps' of a core point,
and treats points that are neither core nor border as \em noise.
This method tends to work well for data whose clusters have similar density.
Two algorithms are provided: a brute-force approach suitable for small datasets,
and a parallel-optimized approach designed to scale to larger datasets
(see the 'algorithm' parameter below).
@anchor cluster
@par Clustering Function
<pre class="syntax">
dbscan( source_table,
output_table,
id_column,
expr_point,
eps,
min_samples,
metric,
algorithm,
max_segmentation_depth
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the input data points.
</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the table containing the clustering results.
</dd>
<dt>id_column</dt>
<dd>TEXT. Name of a column or expression containing a unique integer id for each training point.
</dd>
<dt>expr_point</dt>
<dd>TEXT. Name of the column with point coordinates in array form,
or an expression that evaluates to an array of point coordinates.
</dd>
<dt>eps</dt>
<dd>FLOAT8. Maximum distance between two samples for one to
be considered in the neighborhood of the other. (Note this is not a maximum
bound on the distances of points within a cluster.) This is an
important parameter to choose appropriately; the choice should take into account
both the nature of the data set and the distance function being used.
</dd>
<dt>min_samples (optional)</dt>
<dd>INTEGER, default: 5. Number of samples in a neighborhood for a point
to be considered as a core point. This includes the point itself.
This parameter controls how tolerant the algorithm is towards
noise, so on noisy data sets it may be useful to increase its value.
</dd>
@note
The parameters 'eps' and 'min_samples' together define the density of a cluster.
A point is a core point if at least 'min_samples' points (including the point itself)
lie within distance 'eps' of it; those points are its neighbors.
For example, with 'min_samples' = 4 and 'eps' = 1.75, a point needs at least three
other points within distance 1.75 to be considered a core point.
A higher value of 'min_samples' or a lower value of 'eps' indicates that higher density
is needed to form a cluster.
<dt>metric (optional)</dt>
<dd>TEXT, default: 'squared_dist_norm2'. The name of the function used
to calculate the distance between data points. The following distance
functions can be used:
<ul>
<li><b>\ref dist_norm1</b>: 1-norm/Manhattan distance</li>
<li><b>\ref dist_norm2</b>: 2-norm/Euclidean distance</li>
<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance</li>
<li><b>\ref dist_angle</b>: angle between vectors</li>
<li><b>\ref dist_tanimoto</b>: Tanimoto distance</li>
</ul>
</dd>
<dt>algorithm (optional)</dt>
<dd>TEXT, default: 'optimized'. The name of the algorithm
used to compute clusters.
<ul>
<li><b>\ref brute_force</b>: Brute force can be slow and should
not be used for large datasets. However, it has less initial
setup overhead than the parallel-optimized algorithm, which may make
it faster in some cases involving small datasets. You can also use
a short form such as "b" or "brute" to select brute force.</li>
<li><b>\ref optimized</b>: This algorithm uses an r-tree index to accelerate range
queries while performing cluster detection. It is designed with Greenplum Database
in mind, in which case it intelligently partitions the space into a number of
connected regions, runs DBSCAN in parallel on each region, and then merges the
results together. By default, the maximum number of regions it will consider
is equal to the number of segments in the database. If you suspect it
is spending too much time on segmentation, this can be limited further
by setting the 'max_segmentation_depth' parameter to a lower value; doing so
decreases the segmentation overhead, but also decreases the amount of parallelism.
If the dataset is relatively small, another option to consider is brute force,
which has very little overhead but does not scale as well.
For large enough datasets and an appropriate choice of 'eps', this algorithm
runs in roughly O((N/S) log N) time, where N is the number of rows (points) in
the input and S is the number of regions (often equal to the number of segments,
but possibly fewer). The worst case for brute force is O(N^2).
Choose 'eps' based on the density of the dataset; DBSCAN works best when the
clusters have roughly uniform density. If 'eps' is chosen too large, the runtime
will explode and nearly every point will end up connected to every other point
in one large cluster. Best practice is to start with a relatively small 'eps'
so the first result returns faster; if there appear to be too many small
clusters, increase 'eps' and allow it to run longer.
See the example call after this argument list.</li>
</ul></dd>
<dt>max_segmentation_depth (optional)</dt>
<dd>INTEGER, default: the number of segments in the database. The maximum number
of regions into which to segment the data. See the <b>optimized</b> algorithm
above for usage.
</dd>
</dl>
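For instance, a call that uses the 'optimized' algorithm and caps the number of
regions might look like the following. This is only a sketch; the table name
'my_points' and its columns 'pid' and 'coords' are hypothetical placeholders:
<pre class="example">
SELECT madlib.dbscan(
    'my_points',      -- source table (placeholder name)
    'my_clusters',    -- output table to be created
    'pid',            -- unique integer id column
    'coords',         -- point coordinates as DOUBLE PRECISION[]
    1.5,              -- eps
    5,                -- min_samples
    'dist_norm2',     -- metric
    'optimized',      -- algorithm
    4);               -- max_segmentation_depth: use at most 4 regions
</pre>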
<b>Output</b>
<br>
The output table for the DBSCAN module has the following columns:
<table class="output">
<tr>
<th>id</th>
<td>SMALLINT|INTEGER|BIGINT. The unique id of each classified data point,
as given by the 'id_column' expression on the training dataset.</td>
</tr>
<tr>
<th>cluster_id</th>
<td>INTEGER. The cluster id of each classified data point.</td>
</tr>
<tr>
<th>is_core_point</th>
<td>BOOLEAN. Indicates whether the training data point is a core point.
If a point is listed in the output table but is not a core point,
it is a border point. Noise points are not included in this
table.</td>
</tr>
<tr>
<th>point</th>
<td>TEXT. The coordinates of each classified point.</td>
</tr>
</table>
@anchor assignment
@par Cluster Assignment
After clustering, the cluster assignment for each data point can be computed:
<pre class="syntax">
dbscan_predict( dbscan_table,
source_table,
id,
point,
output_table
)
</pre>
<b>Arguments</b>
<dl class="arglist">
<dt>dbscan_table</dt>
<dd>TEXT. Name of the table created by running DBSCAN.</dd>
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the input data points.
</dd>
<dt>id</dt>
<dd>VARCHAR. A column name or expression which selects a unique integer
id for each point in the source table.
</dd>
<dt>point</dt>
<dd>TEXT. Name of the column with point coordinates in array form,
or an expression that evaluates to an array of point coordinates.
</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the table to be created with the cluster assignment results.
</dd>
</dl>
<b>Output</b>
<br>
The output is a table with the following columns:
<table class="output">
<tr>
<th>id</th>
<td>BIGINT. The unique id of each row, as it was passed in.</td>
</tr>
<tr>
<th>cluster_id</th>
<td>INTEGER. Cluster assignment (zero-based, i.e., 0,1,2...).</td>
</tr>
<tr>
<th>distance</th>
<td>DOUBLE PRECISION. Distance from the point to the nearest point
in the cluster it is assigned to.</td>
</table>
@anchor examples
@par Examples
-# Create input data:
<pre class="example">
DROP TABLE IF EXISTS dbscan_train_data;
CREATE TABLE dbscan_train_data (pid int, points double precision[]);
INSERT INTO dbscan_train_data VALUES
(1, '{1, 1}'),
(2, '{2, 1}'),
(3, '{1, 2}'),
(4, '{2, 2}'),
(5, '{3, 5}'),
(6, '{3, 9}'),
(7, '{3, 10}'),
(8, '{4, 10}'),
(9, '{4, 11}'),
(10, '{5, 10}'),
(11, '{7, 10}'),
(12, '{10, 9}'),
(13, '{10, 6}'),
(14, '{9, 5}'),
(15, '{10, 5}'),
(16, '{11, 5}'),
(17, '{9, 4}'),
(18, '{10, 4}'),
(19, '{11, 4}'),
(20, '{10, 3}');
DROP TABLE IF EXISTS dbscan_test_data;
CREATE TABLE dbscan_test_data (pid int, points double precision[]);
INSERT INTO dbscan_test_data VALUES
(1, '{1, 2}'),
(2, '{2, 2}'),
(3, '{1, 3}'),
(4, '{2, 2}'),
(10, '{5, 11}'),
(11, '{7, 10}'),
(12, '{10, 9}'),
(13, '{10, 6}'),
(14, '{9, 5}'),
(15, '{10, 6}');
</pre>
-# Run DBSCAN using the brute force method with a Euclidean
distance function:
<pre class="example">
DROP TABLE IF EXISTS dbscan_result, dbscan_result_summary;
SELECT madlib.dbscan(
'dbscan_train_data', -- source table
'dbscan_result', -- output table
'pid', -- point id column
'points', -- data point
1.75, -- epsilon
4, -- min samples
'dist_norm2', -- metric
'brute_force'); -- algorithm
SELECT * FROM dbscan_result ORDER BY pid;
</pre>
<pre class="result">
pid | cluster_id | is_core_point | __points__
-----+------------+---------------+------------
1 | 0 | t | {1,1}
2 | 0 | t | {2,1}
3 | 0 | t | {1,2}
4 | 0 | t | {2,2}
6 | 1 | f | {3,9}
7 | 1 | t | {3,10}
8 | 1 | t | {4,10}
9 | 1 | t | {4,11}
10 | 1 | f | {5,10}
13 | 2 | t | {10,6}
14 | 2 | t | {9,5}
15 | 2 | t | {10,5}
16 | 2 | t | {11,5}
17 | 2 | t | {9,4}
18 | 2 | t | {10,4}
19 | 2 | t | {11,4}
20 | 2 | t | {10,3}
(17 rows)
</pre>
Three clusters are created. All points are core points
except for points 6 and 10, which are border points. Noise points
do not appear in the output table. To see the noise points,
you can use a query like:
<pre class="example">
SELECT l.* FROM dbscan_train_data l WHERE NOT EXISTS
(SELECT NULL FROM dbscan_result r WHERE r.pid = l.pid)
ORDER BY l.pid;
</pre>
<pre class="result">
pid | points
-----+--------
5 | {3,5}
11 | {7,10}
12 | {10,9}
</pre>
The summary table lists the 'eps' value and the distance metric used:
<pre class="example">
SELECT * FROM dbscan_result_summary;
</pre>
<pre class="result">
id | eps | metric
-----------+------+------------
pid | 1.75 | dist_norm2
</pre>
-# Find the cluster assignment for the test data points:
<pre class="example">
SELECT madlib.dbscan_predict(
'dbscan_result', -- from DBSCAN run
'dbscan_test_data', -- test dataset
'pid', -- point id column
'points', -- data point
'dbscan_predict_out' -- output table
);
SELECT * FROM dbscan_predict_out ORDER BY pid;
</pre>
<pre class="result">
pid | cluster_id | distance
-----+------------+----------
1 | 0 | 0
2 | 0 | 0
3 | 0 | 1
4 | 0 | 0
10 | 1 | 1
13 | 2 | 0
14 | 2 | 0
15 | 2 | 0
(8 rows)
</pre>
@anchor literature
@literature
[1] Martin Ester, Hans-Peter Kriegel, Jörg Sander, Xiaowei Xu,
"A Density-Based Algorithm for Discovering Clusters
in Large Spatial Databases with Noise", KDD-96 Proceedings,
https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf
[2] Erich Schubert, Jörg Sander, Martin Ester, Hans-Peter Kriegel, Xiaowei Xu,
"DBSCAN Revisited, Revisited: Why and How You Should (Still) Use DBSCAN",
ACM Transactions on Database Systems, July 2017, Article No. 19,
https://dl.acm.org/doi/10.1145/3068335
*/
m4_include(`SQLCommon.m4')
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER,
metric VARCHAR,
algorithm VARCHAR,
max_segmentation_depth INTEGER
) RETURNS VOID AS $$
PythonFunction(dbscan, dbscan, dbscan)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER,
metric VARCHAR,
algorithm VARCHAR
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, $8, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER,
metric VARCHAR
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, $7, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
point VARCHAR,
eps DOUBLE PRECISION,
min_samples INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, $6, NULL, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
source_table VARCHAR,
output_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
eps DOUBLE PRECISION
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.dbscan($1, $2, $3, $4, $5, NULL, NULL, NULL, NULL);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
dbscan_table VARCHAR,
source_table VARCHAR,
id_column VARCHAR,
expr_point VARCHAR,
output_table VARCHAR
) RETURNS VOID AS $$
PythonFunction(dbscan, dbscan, dbscan_predict)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan(
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_predict_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.dbscan_predict(
) RETURNS VARCHAR AS $$
PythonFunction(dbscan, dbscan, dbscan_predict_help)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
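-- State transition, merge, and final functions for the build_snowflake_table
-- aggregate defined below (used internally by the optimized DBSCAN algorithm).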
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sf_merge(
state1 BIGINT[][],
state2 BIGINT[][]
) RETURNS BIGINT[][] AS $$
PythonFunctionBodyOnlyNoSchema(`dbscan', `dbscan')
return dbscan.sf_merge(**globals())
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sf_final(
state BIGINT[][]
) RETURNS BIGINT[][] AS $$
PythonFunctionBodyOnlyNoSchema(`dbscan', `dbscan')
return dbscan.sf_final(**globals())
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sf_transition(
state BIGINT[],
src BIGINT,
dest BIGINT,
n_rows INTEGER[],
gp_segment_id INTEGER
) RETURNS BIGINT[][] AS $$
PythonFunctionBodyOnlyNoSchema(`dbscan', `dbscan')
return dbscan.sf_transition(**globals())
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.build_snowflake_table(
BIGINT,
BIGINT,
INTEGER[],
INTEGER);
CREATE AGGREGATE MADLIB_SCHEMA.build_snowflake_table(
BIGINT,
BIGINT,
INTEGER[],
INTEGER
)(
STYPE=BIGINT[][],
SFUNC=MADLIB_SCHEMA.sf_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.sf_merge,')
FINALFUNC=MADLIB_SCHEMA.sf_final
);
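-- Internal composite types: __dbscan_record describes one point (with its
-- cluster assignment and region ids) and __dbscan_edge holds a (src, dest) pair.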
DROP TYPE IF EXISTS MADLIB_SCHEMA.__dbscan_record CASCADE;
CREATE TYPE MADLIB_SCHEMA.__dbscan_record AS (
id BIGINT,
point DOUBLE PRECISION[],
is_core_point BOOL,
cluster_id BIGINT,
leaf_id INT,
dist_id INT
);
DROP TYPE IF EXISTS MADLIB_SCHEMA.__dbscan_edge CASCADE;
CREATE TYPE MADLIB_SCHEMA.__dbscan_edge AS (
src BIGINT,
dest BIGINT);
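-- Unpacks a 2-D BIGINT array of (src, dest) pairs into a set of __dbscan_edge rows.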
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_unpack_edge (packed BIGINT[][])
RETURNS SETOF MADLIB_SCHEMA.__dbscan_edge
AS $$
return packed
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
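-- ln(x) that returns a large negative sentinel (-1000) for non-positive input
-- instead of raising an error.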
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_safe_ln(x DOUBLE PRECISION)
RETURNS DOUBLE PRECISION AS
$$
SELECT CASE WHEN $1 > 0.0
THEN ln($1)
ELSE -1000.0
END
$$ LANGUAGE SQL IMMUTABLE;
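-- exp(x) that returns 0 for very negative input instead of raising an underflow error.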
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_safe_exp(x DOUBLE PRECISION)
RETURNS DOUBLE PRECISION AS
$$
SELECT CASE WHEN $1 > -600.0
THEN exp($1)
ELSE 0.0
END
$$ LANGUAGE SQL IMMUTABLE;
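-- Aggregate computing the product of a set of DOUBLE PRECISION values.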
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.prod(DOUBLE PRECISION);
CREATE AGGREGATE MADLIB_SCHEMA.prod(DOUBLE PRECISION)
(
sfunc = float8mul,
stype = DOUBLE PRECISION
);
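-- Volume of one side of a proposed split: the fraction eps_bin / eps_bins
-- of the full volume V.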
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_split_volume(
V DOUBLE PRECISION, -- $1
eps_bin DOUBLE PRECISION, -- $2
eps_bins DOUBLE PRECISION -- $3
) RETURNS DOUBLE PRECISION
AS
$$ SELECT
$2 / $3 * $1
$$ LANGUAGE SQL IMMUTABLE;
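-- Estimates the expected number of eps-neighbors of a point, assuming
-- internal_points are spread uniformly over a D-dimensional region of volume V:
-- internal_points * eps^D / V (or 1.0 when V is not positive).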
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_estimate_neighbors(
internal_points DOUBLE PRECISION, -- $1
V DOUBLE PRECISION, -- $2
eps DOUBLE PRECISION, -- $3
D BIGINT -- $4
) RETURNS DOUBLE PRECISION
AS
$$ SELECT
CASE WHEN $2 > 0.0
THEN
$1 * ($3^$4 / $2)
ELSE
1.0
END
$$ LANGUAGE SQL IMMUTABLE;
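-- Cost estimate for a single range query: the greater of the expected neighbor
-- count and ceil(log2(internal_points)).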
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_range_query_penalty(
internal_points BIGINT, -- $1
neighbors DOUBLE PRECISION -- $2
) RETURNS DOUBLE PRECISION
AS
$$ SELECT
greatest(
$2,
CASE WHEN $1 > 2
-- 60x faster than log(internal_points, 2.0)
-- due to DOUBLE PRECISION types instead of NUMERIC
THEN ceil(ln($1::DOUBLE PRECISION) / ln(2.0::DOUBLE PRECISION))
ELSE 1.0
END
)
$$ LANGUAGE SQL IMMUTABLE;
DROP TYPE IF EXISTS MADLIB_SCHEMA.__dbscan_losses;
CREATE TYPE MADLIB_SCHEMA.__dbscan_losses AS (
left_avail BIGINT,
right_avail BIGINT,
left_loss DOUBLE PRECISION,
right_loss DOUBLE PRECISION
);
-- Decide how much of node's available segments to allocate
-- to its left child. The rest will be allocated to its
-- right child
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_allocate_segs(
avail_segs BIGINT, -- $1
left_points DOUBLE PRECISION, -- $2
right_points DOUBLE PRECISION, -- $3
rqp_left DOUBLE PRECISION, -- $4
rqp_right DOUBLE PRECISION -- $5
) RETURNS BIGINT AS
$$
WITH p(left_penalty, right_penalty) AS (
SELECT
$4 * $2,
$5 * $3
)
SELECT
round(
left_penalty * $1 / (left_penalty + right_penalty)
)::BIGINT
FROM p
$$ LANGUAGE SQL IMMUTABLE;
-- Given internal and external point counts in left and right regions,
-- returns optimal number of segments to allocate to left and right
-- nodes, as well as their losses.
--
-- TODO: we can probably remove range-query neighbor penalty for
-- external points if we use rtree.count() instead of
-- rtree.intersection(). Just need to finish adding the
-- per-cluster rtrees
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_segmentation_loss(
left_internal BIGINT, -- $1
right_internal BIGINT, -- $2
left_external BIGINT, -- $3
right_external BIGINT, -- $4
avail_segs BIGINT, -- $5
V DOUBLE PRECISION, -- $6
eps DOUBLE PRECISION, -- $7
D BIGINT, -- $8
eps_bin DOUBLE PRECISION, -- $9
eps_bins DOUBLE PRECISION -- $10
) RETURNS MADLIB_SCHEMA.__dbscan_losses AS
$$
WITH __local AS (
SELECT
($1 + $3) left_points,
($2 + $4) right_points,
$9 left_eps_bins,
$10 - $9 right_eps_bins,
$10 all_eps_bins
), penalties AS (
SELECT
MADLIB_SCHEMA.__dbscan_range_query_penalty(
$1 + $2,
MADLIB_SCHEMA.__dbscan_estimate_neighbors(
$1 + $2,
$6, $7, $8
)
) rqp_all,
MADLIB_SCHEMA.__dbscan_range_query_penalty(
$1,
MADLIB_SCHEMA.__dbscan_estimate_neighbors(
$1,
MADLIB_SCHEMA.__dbscan_split_volume(
$6,
left_eps_bins,
all_eps_bins
),
$7,
$8
)
) rqp_left,
MADLIB_SCHEMA.__dbscan_range_query_penalty(
$2,
MADLIB_SCHEMA.__dbscan_estimate_neighbors(
$2,
MADLIB_SCHEMA.__dbscan_split_volume(
$6,
left_eps_bins,
all_eps_bins
), $7, $8
)
) rqp_right
FROM __local
), seg_alloc(left_avail, all_penalty) AS (
SELECT
MADLIB_SCHEMA.__dbscan_allocate_segs(
$5,
left_points,
right_points,
rqp_left,
rqp_right
),
(left_points + right_points) * rqp_all
FROM __local, penalties
)
SELECT
CASE WHEN
0 < left_avail AND left_avail < $5
THEN
ROW(
left_avail,
$5 - left_avail,
(left_points * 1.0 / left_avail) * rqp_left,
(right_points * 1.0 / ($5 - left_avail)) * rqp_right
)::MADLIB_SCHEMA.__dbscan_losses
ELSE -- forget about splitting, not worth it
ROW(
$5,
0,
-- We intentionally do not divide by avail_segs,
-- because if we choose not to split, then there can be no
-- more division later on
(left_points + right_points) * rqp_all,
0.0
)::MADLIB_SCHEMA.__dbscan_losses
END
FROM __local, penalties, seg_alloc
$$ LANGUAGE SQL IMMUTABLE;
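-- Runs DBSCAN on the points routed to a single leaf (region): consumes
-- __dbscan_record rows and returns them with cluster ids and core-point flags
-- assigned. Implemented in Python by dbscan.dbscan_leaf().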
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dbscan_leaf(
db_rec MADLIB_SCHEMA.__dbscan_record,
eps REAL,
min_samples INT,
metric TEXT,
num_internal_points BIGINT,
num_external_points BIGINT
) RETURNS SETOF MADLIB_SCHEMA.__dbscan_record AS
$$
global SD
args = [db_rec, eps, min_samples, metric, num_internal_points, num_external_points]
args.append(SD)
import plpy
PythonFunctionBodyOnlyNoSchema(dbscan,dbscan)
WithTracebackForwarding(return dbscan.dbscan_leaf(*args))
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');