blob: d3c19292cbf3db0a17115be798b9089638ad3f0d [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*//* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- *//**
*
* @file knn.sql_in
* @brief Set of functions for k-nearest neighbors.
* @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification,
* see the module description \ref grp_knn.
*
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_knn
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#knn">K-Nearest Neighbors</a></li>
<li class="level1"><a href="#usage">Usage</a></li>
<li class="level1"><a href="#output">Output Format</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression.
\warning <em> This MADlib method is still in early stage development. There may be some
issues that will be addressed in a future version. Interface and implementation
are subject to change. </em>
@anchor knn
K-nearest neighbors is a method for finding the k closest points to a
given data point in terms of a given metric. Its input consists of
data points as features from testing examples, and it
looks for k closest points in the training set for each of the data
points in test set. The output of KNN depends on the type of task.
For classification, the output is the majority vote of the classes of
the k nearest data points. That is, the testing example gets assigned the
most popular class from the nearest neighbors.
For regression, the output is the average of the values of k nearest
neighbors of the given test point.
@anchor usage
@par Usage
<pre class="syntax">
knn( point_source,
point_column_name,
label_column_name,
test_source,
test_column_name,
id_column_name,
output_table,
operation,
k
)
</pre>
\b Arguments
<dl class="arglist">
<dt>point_source</dt>
<dd>TEXT. Name of the table containing the training data points.
Training data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>point_column_name</dt>
<dd>TEXT. Name of the column with training data points.</dd>
<dt>label_column_name</dt>
<dd>TEXT. Name of the column with labels/values of training data points.</dd>
<dt>test_source</dt>
<dd>TEXT. Name of the table containing the test data points.
Testing data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>test_column_name</dt>
<dd>TEXT. Name of the column with testing data points.</dd>
<dt>id_column_name</dt>
<dd>TEXT. Name of the column having ids of data points in test data table.</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the table to store final results.</dd>
<dt>operation</dt>
<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>
<dt>k (optional)</dt>
<dd>INTEGER. default: 1. Number of nearest neighbors to consider.
For classification, should be an odd number to break ties.</dd>
</dl>
@anchor output
@par Output Format
The output of the KNN module is a table with the following columns:
<table class="output">
<tr>
<th>id</th>
<td>INTEGER. The ids of test data points.</td>
</tr>
<tr>
<th>test_column_name</th>
<td>DOUBLE PRECISION[]. The test data points.</td>
</tr>
<tr>
<th>prediction</th>
<td>INTEGER. Label in case of classification, average value in case of regression.</td>
</tr>
</table>
@anchor examples
@examp
-# Prepare some training data:
<pre class="example">
DROP TABLE IF EXISTS knn_train_data;
CREATE TABLE knn_train_data (
id integer,
data integer[],
label float
);
INSERT INTO knn_train_data VALUES
(1, '{1,1}', 1.0),
(2, '{2,2}', 1.0),
(3, '{3,3}', 1.0),
(4, '{4,4}', 1.0),
(5, '{4,5}', 1.0),
(6, '{20,50}', 0.0),
(7, '{10,31}', 0.0),
(8, '{81,13}', 0.0),
(9, '{1,111}', 0.0);
</pre>
-# Prepare some testing data:
<pre class="example">
DROP TABLE IF EXISTS knn_test_data;
CREATE TABLE knn_test_data (
id integer,
data integer[]
);
INSERT INTO knn_test_data VALUES
(1, '{2,1}'),
(2, '{2,6}'),
(3, '{15,40}'),
(4, '{12,1}'),
(5, '{2,90}'),
(6, '{50,45}');
</pre>
-# Run KNN for classification:
<pre class="example">
DROP TABLE IF EXISTS madlib_knn_result_classification;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'madlib_knn_result_classification', -- Output table
'c', -- Classification
3 -- Number of nearest neighbours
);
SELECT * from madlib_knn_result_classification ORDER BY id;
</pre>
Result:
<pre class="result">
id | data | prediction
----+---------+------------
1 | {2,1} | 1
2 | {2,6} | 1
3 | {15,40} | 0
4 | {12,1} | 1
5 | {2,90} | 0
6 | {50,45} | 0
(6 rows)
</pre>
-# Run KNN for regression:
<pre class="example">
DROP TABLE IF EXISTS madlib_knn_result_regression;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'madlib_knn_result_regression', -- Output table
'r', -- Regressions
3 -- Number of nearest neighbours
);
SELECT * from madlib_knn_result_regression ORDER BY id;
</pre>
Result:
<pre class="result">
id | data | prediction
----+---------+-------------------
1 | {2,1} | 1
2 | {2,6} | 1
3 | {15,40} | 0.333333333333333
4 | {12,1} | 1
5 | {2,90} | 0
6 | {50,45} | 0
(6 rows)
</pre>
@anchor background
@par Technical Background
The training data points are vectors in a multidimensional feature space,
each with a class label. The training phase of the algorithm consists
only of storing the feature vectors and class labels of the training points.
In the classification phase, k is a user-defined constant, and an unlabeled
vector (a test point) is classified by assigning the label which is most
frequent among the k training samples nearest to that test point.
In case of regression, average of the values of these k training samples
is assigned to the test point.
The only distance metric supported in this version is MADlib's squared_dist_norm2.
Other distance metrics will be added in a future release of this module.
@anchor literature
@literature
@anchor knn-lit-1
[1] Wikipedia, k-nearest neighbors algorithm,
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
@anchor knn-lit-2
[2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression
http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf
@anchor knn-lit-3
[3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
@internal
@sa namespace knn (documenting the implementation in Python)
@endinternal
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
point_source VARCHAR,
point_column_name VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
id_column_name VARCHAR,
output_table VARCHAR,
operation VARCHAR,
k INTEGER
) RETURNS INTEGER AS $$
PythonFunctionBodyOnly(`knn', `knn')
return knn.knn_validate_src(
schema_madlib,
point_source,
point_column_name,
label_column_name,
test_source,
test_column_name,
id_column_name,
output_table,
operation,
k
)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
arg1 VARCHAR
) RETURNS VOID AS $$
BEGIN
IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
RAISE NOTICE
'
-----------------------------------------------------------------------
USAGE
-----------------------------------------------------------------------
SELECT {schema_madlib}.knn(
point_source, -- Training data table having training features as vector column and labels
point_column_name, -- Name of column having feature vectors in training data table
label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table
test_source, -- Test data table having features as vector column. Id of features is mandatory
test_column_name, -- Name of column having feature vectors in test data table
id_column_name, -- Name of column having feature vector Ids in test data table
output_table, -- Name of output table
operation, -- c for classification task, r for regression task
k -- value of k. Default will go as 1
);
-----------------------------------------------------------------------
OUTPUT
-----------------------------------------------------------------------
The output of the KNN module is a table with the following columns:
id The ids of test data points.
test_column_name The test data points.
prediction The output of KNN- label in case of classification, average value in case of regression.
';
END IF;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
) RETURNS VOID AS $$
BEGIN
RAISE NOTICE '
k-Nearest Neighbors is a method for finding k closest points to a given data
point in terms of a given metric. Its input consist of data points as features
from testing examples. For a given k, it looks for k closest points in
training set for each of the data points in test set. Algorithm generates one
output per testing example. The output of KNN depends on the type of task:
For Classification, the output is majority vote of the classes of the k
nearest data points. The testing example gets assigned the most popular class
among nearest neighbors. For Regression, the output is average of the values
of k nearest neighbors of the given testing example.
';
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
id_column_name VARCHAR,
output_table VARCHAR,
operation VARCHAR,
k INTEGER
) RETURNS VARCHAR AS $$
DECLARE
l FLOAT;
id INTEGER;
vector DOUBLE PRECISION[];
cur_pid integer;
oldClientMinMessages VARCHAR;
returnstring VARCHAR;
x_temp_table VARCHAR;
y_temp_table VARCHAR;
k_val INTEGER;
label_column_name_unique VARCHAR;
test_id VARCHAR;
convert_boolean_to_int VARCHAR;
BEGIN
oldClientMinMessages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val;
PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
label_column_name_unique := 'label'||md5('knn_'||now()::text||random()::text)||'_name';
test_id := 'id'||md5('knn_'||now()::text||random()::text)||'_name';
convert_boolean_to_int := '';
IF (operation = 'c') THEN
convert_boolean_to_int := '::INTEGER';
END IF;
EXECUTE
$sql$
DROP TABLE IF EXISTS pg_temp.madlib_knn_interm;
CREATE TABLE pg_temp.madlib_knn_interm AS
SELECT *
FROM
(
SELECT row_number() over (partition by $sql$ || test_id || $sql$ order by dist) AS r, $sql$ || x_temp_table || $sql$.*
FROM
(
SELECT test.$sql$ || id_column_name || $sql$ AS $sql$ || test_id || $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) AS dist, train.$sql$ || label_column_name || $sql$ $sql$ || convert_boolean_to_int || $sql$ AS $sql$ || label_column_name_unique || $sql$
FROM $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
)$sql$ || x_temp_table || $sql$
)$sql$ || y_temp_table || $sql$
WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k_val;
IF (operation = 'c') THEN
EXECUTE
$sql$
CREATE TABLE $sql$ || output_table || $sql$ AS
SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name_unique || $sql$) AS prediction
FROM pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ ON $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name;
ELSE
EXECUTE
$sql$
CREATE TABLE $sql$ || output_table || $sql$ AS
SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, avg($sql$ || label_column_name_unique || $sql$) AS prediction
FROM
pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name || $sql$
ORDER BY $sql$ || test_id || $sql$ $sql$;
END IF;
EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
IF (operation = 'c') THEN
returnstring := 'The classification results have been written to output table '||output_table;
ELSE
returnstring := 'The regression results have been written to output table '||output_table;
END IF;
DROP TABLE pg_temp.madlib_knn_interm;
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
id_column_name VARCHAR,
output_table VARCHAR,
operation VARCHAR
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');