src/ports/postgres/modules/knn/knn.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *//* ----------------------------------------------------------------------- */


 /* ----------------------------------------------------------------------- *//**
  *
  * @file knn.sql_in
  * @brief Set of functions for k-nearest neighbors.
  * @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification,
  * see the module description \ref grp_knn.
  *
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')


 /**
 @addtogroup grp_knn

 <div class="toc"><b>Contents</b>
 <ul>
 <li class="level1"><a href="#knn">K-Nearest Neighbors</a></li>
 <li class="level1"><a href="#usage">Usage</a></li>
 <li class="level1"><a href="#output">Output Format</a></li>
 <li class="level1"><a href="#examples">Examples</a></li>
 <li class="level1"><a href="#background">Technical Background</a></li>
 <li class="level1"><a href="#literature">Literature</a></li>
 </ul>
 </div>

 @brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression.

 \warning <em> This MADlib method is still in early stage development. There may be some
 issues that will be addressed in a future version. Interface and implementation
 are subject to change. </em>

 @anchor knn

 K-nearest neighbors is a method for finding the k closest points to a
 given data point in terms of a given metric. Its input consists of
 data points as features from testing examples, and it
 looks for k closest points in the training set for each of the data
 points in test set.  The output of KNN depends on the type of task.
 For classification, the output is the majority vote of the classes of
 the k nearest data points. That is, the testing example gets assigned the
 most popular class from the nearest neighbors.
 For regression, the output is the average of the values of k nearest
 neighbors of the given test point.

 @anchor usage
 @par Usage
 <pre class="syntax">
 knn( point_source,
      point_column_name,
      label_column_name,
      test_source,
      test_column_name,
      id_column_name,
      output_table,
      operation,
      k
    )
 </pre>

 \b Arguments
 <dl class="arglist">
 <dt>point_source</dt>
 <dd>TEXT. Name of the table containing the training data points.

 Training data points are expected to be stored row-wise
 in a column of type <tt>DOUBLE PRECISION[]</tt>.
 </dd>

 <dt>point_column_name</dt>
 <dd>TEXT. Name of the column with training data points.</dd>

 <dt>label_column_name</dt>
 <dd>TEXT. Name of the column with labels/values of training data points.</dd>

 <dt>test_source</dt>
 <dd>TEXT. Name of the table containing the test data points.

 Testing data points are expected to be stored row-wise
 in a column of type <tt>DOUBLE PRECISION[]</tt>.
 </dd>

 <dt>test_column_name</dt>
 <dd>TEXT. Name of the column with testing data points.</dd>

 <dt>id_column_name</dt>
 <dd>TEXT. Name of the column having ids of data points in test data table.</dd>

 <dt>output_table</dt>
 <dd>TEXT. Name of the table to store final results.</dd>

 <dt>operation</dt>
 <dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>

 <dt>k (optional)</dt>
 <dd>INTEGER. default: 1. Number of nearest neighbors to consider.
 For classification, should be an odd number to break ties.</dd>

 </dl>


 @anchor output
 @par Output Format

 The output of the KNN module is a table with the following columns:
 <table class="output">
     <tr>
         <th>id</th>
         <td>INTEGER. The ids of test data points.</td>
     </tr>
     <tr>
         <th>test_column_name</th>
         <td>DOUBLE PRECISION[]. The test data points.</td>
     </tr>
     <tr>
         <th>prediction</th>
         <td>INTEGER. Label in case of classification, average value in case of regression.</td>
     </tr>
 </table>


 @anchor examples
 @examp

 -#  Prepare some training data:
 <pre class="example">
 DROP TABLE IF EXISTS knn_train_data;
 CREATE TABLE knn_train_data (
                     id integer,
                     data integer[],
                     label float
                     );
 INSERT INTO knn_train_data VALUES
 (1, '{1,1}', 1.0),
 (2, '{2,2}', 1.0),
 (3, '{3,3}', 1.0),
 (4, '{4,4}', 1.0),
 (5, '{4,5}', 1.0),
 (6, '{20,50}', 0.0),
 (7, '{10,31}', 0.0),
 (8, '{81,13}', 0.0),
 (9, '{1,111}', 0.0);
 </pre>

 -#  Prepare some testing data:
 <pre class="example">
 DROP TABLE IF EXISTS knn_test_data;
 CREATE TABLE knn_test_data (
                     id integer,
                     data integer[]
                     );
 INSERT INTO knn_test_data VALUES
 (1, '{2,1}'),
 (2, '{2,6}'),
 (3, '{15,40}'),
 (4, '{12,1}'),
 (5, '{2,90}'),
 (6, '{50,45}');
 </pre>

 -#  Run KNN for classification:
 <pre class="example">
 DROP TABLE IF EXISTS madlib_knn_result_classification;
 SELECT * FROM madlib.knn(
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data
                 'madlib_knn_result_classification',  -- Output table
                 'c',                   -- Classification
                  3                     -- Number of nearest neighbours
                 );
 SELECT * from madlib_knn_result_classification ORDER BY id;
 </pre>
 Result:
 <pre class="result">
  id |  data   | prediction
 ----+---------+------------
   1 | {2,1}   |          1
   2 | {2,6}   |          1
   3 | {15,40} |          0
   4 | {12,1}  |          1
   5 | {2,90}  |          0
   6 | {50,45} |          0
 (6 rows)
 </pre>

 -#  Run KNN for regression:
 <pre class="example">
 DROP TABLE IF EXISTS madlib_knn_result_regression;
 SELECT * FROM madlib.knn(
                 'knn_train_data',      -- Table of training data
                 'data',                -- Col name of training data
                 'label',               -- Training labels
                 'knn_test_data',       -- Table of test data
                 'data',                -- Col name of test data
                 'id',                  -- Col name of id in test data
                 'madlib_knn_result_regression',  -- Output table
                 'r',                   -- Regressions
                  3                     -- Number of nearest neighbours
                 );
 SELECT * from madlib_knn_result_regression ORDER BY id;
 </pre>
 Result:
 <pre class="result">
  id |  data   |    prediction
 ----+---------+-------------------
   1 | {2,1}   |                 1
   2 | {2,6}   |                 1
   3 | {15,40} | 0.333333333333333
   4 | {12,1}  |                 1
   5 | {2,90}  |                 0
   6 | {50,45} |                 0
 (6 rows)
 </pre>

 @anchor background
 @par Technical Background

 The training data points are vectors in a multidimensional feature space,
 each with a class label. The training phase of the algorithm consists
 only of storing the feature vectors and class labels of the training points.

 In the classification phase, k is a user-defined constant, and an unlabeled
 vector (a test point) is classified by assigning the label which is most
 frequent among the k training samples nearest to that test point.
 In case of regression, average of the values of these k training samples
 is assigned to the test point.
 The only distance metric supported in this version is MADlib's squared_dist_norm2.
 Other distance metrics will be added in a future release of this module.


 @anchor literature
 @literature

 @anchor knn-lit-1
 [1] Wikipedia, k-nearest neighbors algorithm,
     https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

 @anchor knn-lit-2
 [2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression
     http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf

 @anchor knn-lit-3
 [3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
     https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf

 @internal
 @sa namespace knn (documenting the implementation in Python)
 @endinternal
 */

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
     point_source VARCHAR,
     point_column_name VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
     id_column_name VARCHAR,
     output_table VARCHAR,
     operation VARCHAR,
     k INTEGER
 ) RETURNS INTEGER AS $$
     PythonFunctionBodyOnly(`knn', `knn')
     return knn.knn_validate_src(
         schema_madlib,
         point_source,
         point_column_name,
         label_column_name,
         test_source,
         test_column_name,
         id_column_name,
         output_table,
         operation,
         k
     )
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     arg1 VARCHAR
 ) RETURNS VOID AS $$
 BEGIN
     IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
 	RAISE NOTICE
 '
 -----------------------------------------------------------------------
                             USAGE
 -----------------------------------------------------------------------
 SELECT {schema_madlib}.knn(
     point_source,       -- Training data table having training features as vector column and labels
     point_column_name,  -- Name of column having feature vectors in training data table
     label_column_name,  -- Name of column having actual label/vlaue for corresponding feature vector in training data table
     test_source,        -- Test data table having features as vector column. Id of features is mandatory
     test_column_name,   -- Name of column having feature vectors in test data table
     id_column_name,     -- Name of column having feature vector Ids in test data table
     output_table,       -- Name of output table
     operation,          -- c for classification task, r for regression task
     k                   -- value of k. Default will go as 1
     );

 -----------------------------------------------------------------------
                             OUTPUT
 -----------------------------------------------------------------------
 The output of the KNN module is a table with the following columns:

 id                  The ids of test data points.
 test_column_name    The test data points.
 prediction          The output of KNN- label in case of classification, average value in case of regression.
 ';
     END IF;
 END;
 $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
 ) RETURNS VOID AS $$
 BEGIN
     RAISE NOTICE '
 k-Nearest Neighbors is a method for finding k closest points to a given data
 point in terms of a given metric. Its input consist of data points as features
 from testing examples. For a given k, it looks for k closest points in
 training set for each of the data points in test set. Algorithm generates one
 output per testing example. The output of KNN depends on the type of task:
 For Classification, the output is majority vote of the classes of the k
 nearest data points. The testing example gets assigned the most popular class
 among nearest neighbors. For Regression, the output is average of the values
 of k nearest neighbors of the given testing example.
     ';
 END;
 $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
     id_column_name VARCHAR,
     output_table VARCHAR,
     operation VARCHAR,
     k INTEGER
 ) RETURNS VARCHAR AS $$
 DECLARE
     l FLOAT;
     id INTEGER;
     vector DOUBLE PRECISION[];
     cur_pid integer;
     oldClientMinMessages VARCHAR;
     returnstring VARCHAR;
     x_temp_table VARCHAR;
     y_temp_table VARCHAR;
     k_val INTEGER;
     label_column_name_unique VARCHAR;
     test_id VARCHAR;
     convert_boolean_to_int VARCHAR;
 BEGIN
     oldClientMinMessages :=
         (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
     EXECUTE 'SET client_min_messages TO warning';
     SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val;
     PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
     x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
     y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp';
     label_column_name_unique := 'label'||md5('knn_'||now()::text||random()::text)||'_name';
     test_id := 'id'||md5('knn_'||now()::text||random()::text)||'_name';

     convert_boolean_to_int := '';
     IF (operation = 'c') THEN
         convert_boolean_to_int := '::INTEGER';
     END IF;

     EXECUTE
         $sql$
 	DROP TABLE IF EXISTS pg_temp.madlib_knn_interm;
 	CREATE TABLE pg_temp.madlib_knn_interm AS
 	SELECT *
     FROM
         (
         SELECT row_number() over (partition by $sql$ || test_id || $sql$ order by dist) AS r, $sql$ || x_temp_table || $sql$.*
         FROM
             (
                 SELECT test.$sql$ || id_column_name || $sql$ AS $sql$ || test_id || $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) AS dist, train.$sql$ || label_column_name || $sql$ $sql$ || convert_boolean_to_int || $sql$ AS $sql$ || label_column_name_unique || $sql$
                 FROM $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test
             )$sql$ || x_temp_table || $sql$
         )$sql$ || y_temp_table || $sql$
     WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k_val;

     IF (operation = 'c') THEN
     	EXECUTE
         $sql$
     	CREATE TABLE $sql$ || output_table || $sql$ AS
         SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name_unique || $sql$) AS prediction
         FROM pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ ON $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
         GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name;
     ELSE
         EXECUTE
         $sql$
 	    CREATE TABLE $sql$ || output_table || $sql$ AS
         SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, avg($sql$ || label_column_name_unique || $sql$) AS prediction
         FROM
             pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$
         GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name || $sql$
         ORDER BY $sql$ || test_id || $sql$ $sql$;
     END IF;

    EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages;
    IF (operation = 'c') THEN
    	returnstring := 'The classification results have been written to output table '||output_table;
    ELSE
         returnstring := 'The regression results have been written to output table '||output_table;
    END IF;
    DROP TABLE pg_temp.madlib_knn_interm;
    RETURN returnstring;
 END;
 $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
     point_source VARCHAR,
     point_column_name VARCHAR,
     label_column_name VARCHAR,
     test_source VARCHAR,
     test_column_name VARCHAR,
     id_column_name VARCHAR,
     output_table VARCHAR,
     operation VARCHAR
 ) RETURNS VARCHAR AS $$
 DECLARE
     returnstring VARCHAR;
 BEGIN
     returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
     RETURN returnstring;
 END;
 $$ LANGUAGE plpgsql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
	/* ----------------------------------------------------------------------- //*
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	// ----------------------------------------------------------------------- */


	/* ----------------------------------------------------------------------- //*
	*
	* @file knn.sql_in
	* @brief Set of functions for k-nearest neighbors.
	* @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification,
	* see the module description \ref grp_knn.
	*
	*
	// ----------------------------------------------------------------------- */

	m4_include(`SQLCommon.m4')


	/**
	@addtogroup grp_knn

	<div class="toc"><b>Contents</b>
	<ul>
	<li class="level1"><a href="#knn">K-Nearest Neighbors</a></li>
	<li class="level1"><a href="#usage">Usage</a></li>
	<li class="level1"><a href="#output">Output Format</a></li>
	<li class="level1"><a href="#examples">Examples</a></li>
	<li class="level1"><a href="#background">Technical Background</a></li>
	<li class="level1"><a href="#literature">Literature</a></li>
	</ul>
	</div>

	@brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression.

	\warning <em> This MADlib method is still in early stage development. There may be some
	issues that will be addressed in a future version. Interface and implementation
	are subject to change. </em>

	@anchor knn

	K-nearest neighbors is a method for finding the k closest points to a
	given data point in terms of a given metric. Its input consists of
	data points as features from testing examples, and it
	looks for k closest points in the training set for each of the data
	points in test set. The output of KNN depends on the type of task.
	For classification, the output is the majority vote of the classes of
	the k nearest data points. That is, the testing example gets assigned the
	most popular class from the nearest neighbors.
	For regression, the output is the average of the values of k nearest
	neighbors of the given test point.

	@anchor usage
	@par Usage
	<pre class="syntax">
	knn( point_source,
	point_column_name,
	label_column_name,
	test_source,
	test_column_name,
	id_column_name,
	output_table,
	operation,
	k
	)
	</pre>

	\b Arguments
	<dl class="arglist">
	<dt>point_source</dt>
	<dd>TEXT. Name of the table containing the training data points.

	Training data points are expected to be stored row-wise
	in a column of type <tt>DOUBLE PRECISION[]</tt>.
	</dd>

	<dt>point_column_name</dt>
	<dd>TEXT. Name of the column with training data points.</dd>

	<dt>label_column_name</dt>
	<dd>TEXT. Name of the column with labels/values of training data points.</dd>

	<dt>test_source</dt>
	<dd>TEXT. Name of the table containing the test data points.

	Testing data points are expected to be stored row-wise
	in a column of type <tt>DOUBLE PRECISION[]</tt>.
	</dd>

	<dt>test_column_name</dt>
	<dd>TEXT. Name of the column with testing data points.</dd>

	<dt>id_column_name</dt>
	<dd>TEXT. Name of the column having ids of data points in test data table.</dd>

	<dt>output_table</dt>
	<dd>TEXT. Name of the table to store final results.</dd>

	<dt>operation</dt>
	<dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd>

	<dt>k (optional)</dt>
	<dd>INTEGER. default: 1. Number of nearest neighbors to consider.
	For classification, should be an odd number to break ties.</dd>

	</dl>


	@anchor output
	@par Output Format

	The output of the KNN module is a table with the following columns:
	<table class="output">
	<tr>
	<th>id</th>
	<td>INTEGER. The ids of test data points.</td>
	</tr>
	<tr>
	<th>test_column_name</th>
	<td>DOUBLE PRECISION[]. The test data points.</td>
	</tr>
	<tr>
	<th>prediction</th>
	<td>INTEGER. Label in case of classification, average value in case of regression.</td>
	</tr>
	</table>


	@anchor examples
	@examp

	-# Prepare some training data:
	<pre class="example">
	DROP TABLE IF EXISTS knn_train_data;
	CREATE TABLE knn_train_data (
	id integer,
	data integer[],
	label float
	);
	INSERT INTO knn_train_data VALUES
	(1, '{1,1}', 1.0),
	(2, '{2,2}', 1.0),
	(3, '{3,3}', 1.0),
	(4, '{4,4}', 1.0),
	(5, '{4,5}', 1.0),
	(6, '{20,50}', 0.0),
	(7, '{10,31}', 0.0),
	(8, '{81,13}', 0.0),
	(9, '{1,111}', 0.0);
	</pre>

	-# Prepare some testing data:
	<pre class="example">
	DROP TABLE IF EXISTS knn_test_data;
	CREATE TABLE knn_test_data (
	id integer,
	data integer[]
	);
	INSERT INTO knn_test_data VALUES
	(1, '{2,1}'),
	(2, '{2,6}'),
	(3, '{15,40}'),
	(4, '{12,1}'),
	(5, '{2,90}'),
	(6, '{50,45}');
	</pre>

	-# Run KNN for classification:
	<pre class="example">
	DROP TABLE IF EXISTS madlib_knn_result_classification;
	SELECT * FROM madlib.knn(
	'knn_train_data', -- Table of training data
	'data', -- Col name of training data
	'label', -- Training labels
	'knn_test_data', -- Table of test data
	'data', -- Col name of test data
	'id', -- Col name of id in test data
	'madlib_knn_result_classification', -- Output table
	'c', -- Classification
	3 -- Number of nearest neighbours
	);
	SELECT * from madlib_knn_result_classification ORDER BY id;
	</pre>
	Result:
	<pre class="result">
	id \| data \| prediction
	----+---------+------------
	1 \| {2,1} \| 1
	2 \| {2,6} \| 1
	3 \| {15,40} \| 0
	4 \| {12,1} \| 1
	5 \| {2,90} \| 0
	6 \| {50,45} \| 0
	(6 rows)
	</pre>

	-# Run KNN for regression:
	<pre class="example">
	DROP TABLE IF EXISTS madlib_knn_result_regression;
	SELECT * FROM madlib.knn(
	'knn_train_data', -- Table of training data
	'data', -- Col name of training data
	'label', -- Training labels
	'knn_test_data', -- Table of test data
	'data', -- Col name of test data
	'id', -- Col name of id in test data
	'madlib_knn_result_regression', -- Output table
	'r', -- Regressions
	3 -- Number of nearest neighbours
	);
	SELECT * from madlib_knn_result_regression ORDER BY id;
	</pre>
	Result:
	<pre class="result">
	id \| data \| prediction
	----+---------+-------------------
	1 \| {2,1} \| 1
	2 \| {2,6} \| 1
	3 \| {15,40} \| 0.333333333333333
	4 \| {12,1} \| 1
	5 \| {2,90} \| 0
	6 \| {50,45} \| 0
	(6 rows)
	</pre>

	@anchor background
	@par Technical Background

	The training data points are vectors in a multidimensional feature space,
	each with a class label. The training phase of the algorithm consists
	only of storing the feature vectors and class labels of the training points.

	In the classification phase, k is a user-defined constant, and an unlabeled
	vector (a test point) is classified by assigning the label which is most
	frequent among the k training samples nearest to that test point.
	In case of regression, average of the values of these k training samples
	is assigned to the test point.
	The only distance metric supported in this version is MADlib's squared_dist_norm2.
	Other distance metrics will be added in a future release of this module.


	@anchor literature
	@literature

	@anchor knn-lit-1
	[1] Wikipedia, k-nearest neighbors algorithm,
	https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

	@anchor knn-lit-2
	[2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression
	http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf

	@anchor knn-lit-3
	[3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
	https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf

	@internal
	@sa namespace knn (documenting the implementation in Python)
	@endinternal
	*/

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
	point_source VARCHAR,
	point_column_name VARCHAR,
	label_column_name VARCHAR,
	test_source VARCHAR,
	test_column_name VARCHAR,
	id_column_name VARCHAR,
	output_table VARCHAR,
	operation VARCHAR,
	k INTEGER
	) RETURNS INTEGER AS $$
	PythonFunctionBodyOnly(`knn', `knn')
	return knn.knn_validate_src(
	schema_madlib,
	point_source,
	point_column_name,
	label_column_name,
	test_source,
	test_column_name,
	id_column_name,
	output_table,
	operation,
	k
	)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
	arg1 VARCHAR
	) RETURNS VOID AS $$
	BEGIN
	IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN
	RAISE NOTICE
	'
	-----------------------------------------------------------------------
	USAGE
	-----------------------------------------------------------------------
	SELECT {schema_madlib}.knn(
	point_source, -- Training data table having training features as vector column and labels
	point_column_name, -- Name of column having feature vectors in training data table
	label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table
	test_source, -- Test data table having features as vector column. Id of features is mandatory
	test_column_name, -- Name of column having feature vectors in test data table
	id_column_name, -- Name of column having feature vector Ids in test data table
	output_table, -- Name of output table
	operation, -- c for classification task, r for regression task
	k -- value of k. Default will go as 1
	);

	-----------------------------------------------------------------------
	OUTPUT
	-----------------------------------------------------------------------
	The output of the KNN module is a table with the following columns:

	id The ids of test data points.
	test_column_name The test data points.
	prediction The output of KNN- label in case of classification, average value in case of regression.
	';
	END IF;
	END;
	$$ LANGUAGE plpgsql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
	) RETURNS VOID AS $$
	BEGIN
	RAISE NOTICE '
	k-Nearest Neighbors is a method for finding k closest points to a given data
	point in terms of a given metric. Its input consist of data points as features
	from testing examples. For a given k, it looks for k closest points in
	training set for each of the data points in test set. Algorithm generates one
	output per testing example. The output of KNN depends on the type of task:
	For Classification, the output is majority vote of the classes of the k
	nearest data points. The testing example gets assigned the most popular class
	among nearest neighbors. For Regression, the output is average of the values
	of k nearest neighbors of the given testing example.
	';
	END;
	$$ LANGUAGE plpgsql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');


	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
	point_source VARCHAR,
	point_column_name VARCHAR,
	label_column_name VARCHAR,
	test_source VARCHAR,
	test_column_name VARCHAR,
	id_column_name VARCHAR,
	output_table VARCHAR,
	operation VARCHAR,
	k INTEGER
	) RETURNS VARCHAR AS $$
	DECLARE
	l FLOAT;
	id INTEGER;
	vector DOUBLE PRECISION[];
	cur_pid integer;
	oldClientMinMessages VARCHAR;
	returnstring VARCHAR;
	x_temp_table VARCHAR;
	y_temp_table VARCHAR;
	k_val INTEGER;
	label_column_name_unique VARCHAR;
	test_id VARCHAR;
	convert_boolean_to_int VARCHAR;
	BEGIN
	oldClientMinMessages :=
	(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
	EXECUTE 'SET client_min_messages TO warning';
	SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val;
	PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
	x_temp_table := 'knn_'\|\|md5('knn_'\|\|now()::text\|\|random()::text)\|\|'_temp';
	y_temp_table := 'knn_'\|\|md5('knn_'\|\|now()::text\|\|random()::text)\|\|'_temp';
	label_column_name_unique := 'label'\|\|md5('knn_'\|\|now()::text\|\|random()::text)\|\|'_name';
	test_id := 'id'\|\|md5('knn_'\|\|now()::text\|\|random()::text)\|\|'_name';

	convert_boolean_to_int := '';
	IF (operation = 'c') THEN
	convert_boolean_to_int := '::INTEGER';
	END IF;

	EXECUTE
	$sql$
	DROP TABLE IF EXISTS pg_temp.madlib_knn_interm;
	CREATE TABLE pg_temp.madlib_knn_interm AS
	SELECT *
	FROM
	(
	SELECT row_number() over (partition by $sql$ \|\| test_id \|\| $sql$ order by dist) AS r, $sql$ \|\| x_temp_table \|\| $sql$.*
	FROM
	(
	SELECT test.$sql$ \|\| id_column_name \|\| $sql$ AS $sql$ \|\| test_id \|\| $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ \|\| point_column_name \|\| $sql$,test.$sql$ \|\| test_column_name \|\| $sql$) AS dist, train.$sql$ \|\| label_column_name \|\| $sql$ $sql$ \|\| convert_boolean_to_int \|\| $sql$ AS $sql$ \|\| label_column_name_unique \|\| $sql$
	FROM $sql$ \|\| textin(regclassout(point_source)) \|\| $sql$ AS train, $sql$ \|\| textin(regclassout(test_source)) \|\| $sql$ AS test
	)$sql$ \|\| x_temp_table \|\| $sql$
	)$sql$ \|\| y_temp_table \|\| $sql$
	WHERE $sql$ \|\| y_temp_table \|\| $sql$.r <= $sql$ \|\| k_val;

	IF (operation = 'c') THEN
	EXECUTE
	$sql$
	CREATE TABLE $sql$ \|\| output_table \|\| $sql$ AS
	SELECT $sql$ \|\| test_id \|\| $sql$ AS id, $sql$ \|\| test_column_name \|\| $sql$, MADLIB_SCHEMA.mode($sql$ \|\| label_column_name_unique \|\| $sql$) AS prediction
	FROM pg_temp.madlib_knn_interm join $sql$ \|\| textin(regclassout(test_source)) \|\| $sql$ ON $sql$ \|\| test_id \|\| $sql$=$sql$ \|\| id_column_name \|\| $sql$
	GROUP BY $sql$ \|\| test_id \|\| $sql$, $sql$ \|\| test_column_name;
	ELSE
	EXECUTE
	$sql$
	CREATE TABLE $sql$ \|\| output_table \|\| $sql$ AS
	SELECT $sql$ \|\| test_id \|\| $sql$ AS id, $sql$ \|\| test_column_name \|\| $sql$, avg($sql$ \|\| label_column_name_unique \|\| $sql$) AS prediction
	FROM
	pg_temp.madlib_knn_interm join $sql$ \|\| textin(regclassout(test_source)) \|\| $sql$ on $sql$ \|\| test_id \|\| $sql$=$sql$ \|\| id_column_name \|\| $sql$
	GROUP BY $sql$ \|\| test_id \|\| $sql$, $sql$ \|\| test_column_name \|\| $sql$
	ORDER BY $sql$ \|\| test_id \|\| $sql$ $sql$;
	END IF;

	EXECUTE 'SET client_min_messages TO ' \|\| oldClientMinMessages;
	IF (operation = 'c') THEN
	returnstring := 'The classification results have been written to output table '\|\|output_table;
	ELSE
	returnstring := 'The regression results have been written to output table '\|\|output_table;
	END IF;
	DROP TABLE pg_temp.madlib_knn_interm;
	RETURN returnstring;
	END;
	$$ LANGUAGE plpgsql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
	point_source VARCHAR,
	point_column_name VARCHAR,
	label_column_name VARCHAR,
	test_source VARCHAR,
	test_column_name VARCHAR,
	id_column_name VARCHAR,
	output_table VARCHAR,
	operation VARCHAR
	) RETURNS VARCHAR AS $$
	DECLARE
	returnstring VARCHAR;
	BEGIN
	returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1);
	RETURN returnstring;
	END;
	$$ LANGUAGE plpgsql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');