| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| |
| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file knn.sql_in |
| * @brief Set of functions for k-nearest neighbors. |
| * @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification, |
| * see the module description \ref grp_knn. |
| * |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_knn |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#knn">K-Nearest Neighbors</a></li> |
| <li class="level1"><a href="#usage">Usage</a></li> |
| <li class="level1"><a href="#output">Output Format</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Finds k nearest data points to the given data point and outputs majority vote value of output classes for classification, and average value of target values for regression. |
| |
| \warning <em> This MADlib method is still in early stage development. There may be some |
| issues that will be addressed in a future version. Interface and implementation |
| are subject to change. </em> |
| |
| @anchor knn |
| |
| K-nearest neighbors is a method for finding the k closest points to a |
| given data point in terms of a given metric. Its input consists of |
| data points as features from testing examples, and it |
| looks for k closest points in the training set for each of the data |
| points in test set. The output of KNN depends on the type of task. |
| For classification, the output is the majority vote of the classes of |
| the k nearest data points. That is, the testing example gets assigned the |
| most popular class from the nearest neighbors. |
| For regression, the output is the average of the values of k nearest |
| neighbors of the given test point. |
| |
| @anchor usage |
| @par Usage |
| <pre class="syntax"> |
| knn( point_source, |
| point_column_name, |
| label_column_name, |
| test_source, |
| test_column_name, |
| id_column_name, |
| output_table, |
| operation, |
| k |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>point_source</dt> |
| <dd>TEXT. Name of the table containing the training data points. |
| |
| Training data points are expected to be stored row-wise |
| in a column of type <tt>DOUBLE PRECISION[]</tt>. |
| </dd> |
| |
| <dt>point_column_name</dt> |
| <dd>TEXT. Name of the column with training data points.</dd> |
| |
| <dt>label_column_name</dt> |
| <dd>TEXT. Name of the column with labels/values of training data points.</dd> |
| |
| <dt>test_source</dt> |
| <dd>TEXT. Name of the table containing the test data points. |
| |
| Testing data points are expected to be stored row-wise |
| in a column of type <tt>DOUBLE PRECISION[]</tt>. |
| </dd> |
| |
| <dt>test_column_name</dt> |
| <dd>TEXT. Name of the column with testing data points.</dd> |
| |
| <dt>id_column_name</dt> |
| <dd>TEXT. Name of the column having ids of data points in test data table.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the table to store final results.</dd> |
| |
| <dt>operation</dt> |
| <dd>TEXT. Type of task: 'r' for regression and 'c' for classification.</dd> |
| |
| <dt>k (optional)</dt> |
| <dd>INTEGER. default: 1. Number of nearest neighbors to consider. |
| For classification, should be an odd number to break ties.</dd> |
| |
| </dl> |
| |
| |
| @anchor output |
| @par Output Format |
| |
| The output of the KNN module is a table with the following columns: |
| <table class="output"> |
| <tr> |
| <th>id</th> |
| <td>INTEGER. The ids of test data points.</td> |
| </tr> |
| <tr> |
| <th>test_column_name</th> |
| <td>DOUBLE PRECISION[]. The test data points.</td> |
| </tr> |
| <tr> |
| <th>prediction</th> |
| <td>INTEGER. Label in case of classification, average value in case of regression.</td> |
| </tr> |
| </table> |
| |
| |
| @anchor examples |
| @examp |
| |
| -# Prepare some training data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_train_data; |
| CREATE TABLE knn_train_data ( |
| id integer, |
| data integer[], |
| label float |
| ); |
| INSERT INTO knn_train_data VALUES |
| (1, '{1,1}', 1.0), |
| (2, '{2,2}', 1.0), |
| (3, '{3,3}', 1.0), |
| (4, '{4,4}', 1.0), |
| (5, '{4,5}', 1.0), |
| (6, '{20,50}', 0.0), |
| (7, '{10,31}', 0.0), |
| (8, '{81,13}', 0.0), |
| (9, '{1,111}', 0.0); |
| </pre> |
| |
| -# Prepare some testing data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_test_data; |
| CREATE TABLE knn_test_data ( |
| id integer, |
| data integer[] |
| ); |
| INSERT INTO knn_test_data VALUES |
| (1, '{2,1}'), |
| (2, '{2,6}'), |
| (3, '{15,40}'), |
| (4, '{12,1}'), |
| (5, '{2,90}'), |
| (6, '{50,45}'); |
| </pre> |
| |
| -# Run KNN for classification: |
| <pre class="example"> |
| DROP TABLE IF EXISTS madlib_knn_result_classification; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'label', -- Training labels |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'madlib_knn_result_classification', -- Output table |
| 'c', -- Classification |
| 3 -- Number of nearest neighbours |
| ); |
| SELECT * from madlib_knn_result_classification ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | data | prediction |
| ----+---------+------------ |
| 1 | {2,1} | 1 |
| 2 | {2,6} | 1 |
| 3 | {15,40} | 0 |
| 4 | {12,1} | 1 |
| 5 | {2,90} | 0 |
| 6 | {50,45} | 0 |
| (6 rows) |
| </pre> |
| |
| -# Run KNN for regression: |
| <pre class="example"> |
| DROP TABLE IF EXISTS madlib_knn_result_regression; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'label', -- Training labels |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'madlib_knn_result_regression', -- Output table |
| 'r', -- Regressions |
| 3 -- Number of nearest neighbours |
| ); |
| SELECT * from madlib_knn_result_regression ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | data | prediction |
| ----+---------+------------------- |
| 1 | {2,1} | 1 |
| 2 | {2,6} | 1 |
| 3 | {15,40} | 0.333333333333333 |
| 4 | {12,1} | 1 |
| 5 | {2,90} | 0 |
| 6 | {50,45} | 0 |
| (6 rows) |
| </pre> |
| |
| @anchor background |
| @par Technical Background |
| |
| The training data points are vectors in a multidimensional feature space, |
| each with a class label. The training phase of the algorithm consists |
| only of storing the feature vectors and class labels of the training points. |
| |
| In the classification phase, k is a user-defined constant, and an unlabeled |
| vector (a test point) is classified by assigning the label which is most |
| frequent among the k training samples nearest to that test point. |
| In case of regression, average of the values of these k training samples |
| is assigned to the test point. |
| The only distance metric supported in this version is MADlib's squared_dist_norm2. |
| Other distance metrics will be added in a future release of this module. |
| |
| |
| @anchor literature |
| @literature |
| |
| @anchor knn-lit-1 |
| [1] Wikipedia, k-nearest neighbors algorithm, |
| https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm |
| |
| @anchor knn-lit-2 |
| [2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression |
| http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf |
| |
| @anchor knn-lit-3 |
| [3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification, |
| https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf |
| |
| @internal |
| @sa namespace knn (documenting the implementation in Python) |
| @endinternal |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| id_column_name VARCHAR, |
| output_table VARCHAR, |
| operation VARCHAR, |
| k INTEGER |
| ) RETURNS INTEGER AS $$ |
| PythonFunctionBodyOnly(`knn', `knn') |
| return knn.knn_validate_src( |
| schema_madlib, |
| point_source, |
| point_column_name, |
| label_column_name, |
| test_source, |
| test_column_name, |
| id_column_name, |
| output_table, |
| operation, |
| k |
| ) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| arg1 VARCHAR |
| ) RETURNS VOID AS $$ |
| BEGIN |
| IF arg1 = 'help' OR arg1 = 'usage' OR arg1 = '?' THEN |
| RAISE NOTICE |
| ' |
| ----------------------------------------------------------------------- |
| USAGE |
| ----------------------------------------------------------------------- |
| SELECT {schema_madlib}.knn( |
| point_source, -- Training data table having training features as vector column and labels |
| point_column_name, -- Name of column having feature vectors in training data table |
| label_column_name, -- Name of column having actual label/vlaue for corresponding feature vector in training data table |
| test_source, -- Test data table having features as vector column. Id of features is mandatory |
| test_column_name, -- Name of column having feature vectors in test data table |
| id_column_name, -- Name of column having feature vector Ids in test data table |
| output_table, -- Name of output table |
| operation, -- c for classification task, r for regression task |
| k -- value of k. Default will go as 1 |
| ); |
| |
| ----------------------------------------------------------------------- |
| OUTPUT |
| ----------------------------------------------------------------------- |
| The output of the KNN module is a table with the following columns: |
| |
| id The ids of test data points. |
| test_column_name The test data points. |
| prediction The output of KNN- label in case of classification, average value in case of regression. |
| '; |
| END IF; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| ) RETURNS VOID AS $$ |
| BEGIN |
| RAISE NOTICE ' |
| k-Nearest Neighbors is a method for finding k closest points to a given data |
| point in terms of a given metric. Its input consist of data points as features |
| from testing examples. For a given k, it looks for k closest points in |
| training set for each of the data points in test set. Algorithm generates one |
| output per testing example. The output of KNN depends on the type of task: |
| For Classification, the output is majority vote of the classes of the k |
| nearest data points. The testing example gets assigned the most popular class |
| among nearest neighbors. For Regression, the output is average of the values |
| of k nearest neighbors of the given testing example. |
| '; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| id_column_name VARCHAR, |
| output_table VARCHAR, |
| operation VARCHAR, |
| k INTEGER |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| l FLOAT; |
| id INTEGER; |
| vector DOUBLE PRECISION[]; |
| cur_pid integer; |
| oldClientMinMessages VARCHAR; |
| returnstring VARCHAR; |
| x_temp_table VARCHAR; |
| y_temp_table VARCHAR; |
| k_val INTEGER; |
| label_column_name_unique VARCHAR; |
| test_id VARCHAR; |
| convert_boolean_to_int VARCHAR; |
| BEGIN |
| oldClientMinMessages := |
| (SELECT setting FROM pg_settings WHERE name = 'client_min_messages'); |
| EXECUTE 'SET client_min_messages TO warning'; |
| SELECT * FROM MADLIB_SCHEMA.__knn_validate_src(point_source, point_column_name, label_column_name, test_source, test_column_name, id_column_name, output_table, operation, k) INTO k_val; |
| PERFORM MADLIB_SCHEMA.create_schema_pg_temp(); |
| x_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp'; |
| y_temp_table := 'knn_'||md5('knn_'||now()::text||random()::text)||'_temp'; |
| label_column_name_unique := 'label'||md5('knn_'||now()::text||random()::text)||'_name'; |
| test_id := 'id'||md5('knn_'||now()::text||random()::text)||'_name'; |
| |
| convert_boolean_to_int := ''; |
| IF (operation = 'c') THEN |
| convert_boolean_to_int := '::INTEGER'; |
| END IF; |
| |
| EXECUTE |
| $sql$ |
| DROP TABLE IF EXISTS pg_temp.madlib_knn_interm; |
| CREATE TABLE pg_temp.madlib_knn_interm AS |
| SELECT * |
| FROM |
| ( |
| SELECT row_number() over (partition by $sql$ || test_id || $sql$ order by dist) AS r, $sql$ || x_temp_table || $sql$.* |
| FROM |
| ( |
| SELECT test.$sql$ || id_column_name || $sql$ AS $sql$ || test_id || $sql$, MADLIB_SCHEMA.squared_dist_norm2(train.$sql$ || point_column_name || $sql$,test.$sql$ || test_column_name || $sql$) AS dist, train.$sql$ || label_column_name || $sql$ $sql$ || convert_boolean_to_int || $sql$ AS $sql$ || label_column_name_unique || $sql$ |
| FROM $sql$ || textin(regclassout(point_source)) || $sql$ AS train, $sql$ || textin(regclassout(test_source)) || $sql$ AS test |
| )$sql$ || x_temp_table || $sql$ |
| )$sql$ || y_temp_table || $sql$ |
| WHERE $sql$ || y_temp_table || $sql$.r <= $sql$ || k_val; |
| |
| IF (operation = 'c') THEN |
| EXECUTE |
| $sql$ |
| CREATE TABLE $sql$ || output_table || $sql$ AS |
| SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, MADLIB_SCHEMA.mode($sql$ || label_column_name_unique || $sql$) AS prediction |
| FROM pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ ON $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$ |
| GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name; |
| ELSE |
| EXECUTE |
| $sql$ |
| CREATE TABLE $sql$ || output_table || $sql$ AS |
| SELECT $sql$ || test_id || $sql$ AS id, $sql$ || test_column_name || $sql$, avg($sql$ || label_column_name_unique || $sql$) AS prediction |
| FROM |
| pg_temp.madlib_knn_interm join $sql$ || textin(regclassout(test_source)) || $sql$ on $sql$ || test_id || $sql$=$sql$ || id_column_name || $sql$ |
| GROUP BY $sql$ || test_id || $sql$, $sql$ || test_column_name || $sql$ |
| ORDER BY $sql$ || test_id || $sql$ $sql$; |
| END IF; |
| |
| EXECUTE 'SET client_min_messages TO ' || oldClientMinMessages; |
| IF (operation = 'c') THEN |
| returnstring := 'The classification results have been written to output table '||output_table; |
| ELSE |
| returnstring := 'The regression results have been written to output table '||output_table; |
| END IF; |
| DROP TABLE pg_temp.madlib_knn_interm; |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| id_column_name VARCHAR, |
| output_table VARCHAR, |
| operation VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |