| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| |
| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file knn.sql_in |
| * @brief Set of functions for k-nearest neighbors. |
| * @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification, |
| * see the module description \ref grp_knn. |
| * |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_knn |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li class="level1"><a href="#knn">K-Nearest Neighbors</a></li> |
| <li class="level1"><a href="#usage">Usage</a></li> |
| <li class="level1"><a href="#output">Output Format</a></li> |
| <li class="level1"><a href="#examples">Examples</a></li> |
| <li class="level1"><a href="#background">Technical Background</a></li> |
| <li class="level1"><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Finds the \f$k\f$ nearest data points to a given data point and outputs the
| majority vote of their classes for classification, or the average of their target
| values for regression.
| |
| @anchor knn |
| |
| K-nearest neighbors is a method for finding the \f$k\f$ closest points to a given data
| point in terms of a given metric. Its input consists of feature vectors from the
| test examples, and for each data point in the test set it looks for the \f$k\f$
| closest points in the training set. The output of KNN depends on the type
| of task. For classification, the output is the majority vote of the classes of
| the \f$k\f$ nearest data points. For regression, the output is the average of the
| values of the \f$k\f$ nearest neighbors of the given test point.
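|
| To make the two aggregation rules concrete, here is a minimal standalone sketch
| over a hypothetical <tt>neighbors</tt> relation (one row per test point/neighbor
| pair, with columns <tt>test_id</tt> and <tt>label</tt>); it is only an
| illustration of the voting and averaging, not part of the module:
| <pre class="example">
| -- Classification: take the most common label among the k neighbors
| -- (mode() requires PostgreSQL 9.4 or later)
| SELECT test_id, mode() WITHIN GROUP (ORDER BY label) AS prediction
| FROM neighbors
| GROUP BY test_id;
|
| -- Regression: take the average of the neighbor values
| SELECT test_id, avg(label) AS prediction
| FROM neighbors
| GROUP BY test_id;
| </pre>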
| |
| For unsupervised nearest neighbors, set the training set to match the test set so the |
| nearest neighbor of each point is the point itself, with zero distance. |
| |
| Both exact and approximate methods are supported. The approximate method can be
| used when the run-time of the exact method is too long.
| |
| @anchor usage |
| @par Usage |
| <pre class="syntax"> |
| knn( point_source, |
| point_column_name, |
| point_id, |
| label_column_name, |
| test_source, |
| test_column_name, |
| test_id, |
| output_table, |
| k, |
| output_neighbors, |
| fn_dist, |
| weighted_avg, |
| algorithm, |
| algorithm_params |
| ) |
| </pre> |
| |
| \b Arguments |
| <dl class="arglist"> |
| <dt>point_source</dt> |
| <dd>TEXT. Name of the table containing the training data points. |
| Training data points are expected to be stored row-wise |
| in a column of type <tt>DOUBLE PRECISION[]</tt>. |
| </dd> |
| |
| <dt>point_column_name</dt> |
| <dd>TEXT. Name of the column with training data points,
| or an expression that evaluates to a numeric array.</dd>
| |
| <dt>point_id</dt> |
| <dd>TEXT. Name of the column in 'point_source' containing the source data ids.
| The ids are of type INTEGER with no duplicates. They do not need to be contiguous.
| This parameter must be used if the list of nearest neighbors is to be output, i.e.,
| if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.</dd>
| |
| <dt>label_column_name</dt> |
| <dd>TEXT. Name of the column with labels/values of training data points.
| If this column is of type Boolean, integer, or text, KNN classification is run;
| if it is of type double precision, KNN regression is run.
| If you set this to NULL, only the set of neighbors is returned, without
| actually doing classification or regression.</dd>
| |
| <dt>test_source</dt> |
| <dd>TEXT. Name of the table containing the test data points. |
| Testing data points are expected to be stored row-wise |
| in a column of type <tt>DOUBLE PRECISION[]</tt>. |
| </dd> |
| |
| <dt>test_column_name</dt> |
| <dd>TEXT. Name of the column with testing data points,
| or an expression that evaluates to a numeric array.</dd>
| |
| @note |
| For unsupervised nearest neighbors, make the test dataset the same as the source dataset, |
| so the nearest neighbor of each point is the point itself, with a zero distance. |
| |
| <dt>test_id</dt> |
| <dd>TEXT. Name of the column containing the ids of data points in the test data table.</dd>
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the table to store final results.</dd> |
| |
| <dt>k (optional)</dt> |
| <dd>INTEGER, default: 1. Number of nearest neighbors to consider.
| For classification, \f$k\f$ should be an odd number to break ties;
| otherwise, the result may depend on the ordering of the input data.</dd>
| |
| <dt>output_neighbors (optional) </dt> |
| <dd>BOOLEAN, default: TRUE. Outputs the list of k-nearest
| neighbors (and their respective distances to the target point) that were used |
| in the voting/averaging, sorted from closest to furthest.</dd> |
| |
| <dt>fn_dist (optional)</dt> |
| <dd>TEXT, default: 'squared_dist_norm2'. The name of the function |
| used to calculate the distance between data points. |
| |
| The following distance functions can be used: |
| <ul> |
| <li><b>\ref dist_norm1</b>: 1-norm/Manhattan</li> |
| <li><b>\ref dist_norm2</b>: 2-norm/Euclidean</li> |
| <li><b>\ref squared_dist_norm2</b>: squared Euclidean distance</li> |
| <li><b>\ref dist_angle</b>: angle</li> |
| <li><b>\ref dist_tanimoto</b>: tanimoto</li> |
| <li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -> DOUBLE PRECISION.</tt> |
| Must return a value greater than or equal to zero.</li></ul></dd> |
| @note |
| Always qualify the distance function with the schema name. For example, if you install MADlib in a |
| schema called 'madlib' then the 'fn_dist' parameter would be 'madlib.dist_norm2', etc. |
| |
| <dt>weighted_avg (optional)</dt> |
| <dd>BOOLEAN, default: FALSE. Calculates classification or
| regression values using a weighted average. The idea is to
| weight the contribution of each of the k neighbors according
| to its distance from the test point, giving greater influence
| to closer neighbors. The distance function 'fn_dist' specified
| above is used. For classification, majority voting weights each
| neighbor by inverse distance. For regression, the inverse distance
| weighting approach of Shepard [4] is used; see the worked formula
| after this parameter list.</dd>
| |
| <dt>algorithm (optional)</dt> |
| <dd>TEXT, default: 'brute_force'. The name of the algorithm |
| used to compute nearest neighbors. The following options are supported: |
| <ul> |
| <li><b>\ref brute_force</b>: Produces an exact result by searching |
| all points in the search space. You can also use a short |
| form "b" or "brute" etc. to select brute force.</li> |
| <li><b>\ref kd_tree</b>: Produces an approximate result by searching |
| a subset of the search space, that is, only certain leaf nodes in the |
| kd-tree as specified by "algorithm_params" below. |
| You can also use a short |
| form "k" or "kd" etc. to select kd-tree.</li></ul></dd> |
| |
| <dt>algorithm_params (optional)</dt> |
| <dd>TEXT, default: 'depth=3, leaf_nodes=2'. These parameters apply to the |
| kd-tree algorithm only. |
| <ul> |
| <li><b>\ref depth</b>: Depth of the kd-tree. Increasing this value will |
| decrease run-time but reduce the accuracy.</li> |
| <li><b>\ref leaf_nodes</b>: Number of leaf nodes (regions) to search for each test point.
| Increasing this value will improve the accuracy but increase run-time.</li></ul>
| |
| @note |
| Please note that the kd-tree accuracy will be lower for datasets with a high |
| number of features. It is advised to use at least two leaf nodes. |
| Refer to the <a href="#background">Technical Background</a> for more information |
| on how the kd-tree is implemented.</dd> |
| |
| </dl> |
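|
| As a note on the 'weighted_avg' parameter above: in its simplest
| inverse-distance form, the weighted regression prediction for a test point is
| \f[ \hat{y} = \frac{\sum_{i=1}^{k} y_i / d_i}{\sum_{i=1}^{k} 1 / d_i} \f]
| where \f$y_i\f$ is the value of the \f$i\f$-th nearest neighbor and \f$d_i\f$
| is its distance to the test point under 'fn_dist'. Shepard [4] generalizes
| this to weights \f$1/d_i^p\f$ for a power \f$p\f$. For classification, each
| neighbor's vote for its class is weighted the same way, and the class with the
| largest total weighted vote is predicted.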
| |
| |
| @anchor output |
| @par Output Format |
| |
| The output of the KNN module is a table with the following columns: |
| <table class="output"> |
| <tr> |
| <th>id</th> |
| <td>INTEGER. The ids of test data points.</td> |
| </tr> |
| <tr> |
| <th>test_column_name</th> |
| <td>DOUBLE PRECISION[]. The test data points.</td> |
| </tr> |
| <tr> |
| <th>prediction</th> |
| <td>INTEGER or DOUBLE PRECISION. The predicted label in the case of classification, or the average of neighbor values in the case of regression.</td>
| </tr> |
| <tr> |
| <th>k_nearest_neighbours</th> |
| <td>INTEGER[]. List of nearest neighbors, sorted closest to furthest |
| from the corresponding test point.</td> |
| </tr> |
| <tr> |
| <th>distance</th> |
| <td>DOUBLE PRECISION[]. List of distances to the nearest neighbors, sorted closest to furthest
| from the corresponding test point.</td> |
| </tr> |
| </table> |
| |
| |
| @anchor examples |
| @examp |
| |
| -# Prepare some training data for classification: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_train_data; |
| CREATE TABLE knn_train_data ( |
| id integer, |
| data integer[], |
| label integer -- Integer label means classification
| ); |
| INSERT INTO knn_train_data VALUES |
| (1, '{1,1}', 1), |
| (2, '{2,2}', 1), |
| (3, '{3,3}', 1), |
| (4, '{4,4}', 1), |
| (5, '{4,5}', 1), |
| (6, '{20,50}', 0), |
| (7, '{10,31}', 0), |
| (8, '{81,13}', 0), |
| (9, '{1,111}', 0); |
| </pre> |
| |
| -# Prepare some training data for regression: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_train_data_reg; |
| CREATE TABLE knn_train_data_reg ( |
| id integer, |
| data integer[], |
| label float -- Float label means regression
| ); |
| INSERT INTO knn_train_data_reg VALUES |
| (1, '{1,1}', 1.0), |
| (2, '{2,2}', 1.0), |
| (3, '{3,3}', 1.0), |
| (4, '{4,4}', 1.0), |
| (5, '{4,5}', 1.0), |
| (6, '{20,50}', 0.0), |
| (7, '{10,31}', 0.0), |
| (8, '{81,13}', 0.0), |
| (9, '{1,111}', 0.0); |
| </pre> |
| |
| -# Prepare some testing data: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_test_data CASCADE; |
| CREATE TABLE knn_test_data ( |
| id integer, |
| data integer[] |
| ); |
| INSERT INTO knn_test_data VALUES |
| (1, '{2,1}'), |
| (2, '{2,6}'), |
| (3, '{15,40}'), |
| (4, '{12,1}'), |
| (5, '{2,90}'), |
| (6, '{50,45}'); |
| </pre> |
| |
| -# Run KNN for classification. Prepend the distance function parameter with the schema |
| where MADlib is installed (in this example 'madlib.squared_dist_norm2'): |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_classification; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col name of id in train data |
| 'label', -- Training labels |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_classification', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'madlib.squared_dist_norm2' -- Distance function |
| ); |
| SELECT * from knn_result_classification ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | data | prediction | k_nearest_neighbours | distance |
| ----+---------+------------+----------------------+--------------------- |
| 1 | {2,1} | 1 | {1,2,3} | {1,1,5} |
| 2 | {2,6} | 1 | {5,4,3} | {5,8,10} |
| 3 | {15,40} | 0 | {7,6,5} | {106,125,1346} |
| 4 | {12,1} | 1 | {4,5,3} | {73,80,85} |
| 5 | {2,90} | 0 | {9,6,7} | {442,1924,3545} |
| 6 | {50,45} | 0 | {6,7,8} | {925,1796,1985} |
| (6 rows) |
| </pre> |
| Note that the nearest neighbors are sorted from closest |
| to furthest from the corresponding test point. |
| |
| -# Run KNN for regression: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_regression; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data_reg', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col Name of id in train data |
| 'label', -- Training labels |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_regression', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'madlib.dist_norm2' -- Distance function |
| ); |
| SELECT * FROM knn_result_regression ORDER BY id; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | data | prediction | k_nearest_neighbours | distance |
| ----+---------+-------------------+----------------------+------------------------------------------------------ |
| 1 | {2,1} | 1 | {1,2,3} | {1,1,2.23606797749979} |
| 2 | {2,6} | 1 | {5,4,3} | {2.23606797749979,2.82842712474619,3.16227766016838} |
| 3 | {15,40} | 0.333333333333333 | {7,6,5} | {10.295630140987,11.1803398874989,36.6878726556883} |
| 4 | {12,1} | 1 | {4,5,3} | {8.54400374531753,8.94427190999916,9.21954445729289} |
| 5 | {2,90} | 0 | {9,6,7} | {21.0237960416286,43.8634243989226,59.5399025864168} |
| 6 | {50,45} | 0 | {6,7,8} | {30.4138126514911,42.3792402008342,44.5533388198909} |
| (6 rows) |
| </pre> |
| |
| -# List nearest neighbors only, without doing classification |
| or regression: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_list_neighbors; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data_reg', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col Name of id in train data |
| NULL, -- NULL training labels means just list neighbors |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_list_neighbors', -- Output table |
| 3 -- Number of nearest neighbors |
| ); |
| SELECT * FROM knn_result_list_neighbors ORDER BY id; |
| </pre> |
| Result, with neighbors sorted from closest to furthest: |
| <pre class="result"> |
| id | data | k_nearest_neighbours | distance |
| ----+---------+----------------------+--------------------- |
| 1 | {2,1} | {1,2,3} | {1,1,5} |
| 2 | {2,6} | {5,4,3} | {5,8,10} |
| 3 | {15,40} | {7,6,5} | {106,125,1346} |
| 4 | {12,1} | {4,5,3} | {73,80,85} |
| 5 | {2,90} | {9,6,7} | {442,1924,3545} |
| 6 | {50,45} | {6,7,8} | {925,1796,1985} |
| (6 rows) |
| </pre> |
| |
| |
| -# Run KNN for classification using the |
| weighted average: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_classification; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col name of id in train data |
| 'label', -- Training labels |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_classification', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'madlib.squared_dist_norm2', -- Distance function |
| True -- For weighted average |
| ); |
| SELECT * FROM knn_result_classification ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | data | prediction | k_nearest_neighbours | distance |
| ----+---------+------------+----------------------+--------------------- |
| 1 | {2,1} | 1 | {1,2,3} | {1,1,5} |
| 2 | {2,6} | 1 | {5,4,3} | {5,8,10} |
| 3 | {15,40} | 0 | {7,6,5} | {106,125,1346} |
| 4 | {12,1} | 1 | {4,5,3} | {73,80,85} |
| 5 | {2,90} | 0 | {9,6,7} | {442,1924,3545} |
| 6 | {50,45} | 0 | {6,7,8} | {925,1796,1985} |
| (6 rows) |
| </pre> |
| |
| -# Use the kd-tree option. First we build a kd-tree of depth 4, which has
| 2^4 = 16 leaf nodes in total, and search half (8) of them:
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_classification_kd; |
| SELECT madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col name of id in train data |
| NULL, -- NULL training labels means just list neighbors
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_classification_kd', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'madlib.squared_dist_norm2', -- Distance function |
| False, -- For weighted average |
| 'kd_tree', -- Use kd-tree |
| 'depth=4, leaf_nodes=8' -- Kd-tree options |
| ); |
| SELECT * FROM knn_result_classification_kd ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | data | k_nearest_neighbours | distance |
| ----+---------+----------------------+--------------------- |
| 1 | {2,1} | {1,2,3} | {1,1,5} |
| 2 | {2,6} | {5,4,3} | {5,8,10} |
| 3 | {15,40} | {7,6,5} | {106,125,1346} |
| 4 | {12,1} | {4,5,3} | {73,80,85} |
| 5 | {2,90} | {9,6,7} | {442,1924,3545} |
| 6 | {50,45} | {6,7,8} | {925,1796,1985} |
| (6 rows) |
| </pre> |
| The result above is the same as brute force. If we search just 1 leaf node,
| run-time will be shorter but accuracy will be lower. On this very small data
| set, that shows up as not finding 3 nearest neighbors for every test point:
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_classification_kd; |
| SELECT madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col name of id in train data |
| NULL, -- NULL training labels means just list neighbors
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_classification_kd', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'madlib.squared_dist_norm2', -- Distance function |
| False, -- For weighted average |
| 'kd_tree', -- Use kd-tree |
| 'depth=4, leaf_nodes=1' -- Kd-tree options |
| ); |
| SELECT * FROM knn_result_classification_kd ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | data | k_nearest_neighbours | distance |
| ----+---------+----------------------+--------------------- |
| 1 | {2,1} | {1} | {1} |
| 2 | {2,6} | {3,2} | {10,16} |
| 3 | {15,40} | {7} | {106} |
| 5 | {2,90} | {3,2} | {7570,7744} |
| 6 | {50,45} | {6,8} | {925,1985} |
| (5 rows) |
| </pre> |
| |
| -# Unsupervised nearest neighbors. Here the training set matches the |
| test set so the nearest neighbor of each point is the point itself, with a zero distance: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_classification_unsup; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col name of id in train data |
| NULL, -- NULL training labels means just list neighbors |
| 'knn_train_data', -- Table of test data (same as training data) |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_classification_unsup', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'madlib.squared_dist_norm2' -- Distance function |
| ); |
| SELECT * from knn_result_classification_unsup ORDER BY id; |
| </pre> |
| Result, with point and neighbors sorted from closest to furthest: |
| <pre class="result"> |
| id | data | k_nearest_neighbours | distance |
| ----+---------+----------------------+--------------- |
| 1 | {1,1} | {1,2,3} | {0,2,8} |
| 2 | {2,2} | {2,3,1} | {0,2,2} |
| 3 | {3,3} | {3,2,4} | {0,2,2} |
| 4 | {4,4} | {4,5,3} | {0,1,2} |
| 5 | {4,5} | {5,4,3} | {0,1,5} |
| 6 | {20,50} | {6,7,5} | {0,461,2281} |
| 7 | {10,31} | {7,6,5} | {0,461,712} |
| 8 | {81,13} | {8,6,7} | {0,5090,5365} |
| 9 | {1,111} | {9,6,7} | {0,4082,6481} |
| (9 rows) |
| </pre> |
| |
| -# User-defined distance function. There are several built-in distance
| functions, but you can create your own using a UDF if desired.
| For example, to create a Chebyshev distance function [6], first create the
| function (this PL/Python version requires scipy to be installed on the database hosts):
| <pre class="example"> |
| CREATE OR REPLACE FUNCTION chebychev_distance (x double precision[], y double precision[]) |
| RETURNS double precision |
| AS $$ |
| from scipy.spatial import distance |
| return distance.chebyshev(x, y) |
| $$ LANGUAGE plpythonu; |
| </pre> |
| Then pass the function as an argument: |
| <pre class="example"> |
| DROP TABLE IF EXISTS knn_result_classification_udf; |
| SELECT * FROM madlib.knn( |
| 'knn_train_data', -- Table of training data |
| 'data', -- Col name of training data |
| 'id', -- Col name of id in train data |
| 'label', -- Training labels |
| 'knn_test_data', -- Table of test data |
| 'data', -- Col name of test data |
| 'id', -- Col name of id in test data |
| 'knn_result_classification_udf', -- Output table |
| 3, -- Number of nearest neighbors |
| True, -- True to list nearest-neighbors by id |
| 'chebychev_distance' -- Distance function |
| ); |
| SELECT * from knn_result_classification_udf ORDER BY id; |
| </pre> |
| Result, with point and neighbors sorted from closest to furthest: |
| <pre class="result"> |
| id | data | prediction | k_nearest_neighbours | distance |
| ----+---------+------------+----------------------+------------ |
| 1 | {2,1} | 1 | {1,2,3} | {1,1,2} |
| 2 | {2,6} | 1 | {5,4,3} | {2,2,3} |
| 3 | {15,40} | 0 | {7,6,5} | {9,10,35} |
| 4 | {12,1} | 1 | {5,4,3} | {8,8,9} |
| 5 | {2,90} | 0 | {9,6,7} | {21,40,59} |
| 6 | {50,45} | 0 | {6,8,7} | {30,32,40} |
| (6 rows) |
| </pre> |
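|
| If scipy is not available on the database hosts, the same distance can be
| written in plain SQL. A minimal sketch, with the hypothetical name
| 'chebychev_distance_sql' (the multi-argument form of <tt>unnest</tt> requires
| PostgreSQL 9.4 or later):
| <pre class="example">
| CREATE OR REPLACE FUNCTION chebychev_distance_sql(
|     x double precision[], y double precision[])
| RETURNS double precision
| AS $$
|     -- Chebyshev distance: the maximum absolute coordinate difference
|     SELECT max(abs(xi - yi)) FROM unnest(x, y) AS t(xi, yi);
| $$ LANGUAGE sql IMMUTABLE;
| </pre>
| It can then be passed to knn() in the same way as the PL/Python version above.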
| |
| @anchor background |
| @par Technical Background |
| |
| The training data points are vectors in a multidimensional feature space, |
| each with a class label. The training phase of the algorithm consists |
| only of storing the feature vectors and class labels of the training points. |
| |
| In the prediction phase, \f$k\f$ is a user-defined constant, and an unlabeled vector |
| (a test point) is predicted by using the labels of the \f$k\f$ training samples
| nearest to that test point. |
| |
| Since distances between points are used to find the nearest neighbors, the data |
| should be standardized across features. This ensures that all features are given |
| equal weight in the distance computation.
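|
| As an illustration, a minimal sketch of standardizing the 2-dimensional
| training data from the examples above (the table name 'knn_train_data_std' is
| just for illustration; in practice, standardize the test set with the same
| training-set statistics):
| <pre class="example">
| CREATE TABLE knn_train_data_std AS
| SELECT id,
|        ARRAY[(data[1] - stats.avg1) / stats.std1,
|              (data[2] - stats.avg2) / stats.std2]::double precision[] AS data,
|        label
| FROM knn_train_data,
|      (SELECT avg(data[1]) AS avg1, stddev(data[1]) AS std1,
|              avg(data[2]) AS avg2, stddev(data[2]) AS std2
|       FROM knn_train_data) AS stats;
| </pre>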
| |
| An approximation method can be used to speed up the prediction phase by building
| appropriate data structures in the training phase. An example of such a data
| structure is the kd-tree [5]. Using the kd-tree algorithm can improve the execution
| time of the \f$k\f$-NN operation, but at the expense of some accuracy. The
| kd-tree implementation divides the training dataset into multiple regions that |
| correspond to the leaf nodes of a tree. For example, a tree of depth \f$3\f$ will have |
| a total of \f$2^3 = 8\f$ regions. The algorithm will look for the nearest neighbors |
| in a subset of all regions instead of searching the whole dataset. For a given |
| test point, the first (home) region is found by traversing the tree and finding |
| its associated node. If the user requests additional leaf nodes to be searched, |
| we look at the distance between the point and the centroids of other regions and |
| expand the search to the specified number of closest regions. |
| |
| It is important to note that the nodes at each level of the kd-tree split on
| a single feature, and the features are used in the same order in which they
| appear in the data.
| |
| The kd-tree accuracy might suffer on datasets with a high number of features |
| (dimensions). For example, let's say we are using a dataset with 20 features and
| a kd-tree depth of only 3. This means the kd-tree is constructed based on the
| first 3 features only. Therefore, it is possible to miss nearest neighbors that
| are closer in the remaining 17 dimensions, because they were assigned to a more
| distant region (the distance computation itself still uses all 20 features).
| |
| @anchor literature |
| @literature |
| |
| @anchor knn-lit-1 |
| [1] Wikipedia, k-nearest neighbors algorithm, |
| https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm |
| |
| @anchor knn-lit-2 |
| [2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression |
| http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf |
| |
| @anchor knn-lit-3 |
| [3] Gongde Guo, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
| https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf |
| |
| @anchor knn-lit-4 |
| [4] Shepard, Donald (1968). "A two-dimensional interpolation function for |
| irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 517–524. |
| |
| @anchor knn-lit-5 |
| [5] Bentley, J. L. (1975). "Multidimensional binary search trees used for |
| associative searching". Communications of the ACM. 18 (9): 509. doi:10.1145/361002.361007. |
| |
| @anchor knn-lit-6 |
| [6] https://en.wikipedia.org/wiki/Chebyshev_distance |
| |
| |
| @internal |
| @sa namespace knn (documenting the implementation in Python) |
| @endinternal |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER, |
| output_neighbors BOOLEAN, |
| fn_dist VARCHAR |
| ) RETURNS INTEGER AS $$ |
| PythonFunctionBody(`knn', `knn', `knn_validate_src') |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER, |
| output_neighbors BOOLEAN, |
| fn_dist TEXT, |
| weighted_avg BOOLEAN, |
| algorithm VARCHAR, |
| algorithm_params VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(`knn', `knn', `knn') |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER, |
| output_neighbors BOOLEAN, |
| fn_dist TEXT, |
| weighted_avg BOOLEAN, |
| algorithm VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13, |
| NULL); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER, |
| output_neighbors BOOLEAN, |
| fn_dist TEXT, |
| weighted_avg BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12, |
| NULL, NULL); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER, |
| output_neighbors BOOLEAN, |
| fn_dist TEXT |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11, |
| FALSE, NULL, NULL); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER, |
| output_neighbors BOOLEAN |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10, |
| 'MADLIB_SCHEMA.squared_dist_norm2', FALSE, |
| NULL, NULL); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR, |
| k INTEGER |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,TRUE, |
| 'MADLIB_SCHEMA.squared_dist_norm2', FALSE, |
| NULL, NULL); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| point_source VARCHAR, |
| point_column_name VARCHAR, |
| point_id VARCHAR, |
| label_column_name VARCHAR, |
| test_source VARCHAR, |
| test_column_name VARCHAR, |
| test_id VARCHAR, |
| output_table VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| DECLARE |
| returnstring VARCHAR; |
| BEGIN |
| returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,TRUE, |
| 'MADLIB_SCHEMA.squared_dist_norm2',FALSE, |
| NULL, NULL); |
| RETURN returnstring; |
| END; |
| $$ LANGUAGE plpgsql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(knn, knn, knn_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.knn(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |