/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*//* ----------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- *//**
*
* @file knn.sql_in
* @brief Set of functions for k-nearest neighbors.
* @sa For a brief introduction to k-nearest neighbors algorithm for regression and classification,
* see the module description \ref grp_knn.
*
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_knn
<div class="toc"><b>Contents</b>
<ul>
<li class="level1"><a href="#knn">K-Nearest Neighbors</a></li>
<li class="level1"><a href="#usage">Usage</a></li>
<li class="level1"><a href="#output">Output Format</a></li>
<li class="level1"><a href="#examples">Examples</a></li>
<li class="level1"><a href="#background">Technical Background</a></li>
<li class="level1"><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Finds the \f$k\f$ nearest data points to a given data point and outputs the
majority vote of the output classes for classification, or the average of the target
values for regression.
@anchor knn
K-nearest neighbors is a method for finding the \f$k\f$ closest points to a given data
point in terms of a given metric. Its input consists of data points as features
from testing examples, and it looks for the \f$k\f$ closest points in the training set
for each of the data points in the test set. The output of KNN depends on the type
of task. For classification, the output is the majority vote of the classes of
the \f$k\f$ nearest data points. For regression, the output is the average of the
values of the \f$k\f$ nearest neighbors of the given test point.
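In symbols, if \f$y_{(1)}, \ldots, y_{(k)}\f$ are the labels of the \f$k\f$ nearest
training points to a test point, the regression prediction is the average
\f$\hat{y} = \frac{1}{k} \sum_{i=1}^{k} y_{(i)}\f$, and the classification prediction
is the majority vote \f$\hat{y} = \arg\max_{c} \sum_{i=1}^{k} \mathbf{1}[y_{(i)} = c]\f$.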
For unsupervised nearest neighbors, set the training set to match the test set so the
nearest neighbor of each point is the point itself, with zero distance.
Both exact and approximate methods are supported. The approximate methods can be
used when the run-time of the exact method is too long.
@anchor usage
@par Usage
<pre class="syntax">
knn( point_source,
point_column_name,
point_id,
label_column_name,
test_source,
test_column_name,
test_id,
output_table,
k,
output_neighbors,
fn_dist,
weighted_avg,
algorithm,
algorithm_params
)
</pre>
\b Arguments
<dl class="arglist">
<dt>point_source</dt>
<dd>TEXT. Name of the table containing the training data points.
Training data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>point_column_name</dt>
<dd>TEXT. Name of the column with training data points,
or an expression that evaluates to a numeric array.</dd>
<dt>point_id</dt>
<dd>TEXT. Name of the column in 'point_source' containing source data ids.
The ids are of type INTEGER with no duplicates. They do not need to be contiguous.
This parameter must be used if the list of nearest neighbors is to be output, i.e.,
if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.</dd>
<dt>label_column_name</dt>
<dd>TEXT. Name of the column with labels/values of training data points.
If this column is of type BOOLEAN, INTEGER, or TEXT, KNN classification is run;
if it is of type DOUBLE PRECISION, KNN regression is run.
If you set this to NULL, only the set of neighbors is returned, without
doing classification or regression.</dd>
<dt>test_source</dt>
<dd>TEXT. Name of the table containing the test data points.
Testing data points are expected to be stored row-wise
in a column of type <tt>DOUBLE PRECISION[]</tt>.
</dd>
<dt>test_column_name</dt>
<dd>TEXT. Name of the column with testing data points,
or an expression that evaluates to a numeric array.</dd>
@note
For unsupervised nearest neighbors, make the test dataset the same as the source dataset,
so the nearest neighbor of each point is the point itself, with a zero distance.
<dt>test_id</dt>
<dd>TEXT. Name of the column having ids of data points in test data table.</dd>
<dt>output_table</dt>
<dd>TEXT. Name of the table to store final results.</dd>
<dt>k (optional)</dt>
<dd>INTEGER, default: 1. Number of nearest neighbors to consider.
For classification, this should be an odd number so that ties are broken;
otherwise the result may depend on the ordering of the input data.</dd>
<dt>output_neighbors (optional)</dt>
<dd>BOOLEAN, default: TRUE. Outputs the list of k-nearest
neighbors (and their respective distances to the target point) that were used
in the voting/averaging, sorted from closest to furthest.</dd>
<dt>fn_dist (optional)</dt>
<dd>TEXT, default: 'squared_dist_norm2'. The name of the function
used to calculate the distance between data points.
The following distance functions can be used:
<ul>
<li><b>\ref dist_norm1</b>: 1-norm/Manhattan</li>
<li><b>\ref dist_norm2</b>: 2-norm/Euclidean</li>
<li><b>\ref squared_dist_norm2</b>: squared Euclidean distance</li>
<li><b>\ref dist_angle</b>: angle</li>
<li><b>\ref dist_tanimoto</b>: Tanimoto</li>
<li><b>user defined function</b> with signature <tt>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -> DOUBLE PRECISION.</tt>
Must return a value greater than or equal to zero.</li></ul></dd>
@note
Always qualify the distance function with the schema name. For example, if you install MADlib in a
schema called 'madlib' then the 'fn_dist' parameter would be 'madlib.dist_norm2', etc.
<dt>weighted_avg (optional)</dt>
<dd>BOOLEAN, default: FALSE. Calculates classification or
regression values using a weighted average. The idea is to
weigh the contribution of each of the k neighbors according
to their distance to the test point, giving greater influence
to closer neighbors. The distance function 'fn_dist' specified
above is used. For classification, majority voting weighs a neighbor
according to inverse distance. For regression, the inverse distance
weighting approach is used from Shepard [4].
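For reference, a standard formulation of inverse distance weighting (the handling
of zero distances is implementation specific) predicts
\f[ \hat{y} = \frac{\sum_{i=1}^{k} y_i / d(x, x_i)}{\sum_{i=1}^{k} 1 / d(x, x_i)} \f]
where \f$y_i\f$ and \f$d(x, x_i)\f$ are the label and distance of the
\f$i\f$-th nearest neighbor.</dd>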
<dt>algorithm (optional)</dt>
<dd>TEXT, default: 'brute_force'. The name of the algorithm
used to compute nearest neighbors. The following options are supported:
<ul>
<li><b>\ref brute_force</b>: Produces an exact result by searching
all points in the search space. You can also use a short
form "b" or "brute" etc. to select brute force.</li>
<li><b>\ref kd_tree</b>: Produces an approximate result by searching
a subset of the search space, that is, only certain leaf nodes in the
kd-tree as specified by "algorithm_params" below.
You can also use a short
form "k" or "kd" etc. to select kd-tree.</li></ul></dd>
<dt>algorithm_params (optional)</dt>
<dd>TEXT, default: 'depth=3, leaf_nodes=2'. These parameters apply to the
kd-tree algorithm only.
<ul>
<li><b>\ref depth</b>: Depth of the kd-tree. Increasing this value will
decrease run-time but reduce the accuracy.</li>
<li><b>\ref leaf_nodes</b>: Number of leaf nodes (regions) to search for each test point.
Increasing this value will improve the accuracy but increase run-time.</li></ul>
@note
Please note that the kd-tree accuracy will be lower for datasets with a high
number of features. It is advised to use at least two leaf nodes.
Refer to the <a href="#background">Technical Background</a> for more information
on how the kd-tree is implemented.</dd>
</dl>
@anchor output
@par Output Format
The output of the KNN module is a table with the following columns:
<table class="output">
<tr>
<th>id</th>
<td>INTEGER. The ids of test data points.</td>
</tr>
<tr>
<th>test_column_name</th>
<td>DOUBLE PRECISION[]. The test data points.</td>
</tr>
<tr>
<th>prediction</th>
<td>INTEGER. Label in case of classification, average value in case of regression.</td>
</tr>
<tr>
<th>k_nearest_neighbours</th>
<td>INTEGER[]. List of nearest neighbors, sorted closest to furthest
from the corresponding test point.</td>
</tr>
<tr>
<th>distance</th>
<td>DOUBLE PRECISION[]. List of distances to the nearest neighbors, sorted closest to furthest
from the corresponding test point.</td>
</tr>
</table>
@anchor examples
@examp
-# Prepare some training data for classification:
<pre class="example">
DROP TABLE IF EXISTS knn_train_data;
CREATE TABLE knn_train_data (
id integer,
data integer[],
label integer -- Integer label means classification
);
INSERT INTO knn_train_data VALUES
(1, '{1,1}', 1),
(2, '{2,2}', 1),
(3, '{3,3}', 1),
(4, '{4,4}', 1),
(5, '{4,5}', 1),
(6, '{20,50}', 0),
(7, '{10,31}', 0),
(8, '{81,13}', 0),
(9, '{1,111}', 0);
</pre>
-# Prepare some training data for regression:
<pre class="example">
DROP TABLE IF EXISTS knn_train_data_reg;
CREATE TABLE knn_train_data_reg (
id integer,
data integer[],
label float -- Float label means regression
);
INSERT INTO knn_train_data_reg VALUES
(1, '{1,1}', 1.0),
(2, '{2,2}', 1.0),
(3, '{3,3}', 1.0),
(4, '{4,4}', 1.0),
(5, '{4,5}', 1.0),
(6, '{20,50}', 0.0),
(7, '{10,31}', 0.0),
(8, '{81,13}', 0.0),
(9, '{1,111}', 0.0);
</pre>
-# Prepare some testing data:
<pre class="example">
DROP TABLE IF EXISTS knn_test_data CASCADE;
CREATE TABLE knn_test_data (
id integer,
data integer[]
);
INSERT INTO knn_test_data VALUES
(1, '{2,1}'),
(2, '{2,6}'),
(3, '{15,40}'),
(4, '{12,1}'),
(5, '{2,90}'),
(6, '{50,45}');
</pre>
-# Run KNN for classification. Qualify the distance function with the schema
where MADlib is installed (in this example 'madlib.squared_dist_norm2'):
<pre class="example">
DROP TABLE IF EXISTS knn_result_classification;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2' -- Distance function
);
SELECT * from knn_result_classification ORDER BY id;
</pre>
Result:
<pre class="result">
id | data | prediction | k_nearest_neighbours | distance
----+---------+------------+----------------------+---------------------
1 | {2,1} | 1 | {1,2,3} | {1,1,5}
2 | {2,6} | 1 | {5,4,3} | {5,8,10}
3 | {15,40} | 0 | {7,6,5} | {106,125,1346}
4 | {12,1} | 1 | {4,5,3} | {73,80,85}
5 | {2,90} | 0 | {9,6,7} | {442,1924,3545}
6 | {50,45} | 0 | {6,7,8} | {925,1796,1985}
(6 rows)
</pre>
Note that the nearest neighbors are sorted from closest
to furthest from the corresponding test point.
-# Run KNN for regression:
<pre class="example">
DROP TABLE IF EXISTS knn_result_regression;
SELECT * FROM madlib.knn(
'knn_train_data_reg', -- Table of training data
'data', -- Col name of training data
'id', -- Col Name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_regression', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.dist_norm2' -- Distance function
);
SELECT * FROM knn_result_regression ORDER BY id;
</pre>
Result:
<pre class="result">
id | data | prediction | k_nearest_neighbours | distance
----+---------+-------------------+----------------------+------------------------------------------------------
1 | {2,1} | 1 | {1,2,3} | {1,1,2.23606797749979}
2 | {2,6} | 1 | {5,4,3} | {2.23606797749979,2.82842712474619,3.16227766016838}
3 | {15,40} | 0.333333333333333 | {7,6,5} | {10.295630140987,11.1803398874989,36.6878726556883}
4 | {12,1} | 1 | {4,5,3} | {8.54400374531753,8.94427190999916,9.21954445729289}
5 | {2,90} | 0 | {9,6,7} | {21.0237960416286,43.8634243989226,59.5399025864168}
6 | {50,45} | 0 | {6,7,8} | {30.4138126514911,42.3792402008342,44.5533388198909}
(6 rows)
</pre>
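For example, for test point 3 the nearest neighbors {7,6,5} have training labels
{0.0, 0.0, 1.0}, so the prediction is their average, \f$(0.0+0.0+1.0)/3 \approx 0.333\f$.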
-# List nearest neighbors only, without doing classification
or regression:
<pre class="example">
DROP TABLE IF EXISTS knn_result_list_neighbors;
SELECT * FROM madlib.knn(
'knn_train_data_reg', -- Table of training data
'data', -- Col name of training data
'id', -- Col Name of id in train data
NULL, -- NULL training labels means just list neighbors
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_list_neighbors', -- Output table
3 -- Number of nearest neighbors
);
SELECT * FROM knn_result_list_neighbors ORDER BY id;
</pre>
Result, with neighbors sorted from closest to furthest:
<pre class="result">
id | data | k_nearest_neighbours | distance
----+---------+----------------------+---------------------
1 | {2,1} | {1,2,3} | {1,1,5}
2 | {2,6} | {5,4,3} | {5,8,10}
3 | {15,40} | {7,6,5} | {106,125,1346}
4 | {12,1} | {4,5,3} | {73,80,85}
5 | {2,90} | {9,6,7} | {442,1924,3545}
6 | {50,45} | {6,7,8} | {925,1796,1985}
(6 rows)
</pre>
-# Run KNN for classification using the
weighted average:
<pre class="example">
DROP TABLE IF EXISTS knn_result_classification;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2', -- Distance function
True -- For weighted average
);
SELECT * FROM knn_result_classification ORDER BY id;
</pre>
<pre class="result">
id | data | prediction | k_nearest_neighbours | distance
----+---------+------------+----------------------+---------------------
1 | {2,1} | 1 | {1,2,3} | {1,1,5}
2 | {2,6} | 1 | {5,4,3} | {5,8,10}
3 | {15,40} | 0 | {7,6,5} | {106,125,1346}
4 | {12,1} | 1 | {4,5,3} | {73,80,85}
5 | {2,90} | 0 | {9,6,7} | {442,1924,3545}
6 | {50,45} | 0 | {6,7,8} | {925,1796,1985}
(6 rows)
</pre>
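On this data set the weighted vote agrees with the simple majority vote. For test
point 3, for instance, weighting each neighbor by inverse distance gives class 0
a weight of \f$1/106 + 1/125 \approx 0.017\f$ (neighbors 7 and 6) versus
\f$1/1346 \approx 0.0007\f$ for class 1 (neighbor 5), so class 0 still wins.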
-# Use the kd-tree option. First we build a kd-tree of depth 4, which has
\f$2^4 = 16\f$ leaf nodes in total, and search half (8) of them:
<pre class="example">
DROP TABLE IF EXISTS knn_result_classification_kd;
SELECT madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
NULL, -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification_kd', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2', -- Distance function
False, -- For weighted average
'kd_tree', -- Use kd-tree
'depth=4, leaf_nodes=8' -- Kd-tree options
);
SELECT * FROM knn_result_classification_kd ORDER BY id;
</pre>
<pre class="result">
id | data | k_nearest_neighbours | distance
----+---------+----------------------+---------------------
1 | {2,1} | {1,2,3} | {1,1,5}
2 | {2,6} | {5,4,3} | {5,8,10}
3 | {15,40} | {7,6,5} | {106,125,1346}
4 | {12,1} | {4,5,3} | {73,80,85}
5 | {2,90} | {9,6,7} | {442,1924,3545}
6 | {50,45} | {6,7,8} | {925,1796,1985}
(6 rows)
</pre>
The result above is the same as brute force. If we search just 1 leaf node,
run-time will be shorter but accuracy will be lower. On this very small data set,
that shows up as a failure to find 3 nearest neighbors for some of the test points:
<pre class="example">
DROP TABLE IF EXISTS knn_result_classification_kd;
SELECT madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
NULL, -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification_kd', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2', -- Distance function
False, -- For weighted average
'kd_tree', -- Use kd-tree
'depth=4, leaf_nodes=1' -- Kd-tree options
);
SELECT * FROM knn_result_classification_kd ORDER BY id;
</pre>
<pre class="result">
id | data | k_nearest_neighbours | distance
----+---------+----------------------+---------------------
1 | {2,1} | {1} | {1}
2 | {2,6} | {3,2} | {10,16}
3 | {15,40} | {7} | {106}
5 | {2,90} | {3,2} | {7570,7744}
6 | {50,45} | {6,8} | {925,1985}
(5 rows)
</pre>
-# Unsupervised nearest neighbors. Here the training set matches the
test set so the nearest neighbor of each point is the point itself, with a zero distance:
<pre class="example">
DROP TABLE IF EXISTS knn_result_classification_unsup;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
NULL, -- NULL training labels means just list neighbors
'knn_train_data', -- Table of test data (same as training data)
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification_unsup', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2' -- Distance function
);
SELECT * from knn_result_classification_unsup ORDER BY id;
</pre>
Result, with neighbors sorted from closest to furthest:
<pre class="result">
id | data | k_nearest_neighbours | distance
----+---------+----------------------+---------------
1 | {1,1} | {1,2,3} | {0,2,8}
2 | {2,2} | {2,3,1} | {0,2,2}
3 | {3,3} | {3,2,4} | {0,2,2}
4 | {4,4} | {4,5,3} | {0,1,2}
5 | {4,5} | {5,4,3} | {0,1,5}
6 | {20,50} | {6,7,5} | {0,461,2281}
7 | {10,31} | {7,6,5} | {0,461,712}
8 | {81,13} | {8,6,7} | {0,5090,5365}
9 | {1,111} | {9,6,7} | {0,4082,6481}
(9 rows)
</pre>
-# User-defined distance function. There are several built-in distance
functions, but you can create your own with a UDF if desired.
For example, to use the Chebyshev distance [6], first create a function that computes it:
<pre class="example">
CREATE OR REPLACE FUNCTION chebychev_distance (x double precision[], y double precision[])
RETURNS double precision
AS $$
from scipy.spatial import distance
return distance.chebyshev(x, y)
$$ LANGUAGE plpythonu;
</pre>
Then pass the function as an argument:
<pre class="example">
DROP TABLE IF EXISTS knn_result_classification_udf;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification_udf', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'chebychev_distance' -- Distance function
);
SELECT * from knn_result_classification_udf ORDER BY id;
</pre>
Result, with neighbors sorted from closest to furthest:
<pre class="result">
id | data | prediction | k_nearest_neighbours | distance
----+---------+------------+----------------------+------------
1 | {2,1} | 1 | {1,2,3} | {1,1,2}
2 | {2,6} | 1 | {5,4,3} | {2,2,3}
3 | {15,40} | 0 | {7,6,5} | {9,10,35}
4 | {12,1} | 1 | {5,4,3} | {8,8,9}
5 | {2,90} | 0 | {9,6,7} | {21,40,59}
6 | {50,45} | 0 | {6,8,7} | {30,32,40}
(6 rows)
</pre>
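If scipy is not available, the same distance can be written in plain SQL. The
following is a minimal sketch (the function name is ours, and it assumes
PostgreSQL 9.4 or later for multi-argument unnest):
<pre class="example">
CREATE OR REPLACE FUNCTION chebychev_distance_sql (x double precision[], y double precision[])
RETURNS double precision
AS $$
    -- Chebyshev (L-infinity) distance: the largest absolute
    -- coordinate-wise difference between the two points.
    SELECT max(abs(xi - yi))
    FROM unnest(x, y) AS t(xi, yi);
$$ LANGUAGE sql IMMUTABLE;
</pre>
It can then be passed to knn() in the 'fn_dist' argument exactly as above.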
@anchor background
@par Technical Background
The training data points are vectors in a multidimensional feature space,
each with a class label. The training phase of the algorithm consists
only of storing the feature vectors and class labels of the training points.
In the prediction phase, \f$k\f$ is a user-defined constant, and an unlabeled vector
(a test point) is predicted by using the labels of the \f$k\f$ training samples
nearest to that test point.
Since distances between points are used to find the nearest neighbors, the data
should be standardized across features. This ensures that all features are given
equal weight in the distance computation.
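As an illustration, a z-score standardization of the 2-dimensional example data
above can be done in plain SQL (a minimal sketch; the standardized table name is
ours):
<pre class="example">
DROP TABLE IF EXISTS knn_train_data_std;
CREATE TABLE knn_train_data_std AS
SELECT id,
       -- Center each feature and divide by its standard deviation;
       -- NULLIF guards against zero-variance features.
       ARRAY[ (data[1] - avg1) / NULLIF(std1, 0),
              (data[2] - avg2) / NULLIF(std2, 0) ] AS data,
       label
FROM knn_train_data,
     (SELECT avg(data[1]) AS avg1, stddev(data[1]) AS std1,
             avg(data[2]) AS avg2, stddev(data[2]) AS std2
        FROM knn_train_data) stats;
</pre>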
An approximation method can be used to speed the prediction phase by building
appropriate data structures in the training phase. An example of such a data
structure is the kd-tree [5]. Using the kd-tree algorithm can improve the execution
time of the \f$k\f$-NN operation, but at the expense of some accuracy. The
kd-tree implementation divides the training dataset into multiple regions that
correspond to the leaf nodes of a tree. For example, a tree of depth \f$3\f$ will have
a total of \f$2^3 = 8\f$ regions. The algorithm will look for the nearest neighbors
in a subset of all regions instead of searching the whole dataset. For a given
test point, the first (home) region is found by traversing the tree and finding
its associated node. If the user requests additional leaf nodes to be searched,
we look at the distance between the point and the centroids of other regions and
expand the search to the specified number of closest regions.
Note that the nodes at each level of the kd-tree split over a single feature,
and the features are used in the same order as they appear in the data.
The kd-tree accuracy might suffer on datasets with a high number of features
(dimensions). For example, say we are using a dataset with 20 features and a
kd-tree depth of only 3. This means the kd-tree is constructed based on the
first 3 features. It is therefore possible to miss nearest neighbors that are
closer in the other 17 dimensions, because they were assigned to a farther region
(the distance computation itself still uses all 20 features).
@anchor literature
@literature
@anchor knn-lit-1
[1] Wikipedia, k-nearest neighbors algorithm,
https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
@anchor knn-lit-2
[2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression
http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf
@anchor knn-lit-3
[3] Gongde Guo, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification,
https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf
@anchor knn-lit-4
[4] Shepard, Donald (1968). "A two-dimensional interpolation function for
irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 517–524.
@anchor knn-lit-5
[5] Bentley, J. L. (1975). "Multidimensional binary search trees used for
associative searching". Communications of the ACM. 18 (9): 509. doi:10.1145/361002.361007.
@anchor knn-lit-6
[6] https://en.wikipedia.org/wiki/Chebyshev_distance
@internal
@sa namespace knn (documenting the implementation in Python)
@endinternal
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__knn_validate_src(
point_source VARCHAR,
point_column_name VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER,
output_neighbors BOOLEAN,
fn_dist VARCHAR
) RETURNS INTEGER AS $$
PythonFunctionBody(`knn', `knn', `knn_validate_src')
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER,
output_neighbors BOOLEAN,
fn_dist TEXT,
weighted_avg BOOLEAN,
algorithm VARCHAR,
algorithm_params VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(`knn', `knn', `knn')
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER,
output_neighbors BOOLEAN,
fn_dist TEXT,
weighted_avg BOOLEAN,
algorithm VARCHAR
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,
NULL);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER,
output_neighbors BOOLEAN,
fn_dist TEXT,
weighted_avg BOOLEAN
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,
NULL, NULL);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER,
output_neighbors BOOLEAN,
fn_dist TEXT
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,
FALSE, NULL, NULL);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER,
output_neighbors BOOLEAN
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,
'MADLIB_SCHEMA.squared_dist_norm2', FALSE,
NULL, NULL);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR,
k INTEGER
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,$9,TRUE,
'MADLIB_SCHEMA.squared_dist_norm2', FALSE,
NULL, NULL);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
point_source VARCHAR,
point_column_name VARCHAR,
point_id VARCHAR,
label_column_name VARCHAR,
test_source VARCHAR,
test_column_name VARCHAR,
test_id VARCHAR,
output_table VARCHAR
) RETURNS VARCHAR AS $$
DECLARE
returnstring VARCHAR;
BEGIN
returnstring = MADLIB_SCHEMA.knn($1,$2,$3,$4,$5,$6,$7,$8,1,TRUE,
'MADLIB_SCHEMA.squared_dist_norm2',FALSE,
NULL, NULL);
RETURN returnstring;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(knn, knn, knn_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.knn()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.knn('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');