<div class="header">
<div class="headertitle">
<div class="title">k-Nearest Neighbors<div class="ingroups"><a class="el" href="group__grp__early__stage.html">Early Stage Development</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li class="level1">
<a href="#knn">K-Nearest Neighbors</a> </li>
<li class="level1">
<a href="#usage">Usage</a> </li>
<li class="level1">
<a href="#output">Output Format</a> </li>
<li class="level1">
<a href="#examples">Examples</a> </li>
<li class="level1">
<a href="#background">Technical Background</a> </li>
<li class="level1">
<a href="#literature">Literature</a> </li>
</ul>
</div><dl class="section warning"><dt>Warning</dt><dd><em> This MADlib method is still in early stage development. There may be some issues that will be addressed in a future version. Interface and implementation are subject to change. </em></dd></dl>
<p><a class="anchor" id="knn"></a> K-nearest neighbors is a method for finding the k closest points to a given data point in terms of a given metric. Its input consists of data points as features from testing examples, and it looks for k closest points in the training set for each of the data points in test set. The output of KNN depends on the type of task. For classification, the output is the majority vote of the classes of the k nearest data points. That is, the testing example gets assigned the most popular class from the nearest neighbors. For regression, the output is the average of the values of k nearest neighbors of the given test point.</p>
<p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Usage</dt><dd><pre class="syntax">
knn( point_source,
point_column_name,
point_id,
label_column_name,
test_source,
test_column_name,
test_id,
output_table,
k,
output_neighbors,
fn_dist,
weighted_avg
)
</pre></dd></dl>
<p><b>Arguments</b> </p><dl class="arglist">
<dt>point_source </dt>
<dd><p class="startdd">TEXT. Name of the table containing the training data points. Training data points are expected to be stored row-wise in a column of type <code>DOUBLE PRECISION[]</code>. </p>
<p class="enddd"></p>
</dd>
<dt>point_column_name </dt>
<dd><p class="startdd">TEXT. Name of the column with training data points or expression that evaluates to a numeric array</p>
<p class="enddd"></p>
</dd>
<dt>point_id </dt>
<dd><p class="startdd">TEXT. Name of the column in 'point_source’ containing source data ids. The ids are of type INTEGER with no duplicates. They do not need to be contiguous. This parameter must be used if the list of nearest neighbors are to be output, i.e., if the parameter 'output_neighbors' below is TRUE or if 'label_column_name' is NULL.</p>
<p class="enddd"></p>
</dd>
<dt>label_column_name </dt>
<dd><p class="startdd">TEXT. Name of the column with labels/values of training data points. If this column is a Boolean, integer or text, it will run KNN classification, else if it is double precision values will run KNN regression. If you set this to NULL, it will only return the set of neighbors without actually doing classification or regression.</p>
<p class="enddd"></p>
</dd>
<dt>test_source </dt>
<dd><p class="startdd">TEXT. Name of the table containing the test data points. Testing data points are expected to be stored row-wise in a column of type <code>DOUBLE PRECISION[]</code>. </p>
<p class="enddd"></p>
</dd>
<dt>test_column_name </dt>
<dd><p class="startdd">TEXT. Name of the column with testing data points or expression that evaluates to a numeric array</p>
<p class="enddd"></p>
</dd>
<dt>test_id </dt>
<dd><p class="startdd">TEXT. Name of the column having ids of data points in test data table.</p>
<p class="enddd"></p>
</dd>
<dt>output_table </dt>
<dd><p class="startdd">TEXT. Name of the table to store final results.</p>
<p class="enddd"></p>
</dd>
<dt>k (optional) </dt>
<dd><p class="startdd">INTEGER. default: 1. Number of nearest neighbors to consider. For classification, should be an odd number to break ties, otherwise the result may depend on ordering of the input data.</p>
<p class="enddd"></p>
</dd>
<dt>output_neighbors (optional) </dt>
<dd><p class="startdd">BOOLEAN default: TRUE. Outputs the list of k-nearest neighbors that were used in the voting/averaging, sorted from closest to furthest.</p>
<p class="enddd"></p>
</dd>
<dt>fn_dist (optional) </dt>
<dd><p class="startdd">TEXT, default: 'squared_dist_norm2'. The name of the function used to calculate the distance between data points.</p>
<p>The following distance functions can be used: </p><ul>
<li>
<b><a class="el" href="linalg_8sql__in.html#aad193850e79c4b9d811ca9bc53e13476">dist_norm1</a></b>: 1-norm/Manhattan </li>
<li>
<b><a class="el" href="linalg_8sql__in.html#aa58e51526edea6ea98db30b6f250adb4">dist_norm2</a></b>: 2-norm/Euclidean </li>
<li>
<b><a class="el" href="linalg_8sql__in.html#a00a08e69f27524f2096032214e15b668">squared_dist_norm2</a></b>: squared Euclidean distance </li>
<li>
<b><a class="el" href="linalg_8sql__in.html#a8c7b9281a72ff22caf06161701b27e84">dist_angle</a></b>: angle </li>
<li>
<b><a class="el" href="linalg_8sql__in.html#afa13b4c6122b99422d666dedea136c18">dist_tanimoto</a></b>: tanimoto </li>
<li>
<b>user-defined function</b> with signature <code>DOUBLE PRECISION[] x, DOUBLE PRECISION[] y -&gt; DOUBLE PRECISION</code> (see the sketch after this argument list)</li>
</ul>
<p class="enddd"></p>
</dd>
<dt>weighted_avg (optional) </dt>
<dd><p class="startdd">BOOLEAN, default: FALSE. Calculates classification or regression values using a weighted average. The idea is to weigh the contribution of each of the k neighbors according to their distance to the test point, giving greater influence to closer neighbors. The distance function 'fn_dist' specified above is used.</p>
<p>For classification, majority voting weighs a neighbor according to inverse distance.</p>
<p class="enddd">For regression, the inverse distance weighting approach is used from Shepard [4]. </p>
</dd>
</dl>
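<p>When 'weighted_avg' is TRUE, the regression prediction is Shepard's inverse distance weighted average of the neighbor values: prediction = &sum;<sub>i</sub> w<sub>i</sub> y<sub>i</sub> / &sum;<sub>i</sub> w<sub>i</sub>, with weights w<sub>i</sub> = 1 / d(x, x<sub>i</sub>) computed by the chosen distance function; classification weights each neighbor's vote the same way.</p>
<p>For 'fn_dist', any function with the signature above can be supplied. As an illustration, here is a minimal sketch of a custom Chebyshev (max-norm) distance, assuming PostgreSQL 9.2 or later; the function name <code>my_dist_chebyshev</code> is hypothetical: </p><pre class="example">
CREATE OR REPLACE FUNCTION my_dist_chebyshev(
    x DOUBLE PRECISION[],
    y DOUBLE PRECISION[]
) RETURNS DOUBLE PRECISION AS $$
    -- Chebyshev (max-norm) distance: the largest coordinate-wise difference.
    SELECT max(abs(x[i] - y[i]))
    FROM generate_subscripts(x, 1) AS i
$$ LANGUAGE sql IMMUTABLE;
</pre><p>Once created, pass its (schema-qualified) name as the 'fn_dist' argument, just like the built-in functions above.</p>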
<p><a class="anchor" id="output"></a></p><dl class="section user"><dt>Output Format</dt><dd></dd></dl>
<p>The output of the KNN module is a table with the following columns: </p><table class="output">
<tr>
<th>id </th><td>INTEGER. The ids of test data points. </td></tr>
<tr>
<th>test_column_name </th><td>DOUBLE PRECISION[]. The test data points. </td></tr>
<tr>
<th>prediction </th><td>Predicted label in the case of classification, average of neighbor values in the case of regression. This column is omitted if 'label_column_name' is NULL. </td></tr>
<tr>
<th>k_nearest_neighbours </th><td>INTEGER[]. List of nearest neighbors, sorted closest to furthest from the corresponding test point. </td></tr>
</table>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl>
<ol type="1">
<li>Prepare some training data for classification: <pre class="example">
DROP TABLE IF EXISTS knn_train_data;
CREATE TABLE knn_train_data (
id integer,
data integer[],
label integer -- Integer label for classification
);
INSERT INTO knn_train_data VALUES
(1, '{1,1}', 1),
(2, '{2,2}', 1),
(3, '{3,3}', 1),
(4, '{4,4}', 1),
(5, '{4,5}', 1),
(6, '{20,50}', 0),
(7, '{10,31}', 0),
(8, '{81,13}', 0),
(9, '{1,111}', 0);
</pre></li>
<li>Prepare some training data for regression: <pre class="example">
DROP TABLE IF EXISTS knn_train_data_reg;
CREATE TABLE knn_train_data_reg (
id integer,
data integer[],
label float -- Float label for regression
);
INSERT INTO knn_train_data_reg VALUES
(1, '{1,1}', 1.0),
(2, '{2,2}', 1.0),
(3, '{3,3}', 1.0),
(4, '{4,4}', 1.0),
(5, '{4,5}', 1.0),
(6, '{20,50}', 0.0),
(7, '{10,31}', 0.0),
(8, '{81,13}', 0.0),
(9, '{1,111}', 0.0);
</pre></li>
<li>Prepare some testing data: <pre class="example">
DROP TABLE IF EXISTS knn_test_data;
CREATE TABLE knn_test_data (
id integer,
data integer[]
);
INSERT INTO knn_test_data VALUES
(1, '{2,1}'),
(2, '{2,6}'),
(3, '{15,40}'),
(4, '{12,1}'),
(5, '{2,90}'),
(6, '{50,45}');
</pre></li>
<li>Run KNN for classification: <pre class="example">
DROP TABLE IF EXISTS knn_result_classification;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2' -- Distance function
);
SELECT * FROM knn_result_classification ORDER BY id;
</pre> Result: <pre class="result">
id | data | prediction | k_nearest_neighbours
----+---------+------------+----------------------
1 | {2,1} | 1 | {2,1,3}
2 | {2,6} | 1 | {5,4,3}
3 | {15,40} | 0 | {7,6,5}
4 | {12,1} | 1 | {4,5,3}
5 | {2,90} | 0 | {9,6,7}
6 | {50,45} | 0 | {6,7,8}
(6 rows)
</pre> Note that the nearest neighbors are sorted from closest to furthest from the corresponding test point; a query to verify this ordering by hand is sketched after the examples.</li>
<li>Run KNN for regression: <pre class="example">
DROP TABLE IF EXISTS knn_result_regression;
SELECT * FROM madlib.knn(
'knn_train_data_reg', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_regression', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.dist_norm2' -- Distance function
);
SELECT * FROM knn_result_regression ORDER BY id;
</pre> Result: <pre class="result">
id | data | prediction | k_nearest_neighbours
----+---------+-------------------+----------------------
1 | {2,1} | 1 | {2,1,3}
2 | {2,6} | 1 | {5,4,3}
3 | {15,40} | 0.333333333333333 | {7,6,5}
4 | {12,1} | 1 | {4,5,3}
5 | {2,90} | 0 | {9,6,7}
6 | {50,45} | 0 | {6,7,8}
(6 rows)
</pre></li>
<li>List nearest neighbors only, without doing classification or regression: <pre class="example">
DROP TABLE IF EXISTS knn_result_list_neighbors;
SELECT * FROM madlib.knn(
'knn_train_data_reg', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
NULL, -- NULL label means only list neighbors
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_list_neighbors', -- Output table
3 -- Number of nearest neighbors
);
SELECT * FROM knn_result_list_neighbors ORDER BY id;
</pre> Result, with neighbors sorted from closest to furthest: <pre class="result">
id | data | k_nearest_neighbours
----+---------+----------------------
1 | {2,1} | {2,1,3}
2 | {2,6} | {5,4,3}
3 | {15,40} | {7,6,5}
4 | {12,1} | {4,5,3}
5 | {2,90} | {9,6,7}
6 | {50,45} | {6,7,8}
(6 rows)
</pre></li>
<li>Run KNN for classification using the weighted average: <pre class="example">
DROP TABLE IF EXISTS knn_result_classification;
SELECT * FROM madlib.knn(
'knn_train_data', -- Table of training data
'data', -- Col name of training data
'id', -- Col name of id in train data
'label', -- Training labels
'knn_test_data', -- Table of test data
'data', -- Col name of test data
'id', -- Col name of id in test data
'knn_result_classification', -- Output table
3, -- Number of nearest neighbors
True, -- True to list nearest-neighbors by id
'madlib.squared_dist_norm2', -- Distance function
True -- For weighted average
);
SELECT * FROM knn_result_classification ORDER BY id;
</pre> <pre class="result">
id | data | prediction | k_nearest_neighbours
----+---------+---------------------+----------------------
1 | {2,1} | 1 | {2,1,3}
2 | {2,6} | 1 | {5,4,3}
3 | {15,40} | 0 | {7,6,5}
4 | {12,1} | 1 | {4,5,3}
5 | {2,90} | 0 | {9,6,7}
6 | {50,45} | 0 | {6,7,8}
(6 rows)
</pre></li>
</ol>
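<p>To verify the neighbor ordering by hand, the distances behind these results can be computed directly. The following is a sketch against the example tables above (not part of the knn interface); for test point 1, it lists the three closest training points: </p><pre class="example">
SELECT train.id,
       madlib.squared_dist_norm2(
           train.data::DOUBLE PRECISION[],
           test.data::DOUBLE PRECISION[]) AS dist
FROM knn_train_data AS train, knn_test_data AS test
WHERE test.id = 1
ORDER BY dist
LIMIT 3;
</pre><p>Training points 1 and 2 tie at distance 1 and point 3 follows at distance 5, which matches the reported neighbor list {2,1,3} up to the arbitrary ordering of ties.</p>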
<p><a class="anchor" id="background"></a></p><dl class="section user"><dt>Technical Background</dt><dd></dd></dl>
<p>The training data points are vectors in a multidimensional feature space, each with a class label. The training phase of the algorithm consists only of storing the feature vectors and class labels of the training points.</p>
<p>In the classification phase, k is a user-defined constant, and an unlabeled vector (a test point) is classified by assigning the label that is most frequent among the k training samples nearest to that test point. In the case of regression, the average of the values of these k training samples is assigned to the test point.</p>
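<p>This voting rule can be spelled out in SQL. Extending the distance query sketched after the examples above (again reusing the example tables, not the module's internal implementation), the following classifies test point 1 by majority vote over its 3 nearest neighbors: </p><pre class="example">
SELECT label, count(*) AS votes
FROM (
    SELECT train.label
    FROM knn_train_data AS train, knn_test_data AS test
    WHERE test.id = 1
    ORDER BY madlib.squared_dist_norm2(
        train.data::DOUBLE PRECISION[],
        test.data::DOUBLE PRECISION[])
    LIMIT 3
) AS neighbors
GROUP BY label
ORDER BY votes DESC
LIMIT 1;
</pre><p>This returns label 1 with 3 votes, in agreement with the classification example above.</p>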
<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p><a class="anchor" id="knn-lit-1"></a>[1] Wikipedia, k-nearest neighbors algorithm, <a href="https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm">https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm</a></p>
<p><a class="anchor" id="knn-lit-2"></a>[2] N. S. Altman: An Introduction to Kernel and Nearest-Neighbor Nonparametric Regression <a href="http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf">http://www.stat.washington.edu/courses/stat527/s13/readings/Altman_AmStat_1992.pdf</a></p>
<p><a class="anchor" id="knn-lit-3"></a>[3] Gongde Guo1, Hui Wang, David Bell, Yaxin Bi, Kieran Greer: KNN Model-Based Approach in Classification, <a href="https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf">https://ai2-s2-pdfs.s3.amazonaws.com/a7e2/814ec5db800d2f8c4313fd436e9cf8273821.pdf</a></p>
<p><a class="anchor" id="knn-lit-4"></a>[4] Shepard, Donald (1968). "A two-dimensional interpolation function for
irregularly-spaced data". Proceedings of the 1968 ACM National Conference. pp. 517–524.</p>
</div><!-- contents -->