blob: d85b7a8d03eebbf57bf9ed0c481971c1cc8eba2c [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file svdmf.sql_in
*
* @brief SQL functions for SVD Matrix Factorization
* @date January 2011
*
* @sa For a brief introduction to SVD Matrix Factorization, see the module
* description \ref grp_svd.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_svdmf
\warning <em> This is an old implementation of Matrix Decomposition and
has been deprecated. For SVD decomposition, please see \ref grp_svd;
for the latest version of low-rank approximation, please see \ref grp_lmf</em>
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#syntax">SVD Function Syntax</a></li>
<li><a href="#xamples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Computes low-rank approximation of a sparse matrix.
This module implements "partial SVD decomposition" method for representing a sparse matrix using a low-rank approximation.
Mathematically, this algorithm seeks to find matrices U and V that, for any given A, minimizes:\n
\f[ ||\boldsymbol A - \boldsymbol UV ||_2
\f]
subject to \f$ rank(\boldsymbol UV) \leq k \f$, where \f$ ||\cdot||_2 \f$ denotes the Frobenius norm and \f$ k \leq rank(\boldsymbol A)\f$.
If A is \f$ m \times n \f$, then U will be \f$ m \times k \f$ and V will be \f$ k \times n \f$.
This algorithm is not intended to do the full decomposition, or to be used as part of
inverse procedure. It effectively computes the SVD of a low-rank approximation of A (preferably sparse), with the singular values absorbed in U and V.
Code is based on the write-up as appears at [1], with some modifications.
@anchor syntax
@par Function Syntax
The SVD function is called as follows:
<pre class="syntax">
svdmf_run( input_table,
col_name,
row_name,
value, num_features)
</pre>
The <b>input matrix</b> is expected to be of the following form:
<pre>{TABLE|VIEW} <em>input_table</em> (
<em>col_num</em> INTEGER,
<em>row_num</em> INTEGER,
<em>value</em> FLOAT
)</pre>
Input is contained in a table where column number and row number for each cell
are sequential; that is to say that if the data was written as a matrix, those values would be the
actual row and column numbers and not some random identifiers. All rows and columns must be associated with a value.
There should not be any missing row, columns or values.
The function returns two tables \c matrix_u and \c matrix_v, which represent the matrices U and V in table format.
@anchor examples
@examp
-# Prepare an input table/view.
<pre class="example">
CREATE TABLE svd_test ( col INT,
row INT,
val FLOAT
);
</pre>
-# Populate the input table with some data.
<pre class="example">
INSERT INTO svd_test SELECT ( g.a%1000)+1, g.a/1000+1, random()
FROM generate_series(1,1000) AS g(a);
</pre>
-# Call the svdmf_run() stored procedure.
<pre class="example">
SELECT madlib.svdmf_run( 'svd_test',
'col',
'row',
'val',
3);
</pre>
Example result:
<pre class="result">
INFO: ('Started svdmf_run() with parameters:',)
INFO: (' * input_matrix = madlib_svdsparse_test.test',)
INFO: (' * col_name = col_num',)
INFO: (' * row_name = row_num',)
INFO: (' * value = val',)
INFO: (' * num_features = 3',)
INFO: ('Copying the source data into a temporary table...',)
INFO: ('Estimating feature: 1',)
INFO: ('...Iteration 1: residual_error = 33345014611.1, step_size = 4.9997500125e-10, min_improvement = 1.0',)
INFO: ('...Iteration 2: residual_error = 33345014557.6, step_size = 5.49972501375e-10, min_improvement = 1.0',)
INFO: ('...Iteration 3: residual_error = 33345014054.3, step_size = 6.04969751512e-10, min_improvement = 1.0',)
...
INFO: ('...Iteration 78: residual_error = 2.02512133868, step_size = 5.78105354457e-10, min_improvement = 1.0',)
INFO: ('...Iteration 79: residual_error = 0.893810181282, step_size = 6.35915889903e-10, min_improvement = 1.0',)
INFO: ('...Iteration 80: residual_error = 0.34496773222, step_size = 6.99507478893e-10, min_improvement = 1.0',)
INFO: ('Swapping residual error matrix...',)
svdmf_run
&nbsp;-------------------------------------------------------------------------------------------
Finished SVD matrix factorisation for madlib_svdsparse_test.test (row_num, col_num, val).
Results:
* total error = 0.34496773222
* number of estimated features = 1
Output:
* table : madlib.matrix_u
* table : madlib.matrix_v
Time elapsed: 4 minutes 47.86839 seconds.
</pre>
@anchor literature
@literature
[1] Simon Funk, Netflix Update: Try This at Home, December 11 2006,
http://sifter.org/~simon/journal/20061211.html
@anchor related
@par Related Topics
File svdmf.sql_in documenting the SQL functions.
@internal
@sa namespace svdmf (documenting the implementation in Python)
@endinternal
*/
/**
* @brief Partial SVD decomposition of a sparse matrix into U and V components
*
* This function takes as input the table representation of a sparse matrix and
* decomposes it into the specified set of most significant features of matrices
* of U and V matrix.
*
* @param input_table Name of the table/view with the source data
* @param col_name Name of the column containing cell column number
* @param row_name Name of the column containing cell row number
* @param value Name of the column containing cell value
* @param num_features Rank of desired approximation
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT
)
RETURNS TEXT
AS $$
PythonFunctionBodyOnly(`svd_mf', `svdmf')
# schema_madlib comes from PythonFunctionBodyOnly
return svdmf.svdmf_run( schema_madlib, input_table, col_name, row_name, value, num_features);
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Partial SVD decomposition of a sparse matrix into U and V components
*
* This function takes as input the table representation of a sparse matrix and
* decomposes it into the specified set of most significant features of matrices
* of U and V matrix.
*
* @param input_table Name of the table/view with the source data
* @param col_name Name of the column containing cell column number
* @param row_name Name of the column containing cell row number
* @param value Name of the column containing cell value
* @param num_features Rank of desired approximation
* @param num_iterations Maximum number if iterations to perform regardless of convergence
* @param min_error Acceptable level of error in convergence.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT, num_iterations INT, min_error FLOAT
)
RETURNS TEXT
AS $$
PythonFunctionBodyOnly(`svd_mf', `svdmf')
# schema_madlib comes from PythonFunctionBodyOnly
return svdmf.svdmf_run_full( schema_madlib, input_table, col_name, row_name, value, num_features, num_iterations, min_error);
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');