blob: ac75f77a2f1dc96315f6a0ecffd416be178c7beb [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file svdmf.sql_in
*
* @brief SQL functions for SVD Matrix Factorization
* @date January 2011
*
* @sa For a brief introduction to SVD Matrix Factorization, see the module
* description \ref grp_svdmf.
*
*//* ----------------------------------------------------------------------- */
/**
@addtogroup grp_svdmf
@about
This module implements "partial SVD decomposition" method for a table
representing a sparse matrix. Code is based on the write-up as appears at
[1], with some modifications.
This algorithm is not intended to do the full decomposition, or to be used as part of
inverse procedure. It is meant to compute a low-rank approximation of the U and V matrices, which
is used in machine learning applications.
It expects input to be contained in a table where column number and row number for each cell
are sequential; that is to say that if the data was written as a matrix, those values would be the
actual row and column numbers and not some random identifiers. Hence each row and each column must
have at least one value.
@prereq
None.
@usage
Function: <tt>svdmf_run( '<em>input_table</em>', '<em>col_name</em>',
'<em>row_name</em>', '<em>value</em>', <em>num_features</em>)</tt>
Parameters:
- <em>input_table</em> : name of the table/view with the source data
- <em>col_name</em> : name of the column containing cell column number
- <em>row_name</em> : name of the column containing cell row number
- <em>value</em> : name of the column containing cell value
- <em>num_features</em> : number of features to specify
@examp
1) Prepare an input table/view:
\code
CREATE TABLE svd_test (
col INT,
row INT,
val FLOAT
);
\endcode
2) Populate the input table with some data. For example:
\code
SQL> INSERT INTO svd_test SELECT (g.a%1000)+1, g.a/1000+1, random() FROM generate_series(1,1000) AS g(a);
\endcode
3) Call svdmf_run() stored procedure, e.g.:
\code
SQL> select madlib.svdmf_run( 'svd_test', 'col', 'row', 'val', 3);
\endcode
4) Sample Output:
\code
INFO: ('Started svdmf_run() with parameters:',)
INFO: (' * input_matrix = madlib_svdsparse_test.test',)
INFO: (' * col_name = col_num',)
INFO: (' * row_name = row_num',)
INFO: (' * value = val',)
INFO: (' * num_features = 3',)
INFO: ('Copying the source data into a temporary table...',)
INFO: ('Estimating feature: 1',)
INFO: ('...Iteration 1: residual_error = 33345014611.1, step_size = 4.9997500125e-10, min_improvement = 1.0',)
INFO: ('...Iteration 2: residual_error = 33345014557.6, step_size = 5.49972501375e-10, min_improvement = 1.0',)
INFO: ('...Iteration 3: residual_error = 33345014054.3, step_size = 6.04969751512e-10, min_improvement = 1.0',)
...
INFO: ('...Iteration 78: residual_error = 2.02512133868, step_size = 5.78105354457e-10, min_improvement = 1.0',)
INFO: ('...Iteration 79: residual_error = 0.893810181282, step_size = 6.35915889903e-10, min_improvement = 1.0',)
INFO: ('...Iteration 80: residual_error = 0.34496773222, step_size = 6.99507478893e-10, min_improvement = 1.0',)
INFO: ('Swapping residual error matrix...',)
svdmf_run
--------------------------------------------------------------------------------------------
Finished SVD matrix factorisation for madlib_svdsparse_test.test (row_num, col_num, val).
Results:
* total error = 0.34496773222
* number of estimated features = 1
Output:
* table : madlib.matrix_u
* table : madlib.matrix_v
Time elapsed: 4 minutes 47.86839 seconds.
\endcode
@sa file svdmf.sql_in (documenting the SQL functions)
@internal
@sa namespace svdmf (documenting the implementation in Python)
@endinternal
@literature
[1] Simon Funk, Netflix Update: Try This at Home, December 11 2006,
http://sifter.org/~simon/journal/20061211.html
*/
/**
* @brief Partial SVD decomposition of a sparse matrix into U and V components
*
* This function takes as input the table representation of a sparse matrix and
* decomposes it into the specified set of most significant features of matrices
* of U and V matrix. With Delta values being randomly distributed between U
* and V.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.svdmf_run(
input_table TEXT, col_name TEXT, row_name TEXT, value TEXT, num_features INT
)
RETURNS TEXT
AS $$
import sys
try:
from madlib import svdmf
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import svdmf
plpy.execute( 'set client_min_messages=warning');
return svdmf.svdmf_run( input_table, col_name, row_name, value, num_features);
$$ LANGUAGE plpythonu;