blob: e608d0a11a61f2b188bd5950451cea2915374428 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file lmf.sql_in
*
* @brief SQL functions for low-rank matrix factorization
* @date June 2012
*
* @sa For a brief introduction to Low-rank Matrix Factorization, see the module
* description \ref grp_lmf.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_lmf
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#syntax">Function Syntax</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#literature">Literature</a></li>
</ul>
</div>
@brief Performs low-rank matrix factorization for an incomplete matrix.
This module implements "factor model" for representing an incomplete matrix using a low-rank approximation [1].
Mathematically, this model seeks to find matrices U and V (also referred as factors) that, for any given incomplete matrix A, minimizes:
\f[ \|\boldsymbol A - \boldsymbol UV^{T} \|_2 \f]
subject to \f$rank(\boldsymbol UV^{T}) \leq r\f$, where \f$\|\cdot\|_2\f$ denotes the Frobenius norm.
Let \f$A\f$ be a \f$m \times n\f$ matrix, then \f$U\f$ will be \f$m \times r\f$ and \f$V\f$ will be \f$n \times r\f$, in dimension, and \f$1 \leq r \ll \min(m, n)\f$.
This model is not intended to do the full decomposition, or to be used as part of inverse procedure.
This model has been widely used in recommendation systems (e.g., Netflix [2]) and feature selection (e.g., image processing [3]).
@anchor syntax
@par Function Syntax
Low-rank matrix factorization of an incomplete matrix into two factors.
<pre class="syntax">
lmf_igd_run( rel_output,
rel_source,
col_row,
col_column,
col_value,
row_dim,
column_dim,
max_rank,
stepsize,
scale_factor,
num_iterations,
tolerance
)
</pre>
\b Arguments
<dl class="arglist">
<dt>rel_output</dt>
<dd>TEXT. The name of the table to receive the output.
Output factors matrix U and V are in a flattened format.
<pre>RESULT AS (
matrix_u DOUBLE PRECISION[],
matrix_v DOUBLE PRECISION[],
rmse DOUBLE PRECISION
);</pre>
Features correspond to row i is
<code>matrix_u[i:i][1:r]</code>.
Features correspond to column j is
<code>matrix_v[j:j][1:r]</code>.
</dd>
<dt>rel_source</dt>
<dd>TEXT. The name of the table containing the input data.
The input matrix> is expected to be of the following form:
<pre>{TABLE|VIEW} <em>input_table</em> (
<em>row</em> INTEGER,
<em>col</em> INTEGER,
<em>value</em> DOUBLE PRECISION
)</pre>
Input is contained in a table that describes an incomplete matrix, with available entries specified as (row, column, value).
The input matrix should be 1-based, which means row &gt;= 1, and col &gt;= 1.
NULL values are not expected.
</dd>
<dt>col_row</dt>
<dd>TEXT. The name of the column containing the row number.</dd>
<dt>col_column</dt>
<dd>TEXT. The name of the column containing the column number.</dd>
<dt>col_value</dt>
<dd>DOUBLE PRECISION. The value at (row, col).</dd>
<dt>row_dim (optional)</dt>
<dd>INTEGER, default: "SELECT max(col_row) FROM rel_source". The number of columns in the matrix.</dd>
<dt>column_dim (optional)</dt>
<dd>INTEGER, default: "SELECT max(col_col) FROM rel_source". The number of rows in the matrix.</dd>
<dt>max_rank</dt>
<dd>INTEGER, default: 20. The rank of desired approximation.</dd>
<dt>stepsize (optional)</dt>
<dd>DOUBLE PRECISION, default: 0.01. Hyper-parameter that decides how aggressive the gradient steps are. </dd>
<dt>scale_factor (optional)</dt>
<dd>DOUBLE PRECISION, default: 0.1. Hyper-parameter that decides scale of initial factors.</dd>
<dt>num_iterations (optional)</dt>
<dd> INTEGER, default: 10. Maximum number if iterations to perform regardless of convergence.</dd>
<dt>tolerance (optional)</dt>
<dd> DOUBLE PRECISION, default: 0.0001. Acceptable level of error in convergence.</dd>
</dl>
@anchor examples
@examp
-# Prepare an input table/view:
<pre class="example">
DROP TABLE IF EXISTS lmf_data;
CREATE TABLE lmf_data (
row INT,
col INT,
val FLOAT8
);
</pre>
-# Populate the input table with some data.
<pre class="example">
INSERT INTO lmf_data VALUES (1, 1, 5.0);
INSERT INTO lmf_data VALUES (3, 100, 1.0);
INSERT INTO lmf_data VALUES (999, 10000, 2.0);
</pre>
-# Call the lmf_igd_run() stored procedure.
<pre class="example">
DROP TABLE IF EXISTS lmf_model;
SELECT madlib.lmf_igd_run( 'lmf_model',
'lmf_data',
'row',
'col',
'val',
999,
10000,
3,
0.1,
2,
10,
1e-9
);
</pre>
Example result (the exact result may not be the same).
<pre class="result">
NOTICE:
Finished low-rank matrix factorization using incremental gradient
DETAIL:
* table : lmf_data (row, col, val)
Results:
* RMSE = 0.0145966345300041
Output:
* view : SELECT * FROM lmf_model WHERE id = 1
lmf_igd_run
&nbsp;-----------
1
(1 row)
</pre>
-# Sanity check of the result. You may need a model id returned and also indicated by the function lmf_igd_run(), assuming 1 here:
<pre class="example">
SELECT array_dims(matrix_u) AS u_dims, array_dims(matrix_v) AS v_dims
FROM lmf_model
WHERE id = 1;
</pre>
Result:
<pre class="result">
u_dims | v_dims
--------------+----------------
[1:999][1:3] | [1:10000][1:3]
(1 row)
</pre>
-# Query the result value.
<pre class="example">
SELECT matrix_u[2:2][1:3] AS row_2_features
FROM lmf_model
WHERE id = 1;
</pre>
Example output (the exact result may not be the same):
<pre class="result">
row_2_features
&nbsp;---------------------------------------------------------
{{1.12030523084104,0.522217971272767,0.0264869043603539}}
(1 row)
</pre>
-# Make prediction of a missing entry (row=2, col=7654).
<pre class="example">
SELECT madlib.array_dot(
matrix_u[2:2][1:3],
matrix_v[7654:7654][1:3]
) AS row_2_col_7654
FROM lmf_model
WHERE id = 1;
</pre>
Example output (the exact result may not be the same due the randomness of the algorithm):
<pre class="result">
row_2_col_7654
&nbsp;------------------
1.3201582940851
(1 row)
</pre>
@anchor literature
@literature
[1] N. Srebro and T. Jaakkola. “Weighted Low-Rank Approximations.” In: ICML. Ed. by T. Fawcett and N. Mishra. AAAI Press, 2003, pp. 720–727. isbn: 1-57735-189-4.
[2] Simon Funk, Netflix Update: Try This at Home, December 11 2006, http://sifter.org/~simon/journal/20061211.html
[3] J. Wright, A. Ganesh, S. Rao, Y. Peng, and Y. Ma. “Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization.” In: NIPS. Ed. by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta. Curran Associates, Inc., 2009, pp. 2080–2088. isbn: 9781615679119.
*/
DROP TYPE IF EXISTS MADLIB_SCHEMA.lmf_result CASCADE;
CREATE TYPE MADLIB_SCHEMA.lmf_result AS (
matrix_u DOUBLE PRECISION[],
matrix_v DOUBLE PRECISION[],
rmse DOUBLE PRECISION
);
--------------------------------------------------------------------------
-- create SQL functions for IGD optimizer
--------------------------------------------------------------------------
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.lmf_igd_transition(
float8[], smallint, smallint, float8, float8[], smallint,
smallint, smallint, float8, float8) CASCADE;
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_transition(
state DOUBLE PRECISION[],
row_num INTEGER,
column_num INTEGER,
val DOUBLE PRECISION,
previous_state DOUBLE PRECISION[],
row_dim INTEGER,
column_dim INTEGER,
max_rank INTEGER,
stepsize DOUBLE PRECISION,
scale_factor DOUBLE PRECISION)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_merge(
state1 DOUBLE PRECISION[],
state2 DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_final(
state DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
/**
* @internal
* @brief Perform one iteration of the incremental gradient
* method for computing low-rank matrix factorization
*/
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.lmf_igd_step(
INTEGER, INTEGER, DOUBLE PRECISION, DOUBLE PRECISION[],
INTEGER, INTEGER, INTEGER, DOUBLE PRECISION, DOUBLE PRECISION);
CREATE AGGREGATE MADLIB_SCHEMA.lmf_igd_step(
/*+ row_num */ INTEGER,
/*+ column_num */ INTEGER,
/*+ val */ DOUBLE PRECISION,
/*+ previous_state */ DOUBLE PRECISION[],
/*+ row_dim */ INTEGER,
/*+ column_dim */ INTEGER,
/*+ max_rank */ INTEGER,
/*+ stepsize */ DOUBLE PRECISION,
/*+ scale_factor */ DOUBLE PRECISION) (
STYPE=DOUBLE PRECISION[],
SFUNC=MADLIB_SCHEMA.lmf_igd_transition,
-- m4_ifdef(`__GREENPLUM__',`PREFUNC=MADLIB_SCHEMA.lmf_igd_merge,')
FINALFUNC=MADLIB_SCHEMA.lmf_igd_final,
INITCOND='{0,0,0,0,0,0,0,0,0}'
);
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_distance(
/*+ state1 */ DOUBLE PRECISION[],
/*+ state2 */ DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_result(
/*+ state */ DOUBLE PRECISION[])
RETURNS MADLIB_SCHEMA.lmf_result AS
'MODULE_PATHNAME'
LANGUAGE c IMMUTABLE STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_execute_using_lmf_igd_args(
sql VARCHAR, INTEGER, INTEGER, INTEGER, DOUBLE PRECISION,
DOUBLE PRECISION, INTEGER, DOUBLE PRECISION
) RETURNS VOID
IMMUTABLE
CALLED ON NULL INPUT
LANGUAGE c
AS 'MODULE_PATHNAME', 'exec_sql_using'
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_compute_lmf_igd(
rel_args VARCHAR,
rel_state VARCHAR,
rel_source VARCHAR,
col_row VARCHAR,
col_column VARCHAR,
col_value VARCHAR)
RETURNS INTEGER
AS $$PythonFunction(convex, lmf_igd, compute_lmf_igd)$$
LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Low-rank matrix factorization of a incomplete matrix into two factors
*
* This function takes as input the table representation of a incomplete matrix
* in the sparse (i, j, value) format and decomposes it into the specified set
* of most significant features of matrices of U and V matrix. The input matrix
* is expected to have dimension [1:row_dim][1:column_dim], but in sparse
* format.
*
* @param rel_output Name of the table that the factors will be appended to
* @param rel_source Name of the table/view with the source data
* @param col_row Name of the column containing cell row number
* @param col_column Name of the column containing cell column number
* @param col_value Name of the column containing cell value
* @param row_dim Maximum number of rows of input
* @param column_dim Maximum number of columns of input
* @param max_rank Rank of desired approximation
* @param stepsize Hyper-parameter that decides how aggressive that the gradient steps are
* @param scale_factor Hyper-parameter that decides scale of initial factors
* @param num_iterations Maximum number if iterations to perform regardless of convergence
* @param tolerance Acceptable level of error in convergence.
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
rel_output VARCHAR,
rel_source REGCLASS,
col_row VARCHAR,
col_column VARCHAR,
col_value VARCHAR,
row_dim INTEGER /*+ DEFAULT 'SELECT max(col_row) FROM rel_source' */,
column_dim INTEGER /*+ DEFAULT 'SELECT max(col_col) FROM rel_source' */,
max_rank INTEGER /*+ DEFAULT 20 */,
stepsize DOUBLE PRECISION /*+ DEFAULT 0.01 */,
scale_factor DOUBLE PRECISION /*+ DEFAULT 0.1 */,
num_iterations INTEGER /*+ DEFAULT 10 */,
tolerance DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
RETURNS INTEGER AS $$
DECLARE
iteration_run INTEGER;
model_id INTEGER;
rmse DOUBLE PRECISION;
old_messages VARCHAR;
rel_output_id VARCHAR;
BEGIN
RAISE NOTICE 'Matrix % to be factorized: % x %', rel_source, row_dim, column_dim;
-- We first setup the argument table. Rationale: We want to avoid all data
-- conversion between native types and Python code. Instead, we use Python
-- as a pure driver layer.
old_messages :=
(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
EXECUTE 'SET client_min_messages TO warning';
PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
-- Unfortunately, the EXECUTE USING syntax is only available starting
-- PostgreSQL 8.4:
-- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
-- We therefore have to emulate.
PERFORM MADLIB_SCHEMA.internal_execute_using_lmf_igd_args($sql$
DROP TABLE IF EXISTS pg_temp._madlib_lmf_igd_args;
CREATE TABLE pg_temp._madlib_lmf_igd_args AS
SELECT
$1 AS row_dim,
$2 AS column_dim,
$3 AS max_rank,
$4 AS stepsize,
$5 AS scale_factor,
$6 AS num_iterations,
$7 AS tolerance;
$sql$,
row_dim, column_dim, max_rank, stepsize,
scale_factor, num_iterations, tolerance);
EXECUTE 'SET client_min_messages TO ' || old_messages;
-- Perform acutal computation.
-- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
-- operators from regclass to varchar/text.
iteration_run := MADLIB_SCHEMA.internal_compute_lmf_igd(
'_madlib_lmf_igd_args', '_madlib_lmf_igd_state',
textin(regclassout(rel_source)), col_row, col_column, col_value);
-- create result table if it does not exist
BEGIN
EXECUTE 'SELECT 1 FROM ' || rel_output || ' LIMIT 0';
EXCEPTION
WHEN undefined_table THEN
EXECUTE '
CREATE TABLE ' || rel_output || ' (
id SERIAL,
matrix_u DOUBLE PRECISION[],
matrix_v DOUBLE PRECISION[],
rmse DOUBLE PRECISION)';
END;
-- A work-around for GPDB not supporting RETURNING for INSERT
-- We generate an id using nextval before INSERT
rel_output_id := trim(rel_output);
rel_output_id := case when substring(rel_output_id, 1, 1 ) = '"' and substring(rel_output_id, length(rel_output_id), 1) = '"' then substring(rel_output_id, 1, length(rel_output_id) -1) || '_id_seq' || '"' else rel_output_id || '_id_seq' end;
EXECUTE '
SELECT nextval(' || quote_literal(rel_output_id) ||'::regclass)'
INTO model_id;
-- output model
-- Retrieve result from state table and insert it
EXECUTE '
INSERT INTO ' || rel_output || '
SELECT ' || model_id || ', (result).*
FROM (
SELECT MADLIB_SCHEMA.internal_lmf_igd_result(_state) AS result
FROM _madlib_lmf_igd_state
WHERE _iteration = ' || iteration_run || '
) subq';
EXECUTE '
SELECT rmse
FROM ' || rel_output || '
WHERE id = ' || model_id
INTO rmse;
-- return description
RAISE NOTICE '
Finished low-rank matrix factorization using incremental gradient
* table : % (%, %, %)
Results:
* RMSE = %
Output:
* view : SELECT * FROM % WHERE id = %',
rel_source, col_row, col_column, col_value, rmse, rel_output, model_id;
RETURN model_id;
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
rel_output VARCHAR,
rel_source REGCLASS,
col_row VARCHAR,
col_column VARCHAR,
col_value VARCHAR,
row_dim INTEGER,
column_dim INTEGER,
max_rank INTEGER,
stepsize DOUBLE PRECISION,
scale_factor DOUBLE PRECISION)
RETURNS INTEGER AS $$
SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 10, 0.0001);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
rel_output VARCHAR,
rel_source REGCLASS,
col_row VARCHAR,
col_column VARCHAR,
col_value VARCHAR,
row_dim INTEGER,
column_dim INTEGER,
max_rank INTEGER,
stepsize DOUBLE PRECISION)
RETURNS INTEGER AS $$
-- set scale_factor as default 0.1
SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, 0.1);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
rel_output VARCHAR,
rel_source REGCLASS,
col_row VARCHAR,
col_column VARCHAR,
col_value VARCHAR,
row_dim INTEGER,
column_dim INTEGER,
max_rank INTEGER)
RETURNS INTEGER AS $$
-- set stepsize as default 0.01
SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, 0.01);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
rel_output VARCHAR,
rel_source REGCLASS,
col_row VARCHAR,
col_column VARCHAR,
col_value TEXT)
RETURNS INTEGER AS $$
DECLARE
row_dim INTEGER;
column_dim INTEGER;
BEGIN
EXECUTE '
SELECT max(' || col_row || '), max(' || col_column || ')
FROM ' || textin(regclassout(rel_source))
INTO row_dim, column_dim;
RETURN (SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, row_dim, column_dim, 20));
END;
$$ LANGUAGE plpgsql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');