src/ports/postgres/modules/convex/lmf.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file lmf.sql_in
  *
  * @brief SQL functions for low-rank matrix factorization
  * @date June 2012
  *
  * @sa For a brief introduction to Low-rank Matrix Factorization, see the module
  *     description \ref grp_lmf.
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')

 /**
 @addtogroup grp_lmf


 @about

 This module implements "factor model" for representing an incomplete matrix using a low-rank approximation [1].
 Mathematically, this model seeks to find matrices U and V (also referred as factors) that, for any given incomplete matrix A, minimizes:
 \f[ \|\boldsymbol A - \boldsymbol UV^{T} \|_2 \f]
 subject to \f$rank(\boldsymbol UV^{T}) \leq r\f$, where \f$\|\cdot\|_2\f$ denotes the Frobenius norm.
 Let \f$A\f$ be a \f$m \times n\f$ matrix, then \f$U\f$ will be \f$m \times r\f$ and \f$V\f$ will be \f$n \times r\f$, in dimension, and \f$1 \leq r \ll \min(m, n)\f$.
 This model is not intended to do the full decomposition, or to be used as part of inverse procedure.
 This model has been widely used in recommendation systems (e.g., Netflix [2]) and feature selection (e.g., image processing [3]).


 @input

 The <b>input matrix</b> is expected to be of the following form:
 <pre>{TABLE|VIEW} <em>input_table</em> (
     <em>row</em>    INTEGER,
     <em>col</em>    INTEGER,
     <em>value</em>  DOUBLE PRECISION
 )</pre>

 Input is contained in a table that describes an incomplete matrix, by having available entries specified as (row, column, value).
 The input matrix is expected to be based 1, which means row >= 1, and col >= 1.
 NULL values are not expected.


 @usage

 Please find descriptions of SQL functions in lmf.sql_in

 Output factors matrix U and V are in flatten format.
 <pre>RESULT AS (
         matrix_u    DOUBLE PRECISION[],
         matrix_v    DOUBLE PRECISION[],
         rmse        DOUBLE PRECISION
 );</pre>

 Features correspond to row i is
 <code>matrix_u[i:i][1:r]</code>.
 Features correspond to column j is
 <code>matrix_v[j:j][1:r]</code>.


 @examp

 -# Prepare an input table/view:
 \code
 CREATE TABLE lmf_data (
  column INT,
  row    INT,
  value  FLOAT8
 );
 \endcode
 -# Populate the input table with some data. e.g.:
 \code
 INSERT INTO lmf_data VALUES (1, 1, 5.0);
 INSERT INTO lmf_data VALUES (3, 100, 1.0);
 INSERT INTO lmf_data VALUES (999, 10000, 2.0);
 \endcode
 -# Call lmf_igd_run() stored procedure, e.g.:
 \code
 SELECT madlib.lmf_igd_run(
 'lmf_model',                 -- result table
 'lmf_data',                  -- input table
 'row', 'col', 'value',       -- table column names
 999,                         -- row dimension
 10000,                       -- column dimension
 3,                           -- rank (number of features)
 0.1,                         -- stepsize
 2,                           -- initial value scale factor
 10,                          -- maximal number of iterations
 1e-9);                       -- error tolerance
 \endcode
 Example output (the exact result may not be the same):
 \code
 NOTICE:
 Finished low-rank matrix factorization using incremental gradient
 DETAIL:
  * table : lmf_data (row, col, value)
 Results:
  * RMSE = 4.31144557397543e-05
 Output:
  * view : SELECT * FROM lmf_model WHERE id = 1
  lmf_igd_run
 -------------
            1
 (1 row)
 \endcode
 -# Sanity check of the result. You may need a model id returned and also indicated by the function lmf_igd_run(), assuming 1 here, e.g.:
 \code
 SELECT array_dims(matrix_u), array_dims(matrix_v) FROM lmf_model WHERE id = 1;
 \endcode
 Example output:
 \code
   array_dims  |   array_dims
 --------------+----------------
  [1:999][1:3] | [1:10000][1:3]
 (1 row)
 \endcode
 -# Query the result value, e.g.:
 \code
 SELECT matrix_u[2:2][1:3] AS row_2_features FROM lmf_model WHERE id = 1;
 \endcode
 Example output (the exact result may not be the same):
 \code
                       row_2_features
 ----------------------------------------------------------
  {{0.51117920037359,0.169582297094166,0.837417622096837}}
 (1 row)
 \endcode


 @literature

 [1] N. Srebro and T. Jaakkola. “Weighted Low-Rank Approximations.” In: ICML. Ed. by T. Fawcett and N. Mishra. AAAI Press, 2003, pp. 720–727. isbn: 1-57735-189-4.

 [2] Simon Funk, Netflix Update: Try This at Home, December 11 2006, http://sifter.org/~simon/journal/20061211.html

 [3] J. Wright, A. Ganesh, S. Rao, Y. Peng, and Y. Ma. “Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization.” In: NIPS. Ed. by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta. Curran Associates, Inc., 2009, pp. 2080–2088. isbn: 9781615679119.

 */

 CREATE TYPE MADLIB_SCHEMA.lmf_result AS (
         matrix_u    DOUBLE PRECISION[],
         matrix_v    DOUBLE PRECISION[],
         rmse        DOUBLE PRECISION
 );

 --------------------------------------------------------------------------
 -- create SQL functions for IGD optimizer
 --------------------------------------------------------------------------
 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_transition(
         state           DOUBLE PRECISION[],
         row_num         SMALLINT,
         column_num      SMALLINT,
         val             DOUBLE PRECISION,
         previous_state  DOUBLE PRECISION[],
         row_dim         SMALLINT,
         column_dim      SMALLINT,
         max_rank        SMALLINT,
         stepsize        DOUBLE PRECISION,
         scale_factor    DOUBLE PRECISION)
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE;

 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_merge(
         state1 DOUBLE PRECISION[],
         state2 DOUBLE PRECISION[])
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;

 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_final(
         state DOUBLE PRECISION[])
 RETURNS DOUBLE PRECISION[]
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT;

 /**
  * @internal
  * @brief Perform one iteration of the incremental gradient
  *        method for computing low-rank matrix factorization
  */
 CREATE AGGREGATE MADLIB_SCHEMA.lmf_igd_step(
         /*+ row_num */          SMALLINT,
         /*+ column_num */       SMALLINT,
         /*+ val */              DOUBLE PRECISION,
         /*+ previous_state */   DOUBLE PRECISION[],
         /*+ row_dim */          SMALLINT,
         /*+ column_dim */       SMALLINT,
         /*+ max_rank */         SMALLINT,
         /*+ stepsize */         DOUBLE PRECISION,
         /*+ scale_factor */     DOUBLE PRECISION) (
     STYPE=DOUBLE PRECISION[],
     SFUNC=MADLIB_SCHEMA.lmf_igd_transition,
     m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.lmf_igd_merge,')
     FINALFUNC=MADLIB_SCHEMA.lmf_igd_final,
     INITCOND='{0,0,0,0,0,0,0,0,0}'
 );

 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_distance(
     /*+ state1 */ DOUBLE PRECISION[],
     /*+ state2 */ DOUBLE PRECISION[])
 RETURNS DOUBLE PRECISION AS
 'MODULE_PATHNAME'
 LANGUAGE c IMMUTABLE STRICT;

 CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_result(
     /*+ state */ DOUBLE PRECISION[])
 RETURNS MADLIB_SCHEMA.lmf_result AS
 'MODULE_PATHNAME'
 LANGUAGE c IMMUTABLE STRICT;


 CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_lmf_igd_args(
     sql VARCHAR, INTEGER, INTEGER, INTEGER, DOUBLE PRECISION,
     DOUBLE PRECISION, INTEGER, DOUBLE PRECISION
 ) RETURNS VOID
 IMMUTABLE
 CALLED ON NULL INPUT
 LANGUAGE c
 AS 'MODULE_PATHNAME', 'exec_sql_using';

 CREATE FUNCTION MADLIB_SCHEMA.internal_compute_lmf_igd(
     rel_args        VARCHAR,
     rel_state       VARCHAR,
     rel_source      VARCHAR,
     col_row         VARCHAR,
     col_column      VARCHAR,
     col_value       VARCHAR)
 RETURNS INTEGER
 AS $$PythonFunction(convex, lmf_igd, compute_lmf_igd)$$
 LANGUAGE plpythonu VOLATILE;

 /**
  * @brief Low-rank matrix factorization of a incomplete matrix into two factors
  *
  * This function takes as input the table representation of a incomplete matrix
  * in the sparse (i, j, value) format and decomposes it into the specified set
  * of most significant features of matrices of U and V matrix. The input matrix
  * is expected to have dimension [1:row_dim][1:column_dim], but in sparse
  * format.
  *
  *   @param rel_output  Name of the table that the factors will be appended to
  *   @param rel_source  Name of the table/view with the source data
  *   @param col_row  Name of the column containing cell row number
  *   @param col_column  Name of the column containing cell column number
  *   @param col_value  Name of the column containing cell value
  *   @param row_dim  Maximum number of rows of input
  *   @param column_dim  Maximum number of columns of input
  *   @param max_rank  Rank of desired approximation
  *   @param stepsize  Hyper-parameter that decides how aggressive that the gradient steps are
  *   @param scale_factor  Hyper-parameter that decides scale of initial factors
  *   @param num_iterations  Maximum number if iterations to perform regardless of convergence
  *   @param tolerance  Acceptable level of error in convergence.
  *
  */
 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
     rel_output      VARCHAR,
     rel_source      REGCLASS,
     col_row         VARCHAR,
     col_column      VARCHAR,
     col_value       VARCHAR,
     row_dim         INTEGER /*+ DEFAULT 'SELECT max(col_row) FROM rel_source' */,
     column_dim      INTEGER /*+ DEFAULT 'SELECT max(col_col) FROM rel_source' */,
     max_rank        INTEGER /*+ DEFAULT 20 */,
     stepsize        DOUBLE PRECISION /*+ DEFAULT 0.01 */,
     scale_factor    DOUBLE PRECISION /*+ DEFAULT 0.1 */,
     num_iterations  INTEGER /*+ DEFAULT 10 */,
     tolerance       DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
 RETURNS INTEGER AS $$
 DECLARE
     iteration_run   INTEGER;
     model_id        INTEGER;
     rmse            DOUBLE PRECISION;
     old_messages    VARCHAR;
 BEGIN
     RAISE NOTICE 'Matrix % to be factorized: % x %', rel_source, row_dim, column_dim;

     -- We first setup the argument table. Rationale: We want to avoid all data
     -- conversion between native types and Python code. Instead, we use Python
     -- as a pure driver layer.
     old_messages :=
         (SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
     EXECUTE 'SET client_min_messages TO warning';
     PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
     -- Unfortunately, the EXECUTE USING syntax is only available starting
     -- PostgreSQL 8.4:
     -- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
     -- We therefore have to emulate.
     PERFORM MADLIB_SCHEMA.internal_execute_using_lmf_igd_args($sql$
         DROP TABLE IF EXISTS pg_temp._madlib_lmf_igd_args;
         CREATE TABLE pg_temp._madlib_lmf_igd_args AS
         SELECT
             $1 AS row_dim,
             $2 AS column_dim,
             $3 AS max_rank,
             $4 AS stepsize,
             $5 AS scale_factor,
             $6 AS num_iterations,
             $7 AS tolerance;
         $sql$,
         row_dim, column_dim, max_rank, stepsize,
         scale_factor, num_iterations, tolerance);
     EXECUTE 'SET client_min_messages TO ' || old_messages;

     -- Perform acutal computation.
     -- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
     -- operators from regclass to varchar/text.
     iteration_run := MADLIB_SCHEMA.internal_compute_lmf_igd(
             '_madlib_lmf_igd_args', '_madlib_lmf_igd_state',
             textin(regclassout(rel_source)), col_row, col_column, col_value);

     -- create result table if it does not exist
     BEGIN
         EXECUTE 'SELECT 1 FROM ' || rel_output || ' LIMIT 0';
     EXCEPTION
         WHEN undefined_table THEN
             EXECUTE '
             CREATE TABLE ' || rel_output || ' (
                 id          SERIAL,
                 matrix_u    DOUBLE PRECISION[],
                 matrix_v    DOUBLE PRECISION[],
                 rmse        DOUBLE PRECISION)';
     END;

     -- A work-around for GPDB not supporting RETURNING for INSERT
     -- We generate an id using nextval before INSERT
     EXECUTE '
     SELECT nextval(' || quote_literal(rel_output || '_id_seq') ||'::regclass)'
     INTO model_id;

     -- output model
     -- Retrieve result from state table and insert it
     EXECUTE '
     INSERT INTO ' || rel_output || '
     SELECT ' || model_id || ', (result).*
     FROM (
         SELECT MADLIB_SCHEMA.internal_lmf_igd_result(_state) AS result
         FROM _madlib_lmf_igd_state
         WHERE _iteration = ' || iteration_run || '
         ) subq';

     EXECUTE '
     SELECT rmse
     FROM ' || rel_output || '
     WHERE id = ' || model_id
     INTO rmse;

     -- return description
     RAISE NOTICE '
 Finished low-rank matrix factorization using incremental gradient
  * table : % (%, %, %)
 Results:
  * RMSE = %
 Output:
  * view : SELECT * FROM % WHERE id = %',
     rel_source, col_row, col_column, col_value, rmse, rel_output, model_id;

     RETURN model_id;
 END;
 $$ LANGUAGE plpgsql VOLATILE;

 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
     rel_output      VARCHAR,
     rel_source      REGCLASS,
     col_row         VARCHAR,
     col_column      VARCHAR,
     col_value       VARCHAR,
     row_dim         INTEGER,
     column_dim      INTEGER,
     max_rank        INTEGER,
     stepsize        DOUBLE PRECISION,
     scale_factor    DOUBLE PRECISION)
 RETURNS INTEGER AS $$
     SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 10, 0.0001);
 $$ LANGUAGE sql VOLATILE;

 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
     rel_output      VARCHAR,
     rel_source      REGCLASS,
     col_row         VARCHAR,
     col_column      VARCHAR,
     col_value       VARCHAR,
     row_dim         INTEGER,
     column_dim      INTEGER,
     max_rank        INTEGER,
     stepsize        DOUBLE PRECISION)
 RETURNS INTEGER AS $$
     -- set scale_factor as default 0.1
     SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, 0.1);
 $$ LANGUAGE sql VOLATILE;

 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
     rel_output      VARCHAR,
     rel_source      REGCLASS,
     col_row         VARCHAR,
     col_column      VARCHAR,
     col_value       VARCHAR,
     row_dim         INTEGER,
     column_dim      INTEGER,
     max_rank        INTEGER)
 RETURNS INTEGER AS $$
     -- set stepsize as default 0.01
     SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, 0.01);
 $$ LANGUAGE sql VOLATILE;

 CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
     rel_output      VARCHAR,
     rel_source      REGCLASS,
     col_row         VARCHAR,
     col_column      VARCHAR,
     col_value       TEXT)
 RETURNS INTEGER AS $$
 DECLARE
     row_dim INTEGER;
     column_dim INTEGER;
 BEGIN
     EXECUTE '
     SELECT max(' || col_row || '), max(' || col_column || ')
     FROM ' || textin(regclassout(rel_source))
     INTO row_dim, column_dim;

     RETURN (SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, row_dim, column_dim, 20));
 END;
 $$ LANGUAGE plpgsql VOLATILE;
	/* ----------------------------------------------------------------------- //*
	*
	* @file lmf.sql_in
	*
	* @brief SQL functions for low-rank matrix factorization
	* @date June 2012
	*
	* @sa For a brief introduction to Low-rank Matrix Factorization, see the module
	* description \ref grp_lmf.
	*
	// ----------------------------------------------------------------------- */

	m4_include(`SQLCommon.m4')

	/**
	@addtogroup grp_lmf


	@about

	This module implements "factor model" for representing an incomplete matrix using a low-rank approximation [1].
	Mathematically, this model seeks to find matrices U and V (also referred as factors) that, for any given incomplete matrix A, minimizes:
	\f[ \\|\boldsymbol A - \boldsymbol UV^{T} \\|_2 \f]
	subject to \f$rank(\boldsymbol UV^{T}) \leq r\f$, where \f$\\|\cdot\\|_2\f$ denotes the Frobenius norm.
	Let \f$A\f$ be a \f$m \times n\f$ matrix, then \f$U\f$ will be \f$m \times r\f$ and \f$V\f$ will be \f$n \times r\f$, in dimension, and \f$1 \leq r \ll \min(m, n)\f$.
	This model is not intended to do the full decomposition, or to be used as part of inverse procedure.
	This model has been widely used in recommendation systems (e.g., Netflix [2]) and feature selection (e.g., image processing [3]).


	@input

	The <b>input matrix</b> is expected to be of the following form:
	<pre>{TABLE\|VIEW} <em>input_table</em> (
	<em>row</em> INTEGER,
	<em>col</em> INTEGER,
	<em>value</em> DOUBLE PRECISION
	)</pre>

	Input is contained in a table that describes an incomplete matrix, by having available entries specified as (row, column, value).
	The input matrix is expected to be based 1, which means row >= 1, and col >= 1.
	NULL values are not expected.


	@usage

	Please find descriptions of SQL functions in lmf.sql_in

	Output factors matrix U and V are in flatten format.
	<pre>RESULT AS (
	matrix_u DOUBLE PRECISION[],
	matrix_v DOUBLE PRECISION[],
	rmse DOUBLE PRECISION
	);</pre>

	Features correspond to row i is
	<code>matrix_u[i:i][1:r]</code>.
	Features correspond to column j is
	<code>matrix_v[j:j][1:r]</code>.


	@examp

	-# Prepare an input table/view:
	\code
	CREATE TABLE lmf_data (
	column INT,
	row INT,
	value FLOAT8
	);
	\endcode
	-# Populate the input table with some data. e.g.:
	\code
	INSERT INTO lmf_data VALUES (1, 1, 5.0);
	INSERT INTO lmf_data VALUES (3, 100, 1.0);
	INSERT INTO lmf_data VALUES (999, 10000, 2.0);
	\endcode
	-# Call lmf_igd_run() stored procedure, e.g.:
	\code
	SELECT madlib.lmf_igd_run(
	'lmf_model', -- result table
	'lmf_data', -- input table
	'row', 'col', 'value', -- table column names
	999, -- row dimension
	10000, -- column dimension
	3, -- rank (number of features)
	0.1, -- stepsize
	2, -- initial value scale factor
	10, -- maximal number of iterations
	1e-9); -- error tolerance
	\endcode
	Example output (the exact result may not be the same):
	\code
	NOTICE:
	Finished low-rank matrix factorization using incremental gradient
	DETAIL:
	* table : lmf_data (row, col, value)
	Results:
	* RMSE = 4.31144557397543e-05
	Output:
	* view : SELECT * FROM lmf_model WHERE id = 1
	lmf_igd_run
	-------------
	1
	(1 row)
	\endcode
	-# Sanity check of the result. You may need a model id returned and also indicated by the function lmf_igd_run(), assuming 1 here, e.g.:
	\code
	SELECT array_dims(matrix_u), array_dims(matrix_v) FROM lmf_model WHERE id = 1;
	\endcode
	Example output:
	\code
	array_dims \| array_dims
	--------------+----------------
	[1:999][1:3] \| [1:10000][1:3]
	(1 row)
	\endcode
	-# Query the result value, e.g.:
	\code
	SELECT matrix_u[2:2][1:3] AS row_2_features FROM lmf_model WHERE id = 1;
	\endcode
	Example output (the exact result may not be the same):
	\code
	row_2_features
	----------------------------------------------------------
	{{0.51117920037359,0.169582297094166,0.837417622096837}}
	(1 row)
	\endcode


	@literature

	[1] N. Srebro and T. Jaakkola. “Weighted Low-Rank Approximations.” In: ICML. Ed. by T. Fawcett and N. Mishra. AAAI Press, 2003, pp. 720–727. isbn: 1-57735-189-4.

	[2] Simon Funk, Netflix Update: Try This at Home, December 11 2006, http://sifter.org/~simon/journal/20061211.html

	[3] J. Wright, A. Ganesh, S. Rao, Y. Peng, and Y. Ma. “Robust Principal Component Analysis: Exact Recovery of Corrupted Low-Rank Matrices via Convex Optimization.” In: NIPS. Ed. by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta. Curran Associates, Inc., 2009, pp. 2080–2088. isbn: 9781615679119.

	*/

	CREATE TYPE MADLIB_SCHEMA.lmf_result AS (
	matrix_u DOUBLE PRECISION[],
	matrix_v DOUBLE PRECISION[],
	rmse DOUBLE PRECISION
	);

	--------------------------------------------------------------------------
	-- create SQL functions for IGD optimizer
	--------------------------------------------------------------------------
	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_transition(
	state DOUBLE PRECISION[],
	row_num SMALLINT,
	column_num SMALLINT,
	val DOUBLE PRECISION,
	previous_state DOUBLE PRECISION[],
	row_dim SMALLINT,
	column_dim SMALLINT,
	max_rank SMALLINT,
	stepsize DOUBLE PRECISION,
	scale_factor DOUBLE PRECISION)
	RETURNS DOUBLE PRECISION[]
	AS 'MODULE_PATHNAME'
	LANGUAGE C IMMUTABLE;

	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_merge(
	state1 DOUBLE PRECISION[],
	state2 DOUBLE PRECISION[])
	RETURNS DOUBLE PRECISION[]
	AS 'MODULE_PATHNAME'
	LANGUAGE C IMMUTABLE STRICT;

	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_final(
	state DOUBLE PRECISION[])
	RETURNS DOUBLE PRECISION[]
	AS 'MODULE_PATHNAME'
	LANGUAGE C IMMUTABLE STRICT;

	/**
	* @internal
	* @brief Perform one iteration of the incremental gradient
	* method for computing low-rank matrix factorization
	*/
	CREATE AGGREGATE MADLIB_SCHEMA.lmf_igd_step(
	/+ row_num / SMALLINT,
	/+ column_num / SMALLINT,
	/+ val / DOUBLE PRECISION,
	/+ previous_state / DOUBLE PRECISION[],
	/+ row_dim / SMALLINT,
	/+ column_dim / SMALLINT,
	/+ max_rank / SMALLINT,
	/+ stepsize / DOUBLE PRECISION,
	/+ scale_factor / DOUBLE PRECISION) (
	STYPE=DOUBLE PRECISION[],
	SFUNC=MADLIB_SCHEMA.lmf_igd_transition,
	m4_ifdef(`__GREENPLUM__',`prefunc=MADLIB_SCHEMA.lmf_igd_merge,')
	FINALFUNC=MADLIB_SCHEMA.lmf_igd_final,
	INITCOND='{0,0,0,0,0,0,0,0,0}'
	);

	CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_distance(
	/+ state1 / DOUBLE PRECISION[],
	/+ state2 / DOUBLE PRECISION[])
	RETURNS DOUBLE PRECISION AS
	'MODULE_PATHNAME'
	LANGUAGE c IMMUTABLE STRICT;

	CREATE FUNCTION MADLIB_SCHEMA.internal_lmf_igd_result(
	/+ state / DOUBLE PRECISION[])
	RETURNS MADLIB_SCHEMA.lmf_result AS
	'MODULE_PATHNAME'
	LANGUAGE c IMMUTABLE STRICT;


	CREATE FUNCTION MADLIB_SCHEMA.internal_execute_using_lmf_igd_args(
	sql VARCHAR, INTEGER, INTEGER, INTEGER, DOUBLE PRECISION,
	DOUBLE PRECISION, INTEGER, DOUBLE PRECISION
	) RETURNS VOID
	IMMUTABLE
	CALLED ON NULL INPUT
	LANGUAGE c
	AS 'MODULE_PATHNAME', 'exec_sql_using';

	CREATE FUNCTION MADLIB_SCHEMA.internal_compute_lmf_igd(
	rel_args VARCHAR,
	rel_state VARCHAR,
	rel_source VARCHAR,
	col_row VARCHAR,
	col_column VARCHAR,
	col_value VARCHAR)
	RETURNS INTEGER
	AS $$PythonFunction(convex, lmf_igd, compute_lmf_igd)$$
	LANGUAGE plpythonu VOLATILE;

	/**
	* @brief Low-rank matrix factorization of a incomplete matrix into two factors
	*
	* This function takes as input the table representation of a incomplete matrix
	* in the sparse (i, j, value) format and decomposes it into the specified set
	* of most significant features of matrices of U and V matrix. The input matrix
	* is expected to have dimension [1:row_dim][1:column_dim], but in sparse
	* format.
	*
	* @param rel_output Name of the table that the factors will be appended to
	* @param rel_source Name of the table/view with the source data
	* @param col_row Name of the column containing cell row number
	* @param col_column Name of the column containing cell column number
	* @param col_value Name of the column containing cell value
	* @param row_dim Maximum number of rows of input
	* @param column_dim Maximum number of columns of input
	* @param max_rank Rank of desired approximation
	* @param stepsize Hyper-parameter that decides how aggressive that the gradient steps are
	* @param scale_factor Hyper-parameter that decides scale of initial factors
	* @param num_iterations Maximum number if iterations to perform regardless of convergence
	* @param tolerance Acceptable level of error in convergence.
	*
	*/
	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
	rel_output VARCHAR,
	rel_source REGCLASS,
	col_row VARCHAR,
	col_column VARCHAR,
	col_value VARCHAR,
	row_dim INTEGER /+ DEFAULT 'SELECT max(col_row) FROM rel_source' /,
	column_dim INTEGER /+ DEFAULT 'SELECT max(col_col) FROM rel_source' /,
	max_rank INTEGER /+ DEFAULT 20 /,
	stepsize DOUBLE PRECISION /+ DEFAULT 0.01 /,
	scale_factor DOUBLE PRECISION /+ DEFAULT 0.1 /,
	num_iterations INTEGER /+ DEFAULT 10 /,
	tolerance DOUBLE PRECISION /+ DEFAULT 0.0001 /)
	RETURNS INTEGER AS $$
	DECLARE
	iteration_run INTEGER;
	model_id INTEGER;
	rmse DOUBLE PRECISION;
	old_messages VARCHAR;
	BEGIN
	RAISE NOTICE 'Matrix % to be factorized: % x %', rel_source, row_dim, column_dim;

	-- We first setup the argument table. Rationale: We want to avoid all data
	-- conversion between native types and Python code. Instead, we use Python
	-- as a pure driver layer.
	old_messages :=
	(SELECT setting FROM pg_settings WHERE name = 'client_min_messages');
	EXECUTE 'SET client_min_messages TO warning';
	PERFORM MADLIB_SCHEMA.create_schema_pg_temp();
	-- Unfortunately, the EXECUTE USING syntax is only available starting
	-- PostgreSQL 8.4:
	-- http://www.postgresql.org/docs/8.4/static/plpgsql-statements.html#PLPGSQL-STATEMENTS-EXECUTING-DYN
	-- We therefore have to emulate.
	PERFORM MADLIB_SCHEMA.internal_execute_using_lmf_igd_args($sql$
	DROP TABLE IF EXISTS pg_temp._madlib_lmf_igd_args;
	CREATE TABLE pg_temp._madlib_lmf_igd_args AS
	SELECT
	$1 AS row_dim,
	$2 AS column_dim,
	$3 AS max_rank,
	$4 AS stepsize,
	$5 AS scale_factor,
	$6 AS num_iterations,
	$7 AS tolerance;
	$sql$,
	row_dim, column_dim, max_rank, stepsize,
	scale_factor, num_iterations, tolerance);
	EXECUTE 'SET client_min_messages TO ' \|\| old_messages;

	-- Perform acutal computation.
	-- Unfortunately, Greenplum and PostgreSQL <= 8.2 do not have conversion
	-- operators from regclass to varchar/text.
	iteration_run := MADLIB_SCHEMA.internal_compute_lmf_igd(
	'_madlib_lmf_igd_args', '_madlib_lmf_igd_state',
	textin(regclassout(rel_source)), col_row, col_column, col_value);

	-- create result table if it does not exist
	BEGIN
	EXECUTE 'SELECT 1 FROM ' \|\| rel_output \|\| ' LIMIT 0';
	EXCEPTION
	WHEN undefined_table THEN
	EXECUTE '
	CREATE TABLE ' \|\| rel_output \|\| ' (
	id SERIAL,
	matrix_u DOUBLE PRECISION[],
	matrix_v DOUBLE PRECISION[],
	rmse DOUBLE PRECISION)';
	END;

	-- A work-around for GPDB not supporting RETURNING for INSERT
	-- We generate an id using nextval before INSERT
	EXECUTE '
	SELECT nextval(' \|\| quote_literal(rel_output \|\| '_id_seq') \|\|'::regclass)'
	INTO model_id;

	-- output model
	-- Retrieve result from state table and insert it
	EXECUTE '
	INSERT INTO ' \|\| rel_output \|\| '
	SELECT ' \|\| model_id \|\| ', (result).*
	FROM (
	SELECT MADLIB_SCHEMA.internal_lmf_igd_result(_state) AS result
	FROM _madlib_lmf_igd_state
	WHERE _iteration = ' \|\| iteration_run \|\| '
	) subq';

	EXECUTE '
	SELECT rmse
	FROM ' \|\| rel_output \|\| '
	WHERE id = ' \|\| model_id
	INTO rmse;

	-- return description
	RAISE NOTICE '
	Finished low-rank matrix factorization using incremental gradient
	* table : % (%, %, %)
	Results:
	* RMSE = %
	Output:
	* view : SELECT * FROM % WHERE id = %',
	rel_source, col_row, col_column, col_value, rmse, rel_output, model_id;

	RETURN model_id;
	END;
	$$ LANGUAGE plpgsql VOLATILE;

	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
	rel_output VARCHAR,
	rel_source REGCLASS,
	col_row VARCHAR,
	col_column VARCHAR,
	col_value VARCHAR,
	row_dim INTEGER,
	column_dim INTEGER,
	max_rank INTEGER,
	stepsize DOUBLE PRECISION,
	scale_factor DOUBLE PRECISION)
	RETURNS INTEGER AS $$
	SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 10, 0.0001);
	$$ LANGUAGE sql VOLATILE;

	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
	rel_output VARCHAR,
	rel_source REGCLASS,
	col_row VARCHAR,
	col_column VARCHAR,
	col_value VARCHAR,
	row_dim INTEGER,
	column_dim INTEGER,
	max_rank INTEGER,
	stepsize DOUBLE PRECISION)
	RETURNS INTEGER AS $$
	-- set scale_factor as default 0.1
	SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, $9, 0.1);
	$$ LANGUAGE sql VOLATILE;

	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
	rel_output VARCHAR,
	rel_source REGCLASS,
	col_row VARCHAR,
	col_column VARCHAR,
	col_value VARCHAR,
	row_dim INTEGER,
	column_dim INTEGER,
	max_rank INTEGER)
	RETURNS INTEGER AS $$
	-- set stepsize as default 0.01
	SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, $6, $7, $8, 0.01);
	$$ LANGUAGE sql VOLATILE;

	CREATE FUNCTION MADLIB_SCHEMA.lmf_igd_run(
	rel_output VARCHAR,
	rel_source REGCLASS,
	col_row VARCHAR,
	col_column VARCHAR,
	col_value TEXT)
	RETURNS INTEGER AS $$
	DECLARE
	row_dim INTEGER;
	column_dim INTEGER;
	BEGIN
	EXECUTE '
	SELECT max(' \|\| col_row \|\| '), max(' \|\| col_column \|\| ')
	FROM ' \|\| textin(regclassout(rel_source))
	INTO row_dim, column_dim;

	RETURN (SELECT MADLIB_SCHEMA.lmf_igd_run($1, $2, $3, $4, $5, row_dim, column_dim, 20));
	END;
	$$ LANGUAGE plpgsql VOLATILE;