methods/regress/src/pg_gp/logRegress.py_in - madlib - Git at Google

 # coding=utf-8

 # This file is preprocessed with m4. Since we only want macro expansion at
 # very few places, most of the file is quoted. Macros can be used by enclosing
 # them in <m4> and </m4>.
 changequote(`</m4>', `<m4>') </m4>

 """
 @file logRegress.py_in

 @brief Logistic Regression: Driver functions

 @namespace logRegress

 Logistic Regression: Driver functions
 """

 import plpy

 def __runIterativeAlg(stateType, initialState, source, updateExpr,
     terminateExpr, cyclesPerIteration, maxNumIterations,
     returnExpr = "st.state"):
     """
     Driver for an iterative algorithm

     A general driver function for most iterative algorithms: The state between
     iterations is kept in a variable of type <tt>stateType</tt>, which is
     initialized with <tt><em>initialState</em></tt>. During each cycle, the
     SQL statement <tt>updateSQL</tt> is executed in the database. Afterwards,
     the SQL query <tt>terminateSQL</tt> decides whether the algorithm
     terminates. At the end of the algorithm, the last state (or an arbitrary
     transformation as given by <tt>returnExpr</tt>) will be returned.

     All states are stored in the temporary table
     <tt>_madlib_iterative_alg</tt>, one row per cycle. The table is
     (re)created at the start and dropped before returning.

     @param stateType SQL type of the state between iterations
     @param initialState The initial value of the SQL state variable
     @param source The source relation
     @param updateExpr SQL expression that returns the new state of type
         <tt>stateType</tt>. The expression may use the replacement fields
         <tt>"{state}"</tt>, <tt>"{iteration}"</tt>, and
         <tt>"{sourceAlias}"</tt>. Source alias is an alias for the source
         relation <tt><em>source</em></tt>.
     @param terminateExpr SQL expression that returns whether the algorithm should
         terminate. The expression may use the replacement fields
         <tt>"{oldState}"</tt>, <tt>"{newState}"</tt>, and
         <tt>"{iteration}"</tt>. It must return a BOOLEAN value. It is only
         evaluated once more than <tt>cyclesPerIteration</tt> cycles have
         been executed, i.e., when there is an older state to compare with.
     @param cyclesPerIteration Number of aggregate function calls per iteration.
     @param maxNumIterations Maximum number of iterations. The algorithm stops
         after <tt>cyclesPerIteration * maxNumIterations</tt> cycles even when
         <tt>terminateExpr</tt> does not evaluate to true. At least one cycle
         is always executed.
     @param returnExpr An optional transformation of the final state. If omitted,
         the final state will be returned. <tt>returnExpr</tt> can be, e.g., a
         function call or access to a tuple element. Use "{state}" to refer
         to the state variable. E.g., if <tt>stateType</tt> is a tuple containing
         a field <tt>coefficients</tt>, and this should be returned as the result
         of the algorithm, pass <tt>"{state}.coefficients"</tt> here. The
         replacement field <tt>"{iteration}"</tt> can also be used.

     @return Value of <tt>returnExpr</tt> for the state of the last cycle, as
         handed back by plpy.execute()
     """

     # Replacement fields that do not change while the algorithm runs. The
     # current "{iteration}" is added for every statement in expand().
     fields = dict(
         state = "(st.state)",
         sourceAlias = "src",
         oldState = "(older.state)",
         newState = "(newer.state)",
         stateType = stateType,
         initialState = initialState,
         source = source,
         cyclesPerIteration = cyclesPerIteration,
         maxNumIterations = maxNumIterations)

     def expand(sqlTemplate, exprName, exprTemplate, iteration):
         # str.format() is not recursive. Hence the caller-supplied expression
         # is formatted first (now that the iteration number is known) and
         # only then plugged into the SQL statement.
         values = dict(fields, iteration = iteration)
         values[exprName] = exprTemplate.format(**values)
         return sqlTemplate.format(**values)

     updateSQL = """
         INSERT INTO _madlib_iterative_alg
         SELECT
             {iteration},
             {updateExpr}
         FROM
             _madlib_iterative_alg AS st,
             {source} AS src
         WHERE
             st.iteration = {iteration} - 1
         """
     terminateSQL = """
         SELECT
             {terminateExpr} AS should_terminate
         FROM
         (
             SELECT state
             FROM _madlib_iterative_alg
             WHERE iteration = {iteration} - {cyclesPerIteration}
         ) AS older,
         (
             SELECT state
             FROM _madlib_iterative_alg
             WHERE iteration = {iteration}
         ) AS newer
         """
     returnSQL = """
         SELECT
             {returnExpr} AS return_value
         FROM
             _madlib_iterative_alg AS st
         WHERE
             st.iteration = {iteration}
         """

     # Silence the notices of DROP/CREATE TABLE, then restore the old level
     oldMsgLevel = plpy.execute("SHOW client_min_messages")[0]['client_min_messages']
     plpy.execute("""
         SET client_min_messages = error;
         DROP TABLE IF EXISTS _madlib_iterative_alg;
         CREATE TEMPORARY TABLE _madlib_iterative_alg (
             iteration INTEGER PRIMARY KEY,
             state {stateType}
         );
         SET client_min_messages = {oldMsgLevel};
         """.format(stateType = stateType, oldMsgLevel = oldMsgLevel))

     maxCycles = cyclesPerIteration * maxNumIterations
     iteration = 0
     plpy.execute("""
         INSERT INTO _madlib_iterative_alg VALUES ({iteration}, {initialState})
         """.format(iteration = iteration, initialState = initialState))
     while True:
         iteration = iteration + 1
         plpy.execute(expand(updateSQL, 'updateExpr', updateExpr, iteration))

         # The cap on the number of cycles applies unconditionally. (It used
         # to be tested only after more than cyclesPerIteration cycles, so
         # that maxNumIterations <= 1 ran one cycle too many.)
         if iteration >= maxCycles:
             break

         # Convergence can only be tested once an older state exists
         if iteration > cyclesPerIteration and plpy.execute(
                 expand(terminateSQL, 'terminateExpr', terminateExpr, iteration)
                 )[0]['should_terminate'] == True:
             break

     # FIXME: Returning the result set from Python code means that values
     # pass through Python (and there is a potential loss of precision by
     # conversion)

     returnValue = plpy.execute(
         expand(returnSQL, 'returnExpr', returnExpr, iteration))[0]['return_value']
     plpy.execute("DROP TABLE _madlib_iterative_alg")
     return returnValue


 def __cg_logregr_coef(**kwargs):
     """
     Conjugate-gradient driver for logistic regression

     Takes the same keyword arguments as compute_logregr_coef(); the
     <tt>optimizer</tt> argument is not looked at here. The function builds
     the SQL expressions needed by the conjugate-gradient method and hands
     them to __runIterativeAlg().

     @return Result of __runIterativeAlg() for the return expression
         <tt>"{state}.coef"</tt>
     """

     relation = kwargs['source']

     # Only the column names (and the precision) are filled in at this point.
     # The doubled braces leave "{state}", "{sourceAlias}", "{oldState}", and
     # "{newState}" behind as replacement fields for __runIterativeAlg().
     stepExpr = """
         <m4>MADLIB_SCHEMA</m4>.logregr_cg_step(
             {{sourceAlias}}.{depColumn},
             {{sourceAlias}}.{indepColumn},
             {{state}}
         )
         """.format(**kwargs)

     # A precision of exactly zero switches the convergence test off
     convergedExpr = "FALSE" if kwargs['precision'] == 0. else """
             {{newState}}.loglikelihood - {{oldState}}.loglikelihood < {precision}
             """.format(**kwargs)

     # One iteration consists of two calls of the aggregate function
     # See float8_cg_update_final().
     return __runIterativeAlg(
         stateType = "<m4>MADLIB_SCHEMA</m4>.logregr_cg_state",
         initialState = "NULL",
         source = relation,
         updateExpr = stepExpr,
         terminateExpr = convergedExpr,
         cyclesPerIteration = 2,
         maxNumIterations = kwargs['numIterations'],
         returnExpr = "{state}.coef")


 def __irls__logregr_coef(**kwargs):
     """
     Iteratively-reweighted-least-squares driver for logistic regression

     Takes the same keyword arguments as compute_logregr_coef(); the
     <tt>optimizer</tt> argument is not looked at here. The function builds
     the SQL expressions needed by the iteratively-reweighted-least-squares
     method and hands them to __runIterativeAlg().

     @return Result of __runIterativeAlg() for the return expression
         <tt>"{state}.coef"</tt>
     """

     relation = kwargs['source']

     # The doubled braces survive this format() call as single-brace
     # replacement fields, which __runIterativeAlg() fills in later.
     stepExpr = """
         <m4>MADLIB_SCHEMA</m4>.logregr_irls_step(
             {{sourceAlias}}.{depColumn},
             {{sourceAlias}}.{indepColumn},
             {{state}}
         )
         """.format(**kwargs)

     # A precision of exactly zero switches the convergence test off
     convergedExpr = "FALSE" if kwargs['precision'] == 0. else """
             {{newState}}.loglikelihood - {{oldState}}.loglikelihood < {precision}
             """.format(**kwargs)

     # Every iteration is a single call of the aggregate function
     return __runIterativeAlg(
         stateType = "<m4>MADLIB_SCHEMA</m4>.logregr_irls_state",
         initialState = "NULL",
         source = relation,
         updateExpr = stepExpr,
         terminateExpr = convergedExpr,
         cyclesPerIteration = 1,
         maxNumIterations = kwargs['numIterations'],
         returnExpr = "{state}.coef")


 def compute_logregr_coef(**kwargs):
     """
     Compute logistic regression coefficients

     Entry point that fills in defaults for the optional arguments and then
     dispatches to the driver of the requested optimization algorithm.
     Iteratively reweighted least squares is the default; for data with a lot
     of columns the conjugate-gradient method might perform better.

     @param source Name of relation containing the training data
     @param depColumn Name of dependent column in training data (of type BOOLEAN)
     @param indepColumn Name of independent column in training data (of type
            DOUBLE PRECISION[])

     Optionally also provide the following:
     @param optimizer Name of the optimizer. 'newton' or 'irls': Iteratively
         reweighted least squares, 'cg': conjugate gradient (default = 'irls')
     @param numIterations Maximum number of iterations (default = 20)
     @param precision Terminate if two consecutive iterations have a difference
            in the log-likelihood of less than <tt>precision</tt>. If this
            parameter is 0.0, then the algorithm will not check for
            convergence and only terminate after <tt>numIterations</tt>
            iterations. (default = 0.0001)

     @return The value returned by the selected driver function. For an
         unknown optimizer, plpy.error() is called; None is returned only if
         that call returns.
     """
     # Defaults for everything the caller left out
     kwargs.setdefault('optimizer', 'irls')
     kwargs.setdefault('numIterations', 20)
     kwargs.setdefault('precision', 0.0001)

     optimizer = kwargs['optimizer']
     if optimizer == 'cg':
         return __cg_logregr_coef(**kwargs)
     if optimizer in ('irls', 'newton'):
         return __irls__logregr_coef(**kwargs)

     plpy.error("Unknown optimizer requested. Must be 'newton'/'irls' or 'cg'")
     return None

 # The m4 preprocessor complains if EOF is reached in quoted mode.
 <m4>
	# coding=utf-8

	# This file is preprocessed with m4. Since we only want macro expansion at
	# very few places, most of the file is quoted. Macros can be used by enclosing
	# them in <m4> and </m4>.
	changequote(`</m4>', `<m4>') </m4>

	"""
	@file logRegress.py_in

	@brief Logistic Regression: Driver functions

	@namespace logRegress

	Logistic Regression: Driver functions
	"""

	import plpy

	def __runIterativeAlg(stateType, initialState, source, updateExpr,
	        terminateExpr, cyclesPerIteration, maxNumIterations,
	        returnExpr = "st.state"):
	    """
	    Driver for an iterative algorithm

	    A general driver function for most iterative algorithms: The state between
	    iterations is kept in a variable of type <tt>stateType</tt>, which is
	    initialized with <tt><em>initialState</em></tt>. During each cycle, the
	    SQL statement <tt>updateSQL</tt> is executed in the database. Afterwards,
	    the SQL query <tt>terminateSQL</tt> decides whether the algorithm
	    terminates. At the end of the algorithm, the last state (or an arbitrary
	    transformation as given by <tt>returnExpr</tt>) will be returned.

	    All states are stored in the temporary table
	    <tt>_madlib_iterative_alg</tt>, one row per cycle. The table is
	    (re)created at the start and dropped before returning.

	    @param stateType SQL type of the state between iterations
	    @param initialState The initial value of the SQL state variable
	    @param source The source relation
	    @param updateExpr SQL expression that returns the new state of type
	        <tt>stateType</tt>. The expression may use the replacement fields
	        <tt>"{state}"</tt>, <tt>"{iteration}"</tt>, and
	        <tt>"{sourceAlias}"</tt>. Source alias is an alias for the source
	        relation <tt><em>source</em></tt>.
	    @param terminateExpr SQL expression that returns whether the algorithm should
	        terminate. The expression may use the replacement fields
	        <tt>"{oldState}"</tt>, <tt>"{newState}"</tt>, and
	        <tt>"{iteration}"</tt>. It must return a BOOLEAN value. It is only
	        evaluated once more than <tt>cyclesPerIteration</tt> cycles have
	        been executed, i.e., when there is an older state to compare with.
	    @param cyclesPerIteration Number of aggregate function calls per iteration.
	    @param maxNumIterations Maximum number of iterations. The algorithm stops
	        after <tt>cyclesPerIteration * maxNumIterations</tt> cycles even when
	        <tt>terminateExpr</tt> does not evaluate to true. At least one cycle
	        is always executed.
	    @param returnExpr An optional transformation of the final state. If omitted,
	        the final state will be returned. <tt>returnExpr</tt> can be, e.g., a
	        function call or access to a tuple element. Use "{state}" to refer
	        to the state variable. E.g., if <tt>stateType</tt> is a tuple containing
	        a field <tt>coefficients</tt>, and this should be returned as the result
	        of the algorithm, pass <tt>"{state}.coefficients"</tt> here. The
	        replacement field <tt>"{iteration}"</tt> can also be used.

	    @return Value of <tt>returnExpr</tt> for the state of the last cycle, as
	        handed back by plpy.execute()
	    """

	    # Replacement fields that do not change while the algorithm runs. The
	    # current "{iteration}" is added for every statement in expand().
	    fields = dict(
	        state = "(st.state)",
	        sourceAlias = "src",
	        oldState = "(older.state)",
	        newState = "(newer.state)",
	        stateType = stateType,
	        initialState = initialState,
	        source = source,
	        cyclesPerIteration = cyclesPerIteration,
	        maxNumIterations = maxNumIterations)

	    def expand(sqlTemplate, exprName, exprTemplate, iteration):
	        # str.format() is not recursive. Hence the caller-supplied expression
	        # is formatted first (now that the iteration number is known) and
	        # only then plugged into the SQL statement.
	        values = dict(fields, iteration = iteration)
	        values[exprName] = exprTemplate.format(**values)
	        return sqlTemplate.format(**values)

	    updateSQL = """
	INSERT INTO _madlib_iterative_alg
	SELECT
	{iteration},
	{updateExpr}
	FROM
	_madlib_iterative_alg AS st,
	{source} AS src
	WHERE
	st.iteration = {iteration} - 1
	"""
	    terminateSQL = """
	SELECT
	{terminateExpr} AS should_terminate
	FROM
	(
	SELECT state
	FROM _madlib_iterative_alg
	WHERE iteration = {iteration} - {cyclesPerIteration}
	) AS older,
	(
	SELECT state
	FROM _madlib_iterative_alg
	WHERE iteration = {iteration}
	) AS newer
	"""
	    returnSQL = """
	SELECT
	{returnExpr} AS return_value
	FROM
	_madlib_iterative_alg AS st
	WHERE
	st.iteration = {iteration}
	"""

	    # Silence the notices of DROP/CREATE TABLE, then restore the old level
	    oldMsgLevel = plpy.execute("SHOW client_min_messages")[0]['client_min_messages']
	    plpy.execute("""
	SET client_min_messages = error;
	DROP TABLE IF EXISTS _madlib_iterative_alg;
	CREATE TEMPORARY TABLE _madlib_iterative_alg (
	iteration INTEGER PRIMARY KEY,
	state {stateType}
	);
	SET client_min_messages = {oldMsgLevel};
	""".format(stateType = stateType, oldMsgLevel = oldMsgLevel))

	    maxCycles = cyclesPerIteration * maxNumIterations
	    iteration = 0
	    plpy.execute("""
	INSERT INTO _madlib_iterative_alg VALUES ({iteration}, {initialState})
	""".format(iteration = iteration, initialState = initialState))
	    while True:
	        iteration = iteration + 1
	        plpy.execute(expand(updateSQL, 'updateExpr', updateExpr, iteration))

	        # The cap on the number of cycles applies unconditionally. (It used
	        # to be tested only after more than cyclesPerIteration cycles, so
	        # that maxNumIterations <= 1 ran one cycle too many.)
	        if iteration >= maxCycles:
	            break

	        # Convergence can only be tested once an older state exists
	        if iteration > cyclesPerIteration and plpy.execute(
	                expand(terminateSQL, 'terminateExpr', terminateExpr, iteration)
	                )[0]['should_terminate'] == True:
	            break

	    # FIXME: Returning the result set from Python code means that values
	    # pass through Python (and there is a potential loss of precision by
	    # conversion)

	    returnValue = plpy.execute(
	        expand(returnSQL, 'returnExpr', returnExpr, iteration))[0]['return_value']
	    plpy.execute("DROP TABLE _madlib_iterative_alg")
	    return returnValue


	def __cg_logregr_coef(**kwargs):
	    """
	    Conjugate-gradient driver for logistic regression

	    Takes the same keyword arguments as compute_logregr_coef(); the
	    <tt>optimizer</tt> argument is not looked at here. The function builds
	    the SQL expressions needed by the conjugate-gradient method and hands
	    them to __runIterativeAlg().

	    @return Result of __runIterativeAlg() for the return expression
	        <tt>"{state}.coef"</tt>
	    """

	    relation = kwargs['source']

	    # Only the column names (and the precision) are filled in at this point.
	    # The doubled braces leave "{state}", "{sourceAlias}", "{oldState}", and
	    # "{newState}" behind as replacement fields for __runIterativeAlg().
	    stepExpr = """
	<m4>MADLIB_SCHEMA</m4>.logregr_cg_step(
	{{sourceAlias}}.{depColumn},
	{{sourceAlias}}.{indepColumn},
	{{state}}
	)
	""".format(**kwargs)

	    # A precision of exactly zero switches the convergence test off
	    convergedExpr = "FALSE" if kwargs['precision'] == 0. else """
	{{newState}}.loglikelihood - {{oldState}}.loglikelihood < {precision}
	""".format(**kwargs)

	    # One iteration consists of two calls of the aggregate function
	    # See float8_cg_update_final().
	    return __runIterativeAlg(
	        stateType = "<m4>MADLIB_SCHEMA</m4>.logregr_cg_state",
	        initialState = "NULL",
	        source = relation,
	        updateExpr = stepExpr,
	        terminateExpr = convergedExpr,
	        cyclesPerIteration = 2,
	        maxNumIterations = kwargs['numIterations'],
	        returnExpr = "{state}.coef")


	def __irls__logregr_coef(**kwargs):
	    """
	    Iteratively-reweighted-least-squares driver for logistic regression

	    Takes the same keyword arguments as compute_logregr_coef(); the
	    <tt>optimizer</tt> argument is not looked at here. The function builds
	    the SQL expressions needed by the iteratively-reweighted-least-squares
	    method and hands them to __runIterativeAlg().

	    @return Result of __runIterativeAlg() for the return expression
	        <tt>"{state}.coef"</tt>
	    """

	    relation = kwargs['source']

	    # The doubled braces survive this format() call as single-brace
	    # replacement fields, which __runIterativeAlg() fills in later.
	    stepExpr = """
	<m4>MADLIB_SCHEMA</m4>.logregr_irls_step(
	{{sourceAlias}}.{depColumn},
	{{sourceAlias}}.{indepColumn},
	{{state}}
	)
	""".format(**kwargs)

	    # A precision of exactly zero switches the convergence test off
	    convergedExpr = "FALSE" if kwargs['precision'] == 0. else """
	{{newState}}.loglikelihood - {{oldState}}.loglikelihood < {precision}
	""".format(**kwargs)

	    # Every iteration is a single call of the aggregate function
	    return __runIterativeAlg(
	        stateType = "<m4>MADLIB_SCHEMA</m4>.logregr_irls_state",
	        initialState = "NULL",
	        source = relation,
	        updateExpr = stepExpr,
	        terminateExpr = convergedExpr,
	        cyclesPerIteration = 1,
	        maxNumIterations = kwargs['numIterations'],
	        returnExpr = "{state}.coef")


	def compute_logregr_coef(**kwargs):
	    """
	    Compute logistic regression coefficients

	    Entry point that fills in defaults for the optional arguments and then
	    dispatches to the driver of the requested optimization algorithm.
	    Iteratively reweighted least squares is the default; for data with a lot
	    of columns the conjugate-gradient method might perform better.

	    @param source Name of relation containing the training data
	    @param depColumn Name of dependent column in training data (of type BOOLEAN)
	    @param indepColumn Name of independent column in training data (of type
	        DOUBLE PRECISION[])

	    Optionally also provide the following:
	    @param optimizer Name of the optimizer. 'newton' or 'irls': Iteratively
	        reweighted least squares, 'cg': conjugate gradient (default = 'irls')
	    @param numIterations Maximum number of iterations (default = 20)
	    @param precision Terminate if two consecutive iterations have a difference
	        in the log-likelihood of less than <tt>precision</tt>. If this
	        parameter is 0.0, then the algorithm will not check for
	        convergence and only terminate after <tt>numIterations</tt>
	        iterations. (default = 0.0001)

	    @return The value returned by the selected driver function. For an
	        unknown optimizer, plpy.error() is called; None is returned only if
	        that call returns.
	    """
	    # Defaults for everything the caller left out
	    kwargs.setdefault('optimizer', 'irls')
	    kwargs.setdefault('numIterations', 20)
	    kwargs.setdefault('precision', 0.0001)

	    optimizer = kwargs['optimizer']
	    if optimizer == 'cg':
	        return __cg_logregr_coef(**kwargs)
	    if optimizer in ('irls', 'newton'):
	        return __irls__logregr_coef(**kwargs)

	    plpy.error("Unknown optimizer requested. Must be 'newton'/'irls' or 'cg'")
	    return None

	# The m4 preprocessor complains if EOF is reached in quoted mode.
	<m4>