examples/hello_world/iterative/simple_logistic.py_in - madlib - Git at Google

 # coding=utf-8
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.


 """
 @file simple_logistic.py_in

 @brief Logistic Regression: Driver functions

 @namespace simple_logistic

 @brief Logistic Regression: Driver functions
 """
 import plpy
 from utilities.validate_args import table_exists
 from utilities.validate_args import table_is_empty
 # ------------------------------------------------------------------------


 def logregr_simple_train(
         schema_madlib, source_table, out_table, dependent_varname,
         independent_varname, max_iter=None,
         tolerance=None, verbose=None, **kwargs):
     """
     Entry point for training the simple logistic regression model.

     This is a thin wrapper: every argument is forwarded unchanged to
     __logregr_train_compute, which performs the validation and the work.

     @param schema_madlib Name of the MADlib schema, properly escaped/quoted
     @param source_table Name of relation containing the training data
     @param out_table Name of relation where the model will be written
     @param dependent_varname Name of dependent column in training data
            (cast to BOOLEAN by the compute routine)
     @param independent_varname Name of independent column in training data
            (cast to DOUBLE PRECISION[] by the compute routine)
     @param max_iter The maximum number of iterations that are allowed
     @param tolerance The precision that the results should have
     @param verbose Forwarded to the compute routine as-is
     @param kwargs Additional keyword arguments, forwarded as-is. They let
            the caller unpack a dictionary whose key set is a superset of
            the arguments this function requires.

     @return The value returned by __logregr_train_compute
     """
     forwarded_args = (
         schema_madlib, source_table, out_table, dependent_varname,
         independent_varname, max_iter, tolerance, verbose)

     return __logregr_train_compute(*forwarded_args, **kwargs)

 # ========================================================================


 def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                             ind_col, max_iter, tolerance, verbose, **kwargs):
     """
     Validate the arguments, run the training iterations and write the model.

     The routine repeatedly executes {schema_madlib}.__logregr_simple_step
     over the source table, feeding the state returned by one pass into the
     next, and finally materializes the result of
     {schema_madlib}.__logregr_simple_finalizer into the output table.

     @param schema_madlib Name of the MADlib schema, interpolated into the SQL
     @param tbl_source Name of the relation holding the training data
     @param tbl_output Name of the relation to (re)create with the model;
            an existing relation of that name is dropped first
     @param dep_col Dependent column/expression, cast to boolean
     @param ind_col Independent column/expression, cast to double precision[]
     @param max_iter Number of passes over the data; must be positive
     @param tolerance Must be non-negative. It is only validated here: the
            loop below always runs exactly max_iter passes.
     @param verbose Accepted for interface compatibility; not used
     @param kwargs Accepted and ignored

     @return None. The model is written to tbl_output with the columns
             coef and log_likelihood.
     """
     # plpy.error() raises, so each failed check aborts the call right away.
     if tbl_source is None or tbl_source.strip().lower() in ('null', ''):
         plpy.error("Logregr error: Invalid data table name!")
     if not table_exists(tbl_source):
         plpy.error("Logregr error: Data table does not exist!")
     if table_is_empty(tbl_source):
         plpy.error("Logregr error: Data table is empty!")

     if tbl_output is None or tbl_output.strip().lower() in ('null', ''):
         plpy.error("Logregr error: Invalid output table name!")

     if not dep_col or dep_col.strip().lower() in ('null', ''):
         plpy.error("Logregr error: Invalid dependent column name!")

     # if not columns_exist_in_table(tbl_source, [dep_col]):
     #     plpy.error("Logregr error: Dependent column does not exist!")

     # Strip before comparing, like the other name checks, so that a
     # whitespace-only value is rejected as well.
     if not ind_col or ind_col.strip().lower() in ('null', ''):
         plpy.error("Logregr error: Invalid independent column name!")

     # A None value (SQL NULL) must be rejected explicitly: comparing None
     # with a number raises TypeError on Python 3 instead of failing the
     # check as it did on Python 2.
     if max_iter is None or max_iter <= 0:
         plpy.error("Logregr error: Maximum number of iterations must be positive!")

     if tolerance is None or tolerance < 0:
         plpy.error("Logregr error: The tolerance cannot be negative!")

     # NOTE(review): relation and column names cannot be bound as query
     # parameters, so they are interpolated into the SQL text. Callers must
     # pass trusted, properly quoted identifiers/expressions.
     update_plan = plpy.prepare(
         """
         SELECT
             {schema_madlib}.__logregr_simple_step(
                 ({dep_col})::boolean,
                 ({ind_col})::double precision[],
                 ($1))
         FROM {tbl_source}
         """.format(
             schema_madlib=schema_madlib,
             dep_col=dep_col,
             ind_col=ind_col,
             tbl_source=tbl_source), ["double precision[]"])

     # The first pass starts from a NULL state; every later pass receives the
     # state produced by the previous one.
     state = None
     for _ in range(max_iter):
         res_tuple = plpy.execute(update_plan, [state])
         # The query selects a single unnamed column. Materialize the values
         # first: dict views are not subscriptable on Python 3.
         state = list(res_tuple[0].values())[0]

     output_table = plpy.prepare(
             """
             drop table if exists {tbl_output};
             create table {tbl_output} as
                 select
                     (result).coef as coef,
                     (result).log_likelihood as log_likelihood
                 from
                 (
                     select * from
                     {schema_madlib}.__logregr_simple_finalizer($1)
                 ) result
             """.format(schema_madlib=schema_madlib,
                        tbl_output=tbl_output), ["double precision[]"])

     plpy.execute(output_table, [state])
     return None
 # --------------------------------------------------------------------


 def logregr_simple_help_msg(schema_madlib, message, **kwargs):
     """ Help message for simple logistic regression

     @param schema_madlib Name of the MADlib schema; substituted into the
            returned text wherever a schema-qualified function name appears
     @param message A string, the help message indicator: empty or None for
            the summary, 'usage'/'help'/'?' for the signature and output
            description, 'example'/'examples' for a worked example
     @param kwargs Extra keyword arguments are accepted and ignored

     Returns:
       A string, contains the help message
     """
     if not message:

         help_string = """
 ----------------------------------------------------------------
                         SUMMARY
 ----------------------------------------------------------------
 Binomial logistic regression models the relationship between a
 dichotomous dependent variable and one or more predictor variables.

 The dependent variable may be a Boolean value or a categorical variable
 that can be represented with a Boolean expression.

 For more details on function usage:
     SELECT {schema_madlib}.logregr_simple_train('usage')

 For a small example on using the function:
     SELECT {schema_madlib}.logregr_simple_train('example')
         """
     elif message in ['usage', 'help', '?']:

         help_string = """
 ------------------------------------------------------------------
                         USAGE
 ------------------------------------------------------------------
 SELECT {schema_madlib}.logregr_simple_train(
     source_table,         -- name of input table
     out_table,            -- name of output table
     dependent_varname,    -- name of dependent variable
     independent_varname,  -- names of independent variables
     max_iter,             -- optional, default 20, maximum iteration number
     tolerance,            -- optional, default 0.0001, the stopping threshold
     verbose               -- optional, default FALSE, whether to print useful info
 );

 ------------------------------------------------------------------
                         OUTPUT
 ------------------------------------------------------------------
 The output table ('out_table' above) has the following columns:
     'coef',             double precision[], -- vector of fitting coefficients
     'log_likelihood',   double precision    -- log likelihood
         """
     elif message in ['example', 'examples']:

         # The backslash of the COPY end-of-data marker is escaped so that
         # the literal does not contain an invalid escape sequence.
         help_string = """
 CREATE TABLE patients( id INTEGER NOT NULL,
                        second_attack BOOLEAN,
                        treatment INTEGER,
                        trait_anxiety INTEGER);
 COPY patients FROM STDIN WITH DELIMITER '|';
   1 |   True  |         1 |            70
   3 |   True  |         1 |            50
   5 |   True  |         0 |            40
   7 |   True  |         0 |            75
   9 |   True  |         0 |            70
  11 |   False |         1 |            65
  13 |   False |         1 |            45
  15 |   False |         1 |            40
  17 |   False |         0 |            55
  19 |   False |         0 |            50
   2 |   True  |         1 |            80
   4 |   True  |         0 |            60
   6 |   True  |         0 |            65
   8 |   True  |         0 |            80
  10 |   True  |         0 |            60
  12 |   False |         1 |            50
  14 |   False |         1 |            35
  16 |   False |         1 |            50
  18 |   False |         0 |            45
  20 |   False |         0 |            60
 \\.

 -- Drop output tables before calling the function
 DROP TABLE IF EXISTS patients_logregr;
 DROP TABLE IF EXISTS patients_logregr_summary;

 SELECT {schema_madlib}.logregr_simple_train( 'patients',
                              'patients_logregr',
                              'second_attack',
                              'ARRAY[1, treatment, trait_anxiety]');

 SELECT * from patients_logregr;
         """
     else:
         help_string = "No such option. Use {schema_madlib}.logregr_simple_train('help')"

     # Every branch uses the {schema_madlib} placeholder, filled in here.
     return help_string.format(schema_madlib=schema_madlib)
	# coding=utf-8
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.


	"""
	@file simple_logistic.py_in

	@brief Logistic Regression: Driver functions

	@namespace simple_logistic

	@brief Logistic Regression: Driver functions
	"""
	import plpy
	from utilities.validate_args import table_exists
	from utilities.validate_args import table_is_empty
	# ------------------------------------------------------------------------


	def logregr_simple_train(
	        schema_madlib, source_table, out_table, dependent_varname,
	        independent_varname, max_iter=None,
	        tolerance=None, verbose=None, **kwargs):
	    """
	    Entry point for training the simple logistic regression model.

	    This is a thin wrapper: every argument is forwarded unchanged to
	    __logregr_train_compute, which performs the validation and the work.

	    @param schema_madlib Name of the MADlib schema, properly escaped/quoted
	    @param source_table Name of relation containing the training data
	    @param out_table Name of relation where the model will be written
	    @param dependent_varname Name of dependent column in training data
	           (cast to BOOLEAN by the compute routine)
	    @param independent_varname Name of independent column in training data
	           (cast to DOUBLE PRECISION[] by the compute routine)
	    @param max_iter The maximum number of iterations that are allowed
	    @param tolerance The precision that the results should have
	    @param verbose Forwarded to the compute routine as-is
	    @param kwargs Additional keyword arguments, forwarded as-is. They let
	           the caller unpack a dictionary whose key set is a superset of
	           the arguments this function requires.

	    @return The value returned by __logregr_train_compute
	    """
	    return __logregr_train_compute(
	        schema_madlib, source_table, out_table, dependent_varname,
	        independent_varname, max_iter, tolerance,
	        verbose, **kwargs)

	# ========================================================================


	def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
	                            ind_col, max_iter, tolerance, verbose, **kwargs):
	    """
	    Validate the arguments, run the training iterations and write the model.

	    The routine repeatedly executes {schema_madlib}.__logregr_simple_step
	    over the source table, feeding the state returned by one pass into the
	    next, and finally materializes the result of
	    {schema_madlib}.__logregr_simple_finalizer into the output table.

	    @param schema_madlib Name of the MADlib schema, interpolated into the SQL
	    @param tbl_source Name of the relation holding the training data
	    @param tbl_output Name of the relation to (re)create with the model;
	           an existing relation of that name is dropped first
	    @param dep_col Dependent column/expression, cast to boolean
	    @param ind_col Independent column/expression, cast to double precision[]
	    @param max_iter Number of passes over the data; must be positive
	    @param tolerance Must be non-negative. It is only validated here: the
	           loop below always runs exactly max_iter passes.
	    @param verbose Accepted for interface compatibility; not used
	    @param kwargs Accepted and ignored

	    @return None. The model is written to tbl_output with the columns
	            coef and log_likelihood.
	    """
	    # plpy.error() raises, so each failed check aborts the call right away.
	    if tbl_source is None or tbl_source.strip().lower() in ('null', ''):
	        plpy.error("Logregr error: Invalid data table name!")
	    if not table_exists(tbl_source):
	        plpy.error("Logregr error: Data table does not exist!")
	    if table_is_empty(tbl_source):
	        plpy.error("Logregr error: Data table is empty!")

	    if tbl_output is None or tbl_output.strip().lower() in ('null', ''):
	        plpy.error("Logregr error: Invalid output table name!")

	    if not dep_col or dep_col.strip().lower() in ('null', ''):
	        plpy.error("Logregr error: Invalid dependent column name!")

	    # if not columns_exist_in_table(tbl_source, [dep_col]):
	    #     plpy.error("Logregr error: Dependent column does not exist!")

	    # Strip before comparing, like the other name checks, so that a
	    # whitespace-only value is rejected as well.
	    if not ind_col or ind_col.strip().lower() in ('null', ''):
	        plpy.error("Logregr error: Invalid independent column name!")

	    # A None value (SQL NULL) must be rejected explicitly: comparing None
	    # with a number raises TypeError on Python 3 instead of failing the
	    # check as it did on Python 2.
	    if max_iter is None or max_iter <= 0:
	        plpy.error("Logregr error: Maximum number of iterations must be positive!")

	    if tolerance is None or tolerance < 0:
	        plpy.error("Logregr error: The tolerance cannot be negative!")

	    # NOTE(review): relation and column names cannot be bound as query
	    # parameters, so they are interpolated into the SQL text. Callers must
	    # pass trusted, properly quoted identifiers/expressions.
	    update_plan = plpy.prepare(
	        """
	        SELECT
	            {schema_madlib}.__logregr_simple_step(
	                ({dep_col})::boolean,
	                ({ind_col})::double precision[],
	                ($1))
	        FROM {tbl_source}
	        """.format(
	            schema_madlib=schema_madlib,
	            dep_col=dep_col,
	            ind_col=ind_col,
	            tbl_source=tbl_source), ["double precision[]"])

	    # The first pass starts from a NULL state; every later pass receives the
	    # state produced by the previous one.
	    state = None
	    for _ in range(max_iter):
	        res_tuple = plpy.execute(update_plan, [state])
	        # The query selects a single unnamed column. Materialize the values
	        # first: dict views are not subscriptable on Python 3.
	        state = list(res_tuple[0].values())[0]

	    output_table = plpy.prepare(
	        """
	        drop table if exists {tbl_output};
	        create table {tbl_output} as
	            select
	                (result).coef as coef,
	                (result).log_likelihood as log_likelihood
	            from
	            (
	                select * from
	                {schema_madlib}.__logregr_simple_finalizer($1)
	            ) result
	        """.format(schema_madlib=schema_madlib,
	                   tbl_output=tbl_output), ["double precision[]"])

	    plpy.execute(output_table, [state])
	    return None
	# --------------------------------------------------------------------


	def logregr_simple_help_msg(schema_madlib, message, **kwargs):
	    """ Help message for simple logistic regression

	    @param schema_madlib Name of the MADlib schema; substituted into the
	           returned text wherever a schema-qualified function name appears
	    @param message A string, the help message indicator: empty or None for
	           the summary, 'usage'/'help'/'?' for the signature and output
	           description, 'example'/'examples' for a worked example
	    @param kwargs Extra keyword arguments are accepted and ignored

	    Returns:
	      A string, contains the help message
	    """
	    if not message:

	        help_string = """
	----------------------------------------------------------------
	                        SUMMARY
	----------------------------------------------------------------
	Binomial logistic regression models the relationship between a
	dichotomous dependent variable and one or more predictor variables.

	The dependent variable may be a Boolean value or a categorical variable
	that can be represented with a Boolean expression.

	For more details on function usage:
	    SELECT {schema_madlib}.logregr_simple_train('usage')

	For a small example on using the function:
	    SELECT {schema_madlib}.logregr_simple_train('example')
	        """
	    elif message in ['usage', 'help', '?']:

	        help_string = """
	------------------------------------------------------------------
	                        USAGE
	------------------------------------------------------------------
	SELECT {schema_madlib}.logregr_simple_train(
	    source_table,         -- name of input table
	    out_table,            -- name of output table
	    dependent_varname,    -- name of dependent variable
	    independent_varname,  -- names of independent variables
	    max_iter,             -- optional, default 20, maximum iteration number
	    tolerance,            -- optional, default 0.0001, the stopping threshold
	    verbose               -- optional, default FALSE, whether to print useful info
	);

	------------------------------------------------------------------
	                        OUTPUT
	------------------------------------------------------------------
	The output table ('out_table' above) has the following columns:
	    'coef',             double precision[], -- vector of fitting coefficients
	    'log_likelihood',   double precision    -- log likelihood
	        """
	    elif message in ['example', 'examples']:

	        # The backslash of the COPY end-of-data marker is escaped so that
	        # the literal does not contain an invalid escape sequence.
	        help_string = """
	CREATE TABLE patients( id INTEGER NOT NULL,
	                       second_attack BOOLEAN,
	                       treatment INTEGER,
	                       trait_anxiety INTEGER);
	COPY patients FROM STDIN WITH DELIMITER '|';
	  1 |   True  |         1 |            70
	  3 |   True  |         1 |            50
	  5 |   True  |         0 |            40
	  7 |   True  |         0 |            75
	  9 |   True  |         0 |            70
	 11 |   False |         1 |            65
	 13 |   False |         1 |            45
	 15 |   False |         1 |            40
	 17 |   False |         0 |            55
	 19 |   False |         0 |            50
	  2 |   True  |         1 |            80
	  4 |   True  |         0 |            60
	  6 |   True  |         0 |            65
	  8 |   True  |         0 |            80
	 10 |   True  |         0 |            60
	 12 |   False |         1 |            50
	 14 |   False |         1 |            35
	 16 |   False |         1 |            50
	 18 |   False |         0 |            45
	 20 |   False |         0 |            60
	\\.

	-- Drop output tables before calling the function
	DROP TABLE IF EXISTS patients_logregr;
	DROP TABLE IF EXISTS patients_logregr_summary;

	SELECT {schema_madlib}.logregr_simple_train( 'patients',
	                             'patients_logregr',
	                             'second_attack',
	                             'ARRAY[1, treatment, trait_anxiety]');

	SELECT * from patients_logregr;
	        """
	    else:
	        help_string = "No such option. Use {schema_madlib}.logregr_simple_train('help')"

	    # Every branch uses the {schema_madlib} placeholder, filled in here.
	    return help_string.format(schema_madlib=schema_madlib)