src/ports/postgres/modules/summary/summary.py_in - madlib - Git at Google

 """
 @file summary.py_in

 @brief Summary function for descriptive statistics

 @namespace summary
 """
 import plpy
 from time import time
 from utilities.utilities import __mad_version
 from Summarizer import Summarizer
 version_wrapper = __mad_version()
 _get_vector = version_wrapper.select_vecfunc()

 def summary(schema_madlib, source_table, output_table, target_cols, grouping_cols,
     get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates):
     """
         Main summary function that is called by SQL to execute summary
         statistics on a table.

         @param schema_madlib        Madlib Schema namespace
         @param source_table         Name of input table
         @param output_table         Name of output table
         @param target_cols          Names of specific columns for which to get summary
         @param grouping_cols        Names of columns on which to group-by
                                         (no summary is provided for these columns)
         @param get_distinct         Should summary include distinct count
         @param get_quartiles        Should summary include quartile information
         @param ntile_array          Array for quantiles to include in summary
                                         (each element should be in [0, 1])
         @param how_many_mfv         How many frequent values to output?
         @param get_estimates        Should the summmary information be estimated or exact?

     """

     old_msg_level = plpy.execute(
         """select setting from pg_settings where
         name='client_min_messages'""")[0]['setting']
     plpy.execute("set client_min_messages to error")

     # 'Estimated', 'Exact', None
     distinctify = 'Estimated'
     xtileify = 'Exact'          # 'Estimated' not supported at present
     get_mfv_quick = True

     if not get_estimates:
         distinctify = 'Exact'
         #xtileify = 'Exact'
         get_mfv_quick = False

     if not get_distinct:
         distinctify = 'Skip'

     if (version_wrapper.is_pg_version_less_than('9.4') or
             version_wrapper.is_gp_version_less_than('4.2.2')):
         # PERCENTILE_CONT not available in PostgreSQL < 9.4 or GPDB < 4.2.2.
         # The function is available in HAWQ 1.2.0 (even though HAWQ 1.2.0
         #   is based on GPDB 4.2.0)
         xtileify = 'Skip'

     # GPDB < 4.2 and PG < 9.0 passes vector as a string.
     # Ensure that ntile_array is a list of floats not a string
     if ntile_array:
         try:
             ntile_array = [float(tile) for tile in _get_vector(ntile_array)]
         except ValueError:
             plpy.error("""Summary -- Invalid paramter:
                         Percentile array has non-float data""")

     # convert comma delimited string input to a list of column names
     (target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols)

     start = time()
     summarizer = Summarizer(
         schema_madlib, source_table, output_table, target_cols, grouping_cols,
         distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv,
         get_mfv_quick)
     summarizer.run()
     end = time()

     row_count = plpy.execute(
         "SELECT count(*) FROM {output_table}".format(
             output_table = output_table))[0]['count']

     plpy.execute("set client_min_messages to " + old_msg_level)
     return (output_table, row_count, end - start)

 # -----------------------------------------------------------------------
 # Input parameter checks and edits
 # -----------------------------------------------------------------------
 def _analyze_str_inputs(target_cols, grouping_cols):
     """
     Analyze target_col and grouping_cols string input and convert them to a list
     """
     if not target_cols or target_cols.strip() in ('', '*'):
         target_cols = None
     else:
         target_cols = target_cols.replace(' ', '').split(',')
     if not grouping_cols or grouping_cols.strip() == '':
         # We make grouping_cols a list with None as an element
         # since we always want to return summary for complete table
         grouping_cols = [None]
     else:
         grouping_cols = grouping_cols.replace(' ', '').split(',')
         grouping_cols.append(None) # to return summary on complete table
     return (target_cols, grouping_cols)

 # -----------------------------------------------------------------------
 # Help messages
 # -----------------------------------------------------------------------
 def summary_help_message(schema_madlib, message, **kwargs):
     """
     Given a help string, provide usage information
     """
     if message is not None and \
             message.lower() in ("usage", "help", "?"):
         return """
         Usage:
         -----------------------------------------------------------------------
         SELECT {schema_madlib}.summary
         (
             source_table            TEXT,       -- Source table name (Required)
             output_table            TEXT,       -- Output table name (Required)
             target_cols             TEXT,       -- Comma separated columns for which summary is desired
                                                 --      (Default: NULL - produces result for all columns)
             grouping_cols           TEXT,       -- Comma separated columns on which to group results
                                                 --      (Default: NULL - provides summary on complete table)
             get_distinct            BOOLEAN,    -- Are distinct values required?
                                                 --      (Default: True)
             get_quartiles           BOOLEAN,    -- Are quartiles required?
                                                 --      (Default: True)
             ntile_array             FLOAT8[],   -- Array of quantile values to compute
                                                 --      (Default: NULL - Quantile array is not included in result)
             how_many_mfv            INTEGER,    -- How many most-frequent-values (MFVs) to compute?
                                                 --      (Default: 10)
             get_estimates           BOOLEAN     -- Should we produce an estimated
                                                 -- (as opposed to an exact but slow) value for distincts and MFVs?
         )                                       --      (Default: True)
         -----------------------------------------------------------------------
         Output table will be in following format
         (presence of some columns depends on the arguments):
             - group_by_column       : Group-by column names (NULL if none provided)
             - group_by_value        : Values of the Group-by column (NULL if no grouping)
             - target_column         : Targeted column values for which summary is requested
             - column_number         : Physical column number for the target column, as described in pg_attribute
             - data_type             : Data type of target column. Standard GPDB descriptors will be displayed
             - row_count             : Number of rows for the target column
             - distinct_values       : Number of distinct values in the target column
             - missing_values        : Number of missing values in the target column
             - blank_values          : Number of blank values (blanks are defined by the regular expression '^\w*$')
             - fraction_missing      : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
             - fraction_blank        : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
             - mean                  : Mean value of target column (if target is numeric, else NULL)
             - variance              : Variance of target columns (if target is numeric, else NULL for strings)
             - min                   : Min value of target column (for strings this is the length of the shortest string)
             - max                   : Max value of target column (for strings this is the length of the longest string)
             - first_quartile        : First quartile (25th percentile, valid only for numeric columns)
             - median                : Median value of target column (valid only for numeric columns)
             - third_quartile        : Third quartile (75th percentile, valid only for numeric columns)
             - quantile_array        : Percentile values corresponding to ntile_array
             - most_frequent_values  : Most frequent values
             - mfv_frequencies       : Frequency of the most frequent values
         """.format(schema_madlib = schema_madlib)
     elif message is not None and message.lower() in ('example', 'examples'):
         return """
             DROP TABLE IF EXISTS example_data;
             CREATE TABLE example_data(
                 id SERIAL,
                 outlook text,
                 temperature float8,
                 humidity float8,
                 windy text,
                 class text) ;

             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' ');
             INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', '');

             SELECT madlib.summary('example_data', 'example_data_output');
             SELECT madlib.summary('example_data', 'example_data_output', 'windy');
             SELECT madlib.summary('example_data', 'example_data_output', 'windy,humidity');
             SELECT madlib.summary('example_data', 'example_data_output', 'id', 'windy');
             SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3]);
             SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2);
             SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False);
          """
     else:
        return """
             'summary' is a generic function used to produce summary statistics
             of any data table.  The function invokes particular 'methods' from
             the MADlib library to provide an overview of the data.
             -------
             For an overview on usage, run:
             SELECT {schema_madlib}.summary('usage');
             -------
             For an example, run:
             SELECT {schema_madlib}.summary('example')
             """.format(schema_madlib = schema_madlib)
	"""
	@file summary.py_in

	@brief Summary function for descriptive statistics

	@namespace summary
	"""
	import plpy
	from time import time
	from utilities.utilities import __mad_version
	from Summarizer import Summarizer
	version_wrapper = __mad_version()
	_get_vector = version_wrapper.select_vecfunc()

	def summary(schema_madlib, source_table, output_table, target_cols, grouping_cols,
	get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates):
	"""
	Main summary function that is called by SQL to execute summary
	statistics on a table.

	@param schema_madlib Madlib Schema namespace
	@param source_table Name of input table
	@param output_table Name of output table
	@param target_cols Names of specific columns for which to get summary
	@param grouping_cols Names of columns on which to group-by
	(no summary is provided for these columns)
	@param get_distinct Should summary include distinct count
	@param get_quartiles Should summary include quartile information
	@param ntile_array Array for quantiles to include in summary
	(each element should be in [0, 1])
	@param how_many_mfv How many frequent values to output?
	@param get_estimates Should the summmary information be estimated or exact?

	"""

	old_msg_level = plpy.execute(
	"""select setting from pg_settings where
	name='client_min_messages'""")[0]['setting']
	plpy.execute("set client_min_messages to error")

	# 'Estimated', 'Exact', None
	distinctify = 'Estimated'
	xtileify = 'Exact' # 'Estimated' not supported at present
	get_mfv_quick = True

	if not get_estimates:
	distinctify = 'Exact'
	#xtileify = 'Exact'
	get_mfv_quick = False

	if not get_distinct:
	distinctify = 'Skip'

	if (version_wrapper.is_pg_version_less_than('9.4') or
	version_wrapper.is_gp_version_less_than('4.2.2')):
	# PERCENTILE_CONT not available in PostgreSQL < 9.4 or GPDB < 4.2.2.
	# The function is available in HAWQ 1.2.0 (even though HAWQ 1.2.0
	# is based on GPDB 4.2.0)
	xtileify = 'Skip'

	# GPDB < 4.2 and PG < 9.0 passes vector as a string.
	# Ensure that ntile_array is a list of floats not a string
	if ntile_array:
	try:
	ntile_array = [float(tile) for tile in _get_vector(ntile_array)]
	except ValueError:
	plpy.error("""Summary -- Invalid paramter:
	Percentile array has non-float data""")

	# convert comma delimited string input to a list of column names
	(target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols)

	start = time()
	summarizer = Summarizer(
	schema_madlib, source_table, output_table, target_cols, grouping_cols,
	distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv,
	get_mfv_quick)
	summarizer.run()
	end = time()

	row_count = plpy.execute(
	"SELECT count(*) FROM {output_table}".format(
	output_table = output_table))[0]['count']

	plpy.execute("set client_min_messages to " + old_msg_level)
	return (output_table, row_count, end - start)

	# -----------------------------------------------------------------------
	# Input parameter checks and edits
	# -----------------------------------------------------------------------
	def _analyze_str_inputs(target_cols, grouping_cols):
	"""
	Analyze target_col and grouping_cols string input and convert them to a list
	"""
	if not target_cols or target_cols.strip() in ('', '*'):
	target_cols = None
	else:
	target_cols = target_cols.replace(' ', '').split(',')
	if not grouping_cols or grouping_cols.strip() == '':
	# We make grouping_cols a list with None as an element
	# since we always want to return summary for complete table
	grouping_cols = [None]
	else:
	grouping_cols = grouping_cols.replace(' ', '').split(',')
	grouping_cols.append(None) # to return summary on complete table
	return (target_cols, grouping_cols)

	# -----------------------------------------------------------------------
	# Help messages
	# -----------------------------------------------------------------------
	def summary_help_message(schema_madlib, message, **kwargs):
	"""
	Given a help string, provide usage information
	"""
	if message is not None and \
	message.lower() in ("usage", "help", "?"):
	return """
	Usage:
	-----------------------------------------------------------------------
	SELECT {schema_madlib}.summary
	(
	source_table TEXT, -- Source table name (Required)
	output_table TEXT, -- Output table name (Required)
	target_cols TEXT, -- Comma separated columns for which summary is desired
	-- (Default: NULL - produces result for all columns)
	grouping_cols TEXT, -- Comma separated columns on which to group results
	-- (Default: NULL - provides summary on complete table)
	get_distinct BOOLEAN, -- Are distinct values required?
	-- (Default: True)
	get_quartiles BOOLEAN, -- Are quartiles required?
	-- (Default: True)
	ntile_array FLOAT8[], -- Array of quantile values to compute
	-- (Default: NULL - Quantile array is not included in result)
	how_many_mfv INTEGER, -- How many most-frequent-values (MFVs) to compute?
	-- (Default: 10)
	get_estimates BOOLEAN -- Should we produce an estimated
	-- (as opposed to an exact but slow) value for distincts and MFVs?
	) -- (Default: True)
	-----------------------------------------------------------------------
	Output table will be in following format
	(presence of some columns depends on the arguments):
	- group_by_column : Group-by column names (NULL if none provided)
	- group_by_value : Values of the Group-by column (NULL if no grouping)
	- target_column : Targeted column values for which summary is requested
	- column_number : Physical column number for the target column, as described in pg_attribute
	- data_type : Data type of target column. Standard GPDB descriptors will be displayed
	- row_count : Number of rows for the target column
	- distinct_values : Number of distinct values in the target column
	- missing_values : Number of missing values in the target column
	- blank_values : Number of blank values (blanks are defined by the regular expression '^\w*$')
	- fraction_missing : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
	- fraction_blank : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
	- mean : Mean value of target column (if target is numeric, else NULL)
	- variance : Variance of target columns (if target is numeric, else NULL for strings)
	- min : Min value of target column (for strings this is the length of the shortest string)
	- max : Max value of target column (for strings this is the length of the longest string)
	- first_quartile : First quartile (25th percentile, valid only for numeric columns)
	- median : Median value of target column (valid only for numeric columns)
	- third_quartile : Third quartile (75th percentile, valid only for numeric columns)
	- quantile_array : Percentile values corresponding to ntile_array
	- most_frequent_values : Most frequent values
	- mfv_frequencies : Frequency of the most frequent values
	""".format(schema_madlib = schema_madlib)
	elif message is not None and message.lower() in ('example', 'examples'):
	return """
	DROP TABLE IF EXISTS example_data;
	CREATE TABLE example_data(
	id SERIAL,
	outlook text,
	temperature float8,
	humidity float8,
	windy text,
	class text) ;

	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' ');
	INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', '');

	SELECT madlib.summary('example_data', 'example_data_output');
	SELECT madlib.summary('example_data', 'example_data_output', 'windy');
	SELECT madlib.summary('example_data', 'example_data_output', 'windy,humidity');
	SELECT madlib.summary('example_data', 'example_data_output', 'id', 'windy');
	SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3]);
	SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2);
	SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False);
	"""
	else:
	return """
	'summary' is a generic function used to produce summary statistics
	of any data table. The function invokes particular 'methods' from
	the MADlib library to provide an overview of the data.
	-------
	For an overview on usage, run:
	SELECT {schema_madlib}.summary('usage');
	-------
	For an example, run:
	SELECT {schema_madlib}.summary('example')
	""".format(schema_madlib = schema_madlib)