blob: 8d5f1fa1b48e77bdedef8a1ca69e92ffb58d4a4e [file] [log] [blame]
"""
@file summary.py_in
@brief Summary function for descriptive statistics
@namespace summary
"""
import plpy
from time import time
from utilities.utilities import __mad_version
from Summarizer import Summarizer
version_wrapper = __mad_version()
_get_vector = version_wrapper.select_vecfunc()
def summary(schema_madlib, source_table, output_table, target_cols, grouping_cols,
get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates):
"""
Main summary function that is called by SQL to execute summary
statistics on a table.
@param schema_madlib Madlib Schema namespace
@param source_table Name of input table
@param output_table Name of output table
@param target_cols Names of specific columns for which to get summary
@param grouping_cols Names of columns on which to group-by
(no summary is provided for these columns)
@param get_distinct Should summary include distinct count
@param get_quartiles Should summary include quartile information
@param ntile_array Array for quantiles to include in summary
(each element should be in [0, 1])
@param how_many_mfv How many frequent values to output?
@param get_estimates Should the summmary information be estimated or exact?
"""
old_msg_level = plpy.execute(
"""select setting from pg_settings where
name='client_min_messages'""")[0]['setting']
plpy.execute("set client_min_messages to error")
# 'Estimated', 'Exact', None
distinctify = 'Estimated'
xtileify = 'Exact' # 'Estimated' not supported at present
get_mfv_quick = True
if not get_estimates:
distinctify = 'Exact'
#xtileify = 'Exact'
get_mfv_quick = False
if not get_distinct:
distinctify = 'Skip'
if (version_wrapper.is_pg_version_less_than('9.4') or
version_wrapper.is_gp_version_less_than('4.2.2')):
# PERCENTILE_CONT not available in PostgreSQL < 9.4 or GPDB < 4.2.2.
# The function is available in HAWQ 1.2.0 (even though HAWQ 1.2.0
# is based on GPDB 4.2.0)
xtileify = 'Skip'
# GPDB < 4.2 and PG < 9.0 passes vector as a string.
# Ensure that ntile_array is a list of floats not a string
if ntile_array:
try:
ntile_array = [float(tile) for tile in _get_vector(ntile_array)]
except ValueError:
plpy.error("""Summary -- Invalid paramter:
Percentile array has non-float data""")
# convert comma delimited string input to a list of column names
(target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols)
start = time()
summarizer = Summarizer(
schema_madlib, source_table, output_table, target_cols, grouping_cols,
distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv,
get_mfv_quick)
summarizer.run()
end = time()
row_count = plpy.execute(
"SELECT count(*) FROM {output_table}".format(
output_table = output_table))[0]['count']
plpy.execute("set client_min_messages to " + old_msg_level)
return (output_table, row_count, end - start)
# -----------------------------------------------------------------------
# Input parameter checks and edits
# -----------------------------------------------------------------------
def _analyze_str_inputs(target_cols, grouping_cols):
"""
Analyze target_col and grouping_cols string input and convert them to a list
"""
if not target_cols or target_cols.strip() in ('', '*'):
target_cols = None
else:
target_cols = target_cols.replace(' ', '').split(',')
if not grouping_cols or grouping_cols.strip() == '':
# We make grouping_cols a list with None as an element
# since we always want to return summary for complete table
grouping_cols = [None]
else:
grouping_cols = grouping_cols.replace(' ', '').split(',')
grouping_cols.append(None) # to return summary on complete table
return (target_cols, grouping_cols)
# -----------------------------------------------------------------------
# Help messages
# -----------------------------------------------------------------------
def summary_help_message(schema_madlib, message, **kwargs):
"""
Given a help string, provide usage information
"""
if message is not None and \
message.lower() in ("usage", "help", "?"):
return """
Usage:
-----------------------------------------------------------------------
SELECT {schema_madlib}.summary
(
source_table TEXT, -- Source table name (Required)
output_table TEXT, -- Output table name (Required)
target_cols TEXT, -- Comma separated columns for which summary is desired
-- (Default: NULL - produces result for all columns)
grouping_cols TEXT, -- Comma separated columns on which to group results
-- (Default: NULL - provides summary on complete table)
get_distinct BOOLEAN, -- Are distinct values required?
-- (Default: True)
get_quartiles BOOLEAN, -- Are quartiles required?
-- (Default: True)
ntile_array FLOAT8[], -- Array of quantile values to compute
-- (Default: NULL - Quantile array is not included in result)
how_many_mfv INTEGER, -- How many most-frequent-values (MFVs) to compute?
-- (Default: 10)
get_estimates BOOLEAN -- Should we produce an estimated
-- (as opposed to an exact but slow) value for distincts and MFVs?
) -- (Default: True)
-----------------------------------------------------------------------
Output table will be in following format
(presence of some columns depends on the arguments):
- group_by_column : Group-by column names (NULL if none provided)
- group_by_value : Values of the Group-by column (NULL if no grouping)
- target_column : Targeted column values for which summary is requested
- column_number : Physical column number for the target column, as described in pg_attribute
- data_type : Data type of target column. Standard GPDB descriptors will be displayed
- row_count : Number of rows for the target column
- distinct_values : Number of distinct values in the target column
- missing_values : Number of missing values in the target column
- blank_values : Number of blank values (blanks are defined by the regular expression '^\w*$')
- fraction_missing : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
- fraction_blank : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
- mean : Mean value of target column (if target is numeric, else NULL)
- variance : Variance of target columns (if target is numeric, else NULL for strings)
- min : Min value of target column (for strings this is the length of the shortest string)
- max : Max value of target column (for strings this is the length of the longest string)
- first_quartile : First quartile (25th percentile, valid only for numeric columns)
- median : Median value of target column (valid only for numeric columns)
- third_quartile : Third quartile (75th percentile, valid only for numeric columns)
- quantile_array : Percentile values corresponding to ntile_array
- most_frequent_values : Most frequent values
- mfv_frequencies : Frequency of the most frequent values
""".format(schema_madlib = schema_madlib)
elif message is not None and message.lower() in ('example', 'examples'):
return """
DROP TABLE IF EXISTS example_data;
CREATE TABLE example_data(
id SERIAL,
outlook text,
temperature float8,
humidity float8,
windy text,
class text) ;
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 85, 85, 'false', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 80, 90, 'true', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 83, 78, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 70, 96, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 68, 80, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 65, 70, 'true', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 64, 65, 'true', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 72, 95, 'false', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 69, 70, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 75, 80, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('sunny', 75, 70, 'true', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 72, 90, 'true', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('overcast', 81, 75, 'false', 'Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('rain', 71, 80, 'true', E'Don\\'t Play');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES(' ', 100, 100, 'true', ' ');
INSERT INTO example_data(outlook, temperature, humidity, windy, class) VALUES('', 110, 100, 'true', '');
SELECT madlib.summary('example_data', 'example_data_output');
SELECT madlib.summary('example_data', 'example_data_output', 'windy');
SELECT madlib.summary('example_data', 'example_data_output', 'windy,humidity');
SELECT madlib.summary('example_data', 'example_data_output', 'id', 'windy');
SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3]);
SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2);
SELECT madlib.summary('example_data', 'example_data_output', NULL, NULL, True, True, array[0.1, 0.2, 0.3], 2, False);
"""
else:
return """
'summary' is a generic function used to produce summary statistics
of any data table. The function invokes particular 'methods' from
the MADlib library to provide an overview of the data.
-------
For an overview on usage, run:
SELECT {schema_madlib}.summary('usage');
-------
For an example, run:
SELECT {schema_madlib}.summary('example')
""".format(schema_madlib = schema_madlib)