blob: ecd8726462f4440e5a54ccafc158dac5a42cbb75 [file] [log] [blame]
"""
@file summary.py_in
@brief Summary function for descriptive statistics
@namespace summary
"""
import plpy
from time import time
from utilities.control import MinWarning
from Summarizer import Summarizer
def summary(schema_madlib, source_table, output_table, target_cols, grouping_cols,
get_distinct, get_quartiles, ntile_array, how_many_mfv,
get_estimates, n_cols_per_run):
"""
Main summary function that is called by SQL to compute summary
statistics on a table.
@param schema_madlib Madlib Schema namespace
@param source_table Name of input table
@param output_table Name of output table
@param target_cols Names of specific columns for which to get summary
@param grouping_cols Names of columns on which to group-by
(no summary is provided for these columns)
@param get_distinct Should summary include distinct count
@param get_quartiles Should summary include quartile information
@param ntile_array Array for quantiles to include in summary
(each element should be in [0, 1])
@param how_many_mfv How many frequent values to output?
@param get_estimates Should the summmary information be estimated or exact?
@param n_cols_per_run Number of columns to compute statistics in a single run
"""
with MinWarning('error'):
# 'Estimated', 'Exact', None
distinctify = 'Estimated'
xtileify = 'Exact' # 'Estimated' not supported at present
get_mfv_quick = True
if not get_estimates:
distinctify = 'Exact'
# xtileify = 'Exact'
get_mfv_quick = False
if not get_distinct:
distinctify = 'Skip'
# Ensure that ntile_array is a list of floats not a string
if ntile_array:
try:
ntile_array = [float(tile) for tile in ntile_array]
except ValueError:
plpy.error("""Summary -- Invalid paramter:
Percentile array has non-float data""")
# convert comma delimited string input to a list of column names
(target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols)
start = time()
summarizer = Summarizer(
schema_madlib, source_table, output_table, target_cols, grouping_cols,
distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv,
get_mfv_quick, n_cols_per_run=n_cols_per_run)
summarizer.run()
end = time()
num_col_summarized = plpy.execute(
"SELECT count(*) FROM {output_table}".format(
output_table=output_table))[0]['count']
return (output_table, num_col_summarized, end - start)
# -----------------------------------------------------------------------
# Input parameter checks and edits
# -----------------------------------------------------------------------
def _analyze_str_inputs(target_cols, grouping_cols):
"""
Analyze target_col and grouping_cols string input and convert them to a list
"""
if not target_cols or target_cols.strip() in ('', '*'):
target_cols = None
else:
target_cols = target_cols.replace(' ', '').split(',')
if not grouping_cols or grouping_cols.strip() == '':
# We make grouping_cols a list with None as an element
# since we always want to return summary for complete table
grouping_cols = [None]
else:
grouping_cols = grouping_cols.replace(' ', '').split(',')
grouping_cols.append(None) # to return summary on complete table
return (target_cols, grouping_cols)
# -----------------------------------------------------------------------
# Help messages
# -----------------------------------------------------------------------
def summary_help_message(schema_madlib, message, **kwargs):
"""
Given a help string, provide usage information
"""
if message is not None and \
message.lower() in ("usage", "help", "?"):
return """
Usage:
-----------------------------------------------------------------------
SELECT {madlib}.summary
(
source_table TEXT, -- Source table name (Required)
output_table TEXT, -- Output table name (Required)
target_cols TEXT, -- Comma separated columns for which summary is desired
-- (Default: NULL - produces result for all columns)
grouping_cols TEXT, -- Comma separated columns on which to group results
-- (Default: NULL - provides summary on complete table)
get_distinct BOOLEAN, -- Are distinct values required?
-- (Default: True)
get_quartiles BOOLEAN, -- Are quartiles required?
-- (Default: True)
ntile_array FLOAT8[], -- Array of quantile values to compute
-- (Default: NULL - Quantile array is not included in result)
how_many_mfv INTEGER, -- How many most-frequent-values (MFVs) to compute?
-- (Default: 10)
get_estimates BOOLEAN, -- Should we produce an estimated
-- (as opposed to an exact but slow) value for distincts and MFVs?
-- (Default: True)
n_cols_per_run INTEGER -- Number of columns to collect summary statistics in
-- one pass of the data
)
-----------------------------------------------------------------------
Output table will be in following format
(presence of some columns depends on the arguments):
- group_by_column : Group-by column names (NULL if none provided)
- group_by_value : Values of the Group-by column (NULL if no grouping)
- target_column : Targeted column values for which summary is requested
- column_number : Physical column number for the target column, as described in pg_attribute
- data_type : Data type of target column. Standard GPDB descriptors will be displayed
- row_count : Number of rows for the target column
- distinct_values : Number of distinct values in the target column
- missing_values : Number of missing values in the target column
- blank_values : Number of blank values (blanks are defined by the regular expression '^\w*$')
- fraction_missing : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3)
- fraction_blank : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3)
- positive_values : Number of positive values in the target column (if target is numeric, else NULL)
- negative_values : Number of negative values in the target column (if target is numeric, else NULL)
- zero_values : Number of zero values in the target column (if target is numeric, else NULL)
- mean : Mean value of target column (if target is numeric, else NULL)
- variance : Variance of target columns (if target is numeric, else NULL for strings)
- confidence_interval : Confidence interval (95% using z-score) of the mean value for the target column (if target is numeric, else NULL)
- min : Min value of target column (for strings this is the length of the shortest string)
- max : Max value of target column (for strings this is the length of the longest string)
- first_quartile : First quartile (25th percentile, valid only for numeric columns)
- median : Median value of target column (valid only for numeric columns)
- third_quartile : Third quartile (75th percentile, valid only for numeric columns)
- quantile_array : Percentile values corresponding to ntile_array
- most_frequent_values : Most frequent values
- mfv_frequencies : Frequency of the most frequent values
""".format(madlib=schema_madlib)
else:
return """
'summary' is a generic function used to produce summary statistics
of any data table. The function invokes particular 'methods' from
the MADlib library to provide an overview of the data.
-------
For an overview on usage, run:
SELECT {madlib}.summary('usage');
""".format(madlib=schema_madlib)