| """ |
| @file summary.py_in |
| |
| @brief Summary function for descriptive statistics |
| |
| @namespace summary |
| """ |
| import plpy |
| from time import time |
| |
| from utilities.control import MinWarning |
| from Summarizer import Summarizer |
| |
| |
| def summary(schema_madlib, source_table, output_table, target_cols, grouping_cols, |
| get_distinct, get_quartiles, ntile_array, how_many_mfv, |
| get_estimates, n_cols_per_run): |
| """ |
| Main summary function that is called by SQL to compute summary |
| statistics on a table. |
| |
| @param schema_madlib Madlib Schema namespace |
| @param source_table Name of input table |
| @param output_table Name of output table |
| @param target_cols Names of specific columns for which to get summary |
| @param grouping_cols Names of columns on which to group-by |
| (no summary is provided for these columns) |
| @param get_distinct Should summary include distinct count |
| @param get_quartiles Should summary include quartile information |
| @param ntile_array Array for quantiles to include in summary |
| (each element should be in [0, 1]) |
| @param how_many_mfv How many frequent values to output? |
| @param get_estimates Should the summmary information be estimated or exact? |
| @param n_cols_per_run Number of columns to compute statistics in a single run |
| |
| """ |
| |
| with MinWarning('error'): |
| # 'Estimated', 'Exact', None |
| distinctify = 'Estimated' |
| xtileify = 'Exact' # 'Estimated' not supported at present |
| get_mfv_quick = True |
| |
| if not get_estimates: |
| distinctify = 'Exact' |
| # xtileify = 'Exact' |
| get_mfv_quick = False |
| |
| if not get_distinct: |
| distinctify = 'Skip' |
| |
| # Ensure that ntile_array is a list of floats not a string |
| if ntile_array: |
| try: |
| ntile_array = [float(tile) for tile in ntile_array] |
| except ValueError: |
| plpy.error("""Summary -- Invalid paramter: |
| Percentile array has non-float data""") |
| |
| # convert comma delimited string input to a list of column names |
| (target_cols, grouping_cols) = _analyze_str_inputs(target_cols, grouping_cols) |
| |
| start = time() |
| summarizer = Summarizer( |
| schema_madlib, source_table, output_table, target_cols, grouping_cols, |
| distinctify, get_quartiles, xtileify, ntile_array, how_many_mfv, |
| get_mfv_quick, n_cols_per_run=n_cols_per_run) |
| summarizer.run() |
| end = time() |
| |
| num_col_summarized = plpy.execute( |
| "SELECT count(*) FROM {output_table}".format( |
| output_table=output_table))[0]['count'] |
| |
| return (output_table, num_col_summarized, end - start) |
| |
| |
| # ----------------------------------------------------------------------- |
| # Input parameter checks and edits |
| # ----------------------------------------------------------------------- |
| def _analyze_str_inputs(target_cols, grouping_cols): |
| """ |
| Analyze target_col and grouping_cols string input and convert them to a list |
| """ |
| if not target_cols or target_cols.strip() in ('', '*'): |
| target_cols = None |
| else: |
| target_cols = target_cols.replace(' ', '').split(',') |
| if not grouping_cols or grouping_cols.strip() == '': |
| # We make grouping_cols a list with None as an element |
| # since we always want to return summary for complete table |
| grouping_cols = [None] |
| else: |
| grouping_cols = grouping_cols.replace(' ', '').split(',') |
| grouping_cols.append(None) # to return summary on complete table |
| return (target_cols, grouping_cols) |
| |
| |
| # ----------------------------------------------------------------------- |
| # Help messages |
| # ----------------------------------------------------------------------- |
| def summary_help_message(schema_madlib, message, **kwargs): |
| """ |
| Given a help string, provide usage information |
| """ |
| if message is not None and \ |
| message.lower() in ("usage", "help", "?"): |
| return """ |
| Usage: |
| ----------------------------------------------------------------------- |
| SELECT {madlib}.summary |
| ( |
| source_table TEXT, -- Source table name (Required) |
| output_table TEXT, -- Output table name (Required) |
| target_cols TEXT, -- Comma separated columns for which summary is desired |
| -- (Default: NULL - produces result for all columns) |
| grouping_cols TEXT, -- Comma separated columns on which to group results |
| -- (Default: NULL - provides summary on complete table) |
| get_distinct BOOLEAN, -- Are distinct values required? |
| -- (Default: True) |
| get_quartiles BOOLEAN, -- Are quartiles required? |
| -- (Default: True) |
| ntile_array FLOAT8[], -- Array of quantile values to compute |
| -- (Default: NULL - Quantile array is not included in result) |
| how_many_mfv INTEGER, -- How many most-frequent-values (MFVs) to compute? |
| -- (Default: 10) |
| get_estimates BOOLEAN, -- Should we produce an estimated |
| -- (as opposed to an exact but slow) value for distincts and MFVs? |
| -- (Default: True) |
| n_cols_per_run INTEGER -- Number of columns to collect summary statistics in |
| -- one pass of the data |
| ) |
| ----------------------------------------------------------------------- |
| Output table will be in following format |
| (presence of some columns depends on the arguments): |
| - group_by_column : Group-by column names (NULL if none provided) |
| - group_by_value : Values of the Group-by column (NULL if no grouping) |
| - target_column : Targeted column values for which summary is requested |
| - column_number : Physical column number for the target column, as described in pg_attribute |
| - data_type : Data type of target column. Standard GPDB descriptors will be displayed |
| - row_count : Number of rows for the target column |
| - distinct_values : Number of distinct values in the target column |
| - missing_values : Number of missing values in the target column |
| - blank_values : Number of blank values (blanks are defined by the regular expression '^\w*$') |
| - fraction_missing : Percentage of total rows that are missing. Will be expressed as a decimal (e.g. 0.3) |
| - fraction_blank : Percentage of total rows that are blank. Will be expressed as a decimal (e.g. 0.3) |
| - positive_values : Number of positive values in the target column (if target is numeric, else NULL) |
| - negative_values : Number of negative values in the target column (if target is numeric, else NULL) |
| - zero_values : Number of zero values in the target column (if target is numeric, else NULL) |
| - mean : Mean value of target column (if target is numeric, else NULL) |
| - variance : Variance of target columns (if target is numeric, else NULL for strings) |
| - confidence_interval : Confidence interval (95% using z-score) of the mean value for the target column (if target is numeric, else NULL) |
| - min : Min value of target column (for strings this is the length of the shortest string) |
| - max : Max value of target column (for strings this is the length of the longest string) |
| - first_quartile : First quartile (25th percentile, valid only for numeric columns) |
| - median : Median value of target column (valid only for numeric columns) |
| - third_quartile : Third quartile (75th percentile, valid only for numeric columns) |
| - quantile_array : Percentile values corresponding to ntile_array |
| - most_frequent_values : Most frequent values |
| - mfv_frequencies : Frequency of the most frequent values |
| """.format(madlib=schema_madlib) |
| else: |
| return """ |
| 'summary' is a generic function used to produce summary statistics |
| of any data table. The function invokes particular 'methods' from |
| the MADlib library to provide an overview of the data. |
| ------- |
| For an overview on usage, run: |
| SELECT {madlib}.summary('usage'); |
| """.format(madlib=schema_madlib) |