src/ports/postgres/modules/summary/summary.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file summary.sql_in
  *
  * @brief Summary function for descriptive statistics
  * @date Mar 2013
  *
  *//* ------------------------------------------------------------------------*/

 m4_include(`SQLCommon.m4')

 /**

 @addtogroup grp_summary

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#usage">Summary Function Syntax</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#notes">Notes</a></li>
 <li><a href="#related">Related Topics</a></li>
 </ul>
 </div>

 @brief Calculates general descriptive statistics for any data table.


 The MADlib \b summary() function produces summary
 statistics for any data table. The function invokes various methods from
 the MADlib library to provide the data overview.

 @anchor usage
 @par Summary Function Syntax
 The \b summary() function has the following syntax:

 <pre class="syntax">
 summary ( source_table,
           output_table,
           target_cols,
           grouping_cols,
           get_distinct,
           get_quartiles,
           ntile_array,
           how_many_mfv,
           get_estimates,
           n_cols_per_run
         )
 </pre>
 The \b summary() function returns a composite type containing three fields:
 <table class="output">
     <tr>
         <th>output_table</th>
         <td>TEXT. The name of the output table.</td>
     </tr>
     <tr>
         <th>num_col_summarized</th>
         <td>INTEGER. The number of columns from the source table that have been summarized.</td>
     </tr>
     <tr>
         <th>duration</th>
         <td>FLOAT8. The time taken (in seconds) to compute the summary.</td>
     </tr>
 </table>
 \b Arguments
 <DL class="arglist">
 <dt>source_table</dt>
 <dd>TEXT. Name of the table containing the input data.</dd>

 <dt>output_table</dt>
 <dd>TEXT. Name of the table for the output summary statistics.
 This table contains the following columns:
 <table class="output">
     <tr>
         <th>group_by</th>
         <td>Group-by column name. NULL if none provided.</td>
     </tr>
     <tr>
         <th>group_by_value</th>
         <td>Value of the group-by column. NULL if there is no grouping.</td>
     </tr>
     <tr>
         <th>target_column</th>
         <td>Targeted column values for which summary is requested.</td>
     </tr>
     <tr>
         <th>column_number</th>
         <td>Physical column number for the target column, as described in \e pg_attribute</td> catalog.
     </tr>
     <tr>
         <th>data_type</th>
         <td>Data type of the target column. Standard GPDB type descriptors are displayed.</td>
     </tr>
     <tr>
         <th>row_count</th>
         <td>Number of rows for the target column.</td>
     </tr>
     <tr>
         <th>distinct_values</th>
         <td>Number of distinct values in the target column.
         If the summary() function is called with the <em>get_estimates</em>
         argument set to TRUE (default), then this is an estimated statistic based on the
         Flajolet-Martin distinct count estimator.  If the <em>get_estimates</em>
         argument set to FALSE, will use PostgreSQL COUNT DISTINCT.</td>
     </tr>
     <tr>
         <th>missing_values</th>
         <td>Number of missing values in the target column.</td>
     </tr>
     <tr>
         <th>blank_values</th>
         <td>Number of blank values. Blanks are defined by this regular expression: \verbatim '^\w*$'\endverbatim</td>
     </tr>
     <tr>
         <th>fraction_missing</th>
         <td>Percentage of total rows that are missing, as a decimal value, e.g. 0.3.</td>
     </tr>
     <tr>
         <th>fraction_blank</th>
         <td>Percentage of total rows that are blank, as a decimal value, e.g. 0.3.</td>
     </tr>
     <tr>
         <th>positive_values</th>
         <td>Number of positive values in the target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
         <th>negative_values</th>
         <td>Number of negative values in the target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
         <th>zero_values</th>
         <td>Number of zero values in the target column if target is numeric, otherwise NULL.
         Note that we are reporting exact equality to 0.0 here, so even if you have a
         float value that is extremely small (say due to rounding), it will not be
         reported as a zero value.</td>
     </tr>
     <tr>
         <th>mean</th>
         <td>Mean value of target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
         <th>variance</th>
         <td>Variance of target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
         <th>confidence_interval</th>
         <td>Confidence interval (95% using z-score) of the mean value for the target column
         if target is numeric, otherwise NULL.
         Presented as an array of two elements in the form {lower bound, upper bound}.</td>
     </tr>
     <tr>
         <th>min</th>
         <td>Minimum value of target column. For strings this is the length of the shortest string.</td>
     </tr>
     <tr>
         <th>max</th>
         <td>Maximum value of target column. For strings this is the length of the longest string.</td>
     </tr>
     <tr>
         <th>first_quartile</th>
         <td>First quartile (25th percentile), only for numeric columns.
         (Unavailable for PostgreSQL 9.3 or lower.)</td>
     </tr>
     <tr>
         <th>median</th>
         <td>Median value of target column, if target is numeric, otherwise NULL.
         (Unavailable for PostgreSQL 9.3 or lower.)</td>
     </tr>
     <tr>
         <th>third_quartile</th>
         <td>Third quartile (25th percentile), only for numeric columns.
         (Unavailable for PostgreSQL 9.3 or lower.)</td>
     </tr>
     <tr>
         <th>quantile_array</th>
         <td>Percentile values corresponding to \e ntile_array.
         (Unavailable for PostgreSQL 9.3 or lower.)</td>
     </tr>
     <tr>
         <th>most_frequent_values</th>
         <td>An array containing the most frequently occurring values. The \e
             how_many_mfv argument determines the length of the array, which is 10 by
             default. If the summary() function is called with the \e
             get_estimates argument set to TRUE (default), the frequent values
             computation is performed using a parallel aggregation method that is
             faster, but in some cases may fail to detect the exact most frequent
             values.</td>
     </tr>
     <tr>
         <th>mfv_frequencies</th>
         <td>Array containing the frequency count for each of the most frequent values. </td>
     </tr>
 </table></dd>

 <dt>target_columns (optional)</dt>
 <dd>TEXT, default NULL. A comma-separated list of columns to summarize.
 If NULL, summaries are produced for all columns.</dd>

 <dt>grouping_cols (optional)</dt>
 <dd>TEXT, default: null. A comma-separated list of columns on which to
 group results. If NULL, summaries are produced for the complete table.</dd>
 @note Please note that summary statistics are calculated for each grouping
 column independently.  That is, grouping columns are not combined together
 as in the regular PostgreSQL style GROUP BY directive.  (This was done
 to reduce long run time and huge output table size which would otherwise
 result in the case of large input tables with a lot of grouping_cols and
 target_cols specified.)

 <dt>get_distinct (optional)</dt>
 <dd>BOOLEAN, default TRUE. If true, distinct values are counted.
 The method for computing distinct values depends on the setting of
 the 'get_estimates' parameter below.</dd>

 <dt>get_quartiles (optional)</dt>
 <dd>BOOLEAN, default TRUE. If TRUE, quartiles are computed.</dd>

 <dt>ntile_array (optional)</dt>
 <dd>FLOAT8[], default NULL. An array of quantile values to compute.
 If NULL, quantile values are not computed.</dd>
 @note Quartile and quantile functions are not available in PostgreSQL 9.3 or
 lower.  If you are using PostgreSQL 9.3 or lower, the output table will not
 contain these values, even if you set 'get_quartiles' = TRUE or
 provide an array of quantile values for the parameter 'ntile_array'.

 <dt>how_many_mfv (optional)</dt>
 <dd>INTEGER, default: 10. The number of most-frequent-values to compute.
 The method for computing MFV depends on the setting of
 the 'get_estimates' parameter below.</dd>

 <dt>get_estimates (optional)</dt>
 <dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for
 distinct values and most frequent values. If FALSE, exact values are
 calculated which will take longer to run, with the impact depending on
 data size.</dd>

 <dt>n_cols_per_run (optional)</dt>
 <dd>INTEGER, default: 15. The number of columns to collect summary statistics in
 one pass of the data.
 This parameter determines the number of passes through the data. For e.g.,
 with a total of 40 columns to summarize and 'n_cols_per_run = 15', there will be
 3 passes through the data, with each pass summarizing a maximum of 15 columns.
 @note This parameter should be used with caution. Increasing this parameter could
 decrease the total run time (if number of passes decreases), but will increase
 the memory consumption during each run. Since PostgreSQL limits the memory available
 for a single aggregate run, this increased memory consumption could result in an
 out-of-memory termination error.

 </dd>
 </DL>


 @anchor examples
 @examp

 -# View online help for the summary() function.
 <pre class="example">
 SELECT * FROM madlib.summary();
 </pre>

 -# Create an input data table using part of the well known
 iris data set.
 <pre class="example">
 DROP TABLE IF EXISTS iris;
 CREATE TABLE iris (id INT, sepal_length FLOAT, sepal_width FLOAT,
                     petal_length FLOAT, petal_width FLOAT,
                    class_name text);
 INSERT INTO iris VALUES
 (1,5.1,3.5,1.4,0.2,'Iris-setosa'),
 (2,4.9,3.0,1.4,0.2,'Iris-setosa'),
 (3,4.7,3.2,1.3,0.2,'Iris-setosa'),
 (4,4.6,3.1,1.5,0.2,'Iris-setosa'),
 (5,5.0,3.6,1.4,0.2,'Iris-setosa'),
 (6,5.4,3.9,1.7,0.4,'Iris-setosa'),
 (7,4.6,3.4,1.4,0.3,'Iris-setosa'),
 (8,5.0,3.4,1.5,0.2,'Iris-setosa'),
 (9,4.4,2.9,1.4,0.2,'Iris-setosa'),
 (10,4.9,3.1,1.5,0.1,'Iris-setosa'),
 (11,7.0,3.2,4.7,1.4,'Iris-versicolor'),
 (12,6.4,3.2,4.5,1.5,'Iris-versicolor'),
 (13,6.9,3.1,4.9,1.5,'Iris-versicolor'),
 (14,5.5,2.3,4.0,1.3,'Iris-versicolor'),
 (15,6.5,2.8,4.6,1.5,'Iris-versicolor'),
 (16,5.7,2.8,4.5,1.3,'Iris-versicolor'),
 (17,6.3,3.3,4.7,1.6,'Iris-versicolor'),
 (18,4.9,2.4,3.3,1.0,'Iris-versicolor'),
 (19,6.6,2.9,4.6,1.3,'Iris-versicolor'),
 (20,5.2,2.7,3.9,1.4,'Iris-versicolor'),
 (21,6.3,3.3,6.0,2.5,'Iris-virginica'),
 (22,5.8,2.7,5.1,1.9,'Iris-virginica'),
 (23,7.1,3.0,5.9,2.1,'Iris-virginica'),
 (24,6.3,2.9,5.6,1.8,'Iris-virginica'),
 (25,6.5,3.0,5.8,2.2,'Iris-virginica'),
 (26,7.6,3.0,6.6,2.1,'Iris-virginica'),
 (27,4.9,2.5,4.5,1.7,'Iris-virginica'),
 (28,7.3,2.9,6.3,1.8,'Iris-virginica'),
 (29,6.7,2.5,5.8,1.8,'Iris-virginica'),
 (30,7.2,3.6,6.1,2.5,'Iris-virginica');
 </pre>

 -# Run the \b summary() function using all defaults.
 <pre class="example">
 DROP TABLE IF EXISTS iris_summary;
 SELECT * FROM madlib.summary( 'iris',            -- Source table
                               'iris_summary'     -- Output table
                             );
 </pre>
 Result:
 <pre class="result">
  output_table | num_col_summarized |     duration
 --------------+--------------------+-------------------
  iris_summary |                  6 | 0.574938058853149
 (1 row)
 </pre>
 View the summary data.
 <pre class=example>
 -- Turn on expanded display for readability.
 \\x on
 SELECT * FROM iris_summary;
 </pre>
 Result (partial):
 <pre class="result">
 ...
 &nbsp;-[ RECORD 2 ]--------+---------------------------------------------
 group_by             |
 group_by_value       |
 target_column        | sepal_length
 column_number        | 2
 data_type            | float8
 row_count            | 30
 distinct_values      | 22
 missing_values       | 0
 blank_values         |
 fraction_missing     | 0
 fraction_blank       |
 positive_values      | 30
 negative_values      | 0
 zero_values          | 0
 mean                 | 5.84333333333333
 variance             | 0.929436781609188
 confidence_interval  | {5.49834423494374,6.18832243172292}
 min                  | 4.4
 max                  | 7.6
 first_quartile       | 4.925
 median               | 5.75
 third_quartile       | 6.575
 most_frequent_values | {4.9,6.3,5,6.5,4.6,7.2,5.5,5.7,7.3,6.7}
 mfv_frequencies      | {4,3,2,2,2,1,1,1,1,1}
 ...
 &nbsp;-[ RECORD 6 ]--------+---------------------------------------------
 group_by             |
 group_by_value       |
 target_column        | class_name
 column_number        | 6
 data_type            | text
 row_count            | 30
 distinct_values      | 3
 missing_values       | 0
 blank_values         | 0
 fraction_missing     | 0
 fraction_blank       | 0
 positive_values      |
 negative_values      |
 zero_values          |
 mean                 |
 variance             |
 confidence_interval  |
 min                  | 11
 max                  | 15
 first_quartile       |
 median               |
 third_quartile       |
 most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica}
 mfv_frequencies      | {10,10,10}
 </pre>
 Note that for the text column in record 6, some statistics are n/a,
 and the min and max values represent the length of the shortest and
 longest strings respectively.

 -# Now group by the class of iris:
 <pre class="example">
 DROP TABLE IF EXISTS iris_summary;
 SELECT * FROM madlib.summary( 'iris',                       -- Source table
                               'iris_summary',               -- Output table
                               'sepal_length, sepal_width',  -- Columns to summarize
                               'class_name'                  -- Grouping column
                             );
 SELECT * FROM iris_summary;
 </pre>
 Result (partial):
 <pre class="result">
 &nbsp;-[ RECORD 1 ]--------+----------------------------------------
 group_by             | class_name
 group_by_value       | Iris-setosa
 target_column        | sepal_length
 column_number        | 2
 data_type            | float8
 row_count            | 10
 distinct_values      | 7
 missing_values       | 0
 blank_values         |
 fraction_missing     | 0
 fraction_blank       |
 positive_values      | 10
 negative_values      | 0
 zero_values          | 0
 mean                 | 4.86
 variance             | 0.0848888888888875
 confidence_interval  | {4.67941507384182,5.04058492615818}
 min                  | 4.4
 max                  | 5.4
 first_quartile       | 4.625
 median               | 4.9
 third_quartile       | 5
 most_frequent_values | {4.9,5,4.6,5.1,4.7,5.4,4.4}
 mfv_frequencies      | {2,2,2,1,1,1,1}
 ...
 &nbsp;-[ RECORD 3 ]--------+----------------------------------------
 group_by             | class_name
 group_by_value       | Iris-versicolor
 target_column        | sepal_length
 column_number        | 2
 data_type            | float8
 row_count            | 10
 distinct_values      | 10
 missing_values       | 0
 blank_values         |
 fraction_missing     | 0
 fraction_blank       |
 positive_values      | 10
 negative_values      | 0
 zero_values          | 0
 mean                 | 6.1
 variance             | 0.528888888888893
 confidence_interval  | {5.64924734548141,6.55075265451859}
 min                  | 4.9
 max                  | 7
 first_quartile       | 5.55
 median               | 6.35
 third_quartile       | 6.575
 most_frequent_values | {6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,7,6.4}
 mfv_frequencies      | {1,1,1,1,1,1,1,1,1,1}
 ...
 </pre>

 -# Trying some other parameters:
 <pre class="example">
 DROP TABLE IF EXISTS iris_summary;
 SELECT * FROM madlib.summary( 'iris',                       -- Source table
                               'iris_summary',               -- Output table
                               'sepal_length, sepal_width',  -- Columns to summarize
                                NULL,                        -- No grouping
                                TRUE,                        -- Get distinct values
                                FALSE,                       -- Dont get quartiles
                                ARRAY[0.33, 0.66],           -- Get ntiles
                                3,                           -- Number of MFV to compute
                                FALSE                        -- Get exact values
                             );
 SELECT * FROM iris_summary;
 </pre>
 Result:
 <pre class="result">
 &nbsp;-[ RECORD 1 ]--------+------------------------------------
 group_by             |
 group_by_value       |
 target_column        | sepal_length
 column_number        | 2
 data_type            | float8
 row_count            | 30
 distinct_values      | 22
 missing_values       | 0
 blank_values         |
 fraction_missing     | 0
 fraction_blank       |
 positive_values      | 30
 negative_values      | 0
 zero_values          | 0
 mean                 | 5.84333333333333
 variance             | 0.929436781609175
 confidence_interval  | {5.49834423494375,6.18832243172292}
 min                  | 4.4
 max                  | 7.6
 quantile_array       | {5.057,6.414}
 most_frequent_values | {4.9,6.3,6.5}
 mfv_frequencies      | {4,3,2}
 &nbsp;-[ RECORD 2 ]--------+------------------------------------
 group_by             |
 group_by_value       |
 target_column        | sepal_width
 column_number        | 3
 data_type            | float8
 row_count            | 30
 distinct_values      | 14
 missing_values       | 0
 blank_values         |
 fraction_missing     | 0
 fraction_blank       |
 positive_values      | 30
 negative_values      | 0
 zero_values          | 0
 mean                 | 3.04
 variance             | 0.13903448275862
 confidence_interval  | {2.90656901047539,3.17343098952461}
 min                  | 2.3
 max                  | 3.9
 quantile_array       | {2.9,3.2}
 most_frequent_values | {2.9,3,3.2}
 mfv_frequencies      | {4,4,3}
 </pre>

 @anchor notes
 @par Notes
 - Table names can be optionally schema qualified (current_schemas() would be
 searched if a schema name is not provided) and table and column names
 should follow case-sensitivity and quoting rules per the database.
 (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'.
 If mixed-case or multi-byte characters are desired for entity names then the
 string should be double-quoted; in this case the input would be '"MyTable"').
 - The <em>get_estimates</em> parameter controls computation for both distinct
 count and most frequent values:
     -  If <em>get_estimates</em> is TRUE then the distinct value computation is
     estimated using Flajolet-Martin. MFV is computed using a
     fast method that does parallel aggregation in Greenplum Database at the expense
     of missing or duplicating some of the most frequent values.
     -  If <em>get_estimates</em> is FALSE then the distinct values are computed
     in a slower but exact method using PostgreSQL COUNT DISTINCT.  MFV is computed using a
     faithful implementation that preserves the approximation guarantees of
     the Cormode/Muthukrishnan method (more information at \ref grp_mfvsketch).


 @anchor related
 @par Related Topics
 File summary.sql_in documenting the \b summary() function

 \ref grp_fmsketch <br/>
 \ref grp_mfvsketch <br/>
 \ref grp_countmin

 */

 DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result CASCADE;
 CREATE TYPE MADLIB_SCHEMA.summary_result AS
 (
     output_table             TEXT,
     num_col_summarized       INT4,
     duration                 FLOAT8
 );


 -----------------------------------------------------------------------
 -- Main function for summary
 -----------------------------------------------------------------------
 /*
  * @brief Compute a summary statistics on a table with optional grouping support
  *
  * @param source_table      Name of source relation containing the data
  * @param output_table      Name of output table name to store the summary
  * @param target_cols       String with comma separated list of columns on which summary is desired
  * @param grouping_cols     String with comma separated list of columns on which to group the data by
  * @param get_distinct      Should distinct values count be included in result
  * @param get_quartiles     Should first, second (median), and third quartiles be included in result
  * @param ntile_array       Array of percentiles to compute
  * @param how_many_mfv      How many most frequent values to compute?
  * @param get_estimates     Should distinct counts be an estimated (faster) or exact count?
  * @param n_cols_per_run    Number of columns to use per run of summary
  *
  * @usage
  *
  * <pre> SELECT MADLIB_SCHEMA.summary (
  *       '<em>source_table</em>', '<em>output_table</em>',
  *       '<em>target_cols</em>', '<em>grouping_cols</em>',
  *       '<em>get_distinct</em>', '<em>get_quartiles</em>',
  *       '<em>ntile_array</em>', '<em>how_many_mfv</em>',
  *       '<em>get_estimates</em>', '<em>n_cols_per_run</em>'
  *   );
  *   SELECT * FROM '<em>output_table</em>'
  *  </pre>
  */
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,       -- source table name
     output_table            TEXT,       -- output table name
     target_cols             TEXT,       -- comma separated list of output cols
     grouping_cols           TEXT,       -- comma separated names of grouping cols
     get_distinct            BOOLEAN,    -- Are distinct values required
     get_quartiles           BOOLEAN,    -- Are quartiles required
     ntile_array             FLOAT8[],   -- Array of quantiles to compute
     how_many_mfv            INTEGER,    -- How many most frequent values to compute?
     get_estimates           BOOLEAN,    -- Should we produce exact or estimated
                                         --      values for distinct computation
     n_cols_per_run          INTEGER     -- Number of columns to use per run of summary
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     PythonFunctionBodyOnly(`summary', `summary')
     with AOControl(False):
         return summary.summary(
             schema_madlib, source_table, output_table, target_cols, grouping_cols,
             get_distinct, get_quartiles, ntile_array, how_many_mfv,
             get_estimates, n_cols_per_run)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 --- Overloaded functions to support optional parameters
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN,
     ntile_array             FLOAT8[],
     how_many_mfv            INTEGER,
     get_estimates           BOOLEAN
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, $7, $8, $9, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN,
     ntile_array             FLOAT8[],
     how_many_mfv            INTEGER
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, $7, $8, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN,
     ntile_array             FLOAT8[]
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, $7, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, True, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, NULL, True, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, NULL, NULL, True, True, NULL, 10, True, 15)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 -- Help functions
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary(
     input_message            TEXT
 )
 RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`summary', `summary')
     with AOControl(False):
         return summary.summary_help_message(schema_madlib, input_message)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');


 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary()
 RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`summary', `summary')
     with AOControl(False):
         return summary.summary_help_message(schema_madlib, None)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');