src/ports/postgres/modules/summary/summary.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file summary.sql_in
  *
  * @brief Summary function for descriptive statistics
  * @date Mar 2013
  *
  *//* ------------------------------------------------------------------------*/

 m4_include(`SQLCommon.m4')

 /**

 @addtogroup grp_summary

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#usage">Summary Function Syntax</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#notes">Notes</a></li>
 <li><a href="#related">Related Topics</a></li>
 </ul>
 </div>

 @brief Calculates general descriptive statistics for any data table.


 The MADlib \b summary() function is a generic function that produces summary
 statistics from any data table. The function invokes various methods from
 the MADlib library to  provide an overview of the data.

 @anchor usage
 @par Summary Function Syntax
 The \b summary() function has the following syntax:

 <pre class="syntax">
 summary ( source_table,
           output_table,
           target_cols,
           grouping_cols,
           get_distinct,
           get_quartiles,
           ntile_array,
           how_many_mfv,
           get_estimates
         )
 </pre>
 \b Arguments
 <DL class="arglist">
 <dt>source_table</dt>
 <dd>TEXT. The name of the table containing the input data.</dd>
 <dt>output_table</dt>
 <dd>TEXT. The name of the table to contain the output summary data.

 The \b summary() function returns a composite type containing three fields:
 <table class="output">
     <tr>
         <th>output_table</th>
         <td>TEXT. The name of the output table.</td>
     </tr>
     <tr>
         <th>row_count</th>
         <td>INTEGER. The number of rows in the output table.</td>
     </tr>
     <tr>
         <th>duration</th>
         <td>FLOAT8. The time taken (in seconds) to compute the summary.</td>
     </tr>
 </table>


 Summary statistics are saved in a table with the name specifed in the
 <em>output_table</em> argument. The table can contain the
 following columns:
 <table class="output">
     <tr>
         <th>group_by</th>
         <td>Group-by column name. NULL if none provided.</td>
     </tr>
     <tr>
         <th>group_by_value</th>
         <td>Value of the Group-by column. NULL if there is no grouping.</td>
     </tr>
     <tr>
         <th>target_column</th>
         <td>Targeted column values for which summary is requested.</td>
     </tr>
     <tr>
         <th>column_number</th>
         <td>Physical column number for the target column, as described in \e pg_attribute.</td>
     </tr>
     <tr>
         <th>data_type</th>
         <td>Data type of the target column. Standard GPDB type descriptors are displayed.</td>
     </tr>
     <tr>
         <th>row_count</th>
         <td>Number of rows for the target column.</td>
     </tr>
     <tr>
         <th>distinct_values</th>
         <td>Number of distinct values in the target column. When the summary() function is called with the <em>get_estimates</em> argument set to TRUE, this is an estimated statistic based on the Flajolet-Martin distinct count estimator.</td>
     </tr>
     <tr>
         <th>missing_values</th>
         <td>Number of missing values in the target column.</td>
     </tr>
     <tr>
         <th>blank_values</th>
         <td>Number of blank values. Blanks are defined by this regular expression: \verbatim '^\w*$'\endverbatim</td>
     </tr>
     <tr>
         <th>fraction_missing</th>
         <td>Percentage of total rows that are missing, as a decimal value, e.g. 0.3.</td>
     </tr>
     <tr>
         <th>fraction_blank</th>
         <td>Percentage of total rows that are blank, as a decimal value, e.g. 0.3.</td>
     </tr>
     <tr>
         <th>mean</th>
         <td>Mean value of target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
         <th>variance</th>
         <td>Variance of target column if target is numeric, otherwise NULL.</td>
     </tr>
     <tr>
         <th>min</th>
         <td>Minimum value of target column. For strings this is the length of the shortest string.</td>
     </tr>
     <tr>
         <th>max</th>
         <td>Maximum value of target column. For strings this is the length of the longest string.</td>
     </tr>
     <tr>
         <th>first_quartile</th>
         <td>First quartile (25th percentile), only for numeric columns. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
     </tr>
     <tr>
         <th>median</th>
         <td>Median value of target column, if target is numeric, otherwise NULL. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
     </tr>
     <tr>
         <th>third_quartile</th>
         <td>Third quartile (25th percentile), only for numeric columns. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
     </tr>
     <tr>
         <th>quantile_array</th>
         <td>Percentile values corresponding to \e ntile_array. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
     </tr>
     <tr>
         <th>most_frequent_values</th>
         <td>An array containing the most frequently occurring values. The \e
             how_many_mfv argument determines the length of the array, 10 by
             default. If the summary() function is called with the \e
             get_estimates argument set to TRUE (default), the frequent values
             computation is performed using a parallel aggregation method that is
             faster, but in some cases can fail to detect the exact most frequent
             values.</td>
     </tr>
     <tr>
         <th>mfv_frequencies</th>
         <td>Array containing the frequency count for each of the most frequent values. </td>
     </tr>
 </table></dd>
 <dt>target_columns (optional)</dt>
 <dd>TEXT, default NULL. A comma-separated list of columns to summarize. If NULL, summaries are produced for all columns.</dd>
 <dt>grouping_cols (optional)</dt>
 <dd>TEXT, default: null. A comma-separated list of columns on which to group results. If NULL, summaries are produced on the complete table.</dd>
 <dt>get_distinct (optional)</dt>
 <dd>BOOLEAN, default TRUE. If true, distinct values are counted.</dd>
 <dt>get_quartiles (optional)</dt>
 <dd>BOOLEAN, default TRUE. If TRUE, qartiles are computed.</dd>
 <dt>ntile_array (optional)</dt>
 <dd>FLOAT8[], default NULL. An array of quantile values to compute. If NULL, quantile values are not computed.</dd>
 <dt>how_many_mfv (optional)</dt>
 <dd>INTEGER, default: 10. The number of most-frequent-values to compute.</dd>
 <dt>get_estimates (optional)</dt>
 <dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced. If FALSE, exact values are calculated.</dd>
 </DL>


 @anchor examples
 @examp

 -# View online help for the summary() function.
 <pre class="example">
 SELECT * FROM madlib.summary();
 </pre>

 -# Create an input data set.
 <pre class="example">
 CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
              size INT, lot INT);
 COPY houses FROM STDIN WITH DELIMITER '|';
   1 |  590 |       2 |    1 |  50000 |  770 | 22100
   2 | 1050 |       3 |    2 |  85000 | 1410 | 12000
   3 |   20 |       3 |    1 |  22500 | 1060 |  3500
   4 |  870 |       2 |    2 |  90000 | 1300 | 17500
   5 | 1320 |       3 |    2 | 133000 | 1500 | 30000
   6 | 1350 |       2 |    1 |  90500 |  820 | 25700
   7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000
   8 |  680 |       2 |    1 | 142500 | 1170 | 22000
   9 | 1840 |       3 |    2 | 160000 | 1500 | 19000
  10 | 3680 |       4 |    2 | 240000 | 2790 | 20000
  11 | 1660 |       3 |    1 |  87000 | 1030 | 17500
  12 | 1620 |       3 |    2 | 118600 | 1250 | 20000
  13 | 3100 |       3 |    2 | 140000 | 1760 | 38000
  14 | 2070 |       2 |    3 | 148000 | 1550 | 14000
  15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000
 \\.
 </pre>

 -# Run the \b summary() function.
 <pre class="example">
 SELECT * FROM madlib.summary( 'houses',
                               'houses_summary',
                               'tax,bedroom,lot,bath,price,size,lot',
                               'bedroom',
                               TRUE,
                               TRUE,
                               NULL,
                               5,
                               FALSE
                             );
 </pre>
 Result:
 <pre class="result">
   output_table  | row_count |    duration
 ----------------+-----------+----------------
  houses_summary |        21 | 0.207587003708
 (1 row)
 </pre>

 -# View the summary data.
 <pre class=example>
 -- Turn on expanded display for readability.
 \\x on
 SELECT * FROM houses_summary;
 </pre>
 Result:
 <pre class="result">
 &nbsp;-[ RECORD 1 ]--------+-----------------------------------
  group_by             | bedroom
  group_by_value       | 3
  target_column        | tax
  column_number        | 2
  data_type            | int4
  row_count            | 9
  distinct_values      | 9
  missing_values       | 0
  blank_values         |
  fraction_missing     | 0
  fraction_blank       |
  mean                 | 1561.11111111111
  variance             | 936736.111111111
  min                  | 20
  max                  | 3100
  most_frequent_values | {20,1320,2790,1840,1660}
  mfv_frequencies      | {1,1,1,1,1}
 &nbsp;-[ RECORD 2 ]--------+-----------------------------------
  group_by             | bedroom
  group_by_value       | 3
  target_column        | bath
  column_number        | 4
  ...
 </pre>

 @anchor notes
 @par Notes
 - Table names can be optionally schema qualified (current_schemas() would be
 searched if a schema name is not provided) and table and column names
 should follow case-sensitivity and quoting rules per the database.
 (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'.
 If mixed-case or multi-byte characters are desired for entity names then the
 string should be double-quoted; in this case the input would be '"MyTable"').
 - Estimated values are only implemented for the distinct values computation.
 - The <em>get_estimates</em> parameter controls computation for two statistics:
     -  If <em>get_estimates</em> is TRUE then the distinct value computation is
     estimated. Further, the most frequent values computation is computed using a
     "quick and dirty" method that does parallel aggregation in Greenplum Database at the expense
     of missing some of the most frequent values.
     -  If <em>get_estimates</em> is FALSE then the distinct values are computed
     in a slow but exact method. The most frequent values are computed using a
     faithful implementation that preserves the approximation guarantees of
     the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch).


 @anchor related
 @par Related Topics
 File summary.sql_in documenting the \b summary() function

 \ref grp_mfvsketch

 */

 DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result CASCADE;
 CREATE TYPE MADLIB_SCHEMA.summary_result AS
 (
     output_table    TEXT,
     row_count       INT4,
     duration        FLOAT8
 );


 -----------------------------------------------------------------------
 -- Main function for summary
 -----------------------------------------------------------------------
 /*
  * @brief Compute a summary statistics on a table with optional grouping support
  *
  * @param source_table      Name of source relation containing the data
  * @param output_table      Name of output table name to store the summary
  * @param target_cols       String with comma separated list of columns on which summary is desired
  * @param grouping_cols     String with comma separated list of columns on which to group the data by
  * @param get_distinct      Should distinct values count be included in result
  * @param get_quartiles     Should first, second (median), and third quartiles be included in result
  * @param ntile_array       Array of percentiles to compute
  * @param how_many_mfv      How many most frequent values to compute?
  * @param get_estimates     Should distinct counts be an estimated (faster) or exact count?
  *
  * @usage
  *
  * <pre> SELECT MADLIB_SCHEMA.summary (
  *       '<em>source_table</em>', '<em>output_table</em>',
  *       '<em>target_cols</em>', '<em>grouping_cols</em>',
  *       '<em>get_distinct</em>', '<em>get_quartiles</em>',
  *       '<em>ntile_array</em>', '<em>how_many_mfv</em>',
  *       '<em>get_estimates</em>'
  *   );
  *   SELECT * FROM '<em>output_table</em>'
  *  </pre>
  */
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,       -- source table name
     output_table            TEXT,       -- output table name
     target_cols             TEXT,       -- comma separated list of output cols
     grouping_cols           TEXT,       -- comma separated names of grouping cols
     get_distinct            BOOLEAN,    -- Are distinct values required
     get_quartiles           BOOLEAN,    -- Are quartiles required
     ntile_array             FLOAT8[],   -- Array of quantiles to compute
     how_many_mfv            INTEGER,    -- How many most frequent values to compute?
     get_estimates           BOOLEAN     -- Should we produce exact or estimated
                                         --      values for distinct computation
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     PythonFunctionBodyOnly(`summary', `summary')
     return summary.summary(
         schema_madlib, source_table, output_table, target_cols, grouping_cols,
         get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 --- Overloaded functions to support optional parameters
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN,
     ntile_array             FLOAT8[],
     how_many_mfv            INTEGER
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, $7, $8, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN,
     ntile_array             FLOAT8[]
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, $7, 10, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN,
     get_quartiles           BOOLEAN
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, $6, NULL, 10, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT,
     get_distinct            BOOLEAN
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, $5, True, NULL, 10, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT,
     grouping_cols           TEXT
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, $4, True, True, NULL, 10, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT,
     target_cols             TEXT
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, $3, NULL, True, True, NULL, 10, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.summary
 (
     source_table            TEXT,
     output_table            TEXT
 )
 RETURNS MADLIB_SCHEMA.summary_result AS $$
     SELECT MADLIB_SCHEMA.summary(
         $1, $2, NULL, NULL, True, True, NULL, 10, True)
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 -----------------------------------------------------------------------
 -- Help functions
 -----------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary(
     input_message            TEXT
 )
 RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`summary', `summary')
     return summary.summary_help_message(schema_madlib, input_message)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');


 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary()
 RETURNS TEXT AS $$
 PythonFunctionBodyOnly(`summary', `summary')
     return summary.summary_help_message(schema_madlib, None)
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
	/* ----------------------------------------------------------------------- //*
	*
	* @file summary.sql_in
	*
	* @brief Summary function for descriptive statistics
	* @date Mar 2013
	*
	// ------------------------------------------------------------------------*/

	m4_include(`SQLCommon.m4')

	/**

	@addtogroup grp_summary

	<div class="toc"><b>Contents</b>
	<ul>
	<li><a href="#usage">Summary Function Syntax</a></li>
	<li><a href="#examples">Examples</a></li>
	<li><a href="#notes">Notes</a></li>
	<li><a href="#related">Related Topics</a></li>
	</ul>
	</div>

	@brief Calculates general descriptive statistics for any data table.


	The MADlib \b summary() function is a generic function that produces summary
	statistics from any data table. The function invokes various methods from
	the MADlib library to provide an overview of the data.

	@anchor usage
	@par Summary Function Syntax
	The \b summary() function has the following syntax:

	<pre class="syntax">
	summary ( source_table,
	output_table,
	target_cols,
	grouping_cols,
	get_distinct,
	get_quartiles,
	ntile_array,
	how_many_mfv,
	get_estimates
	)
	</pre>
	\b Arguments
	<DL class="arglist">
	<dt>source_table</dt>
	<dd>TEXT. The name of the table containing the input data.</dd>
	<dt>output_table</dt>
	<dd>TEXT. The name of the table to contain the output summary data.

	The \b summary() function returns a composite type containing three fields:
	<table class="output">
	<tr>
	<th>output_table</th>
	<td>TEXT. The name of the output table.</td>
	</tr>
	<tr>
	<th>row_count</th>
	<td>INTEGER. The number of rows in the output table.</td>
	</tr>
	<tr>
	<th>duration</th>
	<td>FLOAT8. The time taken (in seconds) to compute the summary.</td>
	</tr>
	</table>


	Summary statistics are saved in a table with the name specifed in the
	<em>output_table</em> argument. The table can contain the
	following columns:
	<table class="output">
	<tr>
	<th>group_by</th>
	<td>Group-by column name. NULL if none provided.</td>
	</tr>
	<tr>
	<th>group_by_value</th>
	<td>Value of the Group-by column. NULL if there is no grouping.</td>
	</tr>
	<tr>
	<th>target_column</th>
	<td>Targeted column values for which summary is requested.</td>
	</tr>
	<tr>
	<th>column_number</th>
	<td>Physical column number for the target column, as described in \e pg_attribute.</td>
	</tr>
	<tr>
	<th>data_type</th>
	<td>Data type of the target column. Standard GPDB type descriptors are displayed.</td>
	</tr>
	<tr>
	<th>row_count</th>
	<td>Number of rows for the target column.</td>
	</tr>
	<tr>
	<th>distinct_values</th>
	<td>Number of distinct values in the target column. When the summary() function is called with the <em>get_estimates</em> argument set to TRUE, this is an estimated statistic based on the Flajolet-Martin distinct count estimator.</td>
	</tr>
	<tr>
	<th>missing_values</th>
	<td>Number of missing values in the target column.</td>
	</tr>
	<tr>
	<th>blank_values</th>
	<td>Number of blank values. Blanks are defined by this regular expression: \verbatim '^\w*$'\endverbatim</td>
	</tr>
	<tr>
	<th>fraction_missing</th>
	<td>Percentage of total rows that are missing, as a decimal value, e.g. 0.3.</td>
	</tr>
	<tr>
	<th>fraction_blank</th>
	<td>Percentage of total rows that are blank, as a decimal value, e.g. 0.3.</td>
	</tr>
	<tr>
	<th>mean</th>
	<td>Mean value of target column if target is numeric, otherwise NULL.</td>
	</tr>
	<tr>
	<th>variance</th>
	<td>Variance of target column if target is numeric, otherwise NULL.</td>
	</tr>
	<tr>
	<th>min</th>
	<td>Minimum value of target column. For strings this is the length of the shortest string.</td>
	</tr>
	<tr>
	<th>max</th>
	<td>Maximum value of target column. For strings this is the length of the longest string.</td>
	</tr>
	<tr>
	<th>first_quartile</th>
	<td>First quartile (25th percentile), only for numeric columns. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
	</tr>
	<tr>
	<th>median</th>
	<td>Median value of target column, if target is numeric, otherwise NULL. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
	</tr>
	<tr>
	<th>third_quartile</th>
	<td>Third quartile (25th percentile), only for numeric columns. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
	</tr>
	<tr>
	<th>quantile_array</th>
	<td>Percentile values corresponding to \e ntile_array. <b>Currently unavailable for PostgreSQL 9.3 or lower</b>.</td>
	</tr>
	<tr>
	<th>most_frequent_values</th>
	<td>An array containing the most frequently occurring values. The \e
	how_many_mfv argument determines the length of the array, 10 by
	default. If the summary() function is called with the \e
	get_estimates argument set to TRUE (default), the frequent values
	computation is performed using a parallel aggregation method that is
	faster, but in some cases can fail to detect the exact most frequent
	values.</td>
	</tr>
	<tr>
	<th>mfv_frequencies</th>
	<td>Array containing the frequency count for each of the most frequent values. </td>
	</tr>
	</table></dd>
	<dt>target_columns (optional)</dt>
	<dd>TEXT, default NULL. A comma-separated list of columns to summarize. If NULL, summaries are produced for all columns.</dd>
	<dt>grouping_cols (optional)</dt>
	<dd>TEXT, default: null. A comma-separated list of columns on which to group results. If NULL, summaries are produced on the complete table.</dd>
	<dt>get_distinct (optional)</dt>
	<dd>BOOLEAN, default TRUE. If true, distinct values are counted.</dd>
	<dt>get_quartiles (optional)</dt>
	<dd>BOOLEAN, default TRUE. If TRUE, qartiles are computed.</dd>
	<dt>ntile_array (optional)</dt>
	<dd>FLOAT8[], default NULL. An array of quantile values to compute. If NULL, quantile values are not computed.</dd>
	<dt>how_many_mfv (optional)</dt>
	<dd>INTEGER, default: 10. The number of most-frequent-values to compute.</dd>
	<dt>get_estimates (optional)</dt>
	<dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced. If FALSE, exact values are calculated.</dd>
	</DL>


	@anchor examples
	@examp

	-# View online help for the summary() function.
	<pre class="example">
	SELECT * FROM madlib.summary();
	</pre>

	-# Create an input data set.
	<pre class="example">
	CREATE TABLE houses (id INT, tax INT, bedroom INT, bath FLOAT, price INT,
	size INT, lot INT);
	COPY houses FROM STDIN WITH DELIMITER '\|';
	1 \| 590 \| 2 \| 1 \| 50000 \| 770 \| 22100
	2 \| 1050 \| 3 \| 2 \| 85000 \| 1410 \| 12000
	3 \| 20 \| 3 \| 1 \| 22500 \| 1060 \| 3500
	4 \| 870 \| 2 \| 2 \| 90000 \| 1300 \| 17500
	5 \| 1320 \| 3 \| 2 \| 133000 \| 1500 \| 30000
	6 \| 1350 \| 2 \| 1 \| 90500 \| 820 \| 25700
	7 \| 2790 \| 3 \| 2.5 \| 260000 \| 2130 \| 25000
	8 \| 680 \| 2 \| 1 \| 142500 \| 1170 \| 22000
	9 \| 1840 \| 3 \| 2 \| 160000 \| 1500 \| 19000
	10 \| 3680 \| 4 \| 2 \| 240000 \| 2790 \| 20000
	11 \| 1660 \| 3 \| 1 \| 87000 \| 1030 \| 17500
	12 \| 1620 \| 3 \| 2 \| 118600 \| 1250 \| 20000
	13 \| 3100 \| 3 \| 2 \| 140000 \| 1760 \| 38000
	14 \| 2070 \| 2 \| 3 \| 148000 \| 1550 \| 14000
	15 \| 650 \| 3 \| 1.5 \| 65000 \| 1450 \| 12000
	\\.
	</pre>

	-# Run the \b summary() function.
	<pre class="example">
	SELECT * FROM madlib.summary( 'houses',
	'houses_summary',
	'tax,bedroom,lot,bath,price,size,lot',
	'bedroom',
	TRUE,
	TRUE,
	NULL,
	5,
	FALSE
	);
	</pre>
	Result:
	<pre class="result">
	output_table \| row_count \| duration
	----------------+-----------+----------------
	houses_summary \| 21 \| 0.207587003708
	(1 row)
	</pre>

	-# View the summary data.
	<pre class=example>
	-- Turn on expanded display for readability.
	\\x on
	SELECT * FROM houses_summary;
	</pre>
	Result:
	<pre class="result">
	-[ RECORD 1 ]--------+-----------------------------------
	group_by \| bedroom
	group_by_value \| 3
	target_column \| tax
	column_number \| 2
	data_type \| int4
	row_count \| 9
	distinct_values \| 9
	missing_values \| 0
	blank_values \|
	fraction_missing \| 0
	fraction_blank \|
	mean \| 1561.11111111111
	variance \| 936736.111111111
	min \| 20
	max \| 3100
	most_frequent_values \| {20,1320,2790,1840,1660}
	mfv_frequencies \| {1,1,1,1,1}
	-[ RECORD 2 ]--------+-----------------------------------
	group_by \| bedroom
	group_by_value \| 3
	target_column \| bath
	column_number \| 4
	...
	</pre>

	@anchor notes
	@par Notes
	- Table names can be optionally schema qualified (current_schemas() would be
	searched if a schema name is not provided) and table and column names
	should follow case-sensitivity and quoting rules per the database.
	(For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'.
	If mixed-case or multi-byte characters are desired for entity names then the
	string should be double-quoted; in this case the input would be '"MyTable"').
	- Estimated values are only implemented for the distinct values computation.
	- The <em>get_estimates</em> parameter controls computation for two statistics:
	- If <em>get_estimates</em> is TRUE then the distinct value computation is
	estimated. Further, the most frequent values computation is computed using a
	"quick and dirty" method that does parallel aggregation in Greenplum Database at the expense
	of missing some of the most frequent values.
	- If <em>get_estimates</em> is FALSE then the distinct values are computed
	in a slow but exact method. The most frequent values are computed using a
	faithful implementation that preserves the approximation guarantees of
	the Cormode/Muthukrishnan method (more information in \ref grp_mfvsketch).


	@anchor related
	@par Related Topics
	File summary.sql_in documenting the \b summary() function

	\ref grp_mfvsketch

	*/

	DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result CASCADE;
	CREATE TYPE MADLIB_SCHEMA.summary_result AS
	(
	output_table TEXT,
	row_count INT4,
	duration FLOAT8
	);


	-----------------------------------------------------------------------
	-- Main function for summary
	-----------------------------------------------------------------------
	/*
	* @brief Compute a summary statistics on a table with optional grouping support
	*
	* @param source_table Name of source relation containing the data
	* @param output_table Name of output table name to store the summary
	* @param target_cols String with comma separated list of columns on which summary is desired
	* @param grouping_cols String with comma separated list of columns on which to group the data by
	* @param get_distinct Should distinct values count be included in result
	* @param get_quartiles Should first, second (median), and third quartiles be included in result
	* @param ntile_array Array of percentiles to compute
	* @param how_many_mfv How many most frequent values to compute?
	* @param get_estimates Should distinct counts be an estimated (faster) or exact count?
	*
	* @usage
	*
	* <pre> SELECT MADLIB_SCHEMA.summary (
	* '<em>source_table</em>', '<em>output_table</em>',
	* '<em>target_cols</em>', '<em>grouping_cols</em>',
	* '<em>get_distinct</em>', '<em>get_quartiles</em>',
	* '<em>ntile_array</em>', '<em>how_many_mfv</em>',
	* '<em>get_estimates</em>'
	* );
	* SELECT * FROM '<em>output_table</em>'
	* </pre>
	*/
	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT, -- source table name
	output_table TEXT, -- output table name
	target_cols TEXT, -- comma separated list of output cols
	grouping_cols TEXT, -- comma separated names of grouping cols
	get_distinct BOOLEAN, -- Are distinct values required
	get_quartiles BOOLEAN, -- Are quartiles required
	ntile_array FLOAT8[], -- Array of quantiles to compute
	how_many_mfv INTEGER, -- How many most frequent values to compute?
	get_estimates BOOLEAN -- Should we produce exact or estimated
	-- values for distinct computation
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	PythonFunctionBodyOnly(`summary', `summary')
	return summary.summary(
	schema_madlib, source_table, output_table, target_cols, grouping_cols,
	get_distinct, get_quartiles, ntile_array, how_many_mfv, get_estimates)
	$$ LANGUAGE plpythonu VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	-----------------------------------------------------------------------
	--- Overloaded functions to support optional parameters
	-----------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT,
	target_cols TEXT,
	grouping_cols TEXT,
	get_distinct BOOLEAN,
	get_quartiles BOOLEAN,
	ntile_array FLOAT8[],
	how_many_mfv INTEGER
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, $3, $4, $5, $6, $7, $8, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT,
	target_cols TEXT,
	grouping_cols TEXT,
	get_distinct BOOLEAN,
	get_quartiles BOOLEAN,
	ntile_array FLOAT8[]
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, $3, $4, $5, $6, $7, 10, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT,
	target_cols TEXT,
	grouping_cols TEXT,
	get_distinct BOOLEAN,
	get_quartiles BOOLEAN
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, $3, $4, $5, $6, NULL, 10, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT,
	target_cols TEXT,
	grouping_cols TEXT,
	get_distinct BOOLEAN
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, $3, $4, $5, True, NULL, 10, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT,
	target_cols TEXT,
	grouping_cols TEXT
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, $3, $4, True, True, NULL, 10, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT,
	target_cols TEXT
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, $3, NULL, True, True, NULL, 10, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	CREATE OR REPLACE FUNCTION
	MADLIB_SCHEMA.summary
	(
	source_table TEXT,
	output_table TEXT
	)
	RETURNS MADLIB_SCHEMA.summary_result AS $$
	SELECT MADLIB_SCHEMA.summary(
	$1, $2, NULL, NULL, True, True, NULL, 10, True)
	$$ LANGUAGE sql VOLATILE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	-----------------------------------------------------------------------
	-- Help functions
	-----------------------------------------------------------------------
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary(
	input_message TEXT
	)
	RETURNS TEXT AS $$
	PythonFunctionBodyOnly(`summary', `summary')
	return summary.summary_help_message(schema_madlib, input_message)
	$$ LANGUAGE plpythonu IMMUTABLE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');


	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary()
	RETURNS TEXT AS $$
	PythonFunctionBodyOnly(`summary', `summary')
	return summary.summary_help_message(schema_madlib, None)
	$$ LANGUAGE plpythonu IMMUTABLE
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');