| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file summary.sql_in |
| * |
| * @brief Summary function for descriptive statistics |
| * @date Mar 2013 |
| * |
| *//* ------------------------------------------------------------------------*/ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| |
| @addtogroup grp_summary |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#usage">Summary Function Syntax</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#notes">Notes</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Calculates general descriptive statistics for any data table. |
| |
| |
| The MADlib \b summary() function produces summary |
| statistics for any data table. The function invokes various methods from |
| the MADlib library to provide the data overview. |
| |
| @anchor usage |
| @par Summary Function Syntax |
| The \b summary() function has the following syntax: |
| |
| <pre class="syntax"> |
| summary ( source_table, |
| output_table, |
| target_cols, |
| grouping_cols, |
| get_distinct, |
| get_quartiles, |
| ntile_array, |
| how_many_mfv, |
| get_estimates, |
| n_cols_per_run |
| ) |
| </pre> |
| The \b summary() function returns a composite type containing three fields: |
| <table class="output"> |
| <tr> |
| <th>output_table</th> |
| <td>TEXT. The name of the output table.</td> |
| </tr> |
| <tr> |
| <th>num_col_summarized</th> |
| <td>INTEGER. The number of columns from the source table that have been summarized.</td> |
| </tr> |
| <tr> |
| <th>duration</th> |
| <td>FLOAT8. The time taken (in seconds) to compute the summary.</td> |
| </tr> |
| </table> |
| \b Arguments |
| <DL class="arglist"> |
| <dt>source_table</dt> |
| <dd>TEXT. Name of the table containing the input data.</dd> |
| |
| <dt>output_table</dt> |
| <dd>TEXT. Name of the table for the output summary statistics. |
| This table contains the following columns: |
| <table class="output"> |
| <tr> |
| <th>group_by</th> |
| <td>Group-by column name. NULL if none provided.</td> |
| </tr> |
| <tr> |
| <th>group_by_value</th> |
| <td>Value of the group-by column. NULL if there is no grouping.</td> |
| </tr> |
| <tr> |
| <th>target_column</th> |
| <td>Targeted column values for which summary is requested.</td> |
| </tr> |
| <tr> |
| <th>column_number</th> |
| <td>Physical column number for the target column, as described in \e pg_attribute</td> catalog. |
| </tr> |
| <tr> |
| <th>data_type</th> |
| <td>Data type of the target column. Standard GPDB type descriptors are displayed.</td> |
| </tr> |
| <tr> |
| <th>row_count</th> |
| <td>Number of rows for the target column.</td> |
| </tr> |
| <tr> |
| <th>distinct_values</th> |
| <td>Number of distinct values in the target column. |
| If the summary() function is called with the <em>get_estimates</em> |
| argument set to TRUE (default), then this is an estimated statistic based on the |
| Flajolet-Martin distinct count estimator. If the <em>get_estimates</em> |
| argument set to FALSE, will use PostgreSQL COUNT DISTINCT.</td> |
| </tr> |
| <tr> |
| <th>missing_values</th> |
| <td>Number of missing values in the target column.</td> |
| </tr> |
| <tr> |
| <th>blank_values</th> |
| <td>Number of blank values. Blanks are defined by this regular expression: \verbatim '^\w*$'\endverbatim</td> |
| </tr> |
| <tr> |
| <th>fraction_missing</th> |
| <td>Percentage of total rows that are missing, as a decimal value, e.g. 0.3.</td> |
| </tr> |
| <tr> |
| <th>fraction_blank</th> |
| <td>Percentage of total rows that are blank, as a decimal value, e.g. 0.3.</td> |
| </tr> |
| <tr> |
| <th>positive_values</th> |
| <td>Number of positive values in the target column if target is numeric, otherwise NULL.</td> |
| </tr> |
| <tr> |
| <th>negative_values</th> |
| <td>Number of negative values in the target column if target is numeric, otherwise NULL.</td> |
| </tr> |
| <tr> |
| <th>zero_values</th> |
| <td>Number of zero values in the target column if target is numeric, otherwise NULL. |
| Note that we are reporting exact equality to 0.0 here, so even if you have a |
| float value that is extremely small (say due to rounding), it will not be |
| reported as a zero value.</td> |
| </tr> |
| <tr> |
| <th>mean</th> |
| <td>Mean value of target column if target is numeric, otherwise NULL.</td> |
| </tr> |
| <tr> |
| <th>variance</th> |
| <td>Variance of target column if target is numeric, otherwise NULL.</td> |
| </tr> |
| <tr> |
| <th>confidence_interval</th> |
| <td>Confidence interval (95% using z-score) of the mean value for the target column |
| if target is numeric, otherwise NULL. |
| Presented as an array of two elements in the form {lower bound, upper bound}.</td> |
| </tr> |
| <tr> |
| <th>min</th> |
| <td>Minimum value of target column. For strings this is the length of the shortest string.</td> |
| </tr> |
| <tr> |
| <th>max</th> |
| <td>Maximum value of target column. For strings this is the length of the longest string.</td> |
| </tr> |
| <tr> |
| <th>first_quartile</th> |
| <td>First quartile (25th percentile), only for numeric columns. |
| (Unavailable for PostgreSQL 9.3 or lower.)</td> |
| </tr> |
| <tr> |
| <th>median</th> |
| <td>Median value of target column, if target is numeric, otherwise NULL. |
| (Unavailable for PostgreSQL 9.3 or lower.)</td> |
| </tr> |
| <tr> |
| <th>third_quartile</th> |
| <td>Third quartile (25th percentile), only for numeric columns. |
| (Unavailable for PostgreSQL 9.3 or lower.)</td> |
| </tr> |
| <tr> |
| <th>quantile_array</th> |
| <td>Percentile values corresponding to \e ntile_array. |
| (Unavailable for PostgreSQL 9.3 or lower.)</td> |
| </tr> |
| <tr> |
| <th>most_frequent_values</th> |
| <td>An array containing the most frequently occurring values. The \e |
| how_many_mfv argument determines the length of the array, which is 10 by |
| default. If the summary() function is called with the \e |
| get_estimates argument set to TRUE (default), the frequent values |
| computation is performed using a parallel aggregation method that is |
| faster, but in some cases may fail to detect the exact most frequent |
| values.</td> |
| </tr> |
| <tr> |
| <th>mfv_frequencies</th> |
| <td>Array containing the frequency count for each of the most frequent values. </td> |
| </tr> |
| </table></dd> |
| |
| <dt>target_columns (optional)</dt> |
| <dd>TEXT, default NULL. A comma-separated list of columns to summarize. |
| If NULL, summaries are produced for all columns.</dd> |
| |
| <dt>grouping_cols (optional)</dt> |
| <dd>TEXT, default: null. A comma-separated list of columns on which to |
| group results. If NULL, summaries are produced for the complete table.</dd> |
| @note Please note that summary statistics are calculated for each grouping |
| column independently. That is, grouping columns are not combined together |
| as in the regular PostgreSQL style GROUP BY directive. (This was done |
| to reduce long run time and huge output table size which would otherwise |
| result in the case of large input tables with a lot of grouping_cols and |
| target_cols specified.) |
| |
| <dt>get_distinct (optional)</dt> |
| <dd>BOOLEAN, default TRUE. If true, distinct values are counted. |
| The method for computing distinct values depends on the setting of |
| the 'get_estimates' parameter below.</dd> |
| |
| <dt>get_quartiles (optional)</dt> |
| <dd>BOOLEAN, default TRUE. If TRUE, quartiles are computed.</dd> |
| |
| <dt>ntile_array (optional)</dt> |
| <dd>FLOAT8[], default NULL. An array of quantile values to compute. |
| If NULL, quantile values are not computed.</dd> |
| @note Quartile and quantile functions are not available in PostgreSQL 9.3 or |
| lower. If you are using PostgreSQL 9.3 or lower, the output table will not |
| contain these values, even if you set 'get_quartiles' = TRUE or |
| provide an array of quantile values for the parameter 'ntile_array'. |
| |
| <dt>how_many_mfv (optional)</dt> |
| <dd>INTEGER, default: 10. The number of most-frequent-values to compute. |
| The method for computing MFV depends on the setting of |
| the 'get_estimates' parameter below.</dd> |
| |
| <dt>get_estimates (optional)</dt> |
| <dd>BOOLEAN, default TRUE. If TRUE, estimated values are produced for |
| distinct values and most frequent values. If FALSE, exact values are |
| calculated which will take longer to run, with the impact depending on |
| data size.</dd> |
| |
| <dt>n_cols_per_run (optional)</dt> |
| <dd>INTEGER, default: 15. The number of columns to collect summary statistics in |
| one pass of the data. |
| This parameter determines the number of passes through the data. For e.g., |
| with a total of 40 columns to summarize and 'n_cols_per_run = 15', there will be |
| 3 passes through the data, with each pass summarizing a maximum of 15 columns. |
| @note This parameter should be used with caution. Increasing this parameter could |
| decrease the total run time (if number of passes decreases), but will increase |
| the memory consumption during each run. Since PostgreSQL limits the memory available |
| for a single aggregate run, this increased memory consumption could result in an |
| out-of-memory termination error. |
| |
| </dd> |
| </DL> |
| |
| |
| @anchor examples |
| @examp |
| |
| -# View online help for the summary() function. |
| <pre class="example"> |
| SELECT * FROM madlib.summary(); |
| </pre> |
| |
| -# Create an input data table using part of the well known |
| iris data set. |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris; |
| CREATE TABLE iris (id INT, sepal_length FLOAT, sepal_width FLOAT, |
| petal_length FLOAT, petal_width FLOAT, |
| class_name text); |
| INSERT INTO iris VALUES |
| (1,5.1,3.5,1.4,0.2,'Iris-setosa'), |
| (2,4.9,3.0,1.4,0.2,'Iris-setosa'), |
| (3,4.7,3.2,1.3,0.2,'Iris-setosa'), |
| (4,4.6,3.1,1.5,0.2,'Iris-setosa'), |
| (5,5.0,3.6,1.4,0.2,'Iris-setosa'), |
| (6,5.4,3.9,1.7,0.4,'Iris-setosa'), |
| (7,4.6,3.4,1.4,0.3,'Iris-setosa'), |
| (8,5.0,3.4,1.5,0.2,'Iris-setosa'), |
| (9,4.4,2.9,1.4,0.2,'Iris-setosa'), |
| (10,4.9,3.1,1.5,0.1,'Iris-setosa'), |
| (11,7.0,3.2,4.7,1.4,'Iris-versicolor'), |
| (12,6.4,3.2,4.5,1.5,'Iris-versicolor'), |
| (13,6.9,3.1,4.9,1.5,'Iris-versicolor'), |
| (14,5.5,2.3,4.0,1.3,'Iris-versicolor'), |
| (15,6.5,2.8,4.6,1.5,'Iris-versicolor'), |
| (16,5.7,2.8,4.5,1.3,'Iris-versicolor'), |
| (17,6.3,3.3,4.7,1.6,'Iris-versicolor'), |
| (18,4.9,2.4,3.3,1.0,'Iris-versicolor'), |
| (19,6.6,2.9,4.6,1.3,'Iris-versicolor'), |
| (20,5.2,2.7,3.9,1.4,'Iris-versicolor'), |
| (21,6.3,3.3,6.0,2.5,'Iris-virginica'), |
| (22,5.8,2.7,5.1,1.9,'Iris-virginica'), |
| (23,7.1,3.0,5.9,2.1,'Iris-virginica'), |
| (24,6.3,2.9,5.6,1.8,'Iris-virginica'), |
| (25,6.5,3.0,5.8,2.2,'Iris-virginica'), |
| (26,7.6,3.0,6.6,2.1,'Iris-virginica'), |
| (27,4.9,2.5,4.5,1.7,'Iris-virginica'), |
| (28,7.3,2.9,6.3,1.8,'Iris-virginica'), |
| (29,6.7,2.5,5.8,1.8,'Iris-virginica'), |
| (30,7.2,3.6,6.1,2.5,'Iris-virginica'); |
| </pre> |
| |
| -# Run the \b summary() function using all defaults. |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_summary; |
| SELECT * FROM madlib.summary( 'iris', -- Source table |
| 'iris_summary' -- Output table |
| ); |
| </pre> |
| Result: |
| <pre class="result"> |
| output_table | num_col_summarized | duration |
| --------------+--------------------+------------------- |
| iris_summary | 6 | 0.574938058853149 |
| (1 row) |
| </pre> |
| View the summary data. |
| <pre class=example> |
| -- Turn on expanded display for readability. |
| \\x on |
| SELECT * FROM iris_summary; |
| </pre> |
| Result (partial): |
| <pre class="result"> |
| ... |
| -[ RECORD 2 ]--------+--------------------------------------------- |
| group_by | |
| group_by_value | |
| target_column | sepal_length |
| column_number | 2 |
| data_type | float8 |
| row_count | 30 |
| distinct_values | 22 |
| missing_values | 0 |
| blank_values | |
| fraction_missing | 0 |
| fraction_blank | |
| positive_values | 30 |
| negative_values | 0 |
| zero_values | 0 |
| mean | 5.84333333333333 |
| variance | 0.929436781609188 |
| confidence_interval | {5.49834423494374,6.18832243172292} |
| min | 4.4 |
| max | 7.6 |
| first_quartile | 4.925 |
| median | 5.75 |
| third_quartile | 6.575 |
| most_frequent_values | {4.9,6.3,5,6.5,4.6,7.2,5.5,5.7,7.3,6.7} |
| mfv_frequencies | {4,3,2,2,2,1,1,1,1,1} |
| ... |
| -[ RECORD 6 ]--------+--------------------------------------------- |
| group_by | |
| group_by_value | |
| target_column | class_name |
| column_number | 6 |
| data_type | text |
| row_count | 30 |
| distinct_values | 3 |
| missing_values | 0 |
| blank_values | 0 |
| fraction_missing | 0 |
| fraction_blank | 0 |
| positive_values | |
| negative_values | |
| zero_values | |
| mean | |
| variance | |
| confidence_interval | |
| min | 11 |
| max | 15 |
| first_quartile | |
| median | |
| third_quartile | |
| most_frequent_values | {Iris-setosa,Iris-versicolor,Iris-virginica} |
| mfv_frequencies | {10,10,10} |
| </pre> |
| Note that for the text column in record 6, some statistics are n/a, |
| and the min and max values represent the length of the shortest and |
| longest strings respectively. |
| |
| -# Now group by the class of iris: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_summary; |
| SELECT * FROM madlib.summary( 'iris', -- Source table |
| 'iris_summary', -- Output table |
| 'sepal_length, sepal_width', -- Columns to summarize |
| 'class_name' -- Grouping column |
| ); |
| SELECT * FROM iris_summary; |
| </pre> |
| Result (partial): |
| <pre class="result"> |
| -[ RECORD 1 ]--------+---------------------------------------- |
| group_by | class_name |
| group_by_value | Iris-setosa |
| target_column | sepal_length |
| column_number | 2 |
| data_type | float8 |
| row_count | 10 |
| distinct_values | 7 |
| missing_values | 0 |
| blank_values | |
| fraction_missing | 0 |
| fraction_blank | |
| positive_values | 10 |
| negative_values | 0 |
| zero_values | 0 |
| mean | 4.86 |
| variance | 0.0848888888888875 |
| confidence_interval | {4.67941507384182,5.04058492615818} |
| min | 4.4 |
| max | 5.4 |
| first_quartile | 4.625 |
| median | 4.9 |
| third_quartile | 5 |
| most_frequent_values | {4.9,5,4.6,5.1,4.7,5.4,4.4} |
| mfv_frequencies | {2,2,2,1,1,1,1} |
| ... |
| -[ RECORD 3 ]--------+---------------------------------------- |
| group_by | class_name |
| group_by_value | Iris-versicolor |
| target_column | sepal_length |
| column_number | 2 |
| data_type | float8 |
| row_count | 10 |
| distinct_values | 10 |
| missing_values | 0 |
| blank_values | |
| fraction_missing | 0 |
| fraction_blank | |
| positive_values | 10 |
| negative_values | 0 |
| zero_values | 0 |
| mean | 6.1 |
| variance | 0.528888888888893 |
| confidence_interval | {5.64924734548141,6.55075265451859} |
| min | 4.9 |
| max | 7 |
| first_quartile | 5.55 |
| median | 6.35 |
| third_quartile | 6.575 |
| most_frequent_values | {6.9,5.5,6.5,5.7,6.3,4.9,6.6,5.2,7,6.4} |
| mfv_frequencies | {1,1,1,1,1,1,1,1,1,1} |
| ... |
| </pre> |
| |
| -# Trying some other parameters: |
| <pre class="example"> |
| DROP TABLE IF EXISTS iris_summary; |
| SELECT * FROM madlib.summary( 'iris', -- Source table |
| 'iris_summary', -- Output table |
| 'sepal_length, sepal_width', -- Columns to summarize |
| NULL, -- No grouping |
| TRUE, -- Get distinct values |
| FALSE, -- Dont get quartiles |
| ARRAY[0.33, 0.66], -- Get ntiles |
| 3, -- Number of MFV to compute |
| FALSE -- Get exact values |
| ); |
| SELECT * FROM iris_summary; |
| </pre> |
| Result: |
| <pre class="result"> |
| -[ RECORD 1 ]--------+------------------------------------ |
| group_by | |
| group_by_value | |
| target_column | sepal_length |
| column_number | 2 |
| data_type | float8 |
| row_count | 30 |
| distinct_values | 22 |
| missing_values | 0 |
| blank_values | |
| fraction_missing | 0 |
| fraction_blank | |
| positive_values | 30 |
| negative_values | 0 |
| zero_values | 0 |
| mean | 5.84333333333333 |
| variance | 0.929436781609175 |
| confidence_interval | {5.49834423494375,6.18832243172292} |
| min | 4.4 |
| max | 7.6 |
| quantile_array | {5.057,6.414} |
| most_frequent_values | {4.9,6.3,6.5} |
| mfv_frequencies | {4,3,2} |
| -[ RECORD 2 ]--------+------------------------------------ |
| group_by | |
| group_by_value | |
| target_column | sepal_width |
| column_number | 3 |
| data_type | float8 |
| row_count | 30 |
| distinct_values | 14 |
| missing_values | 0 |
| blank_values | |
| fraction_missing | 0 |
| fraction_blank | |
| positive_values | 30 |
| negative_values | 0 |
| zero_values | 0 |
| mean | 3.04 |
| variance | 0.13903448275862 |
| confidence_interval | {2.90656901047539,3.17343098952461} |
| min | 2.3 |
| max | 3.9 |
| quantile_array | {2.9,3.2} |
| most_frequent_values | {2.9,3,3.2} |
| mfv_frequencies | {4,4,3} |
| </pre> |
| |
| @anchor notes |
| @par Notes |
| - Table names can be optionally schema qualified (current_schemas() would be |
| searched if a schema name is not provided) and table and column names |
| should follow case-sensitivity and quoting rules per the database. |
| (For instance, 'mytable' and 'MyTable' both resolve to the same entity, i.e. 'mytable'. |
| If mixed-case or multi-byte characters are desired for entity names then the |
| string should be double-quoted; in this case the input would be '"MyTable"'). |
| - The <em>get_estimates</em> parameter controls computation for both distinct |
| count and most frequent values: |
| - If <em>get_estimates</em> is TRUE then the distinct value computation is |
| estimated using Flajolet-Martin. MFV is computed using a |
| fast method that does parallel aggregation in Greenplum Database at the expense |
| of missing or duplicating some of the most frequent values. |
| - If <em>get_estimates</em> is FALSE then the distinct values are computed |
| in a slower but exact method using PostgreSQL COUNT DISTINCT. MFV is computed using a |
| faithful implementation that preserves the approximation guarantees of |
| the Cormode/Muthukrishnan method (more information at \ref grp_mfvsketch). |
| |
| |
| @anchor related |
| @par Related Topics |
| File summary.sql_in documenting the \b summary() function |
| |
| \ref grp_fmsketch <br/> |
| \ref grp_mfvsketch <br/> |
| \ref grp_countmin |
| |
| */ |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.summary_result CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.summary_result AS |
| ( |
| output_table TEXT, |
| num_col_summarized INT4, |
| duration FLOAT8 |
| ); |
| |
| |
| ----------------------------------------------------------------------- |
| -- Main function for summary |
| ----------------------------------------------------------------------- |
| /* |
| * @brief Compute a summary statistics on a table with optional grouping support |
| * |
| * @param source_table Name of source relation containing the data |
| * @param output_table Name of output table name to store the summary |
| * @param target_cols String with comma separated list of columns on which summary is desired |
| * @param grouping_cols String with comma separated list of columns on which to group the data by |
| * @param get_distinct Should distinct values count be included in result |
| * @param get_quartiles Should first, second (median), and third quartiles be included in result |
| * @param ntile_array Array of percentiles to compute |
| * @param how_many_mfv How many most frequent values to compute? |
| * @param get_estimates Should distinct counts be an estimated (faster) or exact count? |
| * @param n_cols_per_run Number of columns to use per run of summary |
| * |
| * @usage |
| * |
| * <pre> SELECT MADLIB_SCHEMA.summary ( |
| * '<em>source_table</em>', '<em>output_table</em>', |
| * '<em>target_cols</em>', '<em>grouping_cols</em>', |
| * '<em>get_distinct</em>', '<em>get_quartiles</em>', |
| * '<em>ntile_array</em>', '<em>how_many_mfv</em>', |
| * '<em>get_estimates</em>', '<em>n_cols_per_run</em>' |
| * ); |
| * SELECT * FROM '<em>output_table</em>' |
| * </pre> |
| */ |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, -- source table name |
| output_table TEXT, -- output table name |
| target_cols TEXT, -- comma separated list of output cols |
| grouping_cols TEXT, -- comma separated names of grouping cols |
| get_distinct BOOLEAN, -- Are distinct values required |
| get_quartiles BOOLEAN, -- Are quartiles required |
| ntile_array FLOAT8[], -- Array of quantiles to compute |
| how_many_mfv INTEGER, -- How many most frequent values to compute? |
| get_estimates BOOLEAN, -- Should we produce exact or estimated |
| -- values for distinct computation |
| n_cols_per_run INTEGER -- Number of columns to use per run of summary |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| PythonFunctionBodyOnly(`summary', `summary') |
| with AOControl(False): |
| return summary.summary( |
| schema_madlib, source_table, output_table, target_cols, grouping_cols, |
| get_distinct, get_quartiles, ntile_array, how_many_mfv, |
| get_estimates, n_cols_per_run) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| --- Overloaded functions to support optional parameters |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT, |
| grouping_cols TEXT, |
| get_distinct BOOLEAN, |
| get_quartiles BOOLEAN, |
| ntile_array FLOAT8[], |
| how_many_mfv INTEGER, |
| get_estimates BOOLEAN |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, $4, $5, $6, $7, $8, $9, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT, |
| grouping_cols TEXT, |
| get_distinct BOOLEAN, |
| get_quartiles BOOLEAN, |
| ntile_array FLOAT8[], |
| how_many_mfv INTEGER |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, $4, $5, $6, $7, $8, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT, |
| grouping_cols TEXT, |
| get_distinct BOOLEAN, |
| get_quartiles BOOLEAN, |
| ntile_array FLOAT8[] |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, $4, $5, $6, $7, 10, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT, |
| grouping_cols TEXT, |
| get_distinct BOOLEAN, |
| get_quartiles BOOLEAN |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, $4, $5, $6, NULL, 10, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT, |
| grouping_cols TEXT, |
| get_distinct BOOLEAN |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, $4, $5, True, NULL, 10, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT, |
| grouping_cols TEXT |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, $4, True, True, NULL, 10, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT, |
| target_cols TEXT |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, $3, NULL, True, True, NULL, 10, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION |
| MADLIB_SCHEMA.summary |
| ( |
| source_table TEXT, |
| output_table TEXT |
| ) |
| RETURNS MADLIB_SCHEMA.summary_result AS $$ |
| SELECT MADLIB_SCHEMA.summary( |
| $1, $2, NULL, NULL, True, True, NULL, 10, True, 15) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| ----------------------------------------------------------------------- |
| -- Help functions |
| ----------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary( |
| input_message TEXT |
| ) |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`summary', `summary') |
| with AOControl(False): |
| return summary.summary_help_message(schema_madlib, input_message) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.summary() |
| RETURNS TEXT AS $$ |
| PythonFunctionBodyOnly(`summary', `summary') |
| with AOControl(False): |
| return summary.summary_help_message(schema_madlib, None) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |