| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file profile.sql_in |
| * |
| * @brief SQL function for single-pass table profiles |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to "profiles", see the module |
| * description grp_profile. Cf. also the module grp_sketches. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_profile |
| |
| \warning <em> This is an old implementation of summary function. Replacement of this |
| function is available as the Summary module \ref grp_summary</em> |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#syntax">Function Syntax</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#notes">Implementation Notes</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Produces a "profile" of a table or view by running a predefined set of aggregates on each column. |
| |
| This module computes a "profile" of a table or view: a predefined set of |
| aggregates to be run on each column of a table. |
| |
| The following aggregates are called on every integer column: |
| - min(), max(), avg() |
| - madlib.cmsketch_median() |
| - madlib.cmsketch_depth_histogram() |
| - madlib.cmsketch_width_histogram() |
| |
| The following aggregates are called on non-integer columns: |
| - madlib.fmsketch_dcount() |
| - madlib.mfvsketch_quick_histogram() |
| - madlib.mfvsketch_top_histogram() |
| |
| Because the input schema of the table or view is unknown, SQL is synthesized |
| to suit the input. This is done either via the <c>profile</c> or <c>profile_full</c> |
| user defined function. |
| |
| @anchor syntax |
| @par Function Syntax |
| |
| Generate basic profile information (subset of predefined aggregate functions) |
| for all columns of the input table. |
| <pre class="syntax"> |
| profile( input_table ) |
| </pre> |
| |
| |
| Generate full profile information (all predefined aggregate functions) |
| for all columns of the input table. |
| <pre class="syntax"> |
| profile_full( input_table, |
| buckets |
| ) |
| </pre> |
| |
| @anchor examples |
| @examp |
| |
| -# Generate basic profile information. |
| <pre class="example"> |
| SELECT * FROM profile( 'pg_catalog.pg_tables'); |
| </pre> |
| Result: |
| <pre class="result"> |
| schema_name | table_name | column_name | function | value |
| ------------+------------+-------------+-------------------+------- |
| pg_catalog | pg_tables | * | COUNT() | 105 |
| pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6 |
| pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104 |
| pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2 |
| pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1 |
| pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2 |
| pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1 |
| pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2 |
| (8 rows) |
| </pre> |
| |
| -# Generate full profile information. |
| <pre class="example"> |
| SELECT * FROM profile_full( 'pg_catalog.pg_tables', |
| 5 |
| ); |
| </pre> |
| Result: |
| <pre class="result"> |
| schema_name | table_name | column_name | function | value |
| ------------+------------+-------------+-------------------------------------------------+---------------------------------------------------------------------------------------------------- |
| pg_catalog | pg_tables | * | COUNT() | 105 |
| pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6 |
| pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5} |
| pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5} |
| pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104 |
| pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1} |
| pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1} |
| pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2 |
| pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={agorajek:104,alex:1} |
| pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={agorajek:104,alex:1} |
| pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1 |
| pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={pg_global:28} |
| pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={pg_global:28} |
| pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2 |
| pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={t:59,f:46} |
| pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={t:59,f:46} |
| pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1 |
| pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={f:105} |
| pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={f:105} |
| pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2 |
| pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={f:102,t:3} |
| pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={f:102,t:3} |
| (22 rows) |
| </pre> |
| |
| @anchor notes |
| @par Implementation Notes |
| |
| Because some of the aggregate functions used in profile return multi-dimensional |
| arrays, which are not easily handled in pl/python, we are using |
| <c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays. |
| All values of 2 and upper dimensions are separated with ':' character. |
| |
| Note that this module is not available in HAWQ. |
| Instead, we suggest the Summary module in Descriptive Statistics as a replacement of this module. |
| |
| @anchor related |
| @par Related Topics |
| File profile.sql_in documenting SQL functions. |
| */ |
| |
| CREATE TYPE MADLIB_SCHEMA.profile_result AS ( |
| schema_name TEXT |
| , table_name TEXT |
| , column_name TEXT |
| , function TEXT |
| , value TEXT |
| ); |
| |
| /** |
| * @brief Compute a simple "profile" of a table or view |
| * |
| * @param input_table table name to analyze |
| * @return Set of PROFILE_RESULT type |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text) |
| RETURNS SETOF MADLIB_SCHEMA.profile_result |
| AS $$ |
| |
| PythonFunctionBodyOnly(`data_profile', `profile') |
| |
| # schema_madlib comes from PythonFunctionBodyOnly |
| return profile.profile( schema_madlib, input_table, 'bas', None); |
| |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /** |
| * @brief Compute a full "profile" of a table or view |
| * |
| * @param input_table table name to analyze |
| * @param buckets number of buckets for histogram functions |
| * @return Set of PROFILE_RESULT type |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer) |
| RETURNS SETOF MADLIB_SCHEMA.profile_result |
| AS $$ |
| |
| PythonFunctionBodyOnly(`data_profile', `profile') |
| |
| # schema_madlib comes from PythonFunctionBodyOnly |
| return profile.profile( schema_madlib, input_table, 'all', buckets); |
| |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `'); |
| |
| /** |
| * @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in |
| * pl/python can read it. Otherwise we would get |
| * ERROR: cannot convert multidimensional array to Python list |
| * |
| * @param input Mult-dim array |
| * @return One dimensional anyarray |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray) |
| RETURNS anyarray |
| AS $$ |
| DECLARE |
| x TEXT[]; |
| i INTEGER; |
| BEGIN |
| IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN |
| x[1] = NULL; |
| RETURN x; |
| END IF; |
| FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP |
| x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':'); |
| END LOOP; |
| RETURN x; |
| END; |
| $$ LANGUAGE plpgsql |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |