blob: 0b00c7471dbbdb01b64839bfb27fdb52bf8957b1 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file profile.sql_in
*
* @brief SQL function for single-pass table profiles
* @date January 2011
*
* @sa For a brief introduction to "profiles", see the module
* description grp_profile. Cf. also the module grp_sketches.
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_profile
\warning <em> This is an old implementation of summary function. Replacement of this
function is available as the Summary module \ref grp_summary</em>
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#syntax">Function Syntax</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#notes">Implementation Notes</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Produces a "profile" of a table or view by running a predefined set of aggregates on each column.
This module computes a "profile" of a table or view: a predefined set of
aggregates to be run on each column of a table.
The following aggregates are called on every integer column:
- min(), max(), avg()
- madlib.cmsketch_median()
- madlib.cmsketch_depth_histogram()
- madlib.cmsketch_width_histogram()
The following aggregates are called on non-integer columns:
- madlib.fmsketch_dcount()
- madlib.mfvsketch_quick_histogram()
- madlib.mfvsketch_top_histogram()
Because the input schema of the table or view is unknown, SQL is synthesized
to suit the input. This is done either via the <c>profile</c> or <c>profile_full</c>
user defined function.
@anchor syntax
@par Function Syntax
Generate basic profile information (subset of predefined aggregate functions)
for all columns of the input table.
<pre class="syntax">
profile( input_table )
</pre>
Generate full profile information (all predefined aggregate functions)
for all columns of the input table.
<pre class="syntax">
profile_full( input_table,
buckets
)
</pre>
@anchor examples
@examp
-# Generate basic profile information.
<pre class="example">
SELECT * FROM profile( 'pg_catalog.pg_tables');
</pre>
Result:
<pre class="result">
schema_name | table_name | column_name | function | value
&nbsp;------------+------------+-------------+-------------------+-------
pg_catalog | pg_tables | * | COUNT() | 105
pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6
pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104
pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2
pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1
pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2
pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1
pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2
(8 rows)
</pre>
-# Generate full profile information.
<pre class="example">
SELECT * FROM profile_full( 'pg_catalog.pg_tables',
5
);
</pre>
Result:
<pre class="result">
schema_name | table_name | column_name | function | value
&nbsp;------------+------------+-------------+-------------------------------------------------+----------------------------------------------------------------------------------------------------
pg_catalog | pg_tables | * | COUNT() | 105
pg_catalog | pg_tables | schemaname | fmsketch_dcount() | 6
pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
pg_catalog | pg_tables | schemaname | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
pg_catalog | pg_tables | tablename | fmsketch_dcount() | 104
pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
pg_catalog | pg_tables | tablename | array_collapse(mfvsketch_top_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
pg_catalog | pg_tables | tableowner | fmsketch_dcount() | 2
pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={agorajek:104,alex:1}
pg_catalog | pg_tables | tableowner | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={agorajek:104,alex:1}
pg_catalog | pg_tables | tablespace | fmsketch_dcount() | 1
pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={pg_global:28}
pg_catalog | pg_tables | tablespace | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={pg_global:28}
pg_catalog | pg_tables | hasindexes | fmsketch_dcount() | 2
pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={t:59,f:46}
pg_catalog | pg_tables | hasindexes | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={t:59,f:46}
pg_catalog | pg_tables | hasrules | fmsketch_dcount() | 1
pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={f:105}
pg_catalog | pg_tables | hasrules | array_collapse(mfvsketch_top_histogram((),5)) | [0:0]={f:105}
pg_catalog | pg_tables | hastriggers | fmsketch_dcount() | 2
pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={f:102,t:3}
pg_catalog | pg_tables | hastriggers | array_collapse(mfvsketch_top_histogram((),5)) | [0:1]={f:102,t:3}
(22 rows)
</pre>
@anchor notes
@par Implementation Notes
Because some of the aggregate functions used in profile return multi-dimensional
arrays, which are not easily handled in pl/python, we are using
<c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays.
All values of 2 and upper dimensions are separated with ':' character.
Note that this module is not available in HAWQ.
Instead, we suggest the Summary module in Descriptive Statistics as a replacement of this module.
@anchor related
@par Related Topics
File profile.sql_in documenting SQL functions.
*/
CREATE TYPE MADLIB_SCHEMA.profile_result AS (
schema_name TEXT
, table_name TEXT
, column_name TEXT
, function TEXT
, value TEXT
);
/**
* @brief Compute a simple "profile" of a table or view
*
* @param input_table table name to analyze
* @return Set of PROFILE_RESULT type
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text)
RETURNS SETOF MADLIB_SCHEMA.profile_result
AS $$
PythonFunctionBodyOnly(`data_profile', `profile')
# schema_madlib comes from PythonFunctionBodyOnly
return profile.profile( schema_madlib, input_table, 'bas', None);
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/**
* @brief Compute a full "profile" of a table or view
*
* @param input_table table name to analyze
* @param buckets number of buckets for histogram functions
* @return Set of PROFILE_RESULT type
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer)
RETURNS SETOF MADLIB_SCHEMA.profile_result
AS $$
PythonFunctionBodyOnly(`data_profile', `profile')
# schema_madlib comes from PythonFunctionBodyOnly
return profile.profile( schema_madlib, input_table, 'all', buckets);
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
/**
* @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in
* pl/python can read it. Otherwise we would get
* ERROR: cannot convert multidimensional array to Python list
*
* @param input Mult-dim array
* @return One dimensional anyarray
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray)
RETURNS anyarray
AS $$
DECLARE
x TEXT[];
i INTEGER;
BEGIN
IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN
x[1] = NULL;
RETURN x;
END IF;
FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP
x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':');
END LOOP;
RETURN x;
END;
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');