src/ports/postgres/modules/data_profile/profile.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file profile.sql_in
  *
  * @brief SQL function for single-pass table profiles
  * @date   January 2011
  *
  * @sa For a brief introduction to "profiles", see the module
  *     description grp_profile. Cf. also the module grp_sketches.
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')

 /**
 @addtogroup grp_profile

 \warning <em> This is an old implementation of summary function. Replacement of this
 function is available as the Summary module \ref grp_summary</em>

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#syntax">Function Syntax</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#notes">Implementation Notes</a></li>
 <li><a href="#related">Related Topics</a></li>
 </ul>
 </div>

 @brief Produces a "profile" of a table or view by running a predefined set of aggregates on each column.

 This module computes a "profile" of a table or view: a predefined set of
 aggregates to be run on each column of a table.

 The following aggregates are called on every integer column:
 - min(), max(), avg()
 - madlib.cmsketch_median()
 - madlib.cmsketch_depth_histogram()
 - madlib.cmsketch_width_histogram()

 The following aggregates are called on non-integer columns:
 - madlib.fmsketch_dcount()
 - madlib.mfvsketch_quick_histogram()
 - madlib.mfvsketch_top_histogram()

 Because the input schema of the table or view is unknown, SQL is synthesized
 to suit the input. This is done either via the <c>profile</c> or <c>profile_full</c>
 user defined function.

 @anchor syntax
 @par Function Syntax

 Generate basic profile information (subset of predefined aggregate functions)
 for all columns of the input table.
 <pre class="syntax">
 profile( input_table )
 </pre>


 Generate full profile information (all predefined aggregate functions)
   for all columns of the input table.
 <pre class="syntax">
 profile_full( input_table,
               buckets
             )
 </pre>

 @anchor examples
 @examp

 -# Generate basic profile information.
 <pre class="example">
 SELECT * FROM profile( 'pg_catalog.pg_tables');
 </pre>
 Result:
 <pre class="result">
  schema_name | table_name | column_name |       function    | value
 &nbsp;------------+------------+-------------+-------------------+-------
  pg_catalog  | pg_tables  | *           | COUNT()           | 105
  pg_catalog  | pg_tables  | schemaname  | fmsketch_dcount() | 6
  pg_catalog  | pg_tables  | tablename   | fmsketch_dcount() | 104
  pg_catalog  | pg_tables  | tableowner  | fmsketch_dcount() | 2
  pg_catalog  | pg_tables  | tablespace  | fmsketch_dcount() | 1
  pg_catalog  | pg_tables  | hasindexes  | fmsketch_dcount() | 2
  pg_catalog  | pg_tables  | hasrules    | fmsketch_dcount() | 1
  pg_catalog  | pg_tables  | hastriggers | fmsketch_dcount() | 2
 (8 rows)
 </pre>

 -# Generate full profile information.
 <pre class="example">
 SELECT * FROM profile_full( 'pg_catalog.pg_tables',
                              5
                           );
 </pre>
 Result:
 <pre class="result">
  schema_name | table_name | column_name |                        function                 |                                               value
 &nbsp;------------+------------+-------------+-------------------------------------------------+----------------------------------------------------------------------------------------------------
  pg_catalog  | pg_tables  | *           | COUNT()                                         | 105
  pg_catalog  | pg_tables  | schemaname  | fmsketch_dcount()                               | 6
  pg_catalog  | pg_tables  | schemaname  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
  pg_catalog  | pg_tables  | schemaname  | array_collapse(mfvsketch_top_histogram((),5))   | [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
  pg_catalog  | pg_tables  | tablename   | fmsketch_dcount()                               | 104
  pg_catalog  | pg_tables  | tablename   | array_collapse(mfvsketch_quick_histogram((),5)) | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
  pg_catalog  | pg_tables  | tablename   | array_collapse(mfvsketch_top_histogram((),5))   | [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
  pg_catalog  | pg_tables  | tableowner  | fmsketch_dcount()                               | 2
  pg_catalog  | pg_tables  | tableowner  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={agorajek:104,alex:1}
  pg_catalog  | pg_tables  | tableowner  | array_collapse(mfvsketch_top_histogram((),5))   | [0:1]={agorajek:104,alex:1}
  pg_catalog  | pg_tables  | tablespace  | fmsketch_dcount()                               | 1
  pg_catalog  | pg_tables  | tablespace  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={pg_global:28}
  pg_catalog  | pg_tables  | tablespace  | array_collapse(mfvsketch_top_histogram((),5))   | [0:0]={pg_global:28}
  pg_catalog  | pg_tables  | hasindexes  | fmsketch_dcount()                               | 2
  pg_catalog  | pg_tables  | hasindexes  | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={t:59,f:46}
  pg_catalog  | pg_tables  | hasindexes  | array_collapse(mfvsketch_top_histogram((),5))   | [0:1]={t:59,f:46}
  pg_catalog  | pg_tables  | hasrules    | fmsketch_dcount()                               | 1
  pg_catalog  | pg_tables  | hasrules    | array_collapse(mfvsketch_quick_histogram((),5)) | [0:0]={f:105}
  pg_catalog  | pg_tables  | hasrules    | array_collapse(mfvsketch_top_histogram((),5))   | [0:0]={f:105}
  pg_catalog  | pg_tables  | hastriggers | fmsketch_dcount()                               | 2
  pg_catalog  | pg_tables  | hastriggers | array_collapse(mfvsketch_quick_histogram((),5)) | [0:1]={f:102,t:3}
  pg_catalog  | pg_tables  | hastriggers | array_collapse(mfvsketch_top_histogram((),5))   | [0:1]={f:102,t:3}
 (22 rows)
 </pre>

 @anchor notes
 @par Implementation Notes

 Because some of the aggregate functions used in profile return multi-dimensional
 arrays, which are not easily handled in pl/python, we are using
 <c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays.
 All values of 2 and upper dimensions are separated with ':' character.

 Note that this module is not available in HAWQ.
 Instead, we suggest the Summary module in Descriptive Statistics as a replacement of this module.

 @anchor related
 @par Related Topics
 File profile.sql_in documenting SQL functions.
 */

 CREATE TYPE MADLIB_SCHEMA.profile_result AS (
       schema_name TEXT
     , table_name  TEXT
     , column_name TEXT
     , function    TEXT
     , value       TEXT
 );

 /**
  * @brief Compute a simple "profile" of a table or view
  *
  * @param input_table table name to analyze
  * @return Set of PROFILE_RESULT type
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text)
   RETURNS SETOF MADLIB_SCHEMA.profile_result
 AS $$

     PythonFunctionBodyOnly(`data_profile', `profile')

     # schema_madlib comes from PythonFunctionBodyOnly
     return profile.profile( schema_madlib, input_table, 'bas', None);

 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

 /**
  * @brief Compute a full "profile" of a table or view
  *
  * @param input_table table name to analyze
  * @param buckets number of buckets for histogram functions
  * @return Set of PROFILE_RESULT type
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer)
   RETURNS SETOF MADLIB_SCHEMA.profile_result
 AS $$

     PythonFunctionBodyOnly(`data_profile', `profile')

     # schema_madlib comes from PythonFunctionBodyOnly
     return profile.profile( schema_madlib, input_table, 'all', buckets);

 $$ LANGUAGE plpythonu
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

 /**
  * @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in
  *        pl/python can read it. Otherwise we would get
  *        ERROR: cannot convert multidimensional array to Python list
  *
  * @param input Mult-dim array
  * @return One dimensional anyarray
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray)
   RETURNS anyarray
 AS $$
 DECLARE
   x TEXT[];
   i INTEGER;
 BEGIN
   IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN
     x[1] = NULL;
     RETURN x;
   END IF;
   FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP
     x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':');
   END LOOP;
   RETURN x;
 END;
 $$ LANGUAGE plpgsql
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
	/* ----------------------------------------------------------------------- //*
	*
	* @file profile.sql_in
	*
	* @brief SQL function for single-pass table profiles
	* @date January 2011
	*
	* @sa For a brief introduction to "profiles", see the module
	* description grp_profile. Cf. also the module grp_sketches.
	*
	// ----------------------------------------------------------------------- */

	m4_include(`SQLCommon.m4')

	/**
	@addtogroup grp_profile

	\warning <em> This is an old implementation of summary function. Replacement of this
	function is available as the Summary module \ref grp_summary</em>

	<div class="toc"><b>Contents</b>
	<ul>
	<li><a href="#syntax">Function Syntax</a></li>
	<li><a href="#examples">Examples</a></li>
	<li><a href="#notes">Implementation Notes</a></li>
	<li><a href="#related">Related Topics</a></li>
	</ul>
	</div>

	@brief Produces a "profile" of a table or view by running a predefined set of aggregates on each column.

	This module computes a "profile" of a table or view: a predefined set of
	aggregates to be run on each column of a table.

	The following aggregates are called on every integer column:
	- min(), max(), avg()
	- madlib.cmsketch_median()
	- madlib.cmsketch_depth_histogram()
	- madlib.cmsketch_width_histogram()

	The following aggregates are called on non-integer columns:
	- madlib.fmsketch_dcount()
	- madlib.mfvsketch_quick_histogram()
	- madlib.mfvsketch_top_histogram()

	Because the input schema of the table or view is unknown, SQL is synthesized
	to suit the input. This is done either via the <c>profile</c> or <c>profile_full</c>
	user defined function.

	@anchor syntax
	@par Function Syntax

	Generate basic profile information (subset of predefined aggregate functions)
	for all columns of the input table.
	<pre class="syntax">
	profile( input_table )
	</pre>


	Generate full profile information (all predefined aggregate functions)
	for all columns of the input table.
	<pre class="syntax">
	profile_full( input_table,
	buckets
	)
	</pre>

	@anchor examples
	@examp

	-# Generate basic profile information.
	<pre class="example">
	SELECT * FROM profile( 'pg_catalog.pg_tables');
	</pre>
	Result:
	<pre class="result">
	schema_name \| table_name \| column_name \| function \| value
	------------+------------+-------------+-------------------+-------
	pg_catalog \| pg_tables \| * \| COUNT() \| 105
	pg_catalog \| pg_tables \| schemaname \| fmsketch_dcount() \| 6
	pg_catalog \| pg_tables \| tablename \| fmsketch_dcount() \| 104
	pg_catalog \| pg_tables \| tableowner \| fmsketch_dcount() \| 2
	pg_catalog \| pg_tables \| tablespace \| fmsketch_dcount() \| 1
	pg_catalog \| pg_tables \| hasindexes \| fmsketch_dcount() \| 2
	pg_catalog \| pg_tables \| hasrules \| fmsketch_dcount() \| 1
	pg_catalog \| pg_tables \| hastriggers \| fmsketch_dcount() \| 2
	(8 rows)
	</pre>

	-# Generate full profile information.
	<pre class="example">
	SELECT * FROM profile_full( 'pg_catalog.pg_tables',
	5
	);
	</pre>
	Result:
	<pre class="result">
	schema_name \| table_name \| column_name \| function \| value
	------------+------------+-------------+-------------------------------------------------+----------------------------------------------------------------------------------------------------
	pg_catalog \| pg_tables \| * \| COUNT() \| 105
	pg_catalog \| pg_tables \| schemaname \| fmsketch_dcount() \| 6
	pg_catalog \| pg_tables \| schemaname \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
	pg_catalog \| pg_tables \| schemaname \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:4]={pg_catalog:68,public:19,information_schema:7,gp_toolkit:5,maddy:5}
	pg_catalog \| pg_tables \| tablename \| fmsketch_dcount() \| 104
	pg_catalog \| pg_tables \| tablename \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
	pg_catalog \| pg_tables \| tablename \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:4]={migrationhistory:2,pg_statistic:1,sql_features:1,sql_implementation_info:1,sql_languages:1}
	pg_catalog \| pg_tables \| tableowner \| fmsketch_dcount() \| 2
	pg_catalog \| pg_tables \| tableowner \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:1]={agorajek:104,alex:1}
	pg_catalog \| pg_tables \| tableowner \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:1]={agorajek:104,alex:1}
	pg_catalog \| pg_tables \| tablespace \| fmsketch_dcount() \| 1
	pg_catalog \| pg_tables \| tablespace \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:0]={pg_global:28}
	pg_catalog \| pg_tables \| tablespace \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:0]={pg_global:28}
	pg_catalog \| pg_tables \| hasindexes \| fmsketch_dcount() \| 2
	pg_catalog \| pg_tables \| hasindexes \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:1]={t:59,f:46}
	pg_catalog \| pg_tables \| hasindexes \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:1]={t:59,f:46}
	pg_catalog \| pg_tables \| hasrules \| fmsketch_dcount() \| 1
	pg_catalog \| pg_tables \| hasrules \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:0]={f:105}
	pg_catalog \| pg_tables \| hasrules \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:0]={f:105}
	pg_catalog \| pg_tables \| hastriggers \| fmsketch_dcount() \| 2
	pg_catalog \| pg_tables \| hastriggers \| array_collapse(mfvsketch_quick_histogram((),5)) \| [0:1]={f:102,t:3}
	pg_catalog \| pg_tables \| hastriggers \| array_collapse(mfvsketch_top_histogram((),5)) \| [0:1]={f:102,t:3}
	(22 rows)
	</pre>

	@anchor notes
	@par Implementation Notes

	Because some of the aggregate functions used in profile return multi-dimensional
	arrays, which are not easily handled in pl/python, we are using
	<c>array_collapse</c> function to collaps the n-dim arrays to 1-dim arrays.
	All values of 2 and upper dimensions are separated with ':' character.

	Note that this module is not available in HAWQ.
	Instead, we suggest the Summary module in Descriptive Statistics as a replacement of this module.

	@anchor related
	@par Related Topics
	File profile.sql_in documenting SQL functions.
	*/

	CREATE TYPE MADLIB_SCHEMA.profile_result AS (
	schema_name TEXT
	, table_name TEXT
	, column_name TEXT
	, function TEXT
	, value TEXT
	);

	/**
	* @brief Compute a simple "profile" of a table or view
	*
	* @param input_table table name to analyze
	* @return Set of PROFILE_RESULT type
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile( input_table text)
	RETURNS SETOF MADLIB_SCHEMA.profile_result
	AS $$

	PythonFunctionBodyOnly(`data_profile', `profile')

	# schema_madlib comes from PythonFunctionBodyOnly
	return profile.profile( schema_madlib, input_table, 'bas', None);

	$$ LANGUAGE plpythonu
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

	/**
	* @brief Compute a full "profile" of a table or view
	*
	* @param input_table table name to analyze
	* @param buckets number of buckets for histogram functions
	* @return Set of PROFILE_RESULT type
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_full( input_table text, buckets integer)
	RETURNS SETOF MADLIB_SCHEMA.profile_result
	AS $$

	PythonFunctionBodyOnly(`data_profile', `profile')

	# schema_madlib comes from PythonFunctionBodyOnly
	return profile.profile( schema_madlib, input_table, 'all', buckets);

	$$ LANGUAGE plpythonu
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');

	/**
	* @brief Collapses n-dimensional Arrays to 1-dim Array, so they can be read in
	* pl/python can read it. Otherwise we would get
	* ERROR: cannot convert multidimensional array to Python list
	*
	* @param input Mult-dim array
	* @return One dimensional anyarray
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.array_collapse( input anyarray)
	RETURNS anyarray
	AS $$
	DECLARE
	x TEXT[];
	i INTEGER;
	BEGIN
	IF array_lower( input, 1) is NULL OR array_upper( input, 1) is NULL THEN
	x[1] = NULL;
	RETURN x;
	END IF;
	FOR i IN array_lower( input, 1)..array_upper( input, 1) LOOP
	x[i] := array_to_string( input[i:i][array_lower( input, 2):array_upper( input, 2)], ':');
	END LOOP;
	RETURN x;
	END;
	$$ LANGUAGE plpgsql
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');