methods/sketch/src/pg_gp/sketch.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file sketch.sql_in
  *
  * @brief SQL functions for sketch-based approximations of descriptive statistics
  * @date  April 2011
  *
  * @sa For a brief introduction to sketches, see the module description grp_sketches
  *
  *//* ----------------------------------------------------------------------- */

 m4_include(`SQLCommon.m4')

 /**
 @addtogroup grp_sketches

 @about

 Sketches (sometimes called "synopsis data structures") are small randomized
 in-memory data structures that capture statistical properties of a large set
 of values (e.g. a column of a table).  Sketches can be formed in a single
 pass of the data, and used to approximate a variety of descriptive statistics.

 We implement sketches as SQL User-Defined Aggregates (UDAs).  Because they
 are single-pass, small-space and parallelized, a single query can
 use many sketches to gather summary statistics on many columns of a table efficiently.

 This module currently implements user-defined aggregates based on three main sketch methods:
  - <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>.
  - <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including
    - <c>COUNT(*)</c> of rows whose column value matches a given value in a set
    - <c>COUNT(*)</c> of rows whose column value falls in a range (*)
    - order statistics including <i>median</i> and <i>centiles</i> (*)
    - <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*)
  - <i>Most Frequent Value (MFV)</i> sketches, which output the most
 frequently-occuring values in a column, along with their associated counts.

  <i>Note:</i> Features marked with a star (*) only work for discrete types that
  can be cast to int8.

 @implementation
 The sketch methods consists of a number of SQL UDAs (user defined aggregates)
 and UDFs (user defined functions), to be used directly in SQL queries.
 */

 /**
 @addtogroup grp_fmsketch

 @about
 Flajolet-Martin's distinct count estimation
 implemented as a user-defined aggregate.

 @usage
 - Get the number of distinct values in a designated column.
   <pre>SELECT \ref fmsketch_dcount(<em>col_name</em>) FROM table_name;</pre>

 @implementation
 \ref fmsketch_dcount can be run on a column of any type.
 It returns an approximation to the number of distinct values
 (a la <c>COUNT(DISTINCT x)</c>), but faster and approximate.
 Like any aggregate, it can be combined with a GROUP BY clause to do distinct
 counts per group.

 @examp
 -# Generate some data:
 \verbatim
 sql> CREATE TABLE data(class INT, a1 INT);
 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
 \endverbatim
 -# Find distinct number of values for each class
 \verbatim
 sql> SELECT class,fmsketch_dcount(a1) FROM data GROUP BY data.class;
 class | fmsketch_dcount
 -------+-----------------
     2 |               2
     1 |               3
 (2 rows)
 \endverbatim

 @literature
 [1] P. Flajolet and N.G. Martin.  Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985.  http://algo.inria.fr/flajolet/Publications/FlMa85.pdf

 @sa File sketch.sql_in documenting the SQL function.

 */

 /**
 @addtogroup grp_countmin

 @about
 This module implements Cormode-Muthukrishnan <i>CountMin</i> sketches
 on integer values, implemented as a user-defined aggregate.  It also provides
 scalar functions over the sketches to produce approximate counts, order
 statistics, and histograms.

 @usage

 - Get a sketch of a selected column specified by <em>col_name</em>.
   <pre>SELECT \ref cmsketch(<em>col_name</em>) FROM table_name;</pre>

 - Get the number of rows where <em>col_name = p</em>, computed from the sketch
   obtained from <tt>cmsketch</tt>.
   <pre>SELECT \ref cmsketch_count(<em>cmsketch</em>,<em>p</em>) FROM table_name;</pre>

 - Get the number of rows where <em>col_name</em> is between <em>m</em> and <em>n</em> inclusive.
   <pre>SELECT \ref cmsketch_rangecount(<em>cmsketch</em>,<em>m</em>,<em>n</em>) FROM table_name;</pre>

 - Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> specifies number of rows. <em>k</em> should be an integer between 1 to 99.
   <pre>SELECT \ref cmsketch_centile(<em>cmsketch</em>,<em>k</em>,<em>count</em>) FROM table_name;</pre>

 - Get the median of <em>col_name</em> where <em>count</em> specifies number of rows. This is equivalent to <tt>\ref cmsketch_centile(<em>cmsketch</em>,50,<em>count</em>)</tt>.
   <pre>SELECT \ref cmsketch_median(<em>cmsketch</em>,<em>count</em>) FROM table_name;</pre>

 - Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
   <pre>SELECT \ref cmsketch_width_histogram(<em>cmsketch</em>,<em>min</em>,<em>max</em>,<em>n</em>) FROM table_name;</pre>

 - Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.  Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
   <pre>SELECT \ref cmsketch_depth_histogram(<em>cmsketch</em>,<em>n</em>) FROM table_name;</pre>

 @examp

 -# Generate some data
 \verbatim
 sql> CREATE TABLE data(class INT, a1 INT);
 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
 \endverbatim
 -# Count number of rows where a1 = 2 in each class
 \verbatim
 sql> SELECT class,cmsketch_count(cmsketch(a1),2) FROM data GROUP BY data.class;
  class | cmsketch_count
 -------+----------------
      2 |              0
      1 |          15000
 (2 rows)
 \endverbatim
 -# Count number of rows where a1 is between 3 and 6
 \verbatim
 sql> SELECT class,cmsketch_rangecount(cmsketch(a1),3,6) FROM data GROUP BY data.class;
  class | cmsketch_rangecount
 -------+---------------------
      2 |                2000
      1 |               10000
 (2 rows)
 \endverbatim
 -# Compute the 90th percentile of all of a1
 \verbatim
 sql> SELECT cmsketch_centile(cmsketch(a1),90,count(*)) FROM data;
  cmsketch_centile
 ------------------
                 3
 (1 row)
 \endverbatim
 -# Produce an equi-width histogram with 2 bins between 0 and 10
 \verbatim
 sql> SELECT cmsketch_width_histogram(cmsketch(a1),0,10,2) FROM data;
       cmsketch_width_histogram
 ------------------------------------
  [[0L, 4L, 35000], [5L, 10L, 2000]]
 (1 row)
 \endverbatim
 -# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth
 \verbatim
 sql> SELECT cmsketch_depth_histogram(cmsketch(a1),2) FROM data;
                        cmsketch_depth_histogram
 -----------------------------------------------------------------------
  [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]]
 (1 row)
 \endverbatim

 @literature

 [1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications.  LATIN 2004, J. Algorithm 55(1): 58-75 (2005) .  http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html

 [2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html

 @sa File sketch.sql_in documenting the SQL functions.
 \n\n Module grp_quantile for a different implementation of quantile function.

 */

 /**
 @addtogroup grp_mfvsketch

 @about
 MFVSketch: Most Frequent Values variant of CountMin sketch, implemented
 as a UDA.

 @usage
 Produces an n-bucket histogram for a column where each bucket counts one of the
 most frequent values in the column. The output is an array of doubles {value, count}
 in descending order of frequency; counts are approximated via CountMin sketches.
 Ties are handled arbitrarily.
 <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>
 <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>

 The MFV frequent-value UDA comes in two different versions:
 - a faithful implementation that preserves the approximation guarantees
 of Cormode/Muthukrishnan,
 - and a "quick and dirty" version that can do parallel aggregation in Greenplum
 at the expense of missing some of the most frequent values.

 In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should
 produce good results unless the number of values requested is very small,
 or the distribution is very flat.

 @examp

 -# Generate some data
 \verbatim
 sql> CREATE TABLE data(class INT, a1 INT);
 sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
 sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
 sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
 sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
 sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
 \endverbatim
 -# Produce histogram of 5 bins and return the most frequent value and associated
 count in each bin:
 \verbatim
 sql> SELECT mfvsketch_top_histogram(a1,5) FROM data;

                 mfvsketch_top_histogram
 --------------------------------------------------------------
 [0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}}
 (1 row)
 \endverbatim

 @literature
 This method is not usually called an MFV sketch in the literature; it
 is a natural extension of the CountMin sketch.

 @sa File sketch.sql_in documenting the SQL functions.
 \n\n Module grp_countmin.
 */

 -- FM Sketch Functions
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea)
 RETURNS int8
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement);

 /**
  * @brief Flajolet-Martin's distinct count estimation
  * @param column name
  */
 CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement)
 (
     sfunc = MADLIB_SCHEMA.__fmsketch_trans,
     stype = bytea,
     finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct,
     m4_ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__fmsketch_merge,')
     initcond = ''
 );


 -- CM Sketch Functions

 -- We register __cmsketch_int8_trans for varying numbers of arguments to support
 -- a variety of agg function signatures.  The first 2 args are used to
 -- aggregate; the remaining args are carried along unchanged inside the
 -- return structure for the use of the UDA finalizer.
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_final(bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_final(counters bytea)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea)
 RETURNS text
 AS $$
 select encode(MADLIB_SCHEMA.__cmsketch_final($1), 'base64');
 $$ LANGUAGE SQL;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8);
 /**
  *@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8,
  * or any column that can be cast to an int8.  It produces a base64 string
  * representing a CountMin sketch: a large array of counters that is intended
  * to be passed into a UDF like <c>cmsketch_width_histogram</c> described below.
  */
 CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/*+ column */ INT8)
 (
     sfunc = MADLIB_SCHEMA.__cmsketch_int8_trans,
     stype = bytea,
     finalfunc = MADLIB_SCHEMA.__cmsketch_base64_final,
 		m4_ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
     initcond = ''
 );

 /**
  @brief <c>cmsketch_count</c> is a scalar UDF to compute the approximate
  number of occurences of a value in a column summarized by a cmsketch.  Takes
  the results of the <c>cmsketch</c> aggregate as its first argument, and the
  desired value as the second.
  */
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_count(text, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_count(sketches64 text, val int8)
 RETURNS int8
 AS $$
     PythonFunctionBodyOnly(`sketch', `countmin')
     # MADlibSchema comes from PythonFunctionBodyOnly
     return countmin.count(sketches64, val)
 $$ LANGUAGE plpythonu;


 /**
  @brief <c>cmsketch_rangecount</c> is a scalar UDF to approximate the number
  of occurrences of values in the range <c>[lo,hi]</c> inclusive, given a
  cmsketch of a column.  Takes the results
  of the <c>cmsketch</c> aggregate as its first argument, and the desired range
  boundaries as the second and third.
  */
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(text, int8, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_rangecount(sketches64 text, bot int8, top int8)
 RETURNS int8
 AS $$
     PythonFunctionBodyOnly(`sketch', `countmin')
     # MADlibSchema comes from PythonFunctionBodyOnly
     return countmin.rangecount(sketches64, bot, top)
 $$ LANGUAGE plpythonu;

 /**
  @brief <c>cmsketch_centile</c> is a scalar UDF to compute a centile value
  from a cmsketch.  Takes the results of the <c>cmsketch</c> aggregate as its
  first argument, a number between 1 and 99 as the desired centile in the
  second, and the count of the column as the third.  Produces a value from the
  sketched column that is approximately at the centile's position in sorted
  order.
  */
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_centile(text, int8, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_centile(sketches64 text, centile int8, cnt int8)
 RETURNS int8
 AS $$
     PythonFunctionBodyOnly(`sketch', `countmin')
     # MADlibSchema comes from PythonFunctionBodyOnly
     return countmin.centile(sketches64, centile, cnt)
 $$ LANGUAGE plpythonu;

 /**
  @brief <c>cmsketch_median</c> is a scalar UDF to compute a median value
  from a cmsketch.  Takes the results of the <c>cmsketch</c> aggregate as its
  first argument, and the count as the second.  Produces a value from the
  sketched column that is approximately at the halfway position in sorted
  order.
  */
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_median(text, int8) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_median(sketches64 text, cnt int8)
 RETURNS int8
 AS $$
     PythonFunctionBodyOnly(`sketch', `countmin')
     # MADlibSchema comes from PythonFunctionBodyOnly
     return countmin.centile(sketches64, 50, cnt)
 $$ LANGUAGE plpythonu;

 /**
  \brief <c>cmsketch_width_histogram</c>  is a scalar UDF that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
  */
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(text, /*+ min */int8, /*+ max */int8, /*+ nbuckets */ int4) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(sketches64 text, themin int8, themax int8,  nbuckets int4)
 RETURNS text
 AS $$
     PythonFunctionBodyOnly(`sketch', `countmin')
     # MADlibSchema comes from PythonFunctionBodyOnly
     return countmin.width_histogram( sketches64, themin, themax, nbuckets)
 $$ LANGUAGE plpythonu;

 /** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a cmsketch and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.  Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
 */
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(text, /*+ nbuckets */ int4) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.cmsketch_depth_histogram(sketches64 text, nbuckets int4)
 RETURNS text
 AS $$
     PythonFunctionBodyOnly(`sketch', `countmin')
     # MADlibSchema comes from PythonFunctionBodyOnly
     return countmin.depth_histogram(sketches64, nbuckets)
 $$ LANGUAGE plpythonu;

 -- MFV Sketch functions

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea)
 RETURNS text[][]
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE;
 CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea)
 RETURNS bytea
 AS 'MODULE_PATHNAME'
 LANGUAGE C STRICT;

 CREATE FUNCTION MADLIB_SCHEMA.__sketch_rightmost_one(bytea, integer, integer)
 RETURNS integer AS 'MODULE_PATHNAME', 'sketch_rightmost_one' LANGUAGE C STRICT;

 CREATE FUNCTION MADLIB_SCHEMA.__sketch_leftmost_zero(bytea, integer, integer)
 RETURNS integer AS 'MODULE_PATHNAME', 'sketch_leftmost_zero' LANGUAGE C STRICT;

 CREATE FUNCTION MADLIB_SCHEMA.__sketch_array_set_bit_in_place(bytea, integer, integer, integer, integer)
 RETURNS bytea AS 'MODULE_PATHNAME', 'sketch_array_set_bit_in_place' LANGUAGE C STRICT;

 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram( anyelement, int4);
 /**
  * @brief Produces an n-bucket histogram for a column where each bucket counts
  * one of the most frequent values in the column. The output is an array of
  * doubles {value, count} in descending order of frequency; counts are
  * approximated via CountMin sketches. Ties are handled arbitrarily.
 */
 CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
 (
     sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
     stype = bytea,
     finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
     initcond = ''
 );

 DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4);
 /**
  * @brief On Postgres it works the same way as \ref mfvsketch_top_histogram but,
  * in Greenplum it does parallel aggregation to provide a "quick and dirty" answer.
 */
 CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
 (
     sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
     stype = bytea,
     finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
 		m4_ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,')
     initcond = ''
 );
	/* ----------------------------------------------------------------------- //*
	*
	* @file sketch.sql_in
	*
	* @brief SQL functions for sketch-based approximations of descriptive statistics
	* @date April 2011
	*
	* @sa For a brief introduction to sketches, see the module description grp_sketches
	*
	// ----------------------------------------------------------------------- */

	m4_include(`SQLCommon.m4')

	/**
	@addtogroup grp_sketches

	@about

	Sketches (sometimes called "synopsis data structures") are small randomized
	in-memory data structures that capture statistical properties of a large set
	of values (e.g. a column of a table). Sketches can be formed in a single
	pass of the data, and used to approximate a variety of descriptive statistics.

	We implement sketches as SQL User-Defined Aggregates (UDAs). Because they
	are single-pass, small-space and parallelized, a single query can
	use many sketches to gather summary statistics on many columns of a table efficiently.

	This module currently implements user-defined aggregates based on three main sketch methods:
	- <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>.
	- <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including
	- <c>COUNT(*)</c> of rows whose column value matches a given value in a set
	- <c>COUNT()</c> of rows whose column value falls in a range ()
	- order statistics including <i>median</i> and <i>centiles</i> (*)
	- <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*)
	- <i>Most Frequent Value (MFV)</i> sketches, which output the most
	frequently-occuring values in a column, along with their associated counts.

	<i>Note:</i> Features marked with a star (*) only work for discrete types that
	can be cast to int8.

	@implementation
	The sketch methods consists of a number of SQL UDAs (user defined aggregates)
	and UDFs (user defined functions), to be used directly in SQL queries.
	*/

	/**
	@addtogroup grp_fmsketch

	@about
	Flajolet-Martin's distinct count estimation
	implemented as a user-defined aggregate.

	@usage
	- Get the number of distinct values in a designated column.
	<pre>SELECT \ref fmsketch_dcount(<em>col_name</em>) FROM table_name;</pre>

	@implementation
	\ref fmsketch_dcount can be run on a column of any type.
	It returns an approximation to the number of distinct values
	(a la <c>COUNT(DISTINCT x)</c>), but faster and approximate.
	Like any aggregate, it can be combined with a GROUP BY clause to do distinct
	counts per group.

	@examp
	-# Generate some data:
	\verbatim
	sql> CREATE TABLE data(class INT, a1 INT);
	sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
	sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
	sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
	sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
	sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
	\endverbatim
	-# Find distinct number of values for each class
	\verbatim
	sql> SELECT class,fmsketch_dcount(a1) FROM data GROUP BY data.class;
	class \| fmsketch_dcount
	-------+-----------------
	2 \| 2
	1 \| 3
	(2 rows)
	\endverbatim

	@literature
	[1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. http://algo.inria.fr/flajolet/Publications/FlMa85.pdf

	@sa File sketch.sql_in documenting the SQL function.

	*/

	/**
	@addtogroup grp_countmin

	@about
	This module implements Cormode-Muthukrishnan <i>CountMin</i> sketches
	on integer values, implemented as a user-defined aggregate. It also provides
	scalar functions over the sketches to produce approximate counts, order
	statistics, and histograms.

	@usage

	- Get a sketch of a selected column specified by <em>col_name</em>.
	<pre>SELECT \ref cmsketch(<em>col_name</em>) FROM table_name;</pre>

	- Get the number of rows where <em>col_name = p</em>, computed from the sketch
	obtained from <tt>cmsketch</tt>.
	<pre>SELECT \ref cmsketch_count(<em>cmsketch</em>,<em>p</em>) FROM table_name;</pre>

	- Get the number of rows where <em>col_name</em> is between <em>m</em> and <em>n</em> inclusive.
	<pre>SELECT \ref cmsketch_rangecount(<em>cmsketch</em>,<em>m</em>,<em>n</em>) FROM table_name;</pre>

	- Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> specifies number of rows. <em>k</em> should be an integer between 1 to 99.
	<pre>SELECT \ref cmsketch_centile(<em>cmsketch</em>,<em>k</em>,<em>count</em>) FROM table_name;</pre>

	- Get the median of <em>col_name</em> where <em>count</em> specifies number of rows. This is equivalent to <tt>\ref cmsketch_centile(<em>cmsketch</em>,50,<em>count</em>)</tt>.
	<pre>SELECT \ref cmsketch_median(<em>cmsketch</em>,<em>count</em>) FROM table_name;</pre>

	- Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
	<pre>SELECT \ref cmsketch_width_histogram(<em>cmsketch</em>,<em>min</em>,<em>max</em>,<em>n</em>) FROM table_name;</pre>

	- Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
	<pre>SELECT \ref cmsketch_depth_histogram(<em>cmsketch</em>,<em>n</em>) FROM table_name;</pre>

	@examp

	-# Generate some data
	\verbatim
	sql> CREATE TABLE data(class INT, a1 INT);
	sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
	sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
	sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
	sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
	sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
	\endverbatim
	-# Count number of rows where a1 = 2 in each class
	\verbatim
	sql> SELECT class,cmsketch_count(cmsketch(a1),2) FROM data GROUP BY data.class;
	class \| cmsketch_count
	-------+----------------
	2 \| 0
	1 \| 15000
	(2 rows)
	\endverbatim
	-# Count number of rows where a1 is between 3 and 6
	\verbatim
	sql> SELECT class,cmsketch_rangecount(cmsketch(a1),3,6) FROM data GROUP BY data.class;
	class \| cmsketch_rangecount
	-------+---------------------
	2 \| 2000
	1 \| 10000
	(2 rows)
	\endverbatim
	-# Compute the 90th percentile of all of a1
	\verbatim
	sql> SELECT cmsketch_centile(cmsketch(a1),90,count(*)) FROM data;
	cmsketch_centile
	------------------
	3
	(1 row)
	\endverbatim
	-# Produce an equi-width histogram with 2 bins between 0 and 10
	\verbatim
	sql> SELECT cmsketch_width_histogram(cmsketch(a1),0,10,2) FROM data;
	cmsketch_width_histogram
	------------------------------------
	[[0L, 4L, 35000], [5L, 10L, 2000]]
	(1 row)
	\endverbatim
	-# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth
	\verbatim
	sql> SELECT cmsketch_depth_histogram(cmsketch(a1),2) FROM data;
	cmsketch_depth_histogram
	-----------------------------------------------------------------------
	[[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]]
	(1 row)
	\endverbatim

	@literature

	[1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 (2005) . http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html

	[2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html

	@sa File sketch.sql_in documenting the SQL functions.
	\n\n Module grp_quantile for a different implementation of quantile function.

	*/

	/**
	@addtogroup grp_mfvsketch

	@about
	MFVSketch: Most Frequent Values variant of CountMin sketch, implemented
	as a UDA.

	@usage
	Produces an n-bucket histogram for a column where each bucket counts one of the
	most frequent values in the column. The output is an array of doubles {value, count}
	in descending order of frequency; counts are approximated via CountMin sketches.
	Ties are handled arbitrarily.
	<pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>
	<pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre>

	The MFV frequent-value UDA comes in two different versions:
	- a faithful implementation that preserves the approximation guarantees
	of Cormode/Muthukrishnan,
	- and a "quick and dirty" version that can do parallel aggregation in Greenplum
	at the expense of missing some of the most frequent values.

	In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should
	produce good results unless the number of values requested is very small,
	or the distribution is very flat.

	@examp

	-# Generate some data
	\verbatim
	sql> CREATE TABLE data(class INT, a1 INT);
	sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
	sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
	sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
	sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
	sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
	\endverbatim
	-# Produce histogram of 5 bins and return the most frequent value and associated
	count in each bin:
	\verbatim
	sql> SELECT mfvsketch_top_histogram(a1,5) FROM data;

	mfvsketch_top_histogram
	--------------------------------------------------------------
	[0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}}
	(1 row)
	\endverbatim

	@literature
	This method is not usually called an MFV sketch in the literature; it
	is a natural extension of the CountMin sketch.

	@sa File sketch.sql_in documenting the SQL functions.
	\n\n Module grp_countmin.
	*/

	-- FM Sketch Functions
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea)
	RETURNS int8
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement);

	/**
	* @brief Flajolet-Martin's distinct count estimation
	* @param column name
	*/
	CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/+ column / anyelement)
	(
	sfunc = MADLIB_SCHEMA.__fmsketch_trans,
	stype = bytea,
	finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct,
	m4_ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__fmsketch_merge,')
	initcond = ''
	);


	-- CM Sketch Functions

	-- We register __cmsketch_int8_trans for varying numbers of arguments to support
	-- a variety of agg function signatures. The first 2 args are used to
	-- aggregate; the remaining args are carried along unchanged inside the
	-- return structure for the use of the UDA finalizer.
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_final(bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_final(counters bytea)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea)
	RETURNS text
	AS $$
	select encode(MADLIB_SCHEMA.__cmsketch_final($1), 'base64');
	$$ LANGUAGE SQL;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8);
	/**
	*@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8,
	* or any column that can be cast to an int8. It produces a base64 string
	* representing a CountMin sketch: a large array of counters that is intended
	* to be passed into a UDF like <c>cmsketch_width_histogram</c> described below.
	*/
	CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/+ column / INT8)
	(
	sfunc = MADLIB_SCHEMA.__cmsketch_int8_trans,
	stype = bytea,
	finalfunc = MADLIB_SCHEMA.__cmsketch_base64_final,
	m4_ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
	initcond = ''
	);

	/**
	@brief <c>cmsketch_count</c> is a scalar UDF to compute the approximate
	number of occurences of a value in a column summarized by a cmsketch. Takes
	the results of the <c>cmsketch</c> aggregate as its first argument, and the
	desired value as the second.
	*/
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_count(text, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.cmsketch_count(sketches64 text, val int8)
	RETURNS int8
	AS $$
	PythonFunctionBodyOnly(`sketch', `countmin')
	# MADlibSchema comes from PythonFunctionBodyOnly
	return countmin.count(sketches64, val)
	$$ LANGUAGE plpythonu;


	/**
	@brief <c>cmsketch_rangecount</c> is a scalar UDF to approximate the number
	of occurrences of values in the range <c>[lo,hi]</c> inclusive, given a
	cmsketch of a column. Takes the results
	of the <c>cmsketch</c> aggregate as its first argument, and the desired range
	boundaries as the second and third.
	*/
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(text, int8, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.cmsketch_rangecount(sketches64 text, bot int8, top int8)
	RETURNS int8
	AS $$
	PythonFunctionBodyOnly(`sketch', `countmin')
	# MADlibSchema comes from PythonFunctionBodyOnly
	return countmin.rangecount(sketches64, bot, top)
	$$ LANGUAGE plpythonu;

	/**
	@brief <c>cmsketch_centile</c> is a scalar UDF to compute a centile value
	from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its
	first argument, a number between 1 and 99 as the desired centile in the
	second, and the count of the column as the third. Produces a value from the
	sketched column that is approximately at the centile's position in sorted
	order.
	*/
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_centile(text, int8, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.cmsketch_centile(sketches64 text, centile int8, cnt int8)
	RETURNS int8
	AS $$
	PythonFunctionBodyOnly(`sketch', `countmin')
	# MADlibSchema comes from PythonFunctionBodyOnly
	return countmin.centile(sketches64, centile, cnt)
	$$ LANGUAGE plpythonu;

	/**
	@brief <c>cmsketch_median</c> is a scalar UDF to compute a median value
	from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its
	first argument, and the count as the second. Produces a value from the
	sketched column that is approximately at the halfway position in sorted
	order.
	*/
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_median(text, int8) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.cmsketch_median(sketches64 text, cnt int8)
	RETURNS int8
	AS $$
	PythonFunctionBodyOnly(`sketch', `countmin')
	# MADlibSchema comes from PythonFunctionBodyOnly
	return countmin.centile(sketches64, 50, cnt)
	$$ LANGUAGE plpythonu;

	/**
	\brief <c>cmsketch_width_histogram</c> is a scalar UDF that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
	*/
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(text, /+ min /int8, /+ max /int8, /+ nbuckets / int4) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(sketches64 text, themin int8, themax int8, nbuckets int4)
	RETURNS text
	AS $$
	PythonFunctionBodyOnly(`sketch', `countmin')
	# MADlibSchema comes from PythonFunctionBodyOnly
	return countmin.width_histogram( sketches64, themin, themax, nbuckets)
	$$ LANGUAGE plpythonu;

	/** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a cmsketch and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
	*/
	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(text, /+ nbuckets / int4) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.cmsketch_depth_histogram(sketches64 text, nbuckets int4)
	RETURNS text
	AS $$
	PythonFunctionBodyOnly(`sketch', `countmin')
	# MADlibSchema comes from PythonFunctionBodyOnly
	return countmin.depth_histogram(sketches64, nbuckets)
	$$ LANGUAGE plpythonu;

	-- MFV Sketch functions

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea)
	RETURNS text[][]
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE;
	CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea)
	RETURNS bytea
	AS 'MODULE_PATHNAME'
	LANGUAGE C STRICT;

	CREATE FUNCTION MADLIB_SCHEMA.__sketch_rightmost_one(bytea, integer, integer)
	RETURNS integer AS 'MODULE_PATHNAME', 'sketch_rightmost_one' LANGUAGE C STRICT;

	CREATE FUNCTION MADLIB_SCHEMA.__sketch_leftmost_zero(bytea, integer, integer)
	RETURNS integer AS 'MODULE_PATHNAME', 'sketch_leftmost_zero' LANGUAGE C STRICT;

	CREATE FUNCTION MADLIB_SCHEMA.__sketch_array_set_bit_in_place(bytea, integer, integer, integer, integer)
	RETURNS bytea AS 'MODULE_PATHNAME', 'sketch_array_set_bit_in_place' LANGUAGE C STRICT;

	DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram( anyelement, int4);
	/**
	* @brief Produces an n-bucket histogram for a column where each bucket counts
	* one of the most frequent values in the column. The output is an array of
	* doubles {value, count} in descending order of frequency; counts are
	* approximated via CountMin sketches. Ties are handled arbitrarily.
	*/
	CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/+ column / anyelement, /+ number_of_buckets / int4)
	(
	sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
	stype = bytea,
	finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
	initcond = ''
	);

	DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4);
	/**
	* @brief On Postgres it works the same way as \ref mfvsketch_top_histogram but,
	* in Greenplum it does parallel aggregation to provide a "quick and dirty" answer.
	*/
	CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(/+ column / anyelement, /+ number_of_buckets / int4)
	(
	sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
	stype = bytea,
	finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
	m4_ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,')
	initcond = ''
	);