| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file sketch.sql_in |
| * |
| * @brief SQL functions for sketch-based approximations of descriptive statistics |
| * @date April 2011 |
| * |
| * @sa For a brief introduction to sketches, see the module description grp_sketches |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_sketches |
| |
| @about |
| |
| Sketches (sometimes called "synopsis data structures") are small randomized |
| in-memory data structures that capture statistical properties of a large set |
| of values (e.g. a column of a table). Sketches can be formed in a single |
| pass of the data, and used to approximate a variety of descriptive statistics. |
| |
| We implement sketches as SQL User-Defined Aggregates (UDAs). Because they |
| are single-pass, small-space and parallelized, a single query can |
| use many sketches to gather summary statistics on many columns of a table efficiently. |
| |
| This module currently implements user-defined aggregates based on three main sketch methods: |
| - <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>. |
| - <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including |
| - <c>COUNT(*)</c> of rows whose column value matches a given value in a set |
| - <c>COUNT(*)</c> of rows whose column value falls in a range (*) |
| - order statistics including <i>median</i> and <i>centiles</i> (*) |
| - <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*) |
| - <i>Most Frequent Value (MFV)</i> sketches, which output the most |
| frequently-occuring values in a column, along with their associated counts. |
| |
| <i>Note:</i> Features marked with a star (*) only work for discrete types that |
| can be cast to int8. |
| |
| @implementation |
| The sketch methods consists of a number of SQL UDAs (user defined aggregates) |
| and UDFs (user defined functions), to be used directly in SQL queries. |
| */ |
| |
| /** |
| @addtogroup grp_fmsketch |
| |
| @about |
| Flajolet-Martin's distinct count estimation |
| implemented as a user-defined aggregate. |
| |
| @usage |
| - Get the number of distinct values in a designated column. |
| <pre>SELECT \ref fmsketch_dcount(<em>col_name</em>) FROM table_name;</pre> |
| |
| @implementation |
| \ref fmsketch_dcount can be run on a column of any type. |
| It returns an approximation to the number of distinct values |
| (a la <c>COUNT(DISTINCT x)</c>), but faster and approximate. |
| Like any aggregate, it can be combined with a GROUP BY clause to do distinct |
| counts per group. |
| |
| @examp |
| -# Generate some data: |
| \verbatim |
| sql> CREATE TABLE data(class INT, a1 INT); |
| sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); |
| sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); |
| sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); |
| sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); |
| sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000); |
| \endverbatim |
| -# Find distinct number of values for each class |
| \verbatim |
| sql> SELECT class,fmsketch_dcount(a1) FROM data GROUP BY data.class; |
| class | fmsketch_dcount |
| -------+----------------- |
| 2 | 2 |
| 1 | 3 |
| (2 rows) |
| \endverbatim |
| |
| @literature |
| [1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. http://algo.inria.fr/flajolet/Publications/FlMa85.pdf |
| |
| @sa File sketch.sql_in documenting the SQL function. |
| |
| */ |
| |
| /** |
| @addtogroup grp_countmin |
| |
| @about |
| This module implements Cormode-Muthukrishnan <i>CountMin</i> sketches |
| on integer values, implemented as a user-defined aggregate. It also provides |
| scalar functions over the sketches to produce approximate counts, order |
| statistics, and histograms. |
| |
| @usage |
| |
| - Get a sketch of a selected column specified by <em>col_name</em>. |
| <pre>SELECT \ref cmsketch(<em>col_name</em>) FROM table_name;</pre> |
| |
| - Get the number of rows where <em>col_name = p</em>, computed from the sketch |
| obtained from <tt>cmsketch</tt>. |
| <pre>SELECT \ref cmsketch_count(<em>cmsketch</em>,<em>p</em>) FROM table_name;</pre> |
| |
| - Get the number of rows where <em>col_name</em> is between <em>m</em> and <em>n</em> inclusive. |
| <pre>SELECT \ref cmsketch_rangecount(<em>cmsketch</em>,<em>m</em>,<em>n</em>) FROM table_name;</pre> |
| |
| - Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> specifies number of rows. <em>k</em> should be an integer between 1 to 99. |
| <pre>SELECT \ref cmsketch_centile(<em>cmsketch</em>,<em>k</em>,<em>count</em>) FROM table_name;</pre> |
| |
| - Get the median of <em>col_name</em> where <em>count</em> specifies number of rows. This is equivalent to <tt>\ref cmsketch_centile(<em>cmsketch</em>,50,<em>count</em>)</tt>. |
| <pre>SELECT \ref cmsketch_median(<em>cmsketch</em>,<em>count</em>) FROM table_name;</pre> |
| |
| - Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. |
| <pre>SELECT \ref cmsketch_width_histogram(<em>cmsketch</em>,<em>min</em>,<em>max</em>,<em>n</em>) FROM table_name;</pre> |
| |
| - Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles. |
| <pre>SELECT \ref cmsketch_depth_histogram(<em>cmsketch</em>,<em>n</em>) FROM table_name;</pre> |
| |
| @examp |
| |
| -# Generate some data |
| \verbatim |
| sql> CREATE TABLE data(class INT, a1 INT); |
| sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); |
| sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); |
| sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); |
| sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); |
| sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000); |
| \endverbatim |
| -# Count number of rows where a1 = 2 in each class |
| \verbatim |
| sql> SELECT class,cmsketch_count(cmsketch(a1),2) FROM data GROUP BY data.class; |
| class | cmsketch_count |
| -------+---------------- |
| 2 | 0 |
| 1 | 15000 |
| (2 rows) |
| \endverbatim |
| -# Count number of rows where a1 is between 3 and 6 |
| \verbatim |
| sql> SELECT class,cmsketch_rangecount(cmsketch(a1),3,6) FROM data GROUP BY data.class; |
| class | cmsketch_rangecount |
| -------+--------------------- |
| 2 | 2000 |
| 1 | 10000 |
| (2 rows) |
| \endverbatim |
| -# Compute the 90th percentile of all of a1 |
| \verbatim |
| sql> SELECT cmsketch_centile(cmsketch(a1),90,count(*)) FROM data; |
| cmsketch_centile |
| ------------------ |
| 3 |
| (1 row) |
| \endverbatim |
| -# Produce an equi-width histogram with 2 bins between 0 and 10 |
| \verbatim |
| sql> SELECT cmsketch_width_histogram(cmsketch(a1),0,10,2) FROM data; |
| cmsketch_width_histogram |
| ------------------------------------ |
| [[0L, 4L, 35000], [5L, 10L, 2000]] |
| (1 row) |
| \endverbatim |
| -# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth |
| \verbatim |
| sql> SELECT cmsketch_depth_histogram(cmsketch(a1),2) FROM data; |
| cmsketch_depth_histogram |
| ----------------------------------------------------------------------- |
| [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]] |
| (1 row) |
| \endverbatim |
| |
| @literature |
| |
| [1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 (2005) . http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html |
| |
| [2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html |
| |
| @sa File sketch.sql_in documenting the SQL functions. |
| \n\n Module grp_quantile for a different implementation of quantile function. |
| |
| */ |
| |
| /** |
| @addtogroup grp_mfvsketch |
| |
| @about |
| MFVSketch: Most Frequent Values variant of CountMin sketch, implemented |
| as a UDA. |
| |
| @usage |
| Produces an n-bucket histogram for a column where each bucket counts one of the |
| most frequent values in the column. The output is an array of doubles {value, count} |
| in descending order of frequency; counts are approximated via CountMin sketches. |
| Ties are handled arbitrarily. |
| <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre> |
| <pre>SELECT \ref mfvsketch_top_histogram(<em>col_name</em>,n) FROM table_name;</pre> |
| |
| The MFV frequent-value UDA comes in two different versions: |
| - a faithful implementation that preserves the approximation guarantees |
| of Cormode/Muthukrishnan, |
| - and a "quick and dirty" version that can do parallel aggregation in Greenplum |
| at the expense of missing some of the most frequent values. |
| |
| In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should |
| produce good results unless the number of values requested is very small, |
| or the distribution is very flat. |
| |
| @examp |
| |
| -# Generate some data |
| \verbatim |
| sql> CREATE TABLE data(class INT, a1 INT); |
| sql> INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); |
| sql> INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); |
| sql> INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); |
| sql> INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); |
| sql> INSERT INTO data SELECT 2,6 FROM generate_series(1,1000); |
| \endverbatim |
| -# Produce histogram of 5 bins and return the most frequent value and associated |
| count in each bin: |
| \verbatim |
| sql> SELECT mfvsketch_top_histogram(a1,5) FROM data; |
| |
| mfvsketch_top_histogram |
| -------------------------------------------------------------- |
| [0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}} |
| (1 row) |
| \endverbatim |
| |
| @literature |
| This method is not usually called an MFV sketch in the literature; it |
| is a natural extension of the CountMin sketch. |
| |
| @sa File sketch.sql_in documenting the SQL functions. |
| \n\n Module grp_countmin. |
| */ |
| |
| -- FM Sketch Functions |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) |
| RETURNS int8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement); |
| |
| /** |
| * @brief Flajolet-Martin's distinct count estimation |
| * @param column name |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement) |
| ( |
| sfunc = MADLIB_SCHEMA.__fmsketch_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct, |
| m4_ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__fmsketch_merge,') |
| initcond = '' |
| ); |
| |
| |
| -- CM Sketch Functions |
| |
| -- We register __cmsketch_int8_trans for varying numbers of arguments to support |
| -- a variety of agg function signatures. The first 2 args are used to |
| -- aggregate; the remaining args are carried along unchanged inside the |
| -- return structure for the use of the UDA finalizer. |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_final(bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_final(counters bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea) |
| RETURNS text |
| AS $$ |
| select encode(MADLIB_SCHEMA.__cmsketch_final($1), 'base64'); |
| $$ LANGUAGE SQL; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8); |
| /** |
| *@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8, |
| * or any column that can be cast to an int8. It produces a base64 string |
| * representing a CountMin sketch: a large array of counters that is intended |
| * to be passed into a UDF like <c>cmsketch_width_histogram</c> described below. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/*+ column */ INT8) |
| ( |
| sfunc = MADLIB_SCHEMA.__cmsketch_int8_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__cmsketch_base64_final, |
| m4_ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,') |
| initcond = '' |
| ); |
| |
| /** |
| @brief <c>cmsketch_count</c> is a scalar UDF to compute the approximate |
| number of occurences of a value in a column summarized by a cmsketch. Takes |
| the results of the <c>cmsketch</c> aggregate as its first argument, and the |
| desired value as the second. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_count(text, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_count(sketches64 text, val int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnly(`sketch', `countmin') |
| # MADlibSchema comes from PythonFunctionBodyOnly |
| return countmin.count(sketches64, val) |
| $$ LANGUAGE plpythonu; |
| |
| |
| /** |
| @brief <c>cmsketch_rangecount</c> is a scalar UDF to approximate the number |
| of occurrences of values in the range <c>[lo,hi]</c> inclusive, given a |
| cmsketch of a column. Takes the results |
| of the <c>cmsketch</c> aggregate as its first argument, and the desired range |
| boundaries as the second and third. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(text, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_rangecount(sketches64 text, bot int8, top int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnly(`sketch', `countmin') |
| # MADlibSchema comes from PythonFunctionBodyOnly |
| return countmin.rangecount(sketches64, bot, top) |
| $$ LANGUAGE plpythonu; |
| |
| /** |
| @brief <c>cmsketch_centile</c> is a scalar UDF to compute a centile value |
| from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its |
| first argument, a number between 1 and 99 as the desired centile in the |
| second, and the count of the column as the third. Produces a value from the |
| sketched column that is approximately at the centile's position in sorted |
| order. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_centile(text, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_centile(sketches64 text, centile int8, cnt int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnly(`sketch', `countmin') |
| # MADlibSchema comes from PythonFunctionBodyOnly |
| return countmin.centile(sketches64, centile, cnt) |
| $$ LANGUAGE plpythonu; |
| |
| /** |
| @brief <c>cmsketch_median</c> is a scalar UDF to compute a median value |
| from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its |
| first argument, and the count as the second. Produces a value from the |
| sketched column that is approximately at the halfway position in sorted |
| order. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_median(text, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_median(sketches64 text, cnt int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnly(`sketch', `countmin') |
| # MADlibSchema comes from PythonFunctionBodyOnly |
| return countmin.centile(sketches64, 50, cnt) |
| $$ LANGUAGE plpythonu; |
| |
| /** |
| \brief <c>cmsketch_width_histogram</c> is a scalar UDF that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(text, /*+ min */int8, /*+ max */int8, /*+ nbuckets */ int4) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(sketches64 text, themin int8, themax int8, nbuckets int4) |
| RETURNS text |
| AS $$ |
| PythonFunctionBodyOnly(`sketch', `countmin') |
| # MADlibSchema comes from PythonFunctionBodyOnly |
| return countmin.width_histogram( sketches64, themin, themax, nbuckets) |
| $$ LANGUAGE plpythonu; |
| |
| /** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a cmsketch and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(text, /*+ nbuckets */ int4) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_depth_histogram(sketches64 text, nbuckets int4) |
| RETURNS text |
| AS $$ |
| PythonFunctionBodyOnly(`sketch', `countmin') |
| # MADlibSchema comes from PythonFunctionBodyOnly |
| return countmin.depth_histogram(sketches64, nbuckets) |
| $$ LANGUAGE plpythonu; |
| |
| -- MFV Sketch functions |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea) |
| RETURNS text[][] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.__sketch_rightmost_one(bytea, integer, integer) |
| RETURNS integer AS 'MODULE_PATHNAME', 'sketch_rightmost_one' LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.__sketch_leftmost_zero(bytea, integer, integer) |
| RETURNS integer AS 'MODULE_PATHNAME', 'sketch_leftmost_zero' LANGUAGE C STRICT; |
| |
| CREATE FUNCTION MADLIB_SCHEMA.__sketch_array_set_bit_in_place(bytea, integer, integer, integer, integer) |
| RETURNS bytea AS 'MODULE_PATHNAME', 'sketch_array_set_bit_in_place' LANGUAGE C STRICT; |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram( anyelement, int4); |
| /** |
| * @brief Produces an n-bucket histogram for a column where each bucket counts |
| * one of the most frequent values in the column. The output is an array of |
| * doubles {value, count} in descending order of frequency; counts are |
| * approximated via CountMin sketches. Ties are handled arbitrarily. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4) |
| ( |
| sfunc = MADLIB_SCHEMA.__mfvsketch_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__mfvsketch_final, |
| initcond = '' |
| ); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4); |
| /** |
| * @brief On Postgres it works the same way as \ref mfvsketch_top_histogram but, |
| * in Greenplum it does parallel aggregation to provide a "quick and dirty" answer. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4) |
| ( |
| sfunc = MADLIB_SCHEMA.__mfvsketch_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__mfvsketch_final, |
| m4_ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,') |
| initcond = '' |
| ); |