| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file sketch.sql_in |
| * |
| * @brief SQL functions for sketch-based approximations of descriptive statistics |
| * @date April 2011 |
| * |
| * @sa For a brief introduction to sketches, see the module description grp_sketches |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_sketches |
| @brief A collection of methods to estimate the number of unique values contained in the data. |
| |
| \warning <em> These MADlib methods are still in early stage development. There may be some |
| issues that will be addressed in future versions. Interface and implementation |
| is subject to change. </em> |
| |
| Sketches (sometimes called "synopsis data structures") are small randomized |
| in-memory data structures that capture statistical properties of a large set |
| of values (e.g., a column of a table). Sketches can be formed in a single |
| pass of the data, and used to approximate a variety of descriptive statistics. |
| |
| We implement sketches as SQL User-Defined Aggregates (UDAs). Because they |
| are single-pass, small-space and parallelized, a single query can |
| use many sketches to gather summary statistics on many columns of a table efficiently. |
| |
| This module currently implements user-defined aggregates based on three main sketch methods: |
| - <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including |
| - <c>COUNT(*)</c> of rows whose column value matches a given value in a set |
| - <c>COUNT(*)</c> of rows whose column value falls in a range (*) |
| - order statistics including <i>median</i> and <i>centiles</i> (*) |
| - <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*) |
| - <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>. |
| - <i>Most Frequent Value (MFV)</i> sketches, which output the most |
| frequently-occuring values in a column, along with their associated counts. |
| |
| <i>Note:</i> Features marked with a star (*) only work for discrete types that |
| can be cast to int8. |
| |
| The sketch methods consist of a number of SQL UDAs (user-defined aggregates) |
| and UDFs (user-defined functions), to be used directly in SQL queries. |
| */ |
| |
| /** |
| @addtogroup grp_fmsketch |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#syntax">Syntax</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| \warning <em> This MADlib method is still in early stage development. There may be some |
| issues that will be addressed in a future version. Interface and implementation |
| is subject to change. </em> |
| |
| |
| @brief Implements Flajolet-Martin's distinct count estimation |
| as a user-defined aggregate. |
| |
| \ref fmsketch_dcount can be run on a column of any type. |
| It returns an approximation to the number of distinct values |
| (a la <c>COUNT(DISTINCT x)</c>), but faster and approximate. |
| Like any aggregate, it can be combined with a GROUP BY clause to do distinct |
| counts per group. |
| |
| @anchor syntax |
| @par Syntax |
| |
| Get the number of distinct values in a designated column. |
| <pre class="syntax"> |
| fmsketch_dcount( col_name ) |
| </pre> |
| |
| @anchor examples |
| @examp |
| -# Generate some data. |
| <pre class="example"> |
| CREATE TABLE data(class INT, a1 INT); |
| INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); |
| INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); |
| INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); |
| INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); |
| INSERT INTO data SELECT 2,6 FROM generate_series(1,1000); |
| </pre> |
| |
| -# Find the distinct number of values for each class. |
| <pre class="example"> |
| SELECT class, fmsketch_dcount(a1) |
| FROM data |
| GROUP BY data.class; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | fmsketch_dcount |
| ------+----------------- |
| 2 | 2 |
| 1 | 3 |
| (2 rows) |
| </pre> |
| |
| @anchor literature |
| @literature |
| [1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. http://algo.inria.fr/flajolet/Publications/FlMa85.pdf |
| |
| @anchor related |
| @par Related Topics |
| File sketch.sql_in documenting the SQL function. |
| |
| */ |
| |
| /** |
| @addtogroup grp_countmin |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#syntax">Syntax</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| \warning <em> This MADlib method is still in early stage development. There may be some |
| issues that will be addressed in a future version. Interface and implementation |
| is subject to change. </em> |
| |
| @brief Implements Cormode-Mathukrishnan <i>CountMin</i> sketches on integer |
| values as a user-defined aggregate. |
| |
| This module implements Cormode-Muthukrishnan <i>CountMin</i> sketches |
| on integer values, implemented as a user-defined aggregate. It also provides |
| scalar functions over the sketches to produce approximate counts, order |
| statistics, and histograms. |
| |
| @anchor syntax |
| @par Syntax |
| - Get a sketch of a selected column specified by <em>col_name</em>. |
| <pre class="syntax"> |
| cmsketch( col_name ) |
| </pre> |
| |
| - Get the number of rows where <em>col_name = p</em>, computed from the sketch |
| obtained from <tt>cmsketch</tt>. |
| <pre class="syntax"> |
| cmsketch_count( cmsketch, |
| p |
| ) |
| </pre> |
| |
| - Get the number of rows where <em>col_name</em> is between <em>m</em> and <em>n</em> inclusive. |
| <pre class="syntax"> |
| cmsketch_rangecount( cmsketch, |
| m, |
| n |
| ) |
| </pre> |
| |
| - Get the <em>k</em>th percentile of <em>col_name</em> where <em>count</em> specifies number of rows. <em>k</em> should be an integer between 1 to 99. |
| <pre class="syntax"> |
| cmsketch_centile( cmsketch, |
| k, |
| count |
| ) |
| </pre> |
| |
| - Get the median of col_name where <em>count</em> specifies number of rows. This is equivalent to <tt>\ref cmsketch_centile(<em>cmsketch</em>,50,<em>count</em>)</tt>. |
| <pre class="syntax"> |
| cmsketch_median( cmsketch, |
| count |
| ) |
| </pre> |
| |
| - Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. |
| <pre class="syntax"> |
| cmsketch_width_histogram( cmsketch, |
| min, |
| max, |
| n |
| ) |
| </pre> |
| |
| - Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles. |
| <pre class="syntax"> |
| cmsketch_depth_histogram( cmsketch, |
| n |
| ) |
| </pre> |
| |
| @anchor examples |
| @examp |
| |
| -# Generate some data. |
| <pre class="example"> |
| CREATE TABLE data(class INT, a1 INT); |
| INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); |
| INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); |
| INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); |
| INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); |
| INSERT INTO data SELECT 2,6 FROM generate_series(1,1000); |
| </pre> |
| |
| -# Count number of rows where a1 = 2 in each class. |
| <pre class="example"> |
| SELECT class, |
| cmsketch_count( |
| cmsketch( a1 ), |
| 2 |
| ) |
| FROM data GROUP BY data.class; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | cmsketch_count |
| ------+---------------- |
| 2 | 0 |
| 1 | 15000 |
| (2 rows) |
| </pre> |
| |
| -# Count number of rows where a1 is between 3 and 6. |
| <pre class="example"> |
| SELECT class, |
| cmsketch_rangecount( |
| cmsketch(a1), |
| 3, |
| 6 |
| ) |
| FROM data GROUP BY data.class; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | cmsketch_rangecount |
| ------+--------------------- |
| 2 | 2000 |
| 1 | 10000 |
| (2 rows) |
| </pre> |
| |
| -# Compute the 90th percentile of all of a1. |
| <pre class="example"> |
| SELECT cmsketch_centile( |
| cmsketch( a1 ), |
| 90, |
| count(*) |
| ) |
| FROM data; |
| </pre> |
| Result: |
| <pre class="result"> |
| cmsketch_centile |
| ----------------- |
| 3 |
| (1 row) |
| </pre> |
| |
| -# Produce an equi-width histogram with 2 bins between 0 and 10. |
| <pre class="example"> |
| SELECT cmsketch_width_histogram( |
| cmsketch( a1 ), |
| 0, |
| 10, |
| 2 |
| ) |
| FROM data; |
| </pre> |
| Result: |
| <pre class="result"> |
| cmsketch_width_histogram |
| ----------------------------------- |
| [[0L, 4L, 35000], [5L, 10L, 2000]] |
| (1 row) |
| </pre> |
| |
| -# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth. |
| <pre class="example"> |
| SELECT cmsketch_depth_histogram( |
| cmsketch( a1 ), |
| 2 |
| ) |
| FROM data; |
| </pre> |
| Result: |
| <pre class="result"> |
| cmsketch_depth_histogram |
| ---------------------------------------------------------------------- |
| [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]] |
| (1 row) |
| </pre> |
| |
| @anchor literature |
| @literature |
| |
| [1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 (2005) . http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html |
| |
| [2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html |
| |
| @anchor related |
| File sketch.sql_in documenting the SQL functions. |
| |
| Module \ref grp_quantile for a different implementation of quantile function. |
| |
| */ |
| |
| /** |
| @addtogroup grp_mfvsketch |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#syntax">Syntax</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| \warning <em> This MADlib method is still in early stage development. There may be some |
| issues that will be addressed in a future version. Interface and implementation |
| is subject to change. </em> |
| |
| @brief Implements the most frequent values variant of the CountMin sketch as a user-defined aggregate. |
| |
| MFVSketch: Most Frequent Values variant of CountMin sketch, implemented |
| as a UDA. |
| |
| Produces an n-bucket histogram for a column where each bucket counts one of the |
| most frequent values in the column. The output is an array of doubles {value, count} |
| in descending order of frequency; counts are approximated via CountMin sketches. |
| Ties are handled arbitrarily. |
| |
| @anchor syntax |
| |
| <pre class="syntax"> |
| mfvsketch_top_histogram( col_name, |
| n |
| ) |
| </pre> |
| |
| The MFV frequent-value UDA comes in two different versions: |
| - a faithful implementation that preserves the approximation guarantees |
| of Cormode/Muthukrishnan, |
| - and a "quick and dirty" version that can do parallel aggregation in Greenplum |
| at the expense of missing some of the most frequent values. |
| |
| In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should |
| produce good results unless the number of values requested is very small, |
| or the distribution is very flat. |
| |
| @anchor examples |
| @examp |
| |
| -# Generate some data. |
| <pre class="example"> |
| CREATE TABLE data(class INT, a1 INT); |
| INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); |
| INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); |
| INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); |
| INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); |
| INSERT INTO data SELECT 2,6 FROM generate_series(1,1000); |
| </pre> |
| |
| -# Produce a histogram of 5 bins and return the most frequent value and associated |
| count in each bin. |
| <pre class="example"> |
| SELECT mfvsketch_top_histogram( a1, |
| 5 |
| ) |
| FROM data; |
| </pre> |
| Result: |
| <pre class="result"> |
| mfvsketch_top_histogram |
| ------------------------------------------------------------- |
| [0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}} |
| (1 row) |
| </pre> |
| |
| @anchor literature |
| @literature |
| This method is not usually called an MFV sketch in the literature; it |
| is a natural extension of the CountMin sketch. |
| |
| @anchor related |
| @par Related Topics |
| |
| File sketch.sql_in documenting the SQL functions. |
| |
| Module \ref grp_countmin. |
| */ |
| |
| -- FM Sketch Functions |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) |
| RETURNS int8 |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement); |
| |
| /** |
| * @brief Flajolet-Martin's distinct count estimation |
| * @param column name |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement) |
| ( |
| sfunc = MADLIB_SCHEMA.__fmsketch_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc = MADLIB_SCHEMA.__fmsketch_merge,') |
| initcond = '' |
| ); |
| |
| |
| -- CM Sketch Functions |
| |
| -- We register __cmsketch_int8_trans for varying numbers of arguments to support |
| -- a variety of agg function signatures. The first 2 args are used to |
| -- aggregate; the remaining args are carried along unchanged inside the |
| -- return structure for the use of the UDA finalizer. |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea) |
| RETURNS text |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8); |
| /** |
| *@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8, |
| * or any column that can be cast to an int8. It produces a base64 string |
| * representing a CountMin sketch: a large array of counters that is intended |
| * to be passed into a UDF like <c>cmsketch_width_histogram</c> described below. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/*+ column */ INT8) |
| ( |
| sfunc = MADLIB_SCHEMA.__cmsketch_int8_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__cmsketch_base64_final, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,') |
| initcond = '' |
| ); |
| |
| /** |
| @brief <c>cmsketch_count</c> is a scalar UDF to compute the approximate |
| number of occurences of a value in a column summarized by a cmsketch. Takes |
| the results of the <c>cmsketch</c> aggregate as its first argument, and the |
| desired value as the second. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_count(text, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_count(sketches64 text, val int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnlyNoSchema(`sketch', `countmin') |
| return countmin.count(sketches64, val) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| |
| /** |
| @brief <c>cmsketch_rangecount</c> is a scalar UDF to approximate the number |
| of occurrences of values in the range <c>[lo,hi]</c> inclusive, given a |
| cmsketch of a column. Takes the results |
| of the <c>cmsketch</c> aggregate as its first argument, and the desired range |
| boundaries as the second and third. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(text, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_rangecount(sketches64 text, bot int8, top int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnlyNoSchema(`sketch', `countmin') |
| return countmin.rangecount(sketches64, bot, top) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** |
| @brief <c>cmsketch_centile</c> is a scalar UDF to compute a centile value |
| from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its |
| first argument, a number between 1 and 99 as the desired centile in the |
| second, and the count of the column as the third. Produces a value from the |
| sketched column that is approximately at the centile's position in sorted |
| order. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_centile(text, int8, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_centile(sketches64 text, centile int8, cnt int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnlyNoSchema(`sketch', `countmin') |
| return countmin.centile(sketches64, centile, cnt) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** |
| @brief <c>cmsketch_median</c> is a scalar UDF to compute a median value |
| from a cmsketch. Takes the results of the <c>cmsketch</c> aggregate as its |
| first argument, and the count as the second. Produces a value from the |
| sketched column that is approximately at the halfway position in sorted |
| order. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_median(text, int8) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_median(sketches64 text, cnt int8) |
| RETURNS int8 |
| AS $$ |
| PythonFunctionBodyOnlyNoSchema(`sketch', `countmin') |
| return countmin.centile(sketches64, 50, cnt) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** |
| \brief <c>cmsketch_width_histogram</c> is a scalar UDF that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(text, /*+ min */int8, /*+ max */int8, /*+ nbuckets */ int4) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(sketches64 text, themin int8, themax int8, nbuckets int4) |
| RETURNS text |
| AS $$ |
| PythonFunctionBodyOnlyNoSchema(`sketch', `countmin') |
| # schema_madlib comes from PythonFunctionBodyOnly |
| return countmin.width_histogram( sketches64, themin, themax, nbuckets) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| /** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a cmsketch and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles. |
| */ |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(text, /*+ nbuckets */ int4) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.cmsketch_depth_histogram(sketches64 text, nbuckets int4) |
| RETURNS text |
| AS $$ |
| PythonFunctionBodyOnlyNoSchema(`sketch', `countmin') |
| return countmin.depth_histogram(sketches64, nbuckets) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| -- MFV Sketch functions |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea) |
| RETURNS text[][] |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE; |
| CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) |
| RETURNS bytea |
| AS 'MODULE_PATHNAME' |
| LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sketch_rightmost_one(bytea, integer, integer) |
| RETURNS integer AS 'MODULE_PATHNAME', 'sketch_rightmost_one' LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sketch_leftmost_zero(bytea, integer, integer) |
| RETURNS integer AS 'MODULE_PATHNAME', 'sketch_leftmost_zero' LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sketch_array_set_bit_in_place(bytea, integer, integer, integer, integer) |
| RETURNS bytea AS 'MODULE_PATHNAME', 'sketch_array_set_bit_in_place' LANGUAGE C STRICT |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram( anyelement, int4); |
| /** |
| * @brief Produces an n-bucket histogram for a column where each bucket counts |
| * one of the most frequent values in the column. The output is an array of |
| * doubles {value, count} in descending order of frequency; counts are |
| * approximated via CountMin sketches. Ties are handled arbitrarily. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4) |
| ( |
| sfunc = MADLIB_SCHEMA.__mfvsketch_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__mfvsketch_final, |
| initcond = '' |
| ); |
| |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4); |
| /** |
| * @brief On Postgres it works the same way as \ref mfvsketch_top_histogram but, |
| * in Greenplum it does parallel aggregation to provide a "quick and dirty" answer. |
| */ |
| CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4) |
| ( |
| sfunc = MADLIB_SCHEMA.__mfvsketch_trans, |
| stype = bytea, |
| finalfunc = MADLIB_SCHEMA.__mfvsketch_final, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,') |
| initcond = '' |
| ); |