blob: 5985aa347a329876b2361e36cfa42f8e8c692adf [file] [log] [blame]
/**
@file sketches.sql_in
@brief SQL functions for sketch-based approximations of descriptive statistics
@date January 2011
@sa For a brief introduction to sketches, see the module description
grp_sketches
*//* --------------------------------------------------------------------------
*/
/**
@addtogroup grp_sketches
@about
Sketches (sometimes called "synopsis data structures") are small randomized
in-memory data structures that capture statistical properties of a large set
of values (e.g. a column of a table). Sketches can be formed in a single
pass of the data, and used to approximate a variety of descriptive statistics.
We implement sketches as SQL User-Defined Aggregates (UDAs). Because they
are single-pass, small-space and parallelized, a single query can
use many sketches to gather summary statistics on many columns of a table efficiently.
This module currently implements user-defined aggregates based on three main sketch methods:
- <i>Flajolet-Martin (FM)</i> sketches for approximating <c>COUNT(DISTINCT)</c>.
- <i>Count-Min (CM)</i> sketches, which can be used to approximate a number of descriptive statistics including
- <c>COUNT(*)</c> of rows whose column value matches a given value in a set
- <c>COUNT(*)</c> of rows whose column value falls in a range (*)
- order statistics including <i>median</i> and <i>centiles</i> (*)
- <i>histograms</i>: both <i>equi-width</i> and <i>equi-depth</i> (*)
- <i>Most Frequent Value (MFV)</i> sketches, which output the most
frequently-occuring values in a column, along with their associated counts.
<i>Note:</i> Features marked with a single star (*) only work for discrete types that can be cast to int8.
@prereq
Because sketches are essentially a high-performance compression technique, they were custom-coded for efficiency in C for PostgreSQL/Greenplum.
@usage
The sketch method consists of a number of SQL UDAs and UDFs, to be used
directly in SQL queries.
*/
/**
@addtogroup grp_fmsketch
@about
Flajolet-Martin's distinct count estimation
implemented as a user-defined aggregate.
@usage
<c>fmsketch_dcount</c> is a UDA that can be run on any column of any type.
It returns an approximation to the number of distinct values in the column
(a la <c>COUNT(DISTINCT x)</c>, but faster and approximate). Like any aggregate, it can be combined with a GROUP BY clause to do distinct counts per group. Example:@code
-- find distinct number of proname values for each value of pronargs
SELECT pronargs, madlib.fmsketch_dcount(proname) AS distinct_hat, count(proname)
FROM pg_proc
GROUP BY pronargs;
@endcode
@sa file sketches.sql_in (documenting the SQL function)
@literature
[1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. http://algo.inria.fr/flajolet/Publications/FlMa85.pdf
*/
/**
@addtogroup grp_countmin
@about
This module implements Cormode-Muthukrishnan <i>CountMin</i> sketch estimators for various descriptive statistics
on integer values, implemented as user-defined aggregates. It provides approximate counts, order statistics,
and histograms.
@examp
@code
-- count number of rows with pronargs = 3
SELECT pronamespace, madlib.cmsketch_count(pronargs, 3)
FROM pg_proc
GROUP BY pronamespace;
@endcode
@code
-- count number of rows with pronargs between 3 and 5 inclusive
SELECT pronamespace, madlib.cmsketch_rangecount(pronargs, 3, 5)
FROM pg_proc
GROUP BY pronamespace;
@endcode
@code
-- find 75th percentile value of oid
SELECT relnamespace, madlib.cmsketch_centile(oid::int8, 75)
FROM pg_class
GROUP BY relnamespace;
@endcode
@code
-- produce equi-width histogram of oid
SELECT madlib.cmsketch_width_histogram(madlib.cmsketch(oid::int8),
min(oid::int8),
max(oid::int8),
10)
FROM pg_class;
@endcode
@code
-- produce equi-depth histogram of oid
SELECT madlib.cmsketch_depth_histogram(oid::int8, 10)
FROM pg_class;
@endcode
@sa file sketches.sql_in (documenting the SQL functions)
@literature
[1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 (2005) . http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html
[2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html
@sa module grp_quantile
*/
/**
@addtogroup grp_mfvsketch
@about
MFVSketch: Most Frequent Values variant of CountMin sketch, implemented
as a UDA.
\usage
The MFV frequent-value UDA comes in two different versions: a faithful implementation that preserves the
approximation guarantees of Cormode/Muthukrishnan, and a "quick and dirty"
version that can do parallel aggregation in Greenplum at the expense
of missing some of the most frequent values.
In PostgreSQL the two UDAs are identical.
In Greenplum, the quick version should
produce good results unless the number of values requested is very small,
or the distribution is very flat.
Examples:
@code
-- Find most frequent values of proname, and their counts.
-- No parallel aggregation
SELECT madlib.mfvsketch_top_histogram(proname, 4)
FROM pg_proc;
-- Find most frequent values of proname, and their counts.
-- In Greenplum, aggregation will run in parallel, but some
-- of the mfvs may be skipped
SELECT madlib.mfvsketch_quick_histogram(proname, 4)
FROM pg_proc;
@endcode
@sa file sketches.sql_in (documenting the SQL functions)
\literature
This method is not usually called an MFV sketch in the literature; it
is a natural extension of the CountMin sketch.
\sa file sketches.sql_in (documenting the SQL functions), module grp_countmin
*/
-- FM Sketch Functions
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement);
/**
* @brief Flajolet-Martin's distinct count estimation
* @param column name
*/
CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement)
(
sfunc = MADLIB_SCHEMA.__fmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct,
ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__fmsketch_merge,')
initcond = ''
);
-- CM Sketch Functions
-- We register __cmsketch_trans for varying numbers of arguments to support
-- a variety of agg function signatures. The first 2 args are used to
-- aggregate; the remaining args are carried along unchanged inside the
-- return structure for the use of the UDA finalizer.
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_trans(bytea, int8) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_trans(bitmaps bytea, input int8)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_trans(bytea, int8, int8) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_trans(bitmaps bytea, input int8, arg1 int8)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_trans(bytea, int8, int8, int8) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_trans(bytea, int8, int8, int8, int8) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_final(counters bytea)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8);
/**
@brief <c>cmsketch</c> is a UDA that can be run on columns of type int8, or any column that can be cast to an int8. It produces a CountMin sketch: a large array of counters that is intended to be passed into a UDF like <c>cmsketch_width_histogram</c> described below.
*/
CREATE AGGREGATE MADLIB_SCHEMA.cmsketch(/*+ column */ int8)
(
sfunc = MADLIB_SCHEMA.__cmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__cmsketch_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
initcond = ''
);
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_count_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_count_final(bytea)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch_count(int8, int8);
/**
@brief <c>cmsketch_count</c> is a UDA that takes a column and a constant value of type int8, and reports the approximate number of occurrences of the value in the column.
*/
CREATE AGGREGATE MADLIB_SCHEMA.cmsketch_count(/*+ column */ int8, /*+ value */ int8)
(
sfunc = MADLIB_SCHEMA.__cmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__cmsketch_count_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
initcond = ''
);
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_rangecount_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_rangecount_final(bytea)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch_rangecount(int8, int8, int8);
/** \brief <c>cmsketch_rangecount</c> is a UDA that takes a column and a range <c>[low, hi]</c>, and reports the approximate number of column values that fall between
<c>low</c> and <c>hi</c> inclusive.
*/
CREATE AGGREGATE MADLIB_SCHEMA.cmsketch_rangecount(/*+ column */ int8, /*+ low */ int8, /*+ hi */ int8)
(
sfunc = MADLIB_SCHEMA.__cmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__cmsketch_rangecount_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
initcond = ''
);
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_centile_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_centile_final(bytea)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch_centile(int8, int8);
/**
@brief <c>cmsketch_centile</c> is a UDA that takes a column and a percentage value between 1 and 99, and reports a value in the column that is approximately at the percentage (centile) position in sorted order.
*/
CREATE AGGREGATE MADLIB_SCHEMA.cmsketch_centile(int8, int8)
(
sfunc = MADLIB_SCHEMA.__cmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__cmsketch_centile_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
initcond = ''
);
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_median_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_median_final(bytea)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch_median(int8);
CREATE AGGREGATE MADLIB_SCHEMA.cmsketch_median(int8)
(
sfunc = MADLIB_SCHEMA.__cmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__cmsketch_median_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
initcond = ''
);
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.cmsketch_width_histogram(bytea, int8, int8, int4) CASCADE;
/**
\brief <c>cmsketch_width_histogram</c> is a UDF (not an aggregate!) that takes three aggregates of a column -- cmsketch, min and max-- as well as a number of buckets, and produces an n-bucket histogram for the column where each bucket has approximately the same width. The output is an array of triples {lo, hi, count} representing the buckets; counts are approximate.
*/
CREATE FUNCTION MADLIB_SCHEMA.cmsketch_width_histogram(/*+ cmsketch */bytea, /*+ min */int8, /*+ max */int8, /*+ nbuckets */ int4)
RETURNS int8[]
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_dhist_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_dhist_final(bytea)
RETURNS int8[]
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch_depth_histogram(int8, int8);
/** @brief <c>cmsketch_depth_histogram</c> is a UDA that takes a column and a number of buckets n, and produces an n-bucket histogram for the column where each bucket has approximately the same count. The output is an array of triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
*/
CREATE AGGREGATE MADLIB_SCHEMA.cmsketch_depth_histogram(/*+ column */int8, /*+ num_buckets */int8)
(
sfunc = MADLIB_SCHEMA.__cmsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__cmsketch_dhist_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__cmsketch_merge,')
initcond = ''
);
-- MFV Sketch functions
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_trans(bytea, anyelement, int4)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_final(bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_final(bytea)
RETURNS text[][]
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea) CASCADE;
CREATE FUNCTION MADLIB_SCHEMA.__mfvsketch_merge(bytea, bytea)
RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_top_histogram(anyelement, int4);
/**
<c>mfvsketch_top_histogram</c> produces an n-bucket histogram for a
column where each bucket counts one of the most frequent values in the column. The output is an array of doubles {value, count}
in descending order of frequency; counts are approximated via CountMin sketches. Ties are handled arbitrarily.
*/
CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_top_histogram(/*+ column */ anyelement, /*+ number_of_buckets */ int4)
(
sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
initcond = ''
);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4);
/**
<c>mfvsketch_quick_histogram</c> is the same as the above
but, in Greenplum it does parallel aggregation to provide a "quick and dirty"
answer. In Postgres it is identical to <c>mfvsketch_top_histogram</c>.
*/
CREATE AGGREGATE MADLIB_SCHEMA.mfvsketch_quick_histogram(anyelement, int4)
(
sfunc = MADLIB_SCHEMA.__mfvsketch_trans,
stype = bytea,
finalfunc = MADLIB_SCHEMA.__mfvsketch_final,
ifdef(`GREENPLUM', `prefunc = MADLIB_SCHEMA.__mfvsketch_merge,')
initcond = ''
);