blob: 42450f20822d69db5cd53ff380a834f53c4fc87f [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file quantile.sql_in
*
* @brief SQL function for Quantile
* @date January 2011
*
* @sa For a brief introduction to quantiles, see the module
* description \ref grp_quantile.
*
*//* ----------------------------------------------------------------------- */
/**
@addtogroup grp_quantile
\warning <em> This is an old implementation of quantile. Replacement of this
function is available as built-in PERCENTILE_CONT in databases</em>
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#syntax">Function Syntax</a></li>
<li><a href="#examples">Examples</a></li>
<li><a href="#related">Related Topics</a></li>
</ul>
</div>
@brief Computes a quantile value for a column in a table.
This module computes the specified quantile value. It reads the name of the
table, the specific column, and computes the quantile value based on the
fraction specified as the third argument.
For an implementation of quantile using sketches, check out the cmsketch_centile()
aggregate in the \ref grp_countmin module.
@anchor syntax
@par Function Syntax
There are two implementations of quantile available depending on the size of
the table.
quantile() is best used for small tables (i.e., less than
5000 rows, with 1-2 columns in total).
<pre class="syntax">
quantile( table_name,
col_name,
quantile
)
</pre>
For larger tables, consider using quantile_big() instead.
<pre class="syntax">
quantile_big( table_name,
col_name,
quantile
)
</pre>
@anchor examples
@examp
-# Prepare some input.
<pre class="example">
CREATE TABLE tab1 AS SELECT generate_series(1, 1000) AS col1;
</pre>
-# Run the quantile() function.
<pre class="example">
SELECT quantile( 'tab1',
'col1',
.3
);
</pre>
Result:
<pre class="result">
quantile
&nbsp;-------------
301.48046875
(1 row)
</pre>
Note that this module is not available in HAWQ.
GPDB 4.2+ and HAWQ 1.2+ support the SQL Standard percentile_cont() inverse distribution function which should be used preferentially to this implementation.
It is also planned for support in Postgres 9.4.
This implementation will be retired once the functionality is available in Postgres.
@anchor related
@par Related Topics
File quantile.sql_in documenting the SQL function.
Module \ref grp_countmin for an approximate quantile implementation.
*/
/**
* @brief Computes quantile
*
* @param table_name name of the table from which quantile is to be taken
* @param col_name name of the column that is to be used for quantile calculation
* @param quantile desired quantile value \f$ \in (0,1) \f$
* @returns The quantile value
*
* This function computes the specified quantile value. It reads the name of the
* table, the specific column, and computes the quantile value based on the
* fraction specified as the third argument. The functionality is the same as <tt>quantile</tt> except this implementation is designed to work more efficiently with large tables.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile_big(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
declare
size FLOAT[];
count BIGINT;
increment INT := 0;
curr_old BIGINT;
last_values FLOAT[];
last_count BIGINT;
last_value1 FLOAT;
last_count1 BIGINT;
last_value2 FLOAT;
last_count2 BIGINT;
quantile_size BIGINT;
full_size BIGINT;
rows_removed BIGINT := 0;
Begin
/*
This portion computes basic statistics on the table, finding:
MIN value
AVG value
MAX value
COOUNT of the elemens
Which at stored in that order into 'size', count object
'quantile_size' is computed in terms of element count
*/
EXECUTE 'SELECT array[MIN('||col_name||'), AVG('||col_name||'), MAX('||col_name||')], COUNT(*) FROM '||table_name||' ' INTO size, count;
quantile_size = (count*quantile)::BIGINT;
full_size = count;
-- check for bad input
IF(quantile < 0) OR (quantile >= 1) THEN
RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
ELSIF(quantile = 0) THEN
RETURN size[1];
END IF;
-- create some temp tables to use as swap
DROP TABLE IF EXISTS temptable0;
CREATE TEMP TABLE temptable0(val FLOAT);
DROP TABLE IF EXISTS temptable1;
CREATE TEMP TABLE temptable1(val FLOAT);
/*
This is the main loop of the algorithm. Its goal is to do a binarry search over the table to find the value that is the closest to the position corresponding to the
quantile size.
In each itteration for a given value 'size[2]' following are computed:
MIN value less than or equal to 'size[2]'
AVERAGE value less than or equal to 'size[2]'
MAX value less than or equal to 'size[2]'
COUNT of the values less than or equal to 'size[2]'
This results are stored into 'last_values', last_count in that order
*/
LOOP
IF(increment = 0) THEN
EXECUTE 'SELECT ARRAY[MIN('||col_name||'),AVG('||col_name||'),MAX('||col_name||')],COUNT(*) FROM '||table_name||' WHERE '||col_name||' <= '||size[2]||';' INTO last_values, last_count;
ELSE
EXECUTE 'SELECT ARRAY[MIN(val),AVG(val),MAX(val)],COUNT(*) FROM temptable'||increment%2||' WHERE val <= '||size[2]||';' INTO last_values, last_count;
END IF;
last_count = last_count + rows_removed;
IF(last_count=rows_removed) THEN
/*
If there are no more rows left, we exit.
*/
EXIT;
ELSIF((increment > 0)AND(curr_old = last_count)) THEN
/*
We will exit the loop if there was not change in the count from previous itteration
which mean that process will make no further progress.
*/
EXIT;
ELSIF((last_count - quantile_size) > 1) THEN
/*
If current COUNT is greater than 'size[2]' we will reduce the value of 'size[2]'
in binarry search fashion. And then update upper limit to our search the max value observed in this round
*/
size[2] = (last_values[3]+size[1])/2.0;
size[3] = last_values[3];
--remove all rows that are larger than new max
IF(increment = 0) THEN
EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' <= '||size[3]||';';
ELSE
EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val <= '||size[3]||';';
EXECUTE 'TRUNCATE temptable'||increment%2||';';
END IF;
ELSIF((quantile_size - last_count) > 1) THEN
/*
If current COUNT is less than 'size[2]' we will increse the value of 'size[2]'
in binarry search fashion. And then update lower limit to our search the max value observed in this round
*/
size[1] = last_values[3];
size[2] = (last_values[3]+size[3])/2.0;
--remove all rows that are smaller than new min
IF(increment = 0) THEN
--add a small offset to ensure the value that is equal to size[1] is NOT kept
EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' > '||size[1]||'+1e-10;';
ELSE
EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val > '||size[1]||'+1e-10;';
EXECUTE 'TRUNCATE temptable'||increment%2||';';
END IF;
rows_removed = last_count;
ELSE
/*
EXIT since we are closer than 1 element away from the quantile size
*/
IF((quantile_size - last_count) < 0)THEN
size[2] = last_values[3];
END IF;
EXIT;
END IF;
increment = increment+1;
curr_old = last_count;
END LOOP;
/*
At this point we terminated the binary search but we do not know what the reason why no progress could be made
following is the code that determines what is the reason for the termination, and finds the exact solution depending on the reason
*/
EXECUTE 'SELECT MAX('||col_name||'),COUNT(*) FROM '||table_name||' WHERE '||col_name||' < '||size[2]||';' INTO last_value1, last_count1;
IF(last_count1 >= quantile_size) THEN
RETURN last_value1;
END IF;
EXECUTE 'SELECT MIN('||col_name||'),'||full_size||'-COUNT(*)+1 FROM '||table_name||' WHERE '||col_name||' > '||size[2]||';' INTO last_value2, last_count2;
IF(last_count >= quantile_size) THEN
--If the difference is greater than 1 element away, then there are probably many repeated values
IF(last_count-quantile_size >= 1) THEN
RETURN last_values[3];
END IF;
RETURN last_values[3]*(quantile_size-last_count1)/(last_count-last_count1)+last_value1*(last_count-quantile_size)/(last_count-last_count1);
ELSE
--If the difference is greater than 1 element away, then there are probably many repeated values
IF(quantile_size-last_count > 1) THEN
RETURN last_value2;
END IF;
RETURN last_value2*(quantile_size-last_count)/(last_count2-last_count)+last_values[3]*(last_count2-quantile_size)/(last_count2-last_count);
END IF;
-- Cleanup
DROP TABLE IF EXISTS temptable0;
DROP TABLE IF EXISTS temptable1;
end
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
/**
* @brief Computes quantile
*
* @param table_name name of the table from which quantile is to be taken
* @param col_name name of the column that is to be used for quantile calculation
* @param quantile desired quantile value \f$ \in (0,1) \f$
* @returns The quantile value
*
* This function computes the specified quantile value. It reads the name of the
* table, the specific column, and computes the quantile value based on the
* fraction specified as the third argument.
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
declare
size FLOAT;
result FLOAT[];
res FLOAT;
begin
-- check for bad input
IF(quantile < 0) OR (quantile >= 1) THEN
RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
END IF;
EXECUTE 'SELECT COUNT(*)*'||quantile||' FROM '||table_name||' ' INTO size;
EXECUTE 'SELECT ARRAY(SELECT '||col_name||' FROM (SELECT '||col_name||' FROM '||table_name||' ORDER BY '||col_name||' OFFSET '||floor(size)||'-1 LIMIT 2) AS g)' INTO result;
EXECUTE 'SELECT '||result[2]||'*('||size||'%1)+'||result[1]||'*(1-'||size||'%1)' INTO res;
return res;
end
$$ LANGUAGE plpgsql
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');