src/ports/postgres/modules/quantile/quantile.sql_in - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file quantile.sql_in
  *
  * @brief SQL function for Quantile
  * @date   January 2011
  *
  * @sa For a brief introduction to quantiles, see the module
  *     description \ref grp_quantile.
  *
  *//* ----------------------------------------------------------------------- */

 /**
 @addtogroup grp_quantile

 \warning <em> This is an old implementation of quantile. Replacement of this
 function is available as built-in PERCENTILE_CONT in databases</em>

 <div class="toc"><b>Contents</b>
 <ul>
 <li><a href="#syntax">Function Syntax</a></li>
 <li><a href="#examples">Examples</a></li>
 <li><a href="#related">Related Topics</a></li>
 </ul>
 </div>

 @brief Computes a quantile value for a column in a table.

 This module computes the specified quantile value. It reads the name of the
 table, the specific column, and computes the quantile value based on the
 fraction specified as the third argument.

 For an implementation of quantile using sketches, check out the cmsketch_centile()
 aggregate in the \ref grp_countmin module.


 @anchor syntax
 @par Function Syntax

 There are two implementations of quantile available depending on the size of
 the table.

 quantile() is best used for small tables (i.e., less than
 5000 rows, with 1-2 columns in total).

 <pre class="syntax">
 quantile( table_name,
           col_name,
           quantile
         )
 </pre>

 For larger tables, consider using quantile_big() instead.

 <pre class="syntax">
 quantile_big( table_name,
               col_name,
               quantile
             )
 </pre>

 @anchor examples
 @examp

 -# Prepare some input.
 <pre class="example">
 CREATE TABLE tab1 AS SELECT generate_series(1, 1000) AS col1;
 </pre>

 -# Run the quantile() function.
 <pre class="example">
 SELECT quantile( 'tab1',
                  'col1',
                  .3
                );
 </pre>
 Result:
 <pre class="result">
    quantile
 &nbsp;-------------
  301.48046875
 (1 row)
 </pre>

 Note that this module is not available in HAWQ.
 GPDB 4.2+ and HAWQ 1.2+ support the SQL Standard percentile_cont() inverse distribution function which should be used preferentially to this implementation.
 It is also planned for support in Postgres 9.4.
 This implementation will be retired once the functionality is available in Postgres.

 @anchor related
 @par Related Topics
 File quantile.sql_in documenting the SQL function.

 Module \ref grp_countmin for an approximate quantile implementation.
 */


 /**
  * @brief Computes quantile
  *
  * @param table_name name of the table from which quantile is to be taken
  * @param col_name name of the column that is to be used for quantile calculation
  * @param quantile desired quantile value \f$ \in (0,1) \f$
  * @returns The quantile value
  *
  * This function computes the specified quantile value. It reads the name of the
  * table, the specific column, and computes the quantile value based on the
  * fraction specified as the third argument. The functionality is the same as <tt>quantile</tt> except this implementation is designed to work more efficiently with large tables.
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile_big(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
 declare
     size            FLOAT[];
     count           BIGINT;
     increment       INT := 0;
     curr_old        BIGINT;
     last_values     FLOAT[];
     last_count      BIGINT;
     last_value1     FLOAT;
     last_count1     BIGINT;
     last_value2     FLOAT;
     last_count2     BIGINT;
     quantile_size   BIGINT;
     full_size       BIGINT;
     rows_removed    BIGINT := 0;
 Begin
     /*
          This portion computes basic statistics on the table, finding:
              MIN value
              AVG value
              MAX value
              COOUNT of the elemens
          Which at stored in that order into 'size', count object
          'quantile_size' is computed in terms of element count
     */
     EXECUTE 'SELECT array[MIN('||col_name||'), AVG('||col_name||'), MAX('||col_name||')], COUNT(*) FROM '||table_name||' ' INTO size, count;
     quantile_size = (count*quantile)::BIGINT;
     full_size = count;

     -- check for bad input
     IF(quantile < 0) OR (quantile >= 1) THEN
         RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
     ELSIF(quantile = 0) THEN
         RETURN size[1];
     END IF;

     -- create some temp tables to use as swap
     DROP TABLE IF EXISTS temptable0;
     CREATE TEMP TABLE temptable0(val FLOAT);

     DROP TABLE IF EXISTS temptable1;
     CREATE TEMP TABLE temptable1(val FLOAT);


     /*
         This is the main loop of the algorithm. Its goal is to do a binarry search over the table to find the value that is the closest to the position corresponding to the
         quantile size.

         In each itteration for a given value 'size[2]' following are computed:
             MIN value less than or equal to 'size[2]'
             AVERAGE value less than or equal to 'size[2]'
             MAX value less than or equal to 'size[2]'
             COUNT of the values less than or equal to 'size[2]'
         This results are stored into 'last_values', last_count in that order
     */
     LOOP
         IF(increment = 0) THEN
             EXECUTE 'SELECT ARRAY[MIN('||col_name||'),AVG('||col_name||'),MAX('||col_name||')],COUNT(*) FROM '||table_name||' WHERE '||col_name||' <= '||size[2]||';' INTO last_values, last_count;
         ELSE
             EXECUTE 'SELECT ARRAY[MIN(val),AVG(val),MAX(val)],COUNT(*) FROM temptable'||increment%2||' WHERE val <= '||size[2]||';' INTO last_values, last_count;
         END IF;
         last_count = last_count + rows_removed;


         IF(last_count=rows_removed) THEN
             /*
                 If there are no more rows left, we exit.
             */
             EXIT;

         ELSIF((increment > 0)AND(curr_old = last_count)) THEN
             /*
                 We will exit the loop if there was not change in the count from previous itteration
                 which mean that process will make no further progress.
             */
             EXIT;
         ELSIF((last_count - quantile_size) > 1) THEN
             /*
                 If current COUNT is greater than 'size[2]' we will reduce the value of 'size[2]'
                 in binarry search fashion. And then update upper limit to our search the max value observed in this round
             */
             size[2] =  (last_values[3]+size[1])/2.0;
             size[3] = last_values[3];

             --remove all rows that are larger than new max
             IF(increment = 0) THEN
                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' <= '||size[3]||';';
             ELSE
                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val <= '||size[3]||';';
                 EXECUTE 'TRUNCATE temptable'||increment%2||';';
             END IF;


         ELSIF((quantile_size - last_count) > 1) THEN
             /*
                 If current COUNT is less than 'size[2]' we will increse the value of 'size[2]'
                 in binarry search fashion. And then update lower limit to our search the max value observed in this round
             */
             size[1] = last_values[3];
             size[2] = (last_values[3]+size[3])/2.0;

             --remove all rows that are smaller than new min
             IF(increment = 0) THEN
                 --add a small offset to ensure the value that is equal to size[1] is NOT kept
                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT '||col_name||' FROM '||table_name||' WHERE '||col_name||' > '||size[1]||'+1e-10;';
             ELSE
                 EXECUTE 'INSERT INTO temptable'||(increment+1)%2||' SELECT val FROM temptable'||increment%2||' WHERE val > '||size[1]||'+1e-10;';
                 EXECUTE 'TRUNCATE temptable'||increment%2||';';
             END IF;
             rows_removed = last_count;

         ELSE
             /*
                 EXIT since we are closer than 1 element away from the quantile size
             */
             IF((quantile_size - last_count) < 0)THEN
                 size[2] = last_values[3];
             END IF;
             EXIT;
         END IF;
         increment = increment+1;
         curr_old = last_count;
     END LOOP;

     /*
         At this point we terminated the binary search but we do not know what the reason why no progress could be made
         following is the code that determines what is the reason for the termination, and finds the exact solution depending on the reason
     */
     EXECUTE 'SELECT MAX('||col_name||'),COUNT(*) FROM '||table_name||' WHERE '||col_name||' < '||size[2]||';' INTO last_value1, last_count1;

     IF(last_count1 >= quantile_size) THEN
         RETURN last_value1;
     END IF;

     EXECUTE 'SELECT MIN('||col_name||'),'||full_size||'-COUNT(*)+1 FROM '||table_name||' WHERE '||col_name||' > '||size[2]||';' INTO last_value2, last_count2;


     IF(last_count >= quantile_size) THEN
         --If the difference is greater than 1 element away, then there are probably many repeated values
         IF(last_count-quantile_size >= 1) THEN
             RETURN last_values[3];
         END IF;
         RETURN last_values[3]*(quantile_size-last_count1)/(last_count-last_count1)+last_value1*(last_count-quantile_size)/(last_count-last_count1);
     ELSE
         --If the difference is greater than 1 element away, then there are probably many repeated values
         IF(quantile_size-last_count > 1) THEN
             RETURN last_value2;
         END IF;
         RETURN last_value2*(quantile_size-last_count)/(last_count2-last_count)+last_values[3]*(last_count2-quantile_size)/(last_count2-last_count);
     END IF;

     -- Cleanup
     DROP TABLE IF EXISTS temptable0;
     DROP TABLE IF EXISTS temptable1;

 end
 $$ LANGUAGE plpgsql
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

 /**
  * @brief Computes quantile
  *
  * @param table_name name of the table from which quantile is to be taken
  * @param col_name name of the column that is to be used for quantile calculation
  * @param quantile desired quantile value \f$ \in (0,1) \f$
  * @returns The quantile value
  *
  * This function computes the specified quantile value. It reads the name of the
  * table, the specific column, and computes the quantile value based on the
  * fraction specified as the third argument.
  */
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
 declare
     size FLOAT;
     result FLOAT[];
     res FLOAT;
 begin
     -- check for bad input
     IF(quantile < 0) OR (quantile >= 1) THEN
         RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
     END IF;

     EXECUTE 'SELECT COUNT(*)*'||quantile||' FROM '||table_name||' ' INTO size;
     EXECUTE 'SELECT ARRAY(SELECT '||col_name||' FROM (SELECT '||col_name||' FROM '||table_name||' ORDER BY '||col_name||' OFFSET '||floor(size)||'-1 LIMIT 2) AS g)' INTO result;
     EXECUTE 'SELECT '||result[2]||'*('||size||'%1)+'||result[1]||'*(1-'||size||'%1)' INTO res;
     return res;
 end
 $$ LANGUAGE plpgsql
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
	/* ----------------------------------------------------------------------- //*
	*
	* @file quantile.sql_in
	*
	* @brief SQL function for Quantile
	* @date January 2011
	*
	* @sa For a brief introduction to quantiles, see the module
	* description \ref grp_quantile.
	*
	// ----------------------------------------------------------------------- */

	/**
	@addtogroup grp_quantile

	\warning <em> This is an old implementation of quantile. Replacement of this
	function is available as built-in PERCENTILE_CONT in databases</em>

	<div class="toc"><b>Contents</b>
	<ul>
	<li><a href="#syntax">Function Syntax</a></li>
	<li><a href="#examples">Examples</a></li>
	<li><a href="#related">Related Topics</a></li>
	</ul>
	</div>

	@brief Computes a quantile value for a column in a table.

	This module computes the specified quantile value. It reads the name of the
	table, the specific column, and computes the quantile value based on the
	fraction specified as the third argument.

	For an implementation of quantile using sketches, check out the cmsketch_centile()
	aggregate in the \ref grp_countmin module.


	@anchor syntax
	@par Function Syntax

	There are two implementations of quantile available depending on the size of
	the table.

	quantile() is best used for small tables (i.e., less than
	5000 rows, with 1-2 columns in total).

	<pre class="syntax">
	quantile( table_name,
	col_name,
	quantile
	)
	</pre>

	For larger tables, consider using quantile_big() instead.

	<pre class="syntax">
	quantile_big( table_name,
	col_name,
	quantile
	)
	</pre>

	@anchor examples
	@examp

	-# Prepare some input.
	<pre class="example">
	CREATE TABLE tab1 AS SELECT generate_series(1, 1000) AS col1;
	</pre>

	-# Run the quantile() function.
	<pre class="example">
	SELECT quantile( 'tab1',
	'col1',
	.3
	);
	</pre>
	Result:
	<pre class="result">
	quantile
	-------------
	301.48046875
	(1 row)
	</pre>

	Note that this module is not available in HAWQ.
	GPDB 4.2+ and HAWQ 1.2+ support the SQL Standard percentile_cont() inverse distribution function which should be used preferentially to this implementation.
	It is also planned for support in Postgres 9.4.
	This implementation will be retired once the functionality is available in Postgres.

	@anchor related
	@par Related Topics
	File quantile.sql_in documenting the SQL function.

	Module \ref grp_countmin for an approximate quantile implementation.
	*/


	/**
	* @brief Computes quantile
	*
	* @param table_name name of the table from which quantile is to be taken
	* @param col_name name of the column that is to be used for quantile calculation
	* @param quantile desired quantile value \f$ \in (0,1) \f$
	* @returns The quantile value
	*
	* This function computes the specified quantile value. It reads the name of the
	* table, the specific column, and computes the quantile value based on the
	* fraction specified as the third argument. The functionality is the same as <tt>quantile</tt> except this implementation is designed to work more efficiently with large tables.
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile_big(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
	declare
	size FLOAT[];
	count BIGINT;
	increment INT := 0;
	curr_old BIGINT;
	last_values FLOAT[];
	last_count BIGINT;
	last_value1 FLOAT;
	last_count1 BIGINT;
	last_value2 FLOAT;
	last_count2 BIGINT;
	quantile_size BIGINT;
	full_size BIGINT;
	rows_removed BIGINT := 0;
	Begin
	/*
	This portion computes basic statistics on the table, finding:
	MIN value
	AVG value
	MAX value
	COOUNT of the elemens
	Which at stored in that order into 'size', count object
	'quantile_size' is computed in terms of element count
	*/
	EXECUTE 'SELECT array[MIN('\|\|col_name\|\|'), AVG('\|\|col_name\|\|'), MAX('\|\|col_name\|\|')], COUNT(*) FROM '\|\|table_name\|\|' ' INTO size, count;
	quantile_size = (count*quantile)::BIGINT;
	full_size = count;

	-- check for bad input
	IF(quantile < 0) OR (quantile >= 1) THEN
	RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
	ELSIF(quantile = 0) THEN
	RETURN size[1];
	END IF;

	-- create some temp tables to use as swap
	DROP TABLE IF EXISTS temptable0;
	CREATE TEMP TABLE temptable0(val FLOAT);

	DROP TABLE IF EXISTS temptable1;
	CREATE TEMP TABLE temptable1(val FLOAT);


	/*
	This is the main loop of the algorithm. Its goal is to do a binarry search over the table to find the value that is the closest to the position corresponding to the
	quantile size.

	In each itteration for a given value 'size[2]' following are computed:
	MIN value less than or equal to 'size[2]'
	AVERAGE value less than or equal to 'size[2]'
	MAX value less than or equal to 'size[2]'
	COUNT of the values less than or equal to 'size[2]'
	This results are stored into 'last_values', last_count in that order
	*/
	LOOP
	IF(increment = 0) THEN
	EXECUTE 'SELECT ARRAY[MIN('\|\|col_name\|\|'),AVG('\|\|col_name\|\|'),MAX('\|\|col_name\|\|')],COUNT(*) FROM '\|\|table_name\|\|' WHERE '\|\|col_name\|\|' <= '\|\|size[2]\|\|';' INTO last_values, last_count;
	ELSE
	EXECUTE 'SELECT ARRAY[MIN(val),AVG(val),MAX(val)],COUNT(*) FROM temptable'\|\|increment%2\|\|' WHERE val <= '\|\|size[2]\|\|';' INTO last_values, last_count;
	END IF;
	last_count = last_count + rows_removed;


	IF(last_count=rows_removed) THEN
	/*
	If there are no more rows left, we exit.
	*/
	EXIT;

	ELSIF((increment > 0)AND(curr_old = last_count)) THEN
	/*
	We will exit the loop if there was not change in the count from previous itteration
	which mean that process will make no further progress.
	*/
	EXIT;
	ELSIF((last_count - quantile_size) > 1) THEN
	/*
	If current COUNT is greater than 'size[2]' we will reduce the value of 'size[2]'
	in binarry search fashion. And then update upper limit to our search the max value observed in this round
	*/
	size[2] = (last_values[3]+size[1])/2.0;
	size[3] = last_values[3];

	--remove all rows that are larger than new max
	IF(increment = 0) THEN
	EXECUTE 'INSERT INTO temptable'\|\|(increment+1)%2\|\|' SELECT '\|\|col_name\|\|' FROM '\|\|table_name\|\|' WHERE '\|\|col_name\|\|' <= '\|\|size[3]\|\|';';
	ELSE
	EXECUTE 'INSERT INTO temptable'\|\|(increment+1)%2\|\|' SELECT val FROM temptable'\|\|increment%2\|\|' WHERE val <= '\|\|size[3]\|\|';';
	EXECUTE 'TRUNCATE temptable'\|\|increment%2\|\|';';
	END IF;


	ELSIF((quantile_size - last_count) > 1) THEN
	/*
	If current COUNT is less than 'size[2]' we will increse the value of 'size[2]'
	in binarry search fashion. And then update lower limit to our search the max value observed in this round
	*/
	size[1] = last_values[3];
	size[2] = (last_values[3]+size[3])/2.0;

	--remove all rows that are smaller than new min
	IF(increment = 0) THEN
	--add a small offset to ensure the value that is equal to size[1] is NOT kept
	EXECUTE 'INSERT INTO temptable'\|\|(increment+1)%2\|\|' SELECT '\|\|col_name\|\|' FROM '\|\|table_name\|\|' WHERE '\|\|col_name\|\|' > '\|\|size[1]\|\|'+1e-10;';
	ELSE
	EXECUTE 'INSERT INTO temptable'\|\|(increment+1)%2\|\|' SELECT val FROM temptable'\|\|increment%2\|\|' WHERE val > '\|\|size[1]\|\|'+1e-10;';
	EXECUTE 'TRUNCATE temptable'\|\|increment%2\|\|';';
	END IF;
	rows_removed = last_count;

	ELSE
	/*
	EXIT since we are closer than 1 element away from the quantile size
	*/
	IF((quantile_size - last_count) < 0)THEN
	size[2] = last_values[3];
	END IF;
	EXIT;
	END IF;
	increment = increment+1;
	curr_old = last_count;
	END LOOP;

	/*
	At this point we terminated the binary search but we do not know what the reason why no progress could be made
	following is the code that determines what is the reason for the termination, and finds the exact solution depending on the reason
	*/
	EXECUTE 'SELECT MAX('\|\|col_name\|\|'),COUNT(*) FROM '\|\|table_name\|\|' WHERE '\|\|col_name\|\|' < '\|\|size[2]\|\|';' INTO last_value1, last_count1;

	IF(last_count1 >= quantile_size) THEN
	RETURN last_value1;
	END IF;

	EXECUTE 'SELECT MIN('\|\|col_name\|\|'),'\|\|full_size\|\|'-COUNT(*)+1 FROM '\|\|table_name\|\|' WHERE '\|\|col_name\|\|' > '\|\|size[2]\|\|';' INTO last_value2, last_count2;


	IF(last_count >= quantile_size) THEN
	--If the difference is greater than 1 element away, then there are probably many repeated values
	IF(last_count-quantile_size >= 1) THEN
	RETURN last_values[3];
	END IF;
	RETURN last_values[3](quantile_size-last_count1)/(last_count-last_count1)+last_value1(last_count-quantile_size)/(last_count-last_count1);
	ELSE
	--If the difference is greater than 1 element away, then there are probably many repeated values
	IF(quantile_size-last_count > 1) THEN
	RETURN last_value2;
	END IF;
	RETURN last_value2(quantile_size-last_count)/(last_count2-last_count)+last_values[3](last_count2-quantile_size)/(last_count2-last_count);
	END IF;

	-- Cleanup
	DROP TABLE IF EXISTS temptable0;
	DROP TABLE IF EXISTS temptable1;

	end
	$$ LANGUAGE plpgsql
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

	/**
	* @brief Computes quantile
	*
	* @param table_name name of the table from which quantile is to be taken
	* @param col_name name of the column that is to be used for quantile calculation
	* @param quantile desired quantile value \f$ \in (0,1) \f$
	* @returns The quantile value
	*
	* This function computes the specified quantile value. It reads the name of the
	* table, the specific column, and computes the quantile value based on the
	* fraction specified as the third argument.
	*/
	CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.quantile(table_name TEXT, col_name TEXT, quantile FLOAT) RETURNS FLOAT AS $$
	declare
	size FLOAT;
	result FLOAT[];
	res FLOAT;
	begin
	-- check for bad input
	IF(quantile < 0) OR (quantile >= 1) THEN
	RAISE EXCEPTION 'Quantile should be between 0 and 0.99';
	END IF;

	EXECUTE 'SELECT COUNT()'\|\|quantile\|\|' FROM '\|\|table_name\|\|' ' INTO size;
	EXECUTE 'SELECT ARRAY(SELECT '\|\|col_name\|\|' FROM (SELECT '\|\|col_name\|\|' FROM '\|\|table_name\|\|' ORDER BY '\|\|col_name\|\|' OFFSET '\|\|floor(size)\|\|'-1 LIMIT 2) AS g)' INTO result;
	EXECUTE 'SELECT '\|\|result[2]\|\|'('\|\|size\|\|'%1)+'\|\|result[1]\|\|'(1-'\|\|size\|\|'%1)' INTO res;
	return res;
	end
	$$ LANGUAGE plpgsql
	m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');