src/backend/utils/misc/sampling.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * sampling.c
  *	  Relation block sampling routines.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
  *	  src/backend/utils/misc/sampling.c
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include <math.h>

 #include "utils/sampling.h"


 /*
  * BlockSampler_Init -- prepare for random sampling of blocknumbers
  *
  * BlockSampler provides algorithm for block level sampling of a relation
  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
  * It selects a random sample of samplesize blocks out of
  * the nblocks blocks in the table. If the table has less than
  * samplesize blocks, all blocks are selected.
  *
  * Since we know the total number of blocks in advance, we can use the
  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
  * algorithm.
  *
  * Returns the number of blocks that BlockSampler_Next will return.
  */
 BlockNumber
 BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
 				  long randseed)
 {
 	bs->N = nblocks;			/* measured table size */

 	/*
 	 * If we decide to reduce samplesize for tables that have less or not much
 	 * more than samplesize blocks, here is the place to do it.
 	 */
 	bs->n = samplesize;
 	bs->t = 0;					/* blocks scanned so far */
 	bs->m = 0;					/* blocks selected so far */

 	sampler_random_init_state(randseed, bs->randstate);

 	return Min(bs->n, bs->N);
 }

 bool
 BlockSampler_HasMore(BlockSampler bs)
 {
 	return (bs->t < bs->N) && (bs->m < bs->n);
 }

 BlockNumber
 BlockSampler_Next(BlockSampler bs)
 {
 	BlockNumber K = bs->N - bs->t;	/* remaining blocks */
 	int			k = bs->n - bs->m;	/* blocks still to sample */
 	double		p;				/* probability to skip block */
 	double		V;				/* random */

 	Assert(BlockSampler_HasMore(bs));	/* hence K > 0 and k > 0 */

 	if ((BlockNumber) k >= K)
 	{
 		/* need all the rest */
 		bs->m++;
 		return bs->t++;
 	}

 	/*----------
 	 * It is not obvious that this code matches Knuth's Algorithm S.
 	 * Knuth says to skip the current block with probability 1 - k/K.
 	 * If we are to skip, we should advance t (hence decrease K), and
 	 * repeat the same probabilistic test for the next block.  The naive
 	 * implementation thus requires a sampler_random_fract() call for each
 	 * block number.  But we can reduce this to one sampler_random_fract()
 	 * call per selected block, by noting that each time the while-test
 	 * succeeds, we can reinterpret V as a uniform random number in the range
 	 * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
 	 * the appropriate fraction of its former value, and our next loop
 	 * makes the appropriate probabilistic test.
 	 *
 	 * We have initially K > k > 0.  If the loop reduces K to equal k,
 	 * the next while-test must fail since p will become exactly zero
 	 * (we assume there will not be roundoff error in the division).
 	 * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
 	 * to be doubly sure about roundoff error.)  Therefore K cannot become
 	 * less than k, which means that we cannot fail to select enough blocks.
 	 *----------
 	 */
 	V = sampler_random_fract(bs->randstate);
 	p = 1.0 - (double) k / (double) K;
 	while (V < p)
 	{
 		/* skip */
 		bs->t++;
 		K--;					/* keep K == N - t */

 		/* adjust p to be new cutoff point in reduced range */
 		p *= 1.0 - (double) k / (double) K;
 	}

 	/* select */
 	bs->m++;
 	return bs->t++;
 }

 /*
  * This is used for sampling AO/CO row numbers, in the flattened
  * row number space, across all segfile tuple counts. 64 bits is
  * used for simplicity and is sufficient to hold a maximum tuple
  * count of (AOTupleId_MaxSegmentFileNum * MAX_AOREL_CONCURRENCY),
  * which can be represented with 48 bits.
  *
  * The code is same as BlockSampler except replacing
  * int type of variables with int64, which is intended
  * to support larger size of the data set (N).
  *
  * Duplicate code for not willing to break the original
  * design to conflict with upstream for some special case.
  */
 void
 RowSampler_Init(RowSampler rs, int64 nobjects, int64 samplesize,
 				long randseed)
 {
 	rs->N = nobjects;			/* measured table size */

 	/*
 	 * If we decide to reduce samplesize for tables that have less or not much
 	 * more than samplesize objects, here is the place to do it.
 	 */
 	rs->n = samplesize;
 	rs->t = 0;					/* objects scanned so far */
 	rs->m = 0;					/* objects selected so far */

 	sampler_random_init_state(randseed, rs->randstate);
 }

 bool
 RowSampler_HasMore(RowSampler rs)
 {
 	return (rs->t < rs->N) && (rs->m < rs->n);
 }

 int64
 RowSampler_Next(RowSampler rs)
 {
 	int64       K = rs->N - rs->t;	/* remaining objects */
 	int64		k = rs->n - rs->m;	/* objects still to sample */
 	double		p;				    /* probability to skip object */
 	double		V;				    /* random */

 	Assert(RowSampler_HasMore(rs));	/* hence K > 0 and k > 0 */

 	if (k >= K)
 	{
 		/* need all the rest */
 		rs->m++;
 		return rs->t++;
 	}

     /*
      * It is not obvious that this code matches Knuth's Algorithm S.
      * Refer to BlockSampler_Next() for detail.
      */
 	V = sampler_random_fract(rs->randstate);
     /*
 	 * Don't bother overflow of conversion from int64 K (N) as it was
 	 * already converted to "double" range value when initialized.
 	 */
 	p = 1.0 - (double) k / (double) K;
 	while (V < p)
 	{
 		/* skip */
 		rs->t++;
 		K--; /* keep K == N - t */

 		/* adjust p to be new cutoff point in reduced range */
 		p *= 1.0 - (double) k / (double) K;
 	}

 	/* select */
 	rs->m++;
 	return rs->t++;
 }

 /*
  * These two routines embody Algorithm Z from "Random sampling with a
  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
  * of the count S of records to skip before processing another record.
  * It is computed primarily based on t, the number of records already read.
  * The only extra state needed between calls is W, a random state variable.
  *
  * reservoir_init_selection_state computes the initial W value.
  *
  * Given that we've already read t records (t >= n), reservoir_get_next_S
  * determines the number of records to skip before the next record is
  * processed.
  */
 void
 reservoir_init_selection_state(ReservoirState rs, int n)
 {
 	/*
 	 * Reservoir sampling is not used anywhere where it would need to return
 	 * repeatable results so we can initialize it randomly.
 	 */
 	sampler_random_init_state(random(), rs->randstate);

 	/* Initial value of W (for use when Algorithm Z is first applied) */
 	rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
 }

 double
 reservoir_get_next_S(ReservoirState rs, double t, int n)
 {
 	double		S;

 	/* The magic constant here is T from Vitter's paper */
 	if (t <= (22.0 * n))
 	{
 		/* Process records using Algorithm X until t is large enough */
 		double		V,
 					quot;

 		V = sampler_random_fract(rs->randstate);	/* Generate V */
 		S = 0;
 		t += 1;
 		/* Note: "num" in Vitter's code is always equal to t - n */
 		quot = (t - (double) n) / t;
 		/* Find min S satisfying (4.1) */
 		while (quot > V)
 		{
 			S += 1;
 			t += 1;
 			quot *= (t - (double) n) / t;
 		}
 	}
 	else
 	{
 		/* Now apply Algorithm Z */
 		double		W = rs->W;
 		double		term = t - (double) n + 1;

 		for (;;)
 		{
 			double		numer,
 						numer_lim,
 						denom;
 			double		U,
 						X,
 						lhs,
 						rhs,
 						y,
 						tmp;

 			/* Generate U and X */
 			U = sampler_random_fract(rs->randstate);
 			X = t * (W - 1.0);
 			S = floor(X);		/* S is tentatively set to floor(X) */
 			/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
 			tmp = (t + 1) / term;
 			lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
 			rhs = (((t + X) / (term + S)) * term) / t;
 			if (lhs <= rhs)
 			{
 				W = rhs / lhs;
 				break;
 			}
 			/* Test if U <= f(S)/cg(X) */
 			y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
 			if ((double) n < S)
 			{
 				denom = t;
 				numer_lim = term + S;
 			}
 			else
 			{
 				denom = t - (double) n + S;
 				numer_lim = t + 1;
 			}
 			for (numer = t + S; numer >= numer_lim; numer -= 1)
 			{
 				y *= numer / denom;
 				denom -= 1;
 			}
 			W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
 			if (exp(log(y) / n) <= (t + X) / t)
 				break;
 		}
 		rs->W = W;
 	}
 	return S;
 }


 /*----------
  * Random number generator used by sampling
  *----------
  */
 void
 sampler_random_init_state(long seed, SamplerRandomState randstate)
 {
 	randstate[0] = 0x330e;		/* same as pg_erand48, but could be anything */
 	randstate[1] = (unsigned short) seed;
 	randstate[2] = (unsigned short) (seed >> 16);
 }

 /* Select a random value R uniformly distributed in (0 - 1) */
 double
 sampler_random_fract(SamplerRandomState randstate)
 {
 	double		res;

 	/* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
 	do
 	{
 		res = pg_erand48(randstate);
 	} while (res == 0.0);
 	return res;
 }


 /*
  * Backwards-compatible API for block sampling
  *
  * This code is now deprecated, but since it's still in use by many FDWs,
  * we should keep it for awhile at least.  The functionality is the same as
  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
  * except that a common random state is used across all callers.
  */
 static ReservoirStateData oldrs;

 double
 anl_random_fract(void)
 {
 	/* initialize if first time through */
 	if (oldrs.randstate[0] == 0)
 		sampler_random_init_state(random(), oldrs.randstate);

 	/* and compute a random fraction */
 	return sampler_random_fract(oldrs.randstate);
 }

 double
 anl_init_selection_state(int n)
 {
 	/* initialize if first time through */
 	if (oldrs.randstate[0] == 0)
 		sampler_random_init_state(random(), oldrs.randstate);

 	/* Initial value of W (for use when Algorithm Z is first applied) */
 	return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
 }

 double
 anl_get_next_S(double t, int n, double *stateptr)
 {
 	double		result;

 	oldrs.W = *stateptr;
 	result = reservoir_get_next_S(&oldrs, t, n);
 	*stateptr = oldrs.W;
 	return result;
 }
	/*-------------------------------------------------------------------------
	*
	* sampling.c
	* Relation block sampling routines.
	*
	* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	*
	* IDENTIFICATION
	* src/backend/utils/misc/sampling.c
	*
	*-------------------------------------------------------------------------
	*/

	#include "postgres.h"

	#include <math.h>

	#include "utils/sampling.h"


	/*
	* BlockSampler_Init -- prepare for random sampling of blocknumbers
	*
	* BlockSampler provides algorithm for block level sampling of a relation
	* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
	* It selects a random sample of samplesize blocks out of
	* the nblocks blocks in the table. If the table has less than
	* samplesize blocks, all blocks are selected.
	*
	* Since we know the total number of blocks in advance, we can use the
	* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
	* algorithm.
	*
	* Returns the number of blocks that BlockSampler_Next will return.
	*/
	BlockNumber
	BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
	long randseed)
	{
	bs->N = nblocks; /* measured table size */

	/*
	* If we decide to reduce samplesize for tables that have less or not much
	* more than samplesize blocks, here is the place to do it.
	*/
	bs->n = samplesize;
	bs->t = 0; /* blocks scanned so far */
	bs->m = 0; /* blocks selected so far */

	sampler_random_init_state(randseed, bs->randstate);

	return Min(bs->n, bs->N);
	}

	bool
	BlockSampler_HasMore(BlockSampler bs)
	{
	return (bs->t < bs->N) && (bs->m < bs->n);
	}

	BlockNumber
	BlockSampler_Next(BlockSampler bs)
	{
	BlockNumber K = bs->N - bs->t; /* remaining blocks */
	int k = bs->n - bs->m; /* blocks still to sample */
	double p; /* probability to skip block */
	double V; /* random */

	Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */

	if ((BlockNumber) k >= K)
	{
	/* need all the rest */
	bs->m++;
	return bs->t++;
	}

	/*----------
	* It is not obvious that this code matches Knuth's Algorithm S.
	* Knuth says to skip the current block with probability 1 - k/K.
	* If we are to skip, we should advance t (hence decrease K), and
	* repeat the same probabilistic test for the next block. The naive
	* implementation thus requires a sampler_random_fract() call for each
	* block number. But we can reduce this to one sampler_random_fract()
	* call per selected block, by noting that each time the while-test
	* succeeds, we can reinterpret V as a uniform random number in the range
	* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
	* the appropriate fraction of its former value, and our next loop
	* makes the appropriate probabilistic test.
	*
	* We have initially K > k > 0. If the loop reduces K to equal k,
	* the next while-test must fail since p will become exactly zero
	* (we assume there will not be roundoff error in the division).
	* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
	* to be doubly sure about roundoff error.) Therefore K cannot become
	* less than k, which means that we cannot fail to select enough blocks.
	*----------
	*/
	V = sampler_random_fract(bs->randstate);
	p = 1.0 - (double) k / (double) K;
	while (V < p)
	{
	/* skip */
	bs->t++;
	K--; /* keep K == N - t */

	/* adjust p to be new cutoff point in reduced range */
	p *= 1.0 - (double) k / (double) K;
	}

	/* select */
	bs->m++;
	return bs->t++;
	}

	/*
	* This is used for sampling AO/CO row numbers, in the flattened
	* row number space, across all segfile tuple counts. 64 bits is
	* used for simplicity and is sufficient to hold a maximum tuple
	* count of (AOTupleId_MaxSegmentFileNum * MAX_AOREL_CONCURRENCY),
	* which can be represented with 48 bits.
	*
	* The code is same as BlockSampler except replacing
	* int type of variables with int64, which is intended
	* to support larger size of the data set (N).
	*
	* Duplicate code for not willing to break the original
	* design to conflict with upstream for some special case.
	*/
	void
	RowSampler_Init(RowSampler rs, int64 nobjects, int64 samplesize,
	long randseed)
	{
	rs->N = nobjects; /* measured table size */

	/*
	* If we decide to reduce samplesize for tables that have less or not much
	* more than samplesize objects, here is the place to do it.
	*/
	rs->n = samplesize;
	rs->t = 0; /* objects scanned so far */
	rs->m = 0; /* objects selected so far */

	sampler_random_init_state(randseed, rs->randstate);
	}

	bool
	RowSampler_HasMore(RowSampler rs)
	{
	return (rs->t < rs->N) && (rs->m < rs->n);
	}

	int64
	RowSampler_Next(RowSampler rs)
	{
	int64 K = rs->N - rs->t; /* remaining objects */
	int64 k = rs->n - rs->m; /* objects still to sample */
	double p; /* probability to skip object */
	double V; /* random */

	Assert(RowSampler_HasMore(rs)); /* hence K > 0 and k > 0 */

	if (k >= K)
	{
	/* need all the rest */
	rs->m++;
	return rs->t++;
	}

	/*
	* It is not obvious that this code matches Knuth's Algorithm S.
	* Refer to BlockSampler_Next() for detail.
	*/
	V = sampler_random_fract(rs->randstate);
	/*
	* Don't bother overflow of conversion from int64 K (N) as it was
	* already converted to "double" range value when initialized.
	*/
	p = 1.0 - (double) k / (double) K;
	while (V < p)
	{
	/* skip */
	rs->t++;
	K--; /* keep K == N - t */

	/* adjust p to be new cutoff point in reduced range */
	p *= 1.0 - (double) k / (double) K;
	}

	/* select */
	rs->m++;
	return rs->t++;
	}

	/*
	* These two routines embody Algorithm Z from "Random sampling with a
	* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
	* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
	* of the count S of records to skip before processing another record.
	* It is computed primarily based on t, the number of records already read.
	* The only extra state needed between calls is W, a random state variable.
	*
	* reservoir_init_selection_state computes the initial W value.
	*
	* Given that we've already read t records (t >= n), reservoir_get_next_S
	* determines the number of records to skip before the next record is
	* processed.
	*/
	void
	reservoir_init_selection_state(ReservoirState rs, int n)
	{
	/*
	* Reservoir sampling is not used anywhere where it would need to return
	* repeatable results so we can initialize it randomly.
	*/
	sampler_random_init_state(random(), rs->randstate);

	/* Initial value of W (for use when Algorithm Z is first applied) */
	rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
	}

	double
	reservoir_get_next_S(ReservoirState rs, double t, int n)
	{
	double S;

	/* The magic constant here is T from Vitter's paper */
	if (t <= (22.0 * n))
	{
	/* Process records using Algorithm X until t is large enough */
	double V,
	quot;

	V = sampler_random_fract(rs->randstate); /* Generate V */
	S = 0;
	t += 1;
	/* Note: "num" in Vitter's code is always equal to t - n */
	quot = (t - (double) n) / t;
	/* Find min S satisfying (4.1) */
	while (quot > V)
	{
	S += 1;
	t += 1;
	quot *= (t - (double) n) / t;
	}
	}
	else
	{
	/* Now apply Algorithm Z */
	double W = rs->W;
	double term = t - (double) n + 1;

	for (;;)
	{
	double numer,
	numer_lim,
	denom;
	double U,
	X,
	lhs,
	rhs,
	y,
	tmp;

	/* Generate U and X */
	U = sampler_random_fract(rs->randstate);
	X = t * (W - 1.0);
	S = floor(X); /* S is tentatively set to floor(X) */
	/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
	tmp = (t + 1) / term;
	lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
	rhs = (((t + X) / (term + S)) * term) / t;
	if (lhs <= rhs)
	{
	W = rhs / lhs;
	break;
	}
	/* Test if U <= f(S)/cg(X) */
	y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
	if ((double) n < S)
	{
	denom = t;
	numer_lim = term + S;
	}
	else
	{
	denom = t - (double) n + S;
	numer_lim = t + 1;
	}
	for (numer = t + S; numer >= numer_lim; numer -= 1)
	{
	y *= numer / denom;
	denom -= 1;
	}
	W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
	if (exp(log(y) / n) <= (t + X) / t)
	break;
	}
	rs->W = W;
	}
	return S;
	}


	/*----------
	* Random number generator used by sampling
	*----------
	*/
	void
	sampler_random_init_state(long seed, SamplerRandomState randstate)
	{
	randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
	randstate[1] = (unsigned short) seed;
	randstate[2] = (unsigned short) (seed >> 16);
	}

	/* Select a random value R uniformly distributed in (0 - 1) */
	double
	sampler_random_fract(SamplerRandomState randstate)
	{
	double res;

	/* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
	do
	{
	res = pg_erand48(randstate);
	} while (res == 0.0);
	return res;
	}


	/*
	* Backwards-compatible API for block sampling
	*
	* This code is now deprecated, but since it's still in use by many FDWs,
	* we should keep it for awhile at least. The functionality is the same as
	* sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
	* except that a common random state is used across all callers.
	*/
	static ReservoirStateData oldrs;

	double
	anl_random_fract(void)
	{
	/* initialize if first time through */
	if (oldrs.randstate[0] == 0)
	sampler_random_init_state(random(), oldrs.randstate);

	/* and compute a random fraction */
	return sampler_random_fract(oldrs.randstate);
	}

	double
	anl_init_selection_state(int n)
	{
	/* initialize if first time through */
	if (oldrs.randstate[0] == 0)
	sampler_random_init_state(random(), oldrs.randstate);

	/* Initial value of W (for use when Algorithm Z is first applied) */
	return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
	}

	double
	anl_get_next_S(double t, int n, double *stateptr)
	{
	double result;

	oldrs.W = *stateptr;
	result = reservoir_get_next_S(&oldrs, t, n);
	*stateptr = oldrs.W;
	return result;
	}