blob: 58361eb2bff19cd4c37d894d3820a90263f4dda1 [file] [log] [blame]
/*-------------------------------------------------------------------------
*
* sampling.c
* Relation block sampling routines.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/misc/sampling.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "utils/sampling.h"
/*
* BlockSampler_Init -- prepare for random sampling of blocknumbers
*
* BlockSampler provides algorithm for block level sampling of a relation
* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
* It selects a random sample of samplesize blocks out of
* the nblocks blocks in the table. If the table has less than
* samplesize blocks, all blocks are selected.
*
* Since we know the total number of blocks in advance, we can use the
* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
* algorithm.
*
* Returns the number of blocks that BlockSampler_Next will return.
*/
BlockNumber
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
long randseed)
{
bs->N = nblocks; /* measured table size */
/*
* If we decide to reduce samplesize for tables that have less or not much
* more than samplesize blocks, here is the place to do it.
*/
bs->n = samplesize;
bs->t = 0; /* blocks scanned so far */
bs->m = 0; /* blocks selected so far */
sampler_random_init_state(randseed, bs->randstate);
return Min(bs->n, bs->N);
}
bool
BlockSampler_HasMore(BlockSampler bs)
{
return (bs->t < bs->N) && (bs->m < bs->n);
}
BlockNumber
BlockSampler_Next(BlockSampler bs)
{
BlockNumber K = bs->N - bs->t; /* remaining blocks */
int k = bs->n - bs->m; /* blocks still to sample */
double p; /* probability to skip block */
double V; /* random */
Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
if ((BlockNumber) k >= K)
{
/* need all the rest */
bs->m++;
return bs->t++;
}
/*----------
* It is not obvious that this code matches Knuth's Algorithm S.
* Knuth says to skip the current block with probability 1 - k/K.
* If we are to skip, we should advance t (hence decrease K), and
* repeat the same probabilistic test for the next block. The naive
* implementation thus requires a sampler_random_fract() call for each
* block number. But we can reduce this to one sampler_random_fract()
* call per selected block, by noting that each time the while-test
* succeeds, we can reinterpret V as a uniform random number in the range
* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
* the appropriate fraction of its former value, and our next loop
* makes the appropriate probabilistic test.
*
* We have initially K > k > 0. If the loop reduces K to equal k,
* the next while-test must fail since p will become exactly zero
* (we assume there will not be roundoff error in the division).
* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
* to be doubly sure about roundoff error.) Therefore K cannot become
* less than k, which means that we cannot fail to select enough blocks.
*----------
*/
V = sampler_random_fract(bs->randstate);
p = 1.0 - (double) k / (double) K;
while (V < p)
{
/* skip */
bs->t++;
K--; /* keep K == N - t */
/* adjust p to be new cutoff point in reduced range */
p *= 1.0 - (double) k / (double) K;
}
/* select */
bs->m++;
return bs->t++;
}
/*
* This is used for sampling AO/CO row numbers, in the flattened
* row number space, across all segfile tuple counts. 64 bits is
* used for simplicity and is sufficient to hold a maximum tuple
* count of (AOTupleId_MaxSegmentFileNum * MAX_AOREL_CONCURRENCY),
* which can be represented with 48 bits.
*
* The code is same as BlockSampler except replacing
* int type of variables with int64, which is intended
* to support larger size of the data set (N).
*
* Duplicate code for not willing to break the original
* design to conflict with upstream for some special case.
*/
void
RowSampler_Init(RowSampler rs, int64 nobjects, int64 samplesize,
long randseed)
{
rs->N = nobjects; /* measured table size */
/*
* If we decide to reduce samplesize for tables that have less or not much
* more than samplesize objects, here is the place to do it.
*/
rs->n = samplesize;
rs->t = 0; /* objects scanned so far */
rs->m = 0; /* objects selected so far */
sampler_random_init_state(randseed, rs->randstate);
}
bool
RowSampler_HasMore(RowSampler rs)
{
return (rs->t < rs->N) && (rs->m < rs->n);
}
int64
RowSampler_Next(RowSampler rs)
{
int64 K = rs->N - rs->t; /* remaining objects */
int64 k = rs->n - rs->m; /* objects still to sample */
double p; /* probability to skip object */
double V; /* random */
Assert(RowSampler_HasMore(rs)); /* hence K > 0 and k > 0 */
if (k >= K)
{
/* need all the rest */
rs->m++;
return rs->t++;
}
/*
* It is not obvious that this code matches Knuth's Algorithm S.
* Refer to BlockSampler_Next() for detail.
*/
V = sampler_random_fract(rs->randstate);
/*
* Don't bother overflow of conversion from int64 K (N) as it was
* already converted to "double" range value when initialized.
*/
p = 1.0 - (double) k / (double) K;
while (V < p)
{
/* skip */
rs->t++;
K--; /* keep K == N - t */
/* adjust p to be new cutoff point in reduced range */
p *= 1.0 - (double) k / (double) K;
}
/* select */
rs->m++;
return rs->t++;
}
/*
* These two routines embody Algorithm Z from "Random sampling with a
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
* of the count S of records to skip before processing another record.
* It is computed primarily based on t, the number of records already read.
* The only extra state needed between calls is W, a random state variable.
*
* reservoir_init_selection_state computes the initial W value.
*
* Given that we've already read t records (t >= n), reservoir_get_next_S
* determines the number of records to skip before the next record is
* processed.
*/
void
reservoir_init_selection_state(ReservoirState rs, int n)
{
/*
* Reservoir sampling is not used anywhere where it would need to return
* repeatable results so we can initialize it randomly.
*/
sampler_random_init_state(random(), rs->randstate);
/* Initial value of W (for use when Algorithm Z is first applied) */
rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
}
double
reservoir_get_next_S(ReservoirState rs, double t, int n)
{
double S;
/* The magic constant here is T from Vitter's paper */
if (t <= (22.0 * n))
{
/* Process records using Algorithm X until t is large enough */
double V,
quot;
V = sampler_random_fract(rs->randstate); /* Generate V */
S = 0;
t += 1;
/* Note: "num" in Vitter's code is always equal to t - n */
quot = (t - (double) n) / t;
/* Find min S satisfying (4.1) */
while (quot > V)
{
S += 1;
t += 1;
quot *= (t - (double) n) / t;
}
}
else
{
/* Now apply Algorithm Z */
double W = rs->W;
double term = t - (double) n + 1;
for (;;)
{
double numer,
numer_lim,
denom;
double U,
X,
lhs,
rhs,
y,
tmp;
/* Generate U and X */
U = sampler_random_fract(rs->randstate);
X = t * (W - 1.0);
S = floor(X); /* S is tentatively set to floor(X) */
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
tmp = (t + 1) / term;
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
rhs = (((t + X) / (term + S)) * term) / t;
if (lhs <= rhs)
{
W = rhs / lhs;
break;
}
/* Test if U <= f(S)/cg(X) */
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
if ((double) n < S)
{
denom = t;
numer_lim = term + S;
}
else
{
denom = t - (double) n + S;
numer_lim = t + 1;
}
for (numer = t + S; numer >= numer_lim; numer -= 1)
{
y *= numer / denom;
denom -= 1;
}
W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
if (exp(log(y) / n) <= (t + X) / t)
break;
}
rs->W = W;
}
return S;
}
/*----------
* Random number generator used by sampling
*----------
*/
void
sampler_random_init_state(long seed, SamplerRandomState randstate)
{
randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
randstate[1] = (unsigned short) seed;
randstate[2] = (unsigned short) (seed >> 16);
}
/* Select a random value R uniformly distributed in (0 - 1) */
double
sampler_random_fract(SamplerRandomState randstate)
{
double res;
/* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
do
{
res = pg_erand48(randstate);
} while (res == 0.0);
return res;
}
/*
* Backwards-compatible API for block sampling
*
* This code is now deprecated, but since it's still in use by many FDWs,
* we should keep it for awhile at least. The functionality is the same as
* sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
* except that a common random state is used across all callers.
*/
static ReservoirStateData oldrs;
double
anl_random_fract(void)
{
/* initialize if first time through */
if (oldrs.randstate[0] == 0)
sampler_random_init_state(random(), oldrs.randstate);
/* and compute a random fraction */
return sampler_random_fract(oldrs.randstate);
}
double
anl_init_selection_state(int n)
{
/* initialize if first time through */
if (oldrs.randstate[0] == 0)
sampler_random_init_state(random(), oldrs.randstate);
/* Initial value of W (for use when Algorithm Z is first applied) */
return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
}
double
anl_get_next_S(double t, int n, double *stateptr)
{
double result;
oldrs.W = *stateptr;
result = reservoir_get_next_S(&oldrs, t, n);
*stateptr = oldrs.W;
return result;
}