| /*------------------------------------------------------------------------- |
| * |
| * sampling.c |
| * Relation block sampling routines. |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/utils/misc/sampling.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "utils/sampling.h" |
| |
| |
| /* |
| * BlockSampler_Init -- prepare for random sampling of block numbers
| *
| * BlockSampler provides an algorithm for block-level sampling of a relation,
| * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB").
| * It selects a random sample of samplesize blocks out of the nblocks blocks
| * in the table. If the table has fewer than samplesize blocks, all blocks
| * are selected.
| * |
| * Since we know the total number of blocks in advance, we can use the |
| * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's |
| * algorithm. |
| * |
| * Returns the number of blocks that BlockSampler_Next will return. |
| */ |
| BlockNumber |
| BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, |
| long randseed) |
| { |
| bs->N = nblocks; /* measured table size */ |
| |
| /* |
| * If we decide to reduce samplesize for tables that have fewer than, or not
| * many more than, samplesize blocks, here is the place to do it.
| */ |
| bs->n = samplesize; |
| bs->t = 0; /* blocks scanned so far */ |
| bs->m = 0; /* blocks selected so far */ |
| |
| sampler_random_init_state(randseed, bs->randstate); |
| |
| return Min(bs->n, bs->N); |
| } |
| |
| bool |
| BlockSampler_HasMore(BlockSampler bs) |
| { |
| return (bs->t < bs->N) && (bs->m < bs->n); |
| } |
| |
| BlockNumber |
| BlockSampler_Next(BlockSampler bs) |
| { |
| BlockNumber K = bs->N - bs->t; /* remaining blocks */ |
| int k = bs->n - bs->m; /* blocks still to sample */ |
| double p; /* probability to skip block */ |
| double V; /* random */ |
| |
| Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */ |
| |
| if ((BlockNumber) k >= K) |
| { |
| /* need all the rest */ |
| bs->m++; |
| return bs->t++; |
| } |
| |
| /*---------- |
| * It is not obvious that this code matches Knuth's Algorithm S. |
| * Knuth says to skip the current block with probability 1 - k/K. |
| * If we are to skip, we should advance t (hence decrease K), and |
| * repeat the same probabilistic test for the next block. The naive |
| * implementation thus requires a sampler_random_fract() call for each |
| * block number. But we can reduce this to one sampler_random_fract() |
| * call per selected block, by noting that each time the while-test |
| * succeeds, we can reinterpret V as a uniform random number in the range |
| * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be |
| * the appropriate fraction of its former value, and our next loop |
| * makes the appropriate probabilistic test. |
| * |
| * We have initially K > k > 0. If the loop reduces K to equal k, |
| * the next while-test must fail since p will become exactly zero |
| * (we assume there will not be roundoff error in the division). |
| * (Note: Knuth suggests a "<=" loop condition, but we use "<" just |
| * to be doubly sure about roundoff error.) Therefore K cannot become |
| * less than k, which means that we cannot fail to select enough blocks. |
| *---------- |
| */ |
| V = sampler_random_fract(bs->randstate); |
| p = 1.0 - (double) k / (double) K; |
| while (V < p) |
| { |
| /* skip */ |
| bs->t++; |
| K--; /* keep K == N - t */ |
| |
| /* adjust p to be new cutoff point in reduced range */ |
| p *= 1.0 - (double) k / (double) K; |
| } |
| |
| /* select */ |
| bs->m++; |
| return bs->t++; |
| } |
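| 
| /*
| * Illustrative use of the BlockSampler API -- a minimal sketch, not copied
| * from any particular caller (acquire_sample_rows() in commands/analyze.c
| * drives the sampler in essentially this way).  The relation "rel", the
| * sample-size target "targrows", and the per-block routine "scan_block" are
| * assumed to be supplied by the caller:
| *
| *		BlockSamplerData bs;
| *		BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
| *		BlockNumber nselected;
| *
| *		nselected = BlockSampler_Init(&bs, nblocks, targrows, random());
| *		while (BlockSampler_HasMore(&bs))
| *		{
| *			BlockNumber blkno = BlockSampler_Next(&bs);
| *
| *			scan_block(rel, blkno);
| *		}
| *
| * Since bs->t only advances, the selected block numbers come back in
| * strictly increasing order, so the caller can visit them with a single
| * forward pass over the relation.
| */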
| |
| /*
| * This is used for sampling AO/CO row numbers, in the flattened
| * row number space, across all segfile tuple counts. 64 bits is
| * used for simplicity and is sufficient to hold a maximum tuple
| * count of (AOTupleId_MaxSegmentFileNum * MAX_AOREL_CONCURRENCY),
| * which can be represented with 48 bits.
| *
| * The code is the same as BlockSampler, except that the int variables
| * are replaced with int64 in order to support a larger data set size (N).
| *
| * The code is duplicated, rather than the BlockSampler code generalized,
| * so as not to change the original design and diverge from upstream for
| * this special case.
| */
| void |
| RowSampler_Init(RowSampler rs, int64 nobjects, int64 samplesize, |
| long randseed) |
| { |
| rs->N = nobjects; /* measured table size */ |
| |
| /* |
| * If we decide to reduce samplesize for tables that have fewer than, or not
| * many more than, samplesize objects, here is the place to do it.
| */ |
| rs->n = samplesize; |
| rs->t = 0; /* objects scanned so far */ |
| rs->m = 0; /* objects selected so far */ |
| |
| sampler_random_init_state(randseed, rs->randstate); |
| } |
| |
| bool |
| RowSampler_HasMore(RowSampler rs) |
| { |
| return (rs->t < rs->N) && (rs->m < rs->n); |
| } |
| |
| int64 |
| RowSampler_Next(RowSampler rs) |
| { |
| int64 K = rs->N - rs->t; /* remaining objects */ |
| int64 k = rs->n - rs->m; /* objects still to sample */ |
| double p; /* probability to skip object */ |
| double V; /* random */ |
| |
| Assert(RowSampler_HasMore(rs)); /* hence K > 0 and k > 0 */ |
| |
| if (k >= K) |
| { |
| /* need all the rest */ |
| rs->m++; |
| return rs->t++; |
| } |
| |
| /* |
| * It is not obvious that this code matches Knuth's Algorithm S. |
| * Refer to BlockSampler_Next() for detail. |
| */ |
| V = sampler_random_fract(rs->randstate); |
| /*
| * Don't worry about overflow in the int64-to-double conversion of k and K
| * (and hence N): N is limited to 48 bits (see the comment above
| * RowSampler_Init), so these values are exactly representable as doubles.
| */
| p = 1.0 - (double) k / (double) K; |
| while (V < p) |
| { |
| /* skip */ |
| rs->t++; |
| K--; /* keep K == N - t */ |
| |
| /* adjust p to be new cutoff point in reduced range */ |
| p *= 1.0 - (double) k / (double) K; |
| } |
| |
| /* select */ |
| rs->m++; |
| return rs->t++; |
| } |
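| 
| /*
| * Illustrative use of the RowSampler API -- a sketch under the assumption
| * that the caller has already computed "totalrows", the size of the
| * flattened row number space described above; "targrows" and "sample_row"
| * stand in for the caller's own target and per-row work:
| *
| *		RowSamplerData rs;
| *
| *		RowSampler_Init(&rs, totalrows, targrows, random());
| *		while (RowSampler_HasMore(&rs))
| *		{
| *			int64		rownum = RowSampler_Next(&rs);
| *
| *			sample_row(rownum);
| *		}
| *
| * As with BlockSampler_Next, the row numbers are returned in increasing
| * order.
| */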
| |
| /* |
| * These two routines embody Algorithm Z from "Random sampling with a |
| * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 |
| * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms |
| * of the count S of records to skip before processing another record. |
| * It is computed primarily based on t, the number of records already read. |
| * The only extra state needed between calls is W, a random state variable. |
| * |
| * reservoir_init_selection_state computes the initial W value. |
| * |
| * Given that we've already read t records (t >= n), reservoir_get_next_S |
| * determines the number of records to skip before the next record is |
| * processed. |
| */ |
| void |
| reservoir_init_selection_state(ReservoirState rs, int n) |
| { |
| /* |
| * Reservoir sampling is not used anywhere where it would need to return |
| * repeatable results so we can initialize it randomly. |
| */ |
| sampler_random_init_state(random(), rs->randstate); |
| |
| /* Initial value of W (for use when Algorithm Z is first applied) */ |
| rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n); |
| } |
| |
| double |
| reservoir_get_next_S(ReservoirState rs, double t, int n) |
| { |
| double S; |
| |
| /* The magic constant here is T from Vitter's paper */ |
| if (t <= (22.0 * n)) |
| { |
| /* Process records using Algorithm X until t is large enough */ |
| double V, |
| quot; |
| |
| V = sampler_random_fract(rs->randstate); /* Generate V */ |
| S = 0; |
| t += 1; |
| /* Note: "num" in Vitter's code is always equal to t - n */ |
| quot = (t - (double) n) / t; |
| /* Find min S satisfying (4.1) */ |
| while (quot > V) |
| { |
| S += 1; |
| t += 1; |
| quot *= (t - (double) n) / t; |
| } |
| } |
| else |
| { |
| /* Now apply Algorithm Z */ |
| double W = rs->W; |
| double term = t - (double) n + 1; |
| |
| for (;;) |
| { |
| double numer, |
| numer_lim, |
| denom; |
| double U, |
| X, |
| lhs, |
| rhs, |
| y, |
| tmp; |
| |
| /* Generate U and X */ |
| U = sampler_random_fract(rs->randstate); |
| X = t * (W - 1.0); |
| S = floor(X); /* S is tentatively set to floor(X) */ |
| /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ |
| tmp = (t + 1) / term; |
| lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); |
| rhs = (((t + X) / (term + S)) * term) / t; |
| if (lhs <= rhs) |
| { |
| W = rhs / lhs; |
| break; |
| } |
| /* Test if U <= f(S)/cg(X) */ |
| y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); |
| if ((double) n < S) |
| { |
| denom = t; |
| numer_lim = term + S; |
| } |
| else |
| { |
| denom = t - (double) n + S; |
| numer_lim = t + 1; |
| } |
| for (numer = t + S; numer >= numer_lim; numer -= 1) |
| { |
| y *= numer / denom; |
| denom -= 1; |
| } |
| W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */ |
| if (exp(log(y) / n) <= (t + X) / t) |
| break; |
| } |
| rs->W = W; |
| } |
| return S; |
| } |
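| 
| /*
| * Illustrative use of the reservoir API -- a hedged sketch of the pattern
| * used by acquire_sample_rows() in commands/analyze.c.  Here "rows" is the
| * caller's reservoir array of targrows entries, "tuple" is the current
| * input row, and samplerows, rowstoskip and numrows are caller-local
| * bookkeeping:
| *
| *		ReservoirStateData rstate;
| *		double		samplerows = 0;
| *		double		rowstoskip = -1;
| *		int			numrows = 0;
| *
| *		reservoir_init_selection_state(&rstate, targrows);
| *		for each input tuple:
| *			if (numrows < targrows)
| *				rows[numrows++] = tuple;
| *			else
| *			{
| *				if (rowstoskip < 0)
| *					rowstoskip = reservoir_get_next_S(&rstate, samplerows,
| *													  targrows);
| *				if (rowstoskip <= 0)
| *				{
| *					int		k = (int) (targrows *
| *								sampler_random_fract(rstate.randstate));
| *
| *					rows[k] = tuple;
| *				}
| *				rowstoskip -= 1;
| *			}
| *			samplerows += 1;
| *
| * That is, the first targrows tuples fill the reservoir; thereafter each
| * new tuple that survives the skip count replaces a randomly chosen slot.
| */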
| |
| |
| /*---------- |
| * Random number generator used by sampling |
| *---------- |
| */ |
| void |
| sampler_random_init_state(long seed, SamplerRandomState randstate) |
| { |
| randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */ |
| randstate[1] = (unsigned short) seed; |
| randstate[2] = (unsigned short) (seed >> 16); |
| } |
| |
| /* Select a random value R uniformly distributed in (0, 1) */
| double |
| sampler_random_fract(SamplerRandomState randstate) |
| { |
| double res; |
| |
| /* pg_erand48 returns a value in [0.0, 1.0), so we must reject 0 */
| do |
| { |
| res = pg_erand48(randstate); |
| } while (res == 0.0); |
| return res; |
| } |
| |
| |
| /* |
| * Backwards-compatible API for block sampling |
| * |
| * This code is now deprecated, but since it's still in use by many FDWs, |
| * we should keep it for a while at least. The functionality is the same as
| * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S, |
| * except that a common random state is used across all callers. |
| */ |
| static ReservoirStateData oldrs; |
| |
| double |
| anl_random_fract(void) |
| { |
| /* initialize if first time through */ |
| if (oldrs.randstate[0] == 0) |
| sampler_random_init_state(random(), oldrs.randstate); |
| |
| /* and compute a random fraction */ |
| return sampler_random_fract(oldrs.randstate); |
| } |
| |
| double |
| anl_init_selection_state(int n) |
| { |
| /* initialize if first time through */ |
| if (oldrs.randstate[0] == 0) |
| sampler_random_init_state(random(), oldrs.randstate); |
| |
| /* Initial value of W (for use when Algorithm Z is first applied) */ |
| return exp(-log(sampler_random_fract(oldrs.randstate)) / n); |
| } |
| |
| double |
| anl_get_next_S(double t, int n, double *stateptr) |
| { |
| double result; |
| |
| oldrs.W = *stateptr; |
| result = reservoir_get_next_S(&oldrs, t, n); |
| *stateptr = oldrs.W; |
| return result; |
| } |
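| 
| /*
| * Illustrative use of the deprecated API -- a sketch only.  It follows the
| * same reservoir pattern shown after reservoir_get_next_S() above, except
| * that the caller keeps the W state in a double of its own:
| *
| *		double		rstate = anl_init_selection_state(targrows);
| *		double		rowstoskip = -1;
| *
| *		if (rowstoskip < 0)
| *			rowstoskip = anl_get_next_S(samplerows, targrows, &rstate);
| *		if (rowstoskip <= 0)
| *			replace rows[(int) (targrows * anl_random_fract())] with the row;
| *		rowstoskip -= 1;
| */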