| /*------------------------------------------------------------------------- |
| * |
| * mvdistinct.c |
| * POSTGRES multivariate ndistinct coefficients |
| * |
| * Estimating number of groups in a combination of columns (e.g. for GROUP BY) |
| * is tricky, and the estimation error is often significant. |
| |
| * The multivariate ndistinct coefficients address this by storing ndistinct |
| * estimates for combinations of the user-specified columns. So for example |
| * given a statistics object on three columns (a,b,c), this module estimates |
| * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column |
| * estimates are already available in pg_statistic. |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/statistics/mvdistinct.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "access/htup_details.h" |
| #include "catalog/pg_statistic_ext.h" |
| #include "catalog/pg_statistic_ext_data.h" |
| #include "lib/stringinfo.h" |
| #include "statistics/extended_stats_internal.h" |
| #include "statistics/statistics.h" |
| #include "utils/fmgrprotos.h" |
| #include "utils/lsyscache.h" |
| #include "utils/syscache.h" |
| #include "utils/typcache.h" |
| |
| static double ndistinct_for_combination(double totalrows, StatsBuildData *data, |
| int k, int *combination); |
| static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); |
| static int n_choose_k(int n, int k); |
| static int num_combinations(int n); |
| |
| /* size of the struct header fields (magic, type, nitems) */ |
| #define SizeOfHeader (3 * sizeof(uint32)) |
| |
| /* size of a serialized ndistinct item (coefficient, natts, atts) */ |
| #define SizeOfItem(natts) \ |
| (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber)) |
| |
| /* minimal size of a ndistinct item (with two attributes) */ |
| #define MinSizeOfItem SizeOfItem(2) |
| |
| /* minimal size of mvndistinct, when all items are minimal */ |
| #define MinSizeOfItems(nitems) \ |
| (SizeOfHeader + (nitems) * MinSizeOfItem) |
| |
| /* Combination generator API */ |
| |
| /* internal state for generator of k-combinations of n elements */ |
| typedef struct CombinationGenerator |
| { |
| int k; /* size of the combination */ |
| int n; /* total number of elements */ |
| int current; /* index of the next combination to return */ |
| int ncombinations; /* number of combinations (size of array) */ |
| int *combinations; /* array of pre-built combinations */ |
| } CombinationGenerator; |
| |
| static CombinationGenerator *generator_init(int n, int k); |
| static void generator_free(CombinationGenerator *state); |
| static int *generator_next(CombinationGenerator *state); |
| static void generate_combinations(CombinationGenerator *state); |
| |
| |
| /* |
| * statext_ndistinct_build |
| * Compute ndistinct coefficient for the combination of attributes. |
| * |
| * This computes the ndistinct estimate using the same estimator used |
| * in analyze.c and then computes the coefficient. |
| * |
| * To handle expressions easily, we treat them as system attributes with |
| * negative attnums, and offset everything by number of expressions to |
| * allow using Bitmapsets. |
| */ |
| MVNDistinct * |
| statext_ndistinct_build(double totalrows, StatsBuildData *data) |
| { |
| MVNDistinct *result; |
| int k; |
| int itemcnt; |
| int numattrs = data->nattnums; |
| int numcombs = num_combinations(numattrs); |
| |
| result = palloc(offsetof(MVNDistinct, items) + |
| numcombs * sizeof(MVNDistinctItem)); |
| result->magic = STATS_NDISTINCT_MAGIC; |
| result->type = STATS_NDISTINCT_TYPE_BASIC; |
| result->nitems = numcombs; |
| |
| itemcnt = 0; |
| for (k = 2; k <= numattrs; k++) |
| { |
| int *combination; |
| CombinationGenerator *generator; |
| |
| /* generate combinations of K out of N elements */ |
| generator = generator_init(numattrs, k); |
| |
| while ((combination = generator_next(generator))) |
| { |
| MVNDistinctItem *item = &result->items[itemcnt]; |
| int j; |
| |
| item->attributes = palloc(sizeof(AttrNumber) * k); |
| item->nattributes = k; |
| |
| /* translate the indexes to attnums */ |
| for (j = 0; j < k; j++) |
| { |
| item->attributes[j] = data->attnums[combination[j]]; |
| |
| Assert(AttributeNumberIsValid(item->attributes[j])); |
| } |
| |
| item->ndistinct = |
| ndistinct_for_combination(totalrows, data, k, combination); |
| |
| itemcnt++; |
| Assert(itemcnt <= result->nitems); |
| } |
| |
| generator_free(generator); |
| } |
| |
| /* must consume exactly the whole output array */ |
| Assert(itemcnt == result->nitems); |
| |
| return result; |
| } |
| |
| /* |
| * statext_ndistinct_load |
| * Load the ndistinct value for the indicated pg_statistic_ext tuple |
| */ |
| MVNDistinct * |
| statext_ndistinct_load(Oid mvoid, bool inh) |
| { |
| MVNDistinct *result; |
| bool isnull; |
| Datum ndist; |
| HeapTuple htup; |
| |
| htup = SearchSysCache2(STATEXTDATASTXOID, |
| ObjectIdGetDatum(mvoid), BoolGetDatum(inh)); |
| if (!HeapTupleIsValid(htup)) |
| elog(ERROR, "cache lookup failed for statistics object %u", mvoid); |
| |
| ndist = SysCacheGetAttr(STATEXTDATASTXOID, htup, |
| Anum_pg_statistic_ext_data_stxdndistinct, &isnull); |
| if (isnull) |
| elog(ERROR, |
| "requested statistics kind \"%c\" is not yet built for statistics object %u", |
| STATS_EXT_NDISTINCT, mvoid); |
| |
| result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist)); |
| |
| ReleaseSysCache(htup); |
| |
| return result; |
| } |
| |
| /* |
| * statext_ndistinct_serialize |
| * serialize ndistinct to the on-disk bytea format |
| */ |
| bytea * |
| statext_ndistinct_serialize(MVNDistinct *ndistinct) |
| { |
| int i; |
| bytea *output; |
| char *tmp; |
| Size len; |
| |
| Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC); |
| Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC); |
| |
| /* |
| * Base size is size of scalar fields in the struct, plus one base struct |
| * for each item, including number of items for each. |
| */ |
| len = VARHDRSZ + SizeOfHeader; |
| |
| /* and also include space for the actual attribute numbers */ |
| for (i = 0; i < ndistinct->nitems; i++) |
| { |
| int nmembers; |
| |
| nmembers = ndistinct->items[i].nattributes; |
| Assert(nmembers >= 2); |
| |
| len += SizeOfItem(nmembers); |
| } |
| |
| output = (bytea *) palloc(len); |
| SET_VARSIZE(output, len); |
| |
| tmp = VARDATA(output); |
| |
| /* Store the base struct values (magic, type, nitems) */ |
| memcpy(tmp, &ndistinct->magic, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(tmp, &ndistinct->type, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(tmp, &ndistinct->nitems, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| |
| /* |
| * store number of attributes and attribute numbers for each entry |
| */ |
| for (i = 0; i < ndistinct->nitems; i++) |
| { |
| MVNDistinctItem item = ndistinct->items[i]; |
| int nmembers = item.nattributes; |
| |
| memcpy(tmp, &item.ndistinct, sizeof(double)); |
| tmp += sizeof(double); |
| memcpy(tmp, &nmembers, sizeof(int)); |
| tmp += sizeof(int); |
| |
| memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers); |
| tmp += nmembers * sizeof(AttrNumber); |
| |
| /* protect against overflows */ |
| Assert(tmp <= ((char *) output + len)); |
| } |
| |
| /* check we used exactly the expected space */ |
| Assert(tmp == ((char *) output + len)); |
| |
| return output; |
| } |
| |
| /* |
| * statext_ndistinct_deserialize |
| * Read an on-disk bytea format MVNDistinct to in-memory format |
| */ |
| MVNDistinct * |
| statext_ndistinct_deserialize(bytea *data) |
| { |
| int i; |
| Size minimum_size; |
| MVNDistinct ndist; |
| MVNDistinct *ndistinct; |
| char *tmp; |
| |
| if (data == NULL) |
| return NULL; |
| |
| /* we expect at least the basic fields of MVNDistinct struct */ |
| if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader) |
| elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)", |
| VARSIZE_ANY_EXHDR(data), SizeOfHeader); |
| |
| /* initialize pointer to the data part (skip the varlena header) */ |
| tmp = VARDATA_ANY(data); |
| |
| /* read the header fields and perform basic sanity checks */ |
| memcpy(&ndist.magic, tmp, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(&ndist.type, tmp, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(&ndist.nitems, tmp, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| |
| if (ndist.magic != STATS_NDISTINCT_MAGIC) |
| elog(ERROR, "invalid ndistinct magic %08x (expected %08x)", |
| ndist.magic, STATS_NDISTINCT_MAGIC); |
| if (ndist.type != STATS_NDISTINCT_TYPE_BASIC) |
| elog(ERROR, "invalid ndistinct type %d (expected %d)", |
| ndist.type, STATS_NDISTINCT_TYPE_BASIC); |
| if (ndist.nitems == 0) |
| elog(ERROR, "invalid zero-length item array in MVNDistinct"); |
| |
| /* what minimum bytea size do we expect for those parameters */ |
| minimum_size = MinSizeOfItems(ndist.nitems); |
| if (VARSIZE_ANY_EXHDR(data) < minimum_size) |
| elog(ERROR, "invalid MVNDistinct size %zu (expected at least %zu)", |
| VARSIZE_ANY_EXHDR(data), minimum_size); |
| |
| /* |
| * Allocate space for the ndistinct items (no space for each item's |
| * attnos: those live in bitmapsets allocated separately) |
| */ |
| ndistinct = palloc0(MAXALIGN(offsetof(MVNDistinct, items)) + |
| (ndist.nitems * sizeof(MVNDistinctItem))); |
| ndistinct->magic = ndist.magic; |
| ndistinct->type = ndist.type; |
| ndistinct->nitems = ndist.nitems; |
| |
| for (i = 0; i < ndistinct->nitems; i++) |
| { |
| MVNDistinctItem *item = &ndistinct->items[i]; |
| |
| /* ndistinct value */ |
| memcpy(&item->ndistinct, tmp, sizeof(double)); |
| tmp += sizeof(double); |
| |
| /* number of attributes */ |
| memcpy(&item->nattributes, tmp, sizeof(int)); |
| tmp += sizeof(int); |
| Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS)); |
| |
| item->attributes |
| = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber)); |
| |
| memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes); |
| tmp += sizeof(AttrNumber) * item->nattributes; |
| |
| /* still within the bytea */ |
| Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); |
| } |
| |
| /* we should have consumed the whole bytea exactly */ |
| Assert(tmp == ((char *) data + VARSIZE_ANY(data))); |
| |
| return ndistinct; |
| } |
| |
| /* |
| * pg_ndistinct_in |
| * input routine for type pg_ndistinct |
| * |
| * pg_ndistinct is real enough to be a table column, but it has no |
| * operations of its own, and disallows input (just like pg_node_tree). |
| */ |
| Datum |
| pg_ndistinct_in(PG_FUNCTION_ARGS) |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot accept a value of type %s", "pg_ndistinct"))); |
| |
| PG_RETURN_VOID(); /* keep compiler quiet */ |
| } |
| |
| /* |
| * pg_ndistinct |
| * output routine for type pg_ndistinct |
| * |
| * Produces a human-readable representation of the value. |
| */ |
| Datum |
| pg_ndistinct_out(PG_FUNCTION_ARGS) |
| { |
| bytea *data = PG_GETARG_BYTEA_PP(0); |
| MVNDistinct *ndist = statext_ndistinct_deserialize(data); |
| int i; |
| StringInfoData str; |
| |
| initStringInfo(&str); |
| appendStringInfoChar(&str, '{'); |
| |
| for (i = 0; i < ndist->nitems; i++) |
| { |
| int j; |
| MVNDistinctItem item = ndist->items[i]; |
| |
| if (i > 0) |
| appendStringInfoString(&str, ", "); |
| |
| for (j = 0; j < item.nattributes; j++) |
| { |
| AttrNumber attnum = item.attributes[j]; |
| |
| appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum); |
| } |
| appendStringInfo(&str, "\": %d", (int) item.ndistinct); |
| } |
| |
| appendStringInfoChar(&str, '}'); |
| |
| PG_RETURN_CSTRING(str.data); |
| } |
| |
| /* |
| * pg_ndistinct_recv |
| * binary input routine for type pg_ndistinct |
| */ |
| Datum |
| pg_ndistinct_recv(PG_FUNCTION_ARGS) |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot accept a value of type %s", "pg_ndistinct"))); |
| |
| PG_RETURN_VOID(); /* keep compiler quiet */ |
| } |
| |
| /* |
| * pg_ndistinct_send |
| * binary output routine for type pg_ndistinct |
| * |
| * n-distinct is serialized into a bytea value, so let's send that. |
| */ |
| Datum |
| pg_ndistinct_send(PG_FUNCTION_ARGS) |
| { |
| return byteasend(fcinfo); |
| } |
| |
| /* |
| * ndistinct_for_combination |
| * Estimates number of distinct values in a combination of columns. |
| * |
| * This uses the same ndistinct estimator as compute_scalar_stats() in |
| * ANALYZE, i.e., |
| * n*d / (n - f1 + f1*n/N) |
| * |
| * except that instead of values in a single column we are dealing with |
| * combination of multiple columns. |
| */ |
| static double |
| ndistinct_for_combination(double totalrows, StatsBuildData *data, |
| int k, int *combination) |
| { |
| int i, |
| j; |
| int f1, |
| cnt, |
| d; |
| bool *isnull; |
| Datum *values; |
| SortItem *items; |
| MultiSortSupport mss; |
| int numrows = data->numrows; |
| |
| mss = multi_sort_init(k); |
| |
| /* |
| * In order to determine the number of distinct elements, create separate |
| * values[]/isnull[] arrays with all the data we have, then sort them |
| * using the specified column combination as dimensions. We could try to |
| * sort in place, but it'd probably be more complex and bug-prone. |
| */ |
| items = (SortItem *) palloc(numrows * sizeof(SortItem)); |
| values = (Datum *) palloc0(sizeof(Datum) * numrows * k); |
| isnull = (bool *) palloc0(sizeof(bool) * numrows * k); |
| |
| for (i = 0; i < numrows; i++) |
| { |
| items[i].values = &values[i * k]; |
| items[i].isnull = &isnull[i * k]; |
| } |
| |
| /* |
| * For each dimension, set up sort-support and fill in the values from the |
| * sample data. |
| * |
| * We use the column data types' default sort operators and collations; |
| * perhaps at some point it'd be worth using column-specific collations? |
| */ |
| for (i = 0; i < k; i++) |
| { |
| Oid typid; |
| TypeCacheEntry *type; |
| Oid collid = InvalidOid; |
| VacAttrStats *colstat = data->stats[combination[i]]; |
| |
| typid = colstat->attrtypid; |
| collid = colstat->attrcollid; |
| |
| type = lookup_type_cache(typid, TYPECACHE_LT_OPR); |
| if (type->lt_opr == InvalidOid) /* shouldn't happen */ |
| elog(ERROR, "cache lookup failed for ordering operator for type %u", |
| typid); |
| |
| /* prepare the sort function for this dimension */ |
| multi_sort_add_dimension(mss, i, type->lt_opr, collid); |
| |
| /* accumulate all the data for this dimension into the arrays */ |
| for (j = 0; j < numrows; j++) |
| { |
| items[j].values[i] = data->values[combination[i]][j]; |
| items[j].isnull[i] = data->nulls[combination[i]][j]; |
| } |
| } |
| |
| /* We can sort the array now ... */ |
| qsort_interruptible(items, numrows, sizeof(SortItem), |
| multi_sort_compare, mss); |
| |
| /* ... and count the number of distinct combinations */ |
| |
| f1 = 0; |
| cnt = 1; |
| d = 1; |
| for (i = 1; i < numrows; i++) |
| { |
| if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0) |
| { |
| if (cnt == 1) |
| f1 += 1; |
| |
| d++; |
| cnt = 0; |
| } |
| |
| cnt += 1; |
| } |
| |
| if (cnt == 1) |
| f1 += 1; |
| |
| return estimate_ndistinct(totalrows, numrows, d, f1); |
| } |
| |
| /* The Duj1 estimator (already used in analyze.c). */ |
| static double |
| estimate_ndistinct(double totalrows, int numrows, int d, int f1) |
| { |
| double numer, |
| denom, |
| ndistinct; |
| |
| numer = (double) numrows * (double) d; |
| |
| denom = (double) (numrows - f1) + |
| (double) f1 * (double) numrows / totalrows; |
| |
| ndistinct = numer / denom; |
| |
| /* Clamp to sane range in case of roundoff error */ |
| if (ndistinct < (double) d) |
| ndistinct = (double) d; |
| |
| if (ndistinct > totalrows) |
| ndistinct = totalrows; |
| |
| return floor(ndistinct + 0.5); |
| } |
| |
| /* |
| * n_choose_k |
| * computes binomial coefficients using an algorithm that is both |
| * efficient and prevents overflows |
| */ |
| static int |
| n_choose_k(int n, int k) |
| { |
| int d, |
| r; |
| |
| Assert((k > 0) && (n >= k)); |
| |
| /* use symmetry of the binomial coefficients */ |
| k = Min(k, n - k); |
| |
| r = 1; |
| for (d = 1; d <= k; ++d) |
| { |
| r *= n--; |
| r /= d; |
| } |
| |
| return r; |
| } |
| |
| /* |
| * num_combinations |
| * number of combinations, excluding single-value combinations |
| */ |
| static int |
| num_combinations(int n) |
| { |
| return (1 << n) - (n + 1); |
| } |
| |
| /* |
| * generator_init |
| * initialize the generator of combinations |
| * |
| * The generator produces combinations of K elements in the interval (0..N). |
| * We prebuild all the combinations in this method, which is simpler than |
| * generating them on the fly. |
| */ |
| static CombinationGenerator * |
| generator_init(int n, int k) |
| { |
| CombinationGenerator *state; |
| |
| Assert((n >= k) && (k > 0)); |
| |
| /* allocate the generator state as a single chunk of memory */ |
| state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator)); |
| |
| state->ncombinations = n_choose_k(n, k); |
| |
| /* pre-allocate space for all combinations */ |
| state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations); |
| |
| state->current = 0; |
| state->k = k; |
| state->n = n; |
| |
| /* now actually pre-generate all the combinations of K elements */ |
| generate_combinations(state); |
| |
| /* make sure we got the expected number of combinations */ |
| Assert(state->current == state->ncombinations); |
| |
| /* reset the number, so we start with the first one */ |
| state->current = 0; |
| |
| return state; |
| } |
| |
| /* |
| * generator_next |
| * returns the next combination from the prebuilt list |
| * |
| * Returns a combination of K array indexes (0 .. N), as specified to |
| * generator_init), or NULL when there are no more combination. |
| */ |
| static int * |
| generator_next(CombinationGenerator *state) |
| { |
| if (state->current == state->ncombinations) |
| return NULL; |
| |
| return &state->combinations[state->k * state->current++]; |
| } |
| |
| /* |
| * generator_free |
| * free the internal state of the generator |
| * |
| * Releases the generator internal state (pre-built combinations). |
| */ |
| static void |
| generator_free(CombinationGenerator *state) |
| { |
| pfree(state->combinations); |
| pfree(state); |
| } |
| |
| /* |
| * generate_combinations_recurse |
| * given a prefix, generate all possible combinations |
| * |
| * Given a prefix (first few elements of the combination), generate following |
| * elements recursively. We generate the combinations in lexicographic order, |
| * which eliminates permutations of the same combination. |
| */ |
| static void |
| generate_combinations_recurse(CombinationGenerator *state, |
| int index, int start, int *current) |
| { |
| /* If we haven't filled all the elements, simply recurse. */ |
| if (index < state->k) |
| { |
| int i; |
| |
| /* |
| * The values have to be in ascending order, so make sure we start |
| * with the value passed by parameter. |
| */ |
| |
| for (i = start; i < state->n; i++) |
| { |
| current[index] = i; |
| generate_combinations_recurse(state, (index + 1), (i + 1), current); |
| } |
| |
| return; |
| } |
| else |
| { |
| /* we got a valid combination, add it to the array */ |
| memcpy(&state->combinations[(state->k * state->current)], |
| current, state->k * sizeof(int)); |
| state->current++; |
| } |
| } |
| |
| /* |
| * generate_combinations |
| * generate all k-combinations of N elements |
| */ |
| static void |
| generate_combinations(CombinationGenerator *state) |
| { |
| int *current = (int *) palloc0(sizeof(int) * state->k); |
| |
| generate_combinations_recurse(state, 0, 0, current); |
| |
| pfree(current); |
| } |