| /*------------------------------------------------------------------------- |
| * |
| * dependencies.c |
| * POSTGRES functional dependencies |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/statistics/dependencies.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include "access/htup_details.h" |
| #include "access/sysattr.h" |
| #include "catalog/pg_operator.h" |
| #include "catalog/pg_statistic_ext.h" |
| #include "catalog/pg_statistic_ext_data.h" |
| #include "lib/stringinfo.h" |
| #include "nodes/nodeFuncs.h" |
| #include "nodes/nodes.h" |
| #include "nodes/pathnodes.h" |
| #include "optimizer/clauses.h" |
| #include "optimizer/optimizer.h" |
| #include "parser/parsetree.h" |
| #include "statistics/extended_stats_internal.h" |
| #include "statistics/statistics.h" |
| #include "utils/bytea.h" |
| #include "utils/fmgroids.h" |
| #include "utils/fmgrprotos.h" |
| #include "utils/lsyscache.h" |
| #include "utils/memutils.h" |
| #include "utils/selfuncs.h" |
| #include "utils/syscache.h" |
| #include "utils/typcache.h" |
| |
/* size of the struct header fields (magic, type, ndeps), each a uint32 */
#define SizeOfHeader (3 * sizeof(uint32))

/*
 * size of a serialized dependency (degree, natts, atts)
 *
 * One double for the degree, one AttrNumber for the attribute count, and
 * natts AttrNumbers for the attributes themselves.
 */
#define SizeOfItem(natts) \
	(sizeof(double) + sizeof(AttrNumber) * (1 + (natts)))

/* minimal size of a dependency (with two attributes) */
#define MinSizeOfItem SizeOfItem(2)

/*
 * minimal size of dependencies, when all deps are minimal
 * (i.e. the header plus ndeps two-attribute items)
 */
#define MinSizeOfItems(ndeps) \
	(SizeOfHeader + (ndeps) * MinSizeOfItem)
| |
/*
 * Internal state for DependencyGenerator of dependencies. Dependencies are similar to
 * k-permutations of n elements, except that the order does not matter for the
 * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent.
 *
 * The generator pre-builds all dependencies up front; "dependencies" holds
 * ndependencies consecutive groups of k attribute indexes (indexes into the
 * statistics object's attribute list, not regular attnums).
 */
typedef struct DependencyGeneratorData
{
	int			k;				/* size of the dependency */
	int			n;				/* number of possible attributes */
	int			current;		/* next dependency to return (index) */
	AttrNumber	ndependencies;	/* number of dependencies generated */
	AttrNumber *dependencies;	/* array of pre-generated dependencies */
} DependencyGeneratorData;

typedef DependencyGeneratorData *DependencyGenerator;
| |
| static void generate_dependencies_recurse(DependencyGenerator state, |
| int index, AttrNumber start, AttrNumber *current); |
| static void generate_dependencies(DependencyGenerator state); |
| static DependencyGenerator DependencyGenerator_init(int n, int k); |
| static void DependencyGenerator_free(DependencyGenerator state); |
| static AttrNumber *DependencyGenerator_next(DependencyGenerator state); |
| static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency); |
| static bool dependency_is_fully_matched(MVDependency *dependency, |
| Bitmapset *attnums); |
| static bool dependency_is_compatible_clause(Node *clause, Index relid, |
| AttrNumber *attnum); |
| static bool dependency_is_compatible_expression(Node *clause, Index relid, |
| List *statlist, Node **expr); |
| static MVDependency *find_strongest_dependency(MVDependencies **dependencies, |
| int ndependencies, Bitmapset *attnums); |
| static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses, |
| int varRelid, JoinType jointype, |
| SpecialJoinInfo *sjinfo, |
| MVDependency **dependencies, |
| int ndependencies, |
| AttrNumber *list_attnums, |
| Bitmapset **estimatedclauses); |
| |
| static void |
| generate_dependencies_recurse(DependencyGenerator state, int index, |
| AttrNumber start, AttrNumber *current) |
| { |
| /* |
| * The generator handles the first (k-1) elements differently from the |
| * last element. |
| */ |
| if (index < (state->k - 1)) |
| { |
| AttrNumber i; |
| |
| /* |
| * The first (k-1) values have to be in ascending order, which we |
| * generate recursively. |
| */ |
| |
| for (i = start; i < state->n; i++) |
| { |
| current[index] = i; |
| generate_dependencies_recurse(state, (index + 1), (i + 1), current); |
| } |
| } |
| else |
| { |
| int i; |
| |
| /* |
| * the last element is the implied value, which does not respect the |
| * ascending order. We just need to check that the value is not in the |
| * first (k-1) elements. |
| */ |
| |
| for (i = 0; i < state->n; i++) |
| { |
| int j; |
| bool match = false; |
| |
| current[index] = i; |
| |
| for (j = 0; j < index; j++) |
| { |
| if (current[j] == i) |
| { |
| match = true; |
| break; |
| } |
| } |
| |
| /* |
| * If the value is not found in the first part of the dependency, |
| * we're done. |
| */ |
| if (!match) |
| { |
| state->dependencies = (AttrNumber *) repalloc(state->dependencies, |
| state->k * (state->ndependencies + 1) * sizeof(AttrNumber)); |
| memcpy(&state->dependencies[(state->k * state->ndependencies)], |
| current, state->k * sizeof(AttrNumber)); |
| state->ndependencies++; |
| } |
| } |
| } |
| } |
| |
| /* generate all dependencies (k-permutations of n elements) */ |
| static void |
| generate_dependencies(DependencyGenerator state) |
| { |
| AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k); |
| |
| generate_dependencies_recurse(state, 0, 0, current); |
| |
| pfree(current); |
| } |
| |
| /* |
| * initialize the DependencyGenerator of variations, and prebuild the variations |
| * |
| * This pre-builds all the variations. We could also generate them in |
| * DependencyGenerator_next(), but this seems simpler. |
| */ |
| static DependencyGenerator |
| DependencyGenerator_init(int n, int k) |
| { |
| DependencyGenerator state; |
| |
| Assert((n >= k) && (k > 0)); |
| |
| /* allocate the DependencyGenerator state */ |
| state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData)); |
| state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber)); |
| |
| state->ndependencies = 0; |
| state->current = 0; |
| state->k = k; |
| state->n = n; |
| |
| /* now actually pre-generate all the variations */ |
| generate_dependencies(state); |
| |
| return state; |
| } |
| |
| /* free the DependencyGenerator state */ |
| static void |
| DependencyGenerator_free(DependencyGenerator state) |
| { |
| pfree(state->dependencies); |
| pfree(state); |
| |
| } |
| |
| /* generate next combination */ |
| static AttrNumber * |
| DependencyGenerator_next(DependencyGenerator state) |
| { |
| if (state->current == state->ndependencies) |
| return NULL; |
| |
| return &state->dependencies[state->k * state->current++]; |
| } |
| |
| |
/*
 * validates functional dependency on the data
 *
 * An actual work horse of detecting functional dependencies. Given a variation
 * of k attributes, it checks that the first (k-1) are sufficient to determine
 * the last one.
 *
 * "dependency" holds k indexes into data->attnums / data->stats (not regular
 * attnums).  Returns the fraction of rows consistent with the dependency,
 * in the range [0.0, 1.0].
 *
 * NOTE(review): the final division assumes data->numrows > 0 — presumably
 * guaranteed by the caller having sampled at least one row; confirm.
 */
static double
dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
{
	int			i,
				nitems;
	MultiSortSupport mss;
	SortItem   *items;
	AttrNumber *attnums_dep;

	/* counters valid within a group */
	int			group_size = 0;
	int			n_violations = 0;

	/* total number of rows supporting (consistent with) the dependency */
	int			n_supporting_rows = 0;

	/* Make sure we have at least two input attributes. */
	Assert(k >= 2);

	/* sort info for all attributes columns */
	mss = multi_sort_init(k);

	/*
	 * Translate the array of indexes to regular attnums for the dependency
	 * (we will need this to identify the columns in StatsBuildData).
	 */
	attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
	for (i = 0; i < k; i++)
		attnums_dep[i] = data->attnums[dependency[i]];

	/*
	 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
	 *
	 * (a) sort the data lexicographically
	 *
	 * (b) split the data into groups by first (k-1) columns
	 *
	 * (c) for each group count different values in the last column
	 *
	 * We use the column data types' default sort operators and collations;
	 * perhaps at some point it'd be worth using column-specific collations?
	 */

	/* prepare the sort function for the dimensions */
	for (i = 0; i < k; i++)
	{
		VacAttrStats *colstat = data->stats[dependency[i]];
		TypeCacheEntry *type;

		/* look up the default "<" operator for the column's type */
		type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
		if (type->lt_opr == InvalidOid) /* shouldn't happen */
			elog(ERROR, "cache lookup failed for ordering operator for type %u",
				 colstat->attrtypid);

		/* prepare the sort function for this dimension */
		multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
	}

	/*
	 * build an array of SortItem(s) sorted using the multi-sort support
	 *
	 * XXX This relies on all stats entries pointing to the same tuple
	 * descriptor. For now that assumption holds, but it might change in the
	 * future for example if we support statistics on multiple tables.
	 */
	items = build_sorted_items(data, &nitems, mss, k, attnums_dep);

	/*
	 * Walk through the sorted array, split it into rows according to the
	 * first (k-1) columns. If there's a single value in the last column, we
	 * count the group as 'supporting' the functional dependency. Otherwise we
	 * count it as contradicting.
	 */

	/* start with the first row forming a group */
	group_size = 1;

	/* loop 1 beyond the end of the array so that we count the final group */
	for (i = 1; i <= nitems; i++)
	{
		/*
		 * Check if the group ended, which may be either because we processed
		 * all the items (i==nitems), or because the i-th item is not equal to
		 * the preceding one.  (Dims 0 .. k-2 are the implying columns; the
		 * last dimension is deliberately excluded from this comparison.)
		 */
		if (i == nitems ||
			multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
		{
			/*
			 * If no violations were found in the group then track the rows of
			 * the group as supporting the functional dependency.
			 */
			if (n_violations == 0)
				n_supporting_rows += group_size;

			/* Reset counters for the new group (row i starts it, size 1) */
			n_violations = 0;
			group_size = 1;
			continue;
		}
		/* first columns match, but the last one does not (so contradicting) */
		else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0)
			n_violations++;

		group_size++;
	}

	/* Compute the 'degree of validity' as (supporting/total). */
	return (n_supporting_rows * 1.0 / data->numrows);
}
| |
/*
 * detects functional dependencies between groups of columns
 *
 * Generates all possible subsets of columns (variations) and computes
 * the degree of validity for each one. For example when creating statistics
 * on three columns (a,b,c) there are 9 possible dependencies
 *
 *	   two columns			  three columns
 *	   -----------			  -------------
 *	   (a) -> b				  (a,b) -> c
 *	   (a) -> c				  (a,c) -> b
 *	   (b) -> a				  (b,c) -> a
 *	   (b) -> c
 *	   (c) -> a
 *	   (c) -> b
 *
 * Returns NULL when no dependency has a non-zero degree; otherwise a
 * palloc'd MVDependencies holding all dependencies with degree > 0.
 */
MVDependencies *
statext_dependencies_build(StatsBuildData *data)
{
	int			i,
				k;

	/* result */
	MVDependencies *dependencies = NULL;
	MemoryContext cxt;

	Assert(data->nattnums >= 2);

	/*
	 * tracks memory allocated by dependency_degree calls; reset after each
	 * call so the (potentially large) sort arrays don't accumulate
	 */
	cxt = AllocSetContextCreate(CurrentMemoryContext,
								"dependency_degree cxt",
								ALLOCSET_DEFAULT_SIZES);

	/*
	 * We'll try build functional dependencies starting from the smallest ones
	 * covering just 2 columns, to the largest ones, covering all columns
	 * included in the statistics object.  We start from the smallest ones
	 * because we want to be able to skip already implied ones.
	 */
	for (k = 2; k <= data->nattnums; k++)
	{
		AttrNumber *dependency; /* array with k elements */

		/* prepare a DependencyGenerator of variation */
		DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);

		/* generate all possible variations of k values (out of n) */
		while ((dependency = DependencyGenerator_next(DependencyGenerator)))
		{
			double		degree;
			MVDependency *d;
			MemoryContext oldcxt;

			/* release memory used by dependency degree calculation */
			oldcxt = MemoryContextSwitchTo(cxt);

			/* compute how valid the dependency seems */
			degree = dependency_degree(data, k, dependency);

			MemoryContextSwitchTo(oldcxt);
			MemoryContextReset(cxt);

			/*
			 * if the dependency seems entirely invalid, don't store it
			 */
			if (degree == 0.0)
				continue;

			/* result must live outside cxt, which gets reset above */
			d = (MVDependency *) palloc0(offsetof(MVDependency, attributes)
										 + k * sizeof(AttrNumber));

			/* copy the dependency (and keep the indexes into stxkeys) */
			d->degree = degree;
			d->nattributes = k;
			for (i = 0; i < k; i++)
				d->attributes[i] = data->attnums[dependency[i]];

			/* initialize the list of dependencies */
			if (dependencies == NULL)
			{
				dependencies
					= (MVDependencies *) palloc0(sizeof(MVDependencies));

				dependencies->magic = STATS_DEPS_MAGIC;
				dependencies->type = STATS_DEPS_TYPE_BASIC;
				dependencies->ndeps = 0;
			}

			/* grow the flexible deps[] array by one and append */
			dependencies->ndeps++;
			dependencies = (MVDependencies *) repalloc(dependencies,
													   offsetof(MVDependencies, deps)
													   + dependencies->ndeps * sizeof(MVDependency *));

			dependencies->deps[dependencies->ndeps - 1] = d;
		}

		/*
		 * we're done with variations of k elements, so free the
		 * DependencyGenerator
		 */
		DependencyGenerator_free(DependencyGenerator);
	}

	MemoryContextDelete(cxt);

	return dependencies;
}
| |
| |
| /* |
| * Serialize list of dependencies into a bytea value. |
| */ |
| bytea * |
| statext_dependencies_serialize(MVDependencies *dependencies) |
| { |
| int i; |
| bytea *output; |
| char *tmp; |
| Size len; |
| |
| /* we need to store ndeps, with a number of attributes for each one */ |
| len = VARHDRSZ + SizeOfHeader; |
| |
| /* and also include space for the actual attribute numbers and degrees */ |
| for (i = 0; i < dependencies->ndeps; i++) |
| len += SizeOfItem(dependencies->deps[i]->nattributes); |
| |
| output = (bytea *) palloc0(len); |
| SET_VARSIZE(output, len); |
| |
| tmp = VARDATA(output); |
| |
| /* Store the base struct values (magic, type, ndeps) */ |
| memcpy(tmp, &dependencies->magic, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(tmp, &dependencies->type, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(tmp, &dependencies->ndeps, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| |
| /* store number of attributes and attribute numbers for each dependency */ |
| for (i = 0; i < dependencies->ndeps; i++) |
| { |
| MVDependency *d = dependencies->deps[i]; |
| |
| memcpy(tmp, &d->degree, sizeof(double)); |
| tmp += sizeof(double); |
| |
| memcpy(tmp, &d->nattributes, sizeof(AttrNumber)); |
| tmp += sizeof(AttrNumber); |
| |
| memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes); |
| tmp += sizeof(AttrNumber) * d->nattributes; |
| |
| /* protect against overflow */ |
| Assert(tmp <= ((char *) output + len)); |
| } |
| |
| /* make sure we've produced exactly the right amount of data */ |
| Assert(tmp == ((char *) output + len)); |
| |
| return output; |
| } |
| |
| /* |
| * Reads serialized dependencies into MVDependencies structure. |
| */ |
| MVDependencies * |
| statext_dependencies_deserialize(bytea *data) |
| { |
| int i; |
| Size min_expected_size; |
| MVDependencies *dependencies; |
| char *tmp; |
| |
| if (data == NULL) |
| return NULL; |
| |
| if (VARSIZE_ANY_EXHDR(data) < SizeOfHeader) |
| elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)", |
| VARSIZE_ANY_EXHDR(data), SizeOfHeader); |
| |
| /* read the MVDependencies header */ |
| dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); |
| |
| /* initialize pointer to the data part (skip the varlena header) */ |
| tmp = VARDATA_ANY(data); |
| |
| /* read the header fields and perform basic sanity checks */ |
| memcpy(&dependencies->magic, tmp, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(&dependencies->type, tmp, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| memcpy(&dependencies->ndeps, tmp, sizeof(uint32)); |
| tmp += sizeof(uint32); |
| |
| if (dependencies->magic != STATS_DEPS_MAGIC) |
| elog(ERROR, "invalid dependency magic %d (expected %d)", |
| dependencies->magic, STATS_DEPS_MAGIC); |
| |
| if (dependencies->type != STATS_DEPS_TYPE_BASIC) |
| elog(ERROR, "invalid dependency type %d (expected %d)", |
| dependencies->type, STATS_DEPS_TYPE_BASIC); |
| |
| if (dependencies->ndeps == 0) |
| elog(ERROR, "invalid zero-length item array in MVDependencies"); |
| |
| /* what minimum bytea size do we expect for those parameters */ |
| min_expected_size = SizeOfItem(dependencies->ndeps); |
| |
| if (VARSIZE_ANY_EXHDR(data) < min_expected_size) |
| elog(ERROR, "invalid dependencies size %zd (expected at least %zd)", |
| VARSIZE_ANY_EXHDR(data), min_expected_size); |
| |
| /* allocate space for the MCV items */ |
| dependencies = repalloc(dependencies, offsetof(MVDependencies, deps) |
| + (dependencies->ndeps * sizeof(MVDependency *))); |
| |
| for (i = 0; i < dependencies->ndeps; i++) |
| { |
| double degree; |
| AttrNumber k; |
| MVDependency *d; |
| |
| /* degree of validity */ |
| memcpy(°ree, tmp, sizeof(double)); |
| tmp += sizeof(double); |
| |
| /* number of attributes */ |
| memcpy(&k, tmp, sizeof(AttrNumber)); |
| tmp += sizeof(AttrNumber); |
| |
| /* is the number of attributes valid? */ |
| Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS)); |
| |
| /* now that we know the number of attributes, allocate the dependency */ |
| d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) |
| + (k * sizeof(AttrNumber))); |
| |
| d->degree = degree; |
| d->nattributes = k; |
| |
| /* copy attribute numbers */ |
| memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes); |
| tmp += sizeof(AttrNumber) * d->nattributes; |
| |
| dependencies->deps[i] = d; |
| |
| /* still within the bytea */ |
| Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); |
| } |
| |
| /* we should have consumed the whole bytea exactly */ |
| Assert(tmp == ((char *) data + VARSIZE_ANY(data))); |
| |
| return dependencies; |
| } |
| |
| /* |
| * dependency_is_fully_matched |
| * checks that a functional dependency is fully matched given clauses on |
| * attributes (assuming the clauses are suitable equality clauses) |
| */ |
| static bool |
| dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums) |
| { |
| int j; |
| |
| /* |
| * Check that the dependency actually is fully covered by clauses. We have |
| * to translate all attribute numbers, as those are referenced |
| */ |
| for (j = 0; j < dependency->nattributes; j++) |
| { |
| int attnum = dependency->attributes[j]; |
| |
| if (!bms_is_member(attnum, attnums)) |
| return false; |
| } |
| |
| return true; |
| } |
| |
/*
 * statext_dependencies_load
 *		Load the functional dependencies for the indicated pg_statistic_ext tuple
 *
 * mvoid is the OID used to look up the pg_statistic_ext_data row via the
 * STATEXTDATASTXOID syscache.  If the stxddependencies column is NULL,
 * either return NULL (allow_null == true) or raise an error.  The returned
 * structure is palloc'd from the deserialized bytea; caller owns it.
 */
MVDependencies *
statext_dependencies_load(Oid mvoid, bool allow_null)
{
	MVDependencies *result;
	bool		isnull;
	Datum		deps;
	HeapTuple	htup;

	htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(mvoid));
	if (!HeapTupleIsValid(htup))
		elog(ERROR, "cache lookup failed for statistics object %u", mvoid);

	deps = SysCacheGetAttr(STATEXTDATASTXOID, htup,
						   Anum_pg_statistic_ext_data_stxddependencies, &isnull);
	if (isnull)
	{
		if (!allow_null)
		{
			elog(ERROR,
				 "requested statistics kind \"%c\" is not yet built for statistics object %u",
				 STATS_EXT_DEPENDENCIES, mvoid);
		}

		/* release the cache reference before bailing out */
		ReleaseSysCache(htup);
		return NULL;
	}

	result = statext_dependencies_deserialize(DatumGetByteaPP(deps));

	ReleaseSysCache(htup);

	return result;
}
| |
/*
 * pg_dependencies_in		- input routine for type pg_dependencies.
 *
 * pg_dependencies is real enough to be a table column, but it has no operations
 * of its own, and disallows input too
 */
Datum
pg_dependencies_in(PG_FUNCTION_ARGS)
{
	/*
	 * pg_dependencies stores the data in binary form and parsing text input
	 * is not needed, so disallow this.
	 */
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("cannot accept a value of type %s", "pg_dependencies")));

	PG_RETURN_VOID();			/* keep compiler quiet */
}
| |
| /* |
| * pg_dependencies - output routine for type pg_dependencies. |
| */ |
| Datum |
| pg_dependencies_out(PG_FUNCTION_ARGS) |
| { |
| bytea *data = PG_GETARG_BYTEA_PP(0); |
| MVDependencies *dependencies = statext_dependencies_deserialize(data); |
| int i, |
| j; |
| StringInfoData str; |
| |
| initStringInfo(&str); |
| appendStringInfoChar(&str, '{'); |
| |
| for (i = 0; i < dependencies->ndeps; i++) |
| { |
| MVDependency *dependency = dependencies->deps[i]; |
| |
| if (i > 0) |
| appendStringInfoString(&str, ", "); |
| |
| appendStringInfoChar(&str, '"'); |
| for (j = 0; j < dependency->nattributes; j++) |
| { |
| if (j == dependency->nattributes - 1) |
| appendStringInfoString(&str, " => "); |
| else if (j > 0) |
| appendStringInfoString(&str, ", "); |
| |
| appendStringInfo(&str, "%d", dependency->attributes[j]); |
| } |
| appendStringInfo(&str, "\": %f", dependency->degree); |
| } |
| |
| appendStringInfoChar(&str, '}'); |
| |
| PG_RETURN_CSTRING(str.data); |
| } |
| |
/*
 * pg_dependencies_recv		- binary input routine for type pg_dependencies.
 *
 * Like the text input routine, binary input is disallowed; values of this
 * type are only ever produced internally by ANALYZE.
 */
Datum
pg_dependencies_recv(PG_FUNCTION_ARGS)
{
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("cannot accept a value of type %s", "pg_dependencies")));

	PG_RETURN_VOID();			/* keep compiler quiet */
}
| |
/*
 * pg_dependencies_send		- binary output routine for type pg_dependencies.
 *
 * Functional dependencies are serialized in a bytea value (although the type
 * is named differently), so let's just send that.
 */
Datum
pg_dependencies_send(PG_FUNCTION_ARGS)
{
	/* delegate directly to bytea's send function */
	return byteasend(fcinfo);
}
| |
/*
 * dependency_is_compatible_clause
 *		Determines if the clause is compatible with functional dependencies
 *
 * Only clauses that have the form of equality to a pseudoconstant, or can be
 * interpreted that way, are currently accepted.  Furthermore the variable
 * part of the clause must be a simple Var belonging to the specified
 * relation, whose attribute number we return in *attnum on success.
 *
 * Accepted shapes: (Var = Const), (Const = Var), (Var IN (...)) via
 * ScalarArrayOpExpr with useOr, OR-clauses whose arms all reference the
 * same attribute, NOT-clauses (treated as "x = false"), and bare boolean
 * Vars (treated as "x = true").  On a false return *attnum may have been
 * clobbered (the OR branch writes it before failing).
 */
static bool
dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
{
	Var		   *var;
	Node	   *clause_expr;

	if (IsA(clause, RestrictInfo))
	{
		RestrictInfo *rinfo = (RestrictInfo *) clause;

		/* Pseudoconstants are not interesting (they couldn't contain a Var) */
		if (rinfo->pseudoconstant)
			return false;

		/* Clauses referencing multiple, or no, varnos are incompatible */
		if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
			return false;

		/* unwrap and analyze the contained clause */
		clause = (Node *) rinfo->clause;
	}

	if (is_opclause(clause))
	{
		/* If it's an opclause, check for Var = Const or Const = Var. */
		OpExpr	   *expr = (OpExpr *) clause;

		/* Only expressions with two arguments are candidates. */
		if (list_length(expr->args) != 2)
			return false;

		/* Make sure non-selected argument is a pseudoconstant. */
		if (is_pseudo_constant_clause(lsecond(expr->args)))
			clause_expr = linitial(expr->args);
		else if (is_pseudo_constant_clause(linitial(expr->args)))
			clause_expr = lsecond(expr->args);
		else
			return false;

		/*
		 * If it's not an "=" operator, just ignore the clause, as it's not
		 * compatible with functional dependencies.
		 *
		 * This uses the function for estimating selectivity, not the operator
		 * directly (a bit awkward, but well ...).
		 *
		 * XXX this is pretty dubious; probably it'd be better to check btree
		 * or hash opclass membership, so as not to be fooled by custom
		 * selectivity functions, and to be more consistent with decisions
		 * elsewhere in the planner.
		 */
		if (get_oprrest(expr->opno) != F_EQSEL)
			return false;

		/* OK to proceed with checking "var" */
	}
	else if (IsA(clause, ScalarArrayOpExpr))
	{
		/* If it's an scalar array operator, check for Var IN Const. */
		ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;

		/*
		 * Reject ALL() variant, we only care about ANY/IN.
		 *
		 * XXX Maybe we should check if all the values are the same, and allow
		 * ALL in that case? Doesn't seem very practical, though.
		 */
		if (!expr->useOr)
			return false;

		/* Only expressions with two arguments are candidates. */
		if (list_length(expr->args) != 2)
			return false;

		/*
		 * We know it's always (Var IN Const), so we assume the var is the
		 * first argument, and pseudoconstant is the second one.
		 */
		if (!is_pseudo_constant_clause(lsecond(expr->args)))
			return false;

		clause_expr = linitial(expr->args);

		/*
		 * If it's not an "=" operator, just ignore the clause, as it's not
		 * compatible with functional dependencies. The operator is identified
		 * simply by looking at which function it uses to estimate
		 * selectivity. That's a bit strange, but it's what other similar
		 * places do.
		 */
		if (get_oprrest(expr->opno) != F_EQSEL)
			return false;

		/* OK to proceed with checking "var" */
	}
	else if (is_orclause(clause))
	{
		BoolExpr   *bool_expr = (BoolExpr *) clause;
		ListCell   *lc;

		/* start with no attribute number */
		*attnum = InvalidAttrNumber;

		foreach(lc, bool_expr->args)
		{
			AttrNumber	clause_attnum;

			/*
			 * Had we found incompatible clause in the arguments, treat the
			 * whole clause as incompatible.
			 */
			if (!dependency_is_compatible_clause((Node *) lfirst(lc),
												 relid, &clause_attnum))
				return false;

			/* remember the attnum seen in the first arm */
			if (*attnum == InvalidAttrNumber)
				*attnum = clause_attnum;

			/* ensure all the variables are the same (same attnum) */
			if (*attnum != clause_attnum)
				return false;
		}

		/* the Var is already checked by the recursive call */
		return true;
	}
	else if (is_notclause(clause))
	{
		/*
		 * "NOT x" can be interpreted as "x = false", so get the argument and
		 * proceed with seeing if it's a suitable Var.
		 */
		clause_expr = (Node *) get_notclausearg(clause);
	}
	else
	{
		/*
		 * A boolean expression "x" can be interpreted as "x = true", so
		 * proceed with seeing if it's a suitable Var.
		 */
		clause_expr = (Node *) clause;
	}

	/*
	 * We may ignore any RelabelType node above the operand.  (There won't be
	 * more than one, since eval_const_expressions has been applied already.)
	 */
	if (IsA(clause_expr, RelabelType))
		clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;

	/* We only support plain Vars for now */
	if (!IsA(clause_expr, Var))
		return false;

	/* OK, we know we have a Var */
	var = (Var *) clause_expr;

	/* Ensure Var is from the correct relation */
	if (var->varno != relid)
		return false;

	/* We also better ensure the Var is from the current level */
	if (var->varlevelsup != 0)
		return false;

	/* Also ignore system attributes (we don't allow stats on those) */
	if (!AttrNumberIsForUserDefinedAttr(var->varattno))
		return false;

	*attnum = var->varattno;
	return true;
}
| |
| /* |
| * find_strongest_dependency |
| * find the strongest dependency on the attributes |
| * |
| * When applying functional dependencies, we start with the strongest |
| * dependencies. That is, we select the dependency that: |
| * |
| * (a) has all attributes covered by equality clauses |
| * |
| * (b) has the most attributes |
| * |
| * (c) has the highest degree of validity |
| * |
| * This guarantees that we eliminate the most redundant conditions first |
| * (see the comment in dependencies_clauselist_selectivity). |
| */ |
| static MVDependency * |
| find_strongest_dependency(MVDependencies **dependencies, int ndependencies, |
| Bitmapset *attnums) |
| { |
| int i, |
| j; |
| MVDependency *strongest = NULL; |
| |
| /* number of attnums in clauses */ |
| int nattnums = bms_num_members(attnums); |
| |
| /* |
| * Iterate over the MVDependency items and find the strongest one from the |
| * fully-matched dependencies. We do the cheap checks first, before |
| * matching it against the attnums. |
| */ |
| for (i = 0; i < ndependencies; i++) |
| { |
| for (j = 0; j < dependencies[i]->ndeps; j++) |
| { |
| MVDependency *dependency = dependencies[i]->deps[j]; |
| |
| /* |
| * Skip dependencies referencing more attributes than available |
| * clauses, as those can't be fully matched. |
| */ |
| if (dependency->nattributes > nattnums) |
| continue; |
| |
| if (strongest) |
| { |
| /* skip dependencies on fewer attributes than the strongest. */ |
| if (dependency->nattributes < strongest->nattributes) |
| continue; |
| |
| /* also skip weaker dependencies when attribute count matches */ |
| if (strongest->nattributes == dependency->nattributes && |
| strongest->degree > dependency->degree) |
| continue; |
| } |
| |
| /* |
| * this dependency is stronger, but we must still check that it's |
| * fully matched to these attnums. We perform this check last as |
| * it's slightly more expensive than the previous checks. |
| */ |
| if (dependency_is_fully_matched(dependency, attnums)) |
| strongest = dependency; /* save new best match */ |
| } |
| } |
| |
| return strongest; |
| } |
| |
| /* |
| * clauselist_apply_dependencies |
| * Apply the specified functional dependencies to a list of clauses and |
| * return the estimated selectivity of the clauses that are compatible |
| * with any of the given dependencies. |
| * |
| * This will estimate all not-already-estimated clauses that are compatible |
| * with functional dependencies, and which have an attribute mentioned by any |
| * of the given dependencies (either as an implying or implied attribute). |
| * |
| * Given (lists of) clauses on attributes (a,b) and a functional dependency |
| * (a=>b), the per-column selectivities P(a) and P(b) are notionally combined |
| * using the formula |
| * |
| * P(a,b) = f * P(a) + (1-f) * P(a) * P(b) |
| * |
| * where 'f' is the degree of dependency. This reflects the fact that we |
| * expect a fraction f of all rows to be consistent with the dependency |
| * (a=>b), and so have a selectivity of P(a), while the remaining rows are |
| * treated as independent. |
| * |
| * In practice, we use a slightly modified version of this formula, which uses |
| * a selectivity of Min(P(a), P(b)) for the dependent rows, since the result |
| * should obviously not exceed either column's individual selectivity. I.e., |
| * we actually combine selectivities using the formula |
| * |
| * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) |
| * |
| * This can make quite a difference if the specific values matching the |
| * clauses are not consistent with the functional dependency. |
| */ |
| static Selectivity |
| clauselist_apply_dependencies(PlannerInfo *root, List *clauses, |
| int varRelid, JoinType jointype, |
| SpecialJoinInfo *sjinfo, |
| MVDependency **dependencies, int ndependencies, |
| AttrNumber *list_attnums, |
| Bitmapset **estimatedclauses) |
| { |
| Bitmapset *attnums; |
| int i; |
| int j; |
| int nattrs; |
| Selectivity *attr_sel; |
| int attidx; |
| int listidx; |
| ListCell *l; |
| Selectivity s1; |
| |
| /* |
| * Extract the attnums of all implying and implied attributes from all the |
| * given dependencies. Each of these attributes is expected to have at |
| * least 1 not-already-estimated compatible clause that we will estimate |
| * here. |
| */ |
| attnums = NULL; |
| for (i = 0; i < ndependencies; i++) |
| { |
| for (j = 0; j < dependencies[i]->nattributes; j++) |
| { |
| AttrNumber attnum = dependencies[i]->attributes[j]; |
| |
| attnums = bms_add_member(attnums, attnum); |
| } |
| } |
| |
| /* |
| * Compute per-column selectivity estimates for each of these attributes, |
| * and mark all the corresponding clauses as estimated. |
| */ |
| nattrs = bms_num_members(attnums); |
| attr_sel = (Selectivity *) palloc(sizeof(Selectivity) * nattrs); |
| |
| attidx = 0; |
| i = -1; |
| while ((i = bms_next_member(attnums, i)) >= 0) |
| { |
| List *attr_clauses = NIL; |
| Selectivity simple_sel; |
| |
| listidx = -1; |
| foreach(l, clauses) |
| { |
| Node *clause = (Node *) lfirst(l); |
| |
| listidx++; |
| if (list_attnums[listidx] == i) |
| { |
| attr_clauses = lappend(attr_clauses, clause); |
| *estimatedclauses = bms_add_member(*estimatedclauses, listidx); |
| } |
| } |
| |
| simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid, |
| jointype, sjinfo, false, false); |
| attr_sel[attidx++] = simple_sel; |
| } |
| |
| /* |
| * Now combine these selectivities using the dependency information. For |
| * chains of dependencies such as a -> b -> c, the b -> c dependency will |
| * come before the a -> b dependency in the array, so we traverse the |
| * array backwards to ensure such chains are computed in the right order. |
| * |
| * As explained above, pairs of selectivities are combined using the |
| * formula |
| * |
| * P(a,b) = f * Min(P(a), P(b)) + (1-f) * P(a) * P(b) |
| * |
| * to ensure that the combined selectivity is never greater than either |
| * individual selectivity. |
| * |
| * Where multiple dependencies apply (e.g., a -> b -> c), we use |
| * conditional probabilities to compute the overall result as follows: |
| * |
| * P(a,b,c) = P(c|a,b) * P(a,b) = P(c|a,b) * P(b|a) * P(a) |
| * |
| * so we replace the selectivities of all implied attributes with |
| * conditional probabilities, that are conditional on all their implying |
| * attributes. The selectivities of all other non-implied attributes are |
| * left as they are. |
| */ |
| for (i = ndependencies - 1; i >= 0; i--) |
| { |
| MVDependency *dependency = dependencies[i]; |
| AttrNumber attnum; |
| Selectivity s2; |
| double f; |
| |
| /* Selectivity of all the implying attributes */ |
| s1 = 1.0; |
| for (j = 0; j < dependency->nattributes - 1; j++) |
| { |
| attnum = dependency->attributes[j]; |
| attidx = bms_member_index(attnums, attnum); |
| s1 *= attr_sel[attidx]; |
| } |
| |
| /* Original selectivity of the implied attribute */ |
| attnum = dependency->attributes[j]; |
| attidx = bms_member_index(attnums, attnum); |
| s2 = attr_sel[attidx]; |
| |
| /* |
| * Replace s2 with the conditional probability s2 given s1, computed |
| * using the formula P(b|a) = P(a,b) / P(a), which simplifies to |
| * |
| * P(b|a) = f * Min(P(a), P(b)) / P(a) + (1-f) * P(b) |
| * |
| * where P(a) = s1, the selectivity of the implying attributes, and |
| * P(b) = s2, the selectivity of the implied attribute. |
| */ |
| f = dependency->degree; |
| |
| if (s1 <= s2) |
| attr_sel[attidx] = f + (1 - f) * s2; |
| else |
| attr_sel[attidx] = f * s2 / s1 + (1 - f) * s2; |
| } |
| |
| /* |
| * The overall selectivity of all the clauses on all these attributes is |
| * then the product of all the original (non-implied) probabilities and |
| * the new conditional (implied) probabilities. |
| */ |
| s1 = 1.0; |
| for (i = 0; i < nattrs; i++) |
| s1 *= attr_sel[i]; |
| |
| CLAMP_PROBABILITY(s1); |
| |
| pfree(attr_sel); |
| bms_free(attnums); |
| |
| return s1; |
| } |
| |
| /* |
| * dependency_is_compatible_expression |
| * Determines if the expression is compatible with functional dependencies |
| * |
| * Similar to dependency_is_compatible_clause, but doesn't enforce that the |
| * expression is a simple Var. OTOH we check that there's at least one |
| * statistics object matching the expression. |
| */ |
| static bool |
| dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr) |
| { |
| List *vars; |
| ListCell *lc, |
| *lc2; |
| Node *clause_expr; |
| |
| if (IsA(clause, RestrictInfo)) |
| { |
| RestrictInfo *rinfo = (RestrictInfo *) clause; |
| |
| /* Pseudoconstants are not interesting (they couldn't contain a Var) */ |
| if (rinfo->pseudoconstant) |
| return false; |
| |
| /* Clauses referencing multiple, or no, varnos are incompatible */ |
| if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) |
| return false; |
| |
| clause = (Node *) rinfo->clause; |
| } |
| |
| if (is_opclause(clause)) |
| { |
| /* If it's an opclause, check for Var = Const or Const = Var. */ |
| OpExpr *expr = (OpExpr *) clause; |
| |
| /* Only expressions with two arguments are candidates. */ |
| if (list_length(expr->args) != 2) |
| return false; |
| |
| /* Make sure non-selected argument is a pseudoconstant. */ |
| if (is_pseudo_constant_clause(lsecond(expr->args))) |
| clause_expr = linitial(expr->args); |
| else if (is_pseudo_constant_clause(linitial(expr->args))) |
| clause_expr = lsecond(expr->args); |
| else |
| return false; |
| |
| /* |
| * If it's not an "=" operator, just ignore the clause, as it's not |
| * compatible with functional dependencies. |
| * |
| * This uses the function for estimating selectivity, not the operator |
| * directly (a bit awkward, but well ...). |
| * |
| * XXX this is pretty dubious; probably it'd be better to check btree |
| * or hash opclass membership, so as not to be fooled by custom |
| * selectivity functions, and to be more consistent with decisions |
| * elsewhere in the planner. |
| */ |
| if (get_oprrest(expr->opno) != F_EQSEL) |
| return false; |
| |
| /* OK to proceed with checking "var" */ |
| } |
| else if (IsA(clause, ScalarArrayOpExpr)) |
| { |
| /* If it's an scalar array operator, check for Var IN Const. */ |
| ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause; |
| |
| /* |
| * Reject ALL() variant, we only care about ANY/IN. |
| * |
| * FIXME Maybe we should check if all the values are the same, and |
| * allow ALL in that case? Doesn't seem very practical, though. |
| */ |
| if (!expr->useOr) |
| return false; |
| |
| /* Only expressions with two arguments are candidates. */ |
| if (list_length(expr->args) != 2) |
| return false; |
| |
| /* |
| * We know it's always (Var IN Const), so we assume the var is the |
| * first argument, and pseudoconstant is the second one. |
| */ |
| if (!is_pseudo_constant_clause(lsecond(expr->args))) |
| return false; |
| |
| clause_expr = linitial(expr->args); |
| |
| /* |
| * If it's not an "=" operator, just ignore the clause, as it's not |
| * compatible with functional dependencies. The operator is identified |
| * simply by looking at which function it uses to estimate |
| * selectivity. That's a bit strange, but it's what other similar |
| * places do. |
| */ |
| if (get_oprrest(expr->opno) != F_EQSEL) |
| return false; |
| |
| /* OK to proceed with checking "var" */ |
| } |
| else if (is_orclause(clause)) |
| { |
| BoolExpr *bool_expr = (BoolExpr *) clause; |
| ListCell *lc; |
| |
| /* start with no expression (we'll use the first match) */ |
| *expr = NULL; |
| |
| foreach(lc, bool_expr->args) |
| { |
| Node *or_expr = NULL; |
| |
| /* |
| * Had we found incompatible expression in the arguments, treat |
| * the whole expression as incompatible. |
| */ |
| if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid, |
| statlist, &or_expr)) |
| return false; |
| |
| if (*expr == NULL) |
| *expr = or_expr; |
| |
| /* ensure all the expressions are the same */ |
| if (!equal(or_expr, *expr)) |
| return false; |
| } |
| |
| /* the expression is already checked by the recursive call */ |
| return true; |
| } |
| else if (is_notclause(clause)) |
| { |
| /* |
| * "NOT x" can be interpreted as "x = false", so get the argument and |
| * proceed with seeing if it's a suitable Var. |
| */ |
| clause_expr = (Node *) get_notclausearg(clause); |
| } |
| else |
| { |
| /* |
| * A boolean expression "x" can be interpreted as "x = true", so |
| * proceed with seeing if it's a suitable Var. |
| */ |
| clause_expr = (Node *) clause; |
| } |
| |
| /* |
| * We may ignore any RelabelType node above the operand. (There won't be |
| * more than one, since eval_const_expressions has been applied already.) |
| */ |
| if (IsA(clause_expr, RelabelType)) |
| clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; |
| |
| vars = pull_var_clause(clause_expr, 0); |
| |
| foreach(lc, vars) |
| { |
| Var *var = (Var *) lfirst(lc); |
| |
| /* Ensure Var is from the correct relation */ |
| if (var->varno != relid) |
| return false; |
| |
| /* We also better ensure the Var is from the current level */ |
| if (var->varlevelsup != 0) |
| return false; |
| |
| /* Also ignore system attributes (we don't allow stats on those) */ |
| if (!AttrNumberIsForUserDefinedAttr(var->varattno)) |
| return false; |
| } |
| |
| /* |
| * Check if we actually have a matching statistics for the expression. |
| * |
| * XXX Maybe this is an overkill. We'll eliminate the expressions later. |
| */ |
| foreach(lc, statlist) |
| { |
| StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); |
| |
| /* ignore stats without dependencies */ |
| if (info->kind != STATS_EXT_DEPENDENCIES) |
| continue; |
| |
| foreach(lc2, info->exprs) |
| { |
| Node *stat_expr = (Node *) lfirst(lc2); |
| |
| if (equal(clause_expr, stat_expr)) |
| { |
| *expr = stat_expr; |
| return true; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| /* |
| * dependencies_clauselist_selectivity |
| * Return the estimated selectivity of (a subset of) the given clauses |
| * using functional dependency statistics, or 1.0 if no useful functional |
| * dependency statistic exists. |
| * |
| * 'estimatedclauses' is an input/output argument that gets a bit set |
| * corresponding to the (zero-based) list index of each clause that is included |
| * in the estimated selectivity. |
| * |
| * Given equality clauses on attributes (a,b) we find the strongest dependency |
| * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected |
| * dependency, we then combine the per-clause selectivities using the formula |
| * |
| * P(a,b) = f * P(a) + (1-f) * P(a) * P(b) |
| * |
| * where 'f' is the degree of the dependency. (Actually we use a slightly |
| * modified version of this formula -- see clauselist_apply_dependencies()). |
| * |
| * With clauses on more than two attributes, the dependencies are applied |
| * recursively, starting with the widest/strongest dependencies. For example |
| * P(a,b,c) is first split like this: |
| * |
| * P(a,b,c) = f * P(a,b) + (1-f) * P(a,b) * P(c) |
| * |
| * assuming (a,b=>c) is the strongest dependency. |
| */ |
Selectivity
dependencies_clauselist_selectivity(PlannerInfo *root,
									List *clauses,
									int varRelid,
									JoinType jointype,
									SpecialJoinInfo *sjinfo,
									RelOptInfo *rel,
									Bitmapset **estimatedclauses)
{
	Selectivity s1 = 1.0;
	ListCell   *l;
	Bitmapset  *clauses_attnums = NULL;
	AttrNumber *list_attnums;
	int			listidx;
	MVDependencies **func_dependencies;
	int			nfunc_dependencies;
	int			total_ndeps;
	MVDependency **dependencies;
	int			ndependencies;
	int			i;
	AttrNumber	attnum_offset;
	RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);

	/* unique expressions (assigned negative attnums, see below) */
	Node	  **unique_exprs;
	int			unique_exprs_cnt;

	/*
	 * When dealing with regular inheritance trees, ignore extended stats
	 * (which were built without data from child rels, and thus do not
	 * represent them). For partitioned tables there's no data in the
	 * non-leaf relations, so we build stats only for the inheritance tree.
	 * So for partitioned tables we do consider extended stats.
	 */
	if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE)
		return 1.0;

	/* check if there's any stats that might be useful for us. */
	if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
		return 1.0;

	/* per-clause attnum (or InvalidAttrNumber for incompatible clauses) */
	list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
										 list_length(clauses));

	/*
	 * We allocate space as if every clause was a unique expression, although
	 * that's probably overkill. Some will be simple column references that
	 * we'll translate to attnums, and there might be duplicates. But it's
	 * easier and cheaper to just do one allocation than repalloc later.
	 */
	unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
	unique_exprs_cnt = 0;

	/*
	 * Pre-process the clauses list to extract the attnums seen in each item.
	 * We need to determine if there's any clauses which will be useful for
	 * dependency selectivity estimations. Along the way we'll record all of
	 * the attnums for each clause in a list which we'll reference later so we
	 * don't need to repeat the same work again. We'll also keep track of all
	 * attnums seen.
	 *
	 * We also skip clauses that we already estimated using different types of
	 * statistics (we treat them as incompatible).
	 *
	 * To handle expressions, we assign them negative attnums, as if it was a
	 * system attribute (this is fine, as we only allow extended stats on user
	 * attributes). And then we offset everything by the number of
	 * expressions, so that we can store the values in a bitmapset.
	 */
	listidx = 0;
	foreach(l, clauses)
	{
		Node	   *clause = (Node *) lfirst(l);
		AttrNumber	attnum;
		Node	   *expr = NULL;

		/* ignore clause by default */
		list_attnums[listidx] = InvalidAttrNumber;

		if (!bms_is_member(listidx, *estimatedclauses))
		{
			/*
			 * If it's a simple column reference, just extract the attnum. If
			 * it's an expression, assign a negative attnum as if it was a
			 * system attribute.
			 */
			if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
			{
				list_attnums[listidx] = attnum;
			}
			else if (dependency_is_compatible_expression(clause, rel->relid,
														 rel->statlist,
														 &expr))
			{
				/* special attnum assigned to this expression */
				attnum = InvalidAttrNumber;

				Assert(expr != NULL);

				/* If the expression is duplicate, use the same attnum. */
				for (i = 0; i < unique_exprs_cnt; i++)
				{
					if (equal(unique_exprs[i], expr))
					{
						/* negative attribute number to expression */
						attnum = -(i + 1);
						break;
					}
				}

				/* not found in the list, so add it */
				if (attnum == InvalidAttrNumber)
				{
					unique_exprs[unique_exprs_cnt++] = expr;

					/* after incrementing the value, to get -1, -2, ... */
					attnum = (-unique_exprs_cnt);
				}

				/* remember which attnum was assigned to this clause */
				list_attnums[listidx] = attnum;
			}
		}

		listidx++;
	}

	Assert(listidx == list_length(clauses));

	/*
	 * How much we need to offset the attnums? If there are no expressions,
	 * then no offset is needed. Otherwise we need to offset enough for the
	 * lowest value (-unique_exprs_cnt) to become 1.
	 */
	if (unique_exprs_cnt > 0)
		attnum_offset = (unique_exprs_cnt + 1);
	else
		attnum_offset = 0;

	/*
	 * Now that we know how many expressions there are, we can offset the
	 * values just enough to build the bitmapset.
	 */
	for (i = 0; i < list_length(clauses); i++)
	{
		AttrNumber	attnum;

		/* ignore incompatible or already estimated clauses */
		if (list_attnums[i] == InvalidAttrNumber)
			continue;

		/* make sure the attnum is in the expected range */
		Assert(list_attnums[i] >= (-unique_exprs_cnt));
		Assert(list_attnums[i] <= MaxHeapAttributeNumber);

		/* make sure the attnum is positive (valid AttrNumber) */
		attnum = list_attnums[i] + attnum_offset;

		/*
		 * Either it's a regular attribute, or it's an expression, in which
		 * case we must not have seen it before (expressions are unique).
		 *
		 * XXX Check whether it's a regular attribute has to be done using the
		 * original attnum, while the second check has to use the value with
		 * an offset.
		 */
		Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
			   !bms_is_member(attnum, clauses_attnums));

		/*
		 * Remember the offset attnum, both for attributes and expressions.
		 * We'll pass list_attnums to clauselist_apply_dependencies, which
		 * uses it to identify clauses in a bitmap. We could also pass the
		 * offset, but this is more convenient.
		 */
		list_attnums[i] = attnum;

		clauses_attnums = bms_add_member(clauses_attnums, attnum);
	}

	/*
	 * If there's not at least two distinct attnums and expressions, then
	 * reject the whole list of clauses. We must return 1.0 so the calling
	 * function's selectivity is unaffected.
	 */
	if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
	{
		bms_free(clauses_attnums);
		pfree(list_attnums);
		return 1.0;
	}

	/*
	 * Load all functional dependencies matching at least two parameters. We
	 * can simply consider all dependencies at once, without having to search
	 * for the best statistics object.
	 *
	 * To not waste cycles and memory, we deserialize dependencies only for
	 * statistics that match at least two attributes. The array is allocated
	 * with the assumption that all objects match - we could grow the array to
	 * make it just the right size, but it's likely wasteful anyway thanks to
	 * moving the freed chunks to freelists etc.
	 */
	func_dependencies = (MVDependencies **) palloc(sizeof(MVDependencies *) *
												   list_length(rel->statlist));
	nfunc_dependencies = 0;
	total_ndeps = 0;

	foreach(l, rel->statlist)
	{
		StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
		int			nmatched;
		int			nexprs;
		int			k;
		MVDependencies *deps;

		/* skip statistics that are not of the correct type */
		if (stat->kind != STATS_EXT_DEPENDENCIES)
			continue;

		/*
		 * Count matching attributes - we have to undo the attnum offsets. The
		 * input attribute numbers are not offset (expressions are not
		 * included in stat->keys, so it's not necessary). But we need to
		 * offset it before checking against clauses_attnums.
		 */
		nmatched = 0;
		k = -1;
		while ((k = bms_next_member(stat->keys, k)) >= 0)
		{
			AttrNumber	attnum = (AttrNumber) k;

			/* skip expressions */
			if (!AttrNumberIsForUserDefinedAttr(attnum))
				continue;

			/* apply the same offset as above */
			attnum += attnum_offset;

			if (bms_is_member(attnum, clauses_attnums))
				nmatched++;
		}

		/* count matching expressions */
		nexprs = 0;
		for (i = 0; i < unique_exprs_cnt; i++)
		{
			ListCell   *lc;

			foreach(lc, stat->exprs)
			{
				Node	   *stat_expr = (Node *) lfirst(lc);

				/* try to match it */
				if (equal(stat_expr, unique_exprs[i]))
					nexprs++;
			}
		}

		/*
		 * Skip objects matching fewer than two attributes/expressions from
		 * clauses.
		 */
		if (nmatched + nexprs < 2)
			continue;

		deps = statext_dependencies_load(stat->statOid, false);

		/*
		 * The expressions may be represented by different attnums in the
		 * stats, we need to remap them to be consistent with the clauses.
		 * That will make the later steps (e.g. picking the strongest item and
		 * so on) much simpler and cheaper, because it won't need to care
		 * about the offset at all.
		 *
		 * When we're at it, we can ignore dependencies that are not fully
		 * matched by clauses (i.e. referencing attributes or expressions that
		 * are not in the clauses).
		 *
		 * We have to do this for all statistics, as long as there are any
		 * expressions - we need to shift the attnums in all dependencies.
		 *
		 * XXX Maybe we should do this always, because it also eliminates some
		 * of the dependencies early. It might be cheaper than having to walk
		 * the longer list in find_strongest_dependency later, especially as
		 * we need to do that repeatedly?
		 *
		 * XXX We have to do this even when there are no expressions in
		 * clauses, otherwise find_strongest_dependency may fail for stats
		 * with expressions (due to lookup of negative value in bitmap). So we
		 * need to at least filter out those dependencies. Maybe we could do
		 * it in a cheaper way (if there are no expr clauses, we can just
		 * discard all negative attnums without any lookups).
		 */
		if (unique_exprs_cnt > 0 || stat->exprs != NIL)
		{
			int			ndeps = 0;

			for (i = 0; i < deps->ndeps; i++)
			{
				bool		skip = false;
				MVDependency *dep = deps->deps[i];
				int			j;

				for (j = 0; j < dep->nattributes; j++)
				{
					int			idx;
					Node	   *expr;
					int			k;
					AttrNumber	unique_attnum = InvalidAttrNumber;
					AttrNumber	attnum;

					/* undo the per-statistics offset */
					attnum = dep->attributes[j];

					/*
					 * For regular attributes we can simply check if it
					 * matches any clause. If there's no matching clause, we
					 * can just ignore it. We need to offset the attnum
					 * though.
					 */
					if (AttrNumberIsForUserDefinedAttr(attnum))
					{
						dep->attributes[j] = attnum + attnum_offset;

						if (!bms_is_member(dep->attributes[j], clauses_attnums))
						{
							skip = true;
							break;
						}

						continue;
					}

					/*
					 * the attnum should be a valid system attnum (-1, -2,
					 * ...)
					 */
					Assert(AttributeNumberIsValid(attnum));

					/*
					 * For expressions, we need to do two translations. First
					 * we have to translate the negative attnum to index in
					 * the list of expressions (in the statistics object).
					 * Then we need to see if there's a matching clause. The
					 * index of the unique expression determines the attnum
					 * (and we offset it).
					 */
					idx = -(1 + attnum);

					/* sanity check that the expression index is valid */
					Assert((idx >= 0) && (idx < list_length(stat->exprs)));

					expr = (Node *) list_nth(stat->exprs, idx);

					/* try to find the expression in the unique list */
					for (k = 0; k < unique_exprs_cnt; k++)
					{
						/*
						 * found a matching unique expression, use the attnum
						 * (derived from index of the unique expression)
						 */
						if (equal(unique_exprs[k], expr))
						{
							unique_attnum = -(k + 1) + attnum_offset;
							break;
						}
					}

					/*
					 * Found no matching expression, so we can simply skip
					 * this dependency, because there's no chance it will be
					 * fully covered.
					 */
					if (unique_attnum == InvalidAttrNumber)
					{
						skip = true;
						break;
					}

					/* otherwise remap it to the new attnum */
					dep->attributes[j] = unique_attnum;
				}

				/* if found a matching dependency, keep it */
				if (!skip)
				{
					/* maybe we've skipped something earlier, so move it */
					if (ndeps != i)
						deps->deps[ndeps] = deps->deps[i];

					ndeps++;
				}
			}

			deps->ndeps = ndeps;
		}

		/*
		 * It's possible we've removed all dependencies, in which case we
		 * don't bother adding it to the list.
		 */
		if (deps->ndeps > 0)
		{
			func_dependencies[nfunc_dependencies] = deps;
			total_ndeps += deps->ndeps;
			nfunc_dependencies++;
		}
	}

	/* if no matching stats could be found then we've nothing to do */
	if (nfunc_dependencies == 0)
	{
		pfree(func_dependencies);
		bms_free(clauses_attnums);
		pfree(list_attnums);
		pfree(unique_exprs);
		return 1.0;
	}

	/*
	 * Work out which dependencies we can apply, starting with the
	 * widest/strongest ones, and proceeding to smaller/weaker ones.
	 */
	dependencies = (MVDependency **) palloc(sizeof(MVDependency *) *
											total_ndeps);
	ndependencies = 0;

	while (true)
	{
		MVDependency *dependency;
		AttrNumber	attnum;

		/* the widest/strongest dependency, fully matched by clauses */
		dependency = find_strongest_dependency(func_dependencies,
											   nfunc_dependencies,
											   clauses_attnums);
		if (!dependency)
			break;

		dependencies[ndependencies++] = dependency;

		/* Ignore dependencies using this implied attribute in later loops */
		attnum = dependency->attributes[dependency->nattributes - 1];
		clauses_attnums = bms_del_member(clauses_attnums, attnum);
	}

	/*
	 * If we found applicable dependencies, use them to estimate all
	 * compatible clauses on attributes that they refer to.
	 */
	if (ndependencies != 0)
		s1 = clauselist_apply_dependencies(root, clauses, varRelid, jointype,
										   sjinfo, dependencies, ndependencies,
										   list_attnums, estimatedclauses);

	/* free deserialized functional dependencies (and then the array) */
	for (i = 0; i < nfunc_dependencies; i++)
		pfree(func_dependencies[i]);

	pfree(dependencies);
	pfree(func_dependencies);
	bms_free(clauses_attnums);
	pfree(list_attnums);
	pfree(unique_exprs);

	return s1;
}