diff options
Diffstat (limited to 'src/backend/statistics/dependencies.c')
-rw-r--r-- | src/backend/statistics/dependencies.c | 1079 |
1 files changed, 1079 insertions, 0 deletions
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c new file mode 100644 index 00000000000..fb958e1b0a5 --- /dev/null +++ b/src/backend/statistics/dependencies.c @@ -0,0 +1,1079 @@ +/*------------------------------------------------------------------------- + * + * dependencies.c + * POSTGRES functional dependencies + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/statistics/dependencies.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic_ext.h" +#include "lib/stringinfo.h" +#include "optimizer/clauses.h" +#include "optimizer/cost.h" +#include "optimizer/var.h" +#include "nodes/nodes.h" +#include "nodes/relation.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" +#include "utils/bytea.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + +/* + * Internal state for DependencyGenerator of dependencies. Dependencies are similar to + * k-permutations of n elements, except that the order does not matter for the + * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent. + */ +typedef struct DependencyGeneratorData +{ + int k; /* size of the dependency */ + int n; /* number of possible attributes */ + int current; /* next dependency to return (index) */ + AttrNumber ndependencies; /* number of dependencies generated */ + AttrNumber *dependencies; /* array of pre-generated dependencies */ +} DependencyGeneratorData; + +typedef DependencyGeneratorData *DependencyGenerator; + +static void generate_dependencies_recurse(DependencyGenerator state, + int index, AttrNumber start, AttrNumber *current); +static void generate_dependencies(DependencyGenerator state); +static DependencyGenerator DependencyGenerator_init(int n, int k); +static void DependencyGenerator_free(DependencyGenerator state); +static AttrNumber *DependencyGenerator_next(DependencyGenerator state); +static double dependency_degree(int numrows, HeapTuple *rows, int k, + AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); +static bool dependency_is_fully_matched(MVDependency *dependency, + Bitmapset *attnums); +static bool dependency_implies_attribute(MVDependency *dependency, + AttrNumber attnum); +static bool dependency_is_compatible_clause(Node *clause, Index relid, + AttrNumber *attnum); +static MVDependency *find_strongest_dependency(StatisticExtInfo *stats, + MVDependencies *dependencies, + Bitmapset *attnums); + +static void +generate_dependencies_recurse(DependencyGenerator state, int index, + AttrNumber start, AttrNumber *current) +{ + /* + * The generator handles the first (k-1) elements differently from the + * last element. + */ + if (index < (state->k - 1)) + { + AttrNumber i; + + /* + * The first (k-1) values have to be in ascending order, which we + * generate recursively. + */ + + for (i = start; i < state->n; i++) + { + current[index] = i; + generate_dependencies_recurse(state, (index + 1), (i + 1), current); + } + } + else + { + int i; + + /* + * the last element is the implied value, which does not respect the + * ascending order. We just need to check that the value is not in the + * first (k-1) elements. + */ + + for (i = 0; i < state->n; i++) + { + int j; + bool match = false; + + current[index] = i; + + for (j = 0; j < index; j++) + { + if (current[j] == i) + { + match = true; + break; + } + } + + /* + * If the value is not found in the first part of the dependency, + * we're done. + */ + if (!match) + { + state->dependencies = (AttrNumber *) repalloc(state->dependencies, + state->k * (state->ndependencies + 1) * sizeof(AttrNumber)); + memcpy(&state->dependencies[(state->k * state->ndependencies)], + current, state->k * sizeof(AttrNumber)); + state->ndependencies++; + } + } + } +} + +/* generate all dependencies (k-permutations of n elements) */ +static void +generate_dependencies(DependencyGenerator state) +{ + AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k); + + generate_dependencies_recurse(state, 0, 0, current); + + pfree(current); +} + +/* + * initialize the DependencyGenerator of variations, and prebuild the variations + * + * This pre-builds all the variations. We could also generate them in + * DependencyGenerator_next(), but this seems simpler. + */ +static DependencyGenerator +DependencyGenerator_init(int n, int k) +{ + DependencyGenerator state; + + Assert((n >= k) && (k > 0)); + + /* allocate the DependencyGenerator state */ + state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData)); + state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber)); + + state->ndependencies = 0; + state->current = 0; + state->k = k; + state->n = n; + + /* now actually pre-generate all the variations */ + generate_dependencies(state); + + return state; +} + +/* free the DependencyGenerator state */ +static void +DependencyGenerator_free(DependencyGenerator state) +{ + pfree(state->dependencies); + pfree(state); + +} + +/* generate next combination */ +static AttrNumber * +DependencyGenerator_next(DependencyGenerator state) +{ + if (state->current == state->ndependencies) + return NULL; + + return &state->dependencies[state->k * state->current++]; +} + + +/* + * validates functional dependency on the data + * + * An actual work horse of detecting functional dependencies. Given a variation + * of k attributes, it checks that the first (k-1) are sufficient to determine + * the last one. + */ +static double +dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, + VacAttrStats **stats, Bitmapset *attrs) +{ + int i, + j; + int nvalues = numrows * k; + MultiSortSupport mss; + SortItem *items; + Datum *values; + bool *isnull; + int *attnums; + + /* counters valid within a group */ + int group_size = 0; + int n_violations = 0; + + /* total number of rows supporting (consistent with) the dependency */ + int n_supporting_rows = 0; + + /* Make sure we have at least two input attributes. */ + Assert(k >= 2); + + /* sort info for all attributes columns */ + mss = multi_sort_init(k); + + /* data for the sort */ + items = (SortItem *) palloc(numrows * sizeof(SortItem)); + values = (Datum *) palloc(sizeof(Datum) * nvalues); + isnull = (bool *) palloc(sizeof(bool) * nvalues); + + /* fix the pointers to values/isnull */ + for (i = 0; i < numrows; i++) + { + items[i].values = &values[i * k]; + items[i].isnull = &isnull[i * k]; + } + + /* + * Transform the bms into an array, to make accessing i-th member easier. + */ + attnums = (int *) palloc(sizeof(int) * bms_num_members(attrs)); + i = 0; + j = -1; + while ((j = bms_next_member(attrs, j)) >= 0) + attnums[i++] = j; + + /* + * Verify the dependency (a,b,...)->z, using a rather simple algorithm: + * + * (a) sort the data lexicographically + * + * (b) split the data into groups by first (k-1) columns + * + * (c) for each group count different values in the last column + */ + + /* prepare the sort function for the first dimension, and SortItem array */ + for (i = 0; i < k; i++) + { + VacAttrStats *colstat = stats[dependency[i]]; + TypeCacheEntry *type; + + type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) /* shouldn't happen */ + elog(ERROR, "cache lookup failed for ordering operator for type %u", + colstat->attrtypid); + + /* prepare the sort function for this dimension */ + multi_sort_add_dimension(mss, i, type->lt_opr); + + /* accumulate all the data for both columns into an array and sort it */ + for (j = 0; j < numrows; j++) + { + items[j].values[i] = + heap_getattr(rows[j], attnums[dependency[i]], + stats[i]->tupDesc, &items[j].isnull[i]); + } + } + + /* sort the items so that we can detect the groups */ + qsort_arg((void *) items, numrows, sizeof(SortItem), + multi_sort_compare, mss); + + /* + * Walk through the sorted array, split it into rows according to the + * first (k-1) columns. If there's a single value in the last column, we + * count the group as 'supporting' the functional dependency. Otherwise we + * count it as contradicting. + * + * We also require a group to have a minimum number of rows to be + * considered useful for supporting the dependency. Contradicting groups + * may be of any size, though. + * + * XXX The minimum size requirement makes it impossible to identify case + * when both columns are unique (or nearly unique), and therefore + * trivially functionally dependent. + */ + + /* start with the first row forming a group */ + group_size = 1; + + /* loop 1 beyond the end of the array so that we count the final group */ + for (i = 1; i <= numrows; i++) + { + /* + * Check if the group ended, which may be either because we processed + * all the items (i==numrows), or because the i-th item is not equal + * to the preceding one. + */ + if (i == numrows || + multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0) + { + /* + * If no violations were found in the group then track the rows of + * the group as supporting the functional dependency. + */ + if (n_violations == 0) + n_supporting_rows += group_size; + + /* Reset counters for the new group */ + n_violations = 0; + group_size = 1; + continue; + } + /* first columns match, but the last one does not (so contradicting) */ + else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0) + n_violations++; + + group_size++; + } + + pfree(items); + pfree(values); + pfree(isnull); + pfree(mss); + + /* Compute the 'degree of validity' as (supporting/total). */ + return (n_supporting_rows * 1.0 / numrows); +} + +/* + * detects functional dependencies between groups of columns + * + * Generates all possible subsets of columns (variations) and computes + * the degree of validity for each one. For example with a statistic on + * three columns (a,b,c) there are 9 possible dependencies + * + * two columns three columns + * ----------- ------------- + * (a) -> b (a,b) -> c + * (a) -> c (a,c) -> b + * (b) -> a (b,c) -> a + * (b) -> c + * (c) -> a + * (c) -> b + */ +MVDependencies * +statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, + VacAttrStats **stats) +{ + int i, + j, + k; + int numattrs; + int *attnums; + + /* result */ + MVDependencies *dependencies = NULL; + + numattrs = bms_num_members(attrs); + + /* + * Transform the bms into an array, to make accessing i-th member easier. + */ + attnums = palloc(sizeof(int) * bms_num_members(attrs)); + i = 0; + j = -1; + while ((j = bms_next_member(attrs, j)) >= 0) + attnums[i++] = j; + + Assert(numattrs >= 2); + + /* + * We'll try build functional dependencies starting from the smallest ones + * covering just 2 columns, to the largest ones, covering all columns + * included in the statistics. We start from the smallest ones because we + * want to be able to skip already implied ones. + */ + for (k = 2; k <= numattrs; k++) + { + AttrNumber *dependency; /* array with k elements */ + + /* prepare a DependencyGenerator of variation */ + DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k); + + /* generate all possible variations of k values (out of n) */ + while ((dependency = DependencyGenerator_next(DependencyGenerator))) + { + double degree; + MVDependency *d; + + /* compute how valid the dependency seems */ + degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); + + /* + * if the dependency seems entirely invalid, don't store it it + */ + if (degree == 0.0) + continue; + + d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) + + k * sizeof(AttrNumber)); + + /* copy the dependency (and keep the indexes into stakeys) */ + d->degree = degree; + d->nattributes = k; + for (i = 0; i < k; i++) + d->attributes[i] = attnums[dependency[i]]; + + /* initialize the list of dependencies */ + if (dependencies == NULL) + { + dependencies + = (MVDependencies *) palloc0(sizeof(MVDependencies)); + + dependencies->magic = STATS_DEPS_MAGIC; + dependencies->type = STATS_DEPS_TYPE_BASIC; + dependencies->ndeps = 0; + } + + dependencies->ndeps++; + dependencies = (MVDependencies *) repalloc(dependencies, + offsetof(MVDependencies, deps) + + dependencies->ndeps * sizeof(MVDependency)); + + dependencies->deps[dependencies->ndeps - 1] = d; + } + + /* + * we're done with variations of k elements, so free the + * DependencyGenerator + */ + DependencyGenerator_free(DependencyGenerator); + } + + return dependencies; +} + + +/* + * Serialize list of dependencies into a bytea value. + */ +bytea * +statext_dependencies_serialize(MVDependencies * dependencies) +{ + int i; + bytea *output; + char *tmp; + Size len; + + /* we need to store ndeps, with a number of attributes for each one */ + len = VARHDRSZ + SizeOfDependencies + + dependencies->ndeps * SizeOfDependency; + + /* and also include space for the actual attribute numbers and degrees */ + for (i = 0; i < dependencies->ndeps; i++) + len += (sizeof(AttrNumber) * dependencies->deps[i]->nattributes); + + output = (bytea *) palloc0(len); + SET_VARSIZE(output, len); + + tmp = VARDATA(output); + + /* Store the base struct values (magic, type, ndeps) */ + memcpy(tmp, &dependencies->magic, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(tmp, &dependencies->type, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(tmp, &dependencies->ndeps, sizeof(uint32)); + tmp += sizeof(uint32); + + /* store number of attributes and attribute numbers for each dependency */ + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency *d = dependencies->deps[i]; + + memcpy(tmp, d, SizeOfDependency); + tmp += SizeOfDependency; + + memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes); + tmp += sizeof(AttrNumber) * d->nattributes; + + Assert(tmp <= ((char *) output + len)); + } + + return output; +} + +/* + * Reads serialized dependencies into MVDependencies structure. + */ +MVDependencies * +statext_dependencies_deserialize(bytea *data) +{ + int i; + Size min_expected_size; + MVDependencies *dependencies; + char *tmp; + + if (data == NULL) + return NULL; + + if (VARSIZE_ANY_EXHDR(data) < SizeOfDependencies) + elog(ERROR, "invalid MVDependencies size %ld (expected at least %ld)", + VARSIZE_ANY_EXHDR(data), SizeOfDependencies); + + /* read the MVDependencies header */ + dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); + + /* initialize pointer to the data part (skip the varlena header) */ + tmp = VARDATA_ANY(data); + + /* read the header fields and perform basic sanity checks */ + memcpy(&dependencies->magic, tmp, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(&dependencies->type, tmp, sizeof(uint32)); + tmp += sizeof(uint32); + memcpy(&dependencies->ndeps, tmp, sizeof(uint32)); + tmp += sizeof(uint32); + + if (dependencies->magic != STATS_DEPS_MAGIC) + elog(ERROR, "invalid dependency magic %d (expected %d)", + dependencies->magic, STATS_DEPS_MAGIC); + + if (dependencies->type != STATS_DEPS_TYPE_BASIC) + elog(ERROR, "invalid dependency type %d (expected %d)", + dependencies->type, STATS_DEPS_TYPE_BASIC); + + if (dependencies->ndeps == 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid zero-length item array in MVDependencies"))); + + /* what minimum bytea size do we expect for those parameters */ + min_expected_size = SizeOfDependencies + + dependencies->ndeps * (SizeOfDependency + + sizeof(AttrNumber) * 2); + + if (VARSIZE_ANY_EXHDR(data) < min_expected_size) + elog(ERROR, "invalid dependencies size %ld (expected at least %ld)", + VARSIZE_ANY_EXHDR(data), min_expected_size); + + /* allocate space for the MCV items */ + dependencies = repalloc(dependencies, offsetof(MVDependencies, deps) + + (dependencies->ndeps * sizeof(MVDependency *))); + + for (i = 0; i < dependencies->ndeps; i++) + { + double degree; + AttrNumber k; + MVDependency *d; + + /* degree of validity */ + memcpy(°ree, tmp, sizeof(double)); + tmp += sizeof(double); + + /* number of attributes */ + memcpy(&k, tmp, sizeof(AttrNumber)); + tmp += sizeof(AttrNumber); + + /* is the number of attributes valid? */ + Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS)); + + /* now that we know the number of attributes, allocate the dependency */ + d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) + + (k * sizeof(AttrNumber))); + + d->degree = degree; + d->nattributes = k; + + /* copy attribute numbers */ + memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes); + tmp += sizeof(AttrNumber) * d->nattributes; + + dependencies->deps[i] = d; + + /* still within the bytea */ + Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); + } + + /* we should have consumed the whole bytea exactly */ + Assert(tmp == ((char *) data + VARSIZE_ANY(data))); + + return dependencies; +} + +/* + * dependency_is_fully_matched + * checks that a functional dependency is fully matched given clauses on + * attributes (assuming the clauses are suitable equality clauses) + */ +static bool +dependency_is_fully_matched(MVDependency * dependency, Bitmapset *attnums) +{ + int j; + + /* + * Check that the dependency actually is fully covered by clauses. We have + * to translate all attribute numbers, as those are referenced + */ + for (j = 0; j < dependency->nattributes; j++) + { + int attnum = dependency->attributes[j]; + + if (!bms_is_member(attnum, attnums)) + return false; + } + + return true; +} + +/* + * dependency_implies_attribute + * check that the attnum matches is implied by the functional dependency + */ +static bool +dependency_implies_attribute(MVDependency * dependency, AttrNumber attnum) +{ + if (attnum == dependency->attributes[dependency->nattributes - 1]) + return true; + + return false; +} + +/* + * staext_dependencies_load + * Load the functional dependencies for the indicated pg_statistic_ext tuple + */ +MVDependencies * +staext_dependencies_load(Oid mvoid) +{ + bool isnull; + Datum deps; + + /* + * Prepare to scan pg_statistic_ext for entries having indrelid = this + * rel. + */ + HeapTuple htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); + + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for extended statistics %u", mvoid); + + deps = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stadependencies, &isnull); + + Assert(!isnull); + + ReleaseSysCache(htup); + + return statext_dependencies_deserialize(DatumGetByteaP(deps)); +} + +/* + * pg_dependencies_in - input routine for type pg_dependencies. + * + * pg_dependencies is real enough to be a table column, but it has no operations + * of its own, and disallows input too + */ +Datum +pg_dependencies_in(PG_FUNCTION_ARGS) +{ + /* + * pg_node_list stores the data in binary form and parsing text input is + * not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_dependencies"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_dependencies - output routine for type pg_dependencies. + */ +Datum +pg_dependencies_out(PG_FUNCTION_ARGS) +{ + int i, + j; + StringInfoData str; + + bytea *data = PG_GETARG_BYTEA_PP(0); + + MVDependencies *dependencies = statext_dependencies_deserialize(data); + + initStringInfo(&str); + appendStringInfoChar(&str, '['); + + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency *dependency = dependencies->deps[i]; + + if (i > 0) + appendStringInfoString(&str, ", "); + + appendStringInfoChar(&str, '{'); + for (j = 0; j < dependency->nattributes; j++) + { + if (j == dependency->nattributes - 1) + appendStringInfoString(&str, " => "); + else if (j > 0) + appendStringInfoString(&str, ", "); + + appendStringInfo(&str, "%d", dependency->attributes[j]); + } + appendStringInfo(&str, " : %f", dependency->degree); + appendStringInfoChar(&str, '}'); + } + + appendStringInfoChar(&str, ']'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * pg_dependencies_recv - binary input routine for type pg_dependencies. + */ +Datum +pg_dependencies_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_dependencies"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_dependencies_send - binary output routine for type pg_dependencies. + * + * Functional dependencies are serialized in a bytea value (although the type + * is named differently), so let's just send that. + */ +Datum +pg_dependencies_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} + +/* + * dependency_is_compatible_clause + * Determines if the clause is compatible with functional dependencies + * + * Only OpExprs with two arguments using an equality operator are supported. + * When returning True attnum is set to the attribute number of the Var within + * the supported clause. + * + * Currently we only support Var = Const, or Const = Var. It may be possible + * to expand on this later. + */ +static bool +dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) +{ + RestrictInfo *rinfo = (RestrictInfo *) clause; + + if (!IsA(rinfo, RestrictInfo)) + return false; + + /* Pseudoconstants are not really interesting here. */ + if (rinfo->pseudoconstant) + return false; + + /* clauses referencing multiple varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return false; + + if (is_opclause(rinfo->clause)) + { + OpExpr *expr = (OpExpr *) rinfo->clause; + Var *var; + bool varonleft = true; + bool ok; + + /* Only expressions with two arguments are considered compatible. */ + if (list_length(expr->args) != 2) + return false; + + /* see if it actually has the right */ + ok = (NumRelids((Node *) expr) == 1) && + (is_pseudo_constant_clause(lsecond(expr->args)) || + (varonleft = false, + is_pseudo_constant_clause(linitial(expr->args)))); + + /* unsupported structure (two variables or so) */ + if (!ok) + return false; + + /* + * If it's not "=" operator, just ignore the clause, as it's not + * compatible with functional dependencies. + * + * This uses the function for estimating selectivity, not the operator + * directly (a bit awkward, but well ...). + */ + if (get_oprrest(expr->opno) != F_EQSEL) + return false; + + var = (varonleft) ? linitial(expr->args) : lsecond(expr->args); + + /* We only support plain Vars for now */ + if (!IsA(var, Var)) + return false; + + /* Ensure var is from the correct relation */ + if (var->varno != relid) + return false; + + /* we also better ensure the Var is from the current level */ + if (var->varlevelsup > 0) + return false; + + /* Also skip system attributes (we don't allow stats on those). */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return false; + + *attnum = var->varattno; + return true; + } + + return false; +} + +/* + * find_strongest_dependency + * find the strongest dependency on the attributes + * + * When applying functional dependencies, we start with the strongest + * dependencies. That is, we select the dependency that: + * + * (a) has all attributes covered by equality clauses + * + * (b) has the most attributes + * + * (c) has the highest degree of validity + * + * This guarantees that we eliminate the most redundant conditions first + * (see the comment in dependencies_clauselist_selectivity). + */ +static MVDependency * +find_strongest_dependency(StatisticExtInfo * stats, MVDependencies * dependencies, + Bitmapset *attnums) +{ + int i; + MVDependency *strongest = NULL; + + /* number of attnums in clauses */ + int nattnums = bms_num_members(attnums); + + /* + * Iterate over the MVDependency items and find the strongest one from the + * fully-matched dependencies. We do the cheap checks first, before + * matching it against the attnums. + */ + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency *dependency = dependencies->deps[i]; + + /* + * Skip dependencies referencing more attributes than available + * clauses, as those can't be fully matched. + */ + if (dependency->nattributes > nattnums) + continue; + + if (strongest) + { + /* skip dependencies on fewer attributes than the strongest. */ + if (dependency->nattributes < strongest->nattributes) + continue; + + /* also skip weaker dependencies when attribute count matches */ + if (strongest->nattributes == dependency->nattributes && + strongest->degree > dependency->degree) + continue; + } + + /* + * this dependency is stronger, but we must still check that it's + * fully matched to these attnums. We perform this check last as it's + * slightly more expensive than the previous checks. + */ + if (dependency_is_fully_matched(dependency, attnums)) + strongest = dependency; /* save new best match */ + } + + return strongest; +} + +/* + * dependencies_clauselist_selectivity + * Attempt to estimate selectivity using functional dependency statistics + * + * Given equality clauses on attributes (a,b) we find the strongest dependency + * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected + * dependency, we then combine the per-clause selectivities using the formula + * + * P(a,b) = P(a) * [f + (1-f)*P(b)] + * + * where 'f' is the degree of the dependency. + * + * With clauses on more than two attributes, the dependencies are applied + * recursively, starting with the widest/strongest dependencies. For example + * P(a,b,c) is first split like this: + * + * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] + * + * assuming (a,b=>c) is the strongest dependency. + */ +Selectivity +dependencies_clauselist_selectivity(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses) +{ + Selectivity s1 = 1.0; + ListCell *l; + Bitmapset *clauses_attnums = NULL; + StatisticExtInfo *stat; + MVDependencies *dependencies; + AttrNumber *list_attnums; + int listidx; + + + /* check if there's any stats that might be useful for us. */ + if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) + return 1.0; + + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * + list_length(clauses)); + + /* + * Pre-process the clauses list to extract the attnums seen in each item. + * We need to determine if there's any clauses which will be useful for + * dependency selectivity estimations. Along the way we'll record all of + * the attnums for each clause in a list which we'll reference later so we + * don't need to repeat the same work again. We'll also keep track of all + * attnums seen. + */ + listidx = 0; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + AttrNumber attnum; + + if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + else + list_attnums[listidx] = InvalidAttrNumber; + + listidx++; + } + + /* + * If there's not at least two distinct attnums then reject the whole list + * of clauses. We must return 1.0 so the calling function's selectivity is + * unaffected. + */ + if (bms_num_members(clauses_attnums) < 2) + { + pfree(list_attnums); + return 1.0; + } + + /* find the best suited statistics for these attnums */ + stat = choose_best_statistics(rel->statlist, clauses_attnums, + STATS_EXT_DEPENDENCIES); + + /* if no matching stats could be found then we've nothing to do */ + if (!stat) + { + pfree(list_attnums); + return 1.0; + } + + /* load the dependency items stored in the statistics */ + dependencies = staext_dependencies_load(stat->statOid); + + /* + * Apply the dependencies recursively, starting with the widest/strongest + * ones, and proceeding to the smaller/weaker ones. At the end of each + * round we factor in the selectivity of clauses on the implied attribute, + * and remove the clauses from the list. + */ + while (true) + { + Selectivity s2 = 1.0; + MVDependency *dependency; + + /* the widest/strongest dependency, fully matched by clauses */ + dependency = find_strongest_dependency(stat, dependencies, + clauses_attnums); + + /* if no suitable dependency was found, we're done */ + if (!dependency) + break; + + /* + * We found an applicable dependency, so find all the clauses on the + * implied attribute - with dependency (a,b => c) we look for clauses + * on 'c'. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause; + + listidx++; + + /* + * Skip incompatible clauses, and ones we've already estimated on. + */ + if (list_attnums[listidx] == InvalidAttrNumber || + bms_is_member(listidx, *estimatedclauses)) + continue; + + /* + * Technically we could find more than one clause for a given + * attnum. Since these clauses must be equality clauses, we choose + * to only take the selectivity estimate from the final clause in + * the list for this attnum. If the attnum happens to be compared + * to a different Const in another clause then no rows will match + * anyway. If it happens to be compared to the same Const, then + * ignoring the additional clause is just the thing to do. + */ + if (dependency_implies_attribute(dependency, + list_attnums[listidx])) + { + clause = (Node *) lfirst(l); + + s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo, + NULL); /* don't try to use ext stats */ + + /* mark this one as done, so we don't touch it again. */ + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + + /* + * Mark that we've got and used the dependency on this clause. + * We'll want to ignore this when looking for the next + * strongest dependency above. + */ + clauses_attnums = bms_del_member(clauses_attnums, + list_attnums[listidx]); + } + } + + /* + * Now factor in the selectivity for all the "implied" clauses into + * the final one, using this formula: + * + * P(a,b) = P(a) * (f + (1-f) * P(b)) + * + * where 'f' is the degree of validity of the dependency. + */ + s1 *= (dependency->degree + (1 - dependency->degree) * s2); + } + + pfree(dependencies); + pfree(list_attnums); + + return s1; +} |