diff options
Diffstat (limited to 'src/backend')
29 files changed, 2239 insertions, 27 deletions
diff --git a/src/backend/Makefile b/src/backend/Makefile index fffb0d95bad..bce9d2c3ebb 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -19,7 +19,7 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = access bootstrap catalog parser commands executor foreign lib libpq \ main nodes optimizer port postmaster regex replication rewrite \ - storage tcop tsearch utils $(top_builddir)/src/timezone + statistics storage tcop tsearch utils $(top_builddir)/src/timezone include $(srcdir)/common.mk diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 159cab5c18c..fd33426bad1 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -33,6 +33,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \ pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \ pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \ + pg_statistic_ext.h \ pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \ pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \ pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \ diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index be86d76a59e..d01930f4a80 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -48,6 +48,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" @@ -3302,6 +3303,8 @@ static const char *const no_priv_msg[MAX_ACL_KIND] = gettext_noop("permission denied for collation %s"), /* ACL_KIND_CONVERSION */ gettext_noop("permission denied for conversion %s"), + /* ACL_KIND_STATISTICS */ + gettext_noop("permission denied for statistics %s"), /* ACL_KIND_TABLESPACE */ gettext_noop("permission denied 
for tablespace %s"), /* ACL_KIND_TSDICTIONARY */ @@ -3352,6 +3355,8 @@ static const char *const not_owner_msg[MAX_ACL_KIND] = gettext_noop("must be owner of collation %s"), /* ACL_KIND_CONVERSION */ gettext_noop("must be owner of conversion %s"), + /* ACL_KIND_STATISTICS */ + gettext_noop("must be owner of statistics %s"), /* ACL_KIND_TABLESPACE */ gettext_noop("must be owner of tablespace %s"), /* ACL_KIND_TSDICTIONARY */ @@ -3467,6 +3472,10 @@ pg_aclmask(AclObjectKind objkind, Oid table_oid, AttrNumber attnum, Oid roleid, mask, how, NULL); case ACL_KIND_NAMESPACE: return pg_namespace_aclmask(table_oid, roleid, mask, how); + case ACL_KIND_STATISTICS: + elog(ERROR, "grantable rights not supported for statistics"); + /* not reached, but keep compiler quiet */ + return ACL_NO_RIGHTS; case ACL_KIND_TABLESPACE: return pg_tablespace_aclmask(table_oid, roleid, mask, how); case ACL_KIND_FDW: @@ -5104,6 +5113,32 @@ pg_subscription_ownercheck(Oid sub_oid, Oid roleid) } /* + * Ownership check for a extended statistics (specified by OID). + */ +bool +pg_statistics_ownercheck(Oid stat_oid, Oid roleid) +{ + HeapTuple tuple; + Oid ownerId; + + /* Superusers bypass all permission checking. 
*/ + if (superuser_arg(roleid)) + return true; + + tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stat_oid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("statistics with OID %u do not exist", stat_oid))); + + ownerId = ((Form_pg_statistic_ext) GETSTRUCT(tuple))->staowner; + + ReleaseSysCache(tuple); + + return has_privs_of_role(roleid, ownerId); +} + +/* * Check whether specified role has CREATEROLE privilege (or is a superuser) * * Note: roles do not have owners per se; instead we use this test in diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index fc088b21658..ee27cae7df7 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -51,6 +51,7 @@ #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" #include "catalog/pg_rewrite.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" @@ -154,6 +155,7 @@ static const Oid object_classes[] = { RewriteRelationId, /* OCLASS_REWRITE */ TriggerRelationId, /* OCLASS_TRIGGER */ NamespaceRelationId, /* OCLASS_SCHEMA */ + StatisticExtRelationId, /* OCLASS_STATISTIC_EXT */ TSParserRelationId, /* OCLASS_TSPARSER */ TSDictionaryRelationId, /* OCLASS_TSDICT */ TSTemplateRelationId, /* OCLASS_TSTEMPLATE */ @@ -1263,6 +1265,10 @@ doDeletion(const ObjectAddress *object, int flags) DropTransformById(object->objectId); break; + case OCLASS_STATISTIC_EXT: + RemoveStatisticsById(object->objectId); + break; + default: elog(ERROR, "unrecognized object class: %u", object->classId); @@ -2377,6 +2383,9 @@ getObjectClass(const ObjectAddress *object) case NamespaceRelationId: return OCLASS_SCHEMA; + case StatisticExtRelationId: + return OCLASS_STATISTIC_EXT; + case TSParserRelationId: return OCLASS_TSPARSER; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index d49dcdc015d..eee5e2f6caf 
100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -52,6 +52,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_partitioned_table.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription_rel.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" @@ -1609,7 +1610,10 @@ RemoveAttributeById(Oid relid, AttrNumber attnum) heap_close(attr_rel, RowExclusiveLock); if (attnum > 0) + { RemoveStatistics(relid, attnum); + RemoveStatisticsExt(relid, attnum); + } relation_close(rel, NoLock); } @@ -1860,6 +1864,7 @@ heap_drop_with_catalog(Oid relid) * delete statistics */ RemoveStatistics(relid, 0); + RemoveStatisticsExt(relid, 0); /* * delete attribute tuples @@ -2772,6 +2777,75 @@ RemoveStatistics(Oid relid, AttrNumber attnum) /* + * RemoveStatisticsExt --- remove entries in pg_statistic_ext for a relation + * + * If attnum is zero, remove all entries for rel; else remove only the + * one(s) involving that column. + */ +void +RemoveStatisticsExt(Oid relid, AttrNumber attnum) +{ + Relation pgstatisticext; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + + /* + * Scan pg_statistic_ext to delete relevant tuples + */ + pgstatisticext = heap_open(StatisticExtRelationId, RowExclusiveLock); + + ScanKeyInit(&key, + Anum_pg_statistic_ext_starelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pgstatisticext, + StatisticExtRelidIndexId, + true, NULL, 1, &key); + + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + bool delete = false; + + if (attnum == 0) + delete = true; + else if (attnum != 0) + { + Form_pg_statistic_ext staForm; + int i; + + /* + * Decode the stakeys array and delete any stats that involve the + * specified column. 
+ */ + staForm = (Form_pg_statistic_ext) GETSTRUCT(tuple); + for (i = 0; i < staForm->stakeys.dim1; i++) + { + if (staForm->stakeys.values[i] == attnum) + { + delete = true; + break; + } + } + } + + if (delete) + { + CatalogTupleDelete(pgstatisticext, &tuple->t_self); + deleteDependencyRecordsFor(StatisticExtRelationId, + HeapTupleGetOid(tuple), + false); + } + } + + systable_endscan(scan); + + heap_close(pgstatisticext, RowExclusiveLock); +} + + +/* * RelationTruncateIndexes - truncate all indexes associated * with the heap relation to zero tuples. * diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index a38da3047ff..e521bd908a2 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -2086,6 +2086,62 @@ ConversionIsVisible(Oid conid) } /* + * get_statistics_oid - find a statistics by possibly qualified name + * + * If not found, returns InvalidOid if missing_ok, else throws error + */ +Oid +get_statistics_oid(List *names, bool missing_ok) +{ + char *schemaname; + char *stats_name; + Oid namespaceId; + Oid stats_oid = InvalidOid; + ListCell *l; + + /* deconstruct the name list */ + DeconstructQualifiedName(names, &schemaname, &stats_name); + + if (schemaname) + { + /* use exact schema given */ + namespaceId = LookupExplicitNamespace(schemaname, missing_ok); + if (missing_ok && !OidIsValid(namespaceId)) + stats_oid = InvalidOid; + else + stats_oid = GetSysCacheOid2(STATEXTNAMENSP, + PointerGetDatum(stats_name), + ObjectIdGetDatum(namespaceId)); + } + else + { + /* search for it in search path */ + recomputeNamespacePath(); + + foreach(l, activeSearchPath) + { + namespaceId = lfirst_oid(l); + + if (namespaceId == myTempNamespace) + continue; /* do not look in temp namespace */ + stats_oid = GetSysCacheOid2(STATEXTNAMENSP, + PointerGetDatum(stats_name), + ObjectIdGetDatum(namespaceId)); + if (OidIsValid(stats_oid)) + break; + } + } + + if (!OidIsValid(stats_oid) && !missing_ok) + ereport(ERROR, + 
(errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("statistics \"%s\" do not exist", + NameListToString(names)))); + + return stats_oid; +} + +/* * get_ts_parser_oid - find a TS parser by possibly qualified name * * If not found, returns InvalidOid if missing_ok, else throws error diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 61a831b4036..2948d64fa73 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -48,6 +48,7 @@ #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" #include "catalog/pg_rewrite.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" @@ -478,6 +479,18 @@ static const ObjectPropertyType ObjectProperty[] = InvalidAttrNumber, ACL_KIND_SUBSCRIPTION, true + }, + { + StatisticExtRelationId, + StatisticExtOidIndexId, + STATEXTOID, + STATEXTNAMENSP, + Anum_pg_statistic_ext_staname, + Anum_pg_statistic_ext_stanamespace, + Anum_pg_statistic_ext_staowner, + InvalidAttrNumber, /* no ACL (same as relation) */ + ACL_KIND_STATISTICS, + true } }; @@ -696,6 +709,10 @@ static const struct object_type_map /* OCLASS_TRANSFORM */ { "transform", OBJECT_TRANSFORM + }, + /* OBJECT_STATISTIC_EXT */ + { + "statistics", OBJECT_STATISTIC_EXT } }; @@ -974,6 +991,12 @@ get_object_address(ObjectType objtype, Node *object, address = get_object_address_defacl(castNode(List, object), missing_ok); break; + case OBJECT_STATISTIC_EXT: + address.classId = StatisticExtRelationId; + address.objectId = get_statistics_oid(castNode(List, object), + missing_ok); + address.objectSubId = 0; + break; default: elog(ERROR, "unrecognized objtype: %d", (int) objtype); /* placate compiler, in case it thinks elog might return */ @@ -2083,6 +2106,7 @@ pg_get_object_address(PG_FUNCTION_ARGS) case OBJECT_ATTRIBUTE: case OBJECT_COLLATION: case OBJECT_CONVERSION: + case OBJECT_STATISTIC_EXT: case 
OBJECT_TSPARSER: case OBJECT_TSDICTIONARY: case OBJECT_TSTEMPLATE: @@ -2370,6 +2394,10 @@ check_object_ownership(Oid roleid, ObjectType objtype, ObjectAddress address, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser"))); break; + case OBJECT_STATISTIC_EXT: + if (!pg_statistics_ownercheck(address.objectId, roleid)) + aclcheck_error_type(ACLCHECK_NOT_OWNER, address.objectId); + break; default: elog(ERROR, "unrecognized object type: %d", (int) objtype); @@ -3857,6 +3885,10 @@ getObjectTypeDescription(const ObjectAddress *object) appendStringInfoString(&buffer, "subscription"); break; + case OCLASS_STATISTIC_EXT: + appendStringInfoString(&buffer, "statistics"); + break; + default: appendStringInfo(&buffer, "unrecognized %u", object->classId); break; @@ -4880,6 +4912,29 @@ getObjectIdentityParts(const ObjectAddress *object, break; } + case OCLASS_STATISTIC_EXT: + { + HeapTuple tup; + Form_pg_statistic_ext formStatistic; + char *schema; + + tup = SearchSysCache1(STATEXTOID, + ObjectIdGetDatum(object->objectId)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "cache lookup failed for statistics %u", + object->objectId); + formStatistic = (Form_pg_statistic_ext) GETSTRUCT(tup); + schema = get_namespace_name_or_temp(formStatistic->stanamespace); + appendStringInfoString(&buffer, + quote_qualified_identifier(schema, + NameStr(formStatistic->staname))); + if (objname) + *objname = list_make2(schema, + pstrdup(NameStr(formStatistic->staname))); + ReleaseSysCache(tup); + } + break; + default: appendStringInfo(&buffer, "unrecognized object %u %u %d", object->classId, diff --git a/src/backend/catalog/pg_shdepend.c b/src/backend/catalog/pg_shdepend.c index 8d946ff44cc..d28a8afb47d 100644 --- a/src/backend/catalog/pg_shdepend.c +++ b/src/backend/catalog/pg_shdepend.c @@ -39,6 +39,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" #include "catalog/pg_shdepend.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_subscription.h" #include 
"catalog/pg_tablespace.h" #include "catalog/pg_ts_config.h" @@ -1415,6 +1416,7 @@ shdepReassignOwned(List *roleids, Oid newrole) case OperatorFamilyRelationId: case OperatorClassRelationId: case ExtensionRelationId: + case StatisticExtRelationId: case TableSpaceRelationId: case DatabaseRelationId: case TSConfigRelationId: diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 80d14296de2..b41882aa521 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -186,6 +186,16 @@ CREATE OR REPLACE VIEW pg_sequences AS WHERE NOT pg_is_other_temp_schema(N.oid) AND relkind = 'S'; +CREATE VIEW pg_stats_ext AS + SELECT + N.nspname AS schemaname, + C.relname AS tablename, + S.staname AS staname, + S.stakeys AS attnums, + length(s.standistinct) AS ndistbytes + FROM (pg_statistic_ext S JOIN pg_class C ON (C.oid = S.starelid)) + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace); + CREATE VIEW pg_stats WITH (security_barrier) AS SELECT nspname AS schemaname, diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index e0fab38cbe1..4a6c99e0908 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -18,8 +18,8 @@ OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \ event_trigger.o explain.o extension.o foreigncmds.o functioncmds.o \ indexcmds.o lockcmds.o matview.o operatorcmds.o opclasscmds.o \ policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \ - schemacmds.o seclabel.o sequence.o subscriptioncmds.o tablecmds.o \ - tablespace.o trigger.o tsearchcmds.o typecmds.o user.o vacuum.o \ - vacuumlazy.o variable.o view.o + schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \ + tablecmds.o tablespace.o trigger.o tsearchcmds.o typecmds.o user.o \ + vacuum.o vacuumlazy.o variable.o view.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/commands/alter.c b/src/backend/commands/alter.c index 
cf1391c2e6b..2c6435b7598 100644 --- a/src/backend/commands/alter.c +++ b/src/backend/commands/alter.c @@ -33,6 +33,7 @@ #include "catalog/pg_opfamily.h" #include "catalog/pg_proc.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" #include "catalog/pg_ts_parser.h" @@ -120,6 +121,10 @@ report_namespace_conflict(Oid classId, const char *name, Oid nspOid) Assert(OidIsValid(nspOid)); msgfmt = gettext_noop("conversion \"%s\" already exists in schema \"%s\""); break; + case StatisticExtRelationId: + Assert(OidIsValid(nspOid)); + msgfmt = gettext_noop("statistics \"%s\" already exists in schema \"%s\""); + break; case TSParserRelationId: Assert(OidIsValid(nspOid)); msgfmt = gettext_noop("text search parser \"%s\" already exists in schema \"%s\""); @@ -373,6 +378,7 @@ ExecRenameStmt(RenameStmt *stmt) case OBJECT_OPCLASS: case OBJECT_OPFAMILY: case OBJECT_LANGUAGE: + case OBJECT_STATISTIC_EXT: case OBJECT_TSCONFIGURATION: case OBJECT_TSDICTIONARY: case OBJECT_TSPARSER: @@ -489,6 +495,7 @@ ExecAlterObjectSchemaStmt(AlterObjectSchemaStmt *stmt, case OBJECT_OPERATOR: case OBJECT_OPCLASS: case OBJECT_OPFAMILY: + case OBJECT_STATISTIC_EXT: case OBJECT_TSCONFIGURATION: case OBJECT_TSDICTIONARY: case OBJECT_TSPARSER: @@ -803,6 +810,7 @@ ExecAlterOwnerStmt(AlterOwnerStmt *stmt) case OBJECT_OPERATOR: case OBJECT_OPCLASS: case OBJECT_OPFAMILY: + case OBJECT_STATISTIC_EXT: case OBJECT_TABLESPACE: case OBJECT_TSDICTIONARY: case OBJECT_TSCONFIGURATION: diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 055338fdff2..c5b5c54babf 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -17,6 +17,7 @@ #include <math.h> #include "access/multixact.h" +#include "access/sysattr.h" #include "access/transam.h" #include "access/tupconvert.h" #include "access/tuptoaster.h" @@ -28,6 +29,7 @@ #include "catalog/pg_collation.h" #include 
"catalog/pg_inherits_fn.h" #include "catalog/pg_namespace.h" +#include "catalog/pg_statistic_ext.h" #include "commands/dbcommands.h" #include "commands/tablecmds.h" #include "commands/vacuum.h" @@ -39,13 +41,17 @@ #include "parser/parse_relation.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" #include "utils/attoptcache.h" +#include "utils/builtins.h" #include "utils/datum.h" +#include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -566,6 +572,10 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, update_attstats(RelationGetRelid(Irel[ind]), false, thisdata->attr_cnt, thisdata->vacattrstats); } + + /* Build extended statistics (if there are any). */ + BuildRelationExtStatistics(onerel, totalrows, numrows, rows, attr_cnt, + vacattrstats); } /* @@ -1683,19 +1693,6 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) */ typedef struct { - Oid eqopr; /* '=' operator for datatype, if any */ - Oid eqfunc; /* and associated function */ - Oid ltopr; /* '<' operator for datatype, if any */ -} StdAnalyzeData; - -typedef struct -{ - Datum value; /* a data value */ - int tupno; /* position index for tuple it came from */ -} ScalarItem; - -typedef struct -{ int count; /* # of duplicates */ int first; /* values[] index of first occurrence */ } ScalarMCVItem; diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c index ab73fbf961b..cb948f0204d 100644 --- a/src/backend/commands/dropcmds.c +++ b/src/backend/commands/dropcmds.c @@ -286,6 +286,13 @@ does_not_exist_skipping(ObjectType objtype, Node *object) msg = gettext_noop("schema \"%s\" does not exist, skipping"); name = strVal((Value *) object); break; + case OBJECT_STATISTIC_EXT: + if 
(!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) + { + msg = gettext_noop("extended statistics \"%s\" do not exist, skipping"); + name = NameListToString(castNode(List, object)); + } + break; case OBJECT_TSPARSER: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 346b347ae17..7366fc74bec 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -112,6 +112,7 @@ static event_trigger_support_data event_trigger_support[] = { {"SCHEMA", true}, {"SEQUENCE", true}, {"SERVER", true}, + {"STATISTICS", true}, {"SUBSCRIPTION", true}, {"TABLE", true}, {"TABLESPACE", false}, @@ -1108,6 +1109,7 @@ EventTriggerSupportsObjectType(ObjectType obtype) case OBJECT_SCHEMA: case OBJECT_SEQUENCE: case OBJECT_SUBSCRIPTION: + case OBJECT_STATISTIC_EXT: case OBJECT_TABCONSTRAINT: case OBJECT_TABLE: case OBJECT_TRANSFORM: @@ -1173,6 +1175,7 @@ EventTriggerSupportsObjectClass(ObjectClass objclass) case OCLASS_PUBLICATION: case OCLASS_PUBLICATION_REL: case OCLASS_SUBSCRIPTION: + case OCLASS_STATISTIC_EXT: return true; } diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c new file mode 100644 index 00000000000..416309106a7 --- /dev/null +++ b/src/backend/commands/statscmds.c @@ -0,0 +1,296 @@ +/*------------------------------------------------------------------------- + * + * statscmds.c + * Commands for creating and altering extended statistics + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/commands/statscmds.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" 
+#include "catalog/pg_namespace.h" +#include "catalog/pg_statistic_ext.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "statistics/statistics.h" +#include "utils/builtins.h" +#include "utils/inval.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/typcache.h" + + +/* used for sorting the attnums in CreateStatistics */ +static int +compare_int16(const void *a, const void *b) +{ + return memcmp(a, b, sizeof(int16)); +} + +/* + * CREATE STATISTICS + */ +ObjectAddress +CreateStatistics(CreateStatsStmt *stmt) +{ + int i; + ListCell *l; + int16 attnums[STATS_MAX_DIMENSIONS]; + int numcols = 0; + ObjectAddress address = InvalidObjectAddress; + char *namestr; + NameData staname; + Oid statoid; + Oid namespaceId; + HeapTuple htup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + int2vector *stakeys; + Relation statrel; + Relation rel; + Oid relid; + ObjectAddress parentobject, + childobject; + Datum types[1]; /* only ndistinct defined now */ + int ntypes; + ArrayType *staenabled; + bool build_ndistinct; + bool requested_type = false; + + Assert(IsA(stmt, CreateStatsStmt)); + + /* resolve the pieces of the name (namespace etc.) */ + namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, &namestr); + namestrcpy(&staname, namestr); + + /* + * If if_not_exists was given and the statistics already exists, bail out. 
+ */ + if (SearchSysCacheExists2(STATEXTNAMENSP, + PointerGetDatum(&staname), + ObjectIdGetDatum(namespaceId))) + { + if (stmt->if_not_exists) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics \"%s\" already exist, skipping", + namestr))); + return InvalidObjectAddress; + } + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics \"%s\" already exist", namestr))); + } + + rel = heap_openrv(stmt->relation, AccessExclusiveLock); + relid = RelationGetRelid(rel); + + if (rel->rd_rel->relkind != RELKIND_RELATION && + rel->rd_rel->relkind != RELKIND_MATVIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" is not a table or materialized view", + RelationGetRelationName(rel)))); + + /* + * Transform column names to array of attnums. While at it, enforce some + * constraints. + */ + foreach(l, stmt->keys) + { + char *attname = strVal(lfirst(l)); + HeapTuple atttuple; + Form_pg_attribute attForm; + TypeCacheEntry *type; + + atttuple = SearchSysCacheAttName(relid, attname); + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" referenced in statistics does not exist", + attname))); + attForm = (Form_pg_attribute) GETSTRUCT(atttuple); + + /* Disallow use of system attributes in extended stats */ + if (attForm->attnum < 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("statistic creation on system columns is not supported"))); + + /* Disallow data types without a less-than operator */ + type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only scalar types can be used in extended statistics"))); + + /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ + if (numcols >= STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d keys in 
statistics", + STATS_MAX_DIMENSIONS))); + + attnums[numcols] = ((Form_pg_attribute) GETSTRUCT(atttuple))->attnum; + ReleaseSysCache(atttuple); + numcols++; + } + + /* + * Check that at least two columns were specified in the statement. The + * upper bound was already checked in the loop above. + */ + if (numcols < 2) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("statistics require at least 2 columns"))); + + /* + * Sort the attnums, which makes detecting duplicies somewhat easier, and + * it does not hurt (it does not affect the efficiency, unlike for + * indexes, for example). + */ + qsort(attnums, numcols, sizeof(int16), compare_int16); + + /* + * Look for duplicities in the list of columns. The attnums are sorted so + * just check consecutive elements. + */ + for (i = 1; i < numcols; i++) + if (attnums[i] == attnums[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("duplicate column name in statistics definition"))); + + stakeys = buildint2vector(attnums, numcols); + + /* + * Parse the statistics options. Currently only statistics types are + * recognized. + */ + build_ndistinct = false; + foreach(l, stmt->options) + { + DefElem *opt = (DefElem *) lfirst(l); + + if (strcmp(opt->defname, "ndistinct") == 0) + { + build_ndistinct = defGetBoolean(opt); + requested_type = true; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized STATISTICS option \"%s\"", + opt->defname))); + } + /* If no statistic type was specified, build them all. */ + if (!requested_type) + build_ndistinct = true; + + /* construct the char array of enabled statistic types */ + ntypes = 0; + if (build_ndistinct) + types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT); + Assert(ntypes > 0); + staenabled = construct_array(types, ntypes, CHAROID, 1, true, 'c'); + + /* + * Everything seems fine, so let's build the pg_statistic_ext tuple. 
+ */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_statistic_ext_starelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_statistic_ext_staname - 1] = NameGetDatum(&staname); + values[Anum_pg_statistic_ext_stanamespace - 1] = ObjectIdGetDatum(namespaceId); + values[Anum_pg_statistic_ext_staowner - 1] = ObjectIdGetDatum(GetUserId()); + values[Anum_pg_statistic_ext_stakeys - 1] = PointerGetDatum(stakeys); + values[Anum_pg_statistic_ext_staenabled - 1] = PointerGetDatum(staenabled); + + /* no statistics build yet */ + nulls[Anum_pg_statistic_ext_standistinct - 1] = true; + + /* insert it into pg_statistic_ext */ + statrel = heap_open(StatisticExtRelationId, RowExclusiveLock); + htup = heap_form_tuple(statrel->rd_att, values, nulls); + CatalogTupleInsert(statrel, htup); + statoid = HeapTupleGetOid(htup); + heap_freetuple(htup); + heap_close(statrel, RowExclusiveLock); + relation_close(rel, NoLock); + + /* + * Add a dependency on a table, so that stats get dropped on DROP TABLE. + */ + ObjectAddressSet(parentobject, RelationRelationId, relid); + ObjectAddressSet(childobject, StatisticExtRelationId, statoid); + recordDependencyOn(&childobject, &parentobject, DEPENDENCY_AUTO); + + /* + * Also add dependency on the schema. This is required to ensure that we + * drop the statistics on DROP SCHEMA. This is not handled automatically + * by DROP TABLE because the statistics are not an object in the table's + * schema. + */ + ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId); + recordDependencyOn(&childobject, &parentobject, DEPENDENCY_AUTO); + + ObjectAddressSet(address, StatisticExtRelationId, statoid); + + /* + * Invalidate relcache so that others see the new statistics. + */ + CacheInvalidateRelcache(rel); + + return address; +} + +/* + * Guts of statistics deletion. 
+ */ +void +RemoveStatisticsById(Oid statsOid) +{ + Relation relation; + Oid relid; + Relation rel; + HeapTuple tup; + Form_pg_statistic_ext statext; + + /* + * Delete the pg_statistic_ext tuple. + */ + relation = heap_open(StatisticExtRelationId, RowExclusiveLock); + + tup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statsOid)); + + if (!HeapTupleIsValid(tup)) /* should not happen */ + elog(ERROR, "cache lookup failed for statistics %u", statsOid); + + statext = (Form_pg_statistic_ext) GETSTRUCT(tup); + relid = statext->starelid; + + rel = heap_open(relid, AccessExclusiveLock); + + simple_heap_delete(relation, &tup->t_self); + + CacheInvalidateRelcache(rel); + + ReleaseSysCache(tup); + + heap_close(relation, RowExclusiveLock); + heap_close(rel, NoLock); +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 93bda427153..c23d5c52851 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3337,6 +3337,20 @@ _copyIndexStmt(const IndexStmt *from) return newnode; } +static CreateStatsStmt * +_copyCreateStatsStmt(const CreateStatsStmt *from) +{ + CreateStatsStmt *newnode = makeNode(CreateStatsStmt); + + COPY_NODE_FIELD(defnames); + COPY_NODE_FIELD(relation); + COPY_NODE_FIELD(keys); + COPY_NODE_FIELD(options); + COPY_SCALAR_FIELD(if_not_exists); + + return newnode; +} + static CreateFunctionStmt * _copyCreateFunctionStmt(const CreateFunctionStmt *from) { @@ -5050,6 +5064,9 @@ copyObject(const void *from) case T_IndexStmt: retval = _copyIndexStmt(from); break; + case T_CreateStatsStmt: + retval = _copyCreateStatsStmt(from); + break; case T_CreateFunctionStmt: retval = _copyCreateFunctionStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 0d12636d92c..5941b7a2bfb 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1335,6 +1335,18 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b) } static bool +_equalCreateStatsStmt(const 
CreateStatsStmt *a, const CreateStatsStmt *b) +{ + COMPARE_NODE_FIELD(defnames); + COMPARE_NODE_FIELD(relation); + COMPARE_NODE_FIELD(keys); + COMPARE_NODE_FIELD(options); + COMPARE_SCALAR_FIELD(if_not_exists); + + return true; +} + +static bool _equalCreateFunctionStmt(const CreateFunctionStmt *a, const CreateFunctionStmt *b) { COMPARE_SCALAR_FIELD(replace); @@ -3236,6 +3248,9 @@ equal(const void *a, const void *b) case T_IndexStmt: retval = _equalIndexStmt(a, b); break; + case T_CreateStatsStmt: + retval = _equalCreateStatsStmt(a, b); + break; case T_CreateFunctionStmt: retval = _equalCreateFunctionStmt(a, b); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 1b9005fa537..541af029353 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2202,6 +2202,7 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_NODE_FIELD(lateral_vars); WRITE_BITMAPSET_FIELD(lateral_referencers); WRITE_NODE_FIELD(indexlist); + WRITE_NODE_FIELD(statlist); WRITE_UINT_FIELD(pages); WRITE_FLOAT_FIELD(tuples, "%.0f"); WRITE_FLOAT_FIELD(allvisfrac, "%.6f"); @@ -2275,6 +2276,18 @@ _outForeignKeyOptInfo(StringInfo str, const ForeignKeyOptInfo *node) } static void +_outStatisticExtInfo(StringInfo str, const StatisticExtInfo *node) +{ + WRITE_NODE_TYPE("STATISTICEXTINFO"); + + /* NB: this isn't a complete set of fields */ + WRITE_OID_FIELD(statOid); + /* don't write rel, leads to infinite recursion in plan tree dump */ + WRITE_CHAR_FIELD(kind); + WRITE_BITMAPSET_FIELD(keys); +} + +static void _outEquivalenceClass(StringInfo str, const EquivalenceClass *node) { /* @@ -2578,6 +2591,18 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) } static void +_outCreateStatsStmt(StringInfo str, const CreateStatsStmt *node) +{ + WRITE_NODE_TYPE("CREATESTATSSTMT"); + + WRITE_NODE_FIELD(defnames); + WRITE_NODE_FIELD(relation); + WRITE_NODE_FIELD(keys); + WRITE_NODE_FIELD(options); + WRITE_BOOL_FIELD(if_not_exists); +} + +static void 
_outNotifyStmt(StringInfo str, const NotifyStmt *node) { WRITE_NODE_TYPE("NOTIFY"); @@ -3936,6 +3961,9 @@ outNode(StringInfo str, const void *obj) case T_PlannerParamItem: _outPlannerParamItem(str, obj); break; + case T_StatisticExtInfo: + _outStatisticExtInfo(str, obj); + break; case T_ExtensibleNode: _outExtensibleNode(str, obj); @@ -3953,6 +3981,9 @@ outNode(StringInfo str, const void *obj) case T_IndexStmt: _outIndexStmt(str, obj); break; + case T_CreateStatsStmt: + _outCreateStatsStmt(str, obj); + break; case T_NotifyStmt: _outNotifyStmt(str, obj); break; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 463f8064678..cc88dcc28e4 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -29,6 +29,7 @@ #include "catalog/heap.h" #include "catalog/partition.h" #include "catalog/pg_am.h" +#include "catalog/pg_statistic_ext.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -40,8 +41,11 @@ #include "parser/parse_relation.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" +#include "statistics/statistics.h" #include "storage/bufmgr.h" +#include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" #include "utils/rel.h" #include "utils/snapmgr.h" @@ -63,7 +67,7 @@ static List *get_relation_constraints(PlannerInfo *root, bool include_notnull); static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); - +static List *get_relation_statistics(RelOptInfo *rel, Relation relation); /* * get_relation_info - @@ -398,6 +402,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, rel->indexlist = indexinfos; + rel->statlist = get_relation_statistics(rel, relation); + /* Grab foreign-table info using the relcache, while we have it */ if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) { @@ -1251,6 +1257,65 @@ get_relation_constraints(PlannerInfo *root, return 
result; } +/* + * get_relation_statistics + * Retrieve extended statistics defined on the table. + * + * Returns a List (possibly empty) of StatisticExtInfo objects describing + * the statistics. Note that this doesn't load the actual statistics data, + * just the identifying metadata. Only stats actually built are considered. + */ +static List * +get_relation_statistics(RelOptInfo *rel, Relation relation) +{ + List *statoidlist; + List *stainfos = NIL; + ListCell *l; + + statoidlist = RelationGetStatExtList(relation); + + foreach(l, statoidlist) + { + Oid statOid = lfirst_oid(l); + Form_pg_statistic_ext staForm; + HeapTuple htup; + Bitmapset *keys = NULL; + int i; + + htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + if (!htup) + elog(ERROR, "cache lookup failed for statistics %u", statOid); + staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); + + /* + * First, build the array of columns covered. This is ultimately + * wasted if no stats are actually built, but it doesn't seem worth + * troubling over that case. 
+ */ + for (i = 0; i < staForm->stakeys.dim1; i++) + keys = bms_add_member(keys, staForm->stakeys.values[i]); + + /* add one StatisticExtInfo for each kind built */ + if (statext_is_kind_built(htup, STATS_EXT_NDISTINCT)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_NDISTINCT; + info->keys = bms_copy(keys); + + stainfos = lcons(info, stainfos); + } + + ReleaseSysCache(htup); + bms_free(keys); + } + + list_free(statoidlist); + + return stainfos; +} /* * relation_excluded_by_constraints diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 82844a0399d..bbcfc1fb4fd 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -257,7 +257,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); ConstraintsSetStmt CopyStmt CreateAsStmt CreateCastStmt CreateDomainStmt CreateExtensionStmt CreateGroupStmt CreateOpClassStmt CreateOpFamilyStmt AlterOpFamilyStmt CreatePLangStmt - CreateSchemaStmt CreateSeqStmt CreateStmt CreateTableSpaceStmt + CreateSchemaStmt CreateSeqStmt CreateStmt CreateStatsStmt CreateTableSpaceStmt CreateFdwStmt CreateForeignServerStmt CreateForeignTableStmt CreateAssertStmt CreateTransformStmt CreateTrigStmt CreateEventTrigStmt CreateUserStmt CreateUserMappingStmt CreateRoleStmt CreatePolicyStmt @@ -874,6 +874,7 @@ stmt : | CreateSeqStmt | CreateStmt | CreateSubscriptionStmt + | CreateStatsStmt | CreateTableSpaceStmt | CreateTransformStmt | CreateTrigStmt @@ -3747,6 +3748,35 @@ OptConsTableSpace: USING INDEX TABLESPACE name { $$ = $4; } ExistingIndex: USING INDEX index_name { $$ = $3; } ; +/***************************************************************************** + * + * QUERY : + * CREATE STATISTICS stats_name WITH (options) ON (columns) FROM relname + * + *****************************************************************************/ + + +CreateStatsStmt: CREATE STATISTICS any_name opt_reloptions ON '(' 
columnList ')' FROM qualified_name + { + CreateStatsStmt *n = makeNode(CreateStatsStmt); + n->defnames = $3; + n->relation = $10; + n->keys = $7; + n->options = $4; + n->if_not_exists = false; + $$ = (Node *)n; + } + | CREATE STATISTICS IF_P NOT EXISTS any_name opt_reloptions ON '(' columnList ')' FROM qualified_name + { + CreateStatsStmt *n = makeNode(CreateStatsStmt); + n->defnames = $6; + n->relation = $13; + n->keys = $10; + n->options = $7; + n->if_not_exists = true; + $$ = (Node *)n; + } + ; /***************************************************************************** * @@ -6042,6 +6072,7 @@ drop_type_any_name: | FOREIGN TABLE { $$ = OBJECT_FOREIGN_TABLE; } | COLLATION { $$ = OBJECT_COLLATION; } | CONVERSION_P { $$ = OBJECT_CONVERSION; } + | STATISTICS { $$ = OBJECT_STATISTIC_EXT; } | TEXT_P SEARCH PARSER { $$ = OBJECT_TSPARSER; } | TEXT_P SEARCH DICTIONARY { $$ = OBJECT_TSDICTIONARY; } | TEXT_P SEARCH TEMPLATE { $$ = OBJECT_TSTEMPLATE; } @@ -6119,7 +6150,7 @@ opt_restart_seqs: * EXTENSION | EVENT TRIGGER | FOREIGN DATA WRAPPER | * FOREIGN TABLE | INDEX | [PROCEDURAL] LANGUAGE | * MATERIALIZED VIEW | POLICY | ROLE | SCHEMA | SEQUENCE | - * SERVER | TABLE | TABLESPACE | + * SERVER | STATISTICS | TABLE | TABLESPACE | * TEXT SEARCH CONFIGURATION | TEXT SEARCH DICTIONARY | * TEXT SEARCH PARSER | TEXT SEARCH TEMPLATE | TYPE | * VIEW] <objname> | @@ -6288,6 +6319,7 @@ comment_type_any_name: COLUMN { $$ = OBJECT_COLUMN; } | INDEX { $$ = OBJECT_INDEX; } | SEQUENCE { $$ = OBJECT_SEQUENCE; } + | STATISTICS { $$ = OBJECT_STATISTIC_EXT; } | TABLE { $$ = OBJECT_TABLE; } | VIEW { $$ = OBJECT_VIEW; } | MATERIALIZED VIEW { $$ = OBJECT_MATVIEW; } @@ -8428,6 +8460,15 @@ RenameStmt: ALTER AGGREGATE aggregate_with_argtypes RENAME TO name n->missing_ok = false; $$ = (Node *)n; } + | ALTER STATISTICS any_name RENAME TO name + { + RenameStmt *n = makeNode(RenameStmt); + n->renameType = OBJECT_STATISTIC_EXT; + n->object = (Node *) $3; + n->newname = $6; + n->missing_ok = false; + 
$$ = (Node *)n; + } | ALTER TEXT_P SEARCH PARSER any_name RENAME TO name { RenameStmt *n = makeNode(RenameStmt); @@ -8643,6 +8684,15 @@ AlterObjectSchemaStmt: n->missing_ok = true; $$ = (Node *)n; } + | ALTER STATISTICS any_name SET SCHEMA name + { + AlterObjectSchemaStmt *n = makeNode(AlterObjectSchemaStmt); + n->objectType = OBJECT_STATISTIC_EXT; + n->object = (Node *) $3; + n->newschema = $6; + n->missing_ok = false; + $$ = (Node *)n; + } | ALTER TEXT_P SEARCH PARSER any_name SET SCHEMA name { AlterObjectSchemaStmt *n = makeNode(AlterObjectSchemaStmt); @@ -8906,6 +8956,14 @@ AlterOwnerStmt: ALTER AGGREGATE aggregate_with_argtypes OWNER TO RoleSpec n->newowner = $6; $$ = (Node *)n; } + | ALTER STATISTICS any_name OWNER TO RoleSpec + { + AlterOwnerStmt *n = makeNode(AlterOwnerStmt); + n->objectType = OBJECT_STATISTIC_EXT; + n->object = (Node *) $3; + n->newowner = $6; + $$ = (Node *)n; + } | ALTER TEXT_P SEARCH DICTIONARY any_name OWNER TO RoleSpec { AlterOwnerStmt *n = makeNode(AlterOwnerStmt); diff --git a/src/backend/statistics/Makefile b/src/backend/statistics/Makefile new file mode 100644 index 00000000000..b3615bd69ac --- /dev/null +++ b/src/backend/statistics/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for statistics +# +# IDENTIFICATION +# src/backend/statistics/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/statistics +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = extended_stats.o mvdistinct.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/statistics/README b/src/backend/statistics/README new file mode 100644 index 00000000000..beb7c2485fd --- /dev/null +++ b/src/backend/statistics/README @@ -0,0 +1,34 @@ +Extended statistics +=================== + +When estimating various quantities (e.g. 
condition selectivities) the default +approach relies on the assumption of independence. In practice that's often +not true, resulting in estimation errors. + +Extended statistics track different types of dependencies between the columns, +hopefully improving the estimates and producing better plans. + +Currently we only have one type of extended statistics - ndistinct +coefficients, and we use it to improve estimates of grouping queries. See +README.ndistinct for details. + + +Size of sample in ANALYZE +------------------------- +When performing ANALYZE, the number of rows to sample is determined as + + (300 * statistics_target) + +That works reasonably well for statistics on individual columns, but perhaps +it's not enough for extended statistics. Papers analyzing estimation errors +all use samples proportional to the table (usually finding that 1-3% of the +table is enough to build accurate stats). + +The requested accuracy (number of MCV items or histogram bins) should also +be considered when determining the sample size, and in extended statistics +those are not necessarily limited by statistics_target. + +This however merits further discussion, because collecting the sample is quite +expensive and increasing it further would make ANALYZE even more painful. +Judging by the experiments with the current implementation, the fixed size +seems to work reasonably well for now, so we leave this as a future work. diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c new file mode 100644 index 00000000000..d2b9f6a7c70 --- /dev/null +++ b/src/backend/statistics/extended_stats.c @@ -0,0 +1,389 @@ +/*------------------------------------------------------------------------- + * + * extended_stats.c + * POSTGRES extended statistics + * + * Generic code supporting statistic objects created via CREATE STATISTICS. 
+ * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/statistics/extended_stats.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/htup_details.h" +#include "catalog/indexing.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_statistic_ext.h" +#include "nodes/relation.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + + +/* + * Used internally to refer to an individual pg_statistic_ext entry. + */ +typedef struct StatExtEntry +{ + Oid statOid; /* OID of pg_statistic_ext entry */ + Bitmapset *columns; /* attribute numbers covered by the statistics */ + List *types; /* 'char' list of enabled statistic kinds */ +} StatExtEntry; + + +static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid); +static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, + int natts, VacAttrStats **vacattrstats); +static void statext_store(Relation pg_stext, Oid relid, + MVNDistinct *ndistinct, + VacAttrStats **stats); + + +/* + * Compute requested extended stats, using the rows sampled for the plain + * (single-column) stats. + * + * This fetches a list of stats from pg_statistic_ext, computes the stats + * and serializes them back into the catalog (as bytea values). 
+ */ +void +BuildRelationExtStatistics(Relation onerel, double totalrows, + int numrows, HeapTuple *rows, + int natts, VacAttrStats **vacattrstats) +{ + Relation pg_stext; + ListCell *lc; + List *stats; + + pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock); + stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); + + foreach(lc, stats) + { + StatExtEntry *stat = (StatExtEntry *) lfirst(lc); + MVNDistinct *ndistinct = NULL; + VacAttrStats **stats; + ListCell *lc2; + + /* filter only the interesting vacattrstats records */ + stats = lookup_var_attr_stats(onerel, stat->columns, + natts, vacattrstats); + + /* check allowed number of dimensions */ + Assert(bms_num_members(stat->columns) >= 2 && + bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); + + /* compute statistic of each type */ + foreach(lc2, stat->types) + { + char t = (char) lfirst_int(lc2); + + if (t == STATS_EXT_NDISTINCT) + ndistinct = statext_ndistinct_build(totalrows, numrows, rows, + stat->columns, stats); + } + + /* store the statistics in the catalog */ + statext_store(pg_stext, stat->statOid, ndistinct, stats); + } + + heap_close(pg_stext, RowExclusiveLock); +} + +/* + * statext_is_kind_built + * Is this stat kind built in the given pg_statistic_ext tuple? + */ +bool +statext_is_kind_built(HeapTuple htup, char type) +{ + AttrNumber attnum; + + switch (type) + { + case STATS_EXT_NDISTINCT: + attnum = Anum_pg_statistic_ext_standistinct; + break; + + default: + elog(ERROR, "unexpected statistics type requested: %d", type); + } + + return !heap_attisnull(htup, attnum); +} + +/* + * Return a list (of StatExtEntry) of statistics for the given relation. + */ +static List * +fetch_statentries_for_relation(Relation pg_statext, Oid relid) +{ + SysScanDesc scan; + ScanKeyData skey; + HeapTuple htup; + List *result = NIL; + + /* + * Prepare to scan pg_statistic_ext for entries having indrelid = this + * rel. 
+ */ + ScanKeyInit(&skey, + Anum_pg_statistic_ext_starelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(htup = systable_getnext(scan))) + { + StatExtEntry *entry; + Datum datum; + bool isnull; + int i; + ArrayType *arr; + char *enabled; + Form_pg_statistic_ext staForm; + + entry = palloc0(sizeof(StatExtEntry)); + entry->statOid = HeapTupleGetOid(htup); + staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); + for (i = 0; i < staForm->stakeys.dim1; i++) + { + entry->columns = bms_add_member(entry->columns, + staForm->stakeys.values[i]); + } + + /* decode the staenabled char array into a list of chars */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_staenabled, &isnull); + Assert(!isnull); + arr = DatumGetArrayTypeP(datum); + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != CHAROID) + elog(ERROR, "staenabled is not a 1-D char array"); + enabled = (char *) ARR_DATA_PTR(arr); + for (i = 0; i < ARR_DIMS(arr)[0]; i++) + { + Assert(enabled[i] == STATS_EXT_NDISTINCT); + entry->types = lappend_int(entry->types, (int) enabled[i]); + } + + result = lappend(result, entry); + } + + systable_endscan(scan); + + return result; +} + +/* + * Using 'vacattrstats' of size 'natts' as input data, return a newly built + * VacAttrStats array which includes only the items corresponding to attributes + * indicated by 'attrs'. 
+ */ +static VacAttrStats ** +lookup_var_attr_stats(Relation rel, Bitmapset *attrs, int natts, + VacAttrStats **vacattrstats) +{ + int i = 0; + int x = -1; + VacAttrStats **stats; + Bitmapset *matched = NULL; + + stats = (VacAttrStats **) + palloc(bms_num_members(attrs) * sizeof(VacAttrStats *)); + + /* lookup VacAttrStats info for the requested columns (same attnum) */ + while ((x = bms_next_member(attrs, x)) >= 0) + { + int j; + + stats[i] = NULL; + for (j = 0; j < natts; j++) + { + if (x == vacattrstats[j]->tupattnum) + { + stats[i] = vacattrstats[j]; + break; + } + } + + if (!stats[i]) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("extended statistics could not be collected for column \"%s\" of relation %s.%s", + NameStr(RelationGetDescr(rel)->attrs[x - 1]->attname), + get_namespace_name(rel->rd_rel->relnamespace), + RelationGetRelationName(rel)), + errhint("Consider ALTER TABLE \"%s\".\"%s\" ALTER \"%s\" SET STATISTICS -1", + get_namespace_name(rel->rd_rel->relnamespace), + RelationGetRelationName(rel), + NameStr(RelationGetDescr(rel)->attrs[x - 1]->attname)))); + + /* + * Check that we found a non-dropped column and that the attnum + * matches. + */ + Assert(!stats[i]->attr->attisdropped); + matched = bms_add_member(matched, stats[i]->tupattnum); + + i++; + } + if (bms_subset_compare(matched, attrs) != BMS_EQUAL) + elog(ERROR, "could not find all attributes in attribute stats array"); + bms_free(matched); + + return stats; +} + +/* + * statext_store + * Serializes the statistics and stores them into the pg_statistic_ext tuple. 
+ */ +static void +statext_store(Relation pg_stext, Oid statOid, + MVNDistinct *ndistinct, + VacAttrStats **stats) +{ + HeapTuple stup, + oldtup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + bool replaces[Natts_pg_statistic_ext]; + + memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool)); + memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool)); + memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum)); + + /* + * Construct a new pg_statistic_ext tuple, replacing the calculated stats. + */ + if (ndistinct != NULL) + { + bytea *data = statext_ndistinct_serialize(ndistinct); + + nulls[Anum_pg_statistic_ext_standistinct - 1] = (data == NULL); + values[Anum_pg_statistic_ext_standistinct - 1] = PointerGetDatum(data); + } + + /* always replace the value (either by bytea or NULL) */ + replaces[Anum_pg_statistic_ext_standistinct - 1] = true; + + /* there should already be a pg_statistic_ext tuple */ + oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + if (!HeapTupleIsValid(oldtup)) + elog(ERROR, "cache lookup failed for extended statistics %u", statOid); + + /* replace it */ + stup = heap_modify_tuple(oldtup, + RelationGetDescr(pg_stext), + values, + nulls, + replaces); + ReleaseSysCache(oldtup); + CatalogTupleUpdate(pg_stext, &stup->t_self, stup); + + heap_freetuple(stup); +} + +/* initialize multi-dimensional sort */ +MultiSortSupport +multi_sort_init(int ndims) +{ + MultiSortSupport mss; + + Assert(ndims >= 2); + + mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup) + +sizeof(SortSupportData) * ndims); + + mss->ndims = ndims; + + return mss; +} + +/* + * Prepare sort support info using the given sort operator + * at the position 'sortdim' + */ +void +multi_sort_add_dimension(MultiSortSupport mss, int sortdim, Oid oper) +{ + SortSupport ssup = &mss->ssup[sortdim]; + + ssup->ssup_cxt = CurrentMemoryContext; + ssup->ssup_collation = DEFAULT_COLLATION_OID; + ssup->ssup_nulls_first = false; + 
ssup->ssup_cxt = CurrentMemoryContext; + + PrepareSortSupportFromOrderingOp(oper, ssup); +} + +/* compare all the dimensions in the selected order */ +int +multi_sort_compare(const void *a, const void *b, void *arg) +{ + MultiSortSupport mss = (MultiSortSupport) arg; + SortItem *ia = (SortItem *) a; + SortItem *ib = (SortItem *) b; + int i; + + for (i = 0; i < mss->ndims; i++) + { + int compare; + + compare = ApplySortComparator(ia->values[i], ia->isnull[i], + ib->values[i], ib->isnull[i], + &mss->ssup[i]); + + if (compare != 0) + return compare; + } + + /* equal by default */ + return 0; +} + +/* compare selected dimension */ +int +multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b, + MultiSortSupport mss) +{ + return ApplySortComparator(a->values[dim], a->isnull[dim], + b->values[dim], b->isnull[dim], + &mss->ssup[dim]); +} + +int +multi_sort_compare_dims(int start, int end, + const SortItem *a, const SortItem *b, + MultiSortSupport mss) +{ + int dim; + + for (dim = start; dim <= end; dim++) + { + int r = ApplySortComparator(a->values[dim], a->isnull[dim], + b->values[dim], b->isnull[dim], + &mss->ssup[dim]); + + if (r != 0) + return r; + } + + return 0; +} diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c new file mode 100644 index 00000000000..5df4e29eec3 --- /dev/null +++ b/src/backend/statistics/mvdistinct.c @@ -0,0 +1,671 @@ +/*------------------------------------------------------------------------- + * + * mvdistinct.c + * POSTGRES multivariate ndistinct coefficients + * + * Estimating number of groups in a combination of columns (e.g. for GROUP BY) + * is tricky, and the estimation error is often significant. + + * The multivariate ndistinct coefficients address this by storing ndistinct + * estimates for combinations of the user-specified columns. So for example + * given a statistics object on three columns (a,b,c), this module estimates + * and store n-distinct for (a,b), (a,c), (b,c) and (a,b,c). 
The per-column + * estimates are already available in pg_statistic. + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/statistics/mvdistinct.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/htup_details.h" +#include "catalog/pg_statistic_ext.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "lib/stringinfo.h" +#include "utils/syscache.h" +#include "utils/typcache.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" + + +static double ndistinct_for_combination(double totalrows, int numrows, + HeapTuple *rows, VacAttrStats **stats, + int k, int *combination); +static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); +static int n_choose_k(int n, int k); +static int num_combinations(int n); + +/* Combination generator API */ + +/* internal state for generator of k-combinations of n elements */ +typedef struct CombinationGenerator +{ + int k; /* size of the combination */ + int n; /* total number of elements */ + int current; /* index of the next combination to return */ + int ncombinations; /* number of combinations (size of array) */ + int *combinations; /* array of pre-built combinations */ +} CombinationGenerator; + +static CombinationGenerator *generator_init(int n, int k); +static void generator_free(CombinationGenerator *state); +static int *generator_next(CombinationGenerator *state); +static void generate_combinations(CombinationGenerator *state); + + +/* + * statext_ndistinct_build + * Compute ndistinct coefficient for the combination of attributes. + * + * This computes the ndistinct estimate using the same estimator used + * in analyze.c and then computes the coefficient. 
+ */ +MVNDistinct * +statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, + Bitmapset *attrs, VacAttrStats **stats) +{ + MVNDistinct *result; + int k; + int itemcnt; + int numattrs = bms_num_members(attrs); + int numcombs = num_combinations(numattrs); + + result = palloc(offsetof(MVNDistinct, items) + + numcombs * sizeof(MVNDistinctItem)); + result->magic = STATS_NDISTINCT_MAGIC; + result->type = STATS_NDISTINCT_TYPE_BASIC; + result->nitems = numcombs; + + itemcnt = 0; + for (k = 2; k <= numattrs; k++) + { + int *combination; + CombinationGenerator *generator; + + /* generate combinations of K out of N elements */ + generator = generator_init(numattrs, k); + + while ((combination = generator_next(generator))) + { + MVNDistinctItem *item = &result->items[itemcnt]; + int j; + + item->attrs = NULL; + for (j = 0; j < k; j++) + item->attrs = bms_add_member(item->attrs, + stats[combination[j]]->attr->attnum); + item->ndistinct = + ndistinct_for_combination(totalrows, numrows, rows, + stats, k, combination); + + itemcnt++; + Assert(itemcnt <= result->nitems); + } + + generator_free(generator); + } + + /* must consume exactly the whole output array */ + Assert(itemcnt == result->nitems); + + return result; +} + +/* + * statext_ndistinct_load + * Load the ndistinct value for the indicated pg_statistic_ext tuple + */ +MVNDistinct * +statext_ndistinct_load(Oid mvoid) +{ + bool isnull = false; + Datum ndist; + HeapTuple htup; + + htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); + if (!htup) + elog(ERROR, "cache lookup failed for statistics %u", mvoid); + + ndist = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_standistinct, &isnull); + if (isnull) + elog(ERROR, + "requested statistic kind %c not yet built for statistics %u", + STATS_EXT_NDISTINCT, mvoid); + + ReleaseSysCache(htup); + + return statext_ndistinct_deserialize(DatumGetByteaP(ndist)); +} + +/* + * statext_ndistinct_serialize + * serialize ndistinct to the on-disk bytea 
format + */ +bytea * +statext_ndistinct_serialize(MVNDistinct *ndistinct) +{ + int i; + bytea *output; + char *tmp; + Size len; + + Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC); + Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC); + + /* + * Base size is base struct size, plus one base struct for each items, + * including number of items for each. + */ + len = VARHDRSZ + offsetof(MVNDistinct, items) + + ndistinct->nitems * (offsetof(MVNDistinctItem, attrs) + sizeof(int)); + + /* and also include space for the actual attribute numbers */ + for (i = 0; i < ndistinct->nitems; i++) + { + int nmembers; + + nmembers = bms_num_members(ndistinct->items[i].attrs); + Assert(nmembers >= 2); + len += sizeof(AttrNumber) * nmembers; + } + + output = (bytea *) palloc(len); + SET_VARSIZE(output, len); + + tmp = VARDATA(output); + + /* Store the base struct values */ + memcpy(tmp, ndistinct, offsetof(MVNDistinct, items)); + tmp += offsetof(MVNDistinct, items); + + /* + * store number of attributes and attribute numbers for each ndistinct + * entry + */ + for (i = 0; i < ndistinct->nitems; i++) + { + MVNDistinctItem item = ndistinct->items[i]; + int nmembers = bms_num_members(item.attrs); + int x; + + memcpy(tmp, &item.ndistinct, sizeof(double)); + tmp += sizeof(double); + memcpy(tmp, &nmembers, sizeof(int)); + tmp += sizeof(int); + + x = -1; + while ((x = bms_next_member(item.attrs, x)) >= 0) + { + AttrNumber value = (AttrNumber) x; + + memcpy(tmp, &value, sizeof(AttrNumber)); + tmp += sizeof(AttrNumber); + } + + Assert(tmp <= ((char *) output + len)); + } + + return output; +} + +/* + * statext_ndistinct_deserialize + * Read an on-disk bytea format MVNDistinct to in-memory format + */ +MVNDistinct * +statext_ndistinct_deserialize(bytea *data) +{ + int i; + Size expected_size; + MVNDistinct *ndistinct; + char *tmp; + + if (data == NULL) + return NULL; + + if (VARSIZE_ANY_EXHDR(data) < offsetof(MVNDistinct, items)) + elog(ERROR, "invalid MVNDistinct size %ld (expected at 
least %ld)", + VARSIZE_ANY_EXHDR(data), offsetof(MVNDistinct, items)); + + /* read the MVNDistinct header */ + ndistinct = (MVNDistinct *) palloc(sizeof(MVNDistinct)); + + /* initialize pointer to the data part (skip the varlena header) */ + tmp = VARDATA_ANY(data); + + /* get the header and perform basic sanity checks */ + memcpy(ndistinct, tmp, offsetof(MVNDistinct, items)); + tmp += offsetof(MVNDistinct, items); + + if (ndistinct->magic != STATS_NDISTINCT_MAGIC) + elog(ERROR, "invalid ndistinct magic %d (expected %d)", + ndistinct->magic, STATS_NDISTINCT_MAGIC); + + if (ndistinct->type != STATS_NDISTINCT_TYPE_BASIC) + elog(ERROR, "invalid ndistinct type %d (expected %d)", + ndistinct->type, STATS_NDISTINCT_TYPE_BASIC); + + Assert(ndistinct->nitems > 0); + + /* what minimum bytea size do we expect for those parameters */ + expected_size = offsetof(MVNDistinct, items) + + ndistinct->nitems * (offsetof(MVNDistinctItem, attrs) + + sizeof(AttrNumber) * 2); + + if (VARSIZE_ANY_EXHDR(data) < expected_size) + elog(ERROR, "invalid dependencies size %ld (expected at least %ld)", + VARSIZE_ANY_EXHDR(data), expected_size); + + /* allocate space for the ndistinct items */ + ndistinct = repalloc(ndistinct, offsetof(MVNDistinct, items) + + (ndistinct->nitems * sizeof(MVNDistinctItem))); + + for (i = 0; i < ndistinct->nitems; i++) + { + MVNDistinctItem *item = &ndistinct->items[i]; + int nelems; + + item->attrs = NULL; + + /* ndistinct value */ + memcpy(&item->ndistinct, tmp, sizeof(double)); + tmp += sizeof(double); + + /* number of attributes */ + memcpy(&nelems, tmp, sizeof(int)); + tmp += sizeof(int); + Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS)); + + while (nelems-- > 0) + { + AttrNumber attno; + + memcpy(&attno, tmp, sizeof(AttrNumber)); + tmp += sizeof(AttrNumber); + item->attrs = bms_add_member(item->attrs, attno); + } + + /* still within the bytea */ + Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); + } + + /* we should have consumed the whole bytea 
exactly */ + Assert(tmp == ((char *) data + VARSIZE_ANY(data))); + + return ndistinct; +} + +/* + * pg_ndistinct_in + * input routine for type pg_ndistinct + * + * pg_ndistinct is real enough to be a table column, but it has no + * operations of its own, and disallows input (jus like pg_node_tree). + */ +Datum +pg_ndistinct_in(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_ndistinct + * output routine for type pg_ndistinct + * + * Produces a human-readable representation of the value. + */ +Datum +pg_ndistinct_out(PG_FUNCTION_ARGS) +{ + bytea *data = PG_GETARG_BYTEA_PP(0); + MVNDistinct *ndist = statext_ndistinct_deserialize(data); + int i; + StringInfoData str; + + initStringInfo(&str); + appendStringInfoChar(&str, '['); + + for (i = 0; i < ndist->nitems; i++) + { + MVNDistinctItem item = ndist->items[i]; + + if (i > 0) + appendStringInfoString(&str, ", "); + + appendStringInfoChar(&str, '{'); + outBitmapset(&str, item.attrs); + appendStringInfo(&str, ", %f}", item.ndistinct); + } + + appendStringInfoChar(&str, ']'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * pg_ndistinct_recv + * binary input routine for type pg_ndistinct + */ +Datum +pg_ndistinct_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * pg_ndistinct_send + * binary output routine for type pg_ndistinct + * + * n-distinct is serialized into a bytea value, so let's send that. + */ +Datum +pg_ndistinct_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} + +/* + * ndistinct_for_combination + * Estimates number of distinct values in a combination of columns. 
+ *
+ * This uses the same ndistinct estimator as compute_scalar_stats() in
+ * ANALYZE, i.e.,
+ *		n*d / (n - f1 + f1*n/N)
+ *
+ * except that instead of values in a single column we are dealing with
+ * combination of multiple columns.
+ */
+static double
+ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
+						  VacAttrStats **stats, int k, int *combination)
+{
+	int			i,
+				j;
+	int			f1,
+				cnt,
+				d;
+	bool	   *isnull;
+	Datum	   *values;
+	SortItem   *items;
+	MultiSortSupport mss;
+
+	mss = multi_sort_init(k);
+
+	/*
+	 * In order to determine the number of distinct elements, create separate
+	 * values[]/isnull[] arrays with all the data we have, then sort them
+	 * using the specified column combination as dimensions.  We could try to
+	 * sort in place, but it'd probably be more complex and bug-prone.
+	 */
+	items = (SortItem *) palloc(numrows * sizeof(SortItem));
+	values = (Datum *) palloc0(sizeof(Datum) * numrows * k);
+	isnull = (bool *) palloc0(sizeof(bool) * numrows * k);
+
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * k];
+		items[i].isnull = &isnull[i * k];
+	}
+
+	/*
+	 * For each dimension, set up sort-support and fill in the values from
+	 * the sample data.
+	 */
+	for (i = 0; i < k; i++)
+	{
+		VacAttrStats *colstat = stats[combination[i]];
+		TypeCacheEntry *type;
+
+		type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
+		if (type->lt_opr == InvalidOid) /* shouldn't happen */
+			elog(ERROR, "cache lookup failed for ordering operator for type %u",
+				 colstat->attrtypid);
+
+		/* prepare the sort function for this dimension */
+		multi_sort_add_dimension(mss, i, type->lt_opr);
+
+		/* accumulate all the data for this dimension into the arrays */
+		for (j = 0; j < numrows; j++)
+		{
+			items[j].values[i] =
+				heap_getattr(rows[j],
+							 colstat->attr->attnum,
+							 colstat->tupDesc,
+							 &items[j].isnull[i]);
+		}
+	}
+
+	/* We can sort the array now ... */
+	qsort_arg((void *) items, numrows, sizeof(SortItem),
+			  multi_sort_compare, mss);
+
+	/* ... and count the number of distinct combinations */
+
+	f1 = 0;
+	cnt = 1;
+	d = 1;
+	for (i = 1; i < numrows; i++)
+	{
+		if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
+		{
+			if (cnt == 1)
+				f1 += 1;
+
+			d++;
+			cnt = 0;
+		}
+
+		cnt += 1;
+	}
+
+	if (cnt == 1)
+		f1 += 1;
+
+	return estimate_ndistinct(totalrows, numrows, d, f1);
+}
+
+/* The Duj1 estimator (already used in analyze.c). */
+static double
+estimate_ndistinct(double totalrows, int numrows, int d, int f1)
+{
+	double		numer,
+				denom,
+				ndistinct;
+
+	numer = (double) numrows * (double) d;
+
+	denom = (double) (numrows - f1) +
+		(double) f1 *(double) numrows / totalrows;
+
+	ndistinct = numer / denom;
+
+	/* Clamp to sane range in case of roundoff error */
+	if (ndistinct < (double) d)
+		ndistinct = (double) d;
+
+	if (ndistinct > totalrows)
+		ndistinct = totalrows;
+
+	return floor(ndistinct + 0.5);
+}
+
+/*
+ * n_choose_k
+ *		computes binomial coefficients using an algorithm that is both
+ *		efficient and prevents overflows
+ */
+static int
+n_choose_k(int n, int k)
+{
+	int			d,
+				r;
+
+	Assert((k > 0) && (n >= k));
+
+	/* use symmetry of the binomial coefficients */
+	k = Min(k, n - k);
+
+	r = 1;
+	for (d = 1; d <= k; ++d)
+	{
+		r *= n--;
+		r /= d;
+	}
+
+	return r;
+}
+
+/*
+ * num_combinations
+ *		number of combinations, excluding single-value combinations
+ */
+static int
+num_combinations(int n)
+{
+	int			k;
+	int			ncombs = 1;
+
+	for (k = 1; k <= n; k++)
+		ncombs *= 2;
+
+	ncombs -= (n + 1);
+
+	return ncombs;
+}
+
+/*
+ * generator_init
+ *		initialize the generator of combinations
+ *
+ * The generator produces combinations of K elements in the interval (0..N).
+ * We prebuild all the combinations in this method, which is simpler than
+ * generating them on the fly.
+ */
+static CombinationGenerator *
+generator_init(int n, int k)
+{
+	CombinationGenerator *state;
+
+	Assert((n >= k) && (k > 0));
+
+	/* allocate the generator state as a single chunk of memory */
+	state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator));
+
+	state->ncombinations = n_choose_k(n, k);
+
+	/* pre-allocate space for all combinations */
+	state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations);
+
+	state->current = 0;
+	state->k = k;
+	state->n = n;
+
+	/* now actually pre-generate all the combinations of K elements */
+	generate_combinations(state);
+
+	/* make sure we got the expected number of combinations */
+	Assert(state->current == state->ncombinations);
+
+	/* reset the number, so we start with the first one */
+	state->current = 0;
+
+	return state;
+}
+
+/*
+ * generator_next
+ *		returns the next combination from the prebuilt list
+ *
+ * Returns a combination of K array indexes (0 .. N), as specified to
+ * generator_init(), or NULL when there are no more combinations.
+ */
+static int *
+generator_next(CombinationGenerator *state)
+{
+	if (state->current == state->ncombinations)
+		return NULL;
+
+	return &state->combinations[state->k * state->current++];
+}
+
+/*
+ * generator_free
+ *		free the internal state of the generator
+ *
+ * Releases the generator internal state (pre-built combinations).
+ */
+static void
+generator_free(CombinationGenerator *state)
+{
+	pfree(state->combinations);
+	pfree(state);
+}
+
+/*
+ * generate_combinations_recurse
+ *		given a prefix, generate all possible combinations
+ *
+ * Given a prefix (first few elements of the combination), generate following
+ * elements recursively.  We generate the combinations in lexicographic order,
+ * which eliminates permutations of the same combination.
+ */
+static void
+generate_combinations_recurse(CombinationGenerator *state,
+							  int index, int start, int *current)
+{
+	/* If we haven't filled all the elements, simply recurse.
*/ + if (index < state->k) + { + int i; + + /* + * The values have to be in ascending order, so make sure we start + * with the value passed by parameter. + */ + + for (i = start; i < state->n; i++) + { + current[index] = i; + generate_combinations_recurse(state, (index + 1), (i + 1), current); + } + + return; + } + else + { + /* we got a valid combination, add it to the array */ + memcpy(&state->combinations[(state->k * state->current)], + current, state->k * sizeof(int)); + state->current++; + } +} + +/* + * generate_combinations + * generate all k-combinations of N elements + */ +static void +generate_combinations(CombinationGenerator *state) +{ + int *current = (int *) palloc0(sizeof(int) * state->k); + + generate_combinations_recurse(state, 0, 0, current); + + pfree(current); +} diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index c8d20fffeaf..b59821bf978 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1623,6 +1623,10 @@ ProcessUtilitySlow(ParseState *pstate, commandCollected = true; break; + case T_CreateStatsStmt: + address = CreateStatistics((CreateStatsStmt *) parsetree); + break; + case T_AlterCollationStmt: address = AlterCollation((AlterCollationStmt *) parsetree); break; @@ -1992,6 +1996,8 @@ AlterObjectTypeCommandTag(ObjectType objtype) break; case OBJECT_SUBSCRIPTION: tag = "ALTER SUBSCRIPTION"; + case OBJECT_STATISTIC_EXT: + tag = "ALTER STATISTICS"; break; default: tag = "???"; @@ -2286,6 +2292,8 @@ CreateCommandTag(Node *parsetree) break; case OBJECT_PUBLICATION: tag = "DROP PUBLICATION"; + case OBJECT_STATISTIC_EXT: + tag = "DROP STATISTICS"; break; default: tag = "???"; @@ -2689,6 +2697,10 @@ CreateCommandTag(Node *parsetree) tag = "EXECUTE"; break; + case T_CreateStatsStmt: + tag = "CREATE STATISTICS"; + break; + case T_DeallocateStmt: { DeallocateStmt *stmt = (DeallocateStmt *) parsetree; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 
5c823250bc2..81c91039e40 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -35,6 +35,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_partitioned_table.h" #include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_trigger.h" #include "catalog/pg_type.h" #include "commands/defrem.h" @@ -317,6 +318,7 @@ static char *pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, bool attrsOnly, bool showTblSpc, int prettyFlags, bool missing_ok); +static char *pg_get_statisticsext_worker(Oid statextid, bool missing_ok); static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, bool attrsOnly); static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, @@ -1422,6 +1424,85 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, } /* + * pg_get_statisticsextdef + * Get the definition of an extended statistics object + */ +Datum +pg_get_statisticsextdef(PG_FUNCTION_ARGS) +{ + Oid statextid = PG_GETARG_OID(0); + char *res; + + res = pg_get_statisticsext_worker(statextid, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal workhorse to decompile an extended statistics object. 
+ */ +static char * +pg_get_statisticsext_worker(Oid statextid, bool missing_ok) +{ + Form_pg_statistic_ext statextrec; + Form_pg_class pgclassrec; + HeapTuple statexttup; + HeapTuple pgclasstup; + StringInfoData buf; + int colno; + + statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); + + if (!HeapTupleIsValid(statexttup)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for extended statistics %u", statextid); + } + + statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); + + pgclasstup = SearchSysCache1(RELOID, ObjectIdGetDatum(statextrec->starelid)); + + if (!HeapTupleIsValid(statexttup)) + { + ReleaseSysCache(statexttup); + elog(ERROR, "cache lookup failed for relation %u", statextrec->starelid); + } + + pgclassrec = (Form_pg_class) GETSTRUCT(pgclasstup); + + initStringInfo(&buf); + + appendStringInfo(&buf, "CREATE STATISTICS %s ON (", + quote_identifier(NameStr(statextrec->staname))); + + for (colno = 0; colno < statextrec->stakeys.dim1; colno++) + { + AttrNumber attnum = statextrec->stakeys.values[colno]; + char *attname; + + if (colno > 0) + appendStringInfoString(&buf, ", "); + + attname = get_relid_attribute_name(statextrec->starelid, attnum); + + appendStringInfoString(&buf, quote_identifier(attname)); + } + + appendStringInfo(&buf, ") FROM %s", + quote_identifier(NameStr(pgclassrec->relname))); + + ReleaseSysCache(statexttup); + ReleaseSysCache(pgclasstup); + + return buf.data; +} + +/* * pg_get_partkeydef * * Returns the partition key specification, ie, the following: diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index f8b28fe0e61..cc24c8aeb56 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -110,6 +110,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_statistic_ext.h" #include "catalog/pg_type.h" #include "executor/executor.h" #include "mb/pg_wchar.h" @@ 
-126,6 +127,7 @@
 #include "parser/parse_clause.h"
 #include "parser/parse_coerce.h"
 #include "parser/parsetree.h"
+#include "statistics/statistics.h"
 #include "utils/builtins.h"
 #include "utils/bytea.h"
 #include "utils/date.h"
@@ -164,6 +166,8 @@ static double eqjoinsel_inner(Oid operator,
 static double eqjoinsel_semi(Oid operator,
 			   VariableStatData *vardata1, VariableStatData *vardata2,
 			   RelOptInfo *inner_rel);
+static bool estimate_multivariate_ndistinct(PlannerInfo *root,
+					 RelOptInfo *rel, List **varinfos, double *ndistinct);
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
@@ -3398,25 +3402,25 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 	{
 		GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
 		RelOptInfo *rel = varinfo1->rel;
-		double		reldistinct = varinfo1->ndistinct;
+		double		reldistinct = 1;
 		double		relmaxndistinct = reldistinct;
 		int			relvarcount = 1;
 		List	   *newvarinfos = NIL;
+		List	   *relvarinfos = NIL;
 
 		/*
-		 * Get the product of numdistinct estimates of the Vars for this rel.
-		 * Also, construct new varinfos list of remaining Vars.
+		 * Split the list of varinfos in two - one for the current rel,
+		 * one for remaining Vars on other rels.
 		 */
+		relvarinfos = lcons(varinfo1, relvarinfos);
 		for_each_cell(l, lnext(list_head(varinfos)))
 		{
 			GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
 
 			if (varinfo2->rel == varinfo1->rel)
 			{
-				reldistinct *= varinfo2->ndistinct;
-				if (relmaxndistinct < varinfo2->ndistinct)
-					relmaxndistinct = varinfo2->ndistinct;
-				relvarcount++;
+				/* varinfos on current rel */
+				relvarinfos = lcons(varinfo2, relvarinfos);
 			}
 			else
 			{
@@ -3426,6 +3430,43 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		}
 
 		/*
+		 * Get the numdistinct estimate for the Vars of this rel.  We
+		 * iteratively search for multivariate n-distinct with maximum number
+		 * of vars; assuming that each var group is independent of the others,
+		 * we multiply them together.  Any remaining relvarinfos after
+		 * no more multivariate matches are found are assumed independent too,
+		 * so their individual ndistinct estimates are multiplied also.
+		 */
+		while (relvarinfos)
+		{
+			double		mvndistinct;
+
+			if (estimate_multivariate_ndistinct(root, rel, &relvarinfos,
+												&mvndistinct))
+			{
+				reldistinct *= mvndistinct;
+				if (relmaxndistinct < mvndistinct)
+					relmaxndistinct = mvndistinct;
+				relvarcount++;	/* inaccurate, but doesn't matter */
+			}
+			else
+			{
+				foreach (l, relvarinfos)
+				{
+					GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
+
+					reldistinct *= varinfo2->ndistinct;
+					if (relmaxndistinct < varinfo2->ndistinct)
+						relmaxndistinct = varinfo2->ndistinct;
+					relvarcount++;
+				}
+
+				/* we're done with this relation */
+				relvarinfos = NIL;
+			}
+		}
+
+		/*
 		 * Sanity check --- don't divide by zero if empty relation.
 		 */
 		Assert(rel->reloptkind == RELOPT_BASEREL);
@@ -3668,6 +3709,132 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
  */
 
 /*
+ * Find applicable ndistinct statistics for the given list of VarInfos (which
+ * must all belong to the given rel), and update *ndistinct to the estimate of
+ * the MVNDistinctItem that best matches.  If a match is found, *varinfos is
+ * updated to remove the list of matched varinfos.
+ *
+ * Varinfos that aren't for simple Vars are ignored.
+ *
+ * Return TRUE if we're able to find a match, FALSE otherwise.
+ */
+static bool
+estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
+								List **varinfos, double *ndistinct)
+{
+	ListCell   *lc;
+	Bitmapset  *attnums = NULL;
+	int			nmatches;
+	Oid			statOid = InvalidOid;
+	MVNDistinct *stats;
+	Bitmapset  *matched = NULL;
+
+	/* bail out immediately if the table has no extended statistics */
+	if (!rel->statlist)
+		return false;
+
+	/* Determine the attnums we're looking for */
+	foreach(lc, *varinfos)
+	{
+		GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
+
+		Assert(varinfo->rel == rel);
+
+		if (IsA(varinfo->var, Var))
+		{
+			attnums = bms_add_member(attnums,
+									 ((Var *) varinfo->var)->varattno);
+		}
+	}
+
+	/* look for the ndistinct statistics matching the most vars */
+	nmatches = 1;				/* we require at least two matches */
+	foreach(lc, rel->statlist)
+	{
+		StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+		Bitmapset  *shared;
+
+		/* skip statistics of other kinds */
+		if (info->kind != STATS_EXT_NDISTINCT)
+			continue;
+
+		/* compute attnums shared by the vars and the statistic */
+		shared = bms_intersect(info->keys, attnums);
+
+		/*
+		 * Does this statistic match more columns than the currently
+		 * best statistic?  If so, use this one instead.
+		 *
+		 * XXX This should break ties using name of the statistic, or
+		 * something like that, to make the outcome stable.
+		 */
+		if (bms_num_members(shared) > nmatches)
+		{
+			statOid = info->statOid;
+			nmatches = bms_num_members(shared);
+			matched = shared;
+		}
+	}
+
+	/* No match? */
+	if (statOid == InvalidOid)
+		return false;
+	Assert(nmatches > 1 && matched != NULL);
+
+	stats = statext_ndistinct_load(statOid);
+
+	/*
+	 * If we have a match, search it for the specific item that matches (there
+	 * must be one), and construct the output values.
+	 */
+	if (stats)
+	{
+		int			i;
+		List	   *newlist = NIL;
+		MVNDistinctItem *item = NULL;
+
+		/* Find the specific item that exactly matches the combination */
+		for (i = 0; i < stats->nitems; i++)
+		{
+			MVNDistinctItem *tmpitem = &stats->items[i];
+
+			if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL)
+			{
+				item = tmpitem;
+				break;
+			}
+		}
+
+		/* make sure we found an item */
+		if (!item)
+			elog(ERROR, "corrupt MVNDistinct entry");
+
+		/* Form the output varinfo list, keeping only unmatched ones */
+		foreach(lc, *varinfos)
+		{
+			GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
+			AttrNumber	attnum;
+
+			if (!IsA(varinfo->var, Var))
+			{
+				newlist = lappend(newlist, varinfo);
+				continue;
+			}
+
+			attnum = ((Var *) varinfo->var)->varattno;
+			if (!bms_is_member(attnum, matched))
+				newlist = lappend(newlist, varinfo);
+		}
+
+		*varinfos = newlist;
+		*ndistinct = item->ndistinct;
+		return true;
+	}
+
+	return false;
+}
+
+/*
  * convert_to_scalar
  *	  Convert non-NULL values of the indicated types to the comparison
  *	  scale needed by scalarineqsel().
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index ce55fc52777..a6b60c67caa 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -56,6 +56,7 @@
 #include "catalog/pg_publication.h"
 #include "catalog/pg_rewrite.h"
 #include "catalog/pg_shseclabel.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_tablespace.h"
 #include "catalog/pg_trigger.h"
@@ -4452,6 +4453,82 @@ RelationGetIndexList(Relation relation)
 }
 
 /*
+ * RelationGetStatExtList
+ *		get a list of OIDs of extended statistics on this relation
+ *
+ * The statistics list is created only if someone requests it, in a way
+ * similar to RelationGetIndexList().  We scan pg_statistic_ext to find
+ * relevant statistics, and add the list to the relcache entry so that we
+ * won't have to compute it again.
 Note that shared cache inval of a
+ * relcache entry will delete the old list and set rd_statvalid to 0,
+ * so that we must recompute the statistics list on next request.  This
+ * handles creation or deletion of a statistic.
+ *
+ * The returned list is guaranteed to be sorted in order by OID, although
+ * this is not currently needed.
+ *
+ * Since shared cache inval causes the relcache's copy of the list to go away,
+ * we return a copy of the list palloc'd in the caller's context.  The caller
+ * may list_free() the returned list after scanning it.  This is necessary
+ * since the caller will typically be doing syscache lookups on the relevant
+ * statistics, and syscache lookup could cause SI messages to be processed!
+ */
+List *
+RelationGetStatExtList(Relation relation)
+{
+	Relation	indrel;
+	SysScanDesc indscan;
+	ScanKeyData skey;
+	HeapTuple	htup;
+	List	   *result;
+	List	   *oldlist;
+	MemoryContext oldcxt;
+
+	/* Quick exit if we already computed the list. */
+	if (relation->rd_statvalid != 0)
+		return list_copy(relation->rd_statlist);
+
+	/*
+	 * We build the list we intend to return (in the caller's context) while
+	 * doing the scan.  After successfully completing the scan, we copy that
+	 * list into the relcache entry.  This avoids cache-context memory leakage
+	 * if we get some sort of error partway through.
+	 */
+	result = NIL;
+
+	/* Prepare to scan pg_statistic_ext for entries having starelid = this rel. */
+	ScanKeyInit(&skey,
+				Anum_pg_statistic_ext_starelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationGetRelid(relation)));
+
+	indrel = heap_open(StatisticExtRelationId, AccessShareLock);
+	indscan = systable_beginscan(indrel, StatisticExtRelidIndexId, true,
+								 NULL, 1, &skey);
+
+	while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+		/* TODO: maybe include only statistics that have already been built? */
+		result = insert_ordered_oid(result, HeapTupleGetOid(htup));
+
+	systable_endscan(indscan);
+
+	heap_close(indrel, AccessShareLock);
+
+	/* Now save a copy of the completed list in the relcache entry. */
+	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
+	oldlist = relation->rd_statlist;
+	relation->rd_statlist = list_copy(result);
+
+	relation->rd_statvalid = true;
+	MemoryContextSwitchTo(oldcxt);
+
+	/* Don't leak the old list, if there is one */
+	list_free(oldlist);
+
+	return result;
+}
+
+/*
  * insert_ordered_oid
  *		Insert a new Oid into a sorted list of Oids, preserving ordering
  *
@@ -5560,6 +5637,8 @@ load_relcache_init_file(bool shared)
 		rel->rd_pkattr = NULL;
 		rel->rd_idattr = NULL;
 		rel->rd_pubactions = NULL;
+		rel->rd_statvalid = false;
+		rel->rd_statlist = NIL;
 		rel->rd_createSubid = InvalidSubTransactionId;
 		rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
 		rel->rd_amcache = NULL;
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index d5a376406fe..d8c823f42b5 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -61,6 +61,7 @@
 #include "catalog/pg_shseclabel.h"
 #include "catalog/pg_replication_origin.h"
 #include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_subscription_rel.h"
 #include "catalog/pg_tablespace.h"
@@ -726,6 +727,28 @@ static const struct cachedesc cacheinfo[] = {
 		},
 		32
 	},
+	{StatisticExtRelationId,	/* STATEXTNAMENSP */
+		StatisticExtNameIndexId,
+		2,
+		{
+			Anum_pg_statistic_ext_staname,
+			Anum_pg_statistic_ext_stanamespace,
+			0,
+			0
+		},
+		4
+	},
+	{StatisticExtRelationId,	/* STATEXTOID */
+		StatisticExtOidIndexId,
+		1,
+		{
+			ObjectIdAttributeNumber,
+			0,
+			0,
+			0
+		},
+		4
+	},
 	{StatisticRelationId,		/* STATRELATTINH */
 		StatisticRelidAttnumInhIndexId,
 		3,