diff options
Diffstat (limited to 'src/backend/executor/execPartition.c')
-rw-r--r-- | src/backend/executor/execPartition.c | 560 |
1 files changed, 560 insertions, 0 deletions
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c new file mode 100644 index 00000000000..d275cefe1df --- /dev/null +++ b/src/backend/executor/execPartition.c @@ -0,0 +1,560 @@ +/*------------------------------------------------------------------------- + * + * execPartition.c + * Support routines for partitioning. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execPartition.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_inherits_fn.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" + +static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); +static void FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static char *ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen); + +/* + * ExecSetupPartitionTupleRouting - set up information needed during + * tuple routing for partitioned tables + * + * Output arguments: + * 'pd' receives an array of PartitionDispatch objects with one entry for + * every partitioned table in the partition tree + * 'partitions' receives an array of ResultRelInfo* objects with one entry for + * every leaf partition in the partition tree + * 'tup_conv_maps' receives an array of TupleConversionMap objects with one + * entry for every leaf partition (required to convert input tuple based + * on the root table's rowtype to a leaf partition's rowtype after tuple + * routing is done) + * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used + * to manipulate any given leaf partition's rowtype after that partition + * is chosen by tuple-routing. + * 'num_parted' receives the number of partitioned tables in the partition + * tree (= the number of entries in the 'pd' output array) + * 'num_partitions' receives the number of leaf partitions in the partition + * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' + * output arrays + * + * Note that all the relations in the partition tree are locked using the + * RowExclusiveLock mode upon return from this function. + */ +void +ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo ***partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions) +{ + TupleDesc tupDesc = RelationGetDescr(rel); + List *leaf_parts; + ListCell *cell; + int i; + ResultRelInfo *leaf_part_rri; + + /* + * Get the information about the partition tree after locking all the + * partitions. + */ + (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); + *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); + *num_partitions = list_length(leaf_parts); + *partitions = (ResultRelInfo **) palloc(*num_partitions * + sizeof(ResultRelInfo *)); + *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * + sizeof(TupleConversionMap *)); + + /* + * Initialize an empty slot that will be used to manipulate tuples of any + * given partition's rowtype. It is attached to the caller-specified node + * (such as ModifyTableState) and released when the node finishes + * processing. + */ + *partition_tuple_slot = MakeTupleTableSlot(); + + leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * + sizeof(ResultRelInfo)); + i = 0; + foreach(cell, leaf_parts) + { + Relation partrel; + TupleDesc part_tupdesc; + + /* + * We locked all the partitions above including the leaf partitions. + * Note that each of the relations in *partitions are eventually + * closed by the caller. + */ + partrel = heap_open(lfirst_oid(cell), NoLock); + part_tupdesc = RelationGetDescr(partrel); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for INSERT. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Open partition indices (remember we do not support ON CONFLICT in + * case of partitioned tables, so we do not need support information + * for speculative insertion) + */ + if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, false); + + estate->es_leaf_result_relations = + lappend(estate->es_leaf_result_relations, leaf_part_rri); + + (*partitions)[i] = leaf_part_rri++; + i++; + } +} + +/* + * ExecFindPartition -- Find a leaf partition in the partition tree rooted + * at parent, for the heap tuple contained in *slot + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition key(s) + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message, else it returns the leaf partition sequence number + * as an index into the array of (ResultRelInfos of) all leaf partitions in + * the partition tree. + */ +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, + TupleTableSlot *slot, EState *estate) +{ + int result; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + Relation rel; + PartitionDispatch parent; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + + /* + * First check the root table's partition constraint, if any. No point in + * routing the tuple if it doesn't belong in the root table itself. + */ + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate); + + /* start with the root partitioned table */ + parent = pd[0]; + while (true) + { + PartitionDesc partdesc; + TupleTableSlot *myslot = parent->tupslot; + TupleConversionMap *map = parent->tupmap; + int cur_index = -1; + + rel = parent->reldesc; + partdesc = RelationGetPartitionDesc(rel); + + /* + * Convert the tuple to this parent's layout so that we can do certain + * things we do below. + */ + if (myslot != NULL && map != NULL) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + + ExecClearTuple(myslot); + tuple = do_convert_tuple(tuple, map); + ExecStoreTuple(tuple, myslot, InvalidBuffer, true); + slot = myslot; + } + + /* Quick exit */ + if (partdesc->nparts == 0) + { + result = -1; + break; + } + + /* + * Extract partition key from tuple. Expression evaluation machinery + * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to + * point to the correct tuple slot. The slot might have changed from + * what was used for the parent table if the table of the current + * partitioning level has different tuple descriptor from the parent. + * So update ecxt_scantuple accordingly. + */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(parent, slot, estate, values, isnull); + cur_index = get_partition_for_tuple(rel, values, isnull); + + /* + * cur_index < 0 means we failed to find a partition of this parent. + * cur_index >= 0 means we either found the leaf partition, or the + * next parent to find a partition of. + */ + if (cur_index < 0) + { + result = -1; + break; + } + else if (parent->indexes[cur_index] >= 0) + { + result = parent->indexes[cur_index]; + break; + } + else + parent = pd[-parent->indexes[cur_index]]; + } + + /* A partition was not found. */ + if (result < 0) + { + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); + } + + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return result; +} + +/* + * RelationGetPartitionDispatchInfo + * Returns information necessary to route tuples down a partition tree + * + * The number of elements in the returned array (that is, the number of + * PartitionDispatch objects for the partitioned tables in the partition tree) + * is returned in *num_parted and a list of the OIDs of all the leaf + * partitions of rel is returned in *leaf_part_oids. + * + * All the relations in the partition tree (including 'rel') must have been + * locked (using at least the AccessShareLock) by the caller. + */ +static PartitionDispatch * +RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids) +{ + List *pdlist = NIL; + PartitionDispatchData **pd; + ListCell *lc; + int i; + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + *num_parted = 0; + *leaf_part_oids = NIL; + + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach(lc, pdlist) + { + pd[i++] = lfirst(lc); + } + + return pd; +} + +/* + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we maintain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. + * + * Note that the order of OIDs of leaf partitions in leaf_part_oids matches + * the order in which the planner's expand_partitioned_rtentry() processes + * them. It's not necessarily the case that the offsets match up exactly, + * because constraint exclusion might prune away some partitions on the + * planner side, whereas we'll always have the complete list; but unpruned + * partitions will appear in the same order in the plan as they are returned + * here. + */ +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + check_stack_depth(); + + /* Build a PartitionDispatch for this table and add it to *pds. */ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent != NULL) + { + /* + * For every partitioned table other than the root, we must store a + * tuple table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. + */ + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd->tupslot = NULL; + pd->tupmap = NULL; + } + + /* + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. + * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied by -1 is returned as the index + * in the array of PartitionDispatch objects of all partitioned tables in + * the tree. This value is used to continue the search in the next level + * of the partition tree. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + { + Oid partrelid = partdesc->oids[i]; + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) + { + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd->indexes[i] = list_length(*leaf_part_oids) - 1; + } + else + { + /* + * We assume all tables in the partition tree were already locked + * by the caller. + */ + Relation partrel = heap_open(partrelid, NoLock); + + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); + } + } +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. + * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * BuildSlotPartitionKeyDescription + * + * This works very much like BuildIndexValueDescription() and is currently + * used for building error messages when ExecFindPartition() fails to find + * partition for a row. + */ +static char * +ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen) +{ + StringInfoData buf; + PartitionKey key = RelationGetPartitionKey(rel); + int partnatts = get_partition_natts(key); + int i; + Oid relid = RelationGetRelid(rel); + AclResult aclresult; + + if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* If the user has table-level access, just go build the description. */ + aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * Step through the columns of the partition key and make sure the + * user has SELECT rights on all of them. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber attnum = get_partition_col_attnum(key, i); + + /* + * If this partition key column is an expression, we return no + * detail rather than try to figure out what column(s) the + * expression includes and if the user has SELECT rights on them. + */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(relid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + return NULL; + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s) = (", + pg_get_partkeydef_columns(relid, true)); + + for (i = 0; i < partnatts; i++) + { + char *val; + int vallen; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(get_partition_col_typid(key, i), + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendStringInfoString(&buf, val); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} |