Diffstat (limited to 'src/backend/executor')
-rw-r--r--  src/backend/executor/execPartition.c   | 241
-rw-r--r--  src/backend/executor/nodeModifyTable.c | 583
2 files changed, 680 insertions, 144 deletions
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 8c0d2df63c7..89b7bb4c608 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -54,7 +54,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
     List       *leaf_parts;
     ListCell   *cell;
     int         i;
-    ResultRelInfo *leaf_part_rri;
+    ResultRelInfo *leaf_part_arr = NULL,
+               *update_rri = NULL;
+    int         num_update_rri = 0,
+                update_rri_index = 0;
+    bool        is_update = false;
     PartitionTupleRouting *proute;
 
     /*
@@ -69,10 +73,38 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
     proute->num_partitions = list_length(leaf_parts);
     proute->partitions = (ResultRelInfo **) palloc(proute->num_partitions *
                                                    sizeof(ResultRelInfo *));
-    proute->partition_tupconv_maps =
+    proute->parent_child_tupconv_maps =
         (TupleConversionMap **) palloc0(proute->num_partitions *
                                         sizeof(TupleConversionMap *));
 
+    /* Set up details specific to the type of tuple routing we are doing. */
+    if (mtstate && mtstate->operation == CMD_UPDATE)
+    {
+        ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+
+        is_update = true;
+        update_rri = mtstate->resultRelInfo;
+        num_update_rri = list_length(node->plans);
+        proute->subplan_partition_offsets =
+            palloc(num_update_rri * sizeof(int));
+
+        /*
+         * We need an additional tuple slot for storing transient tuples that
+         * are converted to the root table descriptor.
+         */
+        proute->root_tuple_slot = MakeTupleTableSlot();
+    }
+    else
+    {
+        /*
+         * Since we are inserting tuples, we need to create all new result
+         * rels.  Avoid repeated pallocs by allocating memory for all the
+         * result rels in bulk.
+         */
+        leaf_part_arr = (ResultRelInfo *) palloc0(proute->num_partitions *
+                                                  sizeof(ResultRelInfo));
+    }
+
     /*
      * Initialize an empty slot that will be used to manipulate tuples of any
      * given partition's rowtype.  It is attached to the caller-specified node
@@ -81,38 +113,86 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
      */
     proute->partition_tuple_slot = MakeTupleTableSlot();
 
-    leaf_part_rri = (ResultRelInfo *) palloc0(proute->num_partitions *
-                                              sizeof(ResultRelInfo));
     i = 0;
     foreach(cell, leaf_parts)
     {
-        Relation    partrel;
+        ResultRelInfo *leaf_part_rri;
+        Relation    partrel = NULL;
         TupleDesc   part_tupdesc;
+        Oid         leaf_oid = lfirst_oid(cell);
+
+        if (is_update)
+        {
+            /*
+             * If the leaf partition is already present in the per-subplan
+             * result rels, we re-use that rather than initialize a new result
+             * rel.  The per-subplan resultrels and the resultrels of the leaf
+             * partitions are both in the same canonical order.  So while going
+             * through the leaf partition oids, we need to keep track of the
+             * next per-subplan result rel to be looked for in the leaf
+             * partition resultrels.
+             */
+            if (update_rri_index < num_update_rri &&
+                RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
+            {
+                leaf_part_rri = &update_rri[update_rri_index];
+                partrel = leaf_part_rri->ri_RelationDesc;
+
+                /*
+                 * This is required in order to convert the partition's
+                 * tuple to be compatible with the root partitioned table's
+                 * tuple descriptor.  When generating the per-subplan result
+                 * rels, this was not set.
+                 */
+                leaf_part_rri->ri_PartitionRoot = rel;
+
+                /* Remember the subplan offset for this ResultRelInfo */
+                proute->subplan_partition_offsets[update_rri_index] = i;
+
+                update_rri_index++;
+            }
+            else
+                leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo));
+        }
+        else
+        {
+            /* For INSERTs, we already have an array of result rels allocated */
+            leaf_part_rri = &leaf_part_arr[i];
+        }
 
         /*
-         * We locked all the partitions above including the leaf partitions.
-         * Note that each of the relations in proute->partitions are
-         * eventually closed by the caller.
+         * If we didn't open the partition rel, it means we haven't
+         * initialized the result rel either.
          */
-        partrel = heap_open(lfirst_oid(cell), NoLock);
+        if (!partrel)
+        {
+            /*
+             * We locked all the partitions above including the leaf
+             * partitions.  Note that each of the newly opened relations in
+             * proute->partitions are eventually closed by the caller.
+             */
+            partrel = heap_open(leaf_oid, NoLock);
+            InitResultRelInfo(leaf_part_rri,
+                              partrel,
+                              resultRTindex,
+                              rel,
+                              estate->es_instrument);
+        }
+
         part_tupdesc = RelationGetDescr(partrel);
 
         /*
         * Save a tuple conversion map to convert a tuple routed to this
         * partition from the parent's type to the partition's.
         */
-        proute->partition_tupconv_maps[i] =
+        proute->parent_child_tupconv_maps[i] =
            convert_tuples_by_name(tupDesc, part_tupdesc,
                                   gettext_noop("could not convert row type"));
 
-        InitResultRelInfo(leaf_part_rri,
-                          partrel,
-                          resultRTindex,
-                          rel,
-                          estate->es_instrument);
-
         /*
-         * Verify result relation is a valid target for INSERT.
+         * Verify result relation is a valid target for an INSERT.  An UPDATE
+         * of the partition key becomes a DELETE+INSERT operation, so this
+         * check is still required when the operation is CMD_UPDATE.
          */
         CheckValidResultRel(leaf_part_rri, CMD_INSERT);
 
@@ -132,10 +212,16 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
         estate->es_leaf_result_relations =
             lappend(estate->es_leaf_result_relations,
                     leaf_part_rri);
 
-        proute->partitions[i] = leaf_part_rri++;
+        proute->partitions[i] = leaf_part_rri;
         i++;
     }
 
+    /*
+     * For UPDATE, we should have found all the per-subplan resultrels in the
+     * leaf partitions.
+     */
+    Assert(!is_update || update_rri_index == num_update_rri);
+
     return proute;
 }
 
@@ -259,15 +345,111 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 }
 
 /*
+ * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition
+ * child-to-root tuple conversion map array.
+ *
+ * This map is required for capturing transition tuples when the target table
+ * is a partitioned table.  For a tuple that is routed by an INSERT or UPDATE,
+ * we need to convert it from the leaf partition to the target table
+ * descriptor.
+ */
+void
+ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute)
+{
+    Assert(proute != NULL);
+
+    /*
+     * These array elements get filled with maps on demand; initially just
+     * set all of them to NULL.
+     */
+    proute->child_parent_tupconv_maps =
+        (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) *
+                                        proute->num_partitions);
+
+    /* The same applies to this array; all of its values start out false. */
+    proute->child_parent_map_not_required =
+        (bool *) palloc0(sizeof(bool) * proute->num_partitions);
+}
+
+/*
+ * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf
+ * partition index.
+ */
+TupleConversionMap *
+TupConvMapForLeaf(PartitionTupleRouting *proute,
+                  ResultRelInfo *rootRelInfo, int leaf_index)
+{
+    ResultRelInfo **resultRelInfos = proute->partitions;
+    TupleConversionMap **map;
+    TupleDesc   tupdesc;
+
+    /* Don't call this if we're not supposed to be using this type of map. */
+    Assert(proute->child_parent_tupconv_maps != NULL);
+
+    /* If it's already known that we don't need a map, return NULL. */
+    if (proute->child_parent_map_not_required[leaf_index])
+        return NULL;
+
+    /* If we've already got a map, return it. */
+    map = &proute->child_parent_tupconv_maps[leaf_index];
+    if (*map != NULL)
+        return *map;
+
+    /* No map yet; try to create one. */
+    tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc);
+    *map =
+        convert_tuples_by_name(tupdesc,
+                               RelationGetDescr(rootRelInfo->ri_RelationDesc),
+                               gettext_noop("could not convert row type"));
+
+    /* If it turns out no map is needed, remember for next time. */
+    proute->child_parent_map_not_required[leaf_index] = (*map == NULL);
+
+    return *map;
+}
+
+/*
+ * ConvertPartitionTupleSlot -- convenience function for tuple conversion.
+ * The tuple, if converted, is stored in new_slot, and *p_my_slot is
+ * updated to point to it.  new_slot typically should be one of the
+ * dedicated partition tuple slots.  If map is NULL, *p_my_slot is not changed.
+ *
+ * Returns the converted tuple, unless map is NULL, in which case the
+ * original tuple is returned unmodified.
+ */
+HeapTuple
+ConvertPartitionTupleSlot(TupleConversionMap *map,
+                          HeapTuple tuple,
+                          TupleTableSlot *new_slot,
+                          TupleTableSlot **p_my_slot)
+{
+    if (!map)
+        return tuple;
+
+    tuple = do_convert_tuple(tuple, map);
+
+    /*
+     * Change the partition tuple slot descriptor, as per converted tuple.
+     */
+    *p_my_slot = new_slot;
+    Assert(new_slot != NULL);
+    ExecSetSlotDescriptor(new_slot, map->outdesc);
+    ExecStoreTuple(tuple, new_slot, InvalidBuffer, true);
+
+    return tuple;
+}
+
+/*
  * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
  * routing.
  *
  * Close all the partitioned tables, leaf partitions, and their indices.
  */
 void
-ExecCleanupTupleRouting(PartitionTupleRouting * proute)
+ExecCleanupTupleRouting(PartitionTupleRouting *proute)
 {
     int         i;
+    int         subplan_index = 0;
 
     /*
      * Remember, proute->partition_dispatch_info[0] corresponds to the root
@@ -288,11 +470,30 @@ ExecCleanupTupleRouting(PartitionTupleRouting * proute)
     {
         ResultRelInfo *resultRelInfo = proute->partitions[i];
 
+        /*
+         * If this result rel is one of the UPDATE subplan result rels, let
+         * ExecEndPlan() close it.  For INSERT or COPY,
+         * proute->subplan_partition_offsets will always be NULL.  Note that
+         * the subplan_partition_offsets array and the partitions array have
+         * the partitions in the same order.  So, while we iterate over the
+         * partitions array, we also iterate over the
+         * subplan_partition_offsets array in order to figure out which of
+         * the result rels are present in the UPDATE subplans.
+         */
+        if (proute->subplan_partition_offsets &&
+            proute->subplan_partition_offsets[subplan_index] == i)
+        {
+            subplan_index++;
+            continue;
+        }
+
         ExecCloseIndices(resultRelInfo);
         heap_close(resultRelInfo->ri_RelationDesc, NoLock);
     }
 
-    /* Release the standalone partition tuple descriptor, if any */
+    /* Release the standalone partition tuple descriptors, if any */
+    if (proute->root_tuple_slot)
+        ExecDropSingleTupleTableSlot(proute->root_tuple_slot);
     if (proute->partition_tuple_slot)
         ExecDropSingleTupleTableSlot(proute->partition_tuple_slot);
 }
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index c5eca1bb74c..6c2f8d4ec03 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -62,6 +62,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate,
                      EState *estate,
                      bool canSetTag,
                      TupleTableSlot **returning);
+static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
+static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate);
+static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
+static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
+                           int whichplan);
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -265,6 +270,7 @@ ExecInsert(ModifyTableState *mtstate,
     Oid         newId;
     List       *recheckIndexes = NIL;
     TupleTableSlot *result = NULL;
+    TransitionCaptureState *ar_insert_trig_tcs;
 
     /*
      * get the heap tuple out of the tuple table slot, making sure we have a
@@ -282,7 +288,6 @@ ExecInsert(ModifyTableState *mtstate,
     {
         int         leaf_part_index;
         PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-        TupleConversionMap *map;
 
         /*
          * Away we go ... If we end up not finding a partition after all,
@@ -331,8 +336,10 @@ ExecInsert(ModifyTableState *mtstate,
                  * back to tuplestore format.
                  */
                 mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+
                 mtstate->mt_transition_capture->tcs_map =
-                    mtstate->mt_transition_tupconv_maps[leaf_part_index];
+                    TupConvMapForLeaf(proute, saved_resultRelInfo,
+                                      leaf_part_index);
             }
             else
             {
@@ -345,30 +352,20 @@ ExecInsert(ModifyTableState *mtstate,
             }
         }
         if (mtstate->mt_oc_transition_capture != NULL)
+        {
             mtstate->mt_oc_transition_capture->tcs_map =
-                mtstate->mt_transition_tupconv_maps[leaf_part_index];
+                TupConvMapForLeaf(proute, saved_resultRelInfo,
+                                  leaf_part_index);
+        }
 
         /*
          * We might need to convert from the parent rowtype to the partition
          * rowtype.
          */
-        map = proute->partition_tupconv_maps[leaf_part_index];
-        if (map)
-        {
-            Relation    partrel = resultRelInfo->ri_RelationDesc;
-
-            tuple = do_convert_tuple(tuple, map);
-
-            /*
-             * We must use the partition's tuple descriptor from this point
-             * on, until we're finished dealing with the partition.  Use the
-             * dedicated slot for that.
-             */
-            slot = proute->partition_tuple_slot;
-            Assert(slot != NULL);
-            ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
-            ExecStoreTuple(tuple, slot, InvalidBuffer, true);
-        }
+        tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index],
+                                          tuple,
+                                          proute->partition_tuple_slot,
+                                          &slot);
     }
 
     resultRelationDesc = resultRelInfo->ri_RelationDesc;
@@ -449,6 +446,8 @@ ExecInsert(ModifyTableState *mtstate,
     }
     else
     {
+        WCOKind     wco_kind;
+
         /*
         * We always check the partition constraint, including when the tuple
         * got here via tuple-routing.
         * However we don't need to in the latter
@@ -466,14 +465,23 @@ ExecInsert(ModifyTableState *mtstate,
         tuple->t_tableOid = RelationGetRelid(resultRelationDesc);
 
         /*
-         * Check any RLS INSERT WITH CHECK policies
+         * Check any RLS WITH CHECK policies.
          *
+         * Normally we should check INSERT policies.  But if the insert is the
+         * result of a partition key update that moved the tuple to a new
+         * partition, we should instead check UPDATE policies, because we are
+         * executing policies defined on the target table, and not those
+         * defined on the child partitions.
+         */
+        wco_kind = (mtstate->operation == CMD_UPDATE) ?
+            WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK;
+
+        /*
         * ExecWithCheckOptions() will skip any WCOs which are not of the kind
         * we are looking for at this point.
         */
         if (resultRelInfo->ri_WithCheckOptions != NIL)
-            ExecWithCheckOptions(WCO_RLS_INSERT_CHECK,
-                                 resultRelInfo, slot, estate);
+            ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate);
 
         /*
          * No need though if the tuple has been routed, and a BR trigger
@@ -622,9 +630,32 @@ ExecInsert(ModifyTableState *mtstate,
             setLastTid(&(tuple->t_self));
     }
 
+    /*
+     * If this insert is the result of a partition key update that moved the
+     * tuple to a new partition, put this row into the transition NEW TABLE,
+     * if there is one.  We need to do this separately for DELETE and INSERT
+     * because they happen on different tables.
+     */
+    ar_insert_trig_tcs = mtstate->mt_transition_capture;
+    if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+        && mtstate->mt_transition_capture->tcs_update_new_table)
+    {
+        ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+                             NULL,
+                             tuple,
+                             NULL,
+                             mtstate->mt_transition_capture);
+
+        /*
+         * We've already captured the NEW TABLE row, so make sure any AR
+         * INSERT trigger fired below doesn't capture it again.
+         */
+        ar_insert_trig_tcs = NULL;
+    }
+
     /* AFTER ROW INSERT Triggers */
     ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes,
-                         mtstate->mt_transition_capture);
+                         ar_insert_trig_tcs);
 
     list_free(recheckIndexes);
 
@@ -678,6 +709,8 @@ ExecDelete(ModifyTableState *mtstate,
            TupleTableSlot *planSlot,
            EPQState *epqstate,
            EState *estate,
+           bool *tupleDeleted,
+           bool processReturning,
            bool canSetTag)
 {
     ResultRelInfo *resultRelInfo;
@@ -685,6 +718,10 @@ ExecDelete(ModifyTableState *mtstate,
     HTSU_Result result;
     HeapUpdateFailureData hufd;
     TupleTableSlot *slot = NULL;
+    TransitionCaptureState *ar_delete_trig_tcs;
+
+    if (tupleDeleted)
+        *tupleDeleted = false;
 
     /*
      * get information on the (current) result relation
@@ -849,12 +886,40 @@ ldelete:;
     if (canSetTag)
         (estate->es_processed)++;
 
+    /* Tell caller that the delete actually happened. */
+    if (tupleDeleted)
+        *tupleDeleted = true;
+
+    /*
+     * If this delete is the result of a partition key update that moved the
+     * tuple to a new partition, put this row into the transition OLD TABLE,
+     * if there is one.  We need to do this separately for DELETE and INSERT
+     * because they happen on different tables.
+     */
+    ar_delete_trig_tcs = mtstate->mt_transition_capture;
+    if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+        && mtstate->mt_transition_capture->tcs_update_old_table)
+    {
+        ExecARUpdateTriggers(estate, resultRelInfo,
+                             tupleid,
+                             oldtuple,
+                             NULL,
+                             NULL,
+                             mtstate->mt_transition_capture);
+
+        /*
+         * We've already captured the OLD TABLE row, so make sure any AR
+         * DELETE trigger fired below doesn't capture it again.
+         */
+        ar_delete_trig_tcs = NULL;
+    }
+
     /* AFTER ROW DELETE Triggers */
     ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
-                         mtstate->mt_transition_capture);
+                         ar_delete_trig_tcs);
 
-    /* Process RETURNING if present */
-    if (resultRelInfo->ri_projectReturning)
+    /* Process RETURNING if present and if requested */
+    if (processReturning && resultRelInfo->ri_projectReturning)
     {
         /*
          * We have to put the target tuple into a slot, which means first we
@@ -947,6 +1012,7 @@ ExecUpdate(ModifyTableState *mtstate,
     HTSU_Result result;
     HeapUpdateFailureData hufd;
     List       *recheckIndexes = NIL;
+    TupleConversionMap *saved_tcs_map = NULL;
 
     /*
      * abort the operation if not running transactions
@@ -1018,6 +1084,7 @@ ExecUpdate(ModifyTableState *mtstate,
     else
     {
         LockTupleMode lockmode;
+        bool        partition_constraint_failed;
 
         /*
          * Constraints might reference the tableoid column, so initialize
@@ -1033,22 +1100,142 @@ ExecUpdate(ModifyTableState *mtstate,
         * (We don't need to redo triggers, however.  If there are any BEFORE
         * triggers then trigger.c will have done heap_lock_tuple to lock the
         * correct tuple, so there's no need to do them again.)
-        *
-        * ExecWithCheckOptions() will skip any WCOs which are not of the kind
-        * we are looking for at this point.
         */
 lreplace:;
-        if (resultRelInfo->ri_WithCheckOptions != NIL)
+
+        /*
+         * If the partition constraint fails, this row might get moved to
+         * another partition, in which case we should check the RLS CHECK
+         * policy just before inserting into the new partition, rather than
+         * doing it here.  This is because a trigger on that partition might
+         * again change the row.  So skip the WCO checks if the partition
+         * constraint fails.
+         */
+        partition_constraint_failed =
+            resultRelInfo->ri_PartitionCheck &&
+            !ExecPartitionCheck(resultRelInfo, slot, estate);
+
+        if (!partition_constraint_failed &&
+            resultRelInfo->ri_WithCheckOptions != NIL)
+        {
+            /*
+             * ExecWithCheckOptions() will skip any WCOs which are not of the
+             * kind we are looking for at this point.
+             */
             ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK,
                                  resultRelInfo, slot, estate);
+        }
+
+        /*
+         * If a partition check failed, try to move the row into the right
+         * partition.
+         */
+        if (partition_constraint_failed)
+        {
+            bool        tuple_deleted;
+            TupleTableSlot *ret_slot;
+            PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+            int         map_index;
+            TupleConversionMap *tupconv_map;
+
+            /*
+             * When an UPDATE is run on a leaf partition, we will not have
+             * partition tuple routing set up.  In that case, fail with a
+             * partition constraint violation error.
+             */
+            if (proute == NULL)
+                ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+            /*
+             * Row movement, part 1.  Delete the tuple, but skip RETURNING
+             * processing.  We want to return rows from the INSERT.
+             */
+            ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate,
+                       &tuple_deleted, false, false);
+
+            /*
+             * If for some reason the DELETE didn't happen (e.g. a trigger
+             * prevented it, or it was already deleted by self, or it was
+             * concurrently deleted by another transaction), then we should
+             * skip the insert as well; otherwise, an UPDATE could cause an
+             * increase in the total number of rows across all partitions,
+             * which is clearly wrong.
+             *
+             * For a normal UPDATE, the case where the tuple has been the
+             * subject of a concurrent UPDATE or DELETE would be handled by
+             * the EvalPlanQual machinery, but for an UPDATE that we've
+             * translated into a DELETE from this partition and an INSERT into
+             * some other partition, that's not available, because CTID chains
+             * can't span relation boundaries.  We mimic the semantics to a
+             * limited extent by skipping the INSERT if the DELETE fails to
+             * find a tuple.  This ensures that two concurrent attempts to
+             * UPDATE the same tuple at the same time can't turn one tuple
+             * into two, and that an UPDATE of a just-deleted tuple can't
+             * resurrect it.
+             */
+            if (!tuple_deleted)
+                return NULL;
+
+            /*
+             * Updates set the transition capture map only when a new subplan
+             * is chosen.  But for inserts, it is set for each row.  So after
+             * INSERT, we need to revert back to the map created for UPDATE;
+             * otherwise the next UPDATE will incorrectly use the one created
+             * for INSERT.  So first save the one created for UPDATE.
+             */
+            if (mtstate->mt_transition_capture)
+                saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
+
+            /*
+             * resultRelInfo is one of the per-subplan resultRelInfos.  So we
+             * should convert the tuple into root's tuple descriptor, since
+             * ExecInsert() starts the search from root.  The tuple conversion
+             * map list is in the order of mtstate->resultRelInfo[], so to
+             * retrieve the one for this resultRel, we need to know the
+             * position of the resultRel in mtstate->resultRelInfo[].
+             */
+            map_index = resultRelInfo - mtstate->resultRelInfo;
+            Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
+            tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
+            tuple = ConvertPartitionTupleSlot(tupconv_map,
+                                              tuple,
+                                              proute->root_tuple_slot,
+                                              &slot);
+
+            /*
+             * For ExecInsert(), make it look like we are inserting into the
+             * root.
+             */
+            Assert(mtstate->rootResultRelInfo != NULL);
+            estate->es_result_relation_info = mtstate->rootResultRelInfo;
+
+            ret_slot = ExecInsert(mtstate, slot, planSlot, NULL,
+                                  ONCONFLICT_NONE, estate, canSetTag);
+
+            /*
+             * Revert back the active result relation and the active
+             * transition capture map that we changed above.
+             */
+            estate->es_result_relation_info = resultRelInfo;
+            if (mtstate->mt_transition_capture)
+            {
+                mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+                mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+            }
+            return ret_slot;
+        }
 
         /*
         * Check the constraints of the tuple.  Note that we pass the same
         * slot for the orig_slot argument, because unlike ExecInsert(), no
         * tuple-routing is performed here, hence the slot remains unchanged.
+        * We've already checked the partition constraint above; however, we
+        * must still ensure the tuple passes all other constraints, so we
+        * will call ExecConstraints() and have it validate all remaining
+        * checks.
         */
-        if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck)
-            ExecConstraints(resultRelInfo, slot, estate, true);
+        if (resultRelationDesc->rd_att->constr)
+            ExecConstraints(resultRelInfo, slot, estate, false);
 
         /*
          * replace the heap tuple
@@ -1418,17 +1605,20 @@ fireBSTriggers(ModifyTableState *node)
 }
 
 /*
- * Return the ResultRelInfo for which we will fire AFTER STATEMENT triggers.
- * This is also the relation into whose tuple format all captured transition
- * tuples must be converted.
+ * Return the target rel ResultRelInfo.
+ *
+ * This relation is the same as:
+ * - the relation for which we will fire AFTER STATEMENT triggers.
+ * - the relation into whose tuple format all captured transition tuples must
+ *   be converted.
+ * - the root partitioned table.
  */
 static ResultRelInfo *
-getASTriggerResultRelInfo(ModifyTableState *node)
+getTargetResultRelInfo(ModifyTableState *node)
 {
     /*
-     * If the node modifies a partitioned table, we must fire its triggers.
-     * Note that in that case, node->resultRelInfo points to the first leaf
-     * partition, not the root table.
+     * Note that if the node modifies a partitioned table, node->resultRelInfo
+     * points to the first leaf partition, not the root table.
      */
     if (node->rootResultRelInfo != NULL)
         return node->rootResultRelInfo;
@@ -1442,7 +1632,7 @@ getASTriggerResultRelInfo(ModifyTableState *node)
 static void
 fireASTriggers(ModifyTableState *node)
 {
-    ResultRelInfo *resultRelInfo = getASTriggerResultRelInfo(node);
+    ResultRelInfo *resultRelInfo = getTargetResultRelInfo(node);
 
     switch (node->operation)
     {
@@ -1475,8 +1665,7 @@ fireASTriggers(ModifyTableState *node)
 static void
 ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
 {
-    ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate);
-    int         i;
+    ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate);
 
     /* Check for transition tables on the directly targeted relation. */
     mtstate->mt_transition_capture =
@@ -1499,62 +1688,141 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
     if (mtstate->mt_transition_capture != NULL ||
         mtstate->mt_oc_transition_capture != NULL)
     {
-        int         numResultRelInfos;
-        PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-
-        numResultRelInfos = (proute != NULL ?
-                             proute->num_partitions :
-                             mtstate->mt_nplans);
+        ExecSetupChildParentMapForTcs(mtstate);
 
         /*
-         * Build array of conversion maps from each child's TupleDesc to the
-         * one used in the tuplestore.  The map pointers may be NULL when no
-         * conversion is necessary, which is hopefully a common case for
-         * partitions.
+         * Install the conversion map for the first plan for UPDATE and DELETE
+         * operations.  It will be advanced each time we switch to the next
+         * plan.  (INSERT operations set it every time, so we need not update
+         * mtstate->mt_oc_transition_capture here.)
         */
-        mtstate->mt_transition_tupconv_maps = (TupleConversionMap **)
-            palloc0(sizeof(TupleConversionMap *) * numResultRelInfos);
+        if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT)
+            mtstate->mt_transition_capture->tcs_map =
+                tupconv_map_for_subplan(mtstate, 0);
+    }
+}
 
-        /* Choose the right set of partitions */
-        if (proute != NULL)
-        {
-            /*
-             * For tuple routing among partitions, we need TupleDescs based on
-             * the partition routing table.
-             */
-            ResultRelInfo **resultRelInfos = proute->partitions;
+/*
+ * Initialize the child-to-root tuple conversion map array for UPDATE subplans.
+ *
+ * This map array is required to convert the tuple from the subplan result rel
+ * to the target table descriptor.  This requirement arises for two independent
+ * scenarios:
+ * 1. For update-tuple-routing.
+ * 2. For capturing tuples in transition tables.
+ */
+void
+ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
+{
+    ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate);
+    ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
+    TupleDesc   outdesc;
+    int         numResultRelInfos = mtstate->mt_nplans;
+    int         i;
 
-            for (i = 0; i < numResultRelInfos; ++i)
-            {
-                mtstate->mt_transition_tupconv_maps[i] =
-                    convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc),
-                                           RelationGetDescr(targetRelInfo->ri_RelationDesc),
-                                           gettext_noop("could not convert row type"));
-            }
-        }
-        else
-        {
-            /* Otherwise we need the ResultRelInfo for each subplan. */
-            ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
+    /*
+     * First check if there is already a per-subplan array allocated.  Even if
+     * there is already a per-leaf map array, we won't require a per-subplan
+     * one, since we will use the subplan offset array to convert the subplan
+     * index to a per-leaf index.
+     */
+    if (mtstate->mt_per_subplan_tupconv_maps ||
+        (mtstate->mt_partition_tuple_routing &&
+         mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps))
+        return;
 
-            for (i = 0; i < numResultRelInfos; ++i)
-            {
-                mtstate->mt_transition_tupconv_maps[i] =
-                    convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc),
-                                           RelationGetDescr(targetRelInfo->ri_RelationDesc),
-                                           gettext_noop("could not convert row type"));
-            }
-        }
+    /*
+     * Build array of conversion maps from each child's TupleDesc to the one
+     * used in the target relation.  The map pointers may be NULL when no
+     * conversion is necessary, which is hopefully a common case.
+     */
+
+    /* Get tuple descriptor of the target rel. */
+    outdesc = RelationGetDescr(targetRelInfo->ri_RelationDesc);
+
+    mtstate->mt_per_subplan_tupconv_maps = (TupleConversionMap **)
+        palloc(sizeof(TupleConversionMap *) * numResultRelInfos);
+
+    for (i = 0; i < numResultRelInfos; ++i)
+    {
+        mtstate->mt_per_subplan_tupconv_maps[i] =
+            convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc),
+                                   outdesc,
+                                   gettext_noop("could not convert row type"));
+    }
+}
+
+/*
+ * Initialize the child-to-root tuple conversion map array required for
+ * capturing transition tuples.
+ *
+ * The map array can be indexed either by subplan index or by leaf-partition
+ * index.  For transition tables, we need subplan-indexed access to the map,
+ * and where tuple routing is present, we also require leaf-indexed access.
+ */
+static void
+ExecSetupChildParentMapForTcs(ModifyTableState *mtstate)
+{
+    PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+
+    /*
+     * If partition tuple routing is set up, we will require partition-indexed
+     * access.  In that case, create the map array indexed by partition; we
+     * will still be able to access the maps using a subplan index by
+     * converting the subplan index to a partition index using
+     * subplan_partition_offsets.  If tuple routing is not set up, it means we
+     * don't require partition-indexed access.  In that case, create just a
+     * subplan-indexed map.
+     */
+    if (proute)
+    {
+        /*
+         * If a partition-indexed map array is to be created, the subplan map
+         * array has to be NULL.  If the subplan map array is already created,
+         * we won't be able to access the map using a partition index.
*/ - if (mtstate->mt_transition_capture) - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; + Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); + + ExecSetupChildParentMapForLeaf(proute); + } + else + ExecSetupChildParentMapForSubplan(mtstate); +} + +/* + * For a given subplan index, get the tuple conversion map. + */ +static TupleConversionMap * +tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) +{ + /* + * If a partition-index tuple conversion map array is allocated, we need + * to first get the index into the partition array. Exactly *one* of the + * two arrays is allocated. This is because if there is a partition array + * required, we don't require subplan-indexed array since we can translate + * subplan index into partition index. And, we create a subplan-indexed + * array *only* if partition-indexed array is not required. + */ + if (mtstate->mt_per_subplan_tupconv_maps == NULL) + { + int leaf_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If subplan-indexed array is NULL, things should have been arranged + * to convert the subplan index to partition index. + */ + Assert(proute && proute->subplan_partition_offsets != NULL); + + leaf_index = proute->subplan_partition_offsets[whichplan]; + + return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), + leaf_index); + } + else + { + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } } @@ -1661,15 +1929,13 @@ ExecModifyTable(PlanState *pstate) /* Prepare to convert transition tuples from this child. */ if (node->mt_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } if (node->mt_oc_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_oc_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } continue; } @@ -1786,7 +2052,8 @@ ExecModifyTable(PlanState *pstate) break; case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); break; default: elog(ERROR, "unknown operation"); @@ -1830,9 +2097,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *saved_resultRelInfo; ResultRelInfo *resultRelInfo; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->partColsUpdated; PartitionTupleRouting *proute = NULL; int num_partitions = 0; @@ -1907,6 +2177,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, the + * trigger itself might modify the partition-key values. So arrange + * for tuple routing. 
+     */
+    if (resultRelInfo->ri_TrigDesc &&
+        resultRelInfo->ri_TrigDesc->trig_update_before_row &&
+        operation == CMD_UPDATE)
+        update_tuple_routing_needed = true;
+
     /* Now init the plan for this result rel */
     estate->es_result_relation_info = resultRelInfo;
     mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags);
@@ -1931,16 +2211,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
     estate->es_result_relation_info = saved_resultRelInfo;
 
-    /* Build state for INSERT tuple routing */
-    rel = mtstate->resultRelInfo->ri_RelationDesc;
-    if (operation == CMD_INSERT &&
-        rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+    /* Get the target relation */
+    rel = (getTargetResultRelInfo(mtstate))->ri_RelationDesc;
+
+    /*
+     * If it's not a partitioned table after all, UPDATE tuple routing should
+     * not be attempted.
+     */
+    if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+        update_tuple_routing_needed = false;
+
+    /*
+     * Build state for tuple routing if it's an INSERT or if it's an UPDATE
+     * of the partition key.
+     */
+    if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
+        (operation == CMD_INSERT || update_tuple_routing_needed))
     {
         proute = mtstate->mt_partition_tuple_routing =
             ExecSetupPartitionTupleRouting(mtstate, rel,
                                            node->nominalRelation,
                                            estate);
         num_partitions = proute->num_partitions;
+
+        /*
+         * These are required as reference objects for mapping partition
+         * attno's in expressions such as WithCheckOptions and RETURNING.
+         */
+        firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
+        firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
     }
 
     /*
@@ -1951,6 +2250,17 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
         ExecSetupTransitionCaptureState(mtstate, estate);
 
     /*
+     * Construct mapping from each of the per-subplan partition attnos to the
+     * root attno.  This is required when, during update row movement, the
+     * tuple descriptor of a source partition does not match the root
+     * partitioned table descriptor.  In such a case we need to convert
+     * tuples to the root tuple descriptor, because the search for the
+     * destination partition starts from the root.  Skip this setup if it's
+     * not a partition key update.
+     */
+    if (update_tuple_routing_needed)
+        ExecSetupChildParentMapForSubplan(mtstate);
+
+    /*
      * Initialize any WITH CHECK OPTION constraints if needed.
      */
     resultRelInfo = mtstate->resultRelInfo;
@@ -1980,26 +2290,29 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
     * Build WITH CHECK OPTION constraints for each leaf partition rel.  Note
     * that we didn't build the withCheckOptionList for each partition within
     * the planner, but simple translation of the varattnos for each partition
-    * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
-    * cases are handled above.
+    * will suffice.  This only occurs for the INSERT case or for UPDATE row
+    * movement.  DELETEs and local UPDATEs are handled above.
     */
     if (node->withCheckOptionLists != NIL && num_partitions > 0)
     {
-        List       *wcoList;
-        PlanState  *plan;
+        List       *first_wcoList;
 
         /*
         * In case of INSERT on partitioned tables, there is only one plan.
        * Likewise, there is only one WITH CHECK OPTIONS list, not one per
-        * partition.  We make a copy of the WCO qual for each partition; note
-        * that, if there are SubPlans in there, they all end up attached to
-        * the one parent Plan node.
+        * partition.  Whereas for UPDATE, there are as many WCOs as there are
+        * plans.
+        * So in either case, use the WCO expression of the first
+        * resultRelInfo as a reference to calculate attno's for the WCO
+        * expression of each of the partitions.  We make a copy of the WCO
+        * qual for each partition.  Note that, if there are SubPlans in
+        * there, they all end up attached to the one parent Plan node.
         */
-        Assert(operation == CMD_INSERT &&
-               list_length(node->withCheckOptionLists) == 1 &&
-               mtstate->mt_nplans == 1);
-        wcoList = linitial(node->withCheckOptionLists);
-        plan = mtstate->mt_plans[0];
+        Assert(update_tuple_routing_needed ||
+               (operation == CMD_INSERT &&
+                list_length(node->withCheckOptionLists) == 1 &&
+                mtstate->mt_nplans == 1));
+
+        first_wcoList = linitial(node->withCheckOptionLists);
         for (i = 0; i < num_partitions; i++)
         {
             Relation    partrel;
@@ -2008,17 +2321,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
             ListCell   *ll;
 
             resultRelInfo = proute->partitions[i];
+
+            /*
+             * If we are referring to a resultRelInfo from one of the update
+             * result rels, that result rel would already have
+             * WithCheckOptions initialized.
+             */
+            if (resultRelInfo->ri_WithCheckOptions)
+                continue;
+
             partrel = resultRelInfo->ri_RelationDesc;
 
-            /* varno = node->nominalRelation */
-            mapped_wcoList = map_partition_varattnos(wcoList,
-                                                     node->nominalRelation,
-                                                     partrel, rel, NULL);
+            mapped_wcoList = map_partition_varattnos(first_wcoList,
+                                                     firstVarno,
+                                                     partrel, firstResultRel,
+                                                     NULL);
             foreach(ll, mapped_wcoList)
             {
                 WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
                 ExprState  *wcoExpr = ExecInitQual(castNode(List, wco->qual),
-                                                   plan);
+                                                   &mtstate->ps);
 
                 wcoExprs = lappend(wcoExprs, wcoExpr);
             }
@@ -2035,7 +2357,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
     {
         TupleTableSlot *slot;
         ExprContext *econtext;
-        List       *returningList;
+        List       *firstReturningList;
 
         /*
          * Initialize result tuple slot and assign its rowtype using the first
@@ -2071,22 +2393,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
         * Build a projection for each leaf partition rel.  Note that we
         * didn't build the returningList for each partition within the
         * planner, but simple translation of the varattnos for each partition
-        * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
-        * are handled above.
+        * will suffice.  This only occurs for the INSERT case or for UPDATE
+        * row movement.  DELETEs and local UPDATEs are handled above.
         */
-        returningList = linitial(node->returningLists);
+        firstReturningList = linitial(node->returningLists);
         for (i = 0; i < num_partitions; i++)
         {
             Relation    partrel;
             List       *rlist;
 
             resultRelInfo = proute->partitions[i];
+
+            /*
+             * If we are referring to a resultRelInfo from one of the update
+             * result rels, that result rel would already have a
+             * returningList built.
+             */
+            if (resultRelInfo->ri_projectReturning)
+                continue;
+
             partrel = resultRelInfo->ri_RelationDesc;
 
-            /* varno = node->nominalRelation */
-            rlist = map_partition_varattnos(returningList,
-                                            node->nominalRelation,
-                                            partrel, rel, NULL);
+            /*
+             * Use the returning expression of the first resultRelInfo as a
+             * reference to calculate attno's for the returning expression of
+             * each of the partitions.
+             */
+            rlist = map_partition_varattnos(firstReturningList,
+                                            firstVarno,
+                                            partrel, firstResultRel, NULL);
             resultRelInfo->ri_projectReturning =
                 ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
                                         resultRelInfo->ri_RelationDesc->rd_att);
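
The per-subplan reuse in ExecSetupPartitionTupleRouting depends on the per-subplan result rels appearing in the same canonical OID order as the leaf-partition list, so a single forward-only cursor is enough to pair them up. A minimal standalone sketch of that merge, using illustrative names rather than PostgreSQL source:

    #include <assert.h>

    typedef unsigned int Oid;

    /*
     * subplan_oids is a subsequence of leaf_oids, both sorted in the same
     * canonical order.  offsets[k] receives the leaf index of subplan k.
     */
    static void
    match_subplans_to_leaves(const Oid *leaf_oids, int n_leaves,
                             const Oid *subplan_oids, int n_subplans,
                             int *offsets)
    {
        int next_subplan = 0;   /* cursor over the per-subplan result rels */

        for (int i = 0; i < n_leaves; i++)
        {
            if (next_subplan < n_subplans &&
                subplan_oids[next_subplan] == leaf_oids[i])
            {
                offsets[next_subplan] = i;  /* reuse this result rel */
                next_subplan++;
            }
            /* else: a fresh result rel would be initialized for leaf i */
        }

        /* Every subplan result rel must be found among the leaves. */
        assert(next_subplan == n_subplans);
    }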
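
TupConvMapForLeaf keeps two parallel arrays because convert_tuples_by_name legitimately returns NULL when no conversion is needed, so a bare NULL entry could not be told apart from "not built yet". The child_parent_map_not_required array turns the cache into a tri-state. A reduced sketch of the pattern, with hypothetical names (build_map stands in for convert_tuples_by_name):

    #include <stdbool.h>

    typedef struct Map Map;        /* stands in for TupleConversionMap */

    extern Map *build_map(int i);  /* may return NULL if descriptors match */

    typedef struct
    {
        Map  **maps;               /* zero-initialized: all NULL */
        bool  *not_required;       /* zero-initialized: all false */
    } MapCache;

    static Map *
    get_map(MapCache *c, int i)
    {
        if (c->not_required[i])    /* known: no conversion needed */
            return NULL;
        if (c->maps[i] != NULL)    /* known: conversion needed, map cached */
            return c->maps[i];

        c->maps[i] = build_map(i); /* first access: build and classify */
        c->not_required[i] = (c->maps[i] == NULL);
        return c->maps[i];
    }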
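
The row-movement branch in ExecUpdate computes map_index = resultRelInfo - mtstate->resultRelInfo. This works because the per-subplan ResultRelInfos live in one contiguous array, and subtracting the array base from an element pointer yields the element index, not a byte offset. A self-contained illustration:

    #include <stddef.h>
    #include <stdio.h>

    typedef struct { int dummy; } ResultRelInfo;

    int
    main(void)
    {
        ResultRelInfo rels[4];
        ResultRelInfo *current = &rels[2];   /* the active subplan's rel */

        ptrdiff_t idx = current - rels;      /* element index, not bytes */
        printf("map index = %td\n", idx);    /* prints 2 */
        return 0;
    }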
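
A worked example of how subplan_partition_offsets ties the two indexing schemes together, assuming a hypothetical table with four leaf partitions and an UPDATE whose plan covers only two of them:

    /*
     * Leaf partitions (canonical order):  p0  p1  p2  p3   (leaf indexes 0..3)
     * UPDATE subplans (same order):           p1      p3   (subplan indexes 0..1)
     *
     * subplan_partition_offsets = {1, 3}
     *
     * tupconv_map_for_subplan(mtstate, 0) -> TupConvMapForLeaf(proute, ..., 1)
     * tupconv_map_for_subplan(mtstate, 1) -> TupConvMapForLeaf(proute, ..., 3)
     *
     * ExecCleanupTupleRouting walks the same pairing: leaves 1 and 3 are left
     * for ExecEndPlan() to close, while leaves 0 and 2 are closed by the
     * routing cleanup itself.
     */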
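
As for why map_partition_varattnos is needed when translating the WCO and RETURNING expressions: a leaf partition can number its attributes differently from the relation the expressions were built against, for instance when the partition was attached after the parent had dropped a column. A hypothetical layout showing the mismatch:

    /*
     * Root table:      a (attno 1), b (attno 2), ..dropped.. , c (attno 4)
     * Leaf partition:  a (attno 1), b (attno 2), c (attno 3)
     *
     * A WCO or RETURNING Var referencing c carries attno 4 when compiled
     * against the root/first result rel; for this leaf it must be rewritten
     * to attno 3, which is the translation map_partition_varattnos performs.
     */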