diff options
author | Robert Haas <rhaas@postgresql.org> | 2018-01-19 15:33:06 -0500 |
---|---|---|
committer | Robert Haas <rhaas@postgresql.org> | 2018-01-19 15:33:06 -0500 |
commit | 2f178441044be430f6b4d626e4dae68a9a6f6cec (patch) | |
tree | 131e7547b169b2bc42f638f1ca1b50ae6e146b70 /src/backend/executor/nodeModifyTable.c | |
parent | 7f17fd6fc7125b41218bc99ccfa8165e2d730cd9 (diff) | |
download | postgresql-2f178441044be430f6b4d626e4dae68a9a6f6cec.tar.gz postgresql-2f178441044be430f6b4d626e4dae68a9a6f6cec.zip |
Allow UPDATE to move rows between partitions.
When an UPDATE causes a row to no longer match the partition
constraint, try to move it to a different partition where it does
match the partition constraint. In essence, the UPDATE is split into
a DELETE from the old partition and an INSERT into the new one. This
can lead to surprising behavior in concurrency scenarios because
EvalPlanQual rechecks won't work as they normally did; the known
problems are documented. (There is a pending patch to improve the
situation further, but it needs more review.)
Amit Khandekar, reviewed and tested by Amit Langote, David Rowley,
Rajkumar Raghuwanshi, Dilip Kumar, Amul Sul, Thomas Munro, Álvaro
Herrera, Amit Kapila, and me. A few final revisions by me.
Discussion: http://postgr.es/m/CAJ3gD9do9o2ccQ7j7+tSgiE1REY65XRiMb=yJO3u3QhyP8EEPQ@mail.gmail.com
Diffstat (limited to 'src/backend/executor/nodeModifyTable.c')
-rw-r--r-- | src/backend/executor/nodeModifyTable.c | 583 |
1 files changed, 459 insertions, 124 deletions
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index c5eca1bb74c..6c2f8d4ec03 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -62,6 +62,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate, EState *estate, bool canSetTag, TupleTableSlot **returning); +static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); +static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); +static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); +static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node, + int whichplan); /* * Verify that the tuples to be produced by INSERT or UPDATE match the @@ -265,6 +270,7 @@ ExecInsert(ModifyTableState *mtstate, Oid newId; List *recheckIndexes = NIL; TupleTableSlot *result = NULL; + TransitionCaptureState *ar_insert_trig_tcs; /* * get the heap tuple out of the tuple table slot, making sure we have a @@ -282,7 +288,6 @@ ExecInsert(ModifyTableState *mtstate, { int leaf_part_index; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - TupleConversionMap *map; /* * Away we go ... If we end up not finding a partition after all, @@ -331,8 +336,10 @@ ExecInsert(ModifyTableState *mtstate, * back to tuplestore format. */ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + TupConvMapForLeaf(proute, saved_resultRelInfo, + leaf_part_index); } else { @@ -345,30 +352,20 @@ ExecInsert(ModifyTableState *mtstate, } } if (mtstate->mt_oc_transition_capture != NULL) + { mtstate->mt_oc_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + TupConvMapForLeaf(proute, saved_resultRelInfo, + leaf_part_index); + } /* * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = proute->partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map); - - /* - * We must use the partition's tuple descriptor from this point - * on, until we're finished dealing with the partition. Use the - * dedicated slot for that. - */ - slot = proute->partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple, + proute->partition_tuple_slot, + &slot); } resultRelationDesc = resultRelInfo->ri_RelationDesc; @@ -449,6 +446,8 @@ ExecInsert(ModifyTableState *mtstate, } else { + WCOKind wco_kind; + /* * We always check the partition constraint, including when the tuple * got here via tuple-routing. However we don't need to in the latter @@ -466,14 +465,23 @@ ExecInsert(ModifyTableState *mtstate, tuple->t_tableOid = RelationGetRelid(resultRelationDesc); /* - * Check any RLS INSERT WITH CHECK policies + * Check any RLS WITH CHECK policies. * + * Normally we should check INSERT policies. But if the insert is the + * result of a partition key update that moved the tuple to a new + * partition, we should instead check UPDATE policies, because we are + * executing policies defined on the target table, and not those + * defined on the child partitions. + */ + wco_kind = (mtstate->operation == CMD_UPDATE) ? + WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK; + + /* * ExecWithCheckOptions() will skip any WCOs which are not of the kind * we are looking for at this point. */ if (resultRelInfo->ri_WithCheckOptions != NIL) - ExecWithCheckOptions(WCO_RLS_INSERT_CHECK, - resultRelInfo, slot, estate); + ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); /* * No need though if the tuple has been routed, and a BR trigger @@ -622,9 +630,32 @@ ExecInsert(ModifyTableState *mtstate, setLastTid(&(tuple->t_self)); } + /* + * If this insert is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition NEW TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_insert_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_new_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + tuple, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * INSERT trigger fired below doesn't capture it again. + */ + ar_insert_trig_tcs = NULL; + } + /* AFTER ROW INSERT Triggers */ ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes, - mtstate->mt_transition_capture); + ar_insert_trig_tcs); list_free(recheckIndexes); @@ -678,6 +709,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *tupleDeleted, + bool processReturning, bool canSetTag) { ResultRelInfo *resultRelInfo; @@ -685,6 +718,10 @@ ExecDelete(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; TupleTableSlot *slot = NULL; + TransitionCaptureState *ar_delete_trig_tcs; + + if (tupleDeleted) + *tupleDeleted = false; /* * get information on the (current) result relation @@ -849,12 +886,40 @@ ldelete:; if (canSetTag) (estate->es_processed)++; + /* Tell caller that the delete actually happened. */ + if (tupleDeleted) + *tupleDeleted = true; + + /* + * If this delete is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition OLD TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_delete_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_old_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, + tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * DELETE trigger fired below doesn't capture it again. + */ + ar_delete_trig_tcs = NULL; + } + /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, - mtstate->mt_transition_capture); + ar_delete_trig_tcs); - /* Process RETURNING if present */ - if (resultRelInfo->ri_projectReturning) + /* Process RETURNING if present and if requested */ + if (processReturning && resultRelInfo->ri_projectReturning) { /* * We have to put the target tuple into a slot, which means first we @@ -947,6 +1012,7 @@ ExecUpdate(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + TupleConversionMap *saved_tcs_map = NULL; /* * abort the operation if not running transactions @@ -1018,6 +1084,7 @@ ExecUpdate(ModifyTableState *mtstate, else { LockTupleMode lockmode; + bool partition_constraint_failed; /* * Constraints might reference the tableoid column, so initialize @@ -1033,22 +1100,142 @@ ExecUpdate(ModifyTableState *mtstate, * (We don't need to redo triggers, however. If there are any BEFORE * triggers then trigger.c will have done heap_lock_tuple to lock the * correct tuple, so there's no need to do them again.) - * - * ExecWithCheckOptions() will skip any WCOs which are not of the kind - * we are looking for at this point. */ lreplace:; - if (resultRelInfo->ri_WithCheckOptions != NIL) + + /* + * If partition constraint fails, this row might get moved to another + * partition, in which case we should check the RLS CHECK policy just + * before inserting into the new partition, rather than doing it here. + * This is because a trigger on that partition might again change the + * row. So skip the WCO checks if the partition constraint fails. + */ + partition_constraint_failed = + resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate); + + if (!partition_constraint_failed && + resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the + * kind we are looking for at this point. + */ ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK, resultRelInfo, slot, estate); + } + + /* + * If a partition check failed, try to move the row into the right + * partition. + */ + if (partition_constraint_failed) + { + bool tuple_deleted; + TupleTableSlot *ret_slot; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + int map_index; + TupleConversionMap *tupconv_map; + + /* + * When an UPDATE is run on a leaf partition, we will not have + * partition tuple routing set up. In that case, fail with + * partition constraint violation error. + */ + if (proute == NULL) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* + * Row movement, part 1. Delete the tuple, but skip RETURNING + * processing. We want to return rows from INSERT. + */ + ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + &tuple_deleted, false, false); + + /* + * For some reason if DELETE didn't happen (e.g. trigger prevented + * it, or it was already deleted by self, or it was concurrently + * deleted by another transaction), then we should skip the insert + * as well; otherwise, an UPDATE could cause an increase in the + * total number of rows across all partitions, which is clearly + * wrong. + * + * For a normal UPDATE, the case where the tuple has been the + * subject of a concurrent UPDATE or DELETE would be handled by + * the EvalPlanQual machinery, but for an UPDATE that we've + * translated into a DELETE from this partition and an INSERT into + * some other partition, that's not available, because CTID chains + * can't span relation boundaries. We mimic the semantics to a + * limited extent by skipping the INSERT if the DELETE fails to + * find a tuple. This ensures that two concurrent attempts to + * UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't + * resurrect it. + */ + if (!tuple_deleted) + return NULL; + + /* + * Updates set the transition capture map only when a new subplan + * is chosen. But for inserts, it is set for each row. So after + * INSERT, we need to revert back to the map created for UPDATE; + * otherwise the next UPDATE will incorrectly use the one created + * for INSERT. So first save the one created for UPDATE. + */ + if (mtstate->mt_transition_capture) + saved_tcs_map = mtstate->mt_transition_capture->tcs_map; + + /* + * resultRelInfo is one of the per-subplan resultRelInfos. So we + * should convert the tuple into root's tuple descriptor, since + * ExecInsert() starts the search from root. The tuple conversion + * map list is in the order of mtstate->resultRelInfo[], so to + * retrieve the one for this resultRel, we need to know the + * position of the resultRel in mtstate->resultRelInfo[]. + */ + map_index = resultRelInfo - mtstate->resultRelInfo; + Assert(map_index >= 0 && map_index < mtstate->mt_nplans); + tupconv_map = tupconv_map_for_subplan(mtstate, map_index); + tuple = ConvertPartitionTupleSlot(tupconv_map, + tuple, + proute->root_tuple_slot, + &slot); + + + /* + * For ExecInsert(), make it look like we are inserting into the + * root. + */ + Assert(mtstate->rootResultRelInfo != NULL); + estate->es_result_relation_info = mtstate->rootResultRelInfo; + + ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, + ONCONFLICT_NONE, estate, canSetTag); + + /* + * Revert back the active result relation and the active + * transition capture map that we changed above. + */ + estate->es_result_relation_info = resultRelInfo; + if (mtstate->mt_transition_capture) + { + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = saved_tcs_map; + } + return ret_slot; + } /* * Check the constraints of the tuple. Note that we pass the same * slot for the orig_slot argument, because unlike ExecInsert(), no * tuple-routing is performed here, hence the slot remains unchanged. + * We've already checked the partition constraint above; however, we + * must still ensure the tuple passes all other constraints, so we + * will call ExecConstraints() and have it validate all remaining + * checks. */ - if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate, true); + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate, false); /* * replace the heap tuple @@ -1418,17 +1605,20 @@ fireBSTriggers(ModifyTableState *node) } /* - * Return the ResultRelInfo for which we will fire AFTER STATEMENT triggers. - * This is also the relation into whose tuple format all captured transition - * tuples must be converted. + * Return the target rel ResultRelInfo. + * + * This relation is the same as : + * - the relation for which we will fire AFTER STATEMENT triggers. + * - the relation into whose tuple format all captured transition tuples must + * be converted. + * - the root partitioned table. */ static ResultRelInfo * -getASTriggerResultRelInfo(ModifyTableState *node) +getTargetResultRelInfo(ModifyTableState *node) { /* - * If the node modifies a partitioned table, we must fire its triggers. - * Note that in that case, node->resultRelInfo points to the first leaf - * partition, not the root table. + * Note that if the node modifies a partitioned table, node->resultRelInfo + * points to the first leaf partition, not the root table. */ if (node->rootResultRelInfo != NULL) return node->rootResultRelInfo; @@ -1442,7 +1632,7 @@ getASTriggerResultRelInfo(ModifyTableState *node) static void fireASTriggers(ModifyTableState *node) { - ResultRelInfo *resultRelInfo = getASTriggerResultRelInfo(node); + ResultRelInfo *resultRelInfo = getTargetResultRelInfo(node); switch (node->operation) { @@ -1475,8 +1665,7 @@ fireASTriggers(ModifyTableState *node) static void ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) { - ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate); - int i; + ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate); /* Check for transition tables on the directly targeted relation. */ mtstate->mt_transition_capture = @@ -1499,62 +1688,141 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) if (mtstate->mt_transition_capture != NULL || mtstate->mt_oc_transition_capture != NULL) { - int numResultRelInfos; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - numResultRelInfos = (proute != NULL ? - proute->num_partitions : - mtstate->mt_nplans); + ExecSetupChildParentMapForTcs(mtstate); /* - * Build array of conversion maps from each child's TupleDesc to the - * one used in the tuplestore. The map pointers may be NULL when no - * conversion is necessary, which is hopefully a common case for - * partitions. + * Install the conversion map for the first plan for UPDATE and DELETE + * operations. It will be advanced each time we switch to the next + * plan. (INSERT operations set it every time, so we need not update + * mtstate->mt_oc_transition_capture here.) */ - mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT) + mtstate->mt_transition_capture->tcs_map = + tupconv_map_for_subplan(mtstate, 0); + } +} - /* Choose the right set of partitions */ - if (proute != NULL) - { - /* - * For tuple routing among partitions, we need TupleDescs based on - * the partition routing table. - */ - ResultRelInfo **resultRelInfos = proute->partitions; +/* + * Initialize the child-to-root tuple conversion map array for UPDATE subplans. + * + * This map array is required to convert the tuple from the subplan result rel + * to the target table descriptor. This requirement arises for two independent + * scenarios: + * 1. For update-tuple-routing. + * 2. For capturing tuples in transition tables. + */ +void +ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) +{ + ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate); + ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + TupleDesc outdesc; + int numResultRelInfos = mtstate->mt_nplans; + int i; - for (i = 0; i < numResultRelInfos; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); - } - } - else - { - /* Otherwise we need the ResultRelInfo for each subplan. */ - ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + /* + * First check if there is already a per-subplan array allocated. Even if + * there is already a per-leaf map array, we won't require a per-subplan + * one, since we will use the subplan offset array to convert the subplan + * index to per-leaf index. + */ + if (mtstate->mt_per_subplan_tupconv_maps || + (mtstate->mt_partition_tuple_routing && + mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps)) + return; - for (i = 0; i < numResultRelInfos; ++i) - { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); - } - } + /* + * Build array of conversion maps from each child's TupleDesc to the one + * used in the target relation. The map pointers may be NULL when no + * conversion is necessary, which is hopefully a common case. + */ + /* Get tuple descriptor of the target rel. */ + outdesc = RelationGetDescr(targetRelInfo->ri_RelationDesc); + + mtstate->mt_per_subplan_tupconv_maps = (TupleConversionMap **) + palloc(sizeof(TupleConversionMap *) * numResultRelInfos); + + for (i = 0; i < numResultRelInfos; ++i) + { + mtstate->mt_per_subplan_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), + outdesc, + gettext_noop("could not convert row type")); + } +} + +/* + * Initialize the child-to-root tuple conversion map array required for + * capturing transition tuples. + * + * The map array can be indexed either by subplan index or by leaf-partition + * index. For transition tables, we need a subplan-indexed access to the map, + * and where tuple-routing is present, we also require a leaf-indexed access. + */ +static void +ExecSetupChildParentMapForTcs(ModifyTableState *mtstate) +{ + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If partition tuple routing is set up, we will require partition-indexed + * access. In that case, create the map array indexed by partition; we + * will still be able to access the maps using a subplan index by + * converting the subplan index to a partition index using + * subplan_partition_offsets. If tuple routing is not set up, it means we + * don't require partition-indexed access. In that case, create just a + * subplan-indexed map. + */ + if (proute) + { /* - * Install the conversion map for the first plan for UPDATE and DELETE - * operations. It will be advanced each time we switch to the next - * plan. (INSERT operations set it every time, so we need not update - * mtstate->mt_oc_transition_capture here.) + * If a partition-indexed map array is to be created, the subplan map + * array has to be NULL. If the subplan map array is already created, + * we won't be able to access the map using a partition index. */ - if (mtstate->mt_transition_capture) - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; + Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); + + ExecSetupChildParentMapForLeaf(proute); + } + else + ExecSetupChildParentMapForSubplan(mtstate); +} + +/* + * For a given subplan index, get the tuple conversion map. + */ +static TupleConversionMap * +tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) +{ + /* + * If a partition-index tuple conversion map array is allocated, we need + * to first get the index into the partition array. Exactly *one* of the + * two arrays is allocated. This is because if there is a partition array + * required, we don't require subplan-indexed array since we can translate + * subplan index into partition index. And, we create a subplan-indexed + * array *only* if partition-indexed array is not required. + */ + if (mtstate->mt_per_subplan_tupconv_maps == NULL) + { + int leaf_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If subplan-indexed array is NULL, things should have been arranged + * to convert the subplan index to partition index. + */ + Assert(proute && proute->subplan_partition_offsets != NULL); + + leaf_index = proute->subplan_partition_offsets[whichplan]; + + return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), + leaf_index); + } + else + { + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } } @@ -1661,15 +1929,13 @@ ExecModifyTable(PlanState *pstate) /* Prepare to convert transition tuples from this child. */ if (node->mt_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } if (node->mt_oc_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_oc_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } continue; } @@ -1786,7 +2052,8 @@ ExecModifyTable(PlanState *pstate) break; case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); break; default: elog(ERROR, "unknown operation"); @@ -1830,9 +2097,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *saved_resultRelInfo; ResultRelInfo *resultRelInfo; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->partColsUpdated; PartitionTupleRouting *proute = NULL; int num_partitions = 0; @@ -1907,6 +2177,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, the + * trigger itself might modify the partition-key values. So arrange + * for tuple routing. + */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row && + operation == CMD_UPDATE) + update_tuple_routing_needed = true; + /* Now init the plan for this result rel */ estate->es_result_relation_info = resultRelInfo; mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags); @@ -1931,16 +2211,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; - /* Build state for INSERT tuple routing */ - rel = mtstate->resultRelInfo->ri_RelationDesc; - if (operation == CMD_INSERT && - rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + /* Get the target relation */ + rel = (getTargetResultRelInfo(mtstate))->ri_RelationDesc; + + /* + * If it's not a partitioned table after all, UPDATE tuple routing should + * not be attempted. + */ + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + update_tuple_routing_needed = false; + + /* + * Build state for tuple routing if it's an INSERT or if it's an UPDATE of + * partition key. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + (operation == CMD_INSERT || update_tuple_routing_needed)) { proute = mtstate->mt_partition_tuple_routing = ExecSetupPartitionTupleRouting(mtstate, rel, node->nominalRelation, estate); num_partitions = proute->num_partitions; + + /* + * Below are required as reference objects for mapping partition + * attno's in expressions such as WithCheckOptions and RETURNING. + */ + firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; } /* @@ -1951,6 +2250,17 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ExecSetupTransitionCaptureState(mtstate, estate); /* + * Construct mapping from each of the per-subplan partition attnos to the + * root attno. This is required when during update row movement the tuple + * descriptor of a source partition does not match the root partitioned + * table descriptor. In such a case we need to convert tuples to the root + * tuple descriptor, because the search for destination partition starts + * from the root. Skip this setup if it's not a partition key update. + */ + if (update_tuple_routing_needed) + ExecSetupChildParentMapForSubplan(mtstate); + + /* * Initialize any WITH CHECK OPTION constraints if needed. */ resultRelInfo = mtstate->resultRelInfo; @@ -1980,26 +2290,29 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build WITH CHECK OPTION constraints for each leaf partition rel. Note * that we didn't build the withCheckOptionList for each partition within * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * cases are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE row + * movement. DELETEs and local UPDATEs are handled above. */ if (node->withCheckOptionLists != NIL && num_partitions > 0) { - List *wcoList; - PlanState *plan; + List *first_wcoList; /* * In case of INSERT on partitioned tables, there is only one plan. * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. We make a copy of the WCO qual for each partition; note - * that, if there are SubPlans in there, they all end up attached to - * the one parent Plan node. + * partition. Whereas for UPDATE, there are as many WCOs as there are + * plans. So in either case, use the WCO expression of the first + * resultRelInfo as a reference to calculate attno's for the WCO + * expression of each of the partitions. We make a copy of the WCO + * qual for each partition. Note that, if there are SubPlans in there, + * they all end up attached to the one parent Plan node. */ - Assert(operation == CMD_INSERT && - list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1); - wcoList = linitial(node->withCheckOptionLists); - plan = mtstate->mt_plans[0]; + Assert(update_tuple_routing_needed || + (operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + mtstate->mt_nplans == 1)); + + first_wcoList = linitial(node->withCheckOptionLists); for (i = 0; i < num_partitions; i++) { Relation partrel; @@ -2008,17 +2321,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ListCell *ll; resultRelInfo = proute->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have + * WithCheckOptions initialized. + */ + if (resultRelInfo->ri_WithCheckOptions) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - mapped_wcoList = map_partition_varattnos(wcoList, - node->nominalRelation, - partrel, rel, NULL); + mapped_wcoList = map_partition_varattnos(first_wcoList, + firstVarno, + partrel, firstResultRel, + NULL); foreach(ll, mapped_wcoList) { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - plan); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } @@ -2035,7 +2357,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *returningList; + List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -2071,22 +2393,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build a projection for each leaf partition rel. Note that we * didn't build the returningList for each partition within the * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE + * row movement. DELETEs and local UPDATEs are handled above. */ - returningList = linitial(node->returningLists); + firstReturningList = linitial(node->returningLists); for (i = 0; i < num_partitions; i++) { Relation partrel; List *rlist; resultRelInfo = proute->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have a returningList + * built. + */ + if (resultRelInfo->ri_projectReturning) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - rlist = map_partition_varattnos(returningList, - node->nominalRelation, - partrel, rel, NULL); + /* + * Use the returning expression of the first resultRelInfo as a + * reference to calculate attno's for the returning expression of + * each of the partitions. + */ + rlist = map_partition_varattnos(firstReturningList, + firstVarno, + partrel, firstResultRel, NULL); resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); |