Diffstat (limited to 'src/backend/executor')
-rw-r--r--  src/backend/executor/execPartition.c   | 241
-rw-r--r--  src/backend/executor/nodeModifyTable.c | 583
2 files changed, 680 insertions, 144 deletions
diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c
index 8c0d2df63c7..89b7bb4c608 100644
--- a/src/backend/executor/execPartition.c
+++ b/src/backend/executor/execPartition.c
@@ -54,7 +54,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
     List       *leaf_parts;
     ListCell   *cell;
     int         i;
-    ResultRelInfo *leaf_part_rri;
+    ResultRelInfo *leaf_part_arr = NULL,
+               *update_rri = NULL;
+    int         num_update_rri = 0,
+                update_rri_index = 0;
+    bool        is_update = false;
     PartitionTupleRouting *proute;
 
     /*
@@ -69,10 +73,38 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
     proute->num_partitions = list_length(leaf_parts);
     proute->partitions = (ResultRelInfo **) palloc(proute->num_partitions *
                                                    sizeof(ResultRelInfo *));
-    proute->partition_tupconv_maps =
+    proute->parent_child_tupconv_maps =
         (TupleConversionMap **) palloc0(proute->num_partitions *
                                         sizeof(TupleConversionMap *));
 
+    /* Set up details specific to the type of tuple routing we are doing. */
+    if (mtstate && mtstate->operation == CMD_UPDATE)
+    {
+        ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+
+        is_update = true;
+        update_rri = mtstate->resultRelInfo;
+        num_update_rri = list_length(node->plans);
+        proute->subplan_partition_offsets =
+            palloc(num_update_rri * sizeof(int));
+
+        /*
+         * We need an additional tuple slot for storing transient tuples that
+         * are converted to the root table descriptor.
+         */
+        proute->root_tuple_slot = MakeTupleTableSlot();
+    }
+    else
+    {
+        /*
+         * Since we are inserting tuples, we need to create all new result
+         * rels.  Avoid repeated pallocs by allocating memory for all the
+         * result rels in bulk.
+         */
+        leaf_part_arr = (ResultRelInfo *) palloc0(proute->num_partitions *
+                                                  sizeof(ResultRelInfo));
+    }
+
     /*
      * Initialize an empty slot that will be used to manipulate tuples of any
      * given partition's rowtype.  It is attached to the caller-specified node
@@ -81,38 +113,86 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
      */
     proute->partition_tuple_slot = MakeTupleTableSlot();
 
-    leaf_part_rri = (ResultRelInfo *) palloc0(proute->num_partitions *
-                                              sizeof(ResultRelInfo));
     i = 0;
     foreach(cell, leaf_parts)
     {
-        Relation    partrel;
+        ResultRelInfo *leaf_part_rri;
+        Relation    partrel = NULL;
         TupleDesc   part_tupdesc;
+        Oid         leaf_oid = lfirst_oid(cell);
+
+        if (is_update)
+        {
+            /*
+             * If the leaf partition is already present in the per-subplan
+             * result rels, we re-use that rather than initialize a new result
+             * rel.  The per-subplan resultrels and the resultrels of the leaf
+             * partitions are both in the same canonical order.  So while going
+             * through the leaf partition oids, we need to keep track of the
+             * next per-subplan result rel to be looked for in the leaf
+             * partition resultrels.
+             */
+            if (update_rri_index < num_update_rri &&
+                RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid)
+            {
+                leaf_part_rri = &update_rri[update_rri_index];
+                partrel = leaf_part_rri->ri_RelationDesc;
+
+                /*
+                 * This is required in order to convert the partition's
+                 * tuple to be compatible with the root partitioned table's
+                 * tuple descriptor.  When generating the per-subplan result
+                 * rels, this was not set.
+                 */
+                leaf_part_rri->ri_PartitionRoot = rel;
+
+                /* Remember the subplan offset for this ResultRelInfo */
+                proute->subplan_partition_offsets[update_rri_index] = i;
+
+                update_rri_index++;
+            }
+            else
+                leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo));
+        }
+        else
+        {
+            /* For INSERTs, we already have an array of result rels allocated */
+            leaf_part_rri = &leaf_part_arr[i];
+        }
 
         /*
-         * We locked all the partitions above including the leaf partitions.
-         * Note that each of the relations in proute->partitions are
-         * eventually closed by the caller.
+         * If we didn't open the partition rel, it means we haven't
+         * initialized the result rel either.
          */
-        partrel = heap_open(lfirst_oid(cell), NoLock);
+        if (!partrel)
+        {
+            /*
+             * We locked all the partitions above including the leaf
+             * partitions.  Note that each of the newly opened relations in
+             * proute->partitions are eventually closed by the caller.
+             */
+            partrel = heap_open(leaf_oid, NoLock);
+            InitResultRelInfo(leaf_part_rri,
+                              partrel,
+                              resultRTindex,
+                              rel,
+                              estate->es_instrument);
+        }
+
         part_tupdesc = RelationGetDescr(partrel);
 
         /*
         * Save a tuple conversion map to convert a tuple routed to this
         * partition from the parent's type to the partition's.
         */
-        proute->partition_tupconv_maps[i] =
+        proute->parent_child_tupconv_maps[i] =
            convert_tuples_by_name(tupDesc, part_tupdesc,
                                   gettext_noop("could not convert row type"));
 
-        InitResultRelInfo(leaf_part_rri,
-                          partrel,
-                          resultRTindex,
-                          rel,
-                          estate->es_instrument);
-
         /*
-         * Verify result relation is a valid target for INSERT.
+         * Verify result relation is a valid target for an INSERT.  An UPDATE
+         * of the partition key becomes a DELETE+INSERT operation, so this
+         * check is still required when the operation is CMD_UPDATE.
          */
         CheckValidResultRel(leaf_part_rri, CMD_INSERT);
 
@@ -132,10 +212,16 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate,
         estate->es_leaf_result_relations =
             lappend(estate->es_leaf_result_relations,
                     leaf_part_rri);
 
-        proute->partitions[i] = leaf_part_rri++;
+        proute->partitions[i] = leaf_part_rri;
         i++;
     }
 
+    /*
+     * For UPDATE, we should have found all the per-subplan resultrels in the
+     * leaf partitions.
+     */
+    Assert(!is_update || update_rri_index == num_update_rri);
+
     return proute;
 }
 
@@ -259,15 +345,111 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd,
 }
 
 /*
+ * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition
+ * child-to-root tuple conversion map array.
+ *
+ * This map is required for capturing transition tuples when the target table
+ * is a partitioned table.  For a tuple that is routed by an INSERT or UPDATE,
+ * we need to convert it from the leaf partition to the target table
+ * descriptor.
+ */
+void
+ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute)
+{
+    Assert(proute != NULL);
+
+    /*
+     * These array elements get filled with maps on demand; initially just
+     * set all of them to NULL.
+     */
+    proute->child_parent_tupconv_maps =
+        (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) *
+                                        proute->num_partitions);
+
+    /* The same applies to this array; all of its values start out false. */
+    proute->child_parent_map_not_required =
+        (bool *) palloc0(sizeof(bool) * proute->num_partitions);
+}
+
+/*
+ * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf
+ * partition index.
+ */
+TupleConversionMap *
+TupConvMapForLeaf(PartitionTupleRouting *proute,
+                  ResultRelInfo *rootRelInfo, int leaf_index)
+{
+    ResultRelInfo **resultRelInfos = proute->partitions;
+    TupleConversionMap **map;
+    TupleDesc   tupdesc;
+
+    /* Don't call this if we're not supposed to be using this type of map. */
+    Assert(proute->child_parent_tupconv_maps != NULL);
+
+    /* If it's already known that we don't need a map, return NULL. */
+    if (proute->child_parent_map_not_required[leaf_index])
+        return NULL;
+
+    /* If we've already got a map, return it. */
+    map = &proute->child_parent_tupconv_maps[leaf_index];
+    if (*map != NULL)
+        return *map;
+
+    /* No map yet; try to create one. */
+    tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc);
+    *map =
+        convert_tuples_by_name(tupdesc,
+                               RelationGetDescr(rootRelInfo->ri_RelationDesc),
+                               gettext_noop("could not convert row type"));
+
+    /* If it turns out no map is needed, remember for next time. */
+    proute->child_parent_map_not_required[leaf_index] = (*map == NULL);
+
+    return *map;
+}
+
+/*
+ * ConvertPartitionTupleSlot -- convenience function for tuple conversion.
+ * The tuple, if converted, is stored in new_slot, and *p_my_slot is
+ * updated to point to it.  new_slot typically should be one of the
+ * dedicated partition tuple slots.  If map is NULL, *p_my_slot is not changed.
+ *
+ * Returns the converted tuple, unless map is NULL, in which case the
+ * original tuple is returned unmodified.
+ */
+HeapTuple
+ConvertPartitionTupleSlot(TupleConversionMap *map,
+                          HeapTuple tuple,
+                          TupleTableSlot *new_slot,
+                          TupleTableSlot **p_my_slot)
+{
+    if (!map)
+        return tuple;
+
+    tuple = do_convert_tuple(tuple, map);
+
+    /*
+     * Change the partition tuple slot descriptor, as per converted tuple.
+     */
+    *p_my_slot = new_slot;
+    Assert(new_slot != NULL);
+    ExecSetSlotDescriptor(new_slot, map->outdesc);
+    ExecStoreTuple(tuple, new_slot, InvalidBuffer, true);
+
+    return tuple;
+}
+
+/*
  * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple
  * routing.
  *
  * Close all the partitioned tables, leaf partitions, and their indices.
  */
 void
-ExecCleanupTupleRouting(PartitionTupleRouting * proute)
+ExecCleanupTupleRouting(PartitionTupleRouting *proute)
 {
     int         i;
+    int         subplan_index = 0;
 
     /*
      * Remember, proute->partition_dispatch_info[0] corresponds to the root
@@ -288,11 +470,30 @@ ExecCleanupTupleRouting(PartitionTupleRouting * proute)
     {
         ResultRelInfo *resultRelInfo = proute->partitions[i];
 
+        /*
+         * If this result rel is one of the UPDATE subplan result rels, let
+         * ExecEndPlan() close it.  For INSERT or COPY,
+         * proute->subplan_partition_offsets will always be NULL.  Note that
+         * the subplan_partition_offsets array and the partitions array have
+         * the partitions in the same order.  So, while we iterate over the
+         * partitions array, we also iterate over the
+         * subplan_partition_offsets array in order to figure out which of
+         * the result rels are present in the UPDATE subplans.
+         */
+        if (proute->subplan_partition_offsets &&
+            proute->subplan_partition_offsets[subplan_index] == i)
+        {
+            subplan_index++;
+            continue;
+        }
+
         ExecCloseIndices(resultRelInfo);
         heap_close(resultRelInfo->ri_RelationDesc, NoLock);
     }
 
-    /* Release the standalone partition tuple descriptor, if any */
+    /* Release the standalone partition tuple descriptors, if any */
+    if (proute->root_tuple_slot)
+        ExecDropSingleTupleTableSlot(proute->root_tuple_slot);
     if (proute->partition_tuple_slot)
         ExecDropSingleTupleTableSlot(proute->partition_tuple_slot);
 }
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index c5eca1bb74c..6c2f8d4ec03 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -62,6 +62,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate,
                      EState *estate,
                      bool canSetTag,
                      TupleTableSlot **returning);
+static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
+static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate);
+static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
+static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
+                           int whichplan);
 
 /*
  * Verify that the tuples to be produced by INSERT or UPDATE match the
@@ -265,6 +270,7 @@ ExecInsert(ModifyTableState *mtstate,
     Oid         newId;
     List       *recheckIndexes = NIL;
     TupleTableSlot *result = NULL;
+    TransitionCaptureState *ar_insert_trig_tcs;
 
     /*
      * get the heap tuple out of the tuple table slot, making sure we have a
@@ -282,7 +288,6 @@ ExecInsert(ModifyTableState *mtstate,
     {
         int         leaf_part_index;
         PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-        TupleConversionMap *map;
 
         /*
          * Away we go ... If we end up not finding a partition after all,
@@ -331,8 +336,10 @@ ExecInsert(ModifyTableState *mtstate,
                  * back to tuplestore format.
                  */
                 mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+
                 mtstate->mt_transition_capture->tcs_map =
-                    mtstate->mt_transition_tupconv_maps[leaf_part_index];
+                    TupConvMapForLeaf(proute, saved_resultRelInfo,
+                                      leaf_part_index);
             }
             else
             {
@@ -345,30 +352,20 @@ ExecInsert(ModifyTableState *mtstate,
             }
         }
         if (mtstate->mt_oc_transition_capture != NULL)
+        {
             mtstate->mt_oc_transition_capture->tcs_map =
-                mtstate->mt_transition_tupconv_maps[leaf_part_index];
+                TupConvMapForLeaf(proute, saved_resultRelInfo,
+                                  leaf_part_index);
+        }
 
         /*
          * We might need to convert from the parent rowtype to the partition
          * rowtype.
          */
-        map = proute->partition_tupconv_maps[leaf_part_index];
-        if (map)
-        {
-            Relation    partrel = resultRelInfo->ri_RelationDesc;
-
-            tuple = do_convert_tuple(tuple, map);
-
-            /*
-             * We must use the partition's tuple descriptor from this point
-             * on, until we're finished dealing with the partition.  Use the
-             * dedicated slot for that.
-             */
-            slot = proute->partition_tuple_slot;
-            Assert(slot != NULL);
-            ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
-            ExecStoreTuple(tuple, slot, InvalidBuffer, true);
-        }
+        tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index],
+                                          tuple,
+                                          proute->partition_tuple_slot,
+                                          &slot);
     }
 
     resultRelationDesc = resultRelInfo->ri_RelationDesc;
@@ -449,6 +446,8 @@ ExecInsert(ModifyTableState *mtstate,
     }
     else
     {
+        WCOKind     wco_kind;
+
         /*
         * We always check the partition constraint, including when the tuple
         * got here via tuple-routing.
         * However we don't need to in the latter
@@ -466,14 +465,23 @@ ExecInsert(ModifyTableState *mtstate,
         tuple->t_tableOid = RelationGetRelid(resultRelationDesc);
 
         /*
-         * Check any RLS INSERT WITH CHECK policies
+         * Check any RLS WITH CHECK policies.
          *
+         * Normally we should check INSERT policies.  But if the insert is the
+         * result of a partition key update that moved the tuple to a new
+         * partition, we should instead check UPDATE policies, because we are
+         * executing policies defined on the target table, and not those
+         * defined on the child partitions.
+         */
+        wco_kind = (mtstate->operation == CMD_UPDATE) ?
+            WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK;
+
+        /*
         * ExecWithCheckOptions() will skip any WCOs which are not of the kind
         * we are looking for at this point.
         */
         if (resultRelInfo->ri_WithCheckOptions != NIL)
-            ExecWithCheckOptions(WCO_RLS_INSERT_CHECK,
-                                 resultRelInfo, slot, estate);
+            ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate);
 
         /*
          * No need though if the tuple has been routed, and a BR trigger
@@ -622,9 +630,32 @@ ExecInsert(ModifyTableState *mtstate,
             setLastTid(&(tuple->t_self));
     }
 
+    /*
+     * If this insert is the result of a partition key update that moved the
+     * tuple to a new partition, put this row into the transition NEW TABLE,
+     * if there is one.  We need to do this separately for DELETE and INSERT
+     * because they happen on different tables.
+     */
+    ar_insert_trig_tcs = mtstate->mt_transition_capture;
+    if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+        && mtstate->mt_transition_capture->tcs_update_new_table)
+    {
+        ExecARUpdateTriggers(estate, resultRelInfo, NULL,
+                             NULL,
+                             tuple,
+                             NULL,
+                             mtstate->mt_transition_capture);
+
+        /*
+         * We've already captured the NEW TABLE row, so make sure any AR
+         * INSERT trigger fired below doesn't capture it again.
+         */
+        ar_insert_trig_tcs = NULL;
+    }
+
     /* AFTER ROW INSERT Triggers */
     ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes,
-                         mtstate->mt_transition_capture);
+                         ar_insert_trig_tcs);
 
     list_free(recheckIndexes);
 
@@ -678,6 +709,8 @@ ExecDelete(ModifyTableState *mtstate,
            TupleTableSlot *planSlot,
            EPQState *epqstate,
            EState *estate,
+           bool *tupleDeleted,
+           bool processReturning,
            bool canSetTag)
 {
     ResultRelInfo *resultRelInfo;
@@ -685,6 +718,10 @@ ExecDelete(ModifyTableState *mtstate,
     HTSU_Result result;
     HeapUpdateFailureData hufd;
     TupleTableSlot *slot = NULL;
+    TransitionCaptureState *ar_delete_trig_tcs;
+
+    if (tupleDeleted)
+        *tupleDeleted = false;
 
     /*
      * get information on the (current) result relation
@@ -849,12 +886,40 @@ ldelete:;
     if (canSetTag)
         (estate->es_processed)++;
 
+    /* Tell caller that the delete actually happened. */
+    if (tupleDeleted)
+        *tupleDeleted = true;
+
+    /*
+     * If this delete is the result of a partition key update that moved the
+     * tuple to a new partition, put this row into the transition OLD TABLE,
+     * if there is one.  We need to do this separately for DELETE and INSERT
+     * because they happen on different tables.
+     */
+    ar_delete_trig_tcs = mtstate->mt_transition_capture;
+    if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
+        && mtstate->mt_transition_capture->tcs_update_old_table)
+    {
+        ExecARUpdateTriggers(estate, resultRelInfo,
+                             tupleid,
+                             oldtuple,
+                             NULL,
+                             NULL,
+                             mtstate->mt_transition_capture);
+
+        /*
+         * We've already captured the OLD TABLE row, so make sure any AR
+         * DELETE trigger fired below doesn't capture it again.
+         */
+        ar_delete_trig_tcs = NULL;
+    }
+
     /* AFTER ROW DELETE Triggers */
     ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
-                         mtstate->mt_transition_capture);
+                         ar_delete_trig_tcs);
 
-    /* Process RETURNING if present */
-    if (resultRelInfo->ri_projectReturning)
+    /* Process RETURNING if present and if requested */
+    if (processReturning && resultRelInfo->ri_projectReturning)
     {
         /*
          * We have to put the target tuple into a slot, which means first we
@@ -947,6 +1012,7 @@ ExecUpdate(ModifyTableState *mtstate,
     HTSU_Result result;
     HeapUpdateFailureData hufd;
     List       *recheckIndexes = NIL;
+    TupleConversionMap *saved_tcs_map = NULL;
 
     /*
      * abort the operation if not running transactions
@@ -1018,6 +1084,7 @@ ExecUpdate(ModifyTableState *mtstate,
     else
     {
         LockTupleMode lockmode;
+        bool        partition_constraint_failed;
 
         /*
          * Constraints might reference the tableoid column, so initialize
@@ -1033,22 +1100,142 @@ ExecUpdate(ModifyTableState *mtstate,
         * (We don't need to redo triggers, however.  If there are any BEFORE
         * triggers then trigger.c will have done heap_lock_tuple to lock the
         * correct tuple, so there's no need to do them again.)
-        *
-        * ExecWithCheckOptions() will skip any WCOs which are not of the kind
-        * we are looking for at this point.
         */
 lreplace:;
-        if (resultRelInfo->ri_WithCheckOptions != NIL)
+
+        /*
+         * If the partition constraint fails, this row might get moved to
+         * another partition, in which case we should check the RLS CHECK
+         * policy just before inserting into the new partition, rather than
+         * doing it here.  This is because a trigger on that partition might
+         * again change the row.  So skip the WCO checks if the partition
+         * constraint fails.
+         */
+        partition_constraint_failed =
+            resultRelInfo->ri_PartitionCheck &&
+            !ExecPartitionCheck(resultRelInfo, slot, estate);
+
+        if (!partition_constraint_failed &&
+            resultRelInfo->ri_WithCheckOptions != NIL)
+        {
+            /*
+             * ExecWithCheckOptions() will skip any WCOs which are not of the
+             * kind we are looking for at this point.
+             */
             ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK,
                                  resultRelInfo, slot, estate);
+        }
+
+        /*
+         * If a partition check failed, try to move the row into the right
+         * partition.
+         */
+        if (partition_constraint_failed)
+        {
+            bool        tuple_deleted;
+            TupleTableSlot *ret_slot;
+            PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+            int         map_index;
+            TupleConversionMap *tupconv_map;
+
+            /*
+             * When an UPDATE is run on a leaf partition, we will not have
+             * partition tuple routing set up.  In that case, fail with a
+             * partition constraint violation error.
+             */
+            if (proute == NULL)
+                ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+            /*
+             * Row movement, part 1.  Delete the tuple, but skip RETURNING
+             * processing.  We want to return rows from the INSERT.
+             */
+            ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate,
+                       &tuple_deleted, false, false);
+
+            /*
+             * If for some reason the DELETE didn't happen (e.g. a trigger
+             * prevented it, or it was already deleted by self, or it was
+             * concurrently deleted by another transaction), then we should
+             * skip the insert as well; otherwise, an UPDATE could cause an
+             * increase in the total number of rows across all partitions,
+             * which is clearly wrong.
+             *
+             * For a normal UPDATE, the case where the tuple has been the
+             * subject of a concurrent UPDATE or DELETE would be handled by
+             * the EvalPlanQual machinery, but for an UPDATE that we've
+             * translated into a DELETE from this partition and an INSERT into
+             * some other partition, that's not available, because CTID chains
+             * can't span relation boundaries.  We mimic the semantics to a
+             * limited extent by skipping the INSERT if the DELETE fails to
+             * find a tuple.  This ensures that two concurrent attempts to
+             * UPDATE the same tuple at the same time can't turn one tuple
+             * into two, and that an UPDATE of a just-deleted tuple can't
+             * resurrect it.
+             */
+            if (!tuple_deleted)
+                return NULL;
+
+            /*
+             * Updates set the transition capture map only when a new subplan
+             * is chosen.  But for inserts, it is set for each row.  So after
+             * INSERT, we need to revert back to the map created for UPDATE;
+             * otherwise the next UPDATE will incorrectly use the one created
+             * for INSERT.  So first save the one created for UPDATE.
+             */
+            if (mtstate->mt_transition_capture)
+                saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
+
+            /*
+             * resultRelInfo is one of the per-subplan resultRelInfos.  So we
+             * should convert the tuple into root's tuple descriptor, since
+             * ExecInsert() starts the search from root.  The tuple conversion
+             * map list is in the order of mtstate->resultRelInfo[], so to
+             * retrieve the one for this resultRel, we need to know the
+             * position of the resultRel in mtstate->resultRelInfo[].
+             */
+            map_index = resultRelInfo - mtstate->resultRelInfo;
+            Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
+            tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
+            tuple = ConvertPartitionTupleSlot(tupconv_map,
+                                              tuple,
+                                              proute->root_tuple_slot,
+                                              &slot);
+
+            /*
+             * For ExecInsert(), make it look like we are inserting into the
+             * root.
+             */
+            Assert(mtstate->rootResultRelInfo != NULL);
+            estate->es_result_relation_info = mtstate->rootResultRelInfo;
+
+            ret_slot = ExecInsert(mtstate, slot, planSlot, NULL,
+                                  ONCONFLICT_NONE, estate, canSetTag);
+
+            /*
+             * Revert back the active result relation and the active
+             * transition capture map that we changed above.
+             */
+            estate->es_result_relation_info = resultRelInfo;
+            if (mtstate->mt_transition_capture)
+            {
+                mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+                mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+            }
+            return ret_slot;
+        }
 
         /*
         * Check the constraints of the tuple.  Note that we pass the same
         * slot for the orig_slot argument, because unlike ExecInsert(), no
         * tuple-routing is performed here, hence the slot remains unchanged.
+        * We've already checked the partition constraint above; however, we
+        * must still ensure the tuple passes all other constraints, so we
+        * will call ExecConstraints() and have it validate all remaining
+        * checks.
         */
-        if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck)
-            ExecConstraints(resultRelInfo, slot, estate, true);
+        if (resultRelationDesc->rd_att->constr)
+            ExecConstraints(resultRelInfo, slot, estate, false);
 
         /*
          * replace the heap tuple
@@ -1418,17 +1605,20 @@ fireBSTriggers(ModifyTableState *node)
 }
 
 /*
- * Return the ResultRelInfo for which we will fire AFTER STATEMENT triggers.
- * This is also the relation into whose tuple format all captured transition
- * tuples must be converted.
+ * Return the target rel ResultRelInfo.
+ *
+ * This relation is the same as:
+ * - the relation for which we will fire AFTER STATEMENT triggers.
+ * - the relation into whose tuple format all captured transition tuples must
+ *   be converted.
+ * - the root partitioned table.
  */
 static ResultRelInfo *
-getASTriggerResultRelInfo(ModifyTableState *node)
+getTargetResultRelInfo(ModifyTableState *node)
 {
     /*
-     * If the node modifies a partitioned table, we must fire its triggers.
-     * Note that in that case, node->resultRelInfo points to the first leaf
-     * partition, not the root table.
+     * Note that if the node modifies a partitioned table, node->resultRelInfo
+     * points to the first leaf partition, not the root table.
      */
     if (node->rootResultRelInfo != NULL)
         return node->rootResultRelInfo;
@@ -1442,7 +1632,7 @@ getASTriggerResultRelInfo(ModifyTableState *node)
 static void
 fireASTriggers(ModifyTableState *node)
 {
-    ResultRelInfo *resultRelInfo = getASTriggerResultRelInfo(node);
+    ResultRelInfo *resultRelInfo = getTargetResultRelInfo(node);
 
     switch (node->operation)
     {
@@ -1475,8 +1665,7 @@ fireASTriggers(ModifyTableState *node)
 static void
 ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
 {
-    ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate);
-    int         i;
+    ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate);
 
     /* Check for transition tables on the directly targeted relation. */
     mtstate->mt_transition_capture =
@@ -1499,62 +1688,141 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
     if (mtstate->mt_transition_capture != NULL ||
         mtstate->mt_oc_transition_capture != NULL)
     {
-        int         numResultRelInfos;
-        PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-
-        numResultRelInfos = (proute != NULL ?
-                             proute->num_partitions :
-                             mtstate->mt_nplans);
+        ExecSetupChildParentMapForTcs(mtstate);
 
         /*
-         * Build array of conversion maps from each child's TupleDesc to the
-         * one used in the tuplestore.  The map pointers may be NULL when no
-         * conversion is necessary, which is hopefully a common case for
-         * partitions.
+         * Install the conversion map for the first plan for UPDATE and DELETE
+         * operations.  It will be advanced each time we switch to the next
+         * plan.  (INSERT operations set it every time, so we need not update
+         * mtstate->mt_oc_transition_capture here.)
         */
-        mtstate->mt_transition_tupconv_maps = (TupleConversionMap **)
-            palloc0(sizeof(TupleConversionMap *) * numResultRelInfos);
+        if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT)
+            mtstate->mt_transition_capture->tcs_map =
+                tupconv_map_for_subplan(mtstate, 0);
+    }
+}
 
-        /* Choose the right set of partitions */
-        if (proute != NULL)
-        {
-            /*
-             * For tuple routing among partitions, we need TupleDescs based on
-             * the partition routing table.
-             */
-            ResultRelInfo **resultRelInfos = proute->partitions;
+/*
+ * Initialize the child-to-root tuple conversion map array for UPDATE subplans.
+ *
+ * This map array is required to convert the tuple from the subplan result rel
+ * to the target table descriptor.  This requirement arises for two independent
+ * scenarios:
+ * 1. For update-tuple-routing.
+ * 2. For capturing tuples in transition tables.
+ */
+void
+ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
+{
+    ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate);
+    ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
+    TupleDesc   outdesc;
+    int         numResultRelInfos = mtstate->mt_nplans;
+    int         i;
 
-            for (i = 0; i < numResultRelInfos; ++i)
-            {
-                mtstate->mt_transition_tupconv_maps[i] =
-                    convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc),
-                                           RelationGetDescr(targetRelInfo->ri_RelationDesc),
-                                           gettext_noop("could not convert row type"));
-            }
-        }
-        else
-        {
-            /* Otherwise we need the ResultRelInfo for each subplan. */
-            ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
+    /*
+     * First check if there is already a per-subplan array allocated.  Even if
+     * there is already a per-leaf map array, we won't require a per-subplan
+     * one, since we will use the subplan offset array to convert the subplan
+     * index to a per-leaf index.
+     */
+    if (mtstate->mt_per_subplan_tupconv_maps ||
+        (mtstate->mt_partition_tuple_routing &&
+         mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps))
+        return;
 
-            for (i = 0; i < numResultRelInfos; ++i)
-            {
-                mtstate->mt_transition_tupconv_maps[i] =
-                    convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc),
-                                           RelationGetDescr(targetRelInfo->ri_RelationDesc),
-                                           gettext_noop("could not convert row type"));
-            }
-        }
+    /*
+     * Build array of conversion maps from each child's TupleDesc to the one
+     * used in the target relation.  The map pointers may be NULL when no
+     * conversion is necessary, which is hopefully a common case.
+     */
+
+    /* Get tuple descriptor of the target rel. */
+    outdesc = RelationGetDescr(targetRelInfo->ri_RelationDesc);
+
+    mtstate->mt_per_subplan_tupconv_maps = (TupleConversionMap **)
+        palloc(sizeof(TupleConversionMap *) * numResultRelInfos);
+
+    for (i = 0; i < numResultRelInfos; ++i)
+    {
+        mtstate->mt_per_subplan_tupconv_maps[i] =
+            convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc),
+                                   outdesc,
+                                   gettext_noop("could not convert row type"));
+    }
+}
+
+/*
+ * Initialize the child-to-root tuple conversion map array required for
+ * capturing transition tuples.
+ *
+ * The map array can be indexed either by subplan index or by leaf-partition
+ * index.  For transition tables, we need subplan-indexed access to the map,
+ * and where tuple routing is present, we also require leaf-indexed access.
+ */
+static void
+ExecSetupChildParentMapForTcs(ModifyTableState *mtstate)
+{
+    PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+
+    /*
+     * If partition tuple routing is set up, we will require partition-indexed
+     * access.  In that case, create the map array indexed by partition; we
+     * will still be able to access the maps using a subplan index by
+     * converting the subplan index to a partition index using
+     * subplan_partition_offsets.  If tuple routing is not set up, it means we
+     * don't require partition-indexed access.  In that case, create just a
+     * subplan-indexed map.
+     */
+    if (proute)
+    {
+        /*
+         * If a partition-indexed map array is to be created, the subplan map
+         * array has to be NULL.  If the subplan map array is already created,
+         * we won't be able to access the map using a partition index.
*/ - if (mtstate->mt_transition_capture) - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; + Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); + + ExecSetupChildParentMapForLeaf(proute); + } + else + ExecSetupChildParentMapForSubplan(mtstate); +} + +/* + * For a given subplan index, get the tuple conversion map. + */ +static TupleConversionMap * +tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) +{ + /* + * If a partition-index tuple conversion map array is allocated, we need + * to first get the index into the partition array. Exactly *one* of the + * two arrays is allocated. This is because if there is a partition array + * required, we don't require subplan-indexed array since we can translate + * subplan index into partition index. And, we create a subplan-indexed + * array *only* if partition-indexed array is not required. + */ + if (mtstate->mt_per_subplan_tupconv_maps == NULL) + { + int leaf_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If subplan-indexed array is NULL, things should have been arranged + * to convert the subplan index to partition index. + */ + Assert(proute && proute->subplan_partition_offsets != NULL); + + leaf_index = proute->subplan_partition_offsets[whichplan]; + + return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), + leaf_index); + } + else + { + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } } @@ -1661,15 +1929,13 @@ ExecModifyTable(PlanState *pstate) /* Prepare to convert transition tuples from this child. */ if (node->mt_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } if (node->mt_oc_transition_capture != NULL) { - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_oc_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } continue; } @@ -1786,7 +2052,8 @@ ExecModifyTable(PlanState *pstate) break; case CMD_DELETE: slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); break; default: elog(ERROR, "unknown operation"); @@ -1830,9 +2097,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *saved_resultRelInfo; ResultRelInfo *resultRelInfo; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->partColsUpdated; PartitionTupleRouting *proute = NULL; int num_partitions = 0; @@ -1907,6 +2177,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, the + * trigger itself might modify the partition-key values. So arrange + * for tuple routing. 
+     */
+    if (resultRelInfo->ri_TrigDesc &&
+        resultRelInfo->ri_TrigDesc->trig_update_before_row &&
+        operation == CMD_UPDATE)
+        update_tuple_routing_needed = true;
+
     /* Now init the plan for this result rel */
     estate->es_result_relation_info = resultRelInfo;
     mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags);
@@ -1931,16 +2211,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 
     estate->es_result_relation_info = saved_resultRelInfo;
 
-    /* Build state for INSERT tuple routing */
-    rel = mtstate->resultRelInfo->ri_RelationDesc;
-    if (operation == CMD_INSERT &&
-        rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+    /* Get the target relation */
+    rel = (getTargetResultRelInfo(mtstate))->ri_RelationDesc;
+
+    /*
+     * If it's not a partitioned table after all, UPDATE tuple routing should
+     * not be attempted.
+     */
+    if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+        update_tuple_routing_needed = false;
+
+    /*
+     * Build state for tuple routing if it's an INSERT or if it's an UPDATE
+     * of the partition key.
+     */
+    if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
+        (operation == CMD_INSERT || update_tuple_routing_needed))
     {
         proute = mtstate->mt_partition_tuple_routing =
             ExecSetupPartitionTupleRouting(mtstate, rel,
                                            node->nominalRelation,
                                            estate);
         num_partitions = proute->num_partitions;
+
+        /*
+         * These are required as reference objects for mapping partition
+         * attno's in expressions such as WithCheckOptions and RETURNING.
+         */
+        firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
+        firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
     }
 
     /*
@@ -1951,6 +2250,17 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
         ExecSetupTransitionCaptureState(mtstate, estate);
 
     /*
+     * Construct mapping from each of the per-subplan partition attnos to the
+     * root attno.  This is required when, during update row movement, the
+     * tuple descriptor of a source partition does not match the root
+     * partitioned table descriptor.  In such a case we need to convert
+     * tuples to the root tuple descriptor, because the search for the
+     * destination partition starts from the root.  Skip this setup if it's
+     * not a partition key update.
+     */
+    if (update_tuple_routing_needed)
+        ExecSetupChildParentMapForSubplan(mtstate);
+
+    /*
      * Initialize any WITH CHECK OPTION constraints if needed.
      */
     resultRelInfo = mtstate->resultRelInfo;
@@ -1980,26 +2290,29 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
     * Build WITH CHECK OPTION constraints for each leaf partition rel.  Note
     * that we didn't build the withCheckOptionList for each partition within
     * the planner, but simple translation of the varattnos for each partition
-    * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
-    * cases are handled above.
+    * will suffice.  This only occurs for the INSERT case or for UPDATE row
+    * movement.  DELETEs and local UPDATEs are handled above.
     */
     if (node->withCheckOptionLists != NIL && num_partitions > 0)
     {
-        List       *wcoList;
-        PlanState  *plan;
+        List       *first_wcoList;
 
         /*
         * In case of INSERT on partitioned tables, there is only one plan.
        * Likewise, there is only one WITH CHECK OPTIONS list, not one per
-        * partition.  We make a copy of the WCO qual for each partition; note
-        * that, if there are SubPlans in there, they all end up attached to
-        * the one parent Plan node.
+        * partition.  Whereas for UPDATE, there are as many WCOs as there are
+        * plans.
+        * So in either case, use the WCO expression of the first
+        * resultRelInfo as a reference to calculate attno's for the WCO
+        * expression of each of the partitions.  We make a copy of the WCO
+        * qual for each partition.  Note that, if there are SubPlans in
+        * there, they all end up attached to the one parent Plan node.
         */
-        Assert(operation == CMD_INSERT &&
-               list_length(node->withCheckOptionLists) == 1 &&
-               mtstate->mt_nplans == 1);
-        wcoList = linitial(node->withCheckOptionLists);
-        plan = mtstate->mt_plans[0];
+        Assert(update_tuple_routing_needed ||
+               (operation == CMD_INSERT &&
+                list_length(node->withCheckOptionLists) == 1 &&
+                mtstate->mt_nplans == 1));
+
+        first_wcoList = linitial(node->withCheckOptionLists);
         for (i = 0; i < num_partitions; i++)
         {
             Relation    partrel;
@@ -2008,17 +2321,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
             ListCell   *ll;
 
             resultRelInfo = proute->partitions[i];
+
+            /*
+             * If we are referring to a resultRelInfo from one of the update
+             * result rels, that result rel would already have
+             * WithCheckOptions initialized.
+             */
+            if (resultRelInfo->ri_WithCheckOptions)
+                continue;
+
             partrel = resultRelInfo->ri_RelationDesc;
 
-            /* varno = node->nominalRelation */
-            mapped_wcoList = map_partition_varattnos(wcoList,
-                                                     node->nominalRelation,
-                                                     partrel, rel, NULL);
+            mapped_wcoList = map_partition_varattnos(first_wcoList,
+                                                     firstVarno,
+                                                     partrel, firstResultRel,
+                                                     NULL);
             foreach(ll, mapped_wcoList)
             {
                 WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
                 ExprState  *wcoExpr = ExecInitQual(castNode(List, wco->qual),
-                                                   plan);
+                                                   &mtstate->ps);
 
                 wcoExprs = lappend(wcoExprs, wcoExpr);
             }
@@ -2035,7 +2357,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
     {
         TupleTableSlot *slot;
         ExprContext *econtext;
-        List       *returningList;
+        List       *firstReturningList;
 
         /*
          * Initialize result tuple slot and assign its rowtype using the first
@@ -2071,22 +2393,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
         * Build a projection for each leaf partition rel.  Note that we
         * didn't build the returningList for each partition within the
         * planner, but simple translation of the varattnos for each partition
-        * will suffice.  This only occurs for the INSERT case; UPDATE/DELETE
-        * are handled above.
+        * will suffice.  This only occurs for the INSERT case or for UPDATE
+        * row movement.  DELETEs and local UPDATEs are handled above.
         */
-        returningList = linitial(node->returningLists);
+        firstReturningList = linitial(node->returningLists);
         for (i = 0; i < num_partitions; i++)
         {
             Relation    partrel;
             List       *rlist;
 
             resultRelInfo = proute->partitions[i];
+
+            /*
+             * If we are referring to a resultRelInfo from one of the update
+             * result rels, that result rel would already have a
+             * returningList built.
+             */
+            if (resultRelInfo->ri_projectReturning)
+                continue;
+
             partrel = resultRelInfo->ri_RelationDesc;
 
-            /* varno = node->nominalRelation */
-            rlist = map_partition_varattnos(returningList,
-                                            node->nominalRelation,
-                                            partrel, rel, NULL);
+            /*
+             * Use the returning expression of the first resultRelInfo as a
+             * reference to calculate attno's for the returning expression of
+             * each of the partitions.
+             */
+            rlist = map_partition_varattnos(firstReturningList,
+                                            firstVarno,
+                                            partrel, firstResultRel, NULL);
             resultRelInfo->ri_projectReturning =
                 ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
                                         resultRelInfo->ri_RelationDesc->rd_att);
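
The per-subplan reuse in ExecSetupPartitionTupleRouting depends on the per-subplan result rels appearing in the same canonical OID order as the leaf-partition list, so a single forward-only cursor is enough to pair them up. A minimal standalone sketch of that merge, using illustrative names rather than PostgreSQL source:

    #include <assert.h>

    typedef unsigned int Oid;

    /*
     * subplan_oids is a subsequence of leaf_oids, both sorted in the same
     * canonical order.  offsets[k] receives the leaf index of subplan k.
     */
    static void
    match_subplans_to_leaves(const Oid *leaf_oids, int n_leaves,
                             const Oid *subplan_oids, int n_subplans,
                             int *offsets)
    {
        int next_subplan = 0;   /* cursor over the per-subplan result rels */

        for (int i = 0; i < n_leaves; i++)
        {
            if (next_subplan < n_subplans &&
                subplan_oids[next_subplan] == leaf_oids[i])
            {
                offsets[next_subplan] = i;  /* reuse this result rel */
                next_subplan++;
            }
            /* else: a fresh result rel would be initialized for leaf i */
        }

        /* Every subplan result rel must be found among the leaves. */
        assert(next_subplan == n_subplans);
    }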
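
TupConvMapForLeaf keeps two parallel arrays because convert_tuples_by_name legitimately returns NULL when no conversion is needed, so a bare NULL entry could not be told apart from "not built yet". The child_parent_map_not_required array turns the cache into a tri-state. A reduced sketch of the pattern, with hypothetical names (build_map stands in for convert_tuples_by_name):

    #include <stdbool.h>

    typedef struct Map Map;        /* stands in for TupleConversionMap */

    extern Map *build_map(int i);  /* may return NULL if descriptors match */

    typedef struct
    {
        Map  **maps;               /* zero-initialized: all NULL */
        bool  *not_required;       /* zero-initialized: all false */
    } MapCache;

    static Map *
    get_map(MapCache *c, int i)
    {
        if (c->not_required[i])    /* known: no conversion needed */
            return NULL;
        if (c->maps[i] != NULL)    /* known: conversion needed, map cached */
            return c->maps[i];

        c->maps[i] = build_map(i); /* first access: build and classify */
        c->not_required[i] = (c->maps[i] == NULL);
        return c->maps[i];
    }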
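
The row-movement branch in ExecUpdate computes map_index = resultRelInfo - mtstate->resultRelInfo. This works because the per-subplan ResultRelInfos live in one contiguous array, and subtracting the array base from an element pointer yields the element index, not a byte offset. A self-contained illustration:

    #include <stddef.h>
    #include <stdio.h>

    typedef struct { int dummy; } ResultRelInfo;

    int
    main(void)
    {
        ResultRelInfo rels[4];
        ResultRelInfo *current = &rels[2];   /* the active subplan's rel */

        ptrdiff_t idx = current - rels;      /* element index, not bytes */
        printf("map index = %td\n", idx);    /* prints 2 */
        return 0;
    }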
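
A worked example of how subplan_partition_offsets ties the two indexing schemes together, assuming a hypothetical table with four leaf partitions and an UPDATE whose plan covers only two of them:

    /*
     * Leaf partitions (canonical order):  p0  p1  p2  p3   (leaf indexes 0..3)
     * UPDATE subplans (same order):           p1      p3   (subplan indexes 0..1)
     *
     * subplan_partition_offsets = {1, 3}
     *
     * tupconv_map_for_subplan(mtstate, 0) -> TupConvMapForLeaf(proute, ..., 1)
     * tupconv_map_for_subplan(mtstate, 1) -> TupConvMapForLeaf(proute, ..., 3)
     *
     * ExecCleanupTupleRouting walks the same pairing: leaves 1 and 3 are left
     * for ExecEndPlan() to close, while leaves 0 and 2 are closed by the
     * routing cleanup itself.
     */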
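
As for why map_partition_varattnos is needed when translating the WCO and RETURNING expressions: a leaf partition can number its attributes differently from the relation the expressions were built against, for instance when the partition was attached after the parent had dropped a column. A hypothetical layout showing the mismatch:

    /*
     * Root table:      a (attno 1), b (attno 2), ..dropped.. , c (attno 4)
     * Leaf partition:  a (attno 1), b (attno 2), c (attno 3)
     *
     * A WCO or RETURNING Var referencing c carries attno 4 when compiled
     * against the root/first result rel; for this leaf it must be rewritten
     * to attno 3, which is the translation map_partition_varattnos performs.
     */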