aboutsummaryrefslogtreecommitdiff
path: root/src/backend/executor/nodeModifyTable.c
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2020-10-15 17:08:10 +0300
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2020-10-15 17:08:10 +0300
commitc5b097f8fa69861ab83470c0bfe732ab90df9e62 (patch)
treecf764819eaa8e8d3b5928f50952a4f14e1a8228c /src/backend/executor/nodeModifyTable.c
parent7f4708818317f59a0452f549dedf4ce35182c13f (diff)
downloadpostgresql-c5b097f8fa69861ab83470c0bfe732ab90df9e62.tar.gz
postgresql-c5b097f8fa69861ab83470c0bfe732ab90df9e62.zip
Refactor code for cross-partition updates to a separate function.
ExecUpdate() is very long, so extract the part of it that deals with cross-partition updates to a separate function to make it more readable. Per Andres Freund's suggestion. Author: Amit Langote Discussion: https://www.postgresql.org/message-id/CA%2BHiwqEUgb5RdUgxR7Sqco4S09jzJstHiaT2vnCRPGR4JCAPqA%40mail.gmail.com
Diffstat (limited to 'src/backend/executor/nodeModifyTable.c')
-rw-r--r--src/backend/executor/nodeModifyTable.c265
1 files changed, 158 insertions, 107 deletions
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 6782a2dcd28..0c055ed4080 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -1059,6 +1059,148 @@ ldelete:;
return NULL;
}
+/*
+ * ExecCrossPartitionUpdate --- Move an updated tuple to another partition.
+ *
+ * This works by first deleting the old tuple from the current partition,
+ * followed by inserting the new tuple into the root parent table, that is,
+ * mtstate->rootResultRelInfo. It will be re-routed from there to the
+ * correct partition.
+ *
+ * Returns true if the tuple has been successfully moved, or if it's found
+ * that the tuple was concurrently deleted so there's nothing more to do
+ * for the caller.
+ *
+ * False is returned if the tuple we're trying to move is found to have been
+ * concurrently updated. In that case, the caller must check whether the
+ * updated tuple that's returned in *retry_slot still needs to be re-routed,
+ * and call this function again or perform a regular update accordingly.
+ */
+static bool
+ExecCrossPartitionUpdate(ModifyTableState *mtstate,
+ ResultRelInfo *resultRelInfo,
+ ItemPointer tupleid, HeapTuple oldtuple,
+ TupleTableSlot *slot, TupleTableSlot *planSlot,
+ EPQState *epqstate, bool canSetTag,
+ TupleTableSlot **retry_slot,
+ TupleTableSlot **inserted_tuple)
+{
+ EState *estate = mtstate->ps.state;
+ PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+ int map_index;
+ TupleConversionMap *tupconv_map;
+ TupleConversionMap *saved_tcs_map = NULL;
+ bool tuple_deleted;
+ TupleTableSlot *epqslot = NULL;
+
+ *inserted_tuple = NULL;
+ *retry_slot = NULL;
+
+ /*
+ * Disallow an INSERT ON CONFLICT DO UPDATE that causes the original row
+ * to migrate to a different partition. Maybe this can be implemented
+ * some day, but it seems a fringe feature with little redeeming value.
+ */
+ if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("invalid ON UPDATE specification"),
+ errdetail("The result tuple would appear in a different partition than the original tuple.")));
+
+ /*
+ * When an UPDATE is run on a leaf partition, we will not have partition
+ * tuple routing set up. In that case, fail with partition constraint
+ * violation error.
+ */
+ if (proute == NULL)
+ ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+ /*
+ * Row movement, part 1. Delete the tuple, but skip RETURNING processing.
+ * We want to return rows from INSERT.
+ */
+ ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot,
+ epqstate, estate,
+ false, /* processReturning */
+ false, /* canSetTag */
+ true, /* changingPart */
+ &tuple_deleted, &epqslot);
+
+ /*
+ * If for some reason the DELETE didn't happen (e.g. a trigger prevented
+ * it, or the tuple was already deleted by self, or it was concurrently
+ * deleted by another transaction), then we should skip the insert as well;
+ * otherwise, an UPDATE could cause an increase in the total number of
+ * rows across all partitions, which is clearly wrong.
+ *
+ * For a normal UPDATE, the case where the tuple has been the subject of a
+ * concurrent UPDATE or DELETE would be handled by the EvalPlanQual
+ * machinery, but for an UPDATE that we've translated into a DELETE from
+ * this partition and an INSERT into some other partition, that's not
+ * available, because CTID chains can't span relation boundaries. We
+ * mimic the semantics to a limited extent by skipping the INSERT if the
+ * DELETE fails to find a tuple. This ensures that two concurrent
+ * attempts to UPDATE the same tuple at the same time can't turn one tuple
+ * into two, and that an UPDATE of a just-deleted tuple can't resurrect
+ * it.
+ */
+ if (!tuple_deleted)
+ {
+ /*
+ * epqslot will typically be NULL. But when ExecDelete() finds that
+ * another transaction has concurrently updated the same row, it
+ * re-fetches the row, skips the delete, and epqslot is set to the
+ * re-fetched tuple slot. In that case, we need to do all the checks
+ * again.
+ */
+ if (TupIsNull(epqslot))
+ return true;
+ else
+ {
+ *retry_slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
+ return false;
+ }
+ }
+
+ /*
+ * resultRelInfo is one of the per-subplan resultRelInfos. So we should
+ * convert the tuple into root's tuple descriptor, since ExecInsert()
+ * starts the search from root. The tuple conversion map list is in the
+ * order of mtstate->resultRelInfo[], so to retrieve the one for this
+ * resultRel, we need to know the position of the resultRel in
+ * mtstate->resultRelInfo[].
+ */
+ map_index = resultRelInfo - mtstate->resultRelInfo;
+ Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
+ tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
+ if (tupconv_map != NULL)
+ slot = execute_attr_map_slot(tupconv_map->attrMap,
+ slot,
+ mtstate->mt_root_tuple_slot);
+
+ /*
+ * ExecInsert() may scribble on mtstate->mt_transition_capture, so save
+ * the currently active map.
+ */
+ if (mtstate->mt_transition_capture)
+ saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
+
+ /* Tuple routing starts from the root table. */
+ Assert(mtstate->rootResultRelInfo != NULL);
+ *inserted_tuple = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot,
+ planSlot, estate, canSetTag);
+
+ /* Clear the INSERT's tuple and restore the saved map. */
+ if (mtstate->mt_transition_capture)
+ {
+ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+ mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+ }
+
+ /* We're done moving. */
+ return true;
+}
+
/* ----------------------------------------------------------------
* ExecUpdate
*
@@ -1212,119 +1354,28 @@ lreplace:;
*/
if (partition_constraint_failed)
{
- bool tuple_deleted;
- TupleTableSlot *ret_slot;
- TupleTableSlot *epqslot = NULL;
- PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
- int map_index;
- TupleConversionMap *tupconv_map;
- TupleConversionMap *saved_tcs_map = NULL;
-
- /*
- * Disallow an INSERT ON CONFLICT DO UPDATE that causes the
- * original row to migrate to a different partition. Maybe this
- * can be implemented some day, but it seems a fringe feature with
- * little redeeming value.
- */
- if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("invalid ON UPDATE specification"),
- errdetail("The result tuple would appear in a different partition than the original tuple.")));
-
- /*
- * When an UPDATE is run on a leaf partition, we will not have
- * partition tuple routing set up. In that case, fail with
- * partition constraint violation error.
- */
- if (proute == NULL)
- ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
-
- /*
- * Row movement, part 1. Delete the tuple, but skip RETURNING
- * processing. We want to return rows from INSERT.
- */
- ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot,
- epqstate, estate,
- false, /* processReturning */
- false, /* canSetTag */
- true, /* changingPart */
- &tuple_deleted, &epqslot);
-
- /*
- * For some reason if DELETE didn't happen (e.g. trigger prevented
- * it, or it was already deleted by self, or it was concurrently
- * deleted by another transaction), then we should skip the insert
- * as well; otherwise, an UPDATE could cause an increase in the
- * total number of rows across all partitions, which is clearly
- * wrong.
- *
- * For a normal UPDATE, the case where the tuple has been the
- * subject of a concurrent UPDATE or DELETE would be handled by
- * the EvalPlanQual machinery, but for an UPDATE that we've
- * translated into a DELETE from this partition and an INSERT into
- * some other partition, that's not available, because CTID chains
- * can't span relation boundaries. We mimic the semantics to a
- * limited extent by skipping the INSERT if the DELETE fails to
- * find a tuple. This ensures that two concurrent attempts to
- * UPDATE the same tuple at the same time can't turn one tuple
- * into two, and that an UPDATE of a just-deleted tuple can't
- * resurrect it.
- */
- if (!tuple_deleted)
- {
- /*
- * epqslot will be typically NULL. But when ExecDelete()
- * finds that another transaction has concurrently updated the
- * same row, it re-fetches the row, skips the delete, and
- * epqslot is set to the re-fetched tuple slot. In that case,
- * we need to do all the checks again.
- */
- if (TupIsNull(epqslot))
- return NULL;
- else
- {
- slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
- goto lreplace;
- }
- }
+ TupleTableSlot *inserted_tuple,
+ *retry_slot;
+ bool retry;
/*
- * resultRelInfo is one of the per-subplan resultRelInfos. So we
- * should convert the tuple into root's tuple descriptor, since
- * ExecInsert() starts the search from root. The tuple conversion
- * map list is in the order of mtstate->resultRelInfo[], so to
- * retrieve the one for this resultRel, we need to know the
- * position of the resultRel in mtstate->resultRelInfo[].
+ * ExecCrossPartitionUpdate will first DELETE the row from the
+ * partition it's currently in and then insert it back into the
+ * root table, which will re-route it to the correct partition.
+ * The first part may have to be repeated if it is detected that
+ * the tuple we're trying to move has been concurrently updated.
*/
- map_index = resultRelInfo - mtstate->resultRelInfo;
- Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
- tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
- if (tupconv_map != NULL)
- slot = execute_attr_map_slot(tupconv_map->attrMap,
- slot,
- mtstate->mt_root_tuple_slot);
-
- /*
- * ExecInsert() may scribble on mtstate->mt_transition_capture, so
- * save the currently active map.
- */
- if (mtstate->mt_transition_capture)
- saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
-
- /* Tuple routing starts from the root table. */
- Assert(mtstate->rootResultRelInfo != NULL);
- ret_slot = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot,
- planSlot, estate, canSetTag);
-
- /* Clear the INSERT's tuple and restore the saved map. */
- if (mtstate->mt_transition_capture)
+ retry = !ExecCrossPartitionUpdate(mtstate, resultRelInfo, tupleid,
+ oldtuple, slot, planSlot,
+ epqstate, canSetTag,
+ &retry_slot, &inserted_tuple);
+ if (retry)
{
- mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
- mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+ slot = retry_slot;
+ goto lreplace;
}
- return ret_slot;
+ return inserted_tuple;
}
/*