aboutsummaryrefslogtreecommitdiff
path: root/src/backend/executor
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/executor')
-rw-r--r--src/backend/executor/execExprInterp.c1
-rw-r--r--src/backend/executor/execGrouping.c4
-rw-r--r--src/backend/executor/execIndexing.c4
-rw-r--r--src/backend/executor/execParallel.c1
-rw-r--r--src/backend/executor/execReplication.c255
-rw-r--r--src/backend/executor/nodeModifyTable.c136
-rw-r--r--src/backend/executor/nodeTidrangescan.c6
7 files changed, 389 insertions, 18 deletions
diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c
index 8a72b5e70a4..1a37737d4a2 100644
--- a/src/backend/executor/execExprInterp.c
+++ b/src/backend/executor/execExprInterp.c
@@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op)
* JsonBehavior expression.
*/
jsestate->escontext.error_occurred = false;
- jsestate->escontext.error_occurred = false;
jsestate->escontext.details_wanted = true;
}
}
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 255bd795361..b5400749353 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -144,7 +144,7 @@ execTuplesHashPrepare(int numCols,
* hashfunctions: FmgrInfos of datatype-specific hashing functions to use
* collations: collations to use in comparisons
* nbuckets: initial estimate of hashtable size
- * additionalsize: size of data stored in ->additional
+ * additionalsize: size of data that may be stored along with the hash entry
* metacxt: memory context for long-lived allocation, but not per-entry data
* tablecxt: memory context in which to store table entries
* tempcxt: short-lived context for evaluation hash and comparison functions
@@ -288,7 +288,7 @@ ResetTupleHashTable(TupleHashTable hashtable)
*
* If isnew isn't NULL, then a new entry is created if no existing entry
* matches. On return, *isnew is true if the entry is newly created,
- * false if it existed already. ->additional_data in the new entry has
+ * false if it existed already. The additional data in the new entry has
* been zeroed.
*/
TupleHashEntry
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index bdf862b2406..ca33a854278 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -279,7 +279,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
* executor is performing an UPDATE that could not use an
* optimization like heapam's HOT (in more general terms a
* call to table_tuple_update() took place and set
- * 'update_indexes' to TUUI_All). Receiving this hint makes
+ * 'update_indexes' to TU_All). Receiving this hint makes
* us consider if we should pass down the 'indexUnchanged'
* hint in turn. That's something that we figure out for
* each index_insert() call iff 'update' is true.
@@ -290,7 +290,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo)
* HOT has been applied and any updated columns are indexed
* only by summarizing indexes (or in more general terms a
* call to table_tuple_update() took place and set
- * 'update_indexes' to TUUI_Summarizing). We can (and must)
+ * 'update_indexes' to TU_Summarizing). We can (and must)
* therefore only update the indexes that have
* 'amsummarizing' = true.
*
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index f3e77bda279..f098a5557cf 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -189,6 +189,7 @@ ExecSerializePlan(Plan *plan, EState *estate)
pstmt->permInfos = estate->es_rteperminfos;
pstmt->resultRelations = NIL;
pstmt->appendRelations = NIL;
+ pstmt->planOrigin = PLAN_STMT_INTERNAL;
/*
* Transfer only parallel-safe subplans, leaving a NULL "hole" in the list
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 53ddd25c42d..68184f5d671 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -14,12 +14,14 @@
#include "postgres.h"
+#include "access/commit_ts.h"
#include "access/genam.h"
#include "access/gist.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
+#include "access/heapam.h"
#include "catalog/pg_am_d.h"
#include "commands/trigger.h"
#include "executor/executor.h"
@@ -36,7 +38,7 @@
static bool tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2,
- TypeCacheEntry **eq);
+ TypeCacheEntry **eq, Bitmapset *columns);
/*
* Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that
@@ -221,7 +223,7 @@ retry:
if (eq == NULL)
eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts);
- if (!tuples_equal(outslot, searchslot, eq))
+ if (!tuples_equal(outslot, searchslot, eq, NULL))
continue;
}
@@ -277,10 +279,13 @@ retry:
/*
* Compare the tuples in the slots by checking if they have equal values.
+ *
+ * If 'columns' is not null, only the columns specified within it will be
+ * considered for the equality check, ignoring all other columns.
*/
static bool
tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2,
- TypeCacheEntry **eq)
+ TypeCacheEntry **eq, Bitmapset *columns)
{
int attrnum;
@@ -306,6 +311,14 @@ tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2,
continue;
/*
+ * Ignore columns that are not listed for checking.
+ */
+ if (columns &&
+ !bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber,
+ columns))
+ continue;
+
+ /*
* If one value is NULL and other is not, then they are certainly not
* equal
*/
@@ -380,7 +393,7 @@ retry:
/* Try to find the tuple */
while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot))
{
- if (!tuples_equal(scanslot, searchslot, eq))
+ if (!tuples_equal(scanslot, searchslot, eq, NULL))
continue;
found = true;
@@ -456,6 +469,236 @@ BuildConflictIndexInfo(ResultRelInfo *resultRelInfo, Oid conflictindex)
}
/*
+ * If the tuple is recently dead and was deleted by a transaction with a newer
+ * commit timestamp than previously recorded, update the associated transaction
+ * ID, commit time, and origin. This helps ensure that conflict detection uses
+ * the most recent and relevant deletion metadata.
+ */
+static void
+update_most_recent_deletion_info(TupleTableSlot *scanslot,
+ TransactionId oldestxmin,
+ TransactionId *delete_xid,
+ TimestampTz *delete_time,
+ RepOriginId *delete_origin)
+{
+ BufferHeapTupleTableSlot *hslot;
+ HeapTuple tuple;
+ Buffer buf;
+ bool recently_dead = false;
+ TransactionId xmax;
+ TimestampTz localts;
+ RepOriginId localorigin;
+
+ hslot = (BufferHeapTupleTableSlot *) scanslot;
+
+ tuple = ExecFetchSlotHeapTuple(scanslot, false, NULL);
+ buf = hslot->buffer;
+
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+ /*
+ * We do not consider HEAPTUPLE_DEAD status because it indicates either
+ * tuples whose inserting transaction was aborted (meaning there is no
+ * commit timestamp or origin), or tuples deleted by a transaction older
+ * than oldestxmin, making it safe to ignore them during conflict
+ * detection (See comments atop worker.c for details).
+ */
+ if (HeapTupleSatisfiesVacuum(tuple, oldestxmin, buf) == HEAPTUPLE_RECENTLY_DEAD)
+ recently_dead = true;
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (!recently_dead)
+ return;
+
+ xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
+ if (!TransactionIdIsValid(xmax))
+ return;
+
+ /* Select the dead tuple with the most recent commit timestamp */
+ if (TransactionIdGetCommitTsData(xmax, &localts, &localorigin) &&
+ TimestampDifferenceExceeds(*delete_time, localts, 0))
+ {
+ *delete_xid = xmax;
+ *delete_time = localts;
+ *delete_origin = localorigin;
+ }
+}
+
+/*
+ * Searches the relation 'rel' for the most recently deleted tuple that matches
+ * the values in 'searchslot' and is not yet removable by VACUUM. The function
+ * returns the transaction ID, origin, and commit timestamp of the transaction
+ * that deleted this tuple.
+ *
+ * 'oldestxmin' acts as a cutoff transaction ID. Tuples deleted by transactions
+ * with IDs >= 'oldestxmin' are considered recently dead and are eligible for
+ * conflict detection.
+ *
+ * Instead of stopping at the first match, we scan all matching dead tuples to
+ * identify most recent deletion. This is crucial because only the latest
+ * deletion is relevant for resolving conflicts.
+ *
+ * For example, consider a scenario on the subscriber where a row is deleted,
+ * re-inserted, and then deleted again only on the subscriber:
+ *
+ * - (pk, 1) - deleted at 9:00,
+ * - (pk, 1) - deleted at 9:02,
+ *
+ * Now, a remote update arrives: (pk, 1) -> (pk, 2), timestamped at 9:01.
+ *
+ * If we mistakenly return the older deletion (9:00), the system may wrongly
+ * apply the remote update using a last-update-wins strategy. Instead, we must
+ * recognize the more recent deletion at 9:02 and skip the update. See
+ * comments atop worker.c for details. Note, as of now, conflict resolution
+ * is not implemented. Consequently, the system may incorrectly report the
+ * older tuple as the conflicted one, leading to misleading results.
+ *
+ * The commit timestamp of the deleting transaction is used to determine which
+ * tuple was deleted most recently.
+ */
+bool
+RelationFindDeletedTupleInfoSeq(Relation rel, TupleTableSlot *searchslot,
+ TransactionId oldestxmin,
+ TransactionId *delete_xid,
+ RepOriginId *delete_origin,
+ TimestampTz *delete_time)
+{
+ TupleTableSlot *scanslot;
+ TableScanDesc scan;
+ TypeCacheEntry **eq;
+ Bitmapset *indexbitmap;
+ TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel);
+
+ Assert(equalTupleDescs(desc, searchslot->tts_tupleDescriptor));
+
+ *delete_xid = InvalidTransactionId;
+ *delete_origin = InvalidRepOriginId;
+ *delete_time = 0;
+
+ /*
+ * If the relation has a replica identity key or a primary key that is
+ * unusable for locating deleted tuples (see
+ * IsIndexUsableForFindingDeletedTuple), a full table scan becomes
+ * necessary. In such cases, comparing the entire tuple is not required,
+ * since the remote tuple might not include all column values. Instead,
+ * the indexed columns alone are suffcient to identify the target tuple
+ * (see logicalrep_rel_mark_updatable).
+ */
+ indexbitmap = RelationGetIndexAttrBitmap(rel,
+ INDEX_ATTR_BITMAP_IDENTITY_KEY);
+
+ /* fallback to PK if no replica identity */
+ if (!indexbitmap)
+ indexbitmap = RelationGetIndexAttrBitmap(rel,
+ INDEX_ATTR_BITMAP_PRIMARY_KEY);
+
+ eq = palloc0(sizeof(*eq) * searchslot->tts_tupleDescriptor->natts);
+
+ /*
+ * Start a heap scan using SnapshotAny to identify dead tuples that are
+ * not visible under a standard MVCC snapshot. Tuples from transactions
+ * not yet committed or those just committed prior to the scan are
+ * excluded in update_most_recent_deletion_info().
+ */
+ scan = table_beginscan(rel, SnapshotAny, 0, NULL);
+ scanslot = table_slot_create(rel, NULL);
+
+ table_rescan(scan, NULL);
+
+ /* Try to find the tuple */
+ while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot))
+ {
+ if (!tuples_equal(scanslot, searchslot, eq, indexbitmap))
+ continue;
+
+ update_most_recent_deletion_info(scanslot, oldestxmin, delete_xid,
+ delete_time, delete_origin);
+ }
+
+ table_endscan(scan);
+ ExecDropSingleTupleTableSlot(scanslot);
+
+ return *delete_time != 0;
+}
+
+/*
+ * Similar to RelationFindDeletedTupleInfoSeq() but using index scan to locate
+ * the deleted tuple.
+ */
+bool
+RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid,
+ TupleTableSlot *searchslot,
+ TransactionId oldestxmin,
+ TransactionId *delete_xid,
+ RepOriginId *delete_origin,
+ TimestampTz *delete_time)
+{
+ Relation idxrel;
+ ScanKeyData skey[INDEX_MAX_KEYS];
+ int skey_attoff;
+ IndexScanDesc scan;
+ TupleTableSlot *scanslot;
+ TypeCacheEntry **eq = NULL;
+ bool isIdxSafeToSkipDuplicates;
+ TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel);
+
+ Assert(equalTupleDescs(desc, searchslot->tts_tupleDescriptor));
+ Assert(OidIsValid(idxoid));
+
+ *delete_xid = InvalidTransactionId;
+ *delete_time = 0;
+ *delete_origin = InvalidRepOriginId;
+
+ isIdxSafeToSkipDuplicates = (GetRelationIdentityOrPK(rel) == idxoid);
+
+ scanslot = table_slot_create(rel, NULL);
+
+ idxrel = index_open(idxoid, RowExclusiveLock);
+
+ /* Build scan key. */
+ skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot);
+
+ /*
+ * Start an index scan using SnapshotAny to identify dead tuples that are
+ * not visible under a standard MVCC snapshot. Tuples from transactions
+ * not yet committed or those just committed prior to the scan are
+ * excluded in update_most_recent_deletion_info().
+ */
+ scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0);
+
+ index_rescan(scan, skey, skey_attoff, NULL, 0);
+
+ /* Try to find the tuple */
+ while (index_getnext_slot(scan, ForwardScanDirection, scanslot))
+ {
+ /*
+ * Avoid expensive equality check if the index is primary key or
+ * replica identity index.
+ */
+ if (!isIdxSafeToSkipDuplicates)
+ {
+ if (eq == NULL)
+ eq = palloc0(sizeof(*eq) * scanslot->tts_tupleDescriptor->natts);
+
+ if (!tuples_equal(scanslot, searchslot, eq, NULL))
+ continue;
+ }
+
+ update_most_recent_deletion_info(scanslot, oldestxmin, delete_xid,
+ delete_time, delete_origin);
+ }
+
+ index_endscan(scan);
+
+ index_close(idxrel, NoLock);
+
+ ExecDropSingleTupleTableSlot(scanslot);
+
+ return *delete_time != 0;
+}
+
+/*
* Find the tuple that violates the passed unique index (conflictindex).
*
* If the conflicting tuple is found return true, otherwise false.
@@ -670,7 +913,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo,
resultRelInfo->ri_TrigDesc->trig_update_before_row)
{
if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo,
- tid, NULL, slot, NULL, NULL))
+ tid, NULL, slot, NULL, NULL, false))
skip_tuple = true; /* "do nothing" */
}
@@ -746,7 +989,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo,
resultRelInfo->ri_TrigDesc->trig_delete_before_row)
{
skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
- tid, NULL, NULL, NULL, NULL);
+ tid, NULL, NULL, NULL, NULL, false);
}
if (!skip_tuple)
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 2bc89bf84dc..7c6c2c1f6e4 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -64,6 +64,7 @@
#include "nodes/nodeFuncs.h"
#include "optimizer/optimizer.h"
#include "rewrite/rewriteHandler.h"
+#include "rewrite/rewriteManip.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/datum.h"
@@ -1473,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
return ExecBRDeleteTriggers(context->estate, context->epqstate,
resultRelInfo, tupleid, oldtuple,
- epqreturnslot, result, &context->tmfd);
+ epqreturnslot, result, &context->tmfd,
+ context->mtstate->operation == CMD_MERGE);
}
return true;
@@ -2116,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo,
return ExecBRUpdateTriggers(context->estate, context->epqstate,
resultRelInfo, tupleid, oldtuple, slot,
- result, &context->tmfd);
+ result, &context->tmfd,
+ context->mtstate->operation == CMD_MERGE);
}
return true;
@@ -3735,6 +3738,7 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate)
switch (action->commandType)
{
case CMD_INSERT:
+ /* INSERT actions always use rootRelInfo */
ExecCheckPlanOutput(rootRelInfo->ri_RelationDesc,
action->targetList);
@@ -3774,9 +3778,23 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate)
}
else
{
- /* not partitioned? use the stock relation and slot */
- tgtslot = resultRelInfo->ri_newTupleSlot;
- tgtdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc);
+ /*
+ * If the MERGE targets an inherited table, we insert
+ * into the root table, so we must initialize its
+ * "new" tuple slot, if not already done, and use its
+ * relation descriptor for the projection.
+ *
+ * For non-inherited tables, rootRelInfo and
+ * resultRelInfo are the same, and the "new" tuple
+ * slot will already have been initialized.
+ */
+ if (rootRelInfo->ri_newTupleSlot == NULL)
+ rootRelInfo->ri_newTupleSlot =
+ table_slot_create(rootRelInfo->ri_RelationDesc,
+ &estate->es_tupleTable);
+
+ tgtslot = rootRelInfo->ri_newTupleSlot;
+ tgtdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc);
}
action_state->mas_proj =
@@ -3809,6 +3827,114 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate)
}
}
}
+
+ /*
+ * If the MERGE targets an inherited table, any INSERT actions will use
+ * rootRelInfo, and rootRelInfo will not be in the resultRelInfo array.
+ * Therefore we must initialize its WITH CHECK OPTION constraints and
+ * RETURNING projection, as ExecInitModifyTable did for the resultRelInfo
+ * entries.
+ *
+ * Note that the planner does not build a withCheckOptionList or
+ * returningList for the root relation, but as in ExecInitPartitionInfo,
+ * we can use the first resultRelInfo entry as a reference to calculate
+ * the attno's for the root table.
+ */
+ if (rootRelInfo != mtstate->resultRelInfo &&
+ rootRelInfo->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE &&
+ (mtstate->mt_merge_subcommands & MERGE_INSERT) != 0)
+ {
+ ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
+ Relation rootRelation = rootRelInfo->ri_RelationDesc;
+ Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc;
+ int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex;
+ AttrMap *part_attmap = NULL;
+ bool found_whole_row;
+
+ if (node->withCheckOptionLists != NIL)
+ {
+ List *wcoList;
+ List *wcoExprs = NIL;
+
+ /* There should be as many WCO lists as result rels */
+ Assert(list_length(node->withCheckOptionLists) ==
+ list_length(node->resultRelations));
+
+ /*
+ * Use the first WCO list as a reference. In the most common case,
+ * this will be for the same relation as rootRelInfo, and so there
+ * will be no need to adjust its attno's.
+ */
+ wcoList = linitial(node->withCheckOptionLists);
+ if (rootRelation != firstResultRel)
+ {
+ /* Convert any Vars in it to contain the root's attno's */
+ part_attmap =
+ build_attrmap_by_name(RelationGetDescr(rootRelation),
+ RelationGetDescr(firstResultRel),
+ false);
+
+ wcoList = (List *)
+ map_variable_attnos((Node *) wcoList,
+ firstVarno, 0,
+ part_attmap,
+ RelationGetForm(rootRelation)->reltype,
+ &found_whole_row);
+ }
+
+ foreach(lc, wcoList)
+ {
+ WithCheckOption *wco = lfirst_node(WithCheckOption, lc);
+ ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual),
+ &mtstate->ps);
+
+ wcoExprs = lappend(wcoExprs, wcoExpr);
+ }
+
+ rootRelInfo->ri_WithCheckOptions = wcoList;
+ rootRelInfo->ri_WithCheckOptionExprs = wcoExprs;
+ }
+
+ if (node->returningLists != NIL)
+ {
+ List *returningList;
+
+ /* There should be as many returning lists as result rels */
+ Assert(list_length(node->returningLists) ==
+ list_length(node->resultRelations));
+
+ /*
+ * Use the first returning list as a reference. In the most common
+ * case, this will be for the same relation as rootRelInfo, and so
+ * there will be no need to adjust its attno's.
+ */
+ returningList = linitial(node->returningLists);
+ if (rootRelation != firstResultRel)
+ {
+ /* Convert any Vars in it to contain the root's attno's */
+ if (part_attmap == NULL)
+ part_attmap =
+ build_attrmap_by_name(RelationGetDescr(rootRelation),
+ RelationGetDescr(firstResultRel),
+ false);
+
+ returningList = (List *)
+ map_variable_attnos((Node *) returningList,
+ firstVarno, 0,
+ part_attmap,
+ RelationGetForm(rootRelation)->reltype,
+ &found_whole_row);
+ }
+ rootRelInfo->ri_returningList = returningList;
+
+ /* Initialize the RETURNING projection */
+ rootRelInfo->ri_projectReturning =
+ ExecBuildProjectionInfo(returningList, econtext,
+ mtstate->ps.ps_ResultTupleSlot,
+ &mtstate->ps,
+ RelationGetDescr(rootRelation));
+ }
+ }
}
/*
diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c
index ab2eab9596e..26f7420b64b 100644
--- a/src/backend/executor/nodeTidrangescan.c
+++ b/src/backend/executor/nodeTidrangescan.c
@@ -128,9 +128,11 @@ TidExprListCreate(TidRangeScanState *tidrangestate)
* TidRangeEval
*
* Compute and set node's block and offset range to scan by evaluating
- * the trss_tidexprs. Returns false if we detect the range cannot
+ * node->trss_tidexprs. Returns false if we detect the range cannot
* contain any tuples. Returns true if it's possible for the range to
- * contain tuples.
+ * contain tuples. We don't bother validating that trss_mintid is less
+ * than or equal to trss_maxtid, as the scan_set_tidrange() table AM
+ * function will handle that.
* ----------------------------------------------------------------
*/
static bool