author     Alvaro Herrera <alvherre@alvh.no-ip.org>   2013-01-23 12:04:59 -0300
committer  Alvaro Herrera <alvherre@alvh.no-ip.org>   2013-01-23 12:04:59 -0300
commit     0ac5ad5134f2769ccbaefec73844f8504c4d6182
tree       d9b0ba4a1b65a52030820efe68a9c937c46aad1f   /src/backend/access/heap/heapam.c
parent     f925c79b9f36c54b67053ade5ad225a75b8dc803
Improve concurrency of foreign key locking
This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with the already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety.

Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch.

The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of the lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers.

Tuple time qualification rules (the HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas previously they were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish.

Another important change is that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. It also causes additional WAL-logging: there was previously a single WAL record for a locked tuple; now there are as many as there exist updated copies of the tuple.

With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, an old misbehavior has been fixed: when a subtransaction grabbed a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborted, the weaker lock used to be lost.

Many new spec files were added for the isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests.

There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time on it. The original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund.

This patch was discussed in several pgsql-hackers threads; the most important ones start at the following message-ids:
AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com
1290721684-sup-3951@alvh.no-ip.org
1294953201-sup-2099@alvh.no-ip.org
1320343602-sup-2290@alvh.no-ip.org
1339690386-sup-8927@alvh.no-ip.org
4FE5FF020200002500048A3D@gw.wicourts.gov
4FEAB90A0200002500048B7D@gw.wicourts.gov
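To make the locking change concrete, here is a minimal two-session sketch (illustrative only, not part of the commit; the parent table and its columns are hypothetical):

-- Hypothetical schema: CREATE TABLE parent (id int PRIMARY KEY, note text);

-- Session 1: take the lock the RI triggers now use for FK checks
-- (before this patch they used SELECT ... FOR SHARE):
BEGIN;
SELECT 1 FROM parent WHERE id = 42 FOR KEY SHARE;

-- Session 2: an UPDATE that leaves the key column alone now takes only
-- a FOR NO KEY UPDATE tuple lock, which does not conflict with
-- FOR KEY SHARE, so it proceeds without waiting:
BEGIN;
UPDATE parent SET note = 'touched' WHERE id = 42;   -- does not block

-- An UPDATE that modifies the key still takes the strongest lock and
-- must wait for session 1 to finish, as before:
UPDATE parent SET id = 43 WHERE id = 42;            -- blocks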
Diffstat (limited to 'src/backend/access/heap/heapam.c')
-rw-r--r--  src/backend/access/heap/heapam.c | 2187
1 file changed, 1777 insertions(+), 410 deletions(-)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b19d1cf6c57..57d47e86014 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -84,12 +84,105 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
TransactionId xid, CommandId cid, int options);
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
- ItemPointerData from, Buffer newbuf, HeapTuple newtup,
- bool all_visible_cleared, bool new_all_visible_cleared);
-static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
- HeapTuple oldtup, HeapTuple newtup);
+ Buffer newbuf, HeapTuple oldtup,
+ HeapTuple newtup, bool all_visible_cleared,
+ bool new_all_visible_cleared);
+static void HeapSatisfiesHOTandKeyUpdate(Relation relation,
+ Bitmapset *hot_attrs, Bitmapset *key_attrs,
+ bool *satisfies_hot, bool *satisfies_key,
+ HeapTuple oldtup, HeapTuple newtup);
+static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
+ uint16 old_infomask2, TransactionId add_to_xmax,
+ LockTupleMode mode, bool is_update,
+ TransactionId *result_xmax, uint16 *result_infomask,
+ uint16 *result_infomask2);
+static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
+ ItemPointer ctid, TransactionId xid,
+ LockTupleMode mode);
+static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
+ uint16 *new_infomask2);
+static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
+ uint16 t_infomask);
+static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+ int *remaining, uint16 infomask);
+static bool ConditionalMultiXactIdWait(MultiXactId multi,
+ MultiXactStatus status, int *remaining,
+ uint16 infomask);
+/*
+ * Each tuple lock mode has a corresponding heavyweight lock, and one or two
+ * corresponding MultiXactStatuses (one to merely lock tuples, another one to
+ * update them). This table (and the macros below) helps us determine the
+ * heavyweight lock mode and MultiXactStatus values to use for any particular
+ * tuple lock strength.
+ */
+static const struct
+{
+ LOCKMODE hwlock;
+ MultiXactStatus lockstatus;
+ MultiXactStatus updstatus;
+}
+tupleLockExtraInfo[MaxLockTupleMode + 1] =
+{
+ { /* LockTupleKeyShare */
+ AccessShareLock,
+ MultiXactStatusForKeyShare,
+ -1 /* KeyShare does not allow updating tuples */
+ },
+ { /* LockTupleShare */
+ RowShareLock,
+ MultiXactStatusForShare,
+ -1 /* Share does not allow updating tuples */
+ },
+ { /* LockTupleNoKeyExclusive */
+ ExclusiveLock,
+ MultiXactStatusForNoKeyUpdate,
+ MultiXactStatusNoKeyUpdate
+ },
+ { /* LockTupleExclusive */
+ AccessExclusiveLock,
+ MultiXactStatusForUpdate,
+ MultiXactStatusUpdate
+ }
+};
+/* Get the LOCKMODE for a given MultiXactStatus */
+#define LOCKMODE_from_mxstatus(status) \
+ (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
+
+/*
+ * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
+ * This is more readable than having every caller translate it to lock.h's
+ * LOCKMODE.
+ */
+#define LockTupleTuplock(rel, tup, mode) \
+ LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
+#define UnlockTupleTuplock(rel, tup, mode) \
+ UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
+#define ConditionalLockTupleTuplock(rel, tup, mode) \
+ ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
+
+/*
+ * This table maps tuple lock strength values for each particular
+ * MultiXactStatus value.
+ */
+static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
+{
+ LockTupleKeyShare, /* ForKeyShare */
+ LockTupleShare, /* ForShare */
+ LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
+ LockTupleExclusive, /* ForUpdate */
+ LockTupleNoKeyExclusive, /* NoKeyUpdate */
+ LockTupleExclusive /* Update */
+};
+
+/* Get the LockTupleMode for a given MultiXactStatus */
+#define TUPLOCK_from_mxstatus(status) \
+ (MultiXactStatusLock[(status)])
+/* Get the is_update bit for a given MultiXactStatus */
+#define ISUPDATE_from_mxstatus(status) \
+ ((status) > MultiXactStatusForUpdate)
+
/* ----------------------------------------------------------------
* heap support routines
* ----------------------------------------------------------------
@@ -1664,7 +1757,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
ItemPointerGetBlockNumber(tid));
offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
at_chain_start = false;
- prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+ prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
}
else
break; /* end of chain */
@@ -1787,7 +1880,7 @@ heap_get_latest_tid(Relation relation,
* tuple. Check for XMIN match.
*/
if (TransactionIdIsValid(priorXmax) &&
- !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
+ !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
{
UnlockReleaseBuffer(buffer);
break;
@@ -1805,7 +1898,8 @@ heap_get_latest_tid(Relation relation,
/*
* If there's a valid t_ctid link, follow it, else we're done.
*/
- if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) ||
+ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+ HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
{
UnlockReleaseBuffer(buffer);
@@ -1813,7 +1907,7 @@ heap_get_latest_tid(Relation relation,
}
ctid = tp.t_data->t_ctid;
- priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
+ priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
UnlockReleaseBuffer(buffer);
} /* end of loop */
}
@@ -1826,17 +1920,25 @@ heap_get_latest_tid(Relation relation,
* If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
* be set on exit. If the transaction committed, we set the XMAX_COMMITTED
* hint bit if possible --- but beware that that may not yet be possible,
- * if the transaction committed asynchronously. Hence callers should look
- * only at XMAX_INVALID.
+ * if the transaction committed asynchronously.
+ *
+ * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
+ * even if it commits.
+ *
+ * Hence callers should look only at XMAX_INVALID.
+ *
+ * Note this is not allowed for tuples whose xmax is a multixact.
*/
static void
UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
{
- Assert(TransactionIdEquals(HeapTupleHeaderGetXmax(tuple), xid));
+ Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
+ Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
{
- if (TransactionIdDidCommit(xid))
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
+ TransactionIdDidCommit(xid))
HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
xid);
else
@@ -2374,6 +2476,26 @@ simple_heap_insert(Relation relation, HeapTuple tup)
}
/*
+ * Given infomask/infomask2, compute the bits that must be saved in the
+ * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
+ * xl_heap_lock_updated WAL records.
+ *
+ * See fix_infomask_from_infobits.
+ */
+static uint8
+compute_infobits(uint16 infomask, uint16 infomask2)
+{
+ return
+ ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
+ ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
+ ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
+ /* note we ignore HEAP_XMAX_SHR_LOCK here */
+ ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
+ ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
+ XLHL_KEYS_UPDATED : 0);
+}
+
+/*
* heap_delete - delete a tuple
*
* NB: do not call this directly unless you are prepared to deal with
@@ -2393,7 +2515,8 @@ simple_heap_insert(Relation relation, HeapTuple tup)
* (the last only possible if wait == false).
*
* In the failure cases, the routine fills *hufd with the tuple's t_ctid,
- * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we
+ * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
+ * (the last only for HeapTupleSelfUpdated, since we
* cannot obtain cmax from a combocid generated by another transaction).
* See comments for struct HeapUpdateFailureData for additional info.
*/
@@ -2410,6 +2533,9 @@ heap_delete(Relation relation, ItemPointer tid,
BlockNumber block;
Buffer buffer;
Buffer vmbuffer = InvalidBuffer;
+ TransactionId new_xmax;
+ uint16 new_infomask,
+ new_infomask2;
bool have_tuple_lock = false;
bool iscombo;
bool all_visible_cleared = false;
@@ -2465,7 +2591,7 @@ l1:
uint16 infomask;
/* must copy state data before unlocking buffer */
- xwait = HeapTupleHeaderGetXmax(tp.t_data);
+ xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
infomask = tp.t_data->t_infomask;
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -2481,20 +2607,20 @@ l1:
*/
if (!have_tuple_lock)
{
- LockTuple(relation, &(tp.t_self), ExclusiveLock);
+ LockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
have_tuple_lock = true;
}
/*
* Sleep until concurrent transaction ends. Note that we don't care
- * if the locker has an exclusive or shared lock, because we need
- * exclusive.
+ * which lock mode the locker has, because we need the strongest one.
*/
if (infomask & HEAP_XMAX_IS_MULTI)
{
/* wait for multixact */
- MultiXactIdWait((MultiXactId) xwait);
+ MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate,
+ NULL, infomask);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/*
@@ -2503,7 +2629,7 @@ l1:
* change, and start over if so.
*/
if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
xwait))
goto l1;
@@ -2529,7 +2655,7 @@ l1:
* Check for xmax change, and start over if so.
*/
if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleHeaderGetXmax(tp.t_data),
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
xwait))
goto l1;
@@ -2541,8 +2667,9 @@ l1:
* We may overwrite if previous xmax aborted, or if it committed but
* only locked the tuple without updating it.
*/
- if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
- HEAP_IS_LOCKED))
+ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+ HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
+ HeapTupleHeaderIsOnlyLocked(tp.t_data))
result = HeapTupleMayBeUpdated;
else
result = HeapTupleUpdated;
@@ -2562,14 +2689,14 @@ l1:
result == HeapTupleBeingUpdated);
Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
hufd->ctid = tp.t_data->t_ctid;
- hufd->xmax = HeapTupleHeaderGetXmax(tp.t_data);
+ hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
if (result == HeapTupleSelfUpdated)
hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
else
hufd->cmax = 0; /* for lack of an InvalidCommandId value */
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
- UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+ UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
return result;
@@ -2603,14 +2730,29 @@ l1:
vmbuffer);
}
+ /*
+ * If this is the first possibly-multixact-able operation in the
+ * current transaction, set my per-backend OldestMemberMXactId setting.
+ * We can be certain that the transaction will never become a member of
+ * any older MultiXactIds than that. (We have to do this even if we
+ * end up just using our own TransactionId below, since some other
+ * backend could incorporate our XID into a MultiXact immediately
+ * afterwards.)
+ */
+ MultiXactIdSetOldestMember();
+
+ compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
+ tp.t_data->t_infomask, tp.t_data->t_infomask2,
+ xid, LockTupleExclusive, true,
+ &new_xmax, &new_infomask, &new_infomask2);
+
/* store transaction information of xact deleting the tuple */
- tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
+ tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+ tp.t_data->t_infomask |= new_infomask;
+ tp.t_data->t_infomask2 |= new_infomask2;
HeapTupleHeaderClearHotUpdated(tp.t_data);
- HeapTupleHeaderSetXmax(tp.t_data, xid);
+ HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
/* Make sure there is no forward chain link in t_ctid */
tp.t_data->t_ctid = tp.t_self;
@@ -2625,8 +2767,11 @@ l1:
XLogRecData rdata[2];
xlrec.all_visible_cleared = all_visible_cleared;
+ xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
+ tp.t_data->t_infomask2);
xlrec.target.node = relation->rd_node;
xlrec.target.tid = tp.t_self;
+ xlrec.xmax = new_xmax;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapDelete;
rdata[0].buffer = InvalidBuffer;
@@ -2679,7 +2824,7 @@ l1:
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
- UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+ UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
pgstat_count_heap_delete(relation);
@@ -2739,6 +2884,7 @@ simple_heap_delete(Relation relation, ItemPointer tid)
* crosscheck - if not InvalidSnapshot, also check old tuple against this
* wait - true if should wait for any conflicting update to commit/abort
* hufd - output parameter, filled in failure cases (see below)
+ * lockmode - output parameter, filled with lock mode acquired on tuple
*
* Normal, successful return value is HeapTupleMayBeUpdated, which
* actually means we *did* update it. Failure return codes are
@@ -2752,23 +2898,26 @@ simple_heap_delete(Relation relation, ItemPointer tid)
* data are not reflected into *newtup.
*
* In the failure cases, the routine fills *hufd with the tuple's t_ctid,
- * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we
+ * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
+ * (the last only for HeapTupleSelfUpdated, since we
* cannot obtain cmax from a combocid generated by another transaction).
* See comments for struct HeapUpdateFailureData for additional info.
*/
HTSU_Result
heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
CommandId cid, Snapshot crosscheck, bool wait,
- HeapUpdateFailureData *hufd)
+ HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
{
HTSU_Result result;
TransactionId xid = GetCurrentTransactionId();
Bitmapset *hot_attrs;
+ Bitmapset *key_attrs;
ItemId lp;
HeapTupleData oldtup;
HeapTuple heaptup;
Page page;
BlockNumber block;
+ MultiXactStatus mxact_status;
Buffer buffer,
newbuf,
vmbuffer = InvalidBuffer,
@@ -2779,9 +2928,20 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
pagefree;
bool have_tuple_lock = false;
bool iscombo;
+ bool satisfies_hot;
+ bool satisfies_key;
bool use_hot_update = false;
+ bool key_intact;
bool all_visible_cleared = false;
bool all_visible_cleared_new = false;
+ bool checked_lockers;
+ bool locker_remains;
+ TransactionId xmax_new_tuple,
+ xmax_old_tuple;
+ uint16 infomask_old_tuple,
+ infomask2_old_tuple,
+ infomask_new_tuple,
+ infomask2_new_tuple;
Assert(ItemPointerIsValid(otid));
@@ -2797,7 +2957,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
* Note that we get a copy here, so we need not worry about relcache flush
* happening midway through.
*/
- hot_attrs = RelationGetIndexAttrBitmap(relation);
+ hot_attrs = RelationGetIndexAttrBitmap(relation, false);
+ key_attrs = RelationGetIndexAttrBitmap(relation, true);
block = ItemPointerGetBlockNumber(otid);
buffer = ReadBuffer(relation, block);
@@ -2822,6 +2983,44 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
oldtup.t_self = *otid;
/*
+ * If we're not updating any "key" column, we can grab a weaker lock type.
+ * This allows for more concurrency when we are running simultaneously with
+ * foreign key checks.
+ *
+ * Note that if a column gets detoasted while executing the update, but the
+ * value ends up being the same, this test will fail and we will use the
+ * stronger lock. This is acceptable; the important case to optimize is
+ * updates that don't manipulate key columns, not those that
+ * serendipitously arrive at the same key values.
+ */
+ HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs,
+ &satisfies_hot, &satisfies_key,
+ &oldtup, newtup);
+ if (satisfies_key)
+ {
+ *lockmode = LockTupleNoKeyExclusive;
+ mxact_status = MultiXactStatusNoKeyUpdate;
+ key_intact = true;
+
+ /*
+ * If this is the first possibly-multixact-able operation in the
+ * current transaction, set my per-backend OldestMemberMXactId setting.
+ * We can be certain that the transaction will never become a member of
+ * any older MultiXactIds than that. (We have to do this even if we
+ * end up just using our own TransactionId below, since some other
+ * backend could incorporate our XID into a MultiXact immediately
+ * afterwards.)
+ */
+ MultiXactIdSetOldestMember();
+ }
+ else
+ {
+ *lockmode = LockTupleExclusive;
+ mxact_status = MultiXactStatusUpdate;
+ key_intact = false;
+ }
+
+ /*
* Note: beyond this point, use oldtup not otid to refer to old tuple.
* otid may very well point at newtup->t_self, which we will overwrite
* with the new tuple's location, so there's great risk of confusion if we
@@ -2829,8 +3028,13 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
*/
l2:
+ checked_lockers = false;
+ locker_remains = false;
result = HeapTupleSatisfiesUpdate(oldtup.t_data, cid, buffer);
+ /* see below about the "no wait" case */
+ Assert(result != HeapTupleBeingUpdated || wait);
+
if (result == HeapTupleInvisible)
{
UnlockReleaseBuffer(buffer);
@@ -2838,11 +3042,26 @@ l2:
}
else if (result == HeapTupleBeingUpdated && wait)
{
- TransactionId xwait;
+ TransactionId xwait;
uint16 infomask;
+ bool can_continue = false;
+
+ checked_lockers = true;
+
+ /*
+ * XXX note that we don't consider the "no wait" case here. This
+ * isn't a problem currently because no caller uses that case, but it
+ * should be fixed if such a caller is introduced. It wasn't a problem
+ * previously because this code would always wait, but now that some
+ * tuple locks do not conflict with one of the lock modes we use, it is
+ * possible that this case is interesting to handle specially.
+ *
+ * This may cause failures with third-party code that calls heap_update
+ * directly.
+ */
/* must copy state data before unlocking buffer */
- xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
+ xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
infomask = oldtup.t_data->t_infomask;
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -2858,20 +3077,29 @@ l2:
*/
if (!have_tuple_lock)
{
- LockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ LockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
have_tuple_lock = true;
}
/*
- * Sleep until concurrent transaction ends. Note that we don't care
- * if the locker has an exclusive or shared lock, because we need
- * exclusive.
+ * Now we have to do something about the existing locker. If it's a
+ * multi, sleep on it; we might be awakened before it is completely
+ * gone (or even not sleep at all in some cases); we need to preserve
+ * it as locker, unless it is gone completely.
+ *
+ * If it's not a multi, we need to check for sleeping conditions before
+ * actually going to sleep. If the update doesn't conflict with the
+ * locks, we just continue without sleeping (but making sure it is
+ * preserved).
*/
-
if (infomask & HEAP_XMAX_IS_MULTI)
{
+ TransactionId update_xact;
+ int remain;
+
/* wait for multixact */
- MultiXactIdWait((MultiXactId) xwait);
+ MultiXactIdWait((MultiXactId) xwait, mxact_status, &remain,
+ infomask);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/*
@@ -2880,49 +3108,87 @@ l2:
* change, and start over if so.
*/
if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
xwait))
goto l2;
/*
- * You might think the multixact is necessarily done here, but not
- * so: it could have surviving members, namely our own xact or
- * other subxacts of this backend. It is legal for us to update
- * the tuple in either case, however (the latter case is
- * essentially a situation of upgrading our former shared lock to
- * exclusive). We don't bother changing the on-disk hint bits
- * since we are about to overwrite the xmax altogether.
+ * Note that the multixact may not be done by now. It could have
+ * surviving members; our own xact or other subxacts of this
+ * backend, and also any other concurrent transaction that locked
+ * the tuple with KeyShare if we only got LockTupleNoKeyExclusive. If this
+ * is the case, we have to be careful to mark the updated tuple
+ * with the surviving members in Xmax.
+ *
+ * Note that there could have been another update in the MultiXact.
+ * In that case, we need to check whether it committed or aborted.
+ * If it aborted we are safe to update it again; otherwise there is
+ * an update conflict, and we have to return HeapTupleUpdated
+ * below.
+ *
+ * In the LockTupleExclusive case, we still need to preserve the
+ * surviving members: those would include the tuple locks we had
+ * before this one, which are important to keep in case this
+ * subxact aborts.
*/
+ update_xact = InvalidTransactionId;
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
+ update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
+
+ /* there was no UPDATE in the MultiXact; or it aborted. */
+ if (!TransactionIdIsValid(update_xact) ||
+ TransactionIdDidAbort(update_xact))
+ can_continue = true;
+
+ locker_remains = remain != 0;
}
else
{
- /* wait for regular transaction to end */
- XactLockTableWait(xwait);
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
/*
- * xwait is done, but if xwait had just locked the tuple then some
- * other xact could update this tuple before we get to this point.
- * Check for xmax change, and start over if so.
+ * If it's just a key-share locker, and we're not changing the
+ * key columns, we don't need to wait for it to end; but we
+ * need to preserve it as locker.
*/
- if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleHeaderGetXmax(oldtup.t_data),
- xwait))
- goto l2;
+ if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
+ {
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
- /* Otherwise check if it committed or aborted */
- UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
+ /*
+ * recheck the locker; if someone else changed the tuple while we
+ * weren't looking, start over.
+ */
+ if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+ xwait))
+ goto l2;
+
+ can_continue = true;
+ locker_remains = true;
+ }
+ else
+ {
+ /* wait for regular transaction to end */
+ XactLockTableWait(xwait);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * xwait is done, but if xwait had just locked the tuple then some
+ * other xact could update this tuple before we get to this point.
+ * Check for xmax change, and start over if so.
+ */
+ if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+ xwait))
+ goto l2;
+
+ /* Otherwise check if it committed or aborted */
+ UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
+ if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
+ can_continue = true;
+ }
}
- /*
- * We may overwrite if previous xmax aborted, or if it committed but
- * only locked the tuple without updating it.
- */
- if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
- HEAP_IS_LOCKED))
- result = HeapTupleMayBeUpdated;
- else
- result = HeapTupleUpdated;
+ result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
}
if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
@@ -2939,17 +3205,18 @@ l2:
result == HeapTupleBeingUpdated);
Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
hufd->ctid = oldtup.t_data->t_ctid;
- hufd->xmax = HeapTupleHeaderGetXmax(oldtup.t_data);
+ hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
if (result == HeapTupleSelfUpdated)
hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
else
hufd->cmax = 0; /* for lack of an InvalidCommandId value */
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
- UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
bms_free(hot_attrs);
+ bms_free(key_attrs);
return result;
}
@@ -2958,7 +3225,7 @@ l2:
* visible while we were busy locking the buffer, or during some
* subsequent window during which we had it unlocked, we'll have to unlock
* and re-lock, to avoid holding the buffer lock across an I/O. That's a
- * bit unfortunate, esepecially since we'll now have to recheck whether
+ * bit unfortunate, especially since we'll now have to recheck whether
* the tuple has been locked or updated under us, but hopefully it won't
* happen very often.
*/
@@ -2991,12 +3258,54 @@ l2:
Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
}
+ /*
+ * If the tuple we're updating is locked, we need to preserve the locking
+ * info in the old tuple's Xmax. Prepare a new Xmax value for this.
+ */
+ compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
+ oldtup.t_data->t_infomask,
+ oldtup.t_data->t_infomask2,
+ xid, *lockmode, true,
+ &xmax_old_tuple, &infomask_old_tuple,
+ &infomask2_old_tuple);
+
+ /* And also prepare an Xmax value for the new copy of the tuple */
+ if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+ (checked_lockers && !locker_remains))
+ xmax_new_tuple = InvalidTransactionId;
+ else
+ xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
+
+ if (!TransactionIdIsValid(xmax_new_tuple))
+ {
+ infomask_new_tuple = HEAP_XMAX_INVALID;
+ infomask2_new_tuple = 0;
+ }
+ else
+ {
+ if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
+ {
+ GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
+ &infomask2_new_tuple);
+ }
+ else
+ {
+ infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
+ infomask2_new_tuple = 0;
+ }
+ }
+
+ /*
+ * Prepare the new tuple with the appropriate initial values of Xmin and
+ * Xmax, as well as initial infomask bits as computed above.
+ */
newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
- newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
HeapTupleHeaderSetXmin(newtup->t_data, xid);
HeapTupleHeaderSetCmin(newtup->t_data, cid);
- HeapTupleHeaderSetXmax(newtup->t_data, 0); /* for cleanliness */
+ newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
+ newtup->t_data->t_infomask2 |= infomask2_new_tuple;
+ HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
newtup->t_tableOid = RelationGetRelid(relation);
/*
@@ -3035,14 +3344,14 @@ l2:
if (need_toast || newtupsize > pagefree)
{
/* Clear obsolete visibility flags ... */
- oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
+ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
HeapTupleClearHotUpdated(&oldtup);
/* ... and store info about transaction updating this tuple */
- HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+ Assert(TransactionIdIsValid(xmax_old_tuple));
+ HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
+ oldtup.t_data->t_infomask |= infomask_old_tuple;
+ oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
/* temporarily make it look not-updated */
oldtup.t_data->t_ctid = oldtup.t_self;
@@ -3145,7 +3454,7 @@ l2:
* to do a HOT update. Check if any of the index columns have been
* changed. If not, then HOT update is possible.
*/
- if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
+ if (satisfies_hot)
use_hot_update = true;
}
else
@@ -3193,13 +3502,13 @@ l2:
if (!already_marked)
{
/* Clear obsolete visibility flags ... */
- oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
+ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
/* ... and store info about transaction updating this tuple */
- HeapTupleHeaderSetXmax(oldtup.t_data, xid);
+ Assert(TransactionIdIsValid(xmax_old_tuple));
+ HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
+ oldtup.t_data->t_infomask |= infomask_old_tuple;
+ oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
}
@@ -3229,8 +3538,8 @@ l2:
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
- XLogRecPtr recptr = log_heap_update(relation, buffer, oldtup.t_self,
- newbuf, heaptup,
+ XLogRecPtr recptr = log_heap_update(relation, buffer,
+ newbuf, &oldtup, heaptup,
all_visible_cleared,
all_visible_cleared_new);
@@ -3272,7 +3581,7 @@ l2:
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
- UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
pgstat_count_heap_update(relation, use_hot_update);
@@ -3287,13 +3596,14 @@ l2:
}
bms_free(hot_attrs);
+ bms_free(key_attrs);
return HeapTupleMayBeUpdated;
}
/*
* Check if the specified attribute's value is same in both given tuples.
- * Subroutine for HeapSatisfiesHOTUpdate.
+ * Subroutine for HeapSatisfiesHOTandKeyUpdate.
*/
static bool
heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
@@ -3327,7 +3637,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
/*
* Extract the corresponding values. XXX this is pretty inefficient if
- * there are many indexed columns. Should HeapSatisfiesHOTUpdate do a
+ * there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do a
* single heap_deform_tuple call on each tuple, instead? But that doesn't
* work for system columns ...
*/
@@ -3370,35 +3680,101 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
}
/*
- * Check if the old and new tuples represent a HOT-safe update. To be able
- * to do a HOT update, we must not have changed any columns used in index
- * definitions.
+ * Check which columns are being updated.
*
- * The set of attributes to be checked is passed in (we dare not try to
- * compute it while holding exclusive buffer lock...) NOTE that hot_attrs
- * is destructively modified! That is OK since this is invoked at most once
- * by heap_update().
+ * This simultaneously checks conditions for HOT updates and for FOR KEY
+ * SHARE updates. Since much of the time they will be checking very similar
+ * sets of columns, and doing the same tests on them, it makes sense to
+ * optimize and do them together.
*
- * Returns true if safe to do HOT update.
+ * We receive two bitmapsets comprising the two sets of columns we're
+ * interested in. Note these are destructively modified; that is OK since
+ * this is invoked at most once in heap_update.
+ *
+ * hot_result is set to TRUE if it's okay to do a HOT update (i.e. it does not
+ * modify indexed columns); key_result is set to TRUE if the update does not
+ * modify columns used in the key.
*/
-static bool
-HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
- HeapTuple oldtup, HeapTuple newtup)
+static void
+HeapSatisfiesHOTandKeyUpdate(Relation relation,
+ Bitmapset *hot_attrs, Bitmapset *key_attrs,
+ bool *satisfies_hot, bool *satisfies_key,
+ HeapTuple oldtup, HeapTuple newtup)
{
- int attrnum;
+ int next_hot_attnum;
+ int next_key_attnum;
+ bool hot_result = true;
+ bool key_result = true;
+ bool key_done = false;
+ bool hot_done = false;
+
+ next_hot_attnum = bms_first_member(hot_attrs);
+ if (next_hot_attnum == -1)
+ hot_done = true;
+ else
+ /* Adjust for system attributes */
+ next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
- while ((attrnum = bms_first_member(hot_attrs)) >= 0)
- {
+ next_key_attnum = bms_first_member(key_attrs);
+ if (next_key_attnum == -1)
+ key_done = true;
+ else
/* Adjust for system attributes */
- attrnum += FirstLowInvalidHeapAttributeNumber;
+ next_key_attnum += FirstLowInvalidHeapAttributeNumber;
- /* If the attribute value has changed, we can't do HOT update */
- if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
- oldtup, newtup))
- return false;
+ for (;;)
+ {
+ int check_now;
+ bool changed;
+
+ /* both bitmapsets are now empty */
+ if (key_done && hot_done)
+ break;
+
+ /* XXX there's probably an easier way ... */
+ if (hot_done)
+ check_now = next_key_attnum;
+ else if (key_done)
+ check_now = next_hot_attnum;
+ else
+ check_now = Min(next_hot_attnum, next_key_attnum);
+
+ changed = !heap_tuple_attr_equals(RelationGetDescr(relation),
+ check_now, oldtup, newtup);
+ if (changed)
+ {
+ if (check_now == next_hot_attnum)
+ hot_result = false;
+ if (check_now == next_key_attnum)
+ key_result = false;
+ }
+
+ /* if both are false now, we can stop checking */
+ if (!hot_result && !key_result)
+ break;
+
+ if (check_now == next_hot_attnum)
+ {
+ next_hot_attnum = bms_first_member(hot_attrs);
+ if (next_hot_attnum == -1)
+ hot_done = true;
+ else
+ /* Adjust for system attributes */
+ next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
+ }
+ if (check_now == next_key_attnum)
+ {
+ next_key_attnum = bms_first_member(key_attrs);
+ if (next_key_attnum == -1)
+ key_done = true;
+ else
+ /* Adjust for system attributes */
+ next_key_attnum += FirstLowInvalidHeapAttributeNumber;
+ }
}
- return true;
+ *satisfies_hot = hot_result;
+ *satisfies_key = key_result;
}
/*
@@ -3414,11 +3790,12 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
{
HTSU_Result result;
HeapUpdateFailureData hufd;
+ LockTupleMode lockmode;
result = heap_update(relation, otid, tup,
GetCurrentCommandId(true), InvalidSnapshot,
true /* wait for commit */,
- &hufd);
+ &hufd, &lockmode);
switch (result)
{
case HeapTupleSelfUpdated:
@@ -3440,6 +3817,28 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
}
}
+
+/*
+ * Return the MultiXactStatus corresponding to the given tuple lock mode.
+ */
+static MultiXactStatus
+get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
+{
+ MultiXactStatus retval;
+
+ if (is_update)
+ retval = tupleLockExtraInfo[mode].updstatus;
+ else
+ retval = tupleLockExtraInfo[mode].lockstatus;
+
+ if (retval == -1)
+ elog(ERROR, "invalid lock tuple mode %d/%s", mode,
+ is_update ? "true" : "false");
+
+ return retval;
+}
+
+
/*
* heap_lock_tuple - lock a tuple in shared or exclusive mode
*
@@ -3452,6 +3851,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
* tuple's cmax if lock is successful)
* mode: indicates if shared or exclusive tuple lock is desired
* nowait: if true, ereport rather than blocking if lock not available
+ * follow_updates: if true, follow the update chain to also lock descendant
+ * tuples.
*
* Output parameters:
* *tuple: all fields filled in
@@ -3464,61 +3865,30 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
* HeapTupleUpdated: lock failed because tuple updated by other xact
*
* In the failure cases, the routine fills *hufd with the tuple's t_ctid,
- * t_xmax, and t_cmax (the last only for HeapTupleSelfUpdated, since we
+ * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
+ * (the last only for HeapTupleSelfUpdated, since we
* cannot obtain cmax from a combocid generated by another transaction).
* See comments for struct HeapUpdateFailureData for additional info.
*
- *
- * NOTES: because the shared-memory lock table is of finite size, but users
- * could reasonably want to lock large numbers of tuples, we do not rely on
- * the standard lock manager to store tuple-level locks over the long term.
- * Instead, a tuple is marked as locked by setting the current transaction's
- * XID as its XMAX, and setting additional infomask bits to distinguish this
- * usage from the more normal case of having deleted the tuple. When
- * multiple transactions concurrently share-lock a tuple, the first locker's
- * XID is replaced in XMAX with a MultiTransactionId representing the set of
- * XIDs currently holding share-locks.
- *
- * When it is necessary to wait for a tuple-level lock to be released, the
- * basic delay is provided by XactLockTableWait or MultiXactIdWait on the
- * contents of the tuple's XMAX. However, that mechanism will release all
- * waiters concurrently, so there would be a race condition as to which
- * waiter gets the tuple, potentially leading to indefinite starvation of
- * some waiters. The possibility of share-locking makes the problem much
- * worse --- a steady stream of share-lockers can easily block an exclusive
- * locker forever. To provide more reliable semantics about who gets a
- * tuple-level lock first, we use the standard lock manager. The protocol
- * for waiting for a tuple-level lock is really
- * LockTuple()
- * XactLockTableWait()
- * mark tuple as locked by me
- * UnlockTuple()
- * When there are multiple waiters, arbitration of who is to get the lock next
- * is provided by LockTuple(). However, at most one tuple-level lock will
- * be held or awaited per backend at any time, so we don't risk overflow
- * of the lock table. Note that incoming share-lockers are required to
- * do LockTuple as well, if there is any conflict, to ensure that they don't
- * starve out waiting exclusive-lockers. However, if there is not any active
- * conflict for a tuple, we don't incur any extra overhead.
+ * See README.tuplock for a thorough explanation of this mechanism.
*/
HTSU_Result
heap_lock_tuple(Relation relation, HeapTuple tuple,
CommandId cid, LockTupleMode mode, bool nowait,
+ bool follow_updates,
Buffer *buffer, HeapUpdateFailureData *hufd)
{
HTSU_Result result;
ItemPointer tid = &(tuple->t_self);
ItemId lp;
Page page;
- TransactionId xid;
- TransactionId xmax;
- uint16 old_infomask;
- uint16 new_infomask;
- LOCKMODE tuple_lock_type;
+ TransactionId xid,
+ xmax;
+ uint16 old_infomask,
+ new_infomask,
+ new_infomask2;
bool have_tuple_lock = false;
- tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;
-
*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -3542,30 +3912,58 @@ l3:
{
TransactionId xwait;
uint16 infomask;
+ uint16 infomask2;
+ bool require_sleep;
+ ItemPointerData t_ctid;
/* must copy state data before unlocking buffer */
- xwait = HeapTupleHeaderGetXmax(tuple->t_data);
+ xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
infomask = tuple->t_data->t_infomask;
+ infomask2 = tuple->t_data->t_infomask2;
+ ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
/*
- * If we wish to acquire share lock, and the tuple is already
- * share-locked by a multixact that includes any subtransaction of the
- * current top transaction, then we effectively hold the desired lock
- * already. We *must* succeed without trying to take the tuple lock,
- * else we will deadlock against anyone waiting to acquire exclusive
- * lock. We don't need to make any state changes in this case.
+ * If any subtransaction of the current top transaction already holds a
+ * lock as strong or stronger than what we're requesting, we
+ * effectively hold the desired lock already. We *must* succeed
+ * without trying to take the tuple lock, else we will deadlock against
+ * anyone wanting to acquire a stronger lock.
*/
- if (mode == LockTupleShared &&
- (infomask & HEAP_XMAX_IS_MULTI) &&
- MultiXactIdIsCurrent((MultiXactId) xwait))
+ if (infomask & HEAP_XMAX_IS_MULTI)
{
- Assert(infomask & HEAP_XMAX_SHARED_LOCK);
- /* Probably can't hold tuple lock here, but may as well check */
- if (have_tuple_lock)
- UnlockTuple(relation, tid, tuple_lock_type);
- return HeapTupleMayBeUpdated;
+ int i;
+ int nmembers;
+ MultiXactMember *members;
+
+ /*
+ * We don't need to allow old multixacts here; if that had been the
+ * case, HeapTupleSatisfiesUpdate would have returned MayBeUpdated
+ * and we wouldn't be here.
+ */
+ nmembers = GetMultiXactIdMembers(xwait, &members, false);
+
+ for (i = 0; i < nmembers; i++)
+ {
+ if (TransactionIdIsCurrentTransactionId(members[i].xid))
+ {
+ LockTupleMode membermode;
+
+ membermode = TUPLOCK_from_mxstatus(members[i].status);
+
+ if (membermode >= mode)
+ {
+ if (have_tuple_lock)
+ UnlockTupleTuplock(relation, tid, mode);
+
+ pfree(members);
+ return HeapTupleMayBeUpdated;
+ }
+ }
+ }
+
+ pfree(members);
}
/*
@@ -3581,126 +3979,355 @@ l3:
{
if (nowait)
{
- if (!ConditionalLockTuple(relation, tid, tuple_lock_type))
+ if (!ConditionalLockTupleTuplock(relation, tid, mode))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
- errmsg("could not obtain lock on row in relation \"%s\"",
- RelationGetRelationName(relation))));
+ errmsg("could not obtain lock on row in relation \"%s\"",
+ RelationGetRelationName(relation))));
}
else
- LockTuple(relation, tid, tuple_lock_type);
+ LockTupleTuplock(relation, tid, mode);
have_tuple_lock = true;
}
- if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK))
+ /*
+ * Initially assume that we will have to wait for the locking
+ * transaction(s) to finish. We check various cases below in which
+ * this can be turned off.
+ */
+ require_sleep = true;
+ if (mode == LockTupleKeyShare)
{
/*
- * Acquiring sharelock when there's at least one sharelocker
- * already. We need not wait for him/them to complete.
- */
- LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
-
- /*
- * Make sure it's still a shared lock, else start over. (It's OK
- * if the ownership of the shared lock has changed, though.)
+ * If we're requesting KeyShare, and there's no update present, we
+ * don't need to wait. Even if there is an update, we can still
+ * continue if the key hasn't been modified.
+ *
+ * However, if there are updates, we need to walk the update chain
+ * to mark future versions of the row as locked, too. That way, if
+ * somebody deletes that future version, we're protected against
+ * the key going away. This locking of future versions could block
+ * momentarily, if a concurrent transaction is deleting a key; or
+ * it could return a value to the effect that the transaction
+ * deleting the key has already committed. So we do this before
+ * re-locking the buffer; otherwise this would be prone to
+ * deadlocks.
+ *
+ * Note that the TID we're locking was grabbed before we unlocked
+ * the buffer. For it to change while we're not looking, the other
+ * properties we're testing for below after re-locking the buffer
+ * would also change, in which case we would restart this loop
+ * above.
*/
- if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
- goto l3;
- }
- else if (infomask & HEAP_XMAX_IS_MULTI)
- {
- /* wait for multixact to end */
- if (nowait)
+ if (!(infomask2 & HEAP_KEYS_UPDATED))
{
- if (!ConditionalMultiXactIdWait((MultiXactId) xwait))
- ereport(ERROR,
- (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
- errmsg("could not obtain lock on row in relation \"%s\"",
- RelationGetRelationName(relation))));
- }
- else
- MultiXactIdWait((MultiXactId) xwait);
+ bool updated;
+
+ updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
+
+ /*
+ * If there are updates, follow the update chain; bail out
+ * if that cannot be done.
+ */
+ if (follow_updates && updated)
+ {
+ HTSU_Result res;
+
+ res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+ GetCurrentTransactionId(),
+ mode);
+ if (res != HeapTupleMayBeUpdated)
+ {
+ result = res;
+ /* recovery code expects to have buffer lock held */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ goto failed;
+ }
+ }
+
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * Make sure it's still an appropriate lock, else start over.
+ * Also, if it wasn't updated before we released the lock, but
+ * is updated now, we start over too; the reason is that we now
+ * need to follow the update chain to lock the new versions.
+ */
+ if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
+ ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
+ !updated))
+ goto l3;
- LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ /* Things look okay, so we can skip sleeping */
+ require_sleep = false;
+ /*
+ * Note we allow Xmax to change here; other updaters/lockers
+ * could have modified it before we grabbed the buffer lock.
+ * However, this is not a problem, because with the recheck we
+ * just did we ensure that they still don't conflict with the
+ * lock we want.
+ */
+ }
+ }
+ else if (mode == LockTupleShare)
+ {
/*
- * If xwait had just locked the tuple then some other xact could
- * update this tuple before we get to this point. Check for xmax
- * change, and start over if so.
+ * If we're requesting Share, we can similarly avoid sleeping if
+ * there's no update and no exclusive lock present.
*/
- if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
- xwait))
- goto l3;
+ if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+ !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
+ {
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ /*
+ * Make sure it's still an appropriate lock, else start over.
+ * See above about allowing xmax to change.
+ */
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
+ HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
+ goto l3;
+ require_sleep = false;
+ }
+ }
+ else if (mode == LockTupleNoKeyExclusive)
+ {
/*
- * You might think the multixact is necessarily done here, but not
- * so: it could have surviving members, namely our own xact or
- * other subxacts of this backend. It is legal for us to lock the
- * tuple in either case, however. We don't bother changing the
- * on-disk hint bits since we are about to overwrite the xmax
- * altogether.
+ * If we're requesting NoKeyExclusive, we might also be able to
+ * avoid sleeping; just ensure that there's no other lock type than
+ * KeyShare. Note that this is a bit more involved than just
+ * checking hint bits -- we need to expand the multixact to figure
+ * out lock modes for each one (unless there was only one such
+ * locker).
*/
+ if (infomask & HEAP_XMAX_IS_MULTI)
+ {
+ int nmembers;
+ MultiXactMember *members;
+
+ /*
+ * We don't need to allow old multixacts here; if that had been
+ * the case, HeapTupleSatisfiesUpdate would have returned
+ * MayBeUpdated and we wouldn't be here.
+ */
+ nmembers = GetMultiXactIdMembers(xwait, &members, false);
+
+ if (nmembers <= 0)
+ {
+ /*
+ * No need to keep the previous xmax here. This is unlikely
+ * to happen.
+ */
+ require_sleep = false;
+ }
+ else
+ {
+ int i;
+ bool allowed = true;
+
+ for (i = 0; i < nmembers; i++)
+ {
+ if (members[i].status != MultiXactStatusForKeyShare)
+ {
+ allowed = false;
+ break;
+ }
+ }
+ if (allowed)
+ {
+ /*
+ * if the xmax changed under us in the meantime, start
+ * over.
+ */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+ xwait))
+ {
+ pfree(members);
+ goto l3;
+ }
+ /* otherwise, we're good */
+ require_sleep = false;
+ }
+
+ pfree(members);
+ }
+ }
+ else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+ {
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* if the xmax changed in the meantime, start over */
+ if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+ xwait))
+ goto l3;
+ /* otherwise, we're good */
+ require_sleep = false;
+ }
}
- else
+
+ /*
+ * By here, we either have already acquired the buffer exclusive lock,
+ * or we must wait for the locking transaction or multixact; so below
+ * we ensure that we grab buffer lock after the sleep.
+ */
+
+ if (require_sleep)
{
- /* wait for regular transaction to end */
- if (nowait)
+ if (infomask & HEAP_XMAX_IS_MULTI)
{
- if (!ConditionalXactLockTableWait(xwait))
- ereport(ERROR,
- (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
- errmsg("could not obtain lock on row in relation \"%s\"",
- RelationGetRelationName(relation))));
+ MultiXactStatus status = get_mxact_status_for_lock(mode, false);
+
+ /* We only ever lock tuples, never update them */
+ if (status >= MultiXactStatusNoKeyUpdate)
+ elog(ERROR, "invalid lock mode in heap_lock_tuple");
+
+ /* wait for multixact to end */
+ if (nowait)
+ {
+ if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
+ status, NULL, infomask))
+ ereport(ERROR,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("could not obtain lock on row in relation \"%s\"",
+ RelationGetRelationName(relation))));
+ }
+ else
+ MultiXactIdWait((MultiXactId) xwait, status, NULL, infomask);
+
+ /* if there are updates, follow the update chain */
+ if (follow_updates &&
+ !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+ {
+ HTSU_Result res;
+
+ res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+ GetCurrentTransactionId(),
+ mode);
+ if (res != HeapTupleMayBeUpdated)
+ {
+ result = res;
+ /* recovery code expects to have buffer lock held */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ goto failed;
+ }
+ }
+
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * If xwait had just locked the tuple then some other xact
+ * could update this tuple before we get to this point. Check
+ * for xmax change, and start over if so.
+ */
+ if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+ xwait))
+ goto l3;
+
+ /*
+ * Of course, the multixact might not be done here: if we're
+ * requesting a light lock mode, other transactions with light
+ * locks could still be alive, as well as locks owned by our
+ * own xact or other subxacts of this backend. We need to
+ * preserve the surviving MultiXact members. Note that it
+ * isn't absolutely necessary in the latter case, but doing so
+ * is simpler.
+ */
}
else
- XactLockTableWait(xwait);
+ {
+ /* wait for regular transaction to end */
+ if (nowait)
+ {
+ if (!ConditionalXactLockTableWait(xwait))
+ ereport(ERROR,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("could not obtain lock on row in relation \"%s\"",
+ RelationGetRelationName(relation))));
+ }
+ else
+ XactLockTableWait(xwait);
- LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ /* if there are updates, follow the update chain */
+ if (follow_updates &&
+ !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+ {
+ HTSU_Result res;
+
+ res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+ GetCurrentTransactionId(),
+ mode);
+ if (res != HeapTupleMayBeUpdated)
+ {
+ result = res;
+ /* recovery code expects to have buffer lock held */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ goto failed;
+ }
+ }
- /*
- * xwait is done, but if xwait had just locked the tuple then some
- * other xact could update this tuple before we get to this point.
- * Check for xmax change, and start over if so.
- */
- if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
- xwait))
- goto l3;
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
- /* Otherwise check if it committed or aborted */
- UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
+ /*
+ * xwait is done, but if xwait had just locked the tuple then
+ * some other xact could update this tuple before we get to
+ * this point. Check for xmax change, and start over if so.
+ */
+ if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
+ !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
+ xwait))
+ goto l3;
+
+ /*
+ * Otherwise check if it committed or aborted. Note we cannot
+ * be here if the tuple was only locked by somebody who didn't
+ * conflict with us; that should have been handled above. So
+ * that transaction must necessarily be gone by now.
+ */
+ UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
+ }
}
+ /* By here, we're certain that we hold buffer exclusive lock again */
+
/*
* We may lock if previous xmax aborted, or if it committed but only
- * locked the tuple without updating it. The case where we didn't
- * wait because we are joining an existing shared lock is correctly
- * handled, too.
+ * locked the tuple without updating it; or if we didn't have to wait
+ * at all for whatever reason.
*/
- if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
- HEAP_IS_LOCKED))
+ if (!require_sleep ||
+ (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
+ HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
+ HeapTupleHeaderIsOnlyLocked(tuple->t_data))
result = HeapTupleMayBeUpdated;
else
result = HeapTupleUpdated;
}
+failed:
if (result != HeapTupleMayBeUpdated)
{
Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated);
Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
hufd->ctid = tuple->t_data->t_ctid;
- hufd->xmax = HeapTupleHeaderGetXmax(tuple->t_data);
+ hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
if (result == HeapTupleSelfUpdated)
hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
else
hufd->cmax = 0; /* for lack of an InvalidCommandId value */
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
if (have_tuple_lock)
- UnlockTuple(relation, tid, tuple_lock_type);
+ UnlockTupleTuplock(relation, tid, mode);
return result;
}
+ xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
+ old_infomask = tuple->t_data->t_infomask;
+
/*
* We might already hold the desired lock (or stronger), possibly under a
* different subtransaction of the current top transaction. If so, there
@@ -3709,113 +4336,48 @@ l3:
* for cases where it is a plain TransactionId.
*
* Note in particular that this covers the case where we already hold
- * exclusive lock on the tuple and the caller only wants shared lock. It
- * would certainly not do to give up the exclusive lock.
+ * exclusive lock on the tuple and the caller only wants key share or share
+ * lock. It would certainly not do to give up the exclusive lock.
*/
- xmax = HeapTupleHeaderGetXmax(tuple->t_data);
- old_infomask = tuple->t_data->t_infomask;
-
if (!(old_infomask & (HEAP_XMAX_INVALID |
HEAP_XMAX_COMMITTED |
HEAP_XMAX_IS_MULTI)) &&
- (mode == LockTupleShared ?
- (old_infomask & HEAP_IS_LOCKED) :
- (old_infomask & HEAP_XMAX_EXCL_LOCK)) &&
+ (mode == LockTupleKeyShare ?
+ (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask) ||
+ HEAP_XMAX_IS_SHR_LOCKED(old_infomask) ||
+ HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) :
+ mode == LockTupleShare ?
+ (HEAP_XMAX_IS_SHR_LOCKED(old_infomask) ||
+ HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) :
+ (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))) &&
TransactionIdIsCurrentTransactionId(xmax))
{
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
/* Probably can't hold tuple lock here, but may as well check */
if (have_tuple_lock)
- UnlockTuple(relation, tid, tuple_lock_type);
+ UnlockTupleTuplock(relation, tid, mode);
return HeapTupleMayBeUpdated;
}
/*
+ * If this is the first possibly-multixact-able operation in the
+ * current transaction, set my per-backend OldestMemberMXactId setting.
+ * We can be certain that the transaction will never become a member of
+ * any older MultiXactIds than that. (We have to do this even if we
+ * end up just using our own TransactionId below, since some other
+ * backend could incorporate our XID into a MultiXact immediately
+ * afterwards.)
+ */
+ MultiXactIdSetOldestMember();
+
+ /*
* Compute the new xmax and infomask to store into the tuple. Note we do
* not modify the tuple just yet, because that would leave it in the wrong
* state if multixact.c elogs.
*/
- xid = GetCurrentTransactionId();
-
- new_infomask = old_infomask & ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
-
- if (mode == LockTupleShared)
- {
- /*
- * If this is the first acquisition of a shared lock in the current
- * transaction, set my per-backend OldestMemberMXactId setting. We can
- * be certain that the transaction will never become a member of any
- * older MultiXactIds than that. (We have to do this even if we end
- * up just using our own TransactionId below, since some other backend
- * could incorporate our XID into a MultiXact immediately afterwards.)
- */
- MultiXactIdSetOldestMember();
-
- new_infomask |= HEAP_XMAX_SHARED_LOCK;
-
- /*
- * Check to see if we need a MultiXactId because there are multiple
- * lockers.
- *
- * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if
- * the xmax was a MultiXactId but it was not running anymore. There is
- * a race condition, which is that the MultiXactId may have finished
- * since then, but that uncommon case is handled within
- * MultiXactIdExpand.
- *
- * There is a similar race condition possible when the old xmax was a
- * regular TransactionId. We test TransactionIdIsInProgress again
- * just to narrow the window, but it's still possible to end up
- * creating an unnecessary MultiXactId. Fortunately this is harmless.
- */
- if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED)))
- {
- if (old_infomask & HEAP_XMAX_IS_MULTI)
- {
- /*
- * If the XMAX is already a MultiXactId, then we need to
- * expand it to include our own TransactionId.
- */
- xid = MultiXactIdExpand((MultiXactId) xmax, xid);
- new_infomask |= HEAP_XMAX_IS_MULTI;
- }
- else if (TransactionIdIsInProgress(xmax))
- {
- /*
- * If the XMAX is a valid TransactionId, then we need to
- * create a new MultiXactId that includes both the old locker
- * and our own TransactionId.
- */
- xid = MultiXactIdCreate(xmax, xid);
- new_infomask |= HEAP_XMAX_IS_MULTI;
- }
- else
- {
- /*
- * Can get here iff HeapTupleSatisfiesUpdate saw the old xmax
- * as running, but it finished before
- * TransactionIdIsInProgress() got to run. Treat it like
- * there's no locker in the tuple.
- */
- }
- }
- else
- {
- /*
- * There was no previous locker, so just insert our own
- * TransactionId.
- */
- }
- }
- else
- {
- /* We want an exclusive lock on the tuple */
- new_infomask |= HEAP_XMAX_EXCL_LOCK;
- }
+ compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
+ GetCurrentTransactionId(), mode, false,
+ &xid, &new_infomask, &new_infomask2);
START_CRIT_SECTION();
@@ -3823,13 +4385,29 @@ l3:
* Store transaction information of xact locking the tuple.
*
* Note: Cmax is meaningless in this context, so don't set it; this avoids
- * possibly generating a useless combo CID.
+ * possibly generating a useless combo CID. Moreover, if we're locking a
+ * previously updated tuple, it's important to preserve the Cmax.
+ *
+ * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
+ * we would break the HOT chain.
*/
- tuple->t_data->t_infomask = new_infomask;
- HeapTupleHeaderClearHotUpdated(tuple->t_data);
+ tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
+ tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+ tuple->t_data->t_infomask |= new_infomask;
+ tuple->t_data->t_infomask2 |= new_infomask2;
+ if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
+ HeapTupleHeaderClearHotUpdated(tuple->t_data);
HeapTupleHeaderSetXmax(tuple->t_data, xid);
- /* Make sure there is no forward chain link in t_ctid */
- tuple->t_data->t_ctid = *tid;
+
+ /*
+ * Make sure there is no forward chain link in t_ctid. Note that in the
+ * cases where the tuple has been updated, we must not overwrite t_ctid,
+ * because it was set by the updater. Moreover, if the tuple has been
+ * updated, we need to follow the update chain to lock the new versions
+ * of the tuple as well.
+ */
+ if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
+ tuple->t_data->t_ctid = *tid;
MarkBufferDirty(*buffer);
@@ -3854,8 +4432,8 @@ l3:
xlrec.target.node = relation->rd_node;
xlrec.target.tid = tuple->t_self;
xlrec.locking_xid = xid;
- xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0);
- xlrec.shared_lock = (mode == LockTupleShared);
+ xlrec.infobits_set = compute_infobits(new_infomask,
+ tuple->t_data->t_infomask2);
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapLock;
rdata[0].buffer = InvalidBuffer;
@@ -3887,8 +4465,469 @@ l3:
* release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
- UnlockTuple(relation, tid, tuple_lock_type);
+ UnlockTupleTuplock(relation, tid, mode);
+
+ return HeapTupleMayBeUpdated;
+}
+
+
+/*
+ * Given an original set of Xmax and infomask, and a transaction (identified by
+ * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
+ * corresponding infomasks to use on the tuple.
+ *
+ * Note that this might have side effects such as creating a new MultiXactId.
+ *
+ * Most callers will have called HeapTupleSatisfiesUpdate before this function;
+ * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
+ * but it was not running anymore. There is a race condition, which is that the
+ * MultiXactId may have finished since then, but that uncommon case is handled
+ * either here, or within MultiXactIdExpand.
+ *
+ * There is a similar race condition possible when the old xmax was a regular
+ * TransactionId. We test TransactionIdIsInProgress again just to narrow the
+ * window, but it's still possible to end up creating an unnecessary
+ * MultiXactId. Fortunately this is harmless.
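+ *
+ * As a concrete example: if the old xmax is a plain, in-progress
+ * TransactionId holding a FOR KEY SHARE lock and another transaction
+ * requests FOR KEY SHARE too, we create a new MultiXactId comprising both
+ * lockers, and the resulting infomask has HEAP_XMAX_IS_MULTI set.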
+ */
+static void
+compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
+ uint16 old_infomask2, TransactionId add_to_xmax,
+ LockTupleMode mode, bool is_update,
+ TransactionId *result_xmax, uint16 *result_infomask,
+ uint16 *result_infomask2)
+{
+ TransactionId new_xmax;
+ uint16 new_infomask,
+ new_infomask2;
+
+l5:
+ new_infomask = 0;
+ new_infomask2 = 0;
+ if (old_infomask & HEAP_XMAX_INVALID)
+ {
+ /*
+ * No previous locker; we just insert our own TransactionId.
+ */
+ if (is_update)
+ {
+ new_xmax = add_to_xmax;
+ if (mode == LockTupleExclusive)
+ new_infomask2 |= HEAP_KEYS_UPDATED;
+ }
+ else
+ {
+ new_infomask |= HEAP_XMAX_LOCK_ONLY;
+ switch (mode)
+ {
+ case LockTupleKeyShare:
+ new_xmax = add_to_xmax;
+ new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
+ break;
+ case LockTupleShare:
+ new_xmax = add_to_xmax;
+ new_infomask |= HEAP_XMAX_SHR_LOCK;
+ break;
+ case LockTupleNoKeyExclusive:
+ new_xmax = add_to_xmax;
+ new_infomask |= HEAP_XMAX_EXCL_LOCK;
+ break;
+ case LockTupleExclusive:
+ new_xmax = add_to_xmax;
+ new_infomask |= HEAP_XMAX_EXCL_LOCK;
+ new_infomask2 |= HEAP_KEYS_UPDATED;
+ break;
+ default:
+ new_xmax = InvalidTransactionId; /* silence compiler */
+ elog(ERROR, "invalid lock mode");
+ }
+ }
+ }
+ else if (old_infomask & HEAP_XMAX_IS_MULTI)
+ {
+ MultiXactStatus new_status;
+
+ /*
+ * Currently we don't allow XMAX_COMMITTED to be set for multis,
+ * so cross-check.
+ */
+ Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
+
+ /*
+ * A multixact together with LOCK_ONLY set but neither lock bit set
+ * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
+ * anymore. This check is critical for databases upgraded by
+ * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
+ * that such multis are never passed.
+ */
+ if (!(old_infomask & HEAP_LOCK_MASK) &&
+ HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
+ {
+ old_infomask &= ~HEAP_XMAX_IS_MULTI;
+ old_infomask |= HEAP_XMAX_INVALID;
+ goto l5;
+ }
+
+ /*
+ * If the XMAX is already a MultiXactId, then we need to expand it to
+ * include add_to_xmax; but if all the members were lockers and are all
+ * gone, we can do away with the IS_MULTI bit and just set add_to_xmax
+ * as the only locker/updater. If all lockers are gone and we have an
+ * updater that aborted, we can also do without a multi.
+ *
+ * The cost of the GetMultiXactIdMembers call would otherwise be
+ * paid by MultiXactIdExpand, so this check incurs no extra work
+ * anyhow.
+ */
+ if (!MultiXactIdIsRunning(xmax))
+ {
+ if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
+ TransactionIdDidAbort(MultiXactIdGetUpdateXid(xmax,
+ old_infomask)))
+ {
+ /*
+ * Reset these bits and restart; otherwise fall through to
+ * create a new multi below.
+ */
+ old_infomask &= ~HEAP_XMAX_IS_MULTI;
+ old_infomask |= HEAP_XMAX_INVALID;
+ goto l5;
+ }
+ }
+
+ new_status = get_mxact_status_for_lock(mode, is_update);
+
+ new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
+ new_status);
+ GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+ }
+ else if (old_infomask & HEAP_XMAX_COMMITTED)
+ {
+ /*
+ * It's a committed update, so we must preserve it as the updater of
+ * the tuple.
+ */
+ MultiXactStatus status;
+ MultiXactStatus new_status;
+
+ if (old_infomask2 & HEAP_KEYS_UPDATED)
+ status = MultiXactStatusUpdate;
+ else
+ status = MultiXactStatusNoKeyUpdate;
+
+ new_status = get_mxact_status_for_lock(mode, is_update);
+ /*
+ * since it's not running, it's obviously impossible for the old
+ * updater to be identical to the current one, so we need not check
+ * for that case as we do in the block above.
+ */
+ new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
+ GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+ }
+ else if (TransactionIdIsInProgress(xmax))
+ {
+ /*
+ * If the XMAX is a valid, in-progress TransactionId, then we need to
+ * create a new MultiXactId that includes both the old locker or
+ * updater and our own TransactionId.
+ */
+ MultiXactStatus status;
+ MultiXactStatus new_status;
+
+ if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
+ {
+ if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
+ status = MultiXactStatusForKeyShare;
+ else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
+ status = MultiXactStatusForShare;
+ else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
+ {
+ if (old_infomask2 & HEAP_KEYS_UPDATED)
+ status = MultiXactStatusForUpdate;
+ else
+ status = MultiXactStatusForNoKeyUpdate;
+ }
+ else
+ {
+ /*
+ * LOCK_ONLY can be present alone only when a page has been
+ * upgraded by pg_upgrade. But in that case,
+ * TransactionIdIsInProgress() should have returned false. We
+ * assume it's no longer locked in this case.
+ */
+ elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
+ old_infomask |= HEAP_XMAX_INVALID;
+ old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
+ goto l5;
+ }
+ }
+ else
+ {
+ /* it's an update, but which kind? */
+ if (old_infomask2 & HEAP_KEYS_UPDATED)
+ status = MultiXactStatusUpdate;
+ else
+ status = MultiXactStatusNoKeyUpdate;
+ }
+
+ new_status = get_mxact_status_for_lock(mode, is_update);
+
+ /*
+ * If the existing lock mode is identical to or weaker than the new
+ * one, we can act as though there is no existing lock, so set
+ * XMAX_INVALID and restart.
+ */
+ if (xmax == add_to_xmax)
+ {
+ LockTupleMode old_mode = TUPLOCK_from_mxstatus(status);
+ bool old_isupd = ISUPDATE_from_mxstatus(status);
+
+ /*
+ * We can do this if the new LockTupleMode is at least as strong as
+ * the old one; also, if the old xmax was an update, the new
+ * operation must be an update as well, so that the update is not
+ * lost.
+ */
+ if ((mode >= old_mode) && (is_update || !old_isupd))
+ {
+ /*
+ * Note that the infomask might contain some other dirty bits.
+ * However, since the new infomask is built up from zero and the
+ * case that checks HEAP_XMAX_INVALID is the very first one above,
+ * no extra cleanup of the infomask is needed here.
+ */
+ old_infomask |= HEAP_XMAX_INVALID;
+ goto l5;
+ }
+ }
+ new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
+ GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+ }
+ else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
+ TransactionIdDidCommit(xmax))
+ {
+ /*
+ * It's a committed update, so we must preserve it as the updater of
+ * the tuple.
+ */
+ MultiXactStatus status;
+ MultiXactStatus new_status;
+
+ if (old_infomask2 & HEAP_KEYS_UPDATED)
+ status = MultiXactStatusUpdate;
+ else
+ status = MultiXactStatusNoKeyUpdate;
+
+ new_status = get_mxact_status_for_lock(mode, is_update);
+ /*
+ * since it's not running, it's obviously impossible for the old
+ * updater to be identical to the current one, so we need not check
+ * for that case as we do in the block above.
+ */
+ new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
+ GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
+ }
+ else
+ {
+ /*
+ * Can get here iff the locking/updating transaction was running when
+ * the infomask was extracted from the tuple, but finished before
+ * TransactionIdIsInProgress got to run. Deal with it as if there was
+ * no locker at all in the first place.
+ */
+ old_infomask |= HEAP_XMAX_INVALID;
+ goto l5;
+ }
+
+ *result_infomask = new_infomask;
+ *result_infomask2 = new_infomask2;
+ *result_xmax = new_xmax;
+}
+
+
+/*
+ * Recursive part of heap_lock_updated_tuple
+ *
+ * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
+ * xid with the given mode; if this tuple is updated, recurse to lock the new
+ * version as well.
+ */
+static HTSU_Result
+heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
+ LockTupleMode mode)
+{
+ ItemPointerData tupid;
+ HeapTupleData mytup;
+ Buffer buf;
+ uint16 new_infomask,
+ new_infomask2,
+ old_infomask;
+ TransactionId xmax,
+ new_xmax;
+
+ ItemPointerCopy(tid, &tupid);
+ for (;;)
+ {
+ new_infomask = 0;
+ new_xmax = InvalidTransactionId;
+ ItemPointerCopy(&tupid, &(mytup.t_self));
+
+ if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
+ elog(ERROR, "unable to fetch updated version of tuple");
+
+l4:
+ CHECK_FOR_INTERRUPTS();
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ old_infomask = mytup.t_data->t_infomask;
+ xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
+
+ /*
+ * If this tuple is updated and the key has been modified (or deleted),
+ * what we do depends on the status of the updating transaction: if
+ * it's live, we sleep until it finishes; if it has committed, we have
+ * to fail (i.e. return HeapTupleUpdated); if it aborted, we ignore it.
+ * For updates that didn't touch the key, we can just plough ahead.
+ */
+ if (!(old_infomask & HEAP_XMAX_INVALID) &&
+ (mytup.t_data->t_infomask2 & HEAP_KEYS_UPDATED))
+ {
+ TransactionId update_xid;
+
+ /*
+ * Note: we *must* check TransactionIdIsInProgress before
+ * TransactionIdDidAbort/Commit; see comment at top of tqual.c for
+ * an explanation.
+ */
+ update_xid = HeapTupleHeaderGetUpdateXid(mytup.t_data);
+ if (TransactionIdIsCurrentTransactionId(update_xid))
+ {
+ UnlockReleaseBuffer(buf);
+ return HeapTupleSelfUpdated;
+ }
+ else if (TransactionIdIsInProgress(update_xid))
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ /* No LockTupleTuplock here -- see heap_lock_updated_tuple */
+ XactLockTableWait(update_xid);
+ goto l4;
+ }
+ else if (TransactionIdDidAbort(update_xid))
+ ; /* okay to proceed */
+ else if (TransactionIdDidCommit(update_xid))
+ {
+ UnlockReleaseBuffer(buf);
+ return HeapTupleUpdated;
+ }
+ }
+
+ /* compute the new Xmax and infomask values for the tuple ... */
+ compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
+ xid, mode, false,
+ &new_xmax, &new_infomask, &new_infomask2);
+
+ START_CRIT_SECTION();
+
+ /* ... and set them */
+ HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
+ mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
+ mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+ mytup.t_data->t_infomask |= new_infomask;
+ mytup.t_data->t_infomask2 |= new_infomask2;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_heap_lock_updated xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata[2];
+ Page page = BufferGetPage(buf);
+
+ xlrec.target.node = rel->rd_node;
+ xlrec.target.tid = mytup.t_self;
+ xlrec.xmax = new_xmax;
+ xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
+
+ rdata[0].data = (char *) &xlrec;
+ rdata[0].len = SizeOfHeapLockUpdated;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+
+ rdata[1].data = NULL;
+ rdata[1].len = 0;
+ rdata[1].buffer = buf;
+ rdata[1].buffer_std = true;
+ rdata[1].next = NULL;
+
+ recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED, rdata);
+
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+
+ /* if we find the end of the update chain, we're done. */
+ if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
+ ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
+ HeapTupleHeaderIsOnlyLocked(mytup.t_data))
+ {
+ UnlockReleaseBuffer(buf);
+ return HeapTupleMayBeUpdated;
+ }
+
+ /* tail recursion */
+ ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
+ UnlockReleaseBuffer(buf);
+ }
+}
+
+/*
+ * heap_lock_updated_tuple
+ * Follow update chain when locking an updated tuple, acquiring locks (row
+ * marks) on the updated versions.
+ *
+ * The initial tuple is assumed to be already locked.
+ *
+ * This function doesn't check visibility; it just unconditionally marks the
+ * tuple(s) as locked. If any tuple in the updated chain is being deleted
+ * concurrently (or updated with the key being modified), sleep until the
+ * transaction doing it is finished.
+ *
+ * Note that we don't acquire heavyweight tuple locks on the tuples we walk
+ * when we have to wait for other transactions to release them, as opposed to
+ * what heap_lock_tuple does. The reason is that having more than one
+ * transaction walking the chain is probably uncommon enough that risk of
+ * starvation is not likely: one of the preconditions for being here is that
+ * the snapshot in use predates the update that created this tuple (because we
+ * started at an earlier version of the tuple), but at the same time such a
+ * transaction cannot be using repeatable read or serializable isolation
+ * levels, because that would lead to a serializability failure.
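+ *
+ * For example, if a row is updated without changing its key columns and a
+ * foreign key check then takes FOR KEY SHARE on the old version, the newer
+ * versions must be marked as locked too; otherwise a later key update or
+ * delete of those versions would not be blocked by the lock.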
+ */
+static HTSU_Result
+heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
+ TransactionId xid, LockTupleMode mode)
+{
+ if (!ItemPointerEquals(&tuple->t_self, ctid))
+ {
+ /*
+ * If this is the first possibly-multixact-able operation in the
+ * current transaction, set my per-backend OldestMemberMXactId setting.
+ * We can be certain that the transaction will never become a member of
+ * any older MultiXactIds than that. (We have to do this even if we
+ * end up just using our own TransactionId below, since some other
+ * backend could incorporate our XID into a MultiXact immediately
+ * afterwards.)
+ */
+ MultiXactIdSetOldestMember();
+
+ return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+ }
+
+ /* nothing to lock */
return HeapTupleMayBeUpdated;
}
@@ -4010,6 +5049,9 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
* because this function is applied during WAL recovery, when we don't have
* access to any such state, and can't depend on the hint bits to be set.)
*
+ * Similarly, cutoff_multi must be less than or equal to the smallest
+ * MultiXactId used by any transaction currently open.
+ *
* If the tuple is in a shared buffer, caller must hold an exclusive lock on
* that buffer.
*
@@ -4023,7 +5065,8 @@ heap_inplace_update(Relation relation, HeapTuple tuple)
* infomask bits.
*/
bool
-heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid)
+heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
+ MultiXactId cutoff_multi)
{
bool changed = false;
TransactionId xid;
@@ -4043,43 +5086,29 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid)
changed = true;
}
- if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+ /*
+ * Note that this code handles IS_MULTI Xmax values, too, but only to
+ * reset the whole Xmax when the MultiXactId precedes the multixact
+ * freeze cutoff; it doesn't remove dead members of a very old multixact.
+ */
+ xid = HeapTupleHeaderGetRawXmax(tuple);
+ if (TransactionIdIsNormal(xid) &&
+ (((!(tuple->t_infomask & HEAP_XMAX_IS_MULTI) &&
+ TransactionIdPrecedes(xid, cutoff_xid))) ||
+ MultiXactIdPrecedes(xid, cutoff_multi)))
{
- xid = HeapTupleHeaderGetXmax(tuple);
- if (TransactionIdIsNormal(xid) &&
- TransactionIdPrecedes(xid, cutoff_xid))
- {
- HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
+ HeapTupleHeaderSetXmax(tuple, InvalidTransactionId);
- /*
- * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED
- * + LOCKED. Normalize to INVALID just to be sure no one gets
- * confused.
- */
- tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
- tuple->t_infomask |= HEAP_XMAX_INVALID;
- HeapTupleHeaderClearHotUpdated(tuple);
- changed = true;
- }
- }
- else
- {
- /*----------
- * XXX perhaps someday we should zero out very old MultiXactIds here?
- *
- * The only way a stale MultiXactId could pose a problem is if a
- * tuple, having once been multiply-share-locked, is not touched by
- * any vacuum or attempted lock or deletion for just over 4G MultiXact
- * creations, and then in the probably-narrow window where its xmax
- * is again a live MultiXactId, someone tries to lock or delete it.
- * Even then, another share-lock attempt would work fine. An
- * exclusive-lock or delete attempt would face unexpected delay, or
- * in the very worst case get a deadlock error. This seems an
- * extremely low-probability scenario with minimal downside even if
- * it does happen, so for now we don't do the extra bookkeeping that
- * would be needed to clean out MultiXactIds.
- *----------
+ /*
+ * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED
+ * + LOCKED. Normalize to INVALID just to be sure no one gets
+ * confused. Also get rid of the HEAP_KEYS_UPDATED bit.
*/
+ tuple->t_infomask &= ~HEAP_XMAX_BITS;
+ tuple->t_infomask |= HEAP_XMAX_INVALID;
+ HeapTupleHeaderClearHotUpdated(tuple);
+ tuple->t_infomask2 &= ~HEAP_KEYS_UPDATED;
+ changed = true;
}
/*
@@ -4116,17 +5145,268 @@ heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid)
}
/*
+ * For a given MultiXactId, return the hint bits that should be set in the
+ * tuple's infomask.
+ *
+ * Normally this should be called for a multixact that was just created, and
+ * is thus still in our local cache, making the GetMembers call fast.
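+ *
+ * For instance, a multi whose members are a FOR KEY SHARE locker and a
+ * no-key updater yields HEAP_XMAX_IS_MULTI | HEAP_XMAX_KEYSHR_LOCK |
+ * HEAP_XMAX_EXCL_LOCK, with HEAP_XMAX_LOCK_ONLY unset because one member
+ * is an updater.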
+ */
+static void
+GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
+ uint16 *new_infomask2)
+{
+ int nmembers;
+ MultiXactMember *members;
+ int i;
+ uint16 bits = HEAP_XMAX_IS_MULTI;
+ uint16 bits2 = 0;
+ bool has_update = false;
+
+ /*
+ * We only use this in multis we just created, so they cannot be values
+ * pre-pg_upgrade.
+ */
+ nmembers = GetMultiXactIdMembers(multi, &members, false);
+
+ for (i = 0; i < nmembers; i++)
+ {
+ switch (members[i].status)
+ {
+ case MultiXactStatusForKeyShare:
+ bits |= HEAP_XMAX_KEYSHR_LOCK;
+ break;
+ case MultiXactStatusForShare:
+ bits |= HEAP_XMAX_SHR_LOCK;
+ break;
+ case MultiXactStatusForNoKeyUpdate:
+ bits |= HEAP_XMAX_EXCL_LOCK;
+ break;
+ case MultiXactStatusForUpdate:
+ bits |= HEAP_XMAX_EXCL_LOCK;
+ bits2 |= HEAP_KEYS_UPDATED;
+ break;
+ case MultiXactStatusNoKeyUpdate:
+ bits |= HEAP_XMAX_EXCL_LOCK;
+ has_update = true;
+ break;
+ case MultiXactStatusUpdate:
+ bits |= HEAP_XMAX_EXCL_LOCK;
+ bits2 |= HEAP_KEYS_UPDATED;
+ has_update = true;
+ break;
+ }
+ }
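+
+ /* A multi with no updating member merely locks the tuple. */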
+ if (!has_update)
+ bits |= HEAP_XMAX_LOCK_ONLY;
+
+ if (nmembers > 0)
+ pfree(members);
+
+ *new_infomask = bits;
+ *new_infomask2 = bits2;
+}
+
+/*
+ * MultiXactIdGetUpdateXid
+ *
+ * Given a multixact Xmax and corresponding infomask, which does not have the
+ * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
+ * transaction.
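+ *
+ * Note that the result can be InvalidTransactionId if the only updating
+ * member of the multi has aborted, since aborted updaters are skipped.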
+ */
+static TransactionId
+MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
+{
+ TransactionId update_xact = InvalidTransactionId;
+ MultiXactMember *members;
+ int nmembers;
+
+ Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
+ Assert(t_infomask & HEAP_XMAX_IS_MULTI);
+
+ /*
+ * Since we know the LOCK_ONLY bit is not set, this cannot be a
+ * multi from pre-pg_upgrade.
+ */
+ nmembers = GetMultiXactIdMembers(xmax, &members, false);
+
+ if (nmembers > 0)
+ {
+ int i;
+
+ for (i = 0; i < nmembers; i++)
+ {
+ /* Ignore lockers */
+ if (members[i].status == MultiXactStatusForKeyShare ||
+ members[i].status == MultiXactStatusForShare ||
+ members[i].status == MultiXactStatusForNoKeyUpdate ||
+ members[i].status == MultiXactStatusForUpdate)
+ continue;
+
+ /* ignore aborted transactions */
+ if (TransactionIdDidAbort(members[i].xid))
+ continue;
+ /* there should be at most one non-aborted updater */
+ Assert(update_xact == InvalidTransactionId);
+ Assert(members[i].status == MultiXactStatusNoKeyUpdate ||
+ members[i].status == MultiXactStatusUpdate);
+ update_xact = members[i].xid;
+#ifndef USE_ASSERT_CHECKING
+ /*
+ * In an assert-enabled build we walk the whole array to ensure
+ * there's no other updater; otherwise we can stop at the first one.
+ */
+ break;
+#endif
+ }
+
+ pfree(members);
+ }
+
+ return update_xact;
+}
+
+/*
+ * HeapTupleGetUpdateXid
+ * As above, but use a HeapTupleHeader
+ *
+ * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
+ * checking the hint bits.
+ */
+TransactionId
+HeapTupleGetUpdateXid(HeapTupleHeader tuple)
+{
+ return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
+ tuple->t_infomask);
+}
+
+/*
+ * Do_MultiXactIdWait
+ * Actual implementation for the two functions below.
+ *
+ * We do this by sleeping on each member using XactLockTableWait. Any
+ * members that belong to the current backend are *not* waited for, however;
+ * this would not merely be useless but would lead to Assert failure inside
+ * XactLockTableWait. By the time this returns, it is certain that all
+ * transactions *of other backends* that were members of the MultiXactId
+ * that conflict with the requested status are dead (and no new ones can have
+ * been added, since it is not legal to add members to an existing
+ * MultiXactId).
+ *
+ * But by the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * Note that in case we return false, the number of remaining members is
+ * not to be trusted.
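+ *
+ * For example, when the requested status is a no-key update, members that
+ * merely hold FOR KEY SHARE locks do not conflict and are not waited for;
+ * only members whose lock mode conflicts with the request cause a sleep.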
+ */
+static bool
+Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+ int *remaining, uint16 infomask, bool nowait)
+{
+ bool allow_old;
+ bool result = true;
+ MultiXactMember *members;
+ int nmembers;
+ int remain = 0;
+
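+ /*
+ * Pre-pg_upgrade multis, identifiable by LOCK_ONLY being set with no
+ * lock bits present, are the only values allowed to predate the valid
+ * multixact range.
+ */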
+ allow_old = !(infomask & HEAP_LOCK_MASK) && HEAP_XMAX_IS_LOCKED_ONLY(infomask);
+ nmembers = GetMultiXactIdMembers(multi, &members, allow_old);
+
+ if (nmembers >= 0)
+ {
+ int i;
+
+ for (i = 0; i < nmembers; i++)
+ {
+ TransactionId memxid = members[i].xid;
+ MultiXactStatus memstatus = members[i].status;
+
+ if (TransactionIdIsCurrentTransactionId(memxid))
+ {
+ remain++;
+ continue;
+ }
+
+ if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
+ LOCKMODE_from_mxstatus(status)))
+ {
+ if (remaining && TransactionIdIsInProgress(memxid))
+ remain++;
+ continue;
+ }
+
+ /*
+ * This member conflicts with our multi, so we have to sleep (or
+ * return failure, if asked to avoid waiting.)
+ */
+ if (nowait)
+ {
+ result = ConditionalXactLockTableWait(memxid);
+ if (!result)
+ break;
+ }
+ else
+ XactLockTableWait(memxid);
+ }
+
+ pfree(members);
+ }
+
+ if (remaining)
+ *remaining = remain;
+
+ return result;
+}
+
+/*
+ * MultiXactIdWait
+ * Sleep on a MultiXactId.
+ *
+ * By the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * We return (in *remaining, if not NULL) the number of members that are still
+ * running, including any (non-aborted) subtransactions of our own transaction.
+ */
+static void
+MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+ int *remaining, uint16 infomask)
+{
+ Do_MultiXactIdWait(multi, status, remaining, infomask, false);
+}
+
+/*
+ * ConditionalMultiXactIdWait
+ * As above, but only lock if we can get the lock without blocking.
+ *
+ * By the time we finish sleeping, someone else may have changed the Xmax
+ * of the containing tuple, so the caller needs to iterate on us somehow.
+ *
+ * Returns true if the multixact is now all gone; false if some member
+ * transactions might still be running.
+ *
+ * We return (in *remaining, if not NULL) the number of members that are still
+ * running, including any (non-aborted) subtransactions of our own transaction.
+ */
+static bool
+ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
+ int *remaining, uint16 infomask)
+{
+ return Do_MultiXactIdWait(multi, status, remaining, infomask, true);
+}
+
+/*
* heap_tuple_needs_freeze
*
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
- * are older than the specified cutoff XID. If so, return TRUE.
+ * are older than the specified cutoff XID or MultiXactId. If so, return TRUE.
*
* It doesn't matter whether the tuple is alive or dead, we are checking
* to see if a tuple needs to be removed or frozen to avoid wraparound.
*/
bool
heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
- Buffer buf)
+ MultiXactId cutoff_multi, Buffer buf)
{
TransactionId xid;
@@ -4135,12 +5415,23 @@ heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid,
TransactionIdPrecedes(xid, cutoff_xid))
return true;
- if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+ if (!(tuple->t_infomask & HEAP_XMAX_INVALID))
{
- xid = HeapTupleHeaderGetXmax(tuple);
- if (TransactionIdIsNormal(xid) &&
- TransactionIdPrecedes(xid, cutoff_xid))
- return true;
+ if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI))
+ {
+ xid = HeapTupleHeaderGetRawXmax(tuple);
+ if (TransactionIdIsNormal(xid) &&
+ TransactionIdPrecedes(xid, cutoff_xid))
+ return true;
+ }
+ else
+ {
+ MultiXactId multi;
+
+ multi = HeapTupleHeaderGetRawXmax(tuple);
+ if (MultiXactIdPrecedes(multi, cutoff_multi))
+ return true;
+ }
}
if (tuple->t_infomask & HEAP_MOVED)
@@ -4231,7 +5522,7 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
TransactionId *latestRemovedXid)
{
TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
- TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+ TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
if (tuple->t_infomask & HEAP_MOVED)
@@ -4387,7 +5678,7 @@ log_heap_clean(Relation reln, Buffer buffer,
*/
XLogRecPtr
log_heap_freeze(Relation reln, Buffer buffer,
- TransactionId cutoff_xid,
+ TransactionId cutoff_xid, MultiXactId cutoff_multi,
OffsetNumber *offsets, int offcnt)
{
xl_heap_freeze xlrec;
@@ -4402,6 +5693,7 @@ log_heap_freeze(Relation reln, Buffer buffer,
xlrec.node = reln->rd_node;
xlrec.block = BufferGetBlockNumber(buffer);
xlrec.cutoff_xid = cutoff_xid;
+ xlrec.cutoff_multi = cutoff_multi;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapFreeze;
@@ -4463,8 +5755,8 @@ log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer vm_buffer,
* have modified the buffer(s) and marked them dirty.
*/
static XLogRecPtr
-log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
- Buffer newbuf, HeapTuple newtup,
+log_heap_update(Relation reln, Buffer oldbuf,
+ Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
bool all_visible_cleared, bool new_all_visible_cleared)
{
xl_heap_update xlrec;
@@ -4483,7 +5775,11 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
info = XLOG_HEAP_UPDATE;
xlrec.target.node = reln->rd_node;
- xlrec.target.tid = from;
+ xlrec.target.tid = oldtup->t_self;
+ xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
+ xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
+ oldtup->t_data->t_infomask2);
+ xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
xlrec.all_visible_cleared = all_visible_cleared;
xlrec.newtid = newtup->t_self;
xlrec.new_all_visible_cleared = new_all_visible_cleared;
@@ -4748,6 +6044,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_freeze *xlrec = (xl_heap_freeze *) XLogRecGetData(record);
TransactionId cutoff_xid = xlrec->cutoff_xid;
+ MultiXactId cutoff_multi = xlrec->cutoff_multi;
Buffer buffer;
Page page;
@@ -4790,7 +6087,7 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
ItemId lp = PageGetItemId(page, *offsets);
HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);
- (void) heap_freeze_tuple(tuple, cutoff_xid);
+ (void) heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi);
offsets++;
}
}
@@ -4937,6 +6234,33 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
UnlockReleaseBuffer(buffer);
}
+/*
+ * Given an "infobits" field from an XLog record, set the correct bits in the
+ * given infomask and infomask2 for the tuple touched by the record.
+ *
+ * (This is the reverse of compute_infobits).
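+ *
+ * For example, a lock record describing a FOR KEY SHARE lock carries
+ * XLHL_XMAX_LOCK_ONLY | XLHL_XMAX_KEYSHR_LOCK, which this routine turns
+ * back into HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_KEYSHR_LOCK in the infomask.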
+ */
+static void
+fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
+{
+ *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
+ HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
+ *infomask2 &= ~HEAP_KEYS_UPDATED;
+
+ if (infobits & XLHL_XMAX_IS_MULTI)
+ *infomask |= HEAP_XMAX_IS_MULTI;
+ if (infobits & XLHL_XMAX_LOCK_ONLY)
+ *infomask |= HEAP_XMAX_LOCK_ONLY;
+ if (infobits & XLHL_XMAX_EXCL_LOCK)
+ *infomask |= HEAP_XMAX_EXCL_LOCK;
+ /* note HEAP_XMAX_SHR_LOCK isn't considered here */
+ if (infobits & XLHL_XMAX_KEYSHR_LOCK)
+ *infomask |= HEAP_XMAX_KEYSHR_LOCK;
+
+ if (infobits & XLHL_KEYS_UPDATED)
+ *infomask2 |= HEAP_KEYS_UPDATED;
+}
+
static void
heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
@@ -4992,13 +6316,12 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
htup = (HeapTupleHeader) PageGetItem(page, lp);
- htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
+ htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
HeapTupleHeaderClearHotUpdated(htup);
- HeapTupleHeaderSetXmax(htup, record->xl_xid);
+ fix_infomask_from_infobits(xlrec->infobits_set,
+ &htup->t_infomask, &htup->t_infomask2);
+ HeapTupleHeaderSetXmax(htup, xlrec->xmax);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
/* Mark the page as a candidate for pruning */
@@ -5368,16 +6691,15 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
htup = (HeapTupleHeader) PageGetItem(page, lp);
- htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
+ htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
if (hot_update)
HeapTupleHeaderSetHotUpdated(htup);
else
HeapTupleHeaderClearHotUpdated(htup);
- HeapTupleHeaderSetXmax(htup, record->xl_xid);
+ fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
+ &htup->t_infomask2);
+ HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
/* Set forward chain link in t_ctid */
htup->t_ctid = xlrec->newtid;
@@ -5484,6 +6806,7 @@ newsame:;
HeapTupleHeaderSetXmin(htup, record->xl_xid);
HeapTupleHeaderSetCmin(htup, FirstCommandId);
+ HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
/* Make sure there is no forward chain link in t_ctid */
htup->t_ctid = xlrec->newtid;
@@ -5564,17 +6887,8 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
htup = (HeapTupleHeader) PageGetItem(page, lp);
- htup->t_infomask &= ~(HEAP_XMAX_COMMITTED |
- HEAP_XMAX_INVALID |
- HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED |
- HEAP_MOVED);
- if (xlrec->xid_is_mxact)
- htup->t_infomask |= HEAP_XMAX_IS_MULTI;
- if (xlrec->shared_lock)
- htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
- else
- htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+ fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
+ &htup->t_infomask2);
HeapTupleHeaderClearHotUpdated(htup);
HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
@@ -5587,6 +6901,56 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
}
static void
+heap_xlog_lock_updated(XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_heap_lock_updated *xlrec =
+ (xl_heap_lock_updated *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+ OffsetNumber offnum;
+ ItemId lp = NULL;
+ HeapTupleHeader htup;
+
+ /* If we have a full-page image, restore it and we're done */
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ {
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ return;
+ }
+
+ buffer = XLogReadBuffer(xlrec->target.node,
+ ItemPointerGetBlockNumber(&(xlrec->target.tid)),
+ false);
+ if (!BufferIsValid(buffer))
+ return;
+ page = (Page) BufferGetPage(buffer);
+
+ if (lsn <= PageGetLSN(page)) /* changes are applied */
+ {
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+
+ offnum = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
+ if (PageGetMaxOffsetNumber(page) >= offnum)
+ lp = PageGetItemId(page, offnum);
+
+ if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
+ elog(PANIC, "heap_xlog_lock_updated: invalid lp");
+
+ htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+ fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
+ &htup->t_infomask2);
+ HeapTupleHeaderSetXmax(htup, xlrec->xmax);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
heap_xlog_inplace(XLogRecPtr lsn, XLogRecord *record)
{
xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
@@ -5702,6 +7066,9 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
case XLOG_HEAP2_MULTI_INSERT:
heap_xlog_multi_insert(lsn, record);
break;
+ case XLOG_HEAP2_LOCK_UPDATED:
+ heap_xlog_lock_updated(lsn, record);
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}