 src/backend/access/heap/README.tuplock |  4 +---
 src/backend/access/heap/heapam.c       | 58 +++++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 46 insertions(+), 16 deletions(-)
diff --git a/src/backend/access/heap/README.tuplock b/src/backend/access/heap/README.tuplock
index 818cd7f9806..31c52ad28f9 100644
--- a/src/backend/access/heap/README.tuplock
+++ b/src/backend/access/heap/README.tuplock
@@ -203,6 +203,4 @@ Inplace updates create an exception to the rule that tuple data won't change
under a reader holding a pin. A reader of a heap_fetch() result tuple may
witness a torn read. Current inplace-updated fields are aligned and are no
wider than four bytes, and current readers don't need consistency across
-fields. Hence, they get by with just fetching each field once. XXX such a
-caller may also read a value that has not reached WAL; see
-systable_inplace_update_finish().
+fields. Hence, they get by with just fetching each field once.
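
The README text above relies on a single fetch of an aligned, at-most-four-byte field being untearable. As a minimal standalone C sketch of that discipline (CatalogRow and fetch_frozenxid_once are invented names for illustration, not PostgreSQL source):

#include <stdint.h>

typedef uint32_t TransactionId;

/* Stand-in for a catalog row obtained via heap_fetch(); illustrative only. */
typedef struct CatalogRow
{
	TransactionId frozenxid;	/* 4-byte aligned, inplace-updated */
} CatalogRow;

TransactionId
fetch_frozenxid_once(const CatalogRow *row)
{
	/*
	 * Read through a volatile lvalue so the compiler emits exactly one
	 * load.  Because the field is aligned and no wider than four bytes,
	 * that load returns either the old or the new value, never a mix of
	 * bytes from both.
	 */
	return *(volatile const TransactionId *) &row->frozenxid;
}

Fetching the field twice, by contrast, could observe two different versions of the value within one check, which is exactly the consistency the README says current readers do not need.
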
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 3a13671a1ef..4c8febdc811 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6326,6 +6326,8 @@ heap_inplace_update_and_unlock(Relation relation,
HeapTupleHeader htup = oldtup->t_data;
uint32 oldlen;
uint32 newlen;
+ char *dst;
+ char *src;
int nmsgs = 0;
SharedInvalidationMessage *invalMessages = NULL;
bool RelcacheInitFileInval = false;
@@ -6336,6 +6338,9 @@ heap_inplace_update_and_unlock(Relation relation,
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
elog(ERROR, "wrong tuple length");
+ dst = (char *) htup + htup->t_hoff;
+ src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
/*
* Construct shared cache inval if necessary. Note that because we only
* pass the new version of the tuple, this mustn't be used for any
@@ -6359,15 +6364,15 @@ heap_inplace_update_and_unlock(Relation relation,
*/
PreInplace_Inval();
- /* NO EREPORT(ERROR) from here till changes are logged */
- START_CRIT_SECTION();
-
- memcpy((char *) htup + htup->t_hoff,
- (char *) tuple->t_data + tuple->t_data->t_hoff,
- newlen);
-
/*----------
- * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+ * NO EREPORT(ERROR) from here till changes are complete
+ *
+ * Our buffer lock won't stop a reader that has already pinned and checked
+ * visibility for this tuple. Hence, we write WAL first, then mutate the
+ * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+ * checkpoint delay makes that acceptable. With the usual order of
+ * changes, a crash after memcpy() and before XLogInsert() could allow
+ * datfrozenxid to overtake relfrozenxid:
*
* ["D" is a VACUUM (ONLY_DATABASE_STATS)]
* ["R" is a VACUUM tbl]
@@ -6377,14 +6382,28 @@ heap_inplace_update_and_unlock(Relation relation,
* D: raise pg_database.datfrozenxid, XLogInsert(), finish
* [crash]
* [recovery restores datfrozenxid w/o relfrozenxid]
+ *
+ * Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
+ * the buffer to the stack before logging. Here, that facilitates an FPI
+ * of the post-mutation block before we accept other sessions seeing it.
*/
-
- MarkBufferDirty(buffer);
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ START_CRIT_SECTION();
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
xl_heap_inplace xlrec;
+ PGAlignedBlock copied_buffer;
+ char *origdata = (char *) BufferGetBlock(buffer);
+ Page page = BufferGetPage(buffer);
+ uint16 lower = ((PageHeader) page)->pd_lower;
+ uint16 upper = ((PageHeader) page)->pd_upper;
+ uintptr_t dst_offset_in_block;
+ RelFileLocator rlocator;
+ ForkNumber forkno;
+ BlockNumber blkno;
XLogRecPtr recptr;
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
@@ -6399,16 +6418,28 @@ heap_inplace_update_and_unlock(Relation relation,
XLogRegisterData((char *) invalMessages,
nmsgs * sizeof(SharedInvalidationMessage));
- XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
- XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+ /* register block matching what the buffer will look like after changes */
+ memcpy(copied_buffer.data, origdata, lower);
+ memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+ dst_offset_in_block = dst - origdata;
+ memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+ BufferGetTag(buffer, &rlocator, &forkno, &blkno);
+ Assert(forkno == MAIN_FORKNUM);
+ XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
+ REGBUF_STANDARD);
+ XLogRegisterBufData(0, src, newlen);
/* inplace updates aren't decoded atm, don't log the origin */
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
- PageSetLSN(BufferGetPage(buffer), recptr);
+ PageSetLSN(page, recptr);
}
+ memcpy(dst, src, newlen);
+
+ MarkBufferDirty(buffer);
+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/*
@@ -6421,6 +6452,7 @@ heap_inplace_update_and_unlock(Relation relation,
*/
AtInplace_Inval();
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
END_CRIT_SECTION();
UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
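
To make the ordering above concrete, here is a self-contained toy model in plain C. Every name in it (toy_inplace_update, toy_log_full_page, the buffers) is invented for illustration and stands in loosely for heap_inplace_update_and_unlock(), XLogInsert() with a registered full-page image, and shared buffers; it is a sketch of the technique, not PostgreSQL code. Unlike the real patch, which skips the page hole between pd_lower and pd_upper, the toy copies the whole block for brevity.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_BLCKSZ 8192

static char shared_buffer[TOY_BLCKSZ];	/* stands in for a shared page */
static char wal_image[TOY_BLCKSZ];		/* stands in for the logged FPI */
static int	checkpoint_delayed;			/* stands in for DELAY_CHKPT_START */

/* Toy substitute for XLogInsert() of a registered full-page image. */
static void
toy_log_full_page(const char *image)
{
	memcpy(wal_image, image, TOY_BLCKSZ);
}

/*
 * Apply a small aligned update in the patch's order: build the
 * post-mutation image on the stack, log it, and only then mutate the
 * shared buffer.  A crash between the last two steps is safe, because
 * recovery would restore the new value from wal_image.
 */
static void
toy_inplace_update(size_t off, uint32_t newval)
{
	char		copied_buffer[TOY_BLCKSZ];

	checkpoint_delayed = 1;		/* no checkpoint spans WAL write and mutation */

	memcpy(copied_buffer, shared_buffer, TOY_BLCKSZ);
	memcpy(copied_buffer + off, &newval, sizeof(newval));
	toy_log_full_page(copied_buffer);	/* WAL first ... */

	memcpy(shared_buffer + off, &newval, sizeof(newval));	/* ... buffer second */

	checkpoint_delayed = 0;
}

int
main(void)
{
	uint32_t	buf_val;
	uint32_t	wal_val;

	toy_inplace_update(64, 1234);
	memcpy(&buf_val, shared_buffer + 64, sizeof(buf_val));
	memcpy(&wal_val, wal_image + 64, sizeof(wal_val));
	printf("buffer=%u wal=%u\n", (unsigned) buf_val, (unsigned) wal_val);
	return 0;
}

Swapping the two final memcpy() steps back to the pre-patch order would reintroduce the hazard sketched in the heapam.c comment above: a concurrent session could act on the mutated buffer, and a crash before XLogInsert() would then let datfrozenxid overtake relfrozenxid after recovery.
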