diff options
Diffstat (limited to 'src')
82 files changed, 1115 insertions, 848 deletions
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 27f88e0eb21..5365477000a 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "utils/rel.h" diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index e2d15a85fe4..97cd706c08e 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xloginsert.h" #include "lib/ilist.h" #include "miscadmin.h" #include "utils/memutils.h" diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 412f90da4db..84dc1e228c1 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "utils/rel.h" diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 09c3e39bf3b..ed581977f54 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -19,6 +19,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xloginsert.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "utils/memutils.h" diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 2fbd1bf5e47..370884ed17f 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xloginsert.h" #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 3ca0b68434b..1f8db9de6d9 100644 --- 
a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -16,6 +16,7 @@ #include "access/gin_private.h" #include "access/reloptions.h" +#include "access/xloginsert.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" #include "miscadmin.h" diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index af4d2714b5f..3a61321a835 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/xloginsert.h" #include "commands/vacuum.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 1832687aa0f..2143096c66b 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -18,6 +18,7 @@ #include "access/genam.h" #include "access/gist_private.h" +#include "access/xloginsert.h" #include "catalog/index.h" #include "miscadmin.h" #include "optimizer/cost.h" diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index ecc095671df..b732f532679 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -14,6 +14,7 @@ #include "postgres.h" #include "access/gist_private.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "utils/memutils.h" diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 808b942c5e8..8f671ac4342 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -51,6 +51,8 @@ #include "access/valid.h" #include "access/visibilitymap.h" #include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/namespace.h" diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 06b54889230..e4561bc3a27 
100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -18,6 +18,7 @@ #include "access/heapam_xlog.h" #include "access/transam.h" #include "access/htup_details.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "miscadmin.h" #include "pgstat.h" diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 951f3f1a489..bea52460a08 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -113,6 +113,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/xact.h" +#include "access/xloginsert.h" #include "catalog/catalog.h" diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index a0c0c7f2a6b..350a52fc52a 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -84,6 +84,7 @@ #include "access/heapam_xlog.h" #include "access/visibilitymap.h" +#include "access/xlog.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 59d7006c94e..bcaba7e5e84 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -18,6 +18,7 @@ #include "access/heapam.h" #include "access/nbtree.h" #include "access/transam.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/predicate.h" diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index b71f65de2c1..6093215c43d 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -24,6 +24,8 @@ #include "access/nbtree.h" #include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" diff --git a/src/backend/access/nbtree/nbtree.c 
b/src/backend/access/nbtree/nbtree.c index 117b18e5905..d881525bd38 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -20,6 +20,7 @@ #include "access/nbtree.h" #include "access/relscan.h" +#include "access/xlog.h" #include "catalog/index.h" #include "commands/vacuum.h" #include "storage/indexfsm.h" diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index e8a89d24ae5..31d48210b22 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -67,6 +67,8 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "storage/smgr.h" #include "tcop/tcopprot.h" diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index fac006e2972..13951be62af 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -17,6 +17,8 @@ #include "access/heapam_xlog.h" #include "access/nbtree.h" #include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogutils.h" #include "storage/procarray.h" #include "miscadmin.h" diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index bab93a5dd31..21a071ab199 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/genam.h" +#include "access/xloginsert.h" #include "access/spgist_private.h" #include "miscadmin.h" #include "storage/bufmgr.h" diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 7c104f49cc8..e1dfc8e3580 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -18,6 +18,8 @@ #include "access/genam.h" #include "access/spgist_private.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "catalog/index.h" #include "miscadmin.h" #include 
"storage/bufmgr.h" diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 01b8ffe5acd..a028cf1f42a 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -18,6 +18,7 @@ #include "access/genam.h" #include "access/spgist_private.h" #include "access/transam.h" +#include "access/xloginsert.h" #include "catalog/storage_xlog.h" #include "commands/vacuum.h" #include "miscadmin.h" diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 911a14828e3..920739436ac 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -16,6 +16,7 @@ #include "access/spgist_private.h" #include "access/transam.h" +#include "access/xlog.h" #include "access/xlogutils.h" #include "storage/standby.h" #include "utils/memutils.h" diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index eb6cfc5c44e..82a6c7695fc 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -14,7 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = clog.o transam.o varsup.o xact.o rmgr.o slru.o subtrans.o multixact.o \ timeline.o twophase.o twophase_rmgr.o xlog.o xlogarchive.o xlogfuncs.o \ - xlogreader.o xlogutils.o + xloginsert.o xlogreader.o xlogutils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 27ca4c65673..5ee070bd0a9 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -35,6 +35,9 @@ #include "access/clog.h" #include "access/slru.h" #include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" #include "miscadmin.h" #include "pg_trace.h" diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 33346c76643..bfbe738530e 100644 --- 
a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -72,6 +72,8 @@ #include "access/twophase.h" #include "access/twophase_rmgr.h" #include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" #include "funcapi.h" diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index c4069c39a20..d23c292edcd 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -48,6 +48,7 @@ #include "access/twophase_rmgr.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/pg_type.h" #include "catalog/storage.h" diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 7013fb894b4..d51cca406c7 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -17,6 +17,7 @@ #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" #include "commands/dbcommands.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 651a5c40f46..6f92bad07ca 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -25,6 +25,8 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/namespace.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3160db72458..563d442a7a3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -31,6 +31,7 @@ #include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include 
"access/xlogreader.h" #include "access/xlogutils.h" #include "catalog/catversion.h" @@ -300,14 +301,21 @@ XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; * (which is almost but not quite the same as a pointer to the most recent * CHECKPOINT record). We update this from the shared-memory copy, * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we - * hold an insertion lock). See XLogInsert for details. We are also allowed - * to update from XLogCtl->RedoRecPtr if we hold the info_lck; + * hold an insertion lock). See XLogInsertRecord for details. We are also + * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck; * see GetRedoRecPtr. A freshly spawned backend obtains the value during * InitXLOGAccess. */ static XLogRecPtr RedoRecPtr; /* + * doPageWrites is this backend's local copy of (forcePageWrites || + * fullPageWrites). It is used together with RedoRecPtr to decide whether + * a full-page image of a page needs to be taken. + */ +static bool doPageWrites; + +/* * RedoStartLSN points to the checkpoint's REDO location which is specified * in a backup label file, backup history file or control file. In standby * mode, XLOG streaming usually starts from the position where an invalid @@ -419,7 +427,7 @@ typedef union WALInsertLockPadded } WALInsertLockPadded; /* - * Shared state data for XLogInsert. + * Shared state data for WAL insertion. 
*/ typedef struct XLogCtlInsert { @@ -765,10 +773,6 @@ static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); -static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock, - XLogRecPtr *lsn, BkpBlock *bkpb); -static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, - char *blk, bool get_cleanup_lock, bool keep_buffer); static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic); static bool XLogCheckpointNeeded(XLogSegNo new_segno); static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible); @@ -831,226 +835,45 @@ static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); /* - * Insert an XLOG record having the specified RMID and info bytes, - * with the body of the record being the data chunk(s) described by - * the rdata chain (see xlog.h for notes about rdata). + * Insert an XLOG record represented by an already-constructed chain of data + * chunks. This is a low-level routine; to construct the WAL record header + * and data, use the higher-level routines in xloginsert.c. + * + * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this + * WAL record applies to, that were not included in the record as full page + * images. If fpw_lsn <= RedoRecPtr, the function does not perform the + * insertion and returns InvalidXLogRecPtr. The caller can then recalculate + * which pages need a full-page image, and retry. If fpw_lsn is invalid, the + * record is always inserted. + * + * The first XLogRecData in the chain must be for the record header, and its + * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and + * xl_crc fields in the header, the rest of the header must already be filled + * by the caller. * * Returns XLOG pointer to end of record (beginning of next record). 
* This can be used as LSN for data pages affected by the logged action. * (LSN is the XLOG point up to which the XLOG must be flushed to disk * before the data page can be written out. This implements the basic * WAL rule "write the log before the data".) - * - * NB: this routine feels free to scribble on the XLogRecData structs, - * though not on the data they reference. This is OK since the XLogRecData - * structs are always just temporaries in the calling code. */ XLogRecPtr -XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) +XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) { XLogCtlInsert *Insert = &XLogCtl->Insert; XLogRecData *rdt; - XLogRecData *rdt_lastnormal; - Buffer dtbuf[XLR_MAX_BKP_BLOCKS]; - bool dtbuf_bkp[XLR_MAX_BKP_BLOCKS]; - BkpBlock dtbuf_xlg[XLR_MAX_BKP_BLOCKS]; - XLogRecPtr dtbuf_lsn[XLR_MAX_BKP_BLOCKS]; - XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS]; - XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS]; - XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS]; - XLogRecData hdr_rdt; pg_crc32 rdata_crc; - uint32 len, - write_len; - unsigned i; - bool doPageWrites; - bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); bool inserted; - uint8 info_orig = info; - static XLogRecord *rechdr; + XLogRecord *rechdr = (XLogRecord *) rdata->data; + bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && + rechdr->xl_info == XLOG_SWITCH); XLogRecPtr StartPos; XLogRecPtr EndPos; - if (rechdr == NULL) - { - static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF]; - - rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf); - MemSet(rechdr, 0, SizeOfXLogRecord); - } - /* cross-check on whether we should be here or not */ if (!XLogInsertAllowed()) elog(ERROR, "cannot make new WAL entries during recovery"); - /* info's high bits are reserved for use by me */ - if (info & XLR_INFO_MASK) - elog(PANIC, "invalid xlog info mask %02X", info); - - TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); - - /* - * In bootstrap mode, we don't actually log anything but XLOG resources; - * 
return a phony record pointer. - */ - if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) - { - EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ - return EndPos; - } - - /* - * Here we scan the rdata chain, to determine which buffers must be backed - * up. - * - * We may have to loop back to here if a race condition is detected below. - * We could prevent the race by doing all this work while holding an - * insertion lock, but it seems better to avoid doing CRC calculations - * while holding one. - * - * We add entries for backup blocks to the chain, so that they don't need - * any special treatment in the critical section where the chunks are - * copied into the WAL buffers. Those entries have to be unlinked from the - * chain if we have to loop back here. - */ -begin:; - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - dtbuf[i] = InvalidBuffer; - dtbuf_bkp[i] = false; - } - - /* - * Decide if we need to do full-page writes in this XLOG record: true if - * full_page_writes is on or we have a PITR request for it. Since we - * don't yet have an insertion lock, fullPageWrites and forcePageWrites - * could change under us, but we'll recheck them once we have a lock. 
- */ - doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites; - - len = 0; - for (rdt = rdata;;) - { - if (rdt->buffer == InvalidBuffer) - { - /* Simple data, just include it */ - len += rdt->len; - } - else - { - /* Find info for buffer */ - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - if (rdt->buffer == dtbuf[i]) - { - /* Buffer already referenced by earlier chain item */ - if (dtbuf_bkp[i]) - { - rdt->data = NULL; - rdt->len = 0; - } - else if (rdt->data) - len += rdt->len; - break; - } - if (dtbuf[i] == InvalidBuffer) - { - /* OK, put it in this slot */ - dtbuf[i] = rdt->buffer; - if (doPageWrites && XLogCheckBuffer(rdt, true, - &(dtbuf_lsn[i]), &(dtbuf_xlg[i]))) - { - dtbuf_bkp[i] = true; - rdt->data = NULL; - rdt->len = 0; - } - else if (rdt->data) - len += rdt->len; - break; - } - } - if (i >= XLR_MAX_BKP_BLOCKS) - elog(PANIC, "can backup at most %d blocks per xlog record", - XLR_MAX_BKP_BLOCKS); - } - /* Break out of loop when rdt points to last chain item */ - if (rdt->next == NULL) - break; - rdt = rdt->next; - } - - /* - * NOTE: We disallow len == 0 because it provides a useful bit of extra - * error checking in ReadRecord. This means that all callers of - * XLogInsert must supply at least some not-in-a-buffer data. However, we - * make an exception for XLOG SWITCH records because we don't want them to - * ever cross a segment boundary. - */ - if (len == 0 && !isLogSwitch) - elog(PANIC, "invalid xlog record length %u", len); - - /* - * Make additional rdata chain entries for the backup blocks, so that we - * don't need to special-case them in the write loop. This modifies the - * original rdata chain, but we keep a pointer to the last regular entry, - * rdt_lastnormal, so that we can undo this if we have to loop back to the - * beginning. - * - * At the exit of this loop, write_len includes the backup block data. - * - * Also set the appropriate info bits to show which buffers were backed - * up. 
The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer - * value (ignoring InvalidBuffer) appearing in the rdata chain. - */ - rdt_lastnormal = rdt; - write_len = len; - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - BkpBlock *bkpb; - char *page; - - if (!dtbuf_bkp[i]) - continue; - - info |= XLR_BKP_BLOCK(i); - - bkpb = &(dtbuf_xlg[i]); - page = (char *) BufferGetBlock(dtbuf[i]); - - rdt->next = &(dtbuf_rdt1[i]); - rdt = rdt->next; - - rdt->data = (char *) bkpb; - rdt->len = sizeof(BkpBlock); - write_len += sizeof(BkpBlock); - - rdt->next = &(dtbuf_rdt2[i]); - rdt = rdt->next; - - if (bkpb->hole_length == 0) - { - rdt->data = page; - rdt->len = BLCKSZ; - write_len += BLCKSZ; - rdt->next = NULL; - } - else - { - /* must skip the hole */ - rdt->data = page; - rdt->len = bkpb->hole_offset; - write_len += bkpb->hole_offset; - - rdt->next = &(dtbuf_rdt3[i]); - rdt = rdt->next; - - rdt->data = page + (bkpb->hole_offset + bkpb->hole_length); - rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length); - write_len += rdt->len; - rdt->next = NULL; - } - } - /* * Calculate CRC of the data, including all the backup blocks * @@ -1060,29 +883,15 @@ begin:; * header. */ INIT_CRC32C(rdata_crc); - for (rdt = rdata; rdt != NULL; rdt = rdt->next) + for (rdt = rdata->next; rdt != NULL; rdt = rdt->next) COMP_CRC32C(rdata_crc, rdt->data, rdt->len); /* - * Construct record header (prev-link is filled in later, after reserving - * the space for the record), and make that the first chunk in the chain. - * - * The CRC calculated for the header here doesn't include prev-link, - * because we don't know it yet. It will be added later. - */ - rechdr->xl_xid = GetCurrentTransactionIdIfAny(); - rechdr->xl_tot_len = SizeOfXLogRecord + write_len; - rechdr->xl_len = len; /* doesn't include backup blocks */ - rechdr->xl_info = info; - rechdr->xl_rmid = rmid; - rechdr->xl_prev = InvalidXLogRecPtr; + * Calculate CRC of the header, except for prev-link, because we don't + * know it yet. 
It will be added later. + */ COMP_CRC32C(rdata_crc, ((char *) rechdr), offsetof(XLogRecord, xl_prev)); - hdr_rdt.next = rdata; - hdr_rdt.data = (char *) rechdr; - hdr_rdt.len = SizeOfXLogRecord; - write_len += SizeOfXLogRecord; - /*---------- * * We have now done all the preparatory work we can without holding a @@ -1122,56 +931,33 @@ begin:; WALInsertLockAcquire(); /* - * Check to see if my RedoRecPtr is out of date. If so, may have to go - * back and recompute everything. This can only happen just after a - * checkpoint, so it's better to be slow in this case and fast otherwise. + * Check to see if my copy of RedoRecPtr or doPageWrites is out of date. + * If so, may have to go back and have the caller recompute everything. + * This can only happen just after a checkpoint, so it's better to be + * slow in this case and fast otherwise. * * If we aren't doing full-page writes then RedoRecPtr doesn't actually * affect the contents of the XLOG record, so we'll update our local copy - * but not force a recomputation. + * but not force a recomputation. (If doPageWrites was just turned off, + * we could recompute the record without full pages, but we choose not + * to bother.) */ if (RedoRecPtr != Insert->RedoRecPtr) { Assert(RedoRecPtr < Insert->RedoRecPtr); RedoRecPtr = Insert->RedoRecPtr; - - if (doPageWrites) - { - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - if (dtbuf[i] == InvalidBuffer) - continue; - if (dtbuf_bkp[i] == false && - dtbuf_lsn[i] <= RedoRecPtr) - { - /* - * Oops, this buffer now needs to be backed up, but we - * didn't think so above. Start over. - */ - WALInsertLockRelease(); - END_CRIT_SECTION(); - rdt_lastnormal->next = NULL; - info = info_orig; - goto begin; - } - } - } } + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); - /* - * Also check to see if fullPageWrites or forcePageWrites was just turned - * on; if we weren't already doing full-page writes then go back and - * recompute. 
(If it was just turned off, we could recompute the record - * without full pages, but we choose not to bother.) - */ - if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites) + if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr && doPageWrites) { - /* Oops, must redo it with full-page data. */ + /* + * Oops, some buffer now needs to be backed up that the caller + * didn't back up. Start over. + */ WALInsertLockRelease(); END_CRIT_SECTION(); - rdt_lastnormal->next = NULL; - info = info_orig; - goto begin; + return InvalidXLogRecPtr; } /* @@ -1182,7 +968,7 @@ begin:; inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); else { - ReserveXLogInsertLocation(write_len, &StartPos, &EndPos, + ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, &rechdr->xl_prev); inserted = true; } @@ -1201,7 +987,8 @@ begin:; * All the record data, including the header, is now ready to be * inserted. Copy the record in the space reserved. */ - CopyXLogRecordToWAL(write_len, isLogSwitch, &hdr_rdt, StartPos, EndPos); + CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, + StartPos, EndPos); } else { @@ -1437,7 +1224,7 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) } /* - * Subroutine of XLogInsert. Copies a WAL record to an already-reserved + * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved * area in the WAL. */ static void @@ -2004,93 +1791,6 @@ XLogRecPtrToBytePos(XLogRecPtr ptr) } /* - * Determine whether the buffer referenced has to be backed up. - * - * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites - * could change later, so the result should be used for optimization purposes - * only. 
- */ -bool -XLogCheckBufferNeedsBackup(Buffer buffer) -{ - bool doPageWrites; - Page page; - - page = BufferGetPage(buffer); - - doPageWrites = XLogCtl->Insert.fullPageWrites || XLogCtl->Insert.forcePageWrites; - - if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) - return true; /* buffer requires backup */ - - return false; /* buffer does not need to be backed up */ -} - -/* - * Determine whether the buffer referenced by an XLogRecData item has to - * be backed up, and if so fill a BkpBlock struct for it. In any case - * save the buffer's LSN at *lsn. - */ -static bool -XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock, - XLogRecPtr *lsn, BkpBlock *bkpb) -{ - Page page; - - page = BufferGetPage(rdata->buffer); - - /* - * We assume page LSN is first data on *every* page that can be passed to - * XLogInsert, whether it has the standard page layout or not. We don't - * need to take the buffer header lock for PageGetLSN if we hold an - * exclusive lock on the page and/or the relation. 
- */ - if (holdsExclusiveLock) - *lsn = PageGetLSN(page); - else - *lsn = BufferGetLSNAtomic(rdata->buffer); - - if (*lsn <= RedoRecPtr) - { - /* - * The page needs to be backed up, so set up *bkpb - */ - BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block); - - if (rdata->buffer_std) - { - /* Assume we can omit data between pd_lower and pd_upper */ - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) - { - bkpb->hole_offset = lower; - bkpb->hole_length = upper - lower; - } - else - { - /* No "hole" to compress out */ - bkpb->hole_offset = 0; - bkpb->hole_length = 0; - } - } - else - { - /* Not a standard page header, don't try to eliminate "hole" */ - bkpb->hole_offset = 0; - bkpb->hole_length = 0; - } - - return true; /* buffer requires backup */ - } - - return false; /* buffer does not need to be backed up */ -} - -/* * Initialize XLOG buffers, writing out old buffers if they still contain * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is * true, initialize as many pages as we can without having to write out @@ -3944,128 +3644,6 @@ CleanupBackupHistory(void) } /* - * Restore a full-page image from a backup block attached to an XLOG record. - * - * lsn: LSN of the XLOG record being replayed - * record: the complete XLOG record - * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1) - * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock - * keep_buffer: TRUE to return the buffer still locked and pinned - * - * Returns the buffer number containing the page. Note this is not terribly - * useful unless keep_buffer is specified as TRUE. - * - * Note: when a backup block is available in XLOG, we restore it - * unconditionally, even if the page in the database appears newer. 
- * This is to protect ourselves against database pages that were partially - * or incorrectly written during a crash. We assume that the XLOG data - * must be good because it has passed a CRC check, while the database - * page might not be. This will force us to replay all subsequent - * modifications of the page that appear in XLOG, rather than possibly - * ignoring them as already applied, but that's not a huge drawback. - * - * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer, - * else a normal exclusive lock is used. During crash recovery, that's just - * pro forma because there can't be any regular backends in the system, but - * in hot standby mode the distinction is important. - * - * If 'keep_buffer' is true, return without releasing the buffer lock and pin; - * then caller is responsible for doing UnlockReleaseBuffer() later. This - * is needed in some cases when replaying XLOG records that touch multiple - * pages, to prevent inconsistent states from being visible to other backends. - * (Again, that's only important in hot standby mode.) 
- */ -Buffer -RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index, - bool get_cleanup_lock, bool keep_buffer) -{ - BkpBlock bkpb; - char *blk; - int i; - - /* Locate requested BkpBlock in the record */ - blk = (char *) XLogRecGetData(record) + record->xl_len; - for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) - { - if (!(record->xl_info & XLR_BKP_BLOCK(i))) - continue; - - memcpy(&bkpb, blk, sizeof(BkpBlock)); - blk += sizeof(BkpBlock); - - if (i == block_index) - { - /* Found it, apply the update */ - return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock, - keep_buffer); - } - - blk += BLCKSZ - bkpb.hole_length; - } - - /* Caller specified a bogus block_index */ - elog(ERROR, "failed to restore block_index %d", block_index); - return InvalidBuffer; /* keep compiler quiet */ -} - -/* - * Workhorse for RestoreBackupBlock usable without an xlog record - * - * Restores a full-page image from BkpBlock and a data pointer. - */ -static Buffer -RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk, - bool get_cleanup_lock, bool keep_buffer) -{ - Buffer buffer; - Page page; - - buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block, - RBM_ZERO); - Assert(BufferIsValid(buffer)); - if (get_cleanup_lock) - LockBufferForCleanup(buffer); - else - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - - page = (Page) BufferGetPage(buffer); - - if (bkpb.hole_length == 0) - { - memcpy((char *) page, blk, BLCKSZ); - } - else - { - memcpy((char *) page, blk, bkpb.hole_offset); - /* must zero-fill the hole */ - MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length); - memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), - blk + bkpb.hole_offset, - BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); - } - - /* - * The checksum value on this page is currently invalid. We don't need to - * reset it here since it will be set before being written. - */ - - /* - * The page may be uninitialized. 
If so, we can't set the LSN because that - * would corrupt the page. - */ - if (!PageIsNew(page)) - { - PageSetLSN(page, lsn); - } - MarkBufferDirty(buffer); - - if (!keep_buffer) - UnlockReleaseBuffer(buffer); - - return buffer; -} - -/* * Attempt to read an XLOG record. * * If RecPtr is not NULL, try to read a record at that position. Otherwise @@ -6352,6 +5930,7 @@ StartupXLOG(void) lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + doPageWrites = lastFullPageWrites; if (RecPtr < checkPoint.redo) ereport(PANIC, @@ -7606,12 +7185,16 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, void InitXLOGAccess(void) { + XLogCtlInsert *Insert = &XLogCtl->Insert; + /* ThisTimeLineID doesn't change so we need no lock to copy it */ ThisTimeLineID = XLogCtl->ThisTimeLineID; Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode()); /* Use GetRedoRecPtr to copy the RedoRecPtr safely */ (void) GetRedoRecPtr(); + /* Also update our copy of doPageWrites. */ + doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); } /* @@ -7640,6 +7223,21 @@ GetRedoRecPtr(void) } /* + * Return information needed to decide whether a modified block needs a + * full-page image to be included in the WAL record. + * + * The returned values are cached copies from backend-private memory, and + * possibly out-of-date. XLogInsertRecord will re-check them against + * up-to-date values, while holding the WAL insert lock. + */ +void +GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) +{ + *RedoRecPtr_p = RedoRecPtr; + *doPageWrites_p = doPageWrites; +} + +/* * GetInsertRecPtr -- Returns the current insert position. * * NOTE: The value *actually* returned is the position of the last full @@ -8793,218 +8391,6 @@ XLogRestorePoint(const char *rpName) } /* - * Write a backup block if needed when we are setting a hint. 
Note that - * this may be called for a variety of page types, not just heaps. - * - * Callable while holding just share lock on the buffer content. - * - * We can't use the plain backup block mechanism since that relies on the - * Buffer being exclusively locked. Since some modifications (setting LSN, hint - * bits) are allowed in a sharelocked buffer that can lead to wal checksum - * failures. So instead we copy the page and insert the copied data as normal - * record data. - * - * We only need to do something if page has not yet been full page written in - * this checkpoint round. The LSN of the inserted wal record is returned if we - * had to write, InvalidXLogRecPtr otherwise. - * - * It is possible that multiple concurrent backends could attempt to write WAL - * records. In that case, multiple copies of the same block would be recorded - * in separate WAL records by different backends, though that is still OK from - * a correctness perspective. - */ -XLogRecPtr -XLogSaveBufferForHint(Buffer buffer, bool buffer_std) -{ - XLogRecPtr recptr = InvalidXLogRecPtr; - XLogRecPtr lsn; - XLogRecData rdata[2]; - BkpBlock bkpb; - - /* - * Ensure no checkpoint can change our view of RedoRecPtr. - */ - Assert(MyPgXact->delayChkpt); - - /* - * Update RedoRecPtr so XLogCheckBuffer can make the right decision - */ - GetRedoRecPtr(); - - /* - * Setup phony rdata element for use within XLogCheckBuffer only. We reuse - * and reset rdata for any actual WAL record insert. - */ - rdata[0].buffer = buffer; - rdata[0].buffer_std = buffer_std; - - /* - * Check buffer while not holding an exclusive lock. - */ - if (XLogCheckBuffer(rdata, false, &lsn, &bkpb)) - { - char copied_buffer[BLCKSZ]; - char *origdata = (char *) BufferGetBlock(buffer); - - /* - * Copy buffer so we don't have to worry about concurrent hint bit or - * lsn updates. We assume pd_lower/upper cannot be changed without an - * exclusive lock, so the contents bkp are not racy. 
- * - * With buffer_std set to false, XLogCheckBuffer() sets hole_length - * and hole_offset to 0; so the following code is safe for either - * case. - */ - memcpy(copied_buffer, origdata, bkpb.hole_offset); - memcpy(copied_buffer + bkpb.hole_offset, - origdata + bkpb.hole_offset + bkpb.hole_length, - BLCKSZ - bkpb.hole_offset - bkpb.hole_length); - - /* - * Header for backup block. - */ - rdata[0].data = (char *) &bkpb; - rdata[0].len = sizeof(BkpBlock); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - - /* - * Save copy of the buffer. - */ - rdata[1].data = copied_buffer; - rdata[1].len = BLCKSZ - bkpb.hole_length; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata); - } - - return recptr; -} - -/* - * Write a WAL record containing a full image of a page. Caller is responsible - * for writing the page to disk after calling this routine. - * - * Note: If you're using this function, you should be building pages in private - * memory and writing them directly to smgr. If you're using buffers, call - * log_newpage_buffer instead. - * - * If the page follows the standard page layout, with a PageHeader and unused - * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows - * the unused space to be left out from the WAL record, making it smaller. 
- */ -XLogRecPtr -log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, - Page page, bool page_std) -{ - BkpBlock bkpb; - XLogRecPtr recptr; - XLogRecData rdata[3]; - - /* NO ELOG(ERROR) from here till newpage op is logged */ - START_CRIT_SECTION(); - - bkpb.node = *rnode; - bkpb.fork = forkNum; - bkpb.block = blkno; - - if (page_std) - { - /* Assume we can omit data between pd_lower and pd_upper */ - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) - { - bkpb.hole_offset = lower; - bkpb.hole_length = upper - lower; - } - else - { - /* No "hole" to compress out */ - bkpb.hole_offset = 0; - bkpb.hole_length = 0; - } - } - else - { - /* Not a standard page header, don't try to eliminate "hole" */ - bkpb.hole_offset = 0; - bkpb.hole_length = 0; - } - - rdata[0].data = (char *) &bkpb; - rdata[0].len = sizeof(BkpBlock); - rdata[0].buffer = InvalidBuffer; - rdata[0].next = &(rdata[1]); - - if (bkpb.hole_length == 0) - { - rdata[1].data = (char *) page; - rdata[1].len = BLCKSZ; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = NULL; - } - else - { - /* must skip the hole */ - rdata[1].data = (char *) page; - rdata[1].len = bkpb.hole_offset; - rdata[1].buffer = InvalidBuffer; - rdata[1].next = &rdata[2]; - - rdata[2].data = (char *) page + (bkpb.hole_offset + bkpb.hole_length); - rdata[2].len = BLCKSZ - (bkpb.hole_offset + bkpb.hole_length); - rdata[2].buffer = InvalidBuffer; - rdata[2].next = NULL; - } - - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata); - - /* - * The page may be uninitialized. If so, we can't set the LSN because that - * would corrupt the page. - */ - if (!PageIsNew(page)) - { - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - return recptr; -} - -/* - * Write a WAL record containing a full image of a page. - * - * Caller should initialize the buffer and mark it dirty before calling this - * function. 
This function will set the page LSN. - * - * If the page follows the standard page layout, with a PageHeader and unused - * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows - * the unused space to be left out from the WAL record, making it smaller. - */ -XLogRecPtr -log_newpage_buffer(Buffer buffer, bool page_std) -{ - Page page = BufferGetPage(buffer); - RelFileNode rnode; - ForkNumber forkNum; - BlockNumber blkno; - - /* Shared buffers should be modified in a critical section. */ - Assert(CritSectionCount > 0); - - BufferGetTag(buffer, &rnode, &forkNum, &blkno); - - return log_newpage(&rnode, forkNum, blkno, page, page_std); -} - -/* * Check if any of the GUC parameters that are critical for hot standby * have changed, and update the value in pg_control file if necessary. */ @@ -9757,7 +9143,8 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, * the standby. * * We must hold all the insertion locks to change the value of - * forcePageWrites, to ensure adequate interlocking against XLogInsert(). + * forcePageWrites, to ensure adequate interlocking against + * XLogInsertRecord(). 
*/ WALInsertLockAcquireExclusive(); if (exclusive) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c new file mode 100644 index 00000000000..b83343bf5bd --- /dev/null +++ b/src/backend/access/transam/xloginsert.c @@ -0,0 +1,633 @@ +/*------------------------------------------------------------------------- + * + * xloginsert.c + * Functions for constructing WAL records + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/xloginsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xloginsert.h" +#include "catalog/pg_control.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" +#include "utils/memutils.h" +#include "pg_trace.h" + +static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, + XLogRecData *rdata, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, XLogRecData **rdt_lastnormal); +static void XLogFillBkpBlock(Buffer buffer, bool buffer_std, BkpBlock *bkpb); + +/* + * Insert an XLOG record having the specified RMID and info bytes, + * with the body of the record being the data chunk(s) described by + * the rdata chain (see xloginsert.h for notes about rdata). + * + * Returns XLOG pointer to end of record (beginning of next record). + * This can be used as LSN for data pages affected by the logged action. + * (LSN is the XLOG point up to which the XLOG must be flushed to disk + * before the data page can be written out. This implements the basic + * WAL rule "write the log before the data".) + * + * NB: this routine feels free to scribble on the XLogRecData structs, + * though not on the data they reference. 
This is OK since the XLogRecData + * structs are always just temporaries in the calling code. + */ +XLogRecPtr +XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata) +{ + XLogRecPtr RedoRecPtr; + bool doPageWrites; + XLogRecPtr EndPos; + XLogRecPtr fpw_lsn; + XLogRecData *rdt; + XLogRecData *rdt_lastnormal; + + /* info's high bits are reserved for use by me */ + if (info & XLR_INFO_MASK) + elog(PANIC, "invalid xlog info mask %02X", info); + + TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); + + /* + * In bootstrap mode, we don't actually log anything but XLOG resources; + * return a phony record pointer. + */ + if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID) + { + EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ + return EndPos; + } + + /* + * Get values needed to decide whether to do full-page writes. Since we + * don't yet have an insertion lock, these could change under us, but + * XLogInsertRecord will recheck them once it has a lock. + */ + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + /* + * Assemble an XLogRecData chain representing the WAL record, including + * any backup blocks needed. + * + * We may have to loop back to here if a race condition is detected in + * XLogInsertRecord. We could prevent the race by doing all this work + * while holding an insertion lock, but it seems better to avoid doing CRC + * calculations while holding one. + */ +retry: + rdt = XLogRecordAssemble(rmid, info, rdata, RedoRecPtr, doPageWrites, + &fpw_lsn, &rdt_lastnormal); + + EndPos = XLogInsertRecord(rdt, fpw_lsn); + + if (EndPos == InvalidXLogRecPtr) + { + /* + * Undo the changes we made to the rdata chain, and retry. + * + * XXX: This doesn't undo *all* the changes; the XLogRecData + * entries for buffers that we had already decided to back up have + * had their data-pointers cleared. That's OK, as long as we + * decide to back them up on the next iteration as well. 
Hence, + * don't allow "doPageWrites" value to go from true to false after + * we've modified the rdata chain. + */ + bool newDoPageWrites; + + GetFullPageWriteInfo(&RedoRecPtr, &newDoPageWrites); + doPageWrites = doPageWrites || newDoPageWrites; + rdt_lastnormal->next = NULL; + + goto retry; + } + + return EndPos; +} + +/* + * Assemble a full WAL record, including backup blocks, from an XLogRecData + * chain, ready for insertion with XLogInsertRecord(). The record header + * fields are filled in, except for the xl_prev field and CRC. + * + * The rdata chain is modified, adding entries for full-page images. + * *rdt_lastnormal is set to point to the last normal (ie. not added by + * this function) entry. It can be used to reset the chain to its original + * state. + * + * If the rdata chain contains any buffer references, and a full-page image + * was not taken of all the buffers, *fpw_lsn is set to the lowest LSN among + * such pages. This signals that the assembled record is only good for + * insertion on the assumption that the RedoRecPtr and doPageWrites values + * were up-to-date. + */ +static XLogRecData * +XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecData *rdata, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, XLogRecData **rdt_lastnormal) +{ + bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH); + XLogRecData *rdt; + Buffer dtbuf[XLR_MAX_BKP_BLOCKS]; + bool dtbuf_bkp[XLR_MAX_BKP_BLOCKS]; + uint32 len, + total_len; + unsigned i; + + /* + * These need to be static because they are returned to the caller as part + * of the XLogRecData chain. 
+ */ + static BkpBlock dtbuf_xlg[XLR_MAX_BKP_BLOCKS]; + static XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS]; + static XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS]; + static XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS]; + static XLogRecData hdr_rdt; + static XLogRecord *rechdr; + + if (rechdr == NULL) + { + static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF]; + + rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf); + MemSet(rechdr, 0, SizeOfXLogRecord); + } + + /* The record begins with the header */ + hdr_rdt.data = (char *) rechdr; + hdr_rdt.len = SizeOfXLogRecord; + hdr_rdt.next = rdata; + total_len = SizeOfXLogRecord; + + /* + * Here we scan the rdata chain, to determine which buffers must be backed + * up. + * + * We add entries for backup blocks to the chain, so that they don't need + * any special treatment in the critical section where the chunks are + * copied into the WAL buffers. Those entries have to be unlinked from the + * chain if we have to loop back here. + */ + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) + { + dtbuf[i] = InvalidBuffer; + dtbuf_bkp[i] = false; + } + + *fpw_lsn = InvalidXLogRecPtr; + len = 0; + for (rdt = rdata;;) + { + if (rdt->buffer == InvalidBuffer) + { + /* Simple data, just include it */ + len += rdt->len; + } + else + { + /* Find info for buffer */ + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) + { + if (rdt->buffer == dtbuf[i]) + { + /* Buffer already referenced by earlier chain item */ + if (dtbuf_bkp[i]) + { + rdt->data = NULL; + rdt->len = 0; + } + else if (rdt->data) + len += rdt->len; + break; + } + if (dtbuf[i] == InvalidBuffer) + { + /* OK, put it in this slot */ + XLogRecPtr page_lsn; + bool needs_backup; + + dtbuf[i] = rdt->buffer; + + /* + * Determine whether the buffer has to be backed up. + * + * We assume page LSN is first data on *every* page that + * can be passed to XLogInsert, whether it has the + * standard page layout or not. 
We don't need to take the + * buffer header lock for PageGetLSN because we hold an + * exclusive lock on the page and/or the relation. + */ + page_lsn = PageGetLSN(BufferGetPage(rdt->buffer)); + if (!doPageWrites) + needs_backup = false; + else if (page_lsn <= RedoRecPtr) + needs_backup = true; + else + needs_backup = false; + + if (needs_backup) + { + /* + * The page needs to be backed up, so set up BkpBlock + */ + XLogFillBkpBlock(rdt->buffer, rdt->buffer_std, + &(dtbuf_xlg[i])); + dtbuf_bkp[i] = true; + rdt->data = NULL; + rdt->len = 0; + } + else + { + if (rdt->data) + len += rdt->len; + if (*fpw_lsn == InvalidXLogRecPtr || + page_lsn < *fpw_lsn) + { + *fpw_lsn = page_lsn; + } + } + break; + } + } + if (i >= XLR_MAX_BKP_BLOCKS) + elog(PANIC, "can backup at most %d blocks per xlog record", + XLR_MAX_BKP_BLOCKS); + } + /* Break out of loop when rdt points to last chain item */ + if (rdt->next == NULL) + break; + rdt = rdt->next; + } + total_len += len; + + /* + * Make additional rdata chain entries for the backup blocks, so that we + * don't need to special-case them in the write loop. This modifies the + * original rdata chain, but we keep a pointer to the last regular entry, + * rdt_lastnormal, so that we can undo this if we have to start over. + * + * At the exit of this loop, total_len includes the backup block data. + * + * Also set the appropriate info bits to show which buffers were backed + * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer + * value (ignoring InvalidBuffer) appearing in the rdata chain. 
+ */ + *rdt_lastnormal = rdt; + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) + { + BkpBlock *bkpb; + char *page; + + if (!dtbuf_bkp[i]) + continue; + + info |= XLR_BKP_BLOCK(i); + + bkpb = &(dtbuf_xlg[i]); + page = (char *) BufferGetBlock(dtbuf[i]); + + rdt->next = &(dtbuf_rdt1[i]); + rdt = rdt->next; + + rdt->data = (char *) bkpb; + rdt->len = sizeof(BkpBlock); + total_len += sizeof(BkpBlock); + + rdt->next = &(dtbuf_rdt2[i]); + rdt = rdt->next; + + if (bkpb->hole_length == 0) + { + rdt->data = page; + rdt->len = BLCKSZ; + total_len += BLCKSZ; + rdt->next = NULL; + } + else + { + /* must skip the hole */ + rdt->data = page; + rdt->len = bkpb->hole_offset; + total_len += bkpb->hole_offset; + + rdt->next = &(dtbuf_rdt3[i]); + rdt = rdt->next; + + rdt->data = page + (bkpb->hole_offset + bkpb->hole_length); + rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length); + total_len += rdt->len; + rdt->next = NULL; + } + } + + /* + * We disallow len == 0 because it provides a useful bit of extra error + * checking in ReadRecord. This means that all callers of XLogInsert + * must supply at least some not-in-a-buffer data. However, we make an + * exception for XLOG SWITCH records because we don't want them to ever + * cross a segment boundary. + */ + if (len == 0 && !isLogSwitch) + elog(PANIC, "invalid xlog record length %u", rechdr->xl_len); + + /* + * Fill in the fields in the record header. Prev-link is filled in later, + * once we know where in the WAL the record will be inserted. CRC is also + * not calculated yet. + */ + rechdr->xl_xid = GetCurrentTransactionIdIfAny(); + rechdr->xl_tot_len = total_len; + rechdr->xl_len = len; /* doesn't include backup blocks */ + rechdr->xl_info = info; + rechdr->xl_rmid = rmid; + rechdr->xl_prev = InvalidXLogRecPtr; + + return &hdr_rdt; +} + +/* + * Determine whether the buffer referenced has to be backed up. 
+ * + * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites + * could change later, so the result should be used for optimization purposes + * only. + */ +bool +XLogCheckBufferNeedsBackup(Buffer buffer) +{ + XLogRecPtr RedoRecPtr; + bool doPageWrites; + Page page; + + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + page = BufferGetPage(buffer); + + if (doPageWrites && PageGetLSN(page) <= RedoRecPtr) + return true; /* buffer requires backup */ + + return false; /* buffer does not need to be backed up */ +} + +/* + * Write a backup block if needed when we are setting a hint. Note that + * this may be called for a variety of page types, not just heaps. + * + * Callable while holding just share lock on the buffer content. + * + * We can't use the plain backup block mechanism since that relies on the + * Buffer being exclusively locked. Since some modifications (setting LSN, hint + * bits) are allowed in a sharelocked buffer that can lead to wal checksum + * failures. So instead we copy the page and insert the copied data as normal + * record data. + * + * We only need to do something if page has not yet been full page written in + * this checkpoint round. The LSN of the inserted wal record is returned if we + * had to write, InvalidXLogRecPtr otherwise. + * + * It is possible that multiple concurrent backends could attempt to write WAL + * records. In that case, multiple copies of the same block would be recorded + * in separate WAL records by different backends, though that is still OK from + * a correctness perspective. + */ +XLogRecPtr +XLogSaveBufferForHint(Buffer buffer, bool buffer_std) +{ + XLogRecPtr recptr = InvalidXLogRecPtr; + XLogRecPtr lsn; + XLogRecPtr RedoRecPtr; + + /* + * Ensure no checkpoint can change our view of RedoRecPtr. 
+ */ + Assert(MyPgXact->delayChkpt); + + /* + * Update RedoRecPtr so that we can make the right decision + */ + RedoRecPtr = GetRedoRecPtr(); + + /* + * We assume page LSN is first data on *every* page that can be passed to + * XLogInsert, whether it has the standard page layout or not. Since we're + * only holding a share-lock on the page, we must take the buffer header + * lock when we look at the LSN. + */ + lsn = BufferGetLSNAtomic(buffer); + + if (lsn <= RedoRecPtr) + { + XLogRecData rdata[2]; + BkpBlock bkpb; + char copied_buffer[BLCKSZ]; + char *origdata = (char *) BufferGetBlock(buffer); + + /* Make a BkpBlock struct representing the buffer */ + XLogFillBkpBlock(buffer, buffer_std, &bkpb); + + /* + * Copy buffer so we don't have to worry about concurrent hint bit or + * lsn updates. We assume pd_lower/upper cannot be changed without an + * exclusive lock, so the contents bkp are not racy. + * + * With buffer_std set to false, XLogFillBkpBlock() sets hole_length + * and hole_offset to 0; so the following code is safe for either + * case. + */ + memcpy(copied_buffer, origdata, bkpb.hole_offset); + memcpy(copied_buffer + bkpb.hole_offset, + origdata + bkpb.hole_offset + bkpb.hole_length, + BLCKSZ - bkpb.hole_offset - bkpb.hole_length); + + /* + * Header for backup block. + */ + rdata[0].data = (char *) &bkpb; + rdata[0].len = sizeof(BkpBlock); + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + /* + * Save copy of the buffer. + */ + rdata[1].data = copied_buffer; + rdata[1].len = BLCKSZ - bkpb.hole_length; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata); + } + + return recptr; +} + +/* + * Write a WAL record containing a full image of a page. Caller is responsible + * for writing the page to disk after calling this routine. + * + * Note: If you're using this function, you should be building pages in private + * memory and writing them directly to smgr. 
If you're using buffers, call + * log_newpage_buffer instead. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + BkpBlock bkpb; + XLogRecPtr recptr; + XLogRecData rdata[3]; + + /* NO ELOG(ERROR) from here till newpage op is logged */ + START_CRIT_SECTION(); + + bkpb.node = *rnode; + bkpb.fork = forkNum; + bkpb.block = blkno; + + if (page_std) + { + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bkpb.hole_offset = lower; + bkpb.hole_length = upper - lower; + } + else + { + /* No "hole" to compress out */ + bkpb.hole_offset = 0; + bkpb.hole_length = 0; + } + } + else + { + /* Not a standard page header, don't try to eliminate "hole" */ + bkpb.hole_offset = 0; + bkpb.hole_length = 0; + } + + rdata[0].data = (char *) &bkpb; + rdata[0].len = sizeof(BkpBlock); + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + + if (bkpb.hole_length == 0) + { + rdata[1].data = (char *) page; + rdata[1].len = BLCKSZ; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + } + else + { + /* must skip the hole */ + rdata[1].data = (char *) page; + rdata[1].len = bkpb.hole_offset; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = &rdata[2]; + + rdata[2].data = (char *) page + (bkpb.hole_offset + bkpb.hole_length); + rdata[2].len = BLCKSZ - (bkpb.hole_offset + bkpb.hole_length); + rdata[2].buffer = InvalidBuffer; + rdata[2].next = NULL; + } + + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI, rdata); + + /* + * The page may be uninitialized. 
If so, we can't set the LSN because that + * would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + return recptr; +} + +/* + * Write a WAL record containing a full image of a page. + * + * Caller should initialize the buffer and mark it dirty before calling this + * function. This function will set the page LSN. + * + * If the page follows the standard page layout, with a PageHeader and unused + * space between pd_lower and pd_upper, set 'page_std' to TRUE. That allows + * the unused space to be left out from the WAL record, making it smaller. + */ +XLogRecPtr +log_newpage_buffer(Buffer buffer, bool page_std) +{ + Page page = BufferGetPage(buffer); + RelFileNode rnode; + ForkNumber forkNum; + BlockNumber blkno; + + /* Shared buffers should be modified in a critical section. */ + Assert(CritSectionCount > 0); + + BufferGetTag(buffer, &rnode, &forkNum, &blkno); + + return log_newpage(&rnode, forkNum, blkno, page, page_std); +} + +/* + * Fill a BkpBlock for a buffer. 
+ */ +static void +XLogFillBkpBlock(Buffer buffer, bool buffer_std, BkpBlock *bkpb) +{ + BufferGetTag(buffer, &bkpb->node, &bkpb->fork, &bkpb->block); + + if (buffer_std) + { + /* Assume we can omit data between pd_lower and pd_upper */ + Page page = BufferGetPage(buffer); + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bkpb->hole_offset = lower; + bkpb->hole_length = upper - lower; + } + else + { + /* No "hole" to compress out */ + bkpb->hole_offset = 0; + bkpb->hole_length = 0; + } + } + else + { + /* Not a standard page header, don't try to eliminate "hole" */ + bkpb->hole_offset = 0; + bkpb->hole_length = 0; + } +} diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index da7ed92941c..7d573cc585d 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -16,7 +16,7 @@ #include "postgres.h" #include "access/transam.h" -#include "access/xlog.h" +#include "access/xlogrecord.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index ef827dbc404..1a21dac8538 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -456,6 +456,127 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, return buffer; } +/* + * Restore a full-page image from a backup block attached to an XLOG record. + * + * lsn: LSN of the XLOG record being replayed + * record: the complete XLOG record + * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1) + * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock + * keep_buffer: TRUE to return the buffer still locked and pinned + * + * Returns the buffer number containing the page. 
Note this is not terribly + * useful unless keep_buffer is specified as TRUE. + * + * Note: when a backup block is available in XLOG, we restore it + * unconditionally, even if the page in the database appears newer. + * This is to protect ourselves against database pages that were partially + * or incorrectly written during a crash. We assume that the XLOG data + * must be good because it has passed a CRC check, while the database + * page might not be. This will force us to replay all subsequent + * modifications of the page that appear in XLOG, rather than possibly + * ignoring them as already applied, but that's not a huge drawback. + * + * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer, + * else a normal exclusive lock is used. During crash recovery, that's just + * pro forma because there can't be any regular backends in the system, but + * in hot standby mode the distinction is important. + * + * If 'keep_buffer' is true, return without releasing the buffer lock and pin; + * then caller is responsible for doing UnlockReleaseBuffer() later. This + * is needed in some cases when replaying XLOG records that touch multiple + * pages, to prevent inconsistent states from being visible to other backends. + * (Again, that's only important in hot standby mode.) 
+ */ +Buffer +RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index, + bool get_cleanup_lock, bool keep_buffer) +{ + BkpBlock bkpb; + char *blk; + int i; + + /* Locate requested BkpBlock in the record */ + blk = (char *) XLogRecGetData(record) + record->xl_len; + for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++) + { + if (!(record->xl_info & XLR_BKP_BLOCK(i))) + continue; + + memcpy(&bkpb, blk, sizeof(BkpBlock)); + blk += sizeof(BkpBlock); + + if (i == block_index) + { + /* Found it, apply the update */ + return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock, + keep_buffer); + } + + blk += BLCKSZ - bkpb.hole_length; + } + + /* Caller specified a bogus block_index */ + elog(ERROR, "failed to restore block_index %d", block_index); + return InvalidBuffer; /* keep compiler quiet */ +} + +/* + * Workhorse for RestoreBackupBlock usable without an xlog record + * + * Restores a full-page image from BkpBlock and a data pointer. + */ +Buffer +RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk, + bool get_cleanup_lock, bool keep_buffer) +{ + Buffer buffer; + Page page; + + buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block, + RBM_ZERO); + Assert(BufferIsValid(buffer)); + if (get_cleanup_lock) + LockBufferForCleanup(buffer); + else + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + page = (Page) BufferGetPage(buffer); + + if (bkpb.hole_length == 0) + { + memcpy((char *) page, blk, BLCKSZ); + } + else + { + memcpy((char *) page, blk, bkpb.hole_offset); + /* must zero-fill the hole */ + MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length); + memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length), + blk + bkpb.hole_offset, + BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); + } + + /* + * The checksum value on this page is currently invalid. We don't need to + * reset it here since it will be set before being written. + */ + + /* + * The page may be uninitialized. 
If so, we can't set the LSN because that + * would corrupt the page. + */ + if (!PageIsNew(page)) + { + PageSetLSN(page, lsn); + } + MarkBufferDirty(buffer); + + if (!keep_buffer) + UnlockReleaseBuffer(buffer); + + return buffer; +} /* * Struct actually returned by XLogFakeRelcacheEntry, though the declared diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index c0eade0a3d7..e523ee923a4 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -34,6 +34,7 @@ #include "access/sysattr.h" #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/binary_upgrade.h" #include "catalog/catalog.h" #include "catalog/dependency.h" diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 5eb8fd4cd00..911f015f27e 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -21,6 +21,7 @@ #include "access/htup_details.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/dependency.h" #include "catalog/objectaccess.h" #include "catalog/pg_authid.h" diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index c3b2f072e44..46780e71d69 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -21,6 +21,8 @@ #include "access/visibilitymap.h" #include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/storage.h" diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index ff80b09c100..6a578ec58f5 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -23,6 +23,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 
6b8357634a7..83e8f891222 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -24,6 +24,7 @@ #include "access/htup_details.h" #include "access/sysattr.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" #include "commands/copy.h" diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index e381c06e67f..5e0ac585603 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -27,6 +27,7 @@ #include "access/htup_details.h" #include "access/sysattr.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/toasting.h" #include "commands/createas.h" #include "commands/matview.h" diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 45f525d1d40..94c82d37410 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -28,6 +28,7 @@ #include "access/heapam.h" #include "access/htup_details.h" #include "access/xact.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/dependency.h" diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index db05f7cf801..523ba35ba24 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -17,6 +17,7 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/indexing.h" #include "catalog/namespace.h" diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 6d5f65b8bd4..e5f7765d556 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -17,6 +17,8 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "access/xlogutils.h" #include "catalog/dependency.h" #include 
"catalog/namespace.h" diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index ecdff1e5e35..714a9f1ee78 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -21,6 +21,7 @@ #include "access/relscan.h" #include "access/sysattr.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 28e69a55510..378e355adcc 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -56,6 +56,8 @@ #include "access/htup_details.h" #include "access/sysattr.h" #include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/indexing.h" diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 3778d9d4250..8dad8c269cc 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -44,6 +44,7 @@ #include "access/multixact.h" #include "access/transam.h" #include "access/visibilitymap.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/storage.h" #include "commands/dbcommands.h" diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c index 40a991653d5..6ce8daeb95a 100644 --- a/src/backend/commands/variable.c +++ b/src/backend/commands/variable.c @@ -20,6 +20,7 @@ #include "access/htup_details.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/pg_authid.h" #include "commands/variable.h" #include "miscadmin.h" diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 57067ef57a6..cbfe05e2b5c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -33,6 +33,7 @@ #include <sys/file.h> #include <unistd.h> +#include "access/xlog.h" #include 
"catalog/catalog.h" #include "catalog/storage.h" #include "executor/instrument.h" diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1f69c9e03c9..a11fa6cbdf8 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -69,6 +69,7 @@ #include "miscadmin.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/pg_tablespace.h" #include "pgstat.h" diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index ea82882aa6d..d9535451389 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -48,8 +48,9 @@ #include "access/clog.h" #include "access/subtrans.h" #include "access/transam.h" -#include "access/xact.h" #include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "miscadmin.h" #include "storage/proc.h" diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 1c327fd45c7..8c3720bc737 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -20,6 +20,7 @@ #include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 723051efb50..cbe95747564 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -35,6 +35,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/twophase_rmgr.h" +#include "access/xlog.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 7c8d53e6a5a..f1261181c9e 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -190,6 +190,7 @@ #include "access/twophase.h" 
#include "access/twophase_rmgr.h" #include "access/xact.h" +#include "access/xlog.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/predicate.h" diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4a2a339cf2a..422911cde03 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -20,6 +20,7 @@ #include "access/reloptions.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/namespace.h" #include "catalog/toasting.h" diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c index 7969a3353cf..1a2cc1d9284 100644 --- a/src/backend/utils/adt/txid.c +++ b/src/backend/utils/adt/txid.c @@ -23,6 +23,7 @@ #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" #include "libpq/pqformat.h" diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index c8137798f24..e8ed9995ff6 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -36,6 +36,7 @@ #include "access/sysattr.h" #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/index.h" #include "catalog/indexing.h" diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c index b6b13308773..d1f64e58c8c 100644 --- a/src/backend/utils/cache/relmapper.c +++ b/src/backend/utils/cache/relmapper.c @@ -44,6 +44,8 @@ #include <unistd.h> #include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" #include "catalog/catalog.h" #include "catalog/pg_tablespace.h" #include "catalog/storage.h" diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 6a6a4453cd0..c34803437b6 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -23,6 +23,7 @@ #include "access/htup_details.h" 
#include "access/sysattr.h" #include "access/xact.h" +#include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/indexing.h" #include "catalog/namespace.h" diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 1d6fe869944..a69aae3b9e7 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -54,6 +54,8 @@ #include "postgres.h" +#include <limits.h> + #include "access/htup_details.h" #include "commands/tablespace.h" #include "executor/executor.h" diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 5c3f5adb4a7..a2c4989428c 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -60,6 +60,7 @@ #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" +#include "access/xlog.h" #include "storage/bufmgr.h" #include "storage/procarray.h" #include "utils/builtins.h" diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index e224e67ad62..2ba99469825 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -48,6 +48,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/multixact.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "catalog/catversion.h" #include "catalog/pg_control.h" diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 8562631b54f..04ac4ba3119 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -11,7 +11,8 @@ #ifndef CLOG_H #define CLOG_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" /* * Possible transaction statuses --- note that all-zeroes is the initial diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 0ebecb4140d..80826b843bf 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -10,7 +10,8 @@ #ifndef GIN_H #define GIN_H -#include "access/xlog.h" 
+#include "access/xlogrecord.h" +#include "lib/stringinfo.h" #include "storage/block.h" #include "utils/relcache.h" diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index ab3afb812e0..6a09dc990e4 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -13,6 +13,7 @@ #include "access/genam.h" #include "access/gin.h" #include "access/itup.h" +#include "access/xloginsert.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "utils/rbtree.h" diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 879f11351b0..21daf3b2b6a 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,6 +16,7 @@ #include "access/gist.h" #include "access/itup.h" +#include "access/xlogrecord.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "storage/buffile.h" diff --git a/src/include/access/hash.h b/src/include/access/hash.h index a81b9de0e61..c175a5c1822 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -20,8 +20,9 @@ #include "access/genam.h" #include "access/itup.h" #include "access/sdir.h" -#include "access/xlog.h" +#include "access/xlogrecord.h" #include "fmgr.h" +#include "lib/stringinfo.h" #include "storage/bufmgr.h" #include "storage/lock.h" #include "utils/relcache.h" diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 5ac98a5baa6..1d64264b010 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -15,7 +15,9 @@ #define HEAPAM_XLOG_H #include "access/htup.h" -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" +#include "storage/buf.h" #include "storage/bufpage.h" #include "storage/relfilenode.h" #include "utils/relcache.h" diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index b331447ade5..43d737505d2 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -11,7 
+11,8 @@ #ifndef MULTIXACT_H #define MULTIXACT_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 90fd6d0056a..c8bb3f5d668 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -17,9 +17,10 @@ #include "access/genam.h" #include "access/itup.h" #include "access/sdir.h" -#include "access/xlog.h" -#include "access/xlogutils.h" +#include "access/xlogrecord.h" #include "catalog/pg_index.h" +#include "lib/stringinfo.h" +#include "storage/bufmgr.h" /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ typedef uint16 BTCycleId; diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index f218a83224e..ccf1ed77869 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -15,8 +15,9 @@ #define SPGIST_H #include "access/skey.h" -#include "access/xlog.h" +#include "access/xlogrecord.h" #include "fmgr.h" +#include "lib/stringinfo.h" /* reloption parameters */ diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index d092029d8a7..3330644651c 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -17,6 +17,7 @@ #include "access/itup.h" #include "access/spgist.h" #include "nodes/tidbitmap.h" +#include "storage/buf.h" #include "storage/relfilenode.h" #include "utils/relcache.h" diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 45376b47928..11a51b26859 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -14,9 +14,11 @@ #ifndef XACT_H #define XACT_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "storage/relfilenode.h" +#include "utils/datetime.h" /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 0ae110f18b7..6f8b5f46e10 100644 --- a/src/include/access/xlog.h +++ 
b/src/include/access/xlog.h @@ -13,67 +13,11 @@ #include "access/rmgr.h" #include "access/xlogdefs.h" +#include "access/xloginsert.h" +#include "access/xlogrecord.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" -#include "storage/block.h" -#include "storage/buf.h" -#include "storage/relfilenode.h" -#include "utils/pg_crc.h" -/* - * The overall layout of an XLOG record is: - * Fixed-size header (XLogRecord struct) - * rmgr-specific data - * BkpBlock - * backup block data - * BkpBlock - * backup block data - * ... - * - * where there can be zero to four backup blocks (as signaled by xl_info flag - * bits). XLogRecord structs always start on MAXALIGN boundaries in the WAL - * files, and we round up SizeOfXLogRecord so that the rmgr data is also - * guaranteed to begin on a MAXALIGN boundary. However, no padding is added - * to align BkpBlock structs or backup block data. - * - * NOTE: xl_len counts only the rmgr data, not the XLogRecord header, - * and also not any backup blocks. xl_tot_len counts everything. Neither - * length field is rounded up to an alignment boundary. - */ -typedef struct XLogRecord -{ - uint32 xl_tot_len; /* total len of entire record */ - TransactionId xl_xid; /* xact id */ - uint32 xl_len; /* total len of rmgr data */ - uint8 xl_info; /* flag bits, see below */ - RmgrId xl_rmid; /* resource manager for this record */ - /* 2 bytes of padding here, initialize to zero */ - XLogRecPtr xl_prev; /* ptr to previous record in log */ - pg_crc32 xl_crc; /* CRC for this record */ - - /* If MAXALIGN==8, there are 4 wasted bytes here */ - - /* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */ - -} XLogRecord; - -#define SizeOfXLogRecord MAXALIGN(sizeof(XLogRecord)) - -#define XLogRecGetData(record) ((char*) (record) + SizeOfXLogRecord) - -/* - * XLOG uses only low 4 bits of xl_info. High 4 bits may be used by rmgr. 
- */ -#define XLR_INFO_MASK 0x0F - -/* - * If we backed up any disk blocks with the XLOG record, we use flag bits in - * xl_info to signal it. We support backup of up to 4 disk blocks per XLOG - * record. - */ -#define XLR_BKP_BLOCK_MASK 0x0F /* all info bits used for bkp blocks */ -#define XLR_MAX_BKP_BLOCKS 4 -#define XLR_BKP_BLOCK(iblk) (0x08 >> (iblk)) /* iblk in 0..3 */ /* Sync methods */ #define SYNC_METHOD_FSYNC 0 @@ -83,45 +27,6 @@ typedef struct XLogRecord #define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */ extern int sync_method; -/* - * The rmgr data to be written by XLogInsert() is defined by a chain of - * one or more XLogRecData structs. (Multiple structs would be used when - * parts of the source data aren't physically adjacent in memory, or when - * multiple associated buffers need to be specified.) - * - * If buffer is valid then XLOG will check if buffer must be backed up - * (ie, whether this is first change of that page since last checkpoint). - * If so, the whole page contents are attached to the XLOG record, and XLOG - * sets XLR_BKP_BLOCK(N) bit in xl_info. Note that the buffer must be pinned - * and exclusive-locked by the caller, so that it won't change under us. - * NB: when the buffer is backed up, we DO NOT insert the data pointed to by - * this XLogRecData struct into the XLOG record, since we assume it's present - * in the buffer. Therefore, rmgr redo routines MUST pay attention to - * XLR_BKP_BLOCK(N) to know what is actually stored in the XLOG record. - * The N'th XLR_BKP_BLOCK bit corresponds to the N'th distinct buffer - * value (ignoring InvalidBuffer) appearing in the rdata chain. - * - * When buffer is valid, caller must set buffer_std to indicate whether the - * page uses standard pd_lower/pd_upper header fields. If this is true, then - * XLOG is allowed to omit the free space between pd_lower and pd_upper from - * the backed-up page image. 
Note that even when buffer_std is false, the - * page MUST have an LSN field as its first eight bytes! - * - * Note: data can be NULL to indicate no rmgr data associated with this chain - * entry. This can be sensible (ie, not a wasted entry) if buffer is valid. - * The implication is that the buffer has been changed by the operation being - * logged, and so may need to be backed up, but the change can be redone using - * only information already present elsewhere in the XLOG entry. - */ -typedef struct XLogRecData -{ - char *data; /* start of rmgr data to include */ - uint32 len; /* length of rmgr data to include */ - Buffer buffer; /* buffer associated with data, if any */ - bool buffer_std; /* buffer has standard pd_lower/pd_upper */ - struct XLogRecData *next; /* next struct in chain, or NULL */ -} XLogRecData; - extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */ /* @@ -281,28 +186,18 @@ typedef struct CheckpointStatsData extern CheckpointStatsData CheckpointStats; -extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); -extern bool XLogCheckBufferNeedsBackup(Buffer buffer); +extern XLogRecPtr XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock); extern int XLogFileOpen(XLogSegNo segno); -extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, - BlockNumber blk, char *page, bool page_std); -extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); -extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); - extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); -extern Buffer RestoreBackupBlock(XLogRecPtr lsn, 
XLogRecord *record, - int block_index, - bool get_cleanup_lock, bool keep_buffer); - extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); extern void xlog_desc(StringInfo buf, XLogRecord *record); extern const char *xlog_identify(uint8 info); @@ -338,6 +233,7 @@ extern bool CreateRestartPoint(int flags); extern void XLogPutNextOid(Oid nextOid); extern XLogRecPtr XLogRestorePoint(const char *rpName); extern void UpdateFullPageWrites(void); +extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p); extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetFlushRecPtr(void); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index eaa1f9540e2..19b2ef8d90d 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -6,7 +6,7 @@ * NOTE: this file is intended to contain declarations useful for * manipulating the XLOG files directly, but it is not supposed to be * needed by rmgr routines (redo support for individual record types). - * So the XLogRecord typedef and associated stuff appear in xlog.h. + * So the XLogRecord typedef and associated stuff appear in xlogrecord.h. * * Note: This file must be includable in both frontend and backend contexts, * to allow stand-alone tools like pg_receivexlog to deal with WAL files. @@ -20,6 +20,7 @@ #define XLOG_INTERNAL_H #include "access/xlogdefs.h" +#include "access/xlogrecord.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "pgtime.h" @@ -28,31 +29,6 @@ /* - * Header info for a backup block appended to an XLOG record. - * - * As a trivial form of data compression, the XLOG code is aware that - * PG data pages usually contain an unused "hole" in the middle, which - * contains only zero bytes. If hole_length > 0 then we have removed - * such a "hole" from the stored data (and it's not counted in the - * XLOG record's CRC, either). 
Hence, the amount of block data actually - * present following the BkpBlock struct is BLCKSZ - hole_length bytes. - * - * Note that we don't attempt to align either the BkpBlock struct or the - * block's data. So, the struct must be copied to aligned local storage - * before use. - */ -typedef struct BkpBlock -{ - RelFileNode node; /* relation containing block */ - ForkNumber fork; /* fork within the relation */ - BlockNumber block; /* block number */ - uint16 hole_offset; /* number of bytes before "hole" */ - uint16 hole_length; /* number of bytes in "hole" */ - - /* ACTUAL BLOCK DATA FOLLOWS AT END OF STRUCT */ -} BkpBlock; - -/* * Each page of XLOG file has a header like this: */ #define XLOG_PAGE_MAGIC 0xD080 /* can be used as WAL version indicator */ @@ -228,12 +204,6 @@ typedef struct xl_end_of_recovery } xl_end_of_recovery; /* - * XLogRecord is defined in xlog.h, but we avoid #including that to keep - * this file includable in stand-alone programs. - */ -struct XLogRecord; - -/* * Method table for resource managers. 
* * This struct must be kept in sync with the PG_RMGR definition in @@ -249,8 +219,8 @@ struct XLogRecord; typedef struct RmgrData { const char *rm_name; - void (*rm_redo) (XLogRecPtr lsn, struct XLogRecord *rptr); - void (*rm_desc) (StringInfo buf, struct XLogRecord *rptr); + void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr); + void (*rm_desc) (StringInfo buf, XLogRecord *rptr); const char *(*rm_identify) (uint8 info); void (*rm_startup) (void); void (*rm_cleanup) (void); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h new file mode 100644 index 00000000000..30c2e84cbc9 --- /dev/null +++ b/src/include/access/xloginsert.h @@ -0,0 +1,66 @@ +/* + * xloginsert.h + * + * Functions for generating WAL records + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xloginsert.h + */ +#ifndef XLOGINSERT_H +#define XLOGINSERT_H + +#include "access/rmgr.h" +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/relfilenode.h" + +/* + * The rmgr data to be written by XLogInsert() is defined by a chain of + * one or more XLogRecData structs. (Multiple structs would be used when + * parts of the source data aren't physically adjacent in memory, or when + * multiple associated buffers need to be specified.) + * + * If buffer is valid then XLOG will check if buffer must be backed up + * (ie, whether this is first change of that page since last checkpoint). + * If so, the whole page contents are attached to the XLOG record, and XLOG + * sets XLR_BKP_BLOCK(N) bit in xl_info. Note that the buffer must be pinned + * and exclusive-locked by the caller, so that it won't change under us. + * NB: when the buffer is backed up, we DO NOT insert the data pointed to by + * this XLogRecData struct into the XLOG record, since we assume it's present + * in the buffer. 
Therefore, rmgr redo routines MUST pay attention to + * XLR_BKP_BLOCK(N) to know what is actually stored in the XLOG record. + * The N'th XLR_BKP_BLOCK bit corresponds to the N'th distinct buffer + * value (ignoring InvalidBuffer) appearing in the rdata chain. + * + * When buffer is valid, caller must set buffer_std to indicate whether the + * page uses standard pd_lower/pd_upper header fields. If this is true, then + * XLOG is allowed to omit the free space between pd_lower and pd_upper from + * the backed-up page image. Note that even when buffer_std is false, the + * page MUST have an LSN field as its first eight bytes! + * + * Note: data can be NULL to indicate no rmgr data associated with this chain + * entry. This can be sensible (ie, not a wasted entry) if buffer is valid. + * The implication is that the buffer has been changed by the operation being + * logged, and so may need to be backed up, but the change can be redone using + * only information already present elsewhere in the XLOG entry. 
+ */ +typedef struct XLogRecData +{ + char *data; /* start of rmgr data to include */ + uint32 len; /* length of rmgr data to include */ + Buffer buffer; /* buffer associated with data, if any */ + bool buffer_std; /* buffer has standard pd_lower/pd_upper */ + struct XLogRecData *next; /* next struct in chain, or NULL */ +} XLogRecData; + +extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); +extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, + BlockNumber blk, char *page, bool page_std); +extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); +extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); +extern bool XLogCheckBufferNeedsBackup(Buffer buffer); + +#endif /* XLOGINSERT_H */ diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h new file mode 100644 index 00000000000..ab0fb1c5004 --- /dev/null +++ b/src/include/access/xlogrecord.h @@ -0,0 +1,100 @@ +/* + * xlogrecord.h + * + * Definitions for the WAL record format. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogrecord.h + */ +#ifndef XLOGRECORD_H +#define XLOGRECORD_H + +#include "access/rmgr.h" +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "utils/pg_crc.h" + +/* + * The overall layout of an XLOG record is: + * Fixed-size header (XLogRecord struct) + * rmgr-specific data + * BkpBlock + * backup block data + * BkpBlock + * backup block data + * ... + * + * where there can be zero to four backup blocks (as signaled by xl_info flag + * bits). XLogRecord structs always start on MAXALIGN boundaries in the WAL + * files, and we round up SizeOfXLogRecord so that the rmgr data is also + * guaranteed to begin on a MAXALIGN boundary. However, no padding is added + * to align BkpBlock structs or backup block data. 
+ * + * NOTE: xl_len counts only the rmgr data, not the XLogRecord header, + * and also not any backup blocks. xl_tot_len counts everything. Neither + * length field is rounded up to an alignment boundary. + */ +typedef struct XLogRecord +{ + uint32 xl_tot_len; /* total len of entire record */ + TransactionId xl_xid; /* xact id */ + uint32 xl_len; /* total len of rmgr data */ + uint8 xl_info; /* flag bits, see below */ + RmgrId xl_rmid; /* resource manager for this record */ + /* 2 bytes of padding here, initialize to zero */ + XLogRecPtr xl_prev; /* ptr to previous record in log */ + pg_crc32 xl_crc; /* CRC for this record */ + + /* If MAXALIGN==8, there are 4 wasted bytes here */ + + /* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */ + +} XLogRecord; + +#define SizeOfXLogRecord MAXALIGN(sizeof(XLogRecord)) + +#define XLogRecGetData(record) ((char*) (record) + SizeOfXLogRecord) + +/* + * XLOG uses only low 4 bits of xl_info. High 4 bits may be used by rmgr. + */ +#define XLR_INFO_MASK 0x0F + +/* + * If we backed up any disk blocks with the XLOG record, we use flag bits in + * xl_info to signal it. We support backup of up to 4 disk blocks per XLOG + * record. + */ +#define XLR_BKP_BLOCK_MASK 0x0F /* all info bits used for bkp blocks */ +#define XLR_MAX_BKP_BLOCKS 4 +#define XLR_BKP_BLOCK(iblk) (0x08 >> (iblk)) /* iblk in 0..3 */ + +/* + * Header info for a backup block appended to an XLOG record. + * + * As a trivial form of data compression, the XLOG code is aware that + * PG data pages usually contain an unused "hole" in the middle, which + * contains only zero bytes. If hole_length > 0 then we have removed + * such a "hole" from the stored data (and it's not counted in the + * XLOG record's CRC, either). Hence, the amount of block data actually + * present following the BkpBlock struct is BLCKSZ - hole_length bytes. + * + * Note that we don't attempt to align either the BkpBlock struct or the + * block's data. 
So, the struct must be copied to aligned local storage + * before use. + */ +typedef struct BkpBlock +{ + RelFileNode node; /* relation containing block */ + ForkNumber fork; /* fork within the relation */ + BlockNumber block; /* block number */ + uint16 hole_offset; /* number of bytes before "hole" */ + uint16 hole_length; /* number of bytes in "hole" */ + + /* ACTUAL BLOCK DATA FOLLOWS AT END OF STRUCT */ +} BkpBlock; + +#endif /* XLOGRECORD_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index ad579083ab8..8d906967232 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -11,7 +11,7 @@ #ifndef XLOG_UTILS_H #define XLOG_UTILS_H -#include "access/xlog.h" +#include "access/xlogrecord.h" #include "storage/bufmgr.h" @@ -47,6 +47,12 @@ extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode); +extern Buffer RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, + int block_index, + bool get_cleanup_lock, bool keep_buffer); +extern Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, + char *blk, bool get_cleanup_lock, bool keep_buffer); + extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index 5fc72358284..6c687e3a827 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -14,7 +14,8 @@ #ifndef STORAGE_XLOG_H #define STORAGE_XLOG_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" #include "storage/block.h" #include "storage/relfilenode.h" diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index 811713fb83e..b79d9fc8648 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ 
-14,7 +14,8 @@ #ifndef DBCOMMANDS_H #define DBCOMMANDS_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" #include "nodes/parsenodes.h" /* XLOG stuff */ diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 914d155c9f1..7cbe6f9a819 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -13,8 +13,9 @@ #ifndef SEQUENCE_H #define SEQUENCE_H -#include "access/xlog.h" +#include "access/xlogrecord.h" #include "fmgr.h" +#include "lib/stringinfo.h" #include "nodes/parsenodes.h" #include "storage/relfilenode.h" diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 0f16f40e3df..afd9e05cb78 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -14,7 +14,8 @@ #ifndef TABLESPACE_H #define TABLESPACE_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" #include "nodes/parsenodes.h" /* XLOG stuff */ diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h index d9e30776af0..e4185287a1c 100644 --- a/src/include/replication/decode.h +++ b/src/include/replication/decode.h @@ -10,6 +10,7 @@ #define DECODE_H #include "access/xlogreader.h" +#include "access/xlogrecord.h" #include "replication/reorderbuffer.h" #include "replication/logical.h" diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 1c63af521fe..c89989fd201 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -14,7 +14,8 @@ #ifndef STANDBY_H #define STANDBY_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" #include "storage/lock.h" #include "storage/procsignal.h" #include "storage/relfilenode.h" diff --git a/src/include/utils/relmapper.h b/src/include/utils/relmapper.h index 37937ddab82..bd5836b0d98 100644 --- a/src/include/utils/relmapper.h +++ b/src/include/utils/relmapper.h @@ -14,7 +14,8 @@ #ifndef 
RELMAPPER_H #define RELMAPPER_H -#include "access/xlog.h" +#include "access/xlogrecord.h" +#include "lib/stringinfo.h" /* ---------------- * relmap-related XLOG entries |