diff options
Diffstat (limited to 'src/include/access')
-rw-r--r-- | src/include/access/brin_xlog.h | 63 | ||||
-rw-r--r-- | src/include/access/clog.h | 6 | ||||
-rw-r--r-- | src/include/access/gin.h | 6 | ||||
-rw-r--r-- | src/include/access/gin_private.h | 66 | ||||
-rw-r--r-- | src/include/access/gist_private.h | 31 | ||||
-rw-r--r-- | src/include/access/hash.h | 6 | ||||
-rw-r--r-- | src/include/access/heapam_xlog.h | 139 | ||||
-rw-r--r-- | src/include/access/htup_details.h | 1 | ||||
-rw-r--r-- | src/include/access/itup.h | 1 | ||||
-rw-r--r-- | src/include/access/multixact.h | 6 | ||||
-rw-r--r-- | src/include/access/nbtree.h | 94 | ||||
-rw-r--r-- | src/include/access/spgist.h | 6 | ||||
-rw-r--r-- | src/include/access/spgist_private.h | 144 | ||||
-rw-r--r-- | src/include/access/xact.h | 6 | ||||
-rw-r--r-- | src/include/access/xlog.h | 10 | ||||
-rw-r--r-- | src/include/access/xlog_internal.h | 19 | ||||
-rw-r--r-- | src/include/access/xloginsert.h | 70 | ||||
-rw-r--r-- | src/include/access/xlogreader.h | 77 | ||||
-rw-r--r-- | src/include/access/xlogrecord.h | 160 | ||||
-rw-r--r-- | src/include/access/xlogutils.h | 21 |
20 files changed, 534 insertions, 398 deletions
diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h index d748db4d0c6..6dc9eb3eca8 100644 --- a/src/include/access/brin_xlog.h +++ b/src/include/access/brin_xlog.h @@ -14,7 +14,7 @@ #ifndef BRIN_XLOG_H #define BRIN_XLOG_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/bufpage.h" #include "storage/itemptr.h" @@ -42,59 +42,82 @@ */ #define XLOG_BRIN_INIT_PAGE 0x80 -/* This is what we need to know about a BRIN index create */ +/* + * This is what we need to know about a BRIN index create. + * + * Backup block 0: metapage + */ typedef struct xl_brin_createidx { BlockNumber pagesPerRange; - RelFileNode node; uint16 version; } xl_brin_createidx; #define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16)) /* * This is what we need to know about a BRIN tuple insert + * + * Backup block 0: main page, block data is the new BrinTuple. + * Backup block 1: revmap page */ typedef struct xl_brin_insert { - RelFileNode node; BlockNumber heapBlk; /* extra information needed to update the revmap */ - BlockNumber revmapBlk; BlockNumber pagesPerRange; - uint16 tuplen; - ItemPointerData tid; - /* tuple data follows at end of struct */ + /* offset number in the main page to insert the tuple to. */ + OffsetNumber offnum; } xl_brin_insert; -#define SizeOfBrinInsert (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData)) +#define SizeOfBrinInsert (offsetof(xl_brin_insert, offnum) + sizeof(OffsetNumber)) /* - * A cross-page update is the same as an insert, but also store the old tid. + * A cross-page update is the same as an insert, but also stores information + * about the old tuple. + * + * Like in xlog_brin_update: + * Backup block 0: new page, block data includes the new BrinTuple. + * Backup block 1: revmap page + * + * And in addition: + * Backup block 2: old page */ typedef struct xl_brin_update { - ItemPointerData oldtid; + /* offset number of old tuple on old page */ + OffsetNumber oldOffnum; + xl_brin_insert insert; } xl_brin_update; #define SizeOfBrinUpdate (offsetof(xl_brin_update, insert) + SizeOfBrinInsert) -/* This is what we need to know about a BRIN tuple samepage update */ +/* + * This is what we need to know about a BRIN tuple samepage update + * + * Backup block 0: updated page, with new BrinTuple as block data + */ typedef struct xl_brin_samepage_update { - RelFileNode node; - ItemPointerData tid; - /* tuple data follows at end of struct */ + OffsetNumber offnum; } xl_brin_samepage_update; -#define SizeOfBrinSamepageUpdate (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData)) +#define SizeOfBrinSamepageUpdate (sizeof(OffsetNumber)) -/* This is what we need to know about a revmap extension */ +/* + * This is what we need to know about a revmap extension + * + * Backup block 0: metapage + * Backup block 1: new revmap page + */ typedef struct xl_brin_revmap_extend { - RelFileNode node; + /* + * XXX: This is actually redundant - the block number is stored as part of + * backup block 1. + */ BlockNumber targetBlk; } xl_brin_revmap_extend; @@ -102,8 +125,8 @@ typedef struct xl_brin_revmap_extend sizeof(BlockNumber)) -extern void brin_desc(StringInfo buf, XLogRecord *record); -extern void brin_redo(XLogRecPtr lsn, XLogRecord *record); +extern void brin_redo(XLogReaderState *record); +extern void brin_desc(StringInfo buf, XLogReaderState *record); extern const char *brin_identify(uint8 info); #endif /* BRIN_XLOG_H */ diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 04ac4ba3119..fe5e4c634d1 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -11,7 +11,7 @@ #ifndef CLOG_H #define CLOG_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" /* @@ -48,8 +48,8 @@ extern void TruncateCLOG(TransactionId oldestXact); #define CLOG_ZEROPAGE 0x00 #define CLOG_TRUNCATE 0x10 -extern void clog_redo(XLogRecPtr lsn, XLogRecord *record); -extern void clog_desc(StringInfo buf, XLogRecord *record); +extern void clog_redo(XLogReaderState *record); +extern void clog_desc(StringInfo buf, XLogReaderState *record); extern const char *clog_identify(uint8 info); #endif /* CLOG_H */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 433e56f20df..fe5f77b1736 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -10,7 +10,7 @@ #ifndef GIN_H #define GIN_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/block.h" #include "utils/relcache.h" @@ -74,8 +74,8 @@ extern void ginGetStats(Relation index, GinStatsData *stats); extern void ginUpdateStats(Relation index, const GinStatsData *stats); /* ginxlog.c */ -extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); -extern void gin_desc(StringInfo buf, XLogRecord *record); +extern void gin_redo(XLogReaderState *record); +extern void gin_desc(StringInfo buf, XLogReaderState *record); extern const char *gin_identify(uint8 info); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 333316d78e2..3d46f20bb83 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -13,7 +13,6 @@ #include "access/genam.h" #include "access/gin.h" #include "access/itup.h" -#include "access/xloginsert.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "utils/rbtree.h" @@ -397,22 +396,22 @@ typedef struct typedef struct ginxlogCreatePostingTree { - RelFileNode node; - BlockNumber blkno; uint32 size; /* A compressed posting list follows */ } ginxlogCreatePostingTree; -#define XLOG_GIN_INSERT 0x20 - /* * The format of the insertion record varies depending on the page type. * ginxlogInsert is the common part between all variants. + * + * Backup Blk 0: target page + * Backup Blk 1: left child, if this insertion finishes an incomplete split */ + +#define XLOG_GIN_INSERT 0x20 + typedef struct { - RelFileNode node; - BlockNumber blkno; uint16 flags; /* GIN_SPLIT_ISLEAF and/or GIN_SPLIT_ISDATA */ /* @@ -477,14 +476,17 @@ typedef struct PostingItem newitem; } ginxlogInsertDataInternal; - +/* + * Backup Blk 0: new left page (= original page, if not root split) + * Backup Blk 1: new right page + * Backup Blk 2: original page / new root page, if root split + * Backup Blk 3: left child, if this insertion completes an earlier split + */ #define XLOG_GIN_SPLIT 0x30 typedef struct ginxlogSplit { RelFileNode node; - BlockNumber lblkno; - BlockNumber rblkno; BlockNumber rrlink; /* right link, or root's blocknumber if root * split */ BlockNumber leftChildBlkno; /* valid on a non-leaf split */ @@ -538,15 +540,6 @@ typedef struct */ #define XLOG_GIN_VACUUM_PAGE 0x40 -typedef struct ginxlogVacuumPage -{ - RelFileNode node; - BlockNumber blkno; - uint16 hole_offset; /* number of bytes before "hole" */ - uint16 hole_length; /* number of bytes in "hole" */ - /* entire page contents (minus the hole) follow at end of record */ -} ginxlogVacuumPage; - /* * Vacuuming posting tree leaf page is WAL-logged like recompression caused * by insertion. @@ -555,26 +548,28 @@ typedef struct ginxlogVacuumPage typedef struct ginxlogVacuumDataLeafPage { - RelFileNode node; - BlockNumber blkno; - ginxlogRecompressDataLeaf data; } ginxlogVacuumDataLeafPage; +/* + * Backup Blk 0: deleted page + * Backup Blk 1: parent + * Backup Blk 2: left sibling + */ #define XLOG_GIN_DELETE_PAGE 0x50 typedef struct ginxlogDeletePage { - RelFileNode node; - BlockNumber blkno; - BlockNumber parentBlkno; OffsetNumber parentOffset; - BlockNumber leftBlkno; BlockNumber rightLink; } ginxlogDeletePage; #define XLOG_GIN_UPDATE_META_PAGE 0x60 +/* + * Backup Blk 0: metapage + * Backup Blk 1: tail page + */ typedef struct ginxlogUpdateMeta { RelFileNode node; @@ -591,22 +586,29 @@ typedef struct ginxlogUpdateMeta typedef struct ginxlogInsertListPage { - RelFileNode node; - BlockNumber blkno; BlockNumber rightlink; int32 ntuples; /* array of inserted tuples follows */ } ginxlogInsertListPage; +/* + * Backup Blk 0: metapage + * Backup Blk 1 to (ndeleted + 1): deleted pages + */ + #define XLOG_GIN_DELETE_LISTPAGE 0x80 -#define GIN_NDELETE_AT_ONCE 16 +/* + * The WAL record for deleting list pages must contain a block reference to + * all the deleted pages, so the number of pages that can be deleted in one + * record is limited by XLR_MAX_BLOCK_ID. (block_id 0 is used for the + * metapage.) + */ +#define GIN_NDELETE_AT_ONCE Min(16, XLR_MAX_BLOCK_ID - 1) typedef struct ginxlogDeleteListPages { - RelFileNode node; GinMetaPageData metadata; int32 ndeleted; - BlockNumber toDelete[GIN_NDELETE_AT_ONCE]; } ginxlogDeleteListPages; @@ -673,7 +675,7 @@ typedef struct GinBtreeData /* insert methods */ OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); - GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, XLogRecData **, Page *, Page *); + GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, Page *, Page *); void *(*prepareDownlink) (GinBtree, Buffer); void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page); diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 21daf3b2b6a..2cbc918ad1a 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,7 +16,7 @@ #include "access/gist.h" #include "access/itup.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "fmgr.h" #include "storage/bufmgr.h" #include "storage/buffile.h" @@ -185,34 +185,33 @@ typedef GISTScanOpaqueData *GISTScanOpaque; #define XLOG_GIST_CREATE_INDEX 0x50 /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */ +/* + * Backup Blk 0: updated page. + * Backup Blk 1: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + */ typedef struct gistxlogPageUpdate { - RelFileNode node; - BlockNumber blkno; - - /* - * If this operation completes a page split, by inserting a downlink for - * the split page, leftchild points to the left half of the split. - */ - BlockNumber leftchild; - /* number of deleted offsets */ uint16 ntodelete; + uint16 ntoinsert; /* - * follow: 1. todelete OffsetNumbers 2. tuples to insert + * In payload of blk 0 : 1. todelete OffsetNumbers 2. tuples to insert */ } gistxlogPageUpdate; +/* + * Backup Blk 0: If this operation completes a page split, by inserting a + * downlink for the split page, the left half of the split + * Backup Blk 1 - npage: split pages (1 is the original page) + */ typedef struct gistxlogPageSplit { - RelFileNode node; - BlockNumber origblkno; /* splitted page */ BlockNumber origrlink; /* rightlink of the page before split */ GistNSN orignsn; /* NSN of the page before split */ bool origleaf; /* was splitted page a leaf page? */ - BlockNumber leftchild; /* like in gistxlogPageUpdate */ uint16 npage; /* # of pages in the split */ bool markfollowright; /* set F_FOLLOW_RIGHT flags */ @@ -451,8 +450,8 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); /* gistxlog.c */ -extern void gist_redo(XLogRecPtr lsn, XLogRecord *record); -extern void gist_desc(StringInfo buf, XLogRecord *record); +extern void gist_redo(XLogReaderState *record); +extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index c175a5c1822..afd06ff7def 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -20,7 +20,7 @@ #include "access/genam.h" #include "access/itup.h" #include "access/sdir.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "fmgr.h" #include "lib/stringinfo.h" #include "storage/bufmgr.h" @@ -356,8 +356,8 @@ extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); /* hash.c */ -extern void hash_redo(XLogRecPtr lsn, XLogRecord *record); -extern void hash_desc(StringInfo buf, XLogRecord *record); +extern void hash_redo(XLogReaderState *record); +extern void hash_desc(StringInfo buf, XLogReaderState *record); extern const char *hash_identify(uint8 info); #endif /* HASH_H */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 1d64264b010..853e2dd491f 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -15,7 +15,7 @@ #define HEAPAM_XLOG_H #include "access/htup.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/buf.h" #include "storage/bufpage.h" @@ -78,27 +78,11 @@ #define XLOG_HEAP_CONTAINS_OLD \ (XLOG_HEAP_CONTAINS_OLD_TUPLE | XLOG_HEAP_CONTAINS_OLD_KEY) -/* - * All what we need to find changed tuple - * - * NB: on most machines, sizeof(xl_heaptid) will include some trailing pad - * bytes for alignment. We don't want to store the pad space in the XLOG, - * so use SizeOfHeapTid for space calculations. Similar comments apply for - * the other xl_FOO structs. - */ -typedef struct xl_heaptid -{ - RelFileNode node; - ItemPointerData tid; /* changed tuple id */ -} xl_heaptid; - -#define SizeOfHeapTid (offsetof(xl_heaptid, tid) + SizeOfIptrData) - /* This is what we need to know about delete */ typedef struct xl_heap_delete { - xl_heaptid target; /* deleted tuple id */ TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -122,45 +106,33 @@ typedef struct xl_heap_header #define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8)) -/* - * Variant of xl_heap_header that contains the length of the tuple, which is - * useful if the length of the tuple cannot be computed using the overall - * record length. E.g. because there are several tuples inside a single - * record. - */ -typedef struct xl_heap_header_len -{ - uint16 t_len; - xl_heap_header header; -} xl_heap_header_len; - -#define SizeOfHeapHeaderLen (offsetof(xl_heap_header_len, header) + SizeOfHeapHeader) - /* This is what we need to know about insert */ typedef struct xl_heap_insert { - xl_heaptid target; /* inserted tuple id */ + OffsetNumber offnum; /* inserted tuple's offset */ uint8 flags; - /* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */ + + /* xl_heap_header & TUPLE DATA in backup block 0 */ } xl_heap_insert; #define SizeOfHeapInsert (offsetof(xl_heap_insert, flags) + sizeof(uint8)) /* - * This is what we need to know about a multi-insert. The record consists of - * xl_heap_multi_insert header, followed by a xl_multi_insert_tuple and tuple - * data for each tuple. 'offsets' array is omitted if the whole page is - * reinitialized (XLOG_HEAP_INIT_PAGE) + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_heap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). + * + * In block 0's data portion, there is an xl_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_multi_insert struct. */ typedef struct xl_heap_multi_insert { - RelFileNode node; - BlockNumber blkno; uint8 flags; uint16 ntuples; OffsetNumber offsets[1]; - - /* TUPLE DATA (xl_multi_insert_tuples) FOLLOW AT END OF STRUCT */ } xl_heap_multi_insert; #define SizeOfHeapMultiInsert offsetof(xl_heap_multi_insert, offsets) @@ -176,34 +148,39 @@ typedef struct xl_multi_insert_tuple #define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) -/* This is what we need to know about update|hot_update */ +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLOG_HEAP_PREFIX_FROM_OLD or XLOG_HEAP_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_heap_header and new tuple data follow. The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If HEAP_CONTAINS_NEW_TUPLE_DATA flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ typedef struct xl_heap_update { - xl_heaptid target; /* deleted tuple id */ TransactionId old_xmax; /* xmax of the old tuple */ - TransactionId new_xmax; /* xmax of the new tuple */ - ItemPointerData newtid; /* new inserted tuple id */ + OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ /* - * If XLOG_HEAP_PREFIX_FROM_OLD or XLOG_HEAP_SUFFIX_FROM_OLD flags are - * set, the prefix and/or suffix come next, as one or two uint16s. - * - * After that, xl_heap_header_len and new tuple data follow. The new - * tuple data and length don't include the prefix and suffix, which are - * copied from the old tuple on replay. The new tuple data is omitted if - * a full-page image of the page was taken (unless the - * XLOG_HEAP_CONTAINS_NEW_TUPLE flag is set, in which case it's included - * anyway). - * * If XLOG_HEAP_CONTAINS_OLD_TUPLE or XLOG_HEAP_CONTAINS_OLD_KEY flags are - * set, another xl_heap_header_len struct and tuple data for the old tuple - * follows. + * set, a xl_heap_header struct and tuple data for the old tuple follows. */ } xl_heap_update; -#define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(uint8)) +#define SizeOfHeapUpdate (offsetof(xl_heap_update, new_offnum) + sizeof(OffsetNumber)) /* * This is what we need to know about vacuum page cleanup/redirect @@ -218,12 +195,10 @@ typedef struct xl_heap_update */ typedef struct xl_heap_clean { - RelFileNode node; - BlockNumber block; TransactionId latestRemovedXid; uint16 nredirected; uint16 ndead; - /* OFFSET NUMBERS FOLLOW */ + /* OFFSET NUMBERS are in the block reference 0 */ } xl_heap_clean; #define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) @@ -251,8 +226,8 @@ typedef struct xl_heap_cleanup_info /* This is what we need to know about lock */ typedef struct xl_heap_lock { - xl_heaptid target; /* locked tuple id */ TransactionId locking_xid; /* might be a MultiXactId not xid */ + OffsetNumber offnum; /* locked tuple's offset on page */ int8 infobits_set; /* infomask and infomask2 bits to set */ } xl_heap_lock; @@ -261,8 +236,8 @@ typedef struct xl_heap_lock /* This is what we need to know about locking an updated version of a row */ typedef struct xl_heap_lock_updated { - xl_heaptid target; TransactionId xmax; + OffsetNumber offnum; uint8 infobits_set; } xl_heap_lock_updated; @@ -271,11 +246,11 @@ typedef struct xl_heap_lock_updated /* This is what we need to know about in-place update */ typedef struct xl_heap_inplace { - xl_heaptid target; /* updated tuple id */ + OffsetNumber offnum; /* updated tuple's offset on page */ /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_heap_inplace; -#define SizeOfHeapInplace (offsetof(xl_heap_inplace, target) + SizeOfHeapTid) +#define SizeOfHeapInplace (offsetof(xl_heap_inplace, offnum) + sizeof(OffsetNumber)) /* * This struct represents a 'freeze plan', which is what we need to know about @@ -296,23 +271,26 @@ typedef struct xl_heap_freeze_tuple /* * This is what we need to know about a block being frozen during vacuum + * + * Backup block 0's data contains an array of xl_heap_freeze_tuple structs, + * one for each tuple. */ typedef struct xl_heap_freeze_page { - RelFileNode node; - BlockNumber block; TransactionId cutoff_xid; uint16 ntuples; - xl_heap_freeze_tuple tuples[FLEXIBLE_ARRAY_MEMBER]; } xl_heap_freeze_page; -#define SizeOfHeapFreezePage offsetof(xl_heap_freeze_page, tuples) +#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16)) -/* This is what we need to know about setting a visibility map bit */ +/* + * This is what we need to know about setting a visibility map bit + * + * Backup blk 0: visibility map buffer + * Backup blk 1: heap buffer + */ typedef struct xl_heap_visible { - RelFileNode node; - BlockNumber block; TransactionId cutoff_xid; } xl_heap_visible; @@ -338,10 +316,11 @@ typedef struct xl_heap_new_cid /* * Store the relfilenode/ctid pair to facilitate lookups. */ - xl_heaptid target; + RelFileNode target_node; + ItemPointerData target_tid; } xl_heap_new_cid; -#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target) + SizeOfHeapTid) +#define SizeOfHeapNewCid (offsetof(xl_heap_new_cid, target_tid) + sizeof(ItemPointerData)) /* logical rewrite xlog record header */ typedef struct xl_heap_rewrite_mapping @@ -357,13 +336,13 @@ typedef struct xl_heap_rewrite_mapping extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid); -extern void heap_redo(XLogRecPtr lsn, XLogRecord *record); -extern void heap_desc(StringInfo buf, XLogRecord *record); +extern void heap_redo(XLogReaderState *record); +extern void heap_desc(StringInfo buf, XLogReaderState *record); extern const char *heap_identify(uint8 info); -extern void heap2_redo(XLogRecPtr lsn, XLogRecord *record); -extern void heap2_desc(StringInfo buf, XLogRecord *record); +extern void heap2_redo(XLogReaderState *record); +extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); -extern void heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r); +extern void heap_xlog_logical_rewrite(XLogReaderState *r); extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid); diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 294d21bd180..300c2a52f02 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -498,6 +498,7 @@ do { \ * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page. */ #define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) +#define MinHeapTupleSize MAXALIGN(offsetof(HeapTupleHeaderData, t_bits)) /* * MaxHeapTuplesPerPage is an upper bound on the number of tuples that can diff --git a/src/include/access/itup.h b/src/include/access/itup.h index de17936b106..e4dc51e8720 100644 --- a/src/include/access/itup.h +++ b/src/include/access/itup.h @@ -133,6 +133,7 @@ typedef IndexAttributeBitMapData *IndexAttributeBitMap; * IndexTupleData struct. We arrive at the divisor because each tuple * must be maxaligned, and it must have an associated item pointer. */ +#define MinIndexTupleSize MAXALIGN(sizeof(IndexTupleData) + 1) #define MaxIndexTuplesPerPage \ ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData)))) diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 43d737505d2..ac58a3766d5 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -11,7 +11,7 @@ #ifndef MULTIXACT_H #define MULTIXACT_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" @@ -135,8 +135,8 @@ extern void multixact_twophase_postcommit(TransactionId xid, uint16 info, extern void multixact_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len); -extern void multixact_redo(XLogRecPtr lsn, XLogRecord *record); -extern void multixact_desc(StringInfo buf, XLogRecord *record); +extern void multixact_redo(XLogReaderState *record); +extern void multixact_desc(StringInfo buf, XLogReaderState *record); extern const char *multixact_identify(uint8 info); extern char *mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6ecd2ced62d..d3d258bcc9f 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -17,7 +17,7 @@ #include "access/genam.h" #include "access/itup.h" #include "access/sdir.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "catalog/pg_index.h" #include "lib/stringinfo.h" #include "storage/bufmgr.h" @@ -228,15 +228,6 @@ typedef struct BTMetaPageData * FSM */ /* - * All that we need to find changed index tuple - */ -typedef struct xl_btreetid -{ - RelFileNode node; - ItemPointerData tid; /* changed tuple id */ -} xl_btreetid; - -/* * All that we need to regenerate the meta-data page */ typedef struct xl_btree_metadata @@ -252,16 +243,17 @@ typedef struct xl_btree_metadata * * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META. * Note that INSERT_META implies it's not a leaf page. + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META + * Backup Blk 2: xl_btree_metadata, if INSERT_META */ typedef struct xl_btree_insert { - xl_btreetid target; /* inserted tuple id */ - /* BlockNumber finishes_split field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */ - /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */ - /* INDEX TUPLE FOLLOWS AT END OF STRUCT */ + OffsetNumber offnum; } xl_btree_insert; -#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData) +#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) /* * On insert with split, we save all the items going into the right sibling @@ -278,45 +270,41 @@ typedef struct xl_btree_insert * the root page, and thus that a newroot record rather than an insert or * split record should follow. Note that a split record never carries a * metapage update --- we'll do that in the parent-level update. + * + * Backup Blk 0: original page / new left page + * + * The left page's data portion contains the new item, if it's the _L variant. + * (In the _R variants, the new item is one of the right page's tuples.) + * If level > 0, an IndexTuple representing the HIKEY of the left page + * follows. We don't need this on leaf pages, because it's the same as the + * leftmost key in the new right page. + * + * Backup Blk 1: new right page + * + * The right page's data portion contains the right page's tuples in the + * form used by _bt_restore_page. + * + * Backup Blk 2: next block (orig page's rightlink), if any + * Backup Blk 3: child's left sibling, if non-leaf split */ typedef struct xl_btree_split { - RelFileNode node; - BlockNumber leftsib; /* orig page / new left page */ - BlockNumber rightsib; /* new right page */ - BlockNumber rnext; /* next block (orig page's rightlink) */ uint32 level; /* tree level of page being split */ OffsetNumber firstright; /* first item moved to right page */ - - /* - * In the _L variants, next are OffsetNumber newitemoff and the new item. - * (In the _R variants, the new item is one of the right page's tuples.) - * The new item, but not newitemoff, is suppressed if XLogInsert chooses - * to store the left page's whole page image. - * - * If level > 0, an IndexTuple representing the HIKEY of the left page - * follows. We don't need this on leaf pages, because it's the same as - * the leftmost key in the new right page. Also, it's suppressed if - * XLogInsert chooses to store the left page's whole page image. - * - * If level > 0, BlockNumber of the page whose incomplete-split flag this - * insertion clears. (not aligned) - * - * Last are the right page's tuples in the form used by _bt_restore_page. - */ + OffsetNumber newitemoff; /* new item's offset (if placed on left page) */ } xl_btree_split; -#define SizeOfBtreeSplit (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber)) +#define SizeOfBtreeSplit (offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber)) /* * This is what we need to know about delete of individual leaf index tuples. * The WAL record can represent deletion of any number of index tuples on a * single index page when *not* executed by VACUUM. + * + * Backup Blk 0: index page */ typedef struct xl_btree_delete { - RelFileNode node; /* RelFileNode of the index */ - BlockNumber block; RelFileNode hnode; /* RelFileNode of the heap the index currently * points at */ int nitems; @@ -361,8 +349,6 @@ typedef struct xl_btree_reuse_page */ typedef struct xl_btree_vacuum { - RelFileNode node; - BlockNumber block; BlockNumber lastBlockVacuumed; /* TARGET OFFSET NUMBERS FOLLOW */ @@ -376,10 +362,13 @@ typedef struct xl_btree_vacuum * remove this tuple's downlink and the *following* tuple's key). Note that * the leaf page is empty, so we don't need to store its content --- it is * just reinitialized during recovery using the rest of the fields. + * + * Backup Blk 0: leaf block + * Backup Blk 1: top parent */ typedef struct xl_btree_mark_page_halfdead { - xl_btreetid target; /* deleted tuple id in parent page */ + OffsetNumber poffset; /* deleted tuple id in parent page */ /* information needed to recreate the leaf page: */ BlockNumber leafblk; /* leaf block ultimately being deleted */ @@ -394,11 +383,15 @@ typedef struct xl_btree_mark_page_halfdead * This is what we need to know about deletion of a btree page. Note we do * not store any content for the deleted page --- it is just rewritten as empty * during recovery, apart from resetting the btpo.xact. + * + * Backup Blk 0: target block being deleted + * Backup Blk 1: target block's left sibling, if any + * Backup Blk 2: target block's right sibling + * Backup Blk 3: leaf block (if different from target) + * Backup Blk 4: metapage (if rightsib becomes new fast root) */ typedef struct xl_btree_unlink_page { - RelFileNode node; - BlockNumber deadblk; /* target block being deleted */ BlockNumber leftsib; /* target block's left sibling, if any */ BlockNumber rightsib; /* target block's right sibling */ @@ -406,7 +399,6 @@ typedef struct xl_btree_unlink_page * Information needed to recreate the leaf page, when target is an * internal page. */ - BlockNumber leafblk; BlockNumber leafleftsib; BlockNumber leafrightsib; BlockNumber topparent; /* next child down in the branch */ @@ -423,13 +415,15 @@ typedef struct xl_btree_unlink_page * * Note that although this implies rewriting the metadata page, we don't need * an xl_btree_metadata record --- the rootblk and level are sufficient. + * + * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) + * Backup Blk 1: left child (if splitting an old root) + * Backup Blk 2: metapage */ typedef struct xl_btree_newroot { - RelFileNode node; - BlockNumber rootblk; /* location of new root */ + BlockNumber rootblk; /* location of new root (redundant with blk 0) */ uint32 level; /* its tree level */ - /* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */ } xl_btree_newroot; #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) @@ -726,8 +720,8 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); /* * prototypes for functions in nbtxlog.c */ -extern void btree_redo(XLogRecPtr lsn, XLogRecord *record); -extern void btree_desc(StringInfo buf, XLogRecord *record); +extern void btree_redo(XLogReaderState *record); +extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); #endif /* NBTREE_H */ diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index ccf1ed77869..3aa96bde86f 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -15,7 +15,7 @@ #define SPGIST_H #include "access/skey.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "fmgr.h" #include "lib/stringinfo.h" @@ -197,8 +197,8 @@ extern Datum spgbulkdelete(PG_FUNCTION_ARGS); extern Datum spgvacuumcleanup(PG_FUNCTION_ARGS); /* spgxlog.c */ -extern void spg_redo(XLogRecPtr lsn, XLogRecord *record); -extern void spg_desc(StringInfo buf, XLogRecord *record); +extern void spg_redo(XLogReaderState *record); +extern void spg_desc(StringInfo buf, XLogReaderState *record); extern const char *spg_identify(uint8 info); extern void spg_xlog_startup(void); extern void spg_xlog_cleanup(void); diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index 3330644651c..4b6fdee8017 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -18,7 +18,6 @@ #include "access/spgist.h" #include "nodes/tidbitmap.h" #include "storage/buf.h" -#include "storage/relfilenode.h" #include "utils/relcache.h" @@ -351,35 +350,8 @@ typedef SpGistDeadTupleData *SpGistDeadTuple; /* * XLOG stuff - * - * ACCEPT_RDATA_* can only use fixed-length rdata arrays, because of lengthof */ -#define ACCEPT_RDATA_DATA(p, s, i) \ - do { \ - Assert((i) < lengthof(rdata)); \ - rdata[i].data = (char *) (p); \ - rdata[i].len = (s); \ - rdata[i].buffer = InvalidBuffer; \ - rdata[i].buffer_std = true; \ - rdata[i].next = NULL; \ - if ((i) > 0) \ - rdata[(i) - 1].next = rdata + (i); \ - } while(0) - -#define ACCEPT_RDATA_BUFFER(b, i) \ - do { \ - Assert((i) < lengthof(rdata)); \ - rdata[i].data = NULL; \ - rdata[i].len = 0; \ - rdata[i].buffer = (b); \ - rdata[i].buffer_std = true; \ - rdata[i].next = NULL; \ - if ((i) > 0) \ - rdata[(i) - 1].next = rdata + (i); \ - } while(0) - - /* XLOG record types for SPGiST */ #define XLOG_SPGIST_CREATE_INDEX 0x00 #define XLOG_SPGIST_ADD_LEAF 0x10 @@ -408,36 +380,36 @@ typedef struct spgxlogState (d).isBuild = (s)->isBuild; \ } while(0) - +/* + * Backup Blk 0: destination page for leaf tuple + * Backup Blk 1: parent page (if any) + */ typedef struct spgxlogAddLeaf { - RelFileNode node; - - BlockNumber blknoLeaf; /* destination page for leaf tuple */ bool newPage; /* init dest page? */ bool storesNulls; /* page is in the nulls tree? */ OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ - BlockNumber blknoParent; /* where the parent downlink is, if any */ - OffsetNumber offnumParent; + OffsetNumber offnumParent; /* where the parent downlink is, if any */ uint16 nodeI; /* new leaf tuple follows (unaligned!) */ } spgxlogAddLeaf; +/* + * Backup Blk 0: source leaf page + * Backup Blk 1: destination leaf page + * Backup Blk 2: parent page + */ typedef struct spgxlogMoveLeafs { - RelFileNode node; - - BlockNumber blknoSrc; /* source leaf page */ - BlockNumber blknoDst; /* destination leaf page */ uint16 nMoves; /* number of tuples moved from source page */ bool newPage; /* init dest page? */ bool replaceDead; /* are we replacing a DEAD source tuple? */ bool storesNulls; /* pages are in the nulls tree? */ - BlockNumber blknoParent; /* where the parent downlink is */ + /* where the parent downlink is */ OffsetNumber offnumParent; uint16 nodeI; @@ -452,11 +424,6 @@ typedef struct spgxlogMoveLeafs * Note: if replaceDead is true then there is only one inserted tuple * number and only one leaf tuple in the data, because we are not copying * the dead tuple from the source - * - * Buffer references in the rdata array are: - * Src page - * Dest page - * Parent page *---------- */ OffsetNumber offsets[1]; @@ -464,21 +431,43 @@ typedef struct spgxlogMoveLeafs #define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets) +/* + * Backup Blk 0: original page + * Backup Blk 1: where new tuple goes, if not same place + * Backup Blk 2: where parent downlink is, if updated and different from + * the old and new + */ typedef struct spgxlogAddNode { - RelFileNode node; - - BlockNumber blkno; /* block number of original inner tuple */ - OffsetNumber offnum; /* offset of original inner tuple */ - - BlockNumber blknoParent; /* where parent downlink is, if updated */ - OffsetNumber offnumParent; - uint16 nodeI; + /* + * Offset of the original inner tuple, in the original page (on backup + * block 0). + */ + OffsetNumber offnum; - BlockNumber blknoNew; /* where new tuple goes, if not same place */ + /* + * Offset of the new tuple, on the new page (on backup block 1). Invalid, + * if we overwrote the old tuple in the original page). + */ OffsetNumber offnumNew; bool newPage; /* init new page? */ + /*---- + * Where is the parent downlink? parentBlk indicates which page it's on, + * and offnumParent is the offset within the page. The possible values for + * parentBlk are: + * + * 0: parent == original page + * 1: parent == new page + * 2: parent == different page (blk ref 2) + * -1: parent not updated + *---- + */ + char parentBlk; + OffsetNumber offnumParent; /* offset within the parent page */ + + uint16 nodeI; + spgxlogState stateSrc; /* @@ -486,41 +475,51 @@ typedef struct spgxlogAddNode */ } spgxlogAddNode; +/* + * Backup Blk 0: where the prefix tuple goes + * Backup Blk 1: where the postfix tuple goes (if different page) + */ typedef struct spgxlogSplitTuple { - RelFileNode node; - - BlockNumber blknoPrefix; /* where the prefix tuple goes */ + /* where the prefix tuple goes */ OffsetNumber offnumPrefix; - BlockNumber blknoPostfix; /* where the postfix tuple goes */ + /* where the postfix tuple goes */ OffsetNumber offnumPostfix; bool newPage; /* need to init that page? */ + bool postfixBlkSame; /* was postfix tuple put on same page as + * prefix? */ /* - * new prefix inner tuple follows, then new postfix inner tuple - * (both are unaligned!) + * new prefix inner tuple follows, then new postfix inner tuple (both are + * unaligned!) */ } spgxlogSplitTuple; +/* + * Buffer references in the rdata array are: + * Backup Blk 0: Src page (only if not root) + * Backup Blk 1: Dest page (if used) + * Backup Blk 2: Inner page + * Backup Blk 3: Parent page (if any, and different from Inner) + */ typedef struct spgxlogPickSplit { - RelFileNode node; + bool isRootSplit; - BlockNumber blknoSrc; /* original leaf page */ - BlockNumber blknoDest; /* other leaf page, if any */ uint16 nDelete; /* n to delete from Src */ uint16 nInsert; /* n to insert on Src and/or Dest */ bool initSrc; /* re-init the Src page? */ bool initDest; /* re-init the Dest page? */ - BlockNumber blknoInner; /* where to put new inner tuple */ + /* where to put new inner tuple */ OffsetNumber offnumInner; bool initInner; /* re-init the Inner page? */ bool storesNulls; /* pages are in the nulls tree? */ - BlockNumber blknoParent; /* where the parent downlink is, if any */ + /* where the parent downlink is, if any */ + bool innerIsParent; /* is parent the same as inner page? */ OffsetNumber offnumParent; uint16 nodeI; @@ -533,24 +532,15 @@ typedef struct spgxlogPickSplit * array of page selector bytes for inserted tuples, length nInsert * new inner tuple (unaligned!) * list of leaf tuples, length nInsert (unaligned!) - * - * Buffer references in the rdata array are: - * Src page (only if not root and not being init'd) - * Dest page (if used and not being init'd) - * Inner page (only if not being init'd) - * Parent page (if any; could be same as Inner) *---------- */ - OffsetNumber offsets[1]; + OffsetNumber offsets[1]; } spgxlogPickSplit; #define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets) typedef struct spgxlogVacuumLeaf { - RelFileNode node; - - BlockNumber blkno; /* block number to clean */ uint16 nDead; /* number of tuples to become DEAD */ uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */ uint16 nMove; /* number of tuples to move */ @@ -576,9 +566,6 @@ typedef struct spgxlogVacuumLeaf typedef struct spgxlogVacuumRoot { /* vacuum a root page when it is also a leaf */ - RelFileNode node; - - BlockNumber blkno; /* block number to clean */ uint16 nDelete; /* number of tuples to delete */ spgxlogState stateSrc; @@ -591,9 +578,6 @@ typedef struct spgxlogVacuumRoot typedef struct spgxlogVacuumRedirect { - RelFileNode node; - - BlockNumber blkno; /* block number to clean */ uint16 nToPlaceholder; /* number of redirects to make placeholders */ OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */ TransactionId newestRedirectXid; /* newest XID of removed redirects */ diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 11a51b26859..b018aa4f5d8 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -14,7 +14,7 @@ #ifndef XACT_H #define XACT_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "storage/relfilenode.h" @@ -256,8 +256,8 @@ extern void UnregisterSubXactCallback(SubXactCallback callback, void *arg); extern int xactGetCommittedChildren(TransactionId **ptr); -extern void xact_redo(XLogRecPtr lsn, XLogRecord *record); -extern void xact_desc(StringInfo buf, XLogRecord *record); +extern void xact_redo(XLogReaderState *record); +extern void xact_desc(StringInfo buf, XLogReaderState *record); extern const char *xact_identify(uint8 info); #endif /* XACT_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 6f8b5f46e10..d06fbc0ec1e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -14,7 +14,7 @@ #include "access/rmgr.h" #include "access/xlogdefs.h" #include "access/xloginsert.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" @@ -186,7 +186,9 @@ typedef struct CheckpointStatsData extern CheckpointStatsData CheckpointStats; -extern XLogRecPtr XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn); +struct XLogRecData; + +extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); @@ -198,8 +200,8 @@ extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); -extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record); -extern void xlog_desc(StringInfo buf, XLogRecord *record); +extern void xlog_redo(XLogReaderState *record); +extern void xlog_desc(StringInfo buf, XLogReaderState *record); extern const char *xlog_identify(uint8 info); extern void issue_xlog_fsync(int fd, XLogSegNo segno); diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 19b2ef8d90d..423ef4d7fa0 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -20,7 +20,7 @@ #define XLOG_INTERNAL_H #include "access/xlogdefs.h" -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "pgtime.h" @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD080 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD081 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { @@ -204,6 +204,17 @@ typedef struct xl_end_of_recovery } xl_end_of_recovery; /* + * The functions in xloginsert.c construct a chain of XLogRecData structs + * to represent the final WAL record. + */ +typedef struct XLogRecData +{ + struct XLogRecData *next; /* next struct in chain, or NULL */ + char *data; /* start of rmgr data to include */ + uint32 len; /* length of rmgr data to include */ +} XLogRecData; + +/* * Method table for resource managers. * * This struct must be kept in sync with the PG_RMGR definition in @@ -219,8 +230,8 @@ typedef struct xl_end_of_recovery typedef struct RmgrData { const char *rm_name; - void (*rm_redo) (XLogRecPtr lsn, XLogRecord *rptr); - void (*rm_desc) (StringInfo buf, XLogRecord *rptr); + void (*rm_redo) (XLogReaderState *record); + void (*rm_desc) (StringInfo buf, XLogReaderState *record); const char *(*rm_identify) (uint8 info); void (*rm_startup) (void); void (*rm_cleanup) (void); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 30c2e84cbc9..e5ab71e2305 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -18,49 +18,43 @@ #include "storage/relfilenode.h" /* - * The rmgr data to be written by XLogInsert() is defined by a chain of - * one or more XLogRecData structs. (Multiple structs would be used when - * parts of the source data aren't physically adjacent in memory, or when - * multiple associated buffers need to be specified.) - * - * If buffer is valid then XLOG will check if buffer must be backed up - * (ie, whether this is first change of that page since last checkpoint). - * If so, the whole page contents are attached to the XLOG record, and XLOG - * sets XLR_BKP_BLOCK(N) bit in xl_info. Note that the buffer must be pinned - * and exclusive-locked by the caller, so that it won't change under us. - * NB: when the buffer is backed up, we DO NOT insert the data pointed to by - * this XLogRecData struct into the XLOG record, since we assume it's present - * in the buffer. Therefore, rmgr redo routines MUST pay attention to - * XLR_BKP_BLOCK(N) to know what is actually stored in the XLOG record. - * The N'th XLR_BKP_BLOCK bit corresponds to the N'th distinct buffer - * value (ignoring InvalidBuffer) appearing in the rdata chain. - * - * When buffer is valid, caller must set buffer_std to indicate whether the - * page uses standard pd_lower/pd_upper header fields. If this is true, then - * XLOG is allowed to omit the free space between pd_lower and pd_upper from - * the backed-up page image. Note that even when buffer_std is false, the - * page MUST have an LSN field as its first eight bytes! - * - * Note: data can be NULL to indicate no rmgr data associated with this chain - * entry. This can be sensible (ie, not a wasted entry) if buffer is valid. - * The implication is that the buffer has been changed by the operation being - * logged, and so may need to be backed up, but the change can be redone using - * only information already present elsewhere in the XLOG entry. + * The minimum size of the WAL construction working area. If you need to + * register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more + * than XLR_NORMAL_RDATAS data chunks in a single WAL record, you must call + * XLogEnsureRecordSpace() first to allocate more working memory. */ -typedef struct XLogRecData -{ - char *data; /* start of rmgr data to include */ - uint32 len; /* length of rmgr data to include */ - Buffer buffer; /* buffer associated with data, if any */ - bool buffer_std; /* buffer has standard pd_lower/pd_upper */ - struct XLogRecData *next; /* next struct in chain, or NULL */ -} XLogRecData; +#define XLR_NORMAL_MAX_BLOCK_ID 4 +#define XLR_NORMAL_RDATAS 20 + +/* flags for XLogRegisterBuffer */ +#define REGBUF_FORCE_IMAGE 0x01 /* force a full-page image */ +#define REGBUF_NO_IMAGE 0x02 /* don't take a full-page image */ +#define REGBUF_WILL_INIT (0x04 | 0x02) /* page will be re-initialized at + * replay (implies NO_IMAGE) */ +#define REGBUF_STANDARD 0x08 /* page follows "standard" page layout, + * (data between pd_lower and pd_upper + * will be skipped) */ +#define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image + * is taken */ + +/* prototypes for public functions in xloginsert.c: */ +extern void XLogBeginInsert(void); +extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); +extern void XLogEnsureRecordSpace(int nbuffers, int ndatas); +extern void XLogRegisterData(char *data, int len); +extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags); +extern void XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, + ForkNumber forknum, BlockNumber blknum, char *page, + uint8 flags); +extern void XLogRegisterBufData(uint8 block_id, char *data, int len); +extern void XLogResetInsertion(void); +extern bool XLogCheckBufferNeedsBackup(Buffer buffer); -extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata); extern XLogRecPtr log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blk, char *page, bool page_std); extern XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std); extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std); -extern bool XLogCheckBufferNeedsBackup(Buffer buffer); + +extern void InitXLogInsert(void); #endif /* XLOGINSERT_H */ diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index ea873a2d9c7..eb6cc8996a5 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -14,12 +14,18 @@ * * The basic idea is to allocate an XLogReaderState via * XLogReaderAllocate(), and call XLogReadRecord() until it returns NULL. + * + * After reading a record with XLogReadRecord(), it's decomposed into + * the per-block and main data parts, and the parts can be accessed + * with the XLogRec* macros and functions. You can also decode a + * record that's already constructed in memory, without reading from + * disk, by calling the DecodeXLogRecord() function. *------------------------------------------------------------------------- */ #ifndef XLOGREADER_H #define XLOGREADER_H -#include "access/xlog_internal.h" +#include "access/xlogrecord.h" typedef struct XLogReaderState XLogReaderState; @@ -31,6 +37,32 @@ typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader, char *readBuf, TimeLineID *pageTLI); +typedef struct +{ + /* Is this block ref in use? */ + bool in_use; + + /* Identify the block this refers to */ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + + /* copy of the fork_flags field from the XLogRecordBlockHeader */ + uint8 flags; + + /* Information on full-page image, if any */ + bool has_image; + char *bkp_image; + uint16 hole_offset; + uint16 hole_length; + + /* Buffer holding the rmgr-specific data associated with this block */ + bool has_data; + char *data; + uint16 data_len; + uint16 data_bufsz; +} DecodedBkpBlock; + struct XLogReaderState { /* ---------------------------------------- @@ -79,6 +111,25 @@ struct XLogReaderState XLogRecPtr ReadRecPtr; /* start of last record read */ XLogRecPtr EndRecPtr; /* end+1 of last record read */ + + /* ---------------------------------------- + * Decoded representation of current record + * + * Use XLogRecGet* functions to investigate the record; these fields + * should not be accessed directly. + * ---------------------------------------- + */ + XLogRecord *decoded_record; /* currently decoded record */ + + char *main_data; /* record's main data portion */ + uint32 main_data_len; /* main data portion's length */ + uint32 main_data_bufsz; /* allocated size of the buffer */ + + /* information about blocks referenced by the record. */ + DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID + 1]; + + int max_block_id; /* highest block_id in use (-1 if none) */ + /* ---------------------------------------- * private/internal state * ---------------------------------------- @@ -123,4 +174,28 @@ extern struct XLogRecord *XLogReadRecord(XLogReaderState *state, extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); #endif /* FRONTEND */ +/* Functions for decoding an XLogRecord */ + +extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, + char **errmsg); + +#define XLogRecGetTotalLen(decoder) ((decoder)->decoded_record->xl_tot_len) +#define XLogRecGetPrev(decoder) ((decoder)->decoded_record->xl_prev) +#define XLogRecGetInfo(decoder) ((decoder)->decoded_record->xl_info) +#define XLogRecGetRmid(decoder) ((decoder)->decoded_record->xl_rmid) +#define XLogRecGetXid(decoder) ((decoder)->decoded_record->xl_xid) +#define XLogRecGetData(decoder) ((decoder)->main_data) +#define XLogRecGetDataLen(decoder) ((decoder)->main_data_len) +#define XLogRecHasAnyBlockRefs(decoder) ((decoder)->max_block_id >= 0) +#define XLogRecHasBlockRef(decoder, block_id) \ + ((decoder)->blocks[block_id].in_use) +#define XLogRecHasBlockImage(decoder, block_id) \ + ((decoder)->blocks[block_id].has_image) + +extern bool RestoreBlockImage(XLogReaderState *recoder, uint8 block_id, char *dst); +extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); +extern bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, + RelFileNode *rnode, ForkNumber *forknum, + BlockNumber *blknum); + #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index ab0fb1c5004..11ddfac9c7f 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -20,81 +20,161 @@ /* * The overall layout of an XLOG record is: * Fixed-size header (XLogRecord struct) - * rmgr-specific data - * BkpBlock - * backup block data - * BkpBlock - * backup block data + * XLogRecordBlockHeader struct + * XLogRecordBlockHeader struct * ... + * XLogRecordDataHeader[Short|Long] struct + * block data + * block data + * ... + * main data * - * where there can be zero to four backup blocks (as signaled by xl_info flag - * bits). XLogRecord structs always start on MAXALIGN boundaries in the WAL - * files, and we round up SizeOfXLogRecord so that the rmgr data is also - * guaranteed to begin on a MAXALIGN boundary. However, no padding is added - * to align BkpBlock structs or backup block data. + * There can be zero or more XLogRecordBlockHeaders, and 0 or more bytes of + * rmgr-specific data not associated with a block. XLogRecord structs + * always start on MAXALIGN boundaries in the WAL files, but the rest of + * the fields are not aligned. * - * NOTE: xl_len counts only the rmgr data, not the XLogRecord header, - * and also not any backup blocks. xl_tot_len counts everything. Neither - * length field is rounded up to an alignment boundary. + * The XLogRecordBlockHeader, XLogRecordDataHeaderShort and + * XLogRecordDataHeaderLong structs all begin with a single 'id' byte. It's + * used to distinguish between block references, and the main data structs. */ typedef struct XLogRecord { uint32 xl_tot_len; /* total len of entire record */ TransactionId xl_xid; /* xact id */ - uint32 xl_len; /* total len of rmgr data */ + XLogRecPtr xl_prev; /* ptr to previous record in log */ uint8 xl_info; /* flag bits, see below */ RmgrId xl_rmid; /* resource manager for this record */ /* 2 bytes of padding here, initialize to zero */ - XLogRecPtr xl_prev; /* ptr to previous record in log */ pg_crc32 xl_crc; /* CRC for this record */ - /* If MAXALIGN==8, there are 4 wasted bytes here */ - - /* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */ + /* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */ } XLogRecord; -#define SizeOfXLogRecord MAXALIGN(sizeof(XLogRecord)) - -#define XLogRecGetData(record) ((char*) (record) + SizeOfXLogRecord) +#define SizeOfXLogRecord (offsetof(XLogRecord, xl_crc) + sizeof(pg_crc32)) /* - * XLOG uses only low 4 bits of xl_info. High 4 bits may be used by rmgr. + * The high 4 bits in xl_info may be used freely by rmgr. The + * XLR_SPECIAL_REL_UPDATE bit can be passed by XLogInsert caller. The rest + * are set internally by XLogInsert. */ #define XLR_INFO_MASK 0x0F +#define XLR_RMGR_INFO_MASK 0xF0 /* - * If we backed up any disk blocks with the XLOG record, we use flag bits in - * xl_info to signal it. We support backup of up to 4 disk blocks per XLOG - * record. + * If a WAL record modifies any relation files, in ways not covered by the + * usual block references, this flag is set. This is not used for anything + * by PostgreSQL itself, but it allows external tools that read WAL and keep + * track of modified blocks to recognize such special record types. + */ +#define XLR_SPECIAL_REL_UPDATE 0x01 + +/* + * Header info for block data appended to an XLOG record. + * + * Note that we don't attempt to align the XLogRecordBlockHeader struct! + * So, the struct must be copied to aligned local storage before use. + * 'data_length' is the length of the payload data associated with this, + * and includes the possible full-page image, and rmgr-specific data. It + * does not include the XLogRecordBlockHeader struct itself. */ -#define XLR_BKP_BLOCK_MASK 0x0F /* all info bits used for bkp blocks */ -#define XLR_MAX_BKP_BLOCKS 4 -#define XLR_BKP_BLOCK(iblk) (0x08 >> (iblk)) /* iblk in 0..3 */ +typedef struct XLogRecordBlockHeader +{ + uint8 id; /* block reference ID */ + uint8 fork_flags; /* fork within the relation, and flags */ + uint16 data_length; /* number of payload bytes (not including page + * image) */ + + /* If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows */ + /* If !BKPBLOCK_SAME_REL is not set, a RelFileNode follows */ + /* BlockNumber follows */ +} XLogRecordBlockHeader; + +#define SizeOfXLogRecordBlockHeader (offsetof(XLogRecordBlockHeader, data_length) + sizeof(uint16)) /* - * Header info for a backup block appended to an XLOG record. + * Additional header information when a full-page image is included + * (i.e. when BKPBLOCK_HAS_IMAGE is set). * * As a trivial form of data compression, the XLOG code is aware that * PG data pages usually contain an unused "hole" in the middle, which * contains only zero bytes. If hole_length > 0 then we have removed * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually - * present following the BkpBlock struct is BLCKSZ - hole_length bytes. - * - * Note that we don't attempt to align either the BkpBlock struct or the - * block's data. So, the struct must be copied to aligned local storage - * before use. + * present is BLCKSZ - hole_length bytes. */ -typedef struct BkpBlock +typedef struct XLogRecordBlockImageHeader { - RelFileNode node; /* relation containing block */ - ForkNumber fork; /* fork within the relation */ - BlockNumber block; /* block number */ uint16 hole_offset; /* number of bytes before "hole" */ uint16 hole_length; /* number of bytes in "hole" */ +} XLogRecordBlockImageHeader; + +#define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader) + +/* + * Maximum size of the header for a block reference. This is used to size a + * temporary buffer for constructing the header. + */ +#define MaxSizeOfXLogRecordBlockHeader \ + (SizeOfXLogRecordBlockHeader + \ + SizeOfXLogRecordBlockImageHeader + \ + sizeof(RelFileNode) + \ + sizeof(BlockNumber)) + +/* + * The fork number fits in the lower 4 bits in the fork_flags field. The upper + * bits are used for flags. + */ +#define BKPBLOCK_FORK_MASK 0x0F +#define BKPBLOCK_FLAG_MASK 0xF0 +#define BKPBLOCK_HAS_IMAGE 0x10 /* block data is an XLogRecordBlockImage */ +#define BKPBLOCK_HAS_DATA 0x20 +#define BKPBLOCK_WILL_INIT 0x40 /* redo will re-init the page */ +#define BKPBLOCK_SAME_REL 0x80 /* RelFileNode omitted, same as previous */ + +/* + * XLogRecordDataHeaderShort/Long are used for the "main data" portion of + * the record. If the length of the data is less than 256 bytes, the short + * form is used, with a single byte to hold the length. Otherwise the long + * form is used. + * + * (These structs are currently not used in the code, they are here just for + * documentation purposes). + */ +typedef struct XLogRecordDataHeaderShort +{ + uint8 id; /* XLR_BLOCK_ID_DATA_SHORT */ + uint8 data_length; /* number of payload bytes */ +} XLogRecordDataHeaderShort; + +#define SizeOfXLogRecordDataHeaderShort (sizeof(uint8) * 2) + +typedef struct XLogRecordDataHeaderLong +{ + uint8 id; /* XLR_BLOCK_ID_DATA_LONG */ + /* followed by uint32 data_length, unaligned */ +} XLogRecordDataHeaderLong; + +#define SizeOfXLogRecordDataHeaderLong (sizeof(uint8) + sizeof(uint32)) + +/* + * Block IDs used to distinguish different kinds of record fragments. Block + * references are numbered from 0 to XLR_MAX_BLOCK_ID. A rmgr is free to use + * any ID number in that range (although you should stick to small numbers, + * because the WAL machinery is optimized for that case). A couple of ID + * numbers are reserved to denote the "main" data portion of the record. + * + * The maximum is currently set at 32, quite arbitrarily. Most records only + * need a handful of block references, but there are a few exceptions that + * need more. + */ +#define XLR_MAX_BLOCK_ID 32 + +#define XLR_BLOCK_ID_DATA_SHORT 255 +#define XLR_BLOCK_ID_DATA_LONG 254 + +#define SizeOfXLogRecordDataHeaderLong (sizeof(uint8) + sizeof(uint32)) - /* ACTUAL BLOCK DATA FOLLOWS AT END OF STRUCT */ -} BkpBlock; #endif /* XLOGRECORD_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 8d906967232..68f72cfac6d 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -11,7 +11,7 @@ #ifndef XLOG_UTILS_H #define XLOG_UTILS_H -#include "access/xlogrecord.h" +#include "access/xlogreader.h" #include "storage/bufmgr.h" @@ -33,26 +33,17 @@ typedef enum * replayed) */ } XLogRedoAction; -extern XLogRedoAction XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, - int block_index, RelFileNode rnode, BlockNumber blkno, - Buffer *buf); -extern XLogRedoAction XLogReadBufferForRedoExtended(XLogRecPtr lsn, - XLogRecord *record, int block_index, - RelFileNode rnode, ForkNumber forkno, - BlockNumber blkno, +extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, + uint8 buffer_id, Buffer *buf); +extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); +extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, + uint8 buffer_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf); -extern Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init); extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode); -extern Buffer RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, - int block_index, - bool get_cleanup_lock, bool keep_buffer); -extern Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, - char *blk, bool get_cleanup_lock, bool keep_buffer); - extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); |