aboutsummaryrefslogtreecommitdiff
path: root/src/include/access/spgist_private.h
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-11-20 17:56:26 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-11-20 18:46:41 +0200
commit2c03216d831160bedd72d45f712601b6f7d03f1c (patch)
treeab6a03d031ffa605d848b0b7067add15e56e2207 /src/include/access/spgist_private.h
parent8dc626defec23016dd5988208d8704b858b9d21d (diff)
downloadpostgresql-2c03216d831160bedd72d45f712601b6f7d03f1c.tar.gz
postgresql-2c03216d831160bedd72d45f712601b6f7d03f1c.zip
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
Diffstat (limited to 'src/include/access/spgist_private.h')
-rw-r--r--src/include/access/spgist_private.h144
1 files changed, 64 insertions, 80 deletions
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h
index 3330644651c..4b6fdee8017 100644
--- a/src/include/access/spgist_private.h
+++ b/src/include/access/spgist_private.h
@@ -18,7 +18,6 @@
#include "access/spgist.h"
#include "nodes/tidbitmap.h"
#include "storage/buf.h"
-#include "storage/relfilenode.h"
#include "utils/relcache.h"
@@ -351,35 +350,8 @@ typedef SpGistDeadTupleData *SpGistDeadTuple;
/*
* XLOG stuff
- *
- * ACCEPT_RDATA_* can only use fixed-length rdata arrays, because of lengthof
*/
-#define ACCEPT_RDATA_DATA(p, s, i) \
- do { \
- Assert((i) < lengthof(rdata)); \
- rdata[i].data = (char *) (p); \
- rdata[i].len = (s); \
- rdata[i].buffer = InvalidBuffer; \
- rdata[i].buffer_std = true; \
- rdata[i].next = NULL; \
- if ((i) > 0) \
- rdata[(i) - 1].next = rdata + (i); \
- } while(0)
-
-#define ACCEPT_RDATA_BUFFER(b, i) \
- do { \
- Assert((i) < lengthof(rdata)); \
- rdata[i].data = NULL; \
- rdata[i].len = 0; \
- rdata[i].buffer = (b); \
- rdata[i].buffer_std = true; \
- rdata[i].next = NULL; \
- if ((i) > 0) \
- rdata[(i) - 1].next = rdata + (i); \
- } while(0)
-
-
/* XLOG record types for SPGiST */
#define XLOG_SPGIST_CREATE_INDEX 0x00
#define XLOG_SPGIST_ADD_LEAF 0x10
@@ -408,36 +380,36 @@ typedef struct spgxlogState
(d).isBuild = (s)->isBuild; \
} while(0)
-
+/*
+ * Backup Blk 0: destination page for leaf tuple
+ * Backup Blk 1: parent page (if any)
+ */
typedef struct spgxlogAddLeaf
{
- RelFileNode node;
-
- BlockNumber blknoLeaf; /* destination page for leaf tuple */
bool newPage; /* init dest page? */
bool storesNulls; /* page is in the nulls tree? */
OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */
OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */
- BlockNumber blknoParent; /* where the parent downlink is, if any */
- OffsetNumber offnumParent;
+ OffsetNumber offnumParent; /* where the parent downlink is, if any */
uint16 nodeI;
/* new leaf tuple follows (unaligned!) */
} spgxlogAddLeaf;
+/*
+ * Backup Blk 0: source leaf page
+ * Backup Blk 1: destination leaf page
+ * Backup Blk 2: parent page
+ */
typedef struct spgxlogMoveLeafs
{
- RelFileNode node;
-
- BlockNumber blknoSrc; /* source leaf page */
- BlockNumber blknoDst; /* destination leaf page */
uint16 nMoves; /* number of tuples moved from source page */
bool newPage; /* init dest page? */
bool replaceDead; /* are we replacing a DEAD source tuple? */
bool storesNulls; /* pages are in the nulls tree? */
- BlockNumber blknoParent; /* where the parent downlink is */
+ /* where the parent downlink is */
OffsetNumber offnumParent;
uint16 nodeI;
@@ -452,11 +424,6 @@ typedef struct spgxlogMoveLeafs
* Note: if replaceDead is true then there is only one inserted tuple
* number and only one leaf tuple in the data, because we are not copying
* the dead tuple from the source
- *
- * Buffer references in the rdata array are:
- * Src page
- * Dest page
- * Parent page
*----------
*/
OffsetNumber offsets[1];
@@ -464,21 +431,43 @@ typedef struct spgxlogMoveLeafs
#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets)
+/*
+ * Backup Blk 0: original page
+ * Backup Blk 1: where new tuple goes, if not same place
+ * Backup Blk 2: where parent downlink is, if updated and different from
+ * the old and new
+ */
typedef struct spgxlogAddNode
{
- RelFileNode node;
-
- BlockNumber blkno; /* block number of original inner tuple */
- OffsetNumber offnum; /* offset of original inner tuple */
-
- BlockNumber blknoParent; /* where parent downlink is, if updated */
- OffsetNumber offnumParent;
- uint16 nodeI;
+ /*
+ * Offset of the original inner tuple, in the original page (on backup
+ * block 0).
+ */
+ OffsetNumber offnum;
- BlockNumber blknoNew; /* where new tuple goes, if not same place */
+ /*
+ * Offset of the new tuple, on the new page (on backup block 1). Invalid,
+ * if we overwrote the old tuple in the original page).
+ */
OffsetNumber offnumNew;
bool newPage; /* init new page? */
+ /*----
+ * Where is the parent downlink? parentBlk indicates which page it's on,
+ * and offnumParent is the offset within the page. The possible values for
+ * parentBlk are:
+ *
+ * 0: parent == original page
+ * 1: parent == new page
+ * 2: parent == different page (blk ref 2)
+ * -1: parent not updated
+ *----
+ */
+ char parentBlk;
+ OffsetNumber offnumParent; /* offset within the parent page */
+
+ uint16 nodeI;
+
spgxlogState stateSrc;
/*
@@ -486,41 +475,51 @@ typedef struct spgxlogAddNode
*/
} spgxlogAddNode;
+/*
+ * Backup Blk 0: where the prefix tuple goes
+ * Backup Blk 1: where the postfix tuple goes (if different page)
+ */
typedef struct spgxlogSplitTuple
{
- RelFileNode node;
-
- BlockNumber blknoPrefix; /* where the prefix tuple goes */
+ /* where the prefix tuple goes */
OffsetNumber offnumPrefix;
- BlockNumber blknoPostfix; /* where the postfix tuple goes */
+ /* where the postfix tuple goes */
OffsetNumber offnumPostfix;
bool newPage; /* need to init that page? */
+ bool postfixBlkSame; /* was postfix tuple put on same page as
+ * prefix? */
/*
- * new prefix inner tuple follows, then new postfix inner tuple
- * (both are unaligned!)
+ * new prefix inner tuple follows, then new postfix inner tuple (both are
+ * unaligned!)
*/
} spgxlogSplitTuple;
+/*
+ * Buffer references in the rdata array are:
+ * Backup Blk 0: Src page (only if not root)
+ * Backup Blk 1: Dest page (if used)
+ * Backup Blk 2: Inner page
+ * Backup Blk 3: Parent page (if any, and different from Inner)
+ */
typedef struct spgxlogPickSplit
{
- RelFileNode node;
+ bool isRootSplit;
- BlockNumber blknoSrc; /* original leaf page */
- BlockNumber blknoDest; /* other leaf page, if any */
uint16 nDelete; /* n to delete from Src */
uint16 nInsert; /* n to insert on Src and/or Dest */
bool initSrc; /* re-init the Src page? */
bool initDest; /* re-init the Dest page? */
- BlockNumber blknoInner; /* where to put new inner tuple */
+ /* where to put new inner tuple */
OffsetNumber offnumInner;
bool initInner; /* re-init the Inner page? */
bool storesNulls; /* pages are in the nulls tree? */
- BlockNumber blknoParent; /* where the parent downlink is, if any */
+ /* where the parent downlink is, if any */
+ bool innerIsParent; /* is parent the same as inner page? */
OffsetNumber offnumParent;
uint16 nodeI;
@@ -533,24 +532,15 @@ typedef struct spgxlogPickSplit
* array of page selector bytes for inserted tuples, length nInsert
* new inner tuple (unaligned!)
* list of leaf tuples, length nInsert (unaligned!)
- *
- * Buffer references in the rdata array are:
- * Src page (only if not root and not being init'd)
- * Dest page (if used and not being init'd)
- * Inner page (only if not being init'd)
- * Parent page (if any; could be same as Inner)
*----------
*/
- OffsetNumber offsets[1];
+ OffsetNumber offsets[1];
} spgxlogPickSplit;
#define SizeOfSpgxlogPickSplit offsetof(spgxlogPickSplit, offsets)
typedef struct spgxlogVacuumLeaf
{
- RelFileNode node;
-
- BlockNumber blkno; /* block number to clean */
uint16 nDead; /* number of tuples to become DEAD */
uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */
uint16 nMove; /* number of tuples to move */
@@ -576,9 +566,6 @@ typedef struct spgxlogVacuumLeaf
typedef struct spgxlogVacuumRoot
{
/* vacuum a root page when it is also a leaf */
- RelFileNode node;
-
- BlockNumber blkno; /* block number to clean */
uint16 nDelete; /* number of tuples to delete */
spgxlogState stateSrc;
@@ -591,9 +578,6 @@ typedef struct spgxlogVacuumRoot
typedef struct spgxlogVacuumRedirect
{
- RelFileNode node;
-
- BlockNumber blkno; /* block number to clean */
uint16 nToPlaceholder; /* number of redirects to make placeholders */
OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */
TransactionId newestRedirectXid; /* newest XID of removed redirects */