aboutsummaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-03-18 20:12:58 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-03-18 20:50:44 +0200
commit40dae7ec537c5619fc93ad602c62f37be786d161 (patch)
tree933e82bd349f08206cf2f97841ccee2a80e06c91 /src/include
parentb6ec7c92ac7ab6223b3c45dc554efffd1953758f (diff)
downloadpostgresql-40dae7ec537c5619fc93ad602c62f37be786d161.tar.gz
postgresql-40dae7ec537c5619fc93ad602c62f37be786d161.zip
Make the handling of interrupted B-tree page splits more robust.
Splitting a page consists of two separate steps: splitting the child page, and inserting the downlink for the new right page to the parent. Previously, we handled the case that you crash in between those steps with a cleanup routine after the WAL recovery had finished, which finished the incomplete split. However, that doesn't help if the page split is interrupted but the database doesn't crash, so that you don't perform WAL recovery. That could happen for example if you run out of disk space. Remove the end-of-recovery cleanup step. Instead, when a page is split, the left page is marked with a new INCOMPLETE_SPLIT flag, and when the downlink is inserted to the parent, the flag is cleared again. If an insertion sees a page with the flag set, it knows that the split was interrupted for some reason, and inserts the missing downlink before proceeding. I used the same approach to fix GIN and GiST split algorithms earlier. This was the last WAL cleanup routine, so we could get rid of that whole machinery now, but I'll leave that for a separate patch. Reviewed by Peter Geoghegan.
Diffstat (limited to 'src/include')
-rw-r--r--src/include/access/nbtree.h26
-rw-r--r--src/include/access/rmgrlist.h2
-rw-r--r--src/include/access/xlog_internal.h2
3 files changed, 14 insertions, 16 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 779a42229b7..64c6982f50e 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -73,6 +73,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples */
+#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
/*
* The max allowed value of a cycle ID is a bit less than 64K. This is
@@ -178,6 +179,7 @@ typedef struct BTMetaPageData
#define P_ISHALFDEAD(opaque) ((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
+#define P_INCOMPLETE_SPLIT(opaque) ((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT)
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -253,7 +255,7 @@ typedef struct xl_btree_metadata
typedef struct xl_btree_insert
{
xl_btreetid target; /* inserted tuple id */
- /* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
+ /* BlockNumber finishes_split field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
} xl_btree_insert;
@@ -286,19 +288,18 @@ typedef struct xl_btree_split
OffsetNumber firstright; /* first item moved to right page */
/*
- * If level > 0, BlockIdData downlink follows. (We use BlockIdData rather
- * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
- * aligned.)
+ * In the _L variants, next are OffsetNumber newitemoff and the new item.
+ * (In the _R variants, the new item is one of the right page's tuples.)
+ * The new item, but not newitemoff, is suppressed if XLogInsert chooses
+ * to store the left page's whole page image.
*
* If level > 0, an IndexTuple representing the HIKEY of the left page
* follows. We don't need this on leaf pages, because it's the same as
* the leftmost key in the new right page. Also, it's suppressed if
* XLogInsert chooses to store the left page's whole page image.
*
- * In the _L variants, next are OffsetNumber newitemoff and the new item.
- * (In the _R variants, the new item is one of the right page's tuples.)
- * The new item, but not newitemoff, is suppressed if XLogInsert chooses
- * to store the left page's whole page image.
+ * If level > 0, BlockNumber of the page whose incomplete-split flag
+ * this insertion clears. (not aligned)
*
* Last are the right page's tuples in the form used by _bt_restore_page.
*/
@@ -642,8 +643,7 @@ extern Datum btoptions(PG_FUNCTION_ARGS);
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
-extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
- BTStack stack, bool is_root, bool is_only);
+extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
/*
* prototypes for functions in nbtpage.c
@@ -673,7 +673,8 @@ extern BTStack _bt_search(Relation rel,
int keysz, ScanKey scankey, bool nextkey,
Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
- ScanKey scankey, bool nextkey, int access);
+ ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
+ int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
@@ -722,8 +723,5 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
*/
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
-extern void btree_xlog_startup(void);
-extern void btree_xlog_cleanup(void);
-extern bool btree_safe_restartpoint(void);
#endif /* NBTREE_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index c968101e8a2..d9ee57d6da2 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, NULL, NULL, NULL)
PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, NULL, NULL, NULL)
PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL)
PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, NULL, NULL, NULL)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, NULL, NULL, NULL)
PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, NULL, NULL, NULL)
PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, NULL)
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 297e4505db9..ec021fe29ae 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -55,7 +55,7 @@ typedef struct BkpBlock
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD07C /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD07D /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{