Diffstat (limited to 'src/backend/access')
-rw-r--r--  src/backend/access/gist/gistbuild.c    | 121
-rw-r--r--  src/backend/access/heap/rewriteheap.c  |  72
-rw-r--r--  src/backend/access/nbtree/nbtree.c     |  33
-rw-r--r--  src/backend/access/nbtree/nbtsort.c    | 135
-rw-r--r--  src/backend/access/spgist/spginsert.c  |  49
5 files changed, 112 insertions(+), 298 deletions(-)
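
All five files converted here follow the same lifecycle of the new bulk write facility (storage/bulk_write.h): start a bulk write on a relation fork, ask the writer for page-sized buffers, hand each completed page back with its block number, and finish. A minimal sketch of that lifecycle, assuming only the bulk_write.h calls that appear in the hunks below; the relation, page count, and fill_page callback are illustrative:

#include "postgres.h"

#include "storage/bulk_write.h"
#include "utils/rel.h"

/*
 * Sketch only: write 'npages' pages of 'rel' through the bulk writer, the
 * way the index builds below do.  'fill_page' is a hypothetical callback
 * standing in for gistinitpage()/_bt_pageinit()/PageInit() plus tuple loading.
 */
static void
bulk_write_sketch(Relation rel, BlockNumber npages,
                  void (*fill_page) (Page page, BlockNumber blkno))
{
    BulkWriteState *bulkstate;

    /* Start a bulk write; WAL-logging and the final sync happen inside. */
    bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);

    for (BlockNumber blkno = 0; blkno < npages; blkno++)
    {
        /* A page-sized buffer allocated (and later freed) by the bulk writer. */
        BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

        fill_page((Page) buf, blkno);

        /*
         * Hand the completed page back.  The last argument says the page
         * uses the standard layout, so it can be WAL-logged with a "hole".
         * After this call the buffer belongs to the bulk writer; the caller
         * must not touch or pfree it.
         */
        smgr_bulk_write(bulkstate, blkno, buf, true);
    }

    /* Flush pending pages, WAL-log them if required, and sync the fork. */
    smgr_bulk_finish(bulkstate);
}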
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 08555b97f92..465246173ba 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -43,7 +43,8 @@
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
-#include "storage/smgr.h"
+#include "storage/bulk_write.h"
+
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"
@@ -106,11 +107,8 @@ typedef struct
Tuplesortstate *sortstate; /* state data for tuplesort.c */
BlockNumber pages_allocated;
- BlockNumber pages_written;
- int ready_num_pages;
- BlockNumber ready_blknos[XLR_MAX_BLOCK_ID];
- Page ready_pages[XLR_MAX_BLOCK_ID];
+ BulkWriteState *bulkstate;
} GISTBuildState;
#define GIST_SORTED_BUILD_PAGE_NUM 4
@@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
IndexTuple itup);
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
GistSortedBuildLevelState *levelstate);
-static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
static void gistInitBuffering(GISTBuildState *buildstate);
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
@@ -405,27 +402,18 @@ gist_indexsortbuild(GISTBuildState *state)
{
IndexTuple itup;
GistSortedBuildLevelState *levelstate;
- Page page;
+ BulkWriteBuffer rootbuf;
- state->pages_allocated = 0;
- state->pages_written = 0;
- state->ready_num_pages = 0;
+ /* Reserve block 0 for the root page */
+ state->pages_allocated = 1;
- /*
- * Write an empty page as a placeholder for the root page. It will be
- * replaced with the real root page at the end.
- */
- page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
- smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
- page, true);
- state->pages_allocated++;
- state->pages_written++;
+ state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);
/* Allocate a temporary buffer for the first leaf page batch. */
levelstate = palloc0(sizeof(GistSortedBuildLevelState));
- levelstate->pages[0] = page;
+ levelstate->pages[0] = palloc(BLCKSZ);
levelstate->parent = NULL;
- gistinitpage(page, F_LEAF);
+ gistinitpage(levelstate->pages[0], F_LEAF);
/*
* Fill index pages with tuples in the sorted order.
@@ -455,31 +443,15 @@ gist_indexsortbuild(GISTBuildState *state)
levelstate = parent;
}
- gist_indexsortbuild_flush_ready_pages(state);
-
/* Write out the root */
PageSetLSN(levelstate->pages[0], GistBuildLSN);
- PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
- smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
- levelstate->pages[0], true);
- if (RelationNeedsWAL(state->indexrel))
- log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO,
- levelstate->pages[0], true);
-
- pfree(levelstate->pages[0]);
+ rootbuf = smgr_bulk_get_buf(state->bulkstate);
+ memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
+ smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
+
pfree(levelstate);
- /*
- * When we WAL-logged index pages, we must nonetheless fsync index files.
- * Since we're building outside shared buffers, a CHECKPOINT occurring
- * during the build has no way to flush the previously written data to
- * disk (indeed it won't know the index even exists). A crash later on
- * would replay WAL from the checkpoint, therefore it wouldn't replay our
- * earlier WAL entries. If we do not fsync those pages here, they might
- * still not be on disk when the crash occurs.
- */
- if (RelationNeedsWAL(state->indexrel))
- smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM);
+ smgr_bulk_finish(state->bulkstate);
}
/*
@@ -509,8 +481,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
levelstate->current_page++;
if (levelstate->pages[levelstate->current_page] == NULL)
- levelstate->pages[levelstate->current_page] =
- palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);
newPage = levelstate->pages[levelstate->current_page];
gistinitpage(newPage, old_page_flags);
@@ -573,6 +544,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
for (; dist != NULL; dist = dist->next)
{
char *data;
+ BulkWriteBuffer buf;
Page target;
/* check once per page */
@@ -580,7 +552,8 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
/* Create page and copy data */
data = (char *) (dist->list);
- target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
+ buf = smgr_bulk_get_buf(state->bulkstate);
+ target = (Page) buf;
gistinitpage(target, isleaf ? F_LEAF : 0);
for (int i = 0; i < dist->block.num; i++)
{
@@ -593,20 +566,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
}
union_tuple = dist->itup;
- if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
- gist_indexsortbuild_flush_ready_pages(state);
-
- /*
- * The page is now complete. Assign a block number to it, and add it
- * to the list of finished pages. (We don't write it out immediately,
- * because we want to WAL-log the pages in batches.)
- */
- blkno = state->pages_allocated++;
- state->ready_blknos[state->ready_num_pages] = blkno;
- state->ready_pages[state->ready_num_pages] = target;
- state->ready_num_pages++;
- ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
-
/*
* Set the right link to point to the previous page. This is just for
* debugging purposes: GiST only follows the right link if a page is
@@ -621,6 +580,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
*/
if (levelstate->last_blkno)
GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
+
+ /*
+ * The page is now complete. Assign a block number to it, and pass it
+ * to the bulk writer.
+ */
+ blkno = state->pages_allocated++;
+ PageSetLSN(target, GistBuildLSN);
+ smgr_bulk_write(state->bulkstate, blkno, buf, true);
+ ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
levelstate->last_blkno = blkno;
/*
@@ -631,7 +599,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
if (parent == NULL)
{
parent = palloc0(sizeof(GistSortedBuildLevelState));
- parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ parent->pages[0] = palloc(BLCKSZ);
parent->parent = NULL;
gistinitpage(parent->pages[0], 0);
@@ -641,39 +609,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
}
}
-static void
-gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
-{
- if (state->ready_num_pages == 0)
- return;
-
- for (int i = 0; i < state->ready_num_pages; i++)
- {
- Page page = state->ready_pages[i];
- BlockNumber blkno = state->ready_blknos[i];
-
- /* Currently, the blocks must be buffered in order. */
- if (blkno != state->pages_written)
- elog(ERROR, "unexpected block number to flush GiST sorting build");
-
- PageSetLSN(page, GistBuildLSN);
- PageSetChecksumInplace(page, blkno);
- smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
- true);
-
- state->pages_written++;
- }
-
- if (RelationNeedsWAL(state->indexrel))
- log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages,
- state->ready_blknos, state->ready_pages, true);
-
- for (int i = 0; i < state->ready_num_pages; i++)
- pfree(state->ready_pages[i]);
-
- state->ready_num_pages = 0;
-}
-
/*-------------------------------------------------------------------------
* Routines for non-sorted build
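
Two ways of filling a bulk buffer appear in the gistbuild.c hunks: the flush path initializes the page directly in the buffer returned by smgr_bulk_get_buf(), while the root page is still assembled in an ordinary palloc'd page and only memcpy'd into a bulk buffer at the end (block 0 is reserved up front). A condensed sketch of both, assuming the GiST helpers used above; the function names and the pages_allocated counter are illustrative:

#include "postgres.h"

#include "access/gist_private.h"
#include "storage/bulk_write.h"

/* Fill a bulk buffer in place, as gist_indexsortbuild_levelstate_flush() does. */
static BlockNumber
write_leaf_page_sketch(BulkWriteState *bulkstate, BlockNumber *pages_allocated)
{
    BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);
    Page        target = (Page) buf;
    BlockNumber blkno;

    gistinitpage(target, F_LEAF);
    /* ... place the page's index tuples on 'target' here ... */

    blkno = (*pages_allocated)++;       /* block 0 stays reserved for the root */
    PageSetLSN(target, GistBuildLSN);
    smgr_bulk_write(bulkstate, blkno, buf, true);
    return blkno;
}

/* Copy a separately built page into a bulk buffer, as done for the root page. */
static void
write_root_page_sketch(BulkWriteState *bulkstate, Page rootpage)
{
    BulkWriteBuffer rootbuf = smgr_bulk_get_buf(bulkstate);

    PageSetLSN(rootpage, GistBuildLSN);
    memcpy(rootbuf, rootpage, BLCKSZ);
    smgr_bulk_write(bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
}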
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 34107323ffe..a578b876174 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -87,8 +87,8 @@
* is optimized for bulk inserting a lot of tuples, knowing that we have
* exclusive access to the heap. raw_heap_insert builds new pages in
* local storage. When a page is full, or at the end of the process,
- * we insert it to WAL as a single record and then write it to disk
- * directly through smgr. Note, however, that any data sent to the new
+ * we insert it to WAL as a single record and then write it to disk with
+ * the bulk smgr writer. Note, however, that any data sent to the new
* heap's TOAST table will go through the normal bufmgr.
*
*
@@ -119,9 +119,9 @@
#include "replication/logical.h"
#include "replication/slot.h"
#include "storage/bufmgr.h"
+#include "storage/bulk_write.h"
#include "storage/fd.h"
#include "storage/procarray.h"
-#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -133,9 +133,9 @@ typedef struct RewriteStateData
{
Relation rs_old_rel; /* source heap */
Relation rs_new_rel; /* destination heap */
- Page rs_buffer; /* page currently being built */
+ BulkWriteState *rs_bulkstate; /* writer for the destination */
+ BulkWriteBuffer rs_buffer; /* page currently being built */
BlockNumber rs_blockno; /* block where page will go */
- bool rs_buffer_valid; /* T if any tuples in buffer */
bool rs_logical_rewrite; /* do we need to do logical rewriting */
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine
* tuple visibility */
@@ -255,14 +255,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
state->rs_old_rel = old_heap;
state->rs_new_rel = new_heap;
- state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ state->rs_buffer = NULL;
/* new_heap needn't be empty, just locked */
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
- state->rs_buffer_valid = false;
state->rs_oldest_xmin = oldest_xmin;
state->rs_freeze_xid = freeze_xid;
state->rs_cutoff_multi = cutoff_multi;
state->rs_cxt = rw_cxt;
+ state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM);
/* Initialize hash tables used to track update chains */
hash_ctl.keysize = sizeof(TidHashKey);
@@ -314,30 +314,13 @@ end_heap_rewrite(RewriteState state)
}
/* Write the last page, if any */
- if (state->rs_buffer_valid)
+ if (state->rs_buffer)
{
- if (RelationNeedsWAL(state->rs_new_rel))
- log_newpage(&state->rs_new_rel->rd_locator,
- MAIN_FORKNUM,
- state->rs_blockno,
- state->rs_buffer,
- true);
-
- PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
-
- smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
- state->rs_blockno, state->rs_buffer, true);
+ smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+ state->rs_buffer = NULL;
}
- /*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
- * reason is the same as in storage.c's RelationCopyStorage(): we're
- * writing data that's not in shared buffers, and so a CHECKPOINT
- * occurring during the rewriteheap operation won't have fsync'd data we
- * wrote before the checkpoint.
- */
- if (RelationNeedsWAL(state->rs_new_rel))
- smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+ smgr_bulk_finish(state->rs_bulkstate);
logical_end_heap_rewrite(state);
@@ -611,7 +594,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
static void
raw_heap_insert(RewriteState state, HeapTuple tup)
{
- Page page = state->rs_buffer;
+ Page page;
Size pageFreeSpace,
saveFreeSpace;
Size len;
@@ -664,7 +647,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
HEAP_DEFAULT_FILLFACTOR);
/* Now we can check to see if there's enough free space already. */
- if (state->rs_buffer_valid)
+ page = (Page) state->rs_buffer;
+ if (page)
{
pageFreeSpace = PageGetHeapFreeSpace(page);
@@ -675,35 +659,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
* contains a tuple. Hence, unlike RelationGetBufferForTuple(),
* enforce saveFreeSpace unconditionally.
*/
-
- /* XLOG stuff */
- if (RelationNeedsWAL(state->rs_new_rel))
- log_newpage(&state->rs_new_rel->rd_locator,
- MAIN_FORKNUM,
- state->rs_blockno,
- page,
- true);
-
- /*
- * Now write the page. We say skipFsync = true because there's no
- * need for smgr to schedule an fsync for this write; we'll do it
- * ourselves in end_heap_rewrite.
- */
- PageSetChecksumInplace(page, state->rs_blockno);
-
- smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
- state->rs_blockno, page, true);
-
+ smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+ state->rs_buffer = NULL;
+ page = NULL;
state->rs_blockno++;
- state->rs_buffer_valid = false;
}
}
- if (!state->rs_buffer_valid)
+ if (!page)
{
/* Initialize a new empty page */
+ state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate);
+ page = (Page) state->rs_buffer;
PageInit(page, BLCKSZ, 0);
- state->rs_buffer_valid = true;
}
/* And now we can insert the tuple into the page */
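
The rewriteheap.c hunks replace the rs_buffer/rs_buffer_valid pair with a single BulkWriteBuffer pointer that doubles as the "valid" flag: NULL means no page is under construction. A minimal sketch of that lazy-buffer pattern with hypothetical state and helper names (the real logic lives in raw_heap_insert() and end_heap_rewrite()):

#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"

typedef struct
{
    BulkWriteState *bulkstate;
    BulkWriteBuffer cur_buf;        /* page being filled, or NULL */
    BlockNumber     cur_blkno;      /* block number that page will get */
} PageFillerSketch;

/* Return a page with at least 'needed' bytes of free space. */
static Page
page_filler_get_page(PageFillerSketch *filler, Size needed)
{
    Page        page = (Page) filler->cur_buf;

    /* Ship the current page off when it cannot hold the next tuple. */
    if (page != NULL && PageGetHeapFreeSpace(page) < needed)
    {
        smgr_bulk_write(filler->bulkstate, filler->cur_blkno, filler->cur_buf, true);
        filler->cur_buf = NULL;         /* the bulk writer owns that buffer now */
        filler->cur_blkno++;
        page = NULL;
    }

    /* Start a fresh page only when something is about to be placed on it. */
    if (page == NULL)
    {
        filler->cur_buf = smgr_bulk_get_buf(filler->bulkstate);
        page = (Page) filler->cur_buf;
        PageInit(page, BLCKSZ, 0);
    }
    return page;                        /* caller does PageAddItem() etc. */
}

/* At the end, flush the last partial page, then finish the bulk write. */
static void
page_filler_finish(PageFillerSketch *filler)
{
    if (filler->cur_buf)
        smgr_bulk_write(filler->bulkstate, filler->cur_blkno, filler->cur_buf, true);
    smgr_bulk_finish(filler->bulkstate);
}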
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 696d79c0852..21d879a3bdf 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -29,11 +29,11 @@
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
+#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
-#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
@@ -154,32 +154,17 @@ void
btbuildempty(Relation index)
{
bool allequalimage = _bt_allequalimage(index, false);
- Buffer metabuf;
- Page metapage;
+ BulkWriteState *bulkstate;
+ BulkWriteBuffer metabuf;
- /*
- * Initialize the metapage.
- *
- * Regular index build bypasses the buffer manager and uses smgr functions
- * directly, with an smgrimmedsync() call at the end. That makes sense
- * when the index is large, but for an empty index, it's better to use the
- * buffer cache to avoid the smgrimmedsync().
- */
- metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
- _bt_lockbuf(index, metabuf, BT_WRITE);
-
- START_CRIT_SECTION();
-
- metapage = BufferGetPage(metabuf);
- _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
- MarkBufferDirty(metabuf);
- log_newpage_buffer(metabuf, true);
+ bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
- END_CRIT_SECTION();
+ /* Construct metapage. */
+ metabuf = smgr_bulk_get_buf(bulkstate);
+ _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
+ smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
- _bt_unlockbuf(index, metabuf);
- ReleaseBuffer(metabuf);
+ smgr_bulk_finish(bulkstate);
}
/*
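
btbuildempty() now builds the metapage in private memory and lets the bulk writer take care of WAL-logging and syncing, instead of going through shared buffers with a critical section and buffer locks. A minimal sketch of that shape, assuming the nbtree helpers used in the hunk above; allequalimage is hard-coded here where the real code calls _bt_allequalimage():

#include "postgres.h"

#include "access/nbtree.h"
#include "storage/bulk_write.h"

/* Sketch of the new btbuildempty() shape: one metapage in the init fork. */
static void
build_empty_init_fork_sketch(Relation index)
{
    BulkWriteState *bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
    BulkWriteBuffer metabuf = smgr_bulk_get_buf(bulkstate);

    /* No buffer locks or critical section: the page lives in local memory. */
    _bt_initmetapage((Page) metabuf, P_NONE, 0, true /* allequalimage */ );
    smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);

    /* WAL-logs and syncs the page as needed before returning. */
    smgr_bulk_finish(bulkstate);
}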
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 20111965793..3f1e7b9c155 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -23,13 +23,8 @@
* many upper pages if the keys are reasonable-size) without risking a lot of
* cascading splits during early insertions.
*
- * Formerly the index pages being built were kept in shared buffers, but
- * that is of no value (since other backends have no interest in them yet)
- * and it created locking problems for CHECKPOINT, because the upper-level
- * pages were held exclusive-locked for long periods. Now we just build
- * the pages in local memory and smgrwrite or smgrextend them as we finish
- * them. They will need to be re-read into shared buffers on first use after
- * the build finishes.
+ * We use the bulk smgr loading facility to bypass the buffer cache and
+ * WAL-log the pages efficiently.
*
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
@@ -57,7 +52,7 @@
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
-#include "storage/smgr.h"
+#include "storage/bulk_write.h"
#include "tcop/tcopprot.h" /* pgrminclude ignore */
#include "utils/rel.h"
#include "utils/sortsupport.h"
@@ -234,7 +229,7 @@ typedef struct BTBuildState
*/
typedef struct BTPageState
{
- Page btps_page; /* workspace for page building */
+ BulkWriteBuffer btps_buf; /* workspace for page building */
BlockNumber btps_blkno; /* block # to write this page at */
IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
OffsetNumber btps_lastoff; /* last item offset loaded */
@@ -251,11 +246,9 @@ typedef struct BTWriteState
{
Relation heap;
Relation index;
+ BulkWriteState *bulkstate;
BTScanInsert inskey; /* generic insertion scankey */
- bool btws_use_wal; /* dump pages to WAL? */
BlockNumber btws_pages_alloced; /* # pages allocated */
- BlockNumber btws_pages_written; /* # pages written out */
- Page btws_zeropage; /* workspace for filling zeroes */
} BTWriteState;
@@ -267,7 +260,7 @@ static void _bt_spool(BTSpool *btspool, ItemPointer self,
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state);
-static Page _bt_blnewpage(uint32 level);
+static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page rightmostpage);
static void _bt_sortaddtup(Page page, Size itemsize,
@@ -569,12 +562,9 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
/* _bt_mkscankey() won't set allequalimage without metapage */
wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
- wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
- wstate.btws_pages_written = 0;
- wstate.btws_zeropage = NULL; /* until needed */
pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
PROGRESS_BTREE_PHASE_LEAF_LOAD);
@@ -613,13 +603,15 @@ _bt_build_callback(Relation index,
/*
* allocate workspace for a new, clean btree page, not linked to any siblings.
*/
-static Page
-_bt_blnewpage(uint32 level)
+static BulkWriteBuffer
+_bt_blnewpage(BTWriteState *wstate, uint32 level)
{
+ BulkWriteBuffer buf;
Page page;
BTPageOpaque opaque;
- page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ buf = smgr_bulk_get_buf(wstate->bulkstate);
+ page = (Page) buf;
/* Zero the page and set up standard page header info */
_bt_pageinit(page, BLCKSZ);
@@ -634,63 +626,17 @@ _bt_blnewpage(uint32 level)
/* Make the P_HIKEY line pointer appear allocated */
((PageHeader) page)->pd_lower += sizeof(ItemIdData);
- return page;
+ return buf;
}
/*
* emit a completed btree page, and release the working storage.
*/
static void
-_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
+_bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
{
- /* XLOG stuff */
- if (wstate->btws_use_wal)
- {
- /* We use the XLOG_FPI record type for this */
- log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true);
- }
-
- /*
- * If we have to write pages nonsequentially, fill in the space with
- * zeroes until we come back and overwrite. This is not logically
- * necessary on standard Unix filesystems (unwritten space will read as
- * zeroes anyway), but it should help to avoid fragmentation. The dummy
- * pages aren't WAL-logged though.
- */
- while (blkno > wstate->btws_pages_written)
- {
- if (!wstate->btws_zeropage)
- wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
- PG_IO_ALIGN_SIZE,
- MCXT_ALLOC_ZERO);
- /* don't set checksum for all-zero page */
- smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
- wstate->btws_pages_written++,
- wstate->btws_zeropage,
- true);
- }
-
- PageSetChecksumInplace(page, blkno);
-
- /*
- * Now write the page. There's no need for smgr to schedule an fsync for
- * this write; we'll do it ourselves before ending the build.
- */
- if (blkno == wstate->btws_pages_written)
- {
- /* extending the file... */
- smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
- page, true);
- wstate->btws_pages_written++;
- }
- else
- {
- /* overwriting a block we zero-filled before */
- smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
- page, true);
- }
-
- pfree(page);
+ smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
+ /* smgr_bulk_write took ownership of 'buf' */
}
/*
@@ -703,7 +649,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level)
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
/* create initial page for level */
- state->btps_page = _bt_blnewpage(level);
+ state->btps_buf = _bt_blnewpage(wstate, level);
/* and assign it a page position */
state->btps_blkno = wstate->btws_pages_alloced++;
@@ -839,6 +785,7 @@ static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Size truncextra)
{
+ BulkWriteBuffer nbuf;
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
@@ -853,7 +800,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
*/
CHECK_FOR_INTERRUPTS();
- npage = state->btps_page;
+ nbuf = state->btps_buf;
+ npage = (Page) nbuf;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
last_truncextra = state->btps_lastextra;
@@ -909,6 +857,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
/*
* Finish off the page and write it out.
*/
+ BulkWriteBuffer obuf = nbuf;
Page opage = npage;
BlockNumber oblkno = nblkno;
ItemId ii;
@@ -916,7 +865,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
IndexTuple oitup;
/* Create new page of same level */
- npage = _bt_blnewpage(state->btps_level);
+ nbuf = _bt_blnewpage(wstate, state->btps_level);
+ npage = (Page) nbuf;
/* and assign it a page position */
nblkno = wstate->btws_pages_alloced++;
@@ -1028,10 +978,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
}
/*
- * Write out the old page. We never need to touch it again, so we can
- * free the opage workspace too.
+ * Write out the old page. _bt_blwritepage takes ownership of the
+ * 'opage' buffer.
*/
- _bt_blwritepage(wstate, opage, oblkno);
+ _bt_blwritepage(wstate, obuf, oblkno);
/*
* Reset last_off to point to new page
@@ -1064,7 +1014,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
_bt_sortaddtup(npage, itupsz, itup, last_off,
!isleaf && last_off == P_FIRSTKEY);
- state->btps_page = npage;
+ state->btps_buf = nbuf;
state->btps_blkno = nblkno;
state->btps_lastoff = last_off;
}
@@ -1116,7 +1066,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
BTPageState *s;
BlockNumber rootblkno = P_NONE;
uint32 rootlevel = 0;
- Page metapage;
+ BulkWriteBuffer metabuf;
/*
* Each iteration of this loop completes one more level of the tree.
@@ -1127,7 +1077,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
BTPageOpaque opaque;
blkno = s->btps_blkno;
- opaque = BTPageGetOpaque(s->btps_page);
+ opaque = BTPageGetOpaque((Page) s->btps_buf);
/*
* We have to link the last page on this level to somewhere.
@@ -1161,9 +1111,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* This is the rightmost page, so the ItemId array needs to be slid
* back one slot. Then we can dump out the page.
*/
- _bt_slideleft(s->btps_page);
- _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
- s->btps_page = NULL; /* writepage freed the workspace */
+ _bt_slideleft((Page) s->btps_buf);
+ _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
+ s->btps_buf = NULL; /* writepage took ownership of the buffer */
}
/*
@@ -1172,10 +1122,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
- metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
- _bt_initmetapage(metapage, rootblkno, rootlevel,
+ metabuf = smgr_bulk_get_buf(wstate->bulkstate);
+ _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
wstate->inskey->allequalimage);
- _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
+ _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
}
/*
@@ -1197,6 +1147,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
int64 tuples_done = 0;
bool deduplicate;
+ wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
+
deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
BTGetDeduplicateItems(wstate->index);
@@ -1352,7 +1304,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
*/
dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
sizeof(ItemIdData);
- Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
+ Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
dstate->maxpostingsize <= INDEX_SIZE_MASK);
dstate->htids = palloc(dstate->maxpostingsize);
@@ -1422,18 +1374,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
/* Close down final pages and write the metapage */
_bt_uppershutdown(wstate, state);
-
- /*
- * When we WAL-logged index pages, we must nonetheless fsync index files.
- * Since we're building outside shared buffers, a CHECKPOINT occurring
- * during the build has no way to flush the previously written data to
- * disk (indeed it won't know the index even exists). A crash later on
- * would replay WAL from the checkpoint, therefore it wouldn't replay our
- * earlier WAL entries. If we do not fsync those pages here, they might
- * still not be on disk when the crash occurs.
- */
- if (wstate->btws_use_wal)
- smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM);
+ smgr_bulk_finish(wstate->bulkstate);
}
/*
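
In nbtsort.c the explicit zero-page padding and the smgrextend/smgrwrite distinction are gone: _bt_blwritepage() is now a plain hand-off, and the metapage (BTREE_METAPAGE, block 0) is written last even though data pages were assigned block numbers starting at BTREE_METAPAGE + 1. A sketch of that ordering and of the ownership rule the new comments state; fill_data()/fill_meta() stand in for the real nbtree page-building code:

#include "postgres.h"

#include "storage/bulk_write.h"
#include "utils/rel.h"

/*
 * Sketch: data pages get block numbers after the reserved metapage and are
 * handed to the bulk writer as they fill; the metapage itself is written
 * only once the root is known.
 */
static void
deferred_metapage_sketch(Relation index, BlockNumber ndatapages,
                         void (*fill_data) (Page page, BlockNumber blkno),
                         void (*fill_meta) (Page page))
{
    BulkWriteState *bulkstate = smgr_bulk_start_rel(index, MAIN_FORKNUM);
    BlockNumber next_blkno = 1;         /* block 0 is reserved for the metapage */
    BulkWriteBuffer buf;

    for (BlockNumber i = 0; i < ndatapages; i++)
    {
        buf = smgr_bulk_get_buf(bulkstate);
        fill_data((Page) buf, next_blkno);
        smgr_bulk_write(bulkstate, next_blkno, buf, true);
        next_blkno++;
        /* 'buf' now belongs to the bulk writer; never reuse or pfree it. */
    }

    /* Written last, out of block-number order; the bulk writer handles it. */
    buf = smgr_bulk_get_buf(bulkstate);
    fill_meta((Page) buf);
    smgr_bulk_write(bulkstate, 0, buf, true);

    smgr_bulk_finish(bulkstate);
}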
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index 98b1da20d58..1b70c5a59fd 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -25,7 +25,7 @@
#include "catalog/index.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
-#include "storage/smgr.h"
+#include "storage/bulk_write.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -155,42 +155,27 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
void
spgbuildempty(Relation index)
{
- Buffer metabuffer,
- rootbuffer,
- nullbuffer;
-
- /*
- * Initialize the meta page and root pages
- */
- metabuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
- rootbuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- LockBuffer(rootbuffer, BUFFER_LOCK_EXCLUSIVE);
- nullbuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- LockBuffer(nullbuffer, BUFFER_LOCK_EXCLUSIVE);
-
- Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
- Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO);
- Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO);
+ BulkWriteState *bulkstate;
+ BulkWriteBuffer buf;
- START_CRIT_SECTION();
+ bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
- SpGistInitMetapage(BufferGetPage(metabuffer));
- MarkBufferDirty(metabuffer);
- SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
- MarkBufferDirty(rootbuffer);
- SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS);
- MarkBufferDirty(nullbuffer);
+ /* Construct metapage. */
+ buf = smgr_bulk_get_buf(bulkstate);
+ SpGistInitMetapage((Page) buf);
+ smgr_bulk_write(bulkstate, SPGIST_METAPAGE_BLKNO, buf, true);
- log_newpage_buffer(metabuffer, true);
- log_newpage_buffer(rootbuffer, true);
- log_newpage_buffer(nullbuffer, true);
+ /* Likewise for the root page. */
+ buf = smgr_bulk_get_buf(bulkstate);
+ SpGistInitPage((Page) buf, SPGIST_LEAF);
+ smgr_bulk_write(bulkstate, SPGIST_ROOT_BLKNO, buf, true);
- END_CRIT_SECTION();
+ /* Likewise for the null-tuples root page. */
+ buf = smgr_bulk_get_buf(bulkstate);
+ SpGistInitPage((Page) buf, SPGIST_LEAF | SPGIST_NULLS);
+ smgr_bulk_write(bulkstate, SPGIST_NULL_BLKNO, buf, true);
- UnlockReleaseBuffer(metabuffer);
- UnlockReleaseBuffer(rootbuffer);
- UnlockReleaseBuffer(nullbuffer);
+ smgr_bulk_finish(bulkstate);
}
/*
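
spgbuildempty() follows the same pattern as btbuildempty(), just with three fixed pages. A table-driven sketch of that hunk, using the SP-GiST helpers and block constants visible above; the table and function names are illustrative:

#include "postgres.h"

#include "access/spgist_private.h"
#include "storage/bulk_write.h"

/* The three fixed pages an empty SP-GiST index needs, in block order. */
static const struct
{
    BlockNumber blkno;
    int         flags;          /* 0 means "metapage" in this sketch */
} spgist_empty_pages[] =
{
    {SPGIST_METAPAGE_BLKNO, 0},
    {SPGIST_ROOT_BLKNO, SPGIST_LEAF},
    {SPGIST_NULL_BLKNO, SPGIST_LEAF | SPGIST_NULLS},
};

/* Sketch of spgbuildempty() above, written as a loop over the table. */
static void
spg_build_empty_sketch(Relation index)
{
    BulkWriteState *bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);

    for (int i = 0; i < (int) lengthof(spgist_empty_pages); i++)
    {
        BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

        if (spgist_empty_pages[i].flags == 0)
            SpGistInitMetapage((Page) buf);
        else
            SpGistInitPage((Page) buf, spgist_empty_pages[i].flags);

        smgr_bulk_write(bulkstate, spgist_empty_pages[i].blkno, buf, true);
    }

    smgr_bulk_finish(bulkstate);
}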