Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/gist/gistbuild.c   | 121
-rw-r--r-- | src/backend/access/heap/rewriteheap.c |  72
-rw-r--r-- | src/backend/access/nbtree/nbtree.c    |  33
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c   | 135
-rw-r--r-- | src/backend/access/spgist/spginsert.c |  49
5 files changed, 112 insertions, 298 deletions
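
The hunks below replace the old pattern -- writing pages directly with smgrextend()/smgrwrite(), WAL-logging them with log_newpage()/log_newpages(), and fsyncing the relation with smgrimmedsync() -- with the new storage/bulk_write.h facility. As orientation, here is a minimal sketch of the new pattern, using only the calls that appear in the diff (smgr_bulk_start_rel, smgr_bulk_get_buf, smgr_bulk_write, smgr_bulk_finish); demo_build_empty_fork() and its details are illustrative assumptions, not part of the patch.

/*
 * Minimal sketch of the bulk-write pattern the hunks below convert to,
 * assuming the storage/bulk_write.h API exactly as it is used in them.
 * demo_build_empty_fork() is a hypothetical example, not part of the patch.
 */
#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
demo_build_empty_fork(Relation rel)
{
	BulkWriteState *bulkstate;
	BulkWriteBuffer buf;

	/* Start a bulk-write operation on the relation's init fork. */
	bulkstate = smgr_bulk_start_rel(rel, INIT_FORKNUM);

	/* Get an empty, page-sized buffer from the bulk writer. */
	buf = smgr_bulk_get_buf(bulkstate);
	PageInit((Page) buf, BLCKSZ, 0);

	/*
	 * Hand the page over.  The bulk writer takes ownership of 'buf' and,
	 * judging by the code removed below, takes care of WAL-logging and
	 * checksumming; the last argument marks the page as standard layout.
	 */
	smgr_bulk_write(bulkstate, 0, buf, true);

	/*
	 * Finish the operation.  This replaces the explicit smgrimmedsync()
	 * the old code needed because it writes outside shared buffers.
	 */
	smgr_bulk_finish(bulkstate);
}

For the empty-index cases (btbuildempty, spgbuildempty) this is essentially the whole flow; the sorted builds below additionally track the block numbers they assign before handing each page to smgr_bulk_write().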
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 08555b97f92..465246173ba 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,7 +43,8 @@ #include "miscadmin.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/tuplesort.h" @@ -106,11 +107,8 @@ typedef struct Tuplesortstate *sortstate; /* state data for tuplesort.c */ BlockNumber pages_allocated; - BlockNumber pages_written; - int ready_num_pages; - BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; - Page ready_pages[XLR_MAX_BLOCK_ID]; + BulkWriteState *bulkstate; } GISTBuildState; #define GIST_SORTED_BUILD_PAGE_NUM 4 @@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, IndexTuple itup); static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate); -static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); static void gistInitBuffering(GISTBuildState *buildstate); static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); @@ -405,27 +402,18 @@ gist_indexsortbuild(GISTBuildState *state) { IndexTuple itup; GistSortedBuildLevelState *levelstate; - Page page; + BulkWriteBuffer rootbuf; - state->pages_allocated = 0; - state->pages_written = 0; - state->ready_num_pages = 0; + /* Reserve block 0 for the root page */ + state->pages_allocated = 1; - /* - * Write an empty page as a placeholder for the root page. It will be - * replaced with the real root page at the end. - */ - page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - page, true); - state->pages_allocated++; - state->pages_written++; + state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM); /* Allocate a temporary buffer for the first leaf page batch. */ levelstate = palloc0(sizeof(GistSortedBuildLevelState)); - levelstate->pages[0] = page; + levelstate->pages[0] = palloc(BLCKSZ); levelstate->parent = NULL; - gistinitpage(page, F_LEAF); + gistinitpage(levelstate->pages[0], F_LEAF); /* * Fill index pages with tuples in the sorted order. @@ -455,31 +443,15 @@ gist_indexsortbuild(GISTBuildState *state) levelstate = parent; } - gist_indexsortbuild_flush_ready_pages(state); - /* Write out the root */ PageSetLSN(levelstate->pages[0], GistBuildLSN); - PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - - pfree(levelstate->pages[0]); + rootbuf = smgr_bulk_get_buf(state->bulkstate); + memcpy(rootbuf, levelstate->pages[0], BLCKSZ); + smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true); + pfree(levelstate); - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. 
If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. - */ - if (RelationNeedsWAL(state->indexrel)) - smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); + smgr_bulk_finish(state->bulkstate); } /* @@ -509,8 +481,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, levelstate->current_page++; if (levelstate->pages[levelstate->current_page] == NULL) - levelstate->pages[levelstate->current_page] = - palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -573,6 +544,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, for (; dist != NULL; dist = dist->next) { char *data; + BulkWriteBuffer buf; Page target; /* check once per page */ @@ -580,7 +552,8 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); + buf = smgr_bulk_get_buf(state->bulkstate); + target = (Page) buf; gistinitpage(target, isleaf ? F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -593,20 +566,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } union_tuple = dist->itup; - if (state->ready_num_pages == XLR_MAX_BLOCK_ID) - gist_indexsortbuild_flush_ready_pages(state); - - /* - * The page is now complete. Assign a block number to it, and add it - * to the list of finished pages. (We don't write it out immediately, - * because we want to WAL-log the pages in batches.) - */ - blkno = state->pages_allocated++; - state->ready_blknos[state->ready_num_pages] = blkno; - state->ready_pages[state->ready_num_pages] = target; - state->ready_num_pages++; - ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); - /* * Set the right link to point to the previous page. This is just for * debugging purposes: GiST only follows the right link if a page is @@ -621,6 +580,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, */ if (levelstate->last_blkno) GistPageGetOpaque(target)->rightlink = levelstate->last_blkno; + + /* + * The page is now complete. Assign a block number to it, and pass it + * to the bulk writer. + */ + blkno = state->pages_allocated++; + PageSetLSN(target, GistBuildLSN); + smgr_bulk_write(state->bulkstate, blkno, buf, true); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); levelstate->last_blkno = blkno; /* @@ -631,7 +599,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + parent->pages[0] = palloc(BLCKSZ); parent->parent = NULL; gistinitpage(parent->pages[0], 0); @@ -641,39 +609,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } } -static void -gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) -{ - if (state->ready_num_pages == 0) - return; - - for (int i = 0; i < state->ready_num_pages; i++) - { - Page page = state->ready_pages[i]; - BlockNumber blkno = state->ready_blknos[i]; - - /* Currently, the blocks must be buffered in order. 
*/ - if (blkno != state->pages_written) - elog(ERROR, "unexpected block number to flush GiST sorting build"); - - PageSetLSN(page, GistBuildLSN); - PageSetChecksumInplace(page, blkno); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, - true); - - state->pages_written++; - } - - if (RelationNeedsWAL(state->indexrel)) - log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages, - state->ready_blknos, state->ready_pages, true); - - for (int i = 0; i < state->ready_num_pages; i++) - pfree(state->ready_pages[i]); - - state->ready_num_pages = 0; -} - /*------------------------------------------------------------------------- * Routines for non-sorted build diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 34107323ffe..a578b876174 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -87,8 +87,8 @@ * is optimized for bulk inserting a lot of tuples, knowing that we have * exclusive access to the heap. raw_heap_insert builds new pages in * local storage. When a page is full, or at the end of the process, - * we insert it to WAL as a single record and then write it to disk - * directly through smgr. Note, however, that any data sent to the new + * we insert it to WAL as a single record and then write it to disk with + * the bulk smgr writer. Note, however, that any data sent to the new * heap's TOAST table will go through the normal bufmgr. * * @@ -119,9 +119,9 @@ #include "replication/logical.h" #include "replication/slot.h" #include "storage/bufmgr.h" +#include "storage/bulk_write.h" #include "storage/fd.h" #include "storage/procarray.h" -#include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -133,9 +133,9 @@ typedef struct RewriteStateData { Relation rs_old_rel; /* source heap */ Relation rs_new_rel; /* destination heap */ - Page rs_buffer; /* page currently being built */ + BulkWriteState *rs_bulkstate; /* writer for the destination */ + BulkWriteBuffer rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ - bool rs_buffer_valid; /* T if any tuples in buffer */ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine * tuple visibility */ @@ -255,14 +255,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + state->rs_buffer = NULL; /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); - state->rs_buffer_valid = false; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; state->rs_cxt = rw_cxt; + state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM); /* Initialize hash tables used to track update chains */ hash_ctl.keysize = sizeof(TidHashKey); @@ -314,30 +314,13 @@ end_heap_rewrite(RewriteState state) } /* Write the last page, if any */ - if (state->rs_buffer_valid) + if (state->rs_buffer) { - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_locator, - MAIN_FORKNUM, - state->rs_blockno, - state->rs_buffer, - true); - - PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, state->rs_buffer, true); + 
smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true); + state->rs_buffer = NULL; } - /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The - * reason is the same as in storage.c's RelationCopyStorage(): we're - * writing data that's not in shared buffers, and so a CHECKPOINT - * occurring during the rewriteheap operation won't have fsync'd data we - * wrote before the checkpoint. - */ - if (RelationNeedsWAL(state->rs_new_rel)) - smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM); + smgr_bulk_finish(state->rs_bulkstate); logical_end_heap_rewrite(state); @@ -611,7 +594,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) static void raw_heap_insert(RewriteState state, HeapTuple tup) { - Page page = state->rs_buffer; + Page page; Size pageFreeSpace, saveFreeSpace; Size len; @@ -664,7 +647,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) HEAP_DEFAULT_FILLFACTOR); /* Now we can check to see if there's enough free space already. */ - if (state->rs_buffer_valid) + page = (Page) state->rs_buffer; + if (page) { pageFreeSpace = PageGetHeapFreeSpace(page); @@ -675,35 +659,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * contains a tuple. Hence, unlike RelationGetBufferForTuple(), * enforce saveFreeSpace unconditionally. */ - - /* XLOG stuff */ - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_locator, - MAIN_FORKNUM, - state->rs_blockno, - page, - true); - - /* - * Now write the page. We say skipFsync = true because there's no - * need for smgr to schedule an fsync for this write; we'll do it - * ourselves in end_heap_rewrite. - */ - PageSetChecksumInplace(page, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, page, true); - + smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true); + state->rs_buffer = NULL; + page = NULL; state->rs_blockno++; - state->rs_buffer_valid = false; } } - if (!state->rs_buffer_valid) + if (!page) { /* Initialize a new empty page */ + state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate); + page = (Page) state->rs_buffer; PageInit(page, BLCKSZ, 0); - state->rs_buffer_valid = true; } /* And now we can insert the tuple into the page */ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 696d79c0852..21d879a3bdf 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -29,11 +29,11 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "storage/bulk_write.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" -#include "storage/smgr.h" #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" @@ -154,32 +154,17 @@ void btbuildempty(Relation index) { bool allequalimage = _bt_allequalimage(index, false); - Buffer metabuf; - Page metapage; + BulkWriteState *bulkstate; + BulkWriteBuffer metabuf; - /* - * Initialize the metapage. - * - * Regular index build bypasses the buffer manager and uses smgr functions - * directly, with an smgrimmedsync() call at the end. That makes sense - * when the index is large, but for an empty index, it's better to use the - * buffer cache to avoid the smgrimmedsync(). 
- */ - metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); - Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); - _bt_lockbuf(index, metabuf, BT_WRITE); - - START_CRIT_SECTION(); - - metapage = BufferGetPage(metabuf); - _bt_initmetapage(metapage, P_NONE, 0, allequalimage); - MarkBufferDirty(metabuf); - log_newpage_buffer(metabuf, true); + bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM); - END_CRIT_SECTION(); + /* Construct metapage. */ + metabuf = smgr_bulk_get_buf(bulkstate); + _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage); + smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true); - _bt_unlockbuf(index, metabuf); - ReleaseBuffer(metabuf); + smgr_bulk_finish(bulkstate); } /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 20111965793..3f1e7b9c155 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -23,13 +23,8 @@ * many upper pages if the keys are reasonable-size) without risking a lot of * cascading splits during early insertions. * - * Formerly the index pages being built were kept in shared buffers, but - * that is of no value (since other backends have no interest in them yet) - * and it created locking problems for CHECKPOINT, because the upper-level - * pages were held exclusive-locked for long periods. Now we just build - * the pages in local memory and smgrwrite or smgrextend them as we finish - * them. They will need to be re-read into shared buffers on first use after - * the build finishes. + * We use the bulk smgr loading facility to bypass the buffer cache and + * WAL-log the pages efficiently. * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. @@ -57,7 +52,7 @@ #include "executor/instrument.h" #include "miscadmin.h" #include "pgstat.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" #include "tcop/tcopprot.h" /* pgrminclude ignore */ #include "utils/rel.h" #include "utils/sortsupport.h" @@ -234,7 +229,7 @@ typedef struct BTBuildState */ typedef struct BTPageState { - Page btps_page; /* workspace for page building */ + BulkWriteBuffer btps_buf; /* workspace for page building */ BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ @@ -251,11 +246,9 @@ typedef struct BTWriteState { Relation heap; Relation index; + BulkWriteState *bulkstate; BTScanInsert inskey; /* generic insertion scankey */ - bool btws_use_wal; /* dump pages to WAL? 
*/ BlockNumber btws_pages_alloced; /* # pages allocated */ - BlockNumber btws_pages_written; /* # pages written out */ - Page btws_zeropage; /* workspace for filling zeroes */ } BTWriteState; @@ -267,7 +260,7 @@ static void _bt_spool(BTSpool *btspool, ItemPointer self, static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); -static Page _bt_blnewpage(uint32 level); +static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); static void _bt_slideleft(Page rightmostpage); static void _bt_sortaddtup(Page page, Size itemsize, @@ -569,12 +562,9 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); - wstate.btws_use_wal = RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; - wstate.btws_pages_written = 0; - wstate.btws_zeropage = NULL; /* until needed */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); @@ -613,13 +603,15 @@ _bt_build_callback(Relation index, /* * allocate workspace for a new, clean btree page, not linked to any siblings. */ -static Page -_bt_blnewpage(uint32 level) +static BulkWriteBuffer +_bt_blnewpage(BTWriteState *wstate, uint32 level) { + BulkWriteBuffer buf; Page page; BTPageOpaque opaque; - page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + buf = smgr_bulk_get_buf(wstate->bulkstate); + page = (Page) buf; /* Zero the page and set up standard page header info */ _bt_pageinit(page, BLCKSZ); @@ -634,63 +626,17 @@ _bt_blnewpage(uint32 level) /* Make the P_HIKEY line pointer appear allocated */ ((PageHeader) page)->pd_lower += sizeof(ItemIdData); - return page; + return buf; } /* * emit a completed btree page, and release the working storage. */ static void -_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) +_bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno) { - /* XLOG stuff */ - if (wstate->btws_use_wal) - { - /* We use the XLOG_FPI record type for this */ - log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true); - } - - /* - * If we have to write pages nonsequentially, fill in the space with - * zeroes until we come back and overwrite. This is not logically - * necessary on standard Unix filesystems (unwritten space will read as - * zeroes anyway), but it should help to avoid fragmentation. The dummy - * pages aren't WAL-logged though. - */ - while (blkno > wstate->btws_pages_written) - { - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ, - PG_IO_ALIGN_SIZE, - MCXT_ALLOC_ZERO); - /* don't set checksum for all-zero page */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, - wstate->btws_pages_written++, - wstate->btws_zeropage, - true); - } - - PageSetChecksumInplace(page, blkno); - - /* - * Now write the page. There's no need for smgr to schedule an fsync for - * this write; we'll do it ourselves before ending the build. - */ - if (blkno == wstate->btws_pages_written) - { - /* extending the file... 
*/ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, - page, true); - wstate->btws_pages_written++; - } - else - { - /* overwriting a block we zero-filled before */ - smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, - page, true); - } - - pfree(page); + smgr_bulk_write(wstate->bulkstate, blkno, buf, true); + /* smgr_bulk_write took ownership of 'buf' */ } /* @@ -703,7 +649,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); /* create initial page for level */ - state->btps_page = _bt_blnewpage(level); + state->btps_buf = _bt_blnewpage(wstate, level); /* and assign it a page position */ state->btps_blkno = wstate->btws_pages_alloced++; @@ -839,6 +785,7 @@ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra) { + BulkWriteBuffer nbuf; Page npage; BlockNumber nblkno; OffsetNumber last_off; @@ -853,7 +800,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, */ CHECK_FOR_INTERRUPTS(); - npage = state->btps_page; + nbuf = state->btps_buf; + npage = (Page) nbuf; nblkno = state->btps_blkno; last_off = state->btps_lastoff; last_truncextra = state->btps_lastextra; @@ -909,6 +857,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, /* * Finish off the page and write it out. */ + BulkWriteBuffer obuf = nbuf; Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; @@ -916,7 +865,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, IndexTuple oitup; /* Create new page of same level */ - npage = _bt_blnewpage(state->btps_level); + nbuf = _bt_blnewpage(wstate, state->btps_level); + npage = (Page) nbuf; /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; @@ -1028,10 +978,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, } /* - * Write out the old page. We never need to touch it again, so we can - * free the opage workspace too. + * Write out the old page. _bt_blwritepage takes ownership of the + * 'opage' buffer. */ - _bt_blwritepage(wstate, opage, oblkno); + _bt_blwritepage(wstate, obuf, oblkno); /* * Reset last_off to point to new page @@ -1064,7 +1014,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, _bt_sortaddtup(npage, itupsz, itup, last_off, !isleaf && last_off == P_FIRSTKEY); - state->btps_page = npage; + state->btps_buf = nbuf; state->btps_blkno = nblkno; state->btps_lastoff = last_off; } @@ -1116,7 +1066,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) BTPageState *s; BlockNumber rootblkno = P_NONE; uint32 rootlevel = 0; - Page metapage; + BulkWriteBuffer metabuf; /* * Each iteration of this loop completes one more level of the tree. @@ -1127,7 +1077,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) BTPageOpaque opaque; blkno = s->btps_blkno; - opaque = BTPageGetOpaque(s->btps_page); + opaque = BTPageGetOpaque((Page) s->btps_buf); /* * We have to link the last page on this level to somewhere. @@ -1161,9 +1111,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * This is the rightmost page, so the ItemId array needs to be slid * back one slot. Then we can dump out the page. 
*/ - _bt_slideleft(s->btps_page); - _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); - s->btps_page = NULL; /* writepage freed the workspace */ + _bt_slideleft((Page) s->btps_buf); + _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno); + s->btps_buf = NULL; /* writepage took ownership of the buffer */ } /* @@ -1172,10 +1122,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * set to point to "P_NONE"). This changes the index to the "valid" state * by filling in a valid magic number in the metapage. */ - metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); - _bt_initmetapage(metapage, rootblkno, rootlevel, + metabuf = smgr_bulk_get_buf(wstate->bulkstate); + _bt_initmetapage((Page) metabuf, rootblkno, rootlevel, wstate->inskey->allequalimage); - _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); + _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE); } /* @@ -1197,6 +1147,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) int64 tuples_done = 0; bool deduplicate; + wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); + deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); @@ -1352,7 +1304,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) */ dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - sizeof(ItemIdData); - Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) && dstate->maxpostingsize <= INDEX_SIZE_MASK); dstate->htids = palloc(dstate->maxpostingsize); @@ -1422,18 +1374,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); - - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. 
- */ - if (wstate->btws_use_wal) - smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); + smgr_bulk_finish(wstate->bulkstate); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 98b1da20d58..1b70c5a59fd 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -25,7 +25,7 @@ #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -155,42 +155,27 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) void spgbuildempty(Relation index) { - Buffer metabuffer, - rootbuffer, - nullbuffer; - - /* - * Initialize the meta page and root pages - */ - metabuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); - LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE); - rootbuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); - LockBuffer(rootbuffer, BUFFER_LOCK_EXCLUSIVE); - nullbuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); - LockBuffer(nullbuffer, BUFFER_LOCK_EXCLUSIVE); - - Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO); - Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO); - Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO); + BulkWriteState *bulkstate; + BulkWriteBuffer buf; - START_CRIT_SECTION(); + bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM); - SpGistInitMetapage(BufferGetPage(metabuffer)); - MarkBufferDirty(metabuffer); - SpGistInitBuffer(rootbuffer, SPGIST_LEAF); - MarkBufferDirty(rootbuffer); - SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); - MarkBufferDirty(nullbuffer); + /* Construct metapage. */ + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitMetapage((Page) buf); + smgr_bulk_write(bulkstate, SPGIST_METAPAGE_BLKNO, buf, true); - log_newpage_buffer(metabuffer, true); - log_newpage_buffer(rootbuffer, true); - log_newpage_buffer(nullbuffer, true); + /* Likewise for the root page. */ + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitPage((Page) buf, SPGIST_LEAF); + smgr_bulk_write(bulkstate, SPGIST_ROOT_BLKNO, buf, true); - END_CRIT_SECTION(); + /* Likewise for the null-tuples root page. */ + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitPage((Page) buf, SPGIST_LEAF | SPGIST_NULLS); + smgr_bulk_write(bulkstate, SPGIST_NULL_BLKNO, buf, true); - UnlockReleaseBuffer(metabuffer); - UnlockReleaseBuffer(rootbuffer); - UnlockReleaseBuffer(nullbuffer); + smgr_bulk_finish(bulkstate); } /* |
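
rewriteheap.c and nbtsort.c above keep the page currently being built in a BulkWriteBuffer and hand it to the bulk writer only once it fills up, remembering that smgr_bulk_write() takes ownership of the buffer. A condensed sketch of that flow follows; the struct and function names (demo_fill_state, demo_append_item) are hypothetical stand-ins for RewriteStateData/raw_heap_insert, only the bulk_write.h calls are taken from the diff.

/*
 * Condensed sketch of the "fill a page, flush when full" flow used by
 * raw_heap_insert() and _bt_buildadd() above.  Names are illustrative.
 */
#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"

typedef struct demo_fill_state
{
	BulkWriteState *bulkstate;	/* writer for the destination fork */
	BulkWriteBuffer curbuf;		/* page currently being built, or NULL */
	BlockNumber curblkno;		/* block number the current page will get */
} demo_fill_state;

static void
demo_append_item(demo_fill_state *fs, Item item, Size itemsz)
{
	Page		page = fs->curbuf ? (Page) fs->curbuf : NULL;

	/* Flush the current page if the new item no longer fits on it. */
	if (page != NULL && PageGetFreeSpace(page) < MAXALIGN(itemsz))
	{
		/* smgr_bulk_write() takes ownership of the buffer ... */
		smgr_bulk_write(fs->bulkstate, fs->curblkno, fs->curbuf, true);
		/* ... so drop our reference and advance to the next block. */
		fs->curbuf = NULL;
		fs->curblkno++;
		page = NULL;
	}

	/* Start a new page if we don't have one. */
	if (page == NULL)
	{
		fs->curbuf = smgr_bulk_get_buf(fs->bulkstate);
		page = (Page) fs->curbuf;
		PageInit(page, BLCKSZ, 0);
	}

	if (PageAddItem(page, item, itemsz, InvalidOffsetNumber,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "could not add item to page");
}

At the end of the operation the caller still has to write out the last partially filled page and call smgr_bulk_finish(), as end_heap_rewrite() and _bt_load() do above.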