Diffstat (limited to 'src/backend')
-rw-r--r--  src/backend/access/gist/gistbuild.c    121
-rw-r--r--  src/backend/access/heap/rewriteheap.c   72
-rw-r--r--  src/backend/access/nbtree/nbtree.c       33
-rw-r--r--  src/backend/access/nbtree/nbtsort.c     135
-rw-r--r--  src/backend/access/spgist/spginsert.c    49
-rw-r--r--  src/backend/catalog/storage.c            46
-rw-r--r--  src/backend/storage/smgr/Makefile         1
-rw-r--r--  src/backend/storage/smgr/bulk_write.c   298
-rw-r--r--  src/backend/storage/smgr/md.c            45
-rw-r--r--  src/backend/storage/smgr/meson.build      1
-rw-r--r--  src/backend/storage/smgr/smgr.c          31
11 files changed, 501 insertions, 331 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 08555b97f92..465246173ba 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -43,7 +43,8 @@
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
-#include "storage/smgr.h"
+#include "storage/bulk_write.h"
+
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"
@@ -106,11 +107,8 @@ typedef struct
Tuplesortstate *sortstate; /* state data for tuplesort.c */
BlockNumber pages_allocated;
- BlockNumber pages_written;
- int ready_num_pages;
- BlockNumber ready_blknos[XLR_MAX_BLOCK_ID];
- Page ready_pages[XLR_MAX_BLOCK_ID];
+ BulkWriteState *bulkstate;
} GISTBuildState;
#define GIST_SORTED_BUILD_PAGE_NUM 4
@@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
IndexTuple itup);
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
GistSortedBuildLevelState *levelstate);
-static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
static void gistInitBuffering(GISTBuildState *buildstate);
static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
@@ -405,27 +402,18 @@ gist_indexsortbuild(GISTBuildState *state)
{
IndexTuple itup;
GistSortedBuildLevelState *levelstate;
- Page page;
+ BulkWriteBuffer rootbuf;
- state->pages_allocated = 0;
- state->pages_written = 0;
- state->ready_num_pages = 0;
+ /* Reserve block 0 for the root page */
+ state->pages_allocated = 1;
- /*
- * Write an empty page as a placeholder for the root page. It will be
- * replaced with the real root page at the end.
- */
- page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
- smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
- page, true);
- state->pages_allocated++;
- state->pages_written++;
+ state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);
/* Allocate a temporary buffer for the first leaf page batch. */
levelstate = palloc0(sizeof(GistSortedBuildLevelState));
- levelstate->pages[0] = page;
+ levelstate->pages[0] = palloc(BLCKSZ);
levelstate->parent = NULL;
- gistinitpage(page, F_LEAF);
+ gistinitpage(levelstate->pages[0], F_LEAF);
/*
* Fill index pages with tuples in the sorted order.
@@ -455,31 +443,15 @@ gist_indexsortbuild(GISTBuildState *state)
levelstate = parent;
}
- gist_indexsortbuild_flush_ready_pages(state);
-
/* Write out the root */
PageSetLSN(levelstate->pages[0], GistBuildLSN);
- PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
- smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
- levelstate->pages[0], true);
- if (RelationNeedsWAL(state->indexrel))
- log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO,
- levelstate->pages[0], true);
-
- pfree(levelstate->pages[0]);
+ rootbuf = smgr_bulk_get_buf(state->bulkstate);
+ memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
+ smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);
+
pfree(levelstate);
- /*
- * When we WAL-logged index pages, we must nonetheless fsync index files.
- * Since we're building outside shared buffers, a CHECKPOINT occurring
- * during the build has no way to flush the previously written data to
- * disk (indeed it won't know the index even exists). A crash later on
- * would replay WAL from the checkpoint, therefore it wouldn't replay our
- * earlier WAL entries. If we do not fsync those pages here, they might
- * still not be on disk when the crash occurs.
- */
- if (RelationNeedsWAL(state->indexrel))
- smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM);
+ smgr_bulk_finish(state->bulkstate);
}
/*
@@ -509,8 +481,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
levelstate->current_page++;
if (levelstate->pages[levelstate->current_page] == NULL)
- levelstate->pages[levelstate->current_page] =
- palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);
newPage = levelstate->pages[levelstate->current_page];
gistinitpage(newPage, old_page_flags);
@@ -573,6 +544,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
for (; dist != NULL; dist = dist->next)
{
char *data;
+ BulkWriteBuffer buf;
Page target;
/* check once per page */
@@ -580,7 +552,8 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
/* Create page and copy data */
data = (char *) (dist->list);
- target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
+ buf = smgr_bulk_get_buf(state->bulkstate);
+ target = (Page) buf;
gistinitpage(target, isleaf ? F_LEAF : 0);
for (int i = 0; i < dist->block.num; i++)
{
@@ -593,20 +566,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
}
union_tuple = dist->itup;
- if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
- gist_indexsortbuild_flush_ready_pages(state);
-
- /*
- * The page is now complete. Assign a block number to it, and add it
- * to the list of finished pages. (We don't write it out immediately,
- * because we want to WAL-log the pages in batches.)
- */
- blkno = state->pages_allocated++;
- state->ready_blknos[state->ready_num_pages] = blkno;
- state->ready_pages[state->ready_num_pages] = target;
- state->ready_num_pages++;
- ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
-
/*
* Set the right link to point to the previous page. This is just for
* debugging purposes: GiST only follows the right link if a page is
@@ -621,6 +580,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
*/
if (levelstate->last_blkno)
GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
+
+ /*
+ * The page is now complete. Assign a block number to it, and pass it
+ * to the bulk writer.
+ */
+ blkno = state->pages_allocated++;
+ PageSetLSN(target, GistBuildLSN);
+ smgr_bulk_write(state->bulkstate, blkno, buf, true);
+ ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
levelstate->last_blkno = blkno;
/*
@@ -631,7 +599,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
if (parent == NULL)
{
parent = palloc0(sizeof(GistSortedBuildLevelState));
- parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ parent->pages[0] = palloc(BLCKSZ);
parent->parent = NULL;
gistinitpage(parent->pages[0], 0);
@@ -641,39 +609,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
}
}
-static void
-gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
-{
- if (state->ready_num_pages == 0)
- return;
-
- for (int i = 0; i < state->ready_num_pages; i++)
- {
- Page page = state->ready_pages[i];
- BlockNumber blkno = state->ready_blknos[i];
-
- /* Currently, the blocks must be buffered in order. */
- if (blkno != state->pages_written)
- elog(ERROR, "unexpected block number to flush GiST sorting build");
-
- PageSetLSN(page, GistBuildLSN);
- PageSetChecksumInplace(page, blkno);
- smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
- true);
-
- state->pages_written++;
- }
-
- if (RelationNeedsWAL(state->indexrel))
- log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages,
- state->ready_blknos, state->ready_pages, true);
-
- for (int i = 0; i < state->ready_num_pages; i++)
- pfree(state->ready_pages[i]);
-
- state->ready_num_pages = 0;
-}
-
/*-------------------------------------------------------------------------
* Routines for non-sorted build
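
[Editor's note] The converted GiST sorted build above reserves block 0 (GIST_ROOT_BLKNO) up front and writes the real root last. Below is a minimal sketch of that pattern with the new API; it is illustrative only and not part of the patch, with the page-filling steps elided as comments. Within one bulk operation the writer accepts block numbers in any order and zero-fills gaps, so writing the reserved block at the end is fine.

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
sorted_build_sketch(Relation indexrel, BlockNumber ndatapages)
{
    BulkWriteState *bulkstate = smgr_bulk_start_rel(indexrel, MAIN_FORKNUM);
    BulkWriteBuffer buf;

    /* blocks 1..ndatapages: data pages, emitted in order */
    for (BlockNumber blkno = 1; blkno <= ndatapages; blkno++)
    {
        buf = smgr_bulk_get_buf(bulkstate);
        PageInit((Page) buf, BLCKSZ, 0);
        /* ... fill the page with index tuples ... */
        smgr_bulk_write(bulkstate, blkno, buf, true);
    }

    /* block 0 was reserved for the root; write it last */
    buf = smgr_bulk_get_buf(bulkstate);
    PageInit((Page) buf, BLCKSZ, 0);
    /* ... build the root page now that the child block numbers are known ... */
    smgr_bulk_write(bulkstate, 0, buf, true);

    /* WAL-logs and writes out anything still pending, then handles fsync */
    smgr_bulk_finish(bulkstate);
}
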
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 34107323ffe..a578b876174 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -87,8 +87,8 @@
* is optimized for bulk inserting a lot of tuples, knowing that we have
* exclusive access to the heap. raw_heap_insert builds new pages in
* local storage. When a page is full, or at the end of the process,
- * we insert it to WAL as a single record and then write it to disk
- * directly through smgr. Note, however, that any data sent to the new
+ * we insert it to WAL as a single record and then write it to disk with
+ * the bulk smgr writer. Note, however, that any data sent to the new
* heap's TOAST table will go through the normal bufmgr.
*
*
@@ -119,9 +119,9 @@
#include "replication/logical.h"
#include "replication/slot.h"
#include "storage/bufmgr.h"
+#include "storage/bulk_write.h"
#include "storage/fd.h"
#include "storage/procarray.h"
-#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -133,9 +133,9 @@ typedef struct RewriteStateData
{
Relation rs_old_rel; /* source heap */
Relation rs_new_rel; /* destination heap */
- Page rs_buffer; /* page currently being built */
+ BulkWriteState *rs_bulkstate; /* writer for the destination */
+ BulkWriteBuffer rs_buffer; /* page currently being built */
BlockNumber rs_blockno; /* block where page will go */
- bool rs_buffer_valid; /* T if any tuples in buffer */
bool rs_logical_rewrite; /* do we need to do logical rewriting */
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine
* tuple visibility */
@@ -255,14 +255,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
state->rs_old_rel = old_heap;
state->rs_new_rel = new_heap;
- state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ state->rs_buffer = NULL;
/* new_heap needn't be empty, just locked */
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
- state->rs_buffer_valid = false;
state->rs_oldest_xmin = oldest_xmin;
state->rs_freeze_xid = freeze_xid;
state->rs_cutoff_multi = cutoff_multi;
state->rs_cxt = rw_cxt;
+ state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM);
/* Initialize hash tables used to track update chains */
hash_ctl.keysize = sizeof(TidHashKey);
@@ -314,30 +314,13 @@ end_heap_rewrite(RewriteState state)
}
/* Write the last page, if any */
- if (state->rs_buffer_valid)
+ if (state->rs_buffer)
{
- if (RelationNeedsWAL(state->rs_new_rel))
- log_newpage(&state->rs_new_rel->rd_locator,
- MAIN_FORKNUM,
- state->rs_blockno,
- state->rs_buffer,
- true);
-
- PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
-
- smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
- state->rs_blockno, state->rs_buffer, true);
+ smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+ state->rs_buffer = NULL;
}
- /*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
- * reason is the same as in storage.c's RelationCopyStorage(): we're
- * writing data that's not in shared buffers, and so a CHECKPOINT
- * occurring during the rewriteheap operation won't have fsync'd data we
- * wrote before the checkpoint.
- */
- if (RelationNeedsWAL(state->rs_new_rel))
- smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+ smgr_bulk_finish(state->rs_bulkstate);
logical_end_heap_rewrite(state);
@@ -611,7 +594,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
static void
raw_heap_insert(RewriteState state, HeapTuple tup)
{
- Page page = state->rs_buffer;
+ Page page;
Size pageFreeSpace,
saveFreeSpace;
Size len;
@@ -664,7 +647,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
HEAP_DEFAULT_FILLFACTOR);
/* Now we can check to see if there's enough free space already. */
- if (state->rs_buffer_valid)
+ page = (Page) state->rs_buffer;
+ if (page)
{
pageFreeSpace = PageGetHeapFreeSpace(page);
@@ -675,35 +659,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
* contains a tuple. Hence, unlike RelationGetBufferForTuple(),
* enforce saveFreeSpace unconditionally.
*/
-
- /* XLOG stuff */
- if (RelationNeedsWAL(state->rs_new_rel))
- log_newpage(&state->rs_new_rel->rd_locator,
- MAIN_FORKNUM,
- state->rs_blockno,
- page,
- true);
-
- /*
- * Now write the page. We say skipFsync = true because there's no
- * need for smgr to schedule an fsync for this write; we'll do it
- * ourselves in end_heap_rewrite.
- */
- PageSetChecksumInplace(page, state->rs_blockno);
-
- smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
- state->rs_blockno, page, true);
-
+ smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true);
+ state->rs_buffer = NULL;
+ page = NULL;
state->rs_blockno++;
- state->rs_buffer_valid = false;
}
}
- if (!state->rs_buffer_valid)
+ if (!page)
{
/* Initialize a new empty page */
+ state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate);
+ page = (Page) state->rs_buffer;
PageInit(page, BLCKSZ, 0);
- state->rs_buffer_valid = true;
}
/* And now we can insert the tuple into the page */
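
[Editor's note] The rewritten raw_heap_insert() above follows a lazy-buffer pattern: a page is only obtained from the bulk writer when a tuple needs to go somewhere, and the pointer is reset to NULL once the full page has been handed off, because smgr_bulk_write() takes ownership of the buffer. A condensed, illustrative sketch of that shape (not part of the patch; the actual tuple placement is elided):

#include "postgres.h"
#include "access/htup.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"

typedef struct HeapWriteSketch
{
    BulkWriteState *bulkstate;
    BulkWriteBuffer cur_buf;    /* page being built, or NULL */
    BlockNumber cur_blkno;      /* block the current page will go to */
} HeapWriteSketch;

static void
insert_tuple_sketch(HeapWriteSketch *w, HeapTuple tup)
{
    Size        len = MAXALIGN(tup->t_len);

    /* ship the current page if the tuple doesn't fit on it */
    if (w->cur_buf != NULL &&
        PageGetHeapFreeSpace((Page) w->cur_buf) < len)
    {
        smgr_bulk_write(w->bulkstate, w->cur_blkno++, w->cur_buf, true);
        w->cur_buf = NULL;      /* the bulk writer now owns that buffer */
    }

    /* start a new page if we don't have one */
    if (w->cur_buf == NULL)
    {
        w->cur_buf = smgr_bulk_get_buf(w->bulkstate);
        PageInit((Page) w->cur_buf, BLCKSZ, 0);
    }

    /* ... PageAddItem() the tuple onto (Page) w->cur_buf ... */
}

At the end of the operation the last partial page, if any, is written the same way and smgr_bulk_finish() is called, as end_heap_rewrite() does above.
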
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 696d79c0852..21d879a3bdf 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -29,11 +29,11 @@
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
+#include "storage/bulk_write.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
-#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"
@@ -154,32 +154,17 @@ void
btbuildempty(Relation index)
{
bool allequalimage = _bt_allequalimage(index, false);
- Buffer metabuf;
- Page metapage;
+ BulkWriteState *bulkstate;
+ BulkWriteBuffer metabuf;
- /*
- * Initialize the metapage.
- *
- * Regular index build bypasses the buffer manager and uses smgr functions
- * directly, with an smgrimmedsync() call at the end. That makes sense
- * when the index is large, but for an empty index, it's better to use the
- * buffer cache to avoid the smgrimmedsync().
- */
- metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE);
- _bt_lockbuf(index, metabuf, BT_WRITE);
-
- START_CRIT_SECTION();
-
- metapage = BufferGetPage(metabuf);
- _bt_initmetapage(metapage, P_NONE, 0, allequalimage);
- MarkBufferDirty(metabuf);
- log_newpage_buffer(metabuf, true);
+ bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
- END_CRIT_SECTION();
+ /* Construct metapage. */
+ metabuf = smgr_bulk_get_buf(bulkstate);
+ _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
+ smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
- _bt_unlockbuf(index, metabuf);
- ReleaseBuffer(metabuf);
+ smgr_bulk_finish(bulkstate);
}
/*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 20111965793..3f1e7b9c155 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -23,13 +23,8 @@
* many upper pages if the keys are reasonable-size) without risking a lot of
* cascading splits during early insertions.
*
- * Formerly the index pages being built were kept in shared buffers, but
- * that is of no value (since other backends have no interest in them yet)
- * and it created locking problems for CHECKPOINT, because the upper-level
- * pages were held exclusive-locked for long periods. Now we just build
- * the pages in local memory and smgrwrite or smgrextend them as we finish
- * them. They will need to be re-read into shared buffers on first use after
- * the build finishes.
+ * We use the bulk smgr loading facility to bypass the buffer cache and
+ * WAL-log the pages efficiently.
*
* This code isn't concerned about the FSM at all. The caller is responsible
* for initializing that.
@@ -57,7 +52,7 @@
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
-#include "storage/smgr.h"
+#include "storage/bulk_write.h"
#include "tcop/tcopprot.h" /* pgrminclude ignore */
#include "utils/rel.h"
#include "utils/sortsupport.h"
@@ -234,7 +229,7 @@ typedef struct BTBuildState
*/
typedef struct BTPageState
{
- Page btps_page; /* workspace for page building */
+ BulkWriteBuffer btps_buf; /* workspace for page building */
BlockNumber btps_blkno; /* block # to write this page at */
IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
OffsetNumber btps_lastoff; /* last item offset loaded */
@@ -251,11 +246,9 @@ typedef struct BTWriteState
{
Relation heap;
Relation index;
+ BulkWriteState *bulkstate;
BTScanInsert inskey; /* generic insertion scankey */
- bool btws_use_wal; /* dump pages to WAL? */
BlockNumber btws_pages_alloced; /* # pages allocated */
- BlockNumber btws_pages_written; /* # pages written out */
- Page btws_zeropage; /* workspace for filling zeroes */
} BTWriteState;
@@ -267,7 +260,7 @@ static void _bt_spool(BTSpool *btspool, ItemPointer self,
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state);
-static Page _bt_blnewpage(uint32 level);
+static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page rightmostpage);
static void _bt_sortaddtup(Page page, Size itemsize,
@@ -569,12 +562,9 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
wstate.inskey = _bt_mkscankey(wstate.index, NULL);
/* _bt_mkscankey() won't set allequalimage without metapage */
wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
- wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
- wstate.btws_pages_written = 0;
- wstate.btws_zeropage = NULL; /* until needed */
pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
PROGRESS_BTREE_PHASE_LEAF_LOAD);
@@ -613,13 +603,15 @@ _bt_build_callback(Relation index,
/*
* allocate workspace for a new, clean btree page, not linked to any siblings.
*/
-static Page
-_bt_blnewpage(uint32 level)
+static BulkWriteBuffer
+_bt_blnewpage(BTWriteState *wstate, uint32 level)
{
+ BulkWriteBuffer buf;
Page page;
BTPageOpaque opaque;
- page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+ buf = smgr_bulk_get_buf(wstate->bulkstate);
+ page = (Page) buf;
/* Zero the page and set up standard page header info */
_bt_pageinit(page, BLCKSZ);
@@ -634,63 +626,17 @@ _bt_blnewpage(uint32 level)
/* Make the P_HIKEY line pointer appear allocated */
((PageHeader) page)->pd_lower += sizeof(ItemIdData);
- return page;
+ return buf;
}
/*
* emit a completed btree page, and release the working storage.
*/
static void
-_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
+_bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
{
- /* XLOG stuff */
- if (wstate->btws_use_wal)
- {
- /* We use the XLOG_FPI record type for this */
- log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true);
- }
-
- /*
- * If we have to write pages nonsequentially, fill in the space with
- * zeroes until we come back and overwrite. This is not logically
- * necessary on standard Unix filesystems (unwritten space will read as
- * zeroes anyway), but it should help to avoid fragmentation. The dummy
- * pages aren't WAL-logged though.
- */
- while (blkno > wstate->btws_pages_written)
- {
- if (!wstate->btws_zeropage)
- wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
- PG_IO_ALIGN_SIZE,
- MCXT_ALLOC_ZERO);
- /* don't set checksum for all-zero page */
- smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
- wstate->btws_pages_written++,
- wstate->btws_zeropage,
- true);
- }
-
- PageSetChecksumInplace(page, blkno);
-
- /*
- * Now write the page. There's no need for smgr to schedule an fsync for
- * this write; we'll do it ourselves before ending the build.
- */
- if (blkno == wstate->btws_pages_written)
- {
- /* extending the file... */
- smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
- page, true);
- wstate->btws_pages_written++;
- }
- else
- {
- /* overwriting a block we zero-filled before */
- smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
- page, true);
- }
-
- pfree(page);
+ smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
+ /* smgr_bulk_write took ownership of 'buf' */
}
/*
@@ -703,7 +649,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level)
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
/* create initial page for level */
- state->btps_page = _bt_blnewpage(level);
+ state->btps_buf = _bt_blnewpage(wstate, level);
/* and assign it a page position */
state->btps_blkno = wstate->btws_pages_alloced++;
@@ -839,6 +785,7 @@ static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Size truncextra)
{
+ BulkWriteBuffer nbuf;
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
@@ -853,7 +800,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
*/
CHECK_FOR_INTERRUPTS();
- npage = state->btps_page;
+ nbuf = state->btps_buf;
+ npage = (Page) nbuf;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
last_truncextra = state->btps_lastextra;
@@ -909,6 +857,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
/*
* Finish off the page and write it out.
*/
+ BulkWriteBuffer obuf = nbuf;
Page opage = npage;
BlockNumber oblkno = nblkno;
ItemId ii;
@@ -916,7 +865,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
IndexTuple oitup;
/* Create new page of same level */
- npage = _bt_blnewpage(state->btps_level);
+ nbuf = _bt_blnewpage(wstate, state->btps_level);
+ npage = (Page) nbuf;
/* and assign it a page position */
nblkno = wstate->btws_pages_alloced++;
@@ -1028,10 +978,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
}
/*
- * Write out the old page. We never need to touch it again, so we can
- * free the opage workspace too.
+ * Write out the old page. _bt_blwritepage takes ownership of the
+ * 'opage' buffer.
*/
- _bt_blwritepage(wstate, opage, oblkno);
+ _bt_blwritepage(wstate, obuf, oblkno);
/*
* Reset last_off to point to new page
@@ -1064,7 +1014,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
_bt_sortaddtup(npage, itupsz, itup, last_off,
!isleaf && last_off == P_FIRSTKEY);
- state->btps_page = npage;
+ state->btps_buf = nbuf;
state->btps_blkno = nblkno;
state->btps_lastoff = last_off;
}
@@ -1116,7 +1066,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
BTPageState *s;
BlockNumber rootblkno = P_NONE;
uint32 rootlevel = 0;
- Page metapage;
+ BulkWriteBuffer metabuf;
/*
* Each iteration of this loop completes one more level of the tree.
@@ -1127,7 +1077,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
BTPageOpaque opaque;
blkno = s->btps_blkno;
- opaque = BTPageGetOpaque(s->btps_page);
+ opaque = BTPageGetOpaque((Page) s->btps_buf);
/*
* We have to link the last page on this level to somewhere.
@@ -1161,9 +1111,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* This is the rightmost page, so the ItemId array needs to be slid
* back one slot. Then we can dump out the page.
*/
- _bt_slideleft(s->btps_page);
- _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
- s->btps_page = NULL; /* writepage freed the workspace */
+ _bt_slideleft((Page) s->btps_buf);
+ _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
+ s->btps_buf = NULL; /* writepage took ownership of the buffer */
}
/*
@@ -1172,10 +1122,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
- metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
- _bt_initmetapage(metapage, rootblkno, rootlevel,
+ metabuf = smgr_bulk_get_buf(wstate->bulkstate);
+ _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
wstate->inskey->allequalimage);
- _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
+ _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
}
/*
@@ -1197,6 +1147,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
int64 tuples_done = 0;
bool deduplicate;
+ wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
+
deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
BTGetDeduplicateItems(wstate->index);
@@ -1352,7 +1304,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
*/
dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
sizeof(ItemIdData);
- Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
+ Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) &&
dstate->maxpostingsize <= INDEX_SIZE_MASK);
dstate->htids = palloc(dstate->maxpostingsize);
@@ -1422,18 +1374,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
/* Close down final pages and write the metapage */
_bt_uppershutdown(wstate, state);
-
- /*
- * When we WAL-logged index pages, we must nonetheless fsync index files.
- * Since we're building outside shared buffers, a CHECKPOINT occurring
- * during the build has no way to flush the previously written data to
- * disk (indeed it won't know the index even exists). A crash later on
- * would replay WAL from the checkpoint, therefore it wouldn't replay our
- * earlier WAL entries. If we do not fsync those pages here, they might
- * still not be on disk when the crash occurs.
- */
- if (wstate->btws_use_wal)
- smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM);
+ smgr_bulk_finish(wstate->bulkstate);
}
/*
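
[Editor's note] Throughout the nbtsort.c changes above, a BulkWriteBuffer doubles as the Page workspace (it is just an I/O-aligned BLCKSZ allocation), and once it has been passed to _bt_blwritepage()/smgr_bulk_write() the caller must forget about it: the bulk writer WAL-logs it, writes it out, and pfree()s it later. A small illustrative sketch of that ownership rule (not part of the patch):

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"

static BlockNumber
emit_one_page_sketch(BulkWriteState *bulkstate, BlockNumber next_blkno)
{
    BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);
    Page        page = (Page) buf;  /* same memory, viewed as a page */

    PageInit(page, BLCKSZ, 0);
    /* ... add index tuples to 'page' ... */

    smgr_bulk_write(bulkstate, next_blkno, buf, true);
    /* neither 'buf' nor 'page' may be touched or pfree'd after this point */

    return next_blkno + 1;
}
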
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index 98b1da20d58..1b70c5a59fd 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -25,7 +25,7 @@
#include "catalog/index.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
-#include "storage/smgr.h"
+#include "storage/bulk_write.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -155,42 +155,27 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
void
spgbuildempty(Relation index)
{
- Buffer metabuffer,
- rootbuffer,
- nullbuffer;
-
- /*
- * Initialize the meta page and root pages
- */
- metabuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
- rootbuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- LockBuffer(rootbuffer, BUFFER_LOCK_EXCLUSIVE);
- nullbuffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
- LockBuffer(nullbuffer, BUFFER_LOCK_EXCLUSIVE);
-
- Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
- Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO);
- Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO);
+ BulkWriteState *bulkstate;
+ BulkWriteBuffer buf;
- START_CRIT_SECTION();
+ bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
- SpGistInitMetapage(BufferGetPage(metabuffer));
- MarkBufferDirty(metabuffer);
- SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
- MarkBufferDirty(rootbuffer);
- SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS);
- MarkBufferDirty(nullbuffer);
+ /* Construct metapage. */
+ buf = smgr_bulk_get_buf(bulkstate);
+ SpGistInitMetapage((Page) buf);
+ smgr_bulk_write(bulkstate, SPGIST_METAPAGE_BLKNO, buf, true);
- log_newpage_buffer(metabuffer, true);
- log_newpage_buffer(rootbuffer, true);
- log_newpage_buffer(nullbuffer, true);
+ /* Likewise for the root page. */
+ buf = smgr_bulk_get_buf(bulkstate);
+ SpGistInitPage((Page) buf, SPGIST_LEAF);
+ smgr_bulk_write(bulkstate, SPGIST_ROOT_BLKNO, buf, true);
- END_CRIT_SECTION();
+ /* Likewise for the null-tuples root page. */
+ buf = smgr_bulk_get_buf(bulkstate);
+ SpGistInitPage((Page) buf, SPGIST_LEAF | SPGIST_NULLS);
+ smgr_bulk_write(bulkstate, SPGIST_NULL_BLKNO, buf, true);
- UnlockReleaseBuffer(metabuffer);
- UnlockReleaseBuffer(rootbuffer);
- UnlockReleaseBuffer(nullbuffer);
+ smgr_bulk_finish(bulkstate);
}
/*
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index b155c03386e..a17cf4bb0cc 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -28,6 +28,7 @@
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "miscadmin.h"
+#include "storage/bulk_write.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
@@ -451,14 +452,11 @@ void
RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
ForkNumber forkNum, char relpersistence)
{
- PGIOAlignedBlock buf;
- Page page;
bool use_wal;
bool copying_initfork;
BlockNumber nblocks;
BlockNumber blkno;
-
- page = (Page) buf.data;
+ BulkWriteState *bulkstate;
/*
* The init fork for an unlogged relation in many respects has to be
@@ -477,16 +475,21 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
use_wal = XLogIsNeeded() &&
(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
+ bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal);
+
nblocks = smgrnblocks(src, forkNum);
for (blkno = 0; blkno < nblocks; blkno++)
{
+ BulkWriteBuffer buf;
+
/* If we got a cancel signal during the copy of the data, quit */
CHECK_FOR_INTERRUPTS();
- smgrread(src, forkNum, blkno, buf.data);
+ buf = smgr_bulk_get_buf(bulkstate);
+ smgrread(src, forkNum, blkno, (Page) buf);
- if (!PageIsVerifiedExtended(page, blkno,
+ if (!PageIsVerifiedExtended((Page) buf, blkno,
PIV_LOG_WARNING | PIV_REPORT_STAT))
{
/*
@@ -507,34 +510,13 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
}
/*
- * WAL-log the copied page. Unfortunately we don't know what kind of a
- * page this is, so we have to log the full page including any unused
- * space.
- */
- if (use_wal)
- log_newpage(&dst->smgr_rlocator.locator, forkNum, blkno, page, false);
-
- PageSetChecksumInplace(page, blkno);
-
- /*
- * Now write the page. We say skipFsync = true because there's no
- * need for smgr to schedule an fsync for this write; we'll do it
- * ourselves below.
+ * Queue the page for WAL-logging and writing out. Unfortunately we
+ * don't know what kind of a page this is, so we have to log the full
+ * page including any unused space.
*/
- smgrextend(dst, forkNum, blkno, buf.data, true);
+ smgr_bulk_write(bulkstate, blkno, buf, false);
}
-
- /*
- * When we WAL-logged rel pages, we must nonetheless fsync them. The
- * reason is that since we're copying outside shared buffers, a CHECKPOINT
- * occurring during the copy has no way to flush the previously written
- * data to disk (indeed it won't know the new rel even exists). A crash
- * later on would replay WAL from the checkpoint, therefore it wouldn't
- * replay our earlier WAL entries. If we do not fsync those pages here,
- * they might still not be on disk when the crash occurs.
- */
- if (use_wal || copying_initfork)
- smgrimmedsync(dst, forkNum);
+ smgr_bulk_finish(bulkstate);
}
/*
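
[Editor's note] RelationCopyStorage() works at the smgr level, where no relcache entry is available, so it uses smgr_bulk_start_smgr() and decides WAL-logging itself. A condensed, illustrative sketch of that calling pattern (not part of the patch; the page verification done above is elided):

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bulk_write.h"
#include "storage/smgr.h"

static void
copy_fork_sketch(SMgrRelation src, SMgrRelation dst, ForkNumber forknum,
                 bool use_wal)
{
    BulkWriteState *bulkstate = smgr_bulk_start_smgr(dst, forknum, use_wal);
    BlockNumber nblocks = smgrnblocks(src, forknum);

    for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
    {
        BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

        CHECK_FOR_INTERRUPTS();

        smgrread(src, forknum, blkno, (Page) buf);

        /* page_std = false: the page layout of 'src' is unknown here */
        smgr_bulk_write(bulkstate, blkno, buf, false);
    }

    smgr_bulk_finish(bulkstate);
}
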
diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile
index 596b564656f..1d0b98764f9 100644
--- a/src/backend/storage/smgr/Makefile
+++ b/src/backend/storage/smgr/Makefile
@@ -13,6 +13,7 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = \
+ bulk_write.o \
md.o \
smgr.o
diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
new file mode 100644
index 00000000000..04a76ec5e36
--- /dev/null
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -0,0 +1,298 @@
+/*-------------------------------------------------------------------------
+ *
+ * bulk_write.c
+ * Efficiently and reliably populate a new relation
+ *
+ * The assumption is that no other backends access the relation while we are
+ * loading it, so we can take some shortcuts. Do not mix operations through
+ * the regular buffer manager and the bulk loading interface!
+ *
+ * We bypass the buffer manager to avoid the locking overhead, and call
+ * smgrextend() directly. A downside is that the pages will need to be
+ * re-read into shared buffers on first use after the build finishes. That's
+ * usually a good tradeoff for large relations, and for small relations, the
+ * overhead isn't very significant compared to creating the relation in the
+ * first place.
+ *
+ * The pages are WAL-logged if needed. To save on WAL header overhead, we
+ * WAL-log several pages in one record.
+ *
+ * One tricky point is that because we bypass the buffer manager, we need to
+ * register the relation for fsyncing at the next checkpoint ourselves, and
+ * make sure that the relation is correctly fsync'd by us or the checkpointer
+ * even if a checkpoint happens concurrently.
+ *
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/smgr/bulk_write.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xloginsert.h"
+#include "access/xlogrecord.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/bulk_write.h"
+#include "storage/proc.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+#define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID
+
+static const PGIOAlignedBlock zero_buffer = {{0}}; /* worth BLCKSZ */
+
+typedef struct PendingWrite
+{
+ BulkWriteBuffer buf;
+ BlockNumber blkno;
+ bool page_std;
+} PendingWrite;
+
+/*
+ * Bulk writer state for one relation fork.
+ */
+typedef struct BulkWriteState
+{
+ /* Information about the target relation we're writing */
+ SMgrRelation smgr;
+ ForkNumber forknum;
+ bool use_wal;
+
+ /* We keep several writes queued, and WAL-log them in batches */
+ int npending;
+ PendingWrite pending_writes[MAX_PENDING_WRITES];
+
+ /* Current size of the relation */
+ BlockNumber pages_written;
+
+ /* The RedoRecPtr at the time that the bulk operation started */
+ XLogRecPtr start_RedoRecPtr;
+
+ MemoryContext memcxt;
+} BulkWriteState;
+
+static void smgr_bulk_flush(BulkWriteState *bulkstate);
+
+/*
+ * Start a bulk write operation on a relation fork.
+ */
+BulkWriteState *
+smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
+{
+ return smgr_bulk_start_smgr(RelationGetSmgr(rel),
+ forknum,
+ RelationNeedsWAL(rel) || forknum == INIT_FORKNUM);
+}
+
+/*
+ * Start a bulk write operation on a relation fork.
+ *
+ * This is like smgr_bulk_start_rel, but can be used without a relcache entry.
+ */
+BulkWriteState *
+smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
+{
+ BulkWriteState *state;
+
+ state = palloc(sizeof(BulkWriteState));
+ state->smgr = smgr;
+ state->forknum = forknum;
+ state->use_wal = use_wal;
+
+ state->npending = 0;
+ state->pages_written = 0;
+
+ state->start_RedoRecPtr = GetRedoRecPtr();
+
+ /*
+ * Remember the memory context. We will use it to allocate all the
+ * buffers later.
+ */
+ state->memcxt = CurrentMemoryContext;
+
+ return state;
+}
+
+/*
+ * Finish bulk write operation.
+ *
+ * This WAL-logs and flushes any remaining pending writes to disk, and fsyncs
+ * the relation if needed.
+ */
+void
+smgr_bulk_finish(BulkWriteState *bulkstate)
+{
+ /* WAL-log and flush any remaining pages */
+ smgr_bulk_flush(bulkstate);
+
+ /*
+ * When we wrote out the pages, we passed skipFsync=true to avoid the
+ * overhead of registering all the writes with the checkpointer. Register
+ * the whole relation now.
+ *
+ * There is one hole in that idea: If a checkpoint occurred while we were
+ * writing the pages, it already missed fsyncing the pages we had written
+ * before the checkpoint started. A crash later on would replay the WAL
+ * starting from the checkpoint, therefore it wouldn't replay our earlier
+ * WAL records. So if a checkpoint started after the bulk write, fsync
+ * the files now.
+ */
+ if (!SmgrIsTemp(bulkstate->smgr))
+ {
+ /*
+ * Prevent a checkpoint from starting between the GetRedoRecPtr() and
+ * smgrregistersync() calls.
+ */
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+
+ if (bulkstate->start_RedoRecPtr != GetRedoRecPtr())
+ {
+ /*
+ * A checkpoint occurred and it didn't know about our writes, so
+ * fsync() the relation ourselves.
+ */
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ smgrimmedsync(bulkstate->smgr, bulkstate->forknum);
+ elog(DEBUG1, "flushed relation because a checkpoint occurred concurrently");
+ }
+ else
+ {
+ smgrregistersync(bulkstate->smgr, bulkstate->forknum);
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ }
+ }
+}
+
+static int
+buffer_cmp(const void *a, const void *b)
+{
+ const PendingWrite *bufa = (const PendingWrite *) a;
+ const PendingWrite *bufb = (const PendingWrite *) b;
+
+ /* We should not see duplicated writes for the same block */
+ Assert(bufa->blkno != bufb->blkno);
+ if (bufa->blkno > bufb->blkno)
+ return 1;
+ else
+ return -1;
+}
+
+/*
+ * Finish all the pending writes.
+ */
+static void
+smgr_bulk_flush(BulkWriteState *bulkstate)
+{
+ int npending = bulkstate->npending;
+ PendingWrite *pending_writes = bulkstate->pending_writes;
+
+ if (npending == 0)
+ return;
+
+ if (npending > 1)
+ qsort(pending_writes, npending, sizeof(PendingWrite), buffer_cmp);
+
+ if (bulkstate->use_wal)
+ {
+ BlockNumber blknos[MAX_PENDING_WRITES];
+ Page pages[MAX_PENDING_WRITES];
+ bool page_std = true;
+
+ for (int i = 0; i < npending; i++)
+ {
+ blknos[i] = pending_writes[i].blkno;
+ pages[i] = pending_writes[i].buf->data;
+
+ /*
+ * If any of the pages use !page_std, we log them all as such.
+ * That's a bit wasteful, but in practice, a mix of standard and
+ * non-standard page layout is rare. None of the built-in AMs do
+ * that.
+ */
+ if (!pending_writes[i].page_std)
+ page_std = false;
+ }
+ log_newpages(&bulkstate->smgr->smgr_rlocator.locator, bulkstate->forknum,
+ npending, blknos, pages, page_std);
+ }
+
+ for (int i = 0; i < npending; i++)
+ {
+ BlockNumber blkno = pending_writes[i].blkno;
+ Page page = pending_writes[i].buf->data;
+
+ PageSetChecksumInplace(page, blkno);
+
+ if (blkno >= bulkstate->pages_written)
+ {
+ /*
+ * If we have to write pages nonsequentially, fill in the space
+ * with zeroes until we come back and overwrite. This is not
+ * logically necessary on standard Unix filesystems (unwritten
+ * space will read as zeroes anyway), but it should help to avoid
+ * fragmentation. The dummy pages aren't WAL-logged though.
+ */
+ while (blkno > bulkstate->pages_written)
+ {
+ /* don't set checksum for all-zero page */
+ smgrextend(bulkstate->smgr, bulkstate->forknum,
+ bulkstate->pages_written++,
+ &zero_buffer,
+ true);
+ }
+
+ smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
+ bulkstate->pages_written = pending_writes[i].blkno + 1;
+ }
+ else
+ smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
+ pfree(page);
+ }
+
+ bulkstate->npending = 0;
+}
+
+/*
+ * Queue write of 'buf'.
+ *
+ * NB: this takes ownership of 'buf'!
+ *
+ * You are only allowed to write a given block once as part of one bulk write
+ * operation.
+ */
+void
+smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
+{
+ PendingWrite *w;
+
+ w = &bulkstate->pending_writes[bulkstate->npending++];
+ w->buf = buf;
+ w->blkno = blocknum;
+ w->page_std = page_std;
+
+ if (bulkstate->npending == MAX_PENDING_WRITES)
+ smgr_bulk_flush(bulkstate);
+}
+
+/*
+ * Allocate a new buffer which can later be written with smgr_bulk_write().
+ *
+ * There is no function to free the buffer. When you pass it to
+ * smgr_bulk_write(), it takes ownership and frees it when it's no longer
+ * needed.
+ *
+ * This is currently implemented as a simple palloc, but could be implemented
+ * using a ring buffer or larger chunks in the future, so don't rely on it.
+ */
+BulkWriteBuffer
+smgr_bulk_get_buf(BulkWriteState *bulkstate)
+{
+ return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+}
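
[Editor's note] For reference, the minimal end-to-end calling sequence for this new facility looks like the sketch below (illustrative only, in the spirit of the btbuildempty()/spgbuildempty() conversions earlier in this patch): obtain every page from smgr_bulk_get_buf(), hand it back with smgr_bulk_write(), and let smgr_bulk_finish() take care of the WAL and fsync bookkeeping described in the header comment.

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
build_empty_fork_sketch(Relation rel)
{
    BulkWriteState *bulkstate = smgr_bulk_start_rel(rel, INIT_FORKNUM);
    BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

    PageInit((Page) buf, BLCKSZ, 0);            /* or an AM-specific initializer */
    smgr_bulk_write(bulkstate, 0, buf, true);   /* block 0, standard layout */

    smgr_bulk_finish(bulkstate);
}
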
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index b1e9932a291..233f6987f59 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1237,6 +1237,49 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
}
/*
+ * mdregistersync() -- Mark whole relation as needing fsync
+ */
+void
+mdregistersync(SMgrRelation reln, ForkNumber forknum)
+{
+ int segno;
+ int min_inactive_seg;
+
+ /*
+ * NOTE: mdnblocks makes sure we have opened all active segments, so that
+ * the loop below will get them all!
+ */
+ mdnblocks(reln, forknum);
+
+ min_inactive_seg = segno = reln->md_num_open_segs[forknum];
+
+ /*
+ * Temporarily open inactive segments, then close them after sync. There
+ * may be some inactive segments left opened after error, but that is
+ * harmless. We don't bother to clean them up and take a risk of further
+ * trouble. The next mdclose() will soon close them.
+ */
+ while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
+ segno++;
+
+ while (segno > 0)
+ {
+ MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
+
+ register_dirty_segment(reln, forknum, v);
+
+ /* Close inactive segments immediately */
+ if (segno > min_inactive_seg)
+ {
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, segno - 1);
+ }
+
+ segno--;
+ }
+}
+
+/*
* mdimmedsync() -- Immediately sync a relation to stable storage.
*
* Note that only writes already issued are synced; this routine knows
@@ -1255,7 +1298,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
/*
* NOTE: mdnblocks makes sure we have opened all active segments, so that
- * fsync loop will get them all!
+ * the loop below will get them all!
*/
mdnblocks(reln, forknum);
diff --git a/src/backend/storage/smgr/meson.build b/src/backend/storage/smgr/meson.build
index 003d5e30dd4..6d91b18fe67 100644
--- a/src/backend/storage/smgr/meson.build
+++ b/src/backend/storage/smgr/meson.build
@@ -1,6 +1,7 @@
# Copyright (c) 2022-2024, PostgreSQL Global Development Group
backend_sources += files(
+ 'bulk_write.c',
'md.c',
'smgr.c',
)
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c67ba9a51c8..f7f7fe30b6b 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -102,6 +102,7 @@ typedef struct f_smgr
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
} f_smgr;
static const f_smgr smgrsw[] = {
@@ -123,6 +124,7 @@ static const f_smgr smgrsw[] = {
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
+ .smgr_registersync = mdregistersync,
}
};
@@ -616,6 +618,14 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
* on disk at return, only dumped out to the kernel. However,
* provisions will be made to fsync the write before the next checkpoint.
*
+ * NB: The mechanism to ensure fsync at next checkpoint assumes that there is
+ * something that prevents a concurrent checkpoint from "racing ahead" of the
+ * write. One way to prevent that is by holding a lock on the buffer; the
+ * buffer manager's writes are protected by that. The bulk writer facility
+ * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a
+ * checkpoint happened; that relies on the fact that no other backend can be
+ * concurrently modifying the page.
+ *
* skipFsync indicates that the caller will make other provisions to
* fsync the relation, so we needn't bother. Temporary relations also
* do not require fsync.
@@ -734,6 +744,24 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
}
/*
+ * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
+ *
+ * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
+ * true, to register the fsyncs that were skipped earlier.
+ *
+ * Note: be mindful that a checkpoint could already have happened between the
+ * smgrwrite or smgrextend calls and this! In that case, the checkpoint
+ * already missed fsyncing this relation, and you should use smgrimmedsync
+ * instead. Most callers should use the bulk loading facility in bulk_write.c
+ * which handles all that.
+ */
+void
+smgrregistersync(SMgrRelation reln, ForkNumber forknum)
+{
+ smgrsw[reln->smgr_which].smgr_registersync(reln, forknum);
+}
+
+/*
* smgrimmedsync() -- Force the specified relation to stable storage.
*
* Synchronously force all previous writes to the specified relation
@@ -755,6 +783,9 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
* Note that you need to do FlushRelationBuffers() first if there is
* any possibility that there are dirty buffers for the relation;
* otherwise the sync is not very meaningful.
+ *
+ * Most callers should use the bulk loading facility in bulk_write.c
+ * instead of calling this directly.
*/
void
smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
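
[Editor's note] For completeness, the lower-level pattern that smgrregistersync() enables looks like the sketch below (illustrative only, not part of the patch): write with skipFsync = true, then register the whole relation once. As the comment above warns, this is only safe if a checkpoint cannot have started in between; bulk_write.c guards against that by comparing GetRedoRecPtr() before and after, so most callers should use the bulk writer rather than open-coding this.

#include "postgres.h"
#include "storage/smgr.h"

static void
extend_and_register_sketch(SMgrRelation reln, ForkNumber forknum,
                           const char *pages, BlockNumber nblocks)
{
    /* assumes 'pages' is an I/O-aligned buffer holding nblocks pages */
    for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
        smgrextend(reln, forknum, blkno,
                   pages + (Size) blkno * BLCKSZ, true);

    /* fold all the skipped fsync requests into one registration */
    smgrregistersync(reln, forknum);
}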