aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/buffer/bufmgr.c
diff options
context:
space:
mode:
author: Robert Haas <rhaas@postgresql.org> 2022-03-29 11:31:43 -0400
committer: Robert Haas <rhaas@postgresql.org> 2022-03-29 11:48:36 -0400
commit: 9c08aea6a3090a396be334cc58c511edab05776a (patch)
tree: c15e6e9fa45a18173a5bbd67ff4a4c889e616cde /src/backend/storage/buffer/bufmgr.c
parent: bf902c13930c268388644100663f2998868b6e85 (diff)
download: postgresql-9c08aea6a3090a396be334cc58c511edab05776a.tar.gz
postgresql-9c08aea6a3090a396be334cc58c511edab05776a.zip
Add new block-by-block strategy for CREATE DATABASE.
Because this strategy logs changes on a block-by-block basis, it avoids the need to checkpoint before and after the operation. However, because it logs each changed block individually, it might generate a lot of extra write-ahead logging if the template database is large. Therefore, the older strategy remains available via a new STRATEGY parameter to CREATE DATABASE, and a corresponding --strategy option to createdb. Somewhat controversially, this patch assembles the list of relations to be copied to the new database by reading the pg_class relation of the template database. Cross-database access like this isn't normally possible, but it can be made to work here because there can't be any connections to the database being copied, nor can it contain any in-doubt transactions. Even so, we have to use lower-level interfaces than normal, since the table scan and relcache interfaces will not work for a database to which we're not connected. The advantage of this approach is that we do not need to rely on the filesystem to determine what ought to be copied, but instead on PostgreSQL's own knowledge of the database structure. This avoids, for example, copying stray files that happen to be located in the source database directory. Dilip Kumar, with a fairly large number of cosmetic changes by me. Reviewed and tested by Ashutosh Sharma, Andres Freund, John Naylor, Greg Nancarrow, Neha Sharma. Additional feedback from Bruce Momjian, Heikki Linnakangas, Julien Rouhaud, Adam Brusselback, Kyotaro Horiguchi, Tomas Vondra, Andrew Dunstan, Álvaro Herrera, and others. Discussion: http://postgr.es/m/CA+TgmoYtcdxBjLh31DLxUXHxFVMPGzrU5_T=CYCvRyFHywSBUQ@mail.gmail.com
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- src/backend/storage/buffer/bufmgr.c 172
1 files changed, 164 insertions, 8 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 11005edc735..d73a40c1bc6 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -38,6 +38,7 @@
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
+#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
@@ -486,6 +487,9 @@ static void FindAndDropRelFileNodeBuffers(RelFileNode rnode,
ForkNumber forkNum,
BlockNumber nForkBlock,
BlockNumber firstDelBlock);
+static void RelationCopyStorageUsingBuffer(Relation src, Relation dst,
+										   ForkNumber forkNum,
+										   bool permanent);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
static int rnode_comparator(const void *p1, const void *p2);
@@ -772,23 +776,23 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
* ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
* a relcache entry for the relation.
*
- * NB: At present, this function may only be used on permanent relations, which
- * is OK, because we only use it during XLOG replay. If in the future we
- * want to use it on temporary or unlogged relations, we could pass additional
- * parameters.
+ * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
+ * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
+ * cannot be used for temporary relations (and making that work might be
+ * difficult, unless we only want to read temporary relations for our own
+ * BackendId).
*/
Buffer
ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
BlockNumber blockNum, ReadBufferMode mode,
- BufferAccessStrategy strategy)
+ BufferAccessStrategy strategy, bool permanent)
{
bool hit;
SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
- Assert(InRecovery);
-
- return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
+ return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
+ RELPERSISTENCE_UNLOGGED, forkNum, blockNum, /* 'permanent' selects between exactly these two persistence levels */
mode, strategy, &hit);
}
@@ -3677,6 +3681,158 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
}
/* ---------------------------------------------------------------------
+ *		RelationCopyStorageUsingBuffer
+ *
+ *		Copy every block of one fork from 'src' to 'dst' through the buffer
+ *		manager.  This does the same job as RelationCopyStorage, but reads
+ *		and extends via bufmgr APIs instead of smgrread and smgrextend.
+ *
+ *		src, dst	relations whose storage (rd_node) is read / extended
+ *		forkNum		the fork to copy
+ *		permanent	true for a permanent relation, false for an unlogged
+ *					one; forwarded to ReadBufferWithoutRelcache and used
+ *					to decide whether the copied pages are WAL-logged
+ *
+ *		Every source block is copied, including new and empty pages.  The
+ *		destination is extended one block at a time with P_NEW, so leaving
+ *		out any source block would place all later blocks at a lower block
+ *		number than they have in the source.
+ * --------------------------------------------------------------------
+ */
+static void
+RelationCopyStorageUsingBuffer(Relation src, Relation dst, ForkNumber forkNum,
+							   bool permanent)
+{
+	Buffer		srcBuf;
+	Buffer		dstBuf;
+	Page		srcPage;
+	Page		dstPage;
+	bool		use_wal;
+	BlockNumber nblocks;
+	BlockNumber blkno;
+	BufferAccessStrategy bstrategy_src;
+	BufferAccessStrategy bstrategy_dst;
+
+	/*
+	 * In general, we want to write WAL whenever wal_level > 'minimal', but
+	 * we can skip it when copying any fork of an unlogged relation other
+	 * than the init fork.
+	 */
+	use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
+
+	/* Get number of blocks in the source relation. */
+	nblocks = smgrnblocks(RelationGetSmgr(src), forkNum);
+
+	/* Nothing to copy; just return (before allocating any strategy). */
+	if (nblocks == 0)
+		return;
+
+	/* This is a bulk operation, so use buffer access strategies. */
+	bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
+	bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
+
+	/* Iterate over each block of the source relation file. */
+	for (blkno = 0; blkno < nblocks; blkno++)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Read block from source relation.  Take a share lock on it for the
+		 * duration of the copy, so that no exclusive-lock holder can change
+		 * the page underneath the memcpy below.
+		 */
+		srcBuf = ReadBufferWithoutRelcache(src->rd_node, forkNum, blkno,
+										   RBM_NORMAL, bstrategy_src,
+										   permanent);
+		LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
+		srcPage = BufferGetPage(srcBuf);
+
+		/*
+		 * Use P_NEW to extend the destination relation.  Do this for every
+		 * source block, even a new or empty one, so that destination block
+		 * numbers stay identical to the source block numbers.
+		 */
+		dstBuf = ReadBufferWithoutRelcache(dst->rd_node, forkNum, P_NEW,
+										   RBM_NORMAL, bstrategy_dst,
+										   permanent);
+		LockBuffer(dstBuf, BUFFER_LOCK_EXCLUSIVE);
+		dstPage = BufferGetPage(dstBuf);
+
+		START_CRIT_SECTION();
+
+		/* Copy page data from the source to the destination. */
+		memcpy(dstPage, srcPage, BLCKSZ);
+		MarkBufferDirty(dstBuf);
+
+		/* WAL-log the copied page. */
+		if (use_wal)
+			log_newpage_buffer(dstBuf, true);
+
+		END_CRIT_SECTION();
+
+		UnlockReleaseBuffer(dstBuf);
+		UnlockReleaseBuffer(srcBuf);
+	}
+
+	/*
+	 * Release the strategy objects.  This function runs once per fork of
+	 * every relation being copied, so they would otherwise pile up.
+	 */
+	FreeAccessStrategy(bstrategy_src);
+	FreeAccessStrategy(bstrategy_dst);
+}
+
+/* ---------------------------------------------------------------------
+ *		CreateAndCopyRelationData
+ *
+ *		Create storage for the destination relfilenode, then copy each
+ *		fork that exists in the source relfilenode into it.
+ *
+ *		'permanent' must be true when the relation is permanent and false
+ *		when it is unlogged.  Temporary relations are not supported by
+ *		this API at present.
+ * --------------------------------------------------------------------
+ */
+void
+CreateAndCopyRelationData(RelFileNode src_rnode, RelFileNode dst_rnode,
+						  bool permanent)
+{
+	Relation	fake_src;
+	Relation	fake_dst;
+	char		persistence;
+
+	/* Translate the boolean into the matching relpersistence code. */
+	if (permanent)
+		persistence = RELPERSISTENCE_PERMANENT;
+	else
+		persistence = RELPERSISTENCE_UNLOGGED;
+
+	/*
+	 * The source relation lives in another database, so no real relcache
+	 * entry is available for it.  Only the physical-storage fields are
+	 * needed here, which makes a fake entry sufficient.  Going through the
+	 * smgr layer directly instead would force us to worry about
+	 * invalidations.
+	 */
+	fake_src = CreateFakeRelcacheEntry(src_rnode);
+	fake_dst = CreateFakeRelcacheEntry(dst_rnode);
+
+	/*
+	 * Create the destination storage.  It is not registered for cleanup:
+	 * CREATE DATABASE has its own cleanup mechanism that removes the whole
+	 * database directory, so per-relation registration is unnecessary.
+	 */
+	RelationCreateStorage(dst_rnode, persistence, false);
+
+	/* The main fork is always copied. */
+	RelationCopyStorageUsingBuffer(fake_src, fake_dst, MAIN_FORKNUM,
+								   permanent);
+
+	/* Then every additional fork that the source actually has. */
+	for (ForkNumber fnum = MAIN_FORKNUM + 1; fnum <= MAX_FORKNUM; fnum++)
+	{
+		/* Skip forks that are absent in the source. */
+		if (!smgrexists(RelationGetSmgr(fake_src), fnum))
+			continue;
+
+		smgrcreate(RelationGetSmgr(fake_dst), fnum, false);
+
+		/*
+		 * Fork creation is WAL-logged for a permanent relation, and for
+		 * the init fork of an unlogged one.
+		 */
+		if (permanent || fnum == INIT_FORKNUM)
+			log_smgrcreate(&dst_rnode, fnum);
+
+		/* Copy this fork's contents block by block. */
+		RelationCopyStorageUsingBuffer(fake_src, fake_dst, fnum, permanent);
+	}
+
+	/* Done with the fake relcache entries. */
+	FreeFakeRelcacheEntry(fake_src);
+	FreeFakeRelcacheEntry(fake_dst);
+}
+
+/* ---------------------------------------------------------------------
* FlushDatabaseBuffers
*
* This function writes all dirty pages of a database out to disk