diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/access/gin/gininsert.c | 42 | ||||
-rw-r--r-- | src/backend/access/gist/gist.c | 13 | ||||
-rw-r--r-- | src/backend/access/hash/hash.c | 15 | ||||
-rw-r--r-- | src/backend/access/hash/hashovfl.c | 9 | ||||
-rw-r--r-- | src/backend/access/hash/hashpage.c | 20 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 31 | ||||
-rw-r--r-- | src/backend/access/transam/xlog.c | 17 | ||||
-rw-r--r-- | src/backend/catalog/catalog.c | 14 | ||||
-rw-r--r-- | src/backend/catalog/heap.c | 19 | ||||
-rw-r--r-- | src/backend/catalog/index.c | 11 | ||||
-rw-r--r-- | src/backend/catalog/storage.c | 49 | ||||
-rw-r--r-- | src/backend/commands/tablecmds.c | 19 | ||||
-rw-r--r-- | src/backend/parser/gram.y | 11 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 53 | ||||
-rw-r--r-- | src/backend/storage/file/Makefile | 2 | ||||
-rw-r--r-- | src/backend/storage/file/copydir.c | 3 | ||||
-rw-r--r-- | src/backend/storage/file/fd.c | 2 | ||||
-rw-r--r-- | src/backend/storage/file/reinit.c | 396 | ||||
-rw-r--r-- | src/backend/utils/adt/dbsize.c | 1 | ||||
-rw-r--r-- | src/backend/utils/cache/relcache.c | 2 |
20 files changed, 662 insertions, 67 deletions
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 8681edefe67..d66c79cb8de 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -19,6 +19,7 @@ #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/smgr.h" #include "storage/indexfsm.h" #include "utils/memutils.h" @@ -412,6 +413,47 @@ ginbuild(PG_FUNCTION_ARGS) } /* + * ginbuildempty() -- build an empty gin index in the initialization fork + */ +Datum +ginbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + Buffer RootBuffer, + MetaBuffer; + + /* An empty GIN index has two pages. */ + MetaBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE); + RootBuffer = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize both pages, mark them dirty, unlock and release buffer. */ + START_CRIT_SECTION(); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); + + /* XLOG the new pages */ + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BufferGetBlockNumber(MetaBuffer), + BufferGetPage(MetaBuffer)); + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BufferGetBlockNumber(RootBuffer), + BufferGetPage(RootBuffer)); + END_CRIT_SECTION(); + + /* Unlock and release the buffers. 
*/ + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); + + PG_RETURN_VOID(); +} + +/* * Inserts value during normal insertion */ static uint32 diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 7cd144e2f09..c26ac74332d 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -219,6 +219,19 @@ gistbuildCallback(Relation index, } /* + * gistbuildempty() -- build an empty gist index in the initialization fork + */ +Datum +gistbuildempty(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unlogged GIST indexes are not supported"))); + + PG_RETURN_VOID(); +} + +/* * gistinsert -- wrapper for GiST tuple insertion. * * This is the public interface routine for tuple insertion in GiSTs. diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index e53ec3d5eaa..4df92d44c03 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -69,7 +69,7 @@ hashbuild(PG_FUNCTION_ARGS) estimate_rel_size(heap, NULL, &relpages, &reltuples); /* Initialize the hash index metadata page and initial buckets */ - num_buckets = _hash_metapinit(index, reltuples); + num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then @@ -114,6 +114,19 @@ hashbuild(PG_FUNCTION_ARGS) } /* + * hashbuildempty() -- build an empty hash index in the initialization fork + */ +Datum +hashbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + + _hash_metapinit(index, 0, INIT_FORKNUM); + + PG_RETURN_VOID(); +} + +/* * Per-tuple callback from IndexBuildHeapScan */ static void diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 7c6e902ea93..454ad6c7a8a 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -259,7 +259,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * convenient to 
pre-mark them as "in use" too. */ bit = metap->hashm_spares[splitnum]; - _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit)); + _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM); metap->hashm_spares[splitnum]++; } else @@ -280,7 +280,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. */ - newbuf = _hash_getnewbuf(rel, blkno); + newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); metap->hashm_spares[splitnum]++; @@ -503,7 +503,8 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, * All bits in the new bitmap page are set to "1", indicating "in use". */ void -_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno) +_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, + ForkNumber forkNum) { Buffer buf; Page pg; @@ -520,7 +521,7 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno) * page while holding the metapage lock, but this path is taken so seldom * that it's not worth worrying about. */ - buf = _hash_getnewbuf(rel, blkno); + buf = _hash_getnewbuf(rel, blkno, forkNum); pg = BufferGetPage(buf); /* initialize the page's special space */ diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 2ebeda98b59..29f7b25b4ec 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -183,9 +183,9 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno) * extend the index at a time. 
*/ Buffer -_hash_getnewbuf(Relation rel, BlockNumber blkno) +_hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) { - BlockNumber nblocks = RelationGetNumberOfBlocks(rel); + BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum); Buffer buf; if (blkno == P_NEW) @@ -197,13 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno) /* smgr insists we use P_NEW to extend the relation */ if (blkno == nblocks) { - buf = ReadBuffer(rel, P_NEW); + buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); if (BufferGetBlockNumber(buf) != blkno) elog(ERROR, "unexpected hash relation size: %u, should be %u", BufferGetBlockNumber(buf), blkno); } else - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL); + buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL); LockBuffer(buf, HASH_WRITE); @@ -324,7 +324,7 @@ _hash_chgbufaccess(Relation rel, * multiple buffer locks is ignored. */ uint32 -_hash_metapinit(Relation rel, double num_tuples) +_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) { HashMetaPage metap; HashPageOpaque pageopaque; @@ -340,7 +340,7 @@ _hash_metapinit(Relation rel, double num_tuples) uint32 i; /* safety check */ - if (RelationGetNumberOfBlocks(rel) != 0) + if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); @@ -383,7 +383,7 @@ _hash_metapinit(Relation rel, double num_tuples) * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. 
*/ - metabuf = _hash_getnewbuf(rel, HASH_METAPAGE); + metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); pg = BufferGetPage(metabuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); @@ -451,7 +451,7 @@ _hash_metapinit(Relation rel, double num_tuples) /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); - buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i)); + buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; @@ -468,7 +468,7 @@ _hash_metapinit(Relation rel, double num_tuples) /* * Initialize first bitmap page */ - _hash_initbitmap(rel, metap, num_buckets + 1); + _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); /* all done */ _hash_wrtbuf(rel, metabuf); @@ -785,7 +785,7 @@ _hash_splitbucket(Relation rel, oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); nblkno = start_nblkno; - nbuf = _hash_getnewbuf(rel, nblkno); + nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM); npage = BufferGetPage(nbuf); /* initialize the new bucket's primary page */ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 655a40090e9..a13d629b0ef 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -29,6 +29,7 @@ #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/smgr.h" #include "utils/memutils.h" @@ -205,6 +206,36 @@ btbuildCallback(Relation index, } /* + * btbuildempty() -- build an empty btree index in the initialization fork + */ +Datum +btbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + Page metapage; + + /* Construct metapage. */ + metapage = (Page) palloc(BLCKSZ); + _bt_initmetapage(metapage, P_NONE, 0); + + /* Write the page. If archiving/streaming, XLOG it. 
*/ + smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE, + (char *) metapage, true); + if (XLogIsNeeded()) + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + BTREE_METAPAGE, metapage); + + /* + * An immediate sync is required even if we xlog'd the page, because the + * write did not go through shared_buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(index->rd_smgr, INIT_FORKNUM); + + PG_RETURN_VOID(); +} + +/* * btinsert() -- insert an index tuple into a btree. * * Descend the tree recursively, find the appropriate location for our diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index bf62138bf86..1ec6f2f15ac 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -49,6 +49,7 @@ #include "storage/latch.h" #include "storage/pmsignal.h" #include "storage/procarray.h" +#include "storage/reinit.h" #include "storage/smgr.h" #include "storage/spin.h" #include "utils/builtins.h" @@ -5961,6 +5962,14 @@ StartupXLOG(void) CheckRequiredParameterValues(); /* + * We're in recovery, so unlogged relations may be trashed + * and must be reset. This should be done BEFORE allowing Hot + * Standby connections, so that read-only backends don't try to + * read whatever garbage is left over from before. + */ + ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP); + + /* * Initialize for Hot Standby, if enabled. We won't let backends in * yet, not until we've reached the min recovery point specified in * control file and we've established a recovery snapshot from a @@ -6414,6 +6423,14 @@ StartupXLOG(void) PreallocXlogFiles(EndOfLog); /* + * Reset initial contents of unlogged relations. This has to be done + * AFTER recovery is complete so that any unlogged relations created + * during recovery also get picked up. + */ + if (InRecovery) + ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + + /* * Okay, we're officially UP. 
*/ InRecovery = false; diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 88b5c2a215d..fc5a8fcd655 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -55,7 +55,8 @@ const char *forkNames[] = { "main", /* MAIN_FORKNUM */ "fsm", /* FSM_FORKNUM */ - "vm" /* VISIBILITYMAP_FORKNUM */ + "vm", /* VISIBILITYMAP_FORKNUM */ + "init" /* INIT_FORKNUM */ }; /* @@ -82,14 +83,14 @@ forkname_to_number(char *forkName) * We use this to figure out whether a filename could be a relation * fork (as opposed to an oddly named stray file that somehow ended * up in the database directory). If the passed string begins with - * a fork name (other than the main fork name), we return its length. - * If not, we return 0. + * a fork name (other than the main fork name), we return its length, + * and set *fork (if not NULL) to the fork number. If not, we return 0. * * Note that the present coding assumes that there are no fork names which * are prefixes of other fork names. */ int -forkname_chars(const char *str) +forkname_chars(const char *str, ForkNumber *fork) { ForkNumber forkNum; @@ -97,7 +98,11 @@ forkname_chars(const char *str) { int len = strlen(forkNames[forkNum]); if (strncmp(forkNames[forkNum], str, len) == 0) + { + if (fork) + *fork = forkNum; return len; + } } return 0; } @@ -537,6 +542,7 @@ GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence) case RELPERSISTENCE_TEMP: backend = MyBackendId; break; + case RELPERSISTENCE_UNLOGGED: case RELPERSISTENCE_PERMANENT: backend = InvalidBackendId; break; diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index bcf6caa2eef..8027d740f6d 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -1211,6 +1211,25 @@ heap_create_with_catalog(const char *relname, register_on_commit_action(relid, oncommit); /* + * If this is an unlogged relation, it needs an init fork so that it + * can be correctly reinitialized on restart. 
Since we're going to + * do an immediate sync, we only need to xlog this if archiving or + * streaming is enabled. And the immediate sync is required, because + * otherwise there's no guarantee that this will hit the disk before + * the next checkpoint moves the redo pointer. + */ + if (relpersistence == RELPERSISTENCE_UNLOGGED) + { + Assert(relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE); + + smgrcreate(new_rel_desc->rd_smgr, INIT_FORKNUM, false); + if (XLogIsNeeded()) + log_smgrcreate(&new_rel_desc->rd_smgr->smgr_rnode.node, + INIT_FORKNUM); + smgrimmedsync(new_rel_desc->rd_smgr, INIT_FORKNUM); + } + + /* * ok, the relation has been cataloged, so close our relations and return * the OID of the newly created relation. */ diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 8fbe8ebc91d..e50a084f003 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1438,6 +1438,17 @@ index_build(Relation heapRelation, Assert(PointerIsValid(stats)); /* + * If this is an unlogged index, we need to write out an init fork for it. + */ + if (heapRelation->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) + { + RegProcedure ambuildempty = indexRelation->rd_am->ambuildempty; + RelationOpenSmgr(indexRelation); + smgrcreate(indexRelation->rd_smgr, INIT_FORKNUM, false); + OidFunctionCall1(ambuildempty, PointerGetDatum(indexRelation)); + } + + /* * If it's for an exclusion constraint, make a second pass over the heap * to verify that the constraint is satisfied. 
*/ diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 671aaff133a..0bd0451f008 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -74,6 +74,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ typedef struct xl_smgr_create { RelFileNode rnode; + ForkNumber forkNum; } xl_smgr_create; typedef struct xl_smgr_truncate @@ -98,9 +99,6 @@ void RelationCreateStorage(RelFileNode rnode, char relpersistence) { PendingRelDelete *pending; - XLogRecPtr lsn; - XLogRecData rdata; - xl_smgr_create xlrec; SMgrRelation srel; BackendId backend; bool needs_wal; @@ -111,6 +109,10 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) backend = MyBackendId; needs_wal = false; break; + case RELPERSISTENCE_UNLOGGED: + backend = InvalidBackendId; + needs_wal = false; + break; case RELPERSISTENCE_PERMANENT: backend = InvalidBackendId; needs_wal = true; @@ -124,19 +126,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) - { - /* - * Make an XLOG entry reporting the file creation. - */ - xlrec.rnode = rnode; - - rdata.data = (char *) &xlrec; - rdata.len = sizeof(xlrec); - rdata.buffer = InvalidBuffer; - rdata.next = NULL; - - lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata); - } + log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM); /* Add the relation to the list of stuff to delete at abort */ pending = (PendingRelDelete *) @@ -150,6 +140,29 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) } /* + * Perform XLogInsert of a XLOG_SMGR_CREATE record to WAL. + */ +void +log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum) +{ + xl_smgr_create xlrec; + XLogRecData rdata; + + /* + * Make an XLOG entry reporting the file creation. 
+ */ + xlrec.rnode = *rnode; + xlrec.forkNum = forkNum; + + rdata.data = (char *) &xlrec; + rdata.len = sizeof(xlrec); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata); +} + +/* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. */ @@ -478,7 +491,7 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record) SMgrRelation reln; reln = smgropen(xlrec->rnode, InvalidBackendId); - smgrcreate(reln, MAIN_FORKNUM, true); + smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) { @@ -523,7 +536,7 @@ smgr_desc(StringInfo buf, uint8 xl_info, char *rec) if (info == XLOG_SMGR_CREATE) { xl_smgr_create *xlrec = (xl_smgr_create *) rec; - char *path = relpathperm(xlrec->rnode, MAIN_FORKNUM); + char *path = relpathperm(xlrec->rnode, xlrec->forkNum); appendStringInfo(buf, "file create: %s", path); pfree(path); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6729d8336f5..3f6b814f02c 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -5128,12 +5128,12 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, RelationGetRelationName(pkrel)))); /* - * References from permanent tables to temp tables are disallowed because - * the contents of the temp table disappear at the end of each session. - * References from temp tables to permanent tables are also disallowed, - * because other backends might need to run the RI triggers on the perm - * table, but they can't reliably see tuples in the local buffers of other - * backends. + * References from permanent or unlogged tables to temp tables, and from + * permanent tables to unlogged tables, are disallowed because the + * referenced data can vanish out from under us. 
References from temp + * tables to any other table type are also disallowed, because other + * backends might need to run the RI triggers on the perm table, but they + * can't reliably see tuples in the local buffers of other backends. */ switch (rel->rd_rel->relpersistence) { @@ -5143,6 +5143,13 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("constraints on permanent tables may reference only permanent tables"))); break; + case RELPERSISTENCE_UNLOGGED: + if (pkrel->rd_rel->relpersistence != RELPERSISTENCE_PERMANENT + && pkrel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("constraints on unlogged tables may reference only permanent or unlogged tables"))); + break; case RELPERSISTENCE_TEMP: if (pkrel->rd_rel->relpersistence != RELPERSISTENCE_TEMP) ereport(ERROR, diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 37840baa0f6..26a5e84d44a 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -538,8 +538,8 @@ static RangeVar *makeRangeVarFromAnyName(List *names, int position, core_yyscan_ TO TRAILING TRANSACTION TREAT TRIGGER TRIM TRUE_P TRUNCATE TRUSTED TYPE_P - UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNTIL - UPDATE USER USING + UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED + UNTIL UPDATE USER USING VACUUM VALID VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING VERBOSE VERSION_P VIEW VOLATILE @@ -2365,6 +2365,7 @@ OptTemp: TEMPORARY { $$ = RELPERSISTENCE_TEMP; } | LOCAL TEMP { $$ = RELPERSISTENCE_TEMP; } | GLOBAL TEMPORARY { $$ = RELPERSISTENCE_TEMP; } | GLOBAL TEMP { $$ = RELPERSISTENCE_TEMP; } + | UNLOGGED { $$ = RELPERSISTENCE_UNLOGGED; } | /*EMPTY*/ { $$ = RELPERSISTENCE_PERMANENT; } ; @@ -7927,6 +7928,11 @@ OptTempTableName: $$ = $4; $$->relpersistence = RELPERSISTENCE_TEMP; } + | UNLOGGED opt_table qualified_name + { + $$ = 
$3; + $$->relpersistence = RELPERSISTENCE_UNLOGGED; + } | TABLE qualified_name { $$ = $2; @@ -11395,6 +11401,7 @@ unreserved_keyword: | UNENCRYPTED | UNKNOWN | UNLISTEN + | UNLOGGED | UNTIL | UPDATE | VACUUM diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 860e736ff05..34e54536692 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -82,7 +82,7 @@ static bool IsForInput; static volatile BufferDesc *PinCountWaitBuf = NULL; -static Buffer ReadBuffer_common(SMgrRelation reln, +static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit); @@ -97,7 +97,9 @@ static void TerminateBufferIO(volatile BufferDesc *buf, bool clear_dirty, int set_flag_bits); static void shared_buffer_write_error_callback(void *arg); static void local_buffer_write_error_callback(void *arg); -static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum, +static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, + char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr); @@ -241,8 +243,8 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * miss. */ pgstat_count_buffer_read(reln); - buf = ReadBuffer_common(reln->rd_smgr, forkNum, blockNum, - mode, strategy, &hit); + buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence, + forkNum, blockNum, mode, strategy, &hit); if (hit) pgstat_count_buffer_hit(reln); return buf; @@ -253,10 +255,10 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require * a relcache entry for the relation. 
* - * NB: At present, this function may not be used on temporary relations, which + * NB: At present, this function may only be used on permanent relations, which * is OK, because we only use it during XLOG replay. If in the future we - * want to use it on temporary relations, we could pass the backend ID as an - * additional parameter. + * want to use it on temporary or unlogged relations, we could pass additional + * parameters. */ Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, @@ -267,7 +269,8 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, SMgrRelation smgr = smgropen(rnode, InvalidBackendId); - return ReadBuffer_common(smgr, forkNum, blockNum, mode, strategy, &hit); + return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum, + mode, strategy, &hit); } @@ -277,7 +280,7 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, * *hit is set to true if the request was satisfied from shared buffer cache. */ static Buffer -ReadBuffer_common(SMgrRelation smgr, ForkNumber forkNum, +ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit) { @@ -319,7 +322,8 @@ ReadBuffer_common(SMgrRelation smgr, ForkNumber forkNum, * lookup the buffer. IO_IN_PROGRESS is set if the requested block is * not currently in memory. */ - bufHdr = BufferAlloc(smgr, forkNum, blockNum, strategy, &found); + bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, + strategy, &found); if (found) pgBufferUsage.shared_blks_hit++; else @@ -500,7 +504,7 @@ ReadBuffer_common(SMgrRelation smgr, ForkNumber forkNum, * No locks are held either at entry or exit. 
*/ static volatile BufferDesc * -BufferAlloc(SMgrRelation smgr, ForkNumber forkNum, +BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr) @@ -797,8 +801,11 @@ BufferAlloc(SMgrRelation smgr, ForkNumber forkNum, * 1 so that the buffer can survive one clock-sweep pass.) */ buf->tag = newTag; - buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR); - buf->flags |= BM_TAG_VALID; + buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT); + if (relpersistence == RELPERSISTENCE_PERMANENT) + buf->flags |= BM_TAG_VALID | BM_PERMANENT; + else + buf->flags |= BM_TAG_VALID; buf->usage_count = 1; UnlockBufHdr(buf); @@ -1155,8 +1162,10 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner) * BufferSync -- Write out all dirty buffers in the pool. * * This is called at checkpoint time to write out all dirty shared buffers. - * The checkpoint request flags should be passed in; currently the only one - * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. + * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE + * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN is + * set, we write even unlogged buffers, which are otherwise skipped. The + * remaining flags currently have no effect here. */ static void BufferSync(int flags) @@ -1165,11 +1174,19 @@ BufferSync(int flags) int num_to_scan; int num_to_write; int num_written; + int mask = BM_DIRTY; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); /* + * Unless this is a shutdown checkpoint, we write only permanent, dirty + * buffers. But at shutdown time, we write all dirty buffers. 
+ */ + if (!(flags & CHECKPOINT_IS_SHUTDOWN)) + mask |= BM_PERMANENT; + + /* * Loop over all buffers, and mark the ones that need to be written with * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we * can estimate how much work needs to be done. @@ -1196,7 +1213,7 @@ BufferSync(int flags) */ LockBufHdr(bufHdr); - if (bufHdr->flags & BM_DIRTY) + if ((bufHdr->flags & mask) == mask) { bufHdr->flags |= BM_CHECKPOINT_NEEDED; num_to_write++; @@ -1897,12 +1914,12 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) * Determines the current number of pages in the relation. */ BlockNumber -RelationGetNumberOfBlocks(Relation relation) +RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) { /* Open it at the smgr level if not already done */ RelationOpenSmgr(relation); - return smgrnblocks(relation->rd_smgr, MAIN_FORKNUM); + return smgrnblocks(relation->rd_smgr, forkNum); } /* --------------------------------------------------------------------- diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile index 3b93aa1b45d..d2198f2b93e 100644 --- a/src/backend/storage/file/Makefile +++ b/src/backend/storage/file/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/file top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = fd.o buffile.o copydir.o +OBJS = fd.o buffile.o copydir.o reinit.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index f7dc509b500..587fb9260c0 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -38,7 +38,6 @@ #endif -static void copy_file(char *fromfile, char *tofile); static void fsync_fname(char *fname, bool isdir); @@ -142,7 +141,7 @@ copydir(char *fromdir, char *todir, bool recurse) /* * copy one file */ -static void +void copy_file(char *fromfile, char *tofile) { char *buffer; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 4f7dc39d638..a1dc18be44b 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -2055,7 +2055,7 @@ looks_like_temp_rel_name(const char *name) /* We might have _forkname or .segment or both. */ if (name[pos] == '_') { - int forkchar = forkname_chars(&name[pos+1]); + int forkchar = forkname_chars(&name[pos+1], NULL); if (forkchar <= 0) return false; pos += forkchar + 1; diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c new file mode 100644 index 00000000000..b75178b8045 --- /dev/null +++ b/src/backend/storage/file/reinit.c @@ -0,0 +1,396 @@ +/*------------------------------------------------------------------------- + * + * reinit.c + * Reinitialization of unlogged relations + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/reinit.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> + +#include "catalog/catalog.h" +#include "storage/copydir.h" +#include "storage/fd.h" +#include "storage/reinit.h" +#include "utils/hsearch.h" +#include 
"utils/memutils.h" + +static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, + int op); +static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, + int op); +static bool parse_filename_for_nontemp_relation(const char *name, + int *oidchars, ForkNumber *fork); + +typedef struct { + char oid[OIDCHARS+1]; +} unlogged_relation_entry; + +/* + * Reset unlogged relations from before the last restart. + * + * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any + * relation with an "init" fork, except for the "init" fork itself. + * + * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main + * fork. + */ +void +ResetUnloggedRelations(int op) +{ + char temp_path[MAXPGPATH]; + DIR *spc_dir; + struct dirent *spc_de; + MemoryContext tmpctx, oldctx; + + /* Log it. */ + ereport(DEBUG1, + (errmsg("resetting unlogged relations: cleanup %d init %d", + (op & UNLOGGED_RELATION_CLEANUP) != 0, + (op & UNLOGGED_RELATION_INIT) != 0))); + + /* + * Just to be sure we don't leak any memory, let's create a temporary + * memory context for this operation. + */ + tmpctx = AllocSetContextCreate(CurrentMemoryContext, + "ResetUnloggedRelations", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldctx = MemoryContextSwitchTo(tmpctx); + + /* + * First process unlogged files in pg_default ($PGDATA/base) + */ + ResetUnloggedRelationsInTablespaceDir("base", op); + + /* + * Cycle through directories for all non-default tablespaces. + */ + spc_dir = AllocateDir("pg_tblspc"); + + while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL) + { + if (strcmp(spc_de->d_name, ".") == 0 || + strcmp(spc_de->d_name, "..") == 0) + continue; + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); + ResetUnloggedRelationsInTablespaceDir(temp_path, op); + } + + FreeDir(spc_dir); + + /* + * Restore memory context. 
+ */ + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(tmpctx); +} + +/* Process one tablespace directory for ResetUnloggedRelations */ +static void +ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) +{ + DIR *ts_dir; + struct dirent *de; + char dbspace_path[MAXPGPATH]; + + ts_dir = AllocateDir(tsdirname); + if (ts_dir == NULL) + { + /* anything except ENOENT is fishy */ + if (errno != ENOENT) + elog(LOG, + "could not open tablespace directory \"%s\": %m", + tsdirname); + return; + } + + while ((de = ReadDir(ts_dir, tsdirname)) != NULL) + { + int i = 0; + + /* + * We're only interested in the per-database directories, which have + * numeric names. Note that this code will also (properly) ignore "." + * and "..". + */ + while (isdigit((unsigned char) de->d_name[i])) + ++i; + if (de->d_name[i] != '\0' || i == 0) + continue; + + snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", + tsdirname, de->d_name); + ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); + } + + FreeDir(ts_dir); +} + +/* Process one per-dbspace directory for ResetUnloggedRelations */ +static void +ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) +{ + DIR *dbspace_dir; + struct dirent *de; + char rm_path[MAXPGPATH]; + + /* Caller must specify at least one operation. */ + Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); + + /* + * Cleanup is a two-pass operation. First, we go through and identify all + * the files with init forks. Then, we go through again and nuke + * everything with the same OID except the init fork. + */ + if ((op & UNLOGGED_RELATION_CLEANUP) != 0) + { + HTAB *hash = NULL; + HASHCTL ctl; + + /* Open the directory. 
*/ + dbspace_dir = AllocateDir(dbspacedirname); + if (dbspace_dir == NULL) + { + elog(LOG, + "could not open dbspace directory \"%s\": %m", + dbspacedirname); + return; + } + + /* + * It's possible that someone could create a ton of unlogged relations + * in the same database & tablespace, so we'd better use a hash table + * rather than an array or linked list to keep track of which files + * need to be reset. Otherwise, this cleanup operation would be + * O(n^2). + */ + ctl.keysize = sizeof(unlogged_relation_entry); + ctl.entrysize = sizeof(unlogged_relation_entry); + hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM); + + /* Scan the directory. */ + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + unlogged_relation_entry ent; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* + * Put the OID portion of the name into the hash table, if it isn't + * already. + */ + memset(ent.oid, 0, sizeof(ent.oid)); + memcpy(ent.oid, de->d_name, oidchars); + hash_search(hash, &ent, HASH_ENTER, NULL); + } + + /* Done with the first pass. */ + FreeDir(dbspace_dir); + + /* + * If we didn't find any init forks, there's no point in continuing; + * we can bail out now. + */ + if (hash_get_num_entries(hash) == 0) + { + hash_destroy(hash); + return; + } + + /* + * Now, make a second pass and remove anything that matches. First, + * reopen the directory. + */ + dbspace_dir = AllocateDir(dbspacedirname); + if (dbspace_dir == NULL) + { + elog(LOG, + "could not open dbspace directory \"%s\": %m", + dbspacedirname); + hash_destroy(hash); + return; + } + + /* Scan the directory. 
*/ + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + bool found; + unlogged_relation_entry ent; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* We never remove the init fork. */ + if (forkNum == INIT_FORKNUM) + continue; + + /* + * See whether the OID portion of the name shows up in the hash + * table. + */ + memset(ent.oid, 0, sizeof(ent.oid)); + memcpy(ent.oid, de->d_name, oidchars); + hash_search(hash, &ent, HASH_FIND, &found); + + /* If so, nuke it! */ + if (found) + { + snprintf(rm_path, sizeof(rm_path), "%s/%s", + dbspacedirname, de->d_name); + /* + * It's tempting to actually throw an error here, but since + * this code gets run during database startup, that could + * result in the database failing to start. (XXX Should we do + * it anyway?) + */ + if (unlink(rm_path)) + elog(LOG, "could not unlink file \"%s\": %m", rm_path); + else + elog(DEBUG2, "unlinked file \"%s\"", rm_path); + } + } + + /* Cleanup is complete. */ + FreeDir(dbspace_dir); + hash_destroy(hash); + } + + /* + * Initialization happens after cleanup is complete: we copy each init + * fork file to the corresponding main fork file. Note that if we are + * asked to do both cleanup and init, we may never get here: if the cleanup + * code determines that there are no init forks in this dbspace, it will + * return before we get to this point. + */ + if ((op & UNLOGGED_RELATION_INIT) != 0) + { + /* Open the directory. */ + dbspace_dir = AllocateDir(dbspacedirname); + if (dbspace_dir == NULL) + { + /* we just saw this directory, so it really ought to be there */ + elog(LOG, + "could not open dbspace directory \"%s\": %m", + dbspacedirname); + return; + } + + /* Scan the directory. 
+ */ + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int oidchars; + char oidbuf[OIDCHARS+1]; + char srcpath[MAXPGPATH]; + char dstpath[MAXPGPATH]; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* Construct source pathname. */ + snprintf(srcpath, sizeof(srcpath), "%s/%s", + dbspacedirname, de->d_name); + + /* Construct destination pathname. */ + memcpy(oidbuf, de->d_name, oidchars); + oidbuf[oidchars] = '\0'; + snprintf(dstpath, sizeof(dstpath), "%s/%s%s", + dbspacedirname, oidbuf, de->d_name + oidchars + 1 + + strlen(forkNames[INIT_FORKNUM])); + + /* OK, we're ready to perform the actual copy. */ + elog(DEBUG2, "copying %s to %s", srcpath, dstpath); + copy_file(srcpath, dstpath); + } + + /* Done with the first pass. */ + FreeDir(dbspace_dir); + } +} + +/* + * Basic parsing of putative relation filenames. + * + * This function returns true if the file appears to be in the correct format + * for a non-temporary relation and false otherwise. + * + * NB: If this function returns true, the caller is entitled to assume that + * *oidchars has been set to a value no more than OIDCHARS, and thus + * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID + * portion of the filename. This is critical to protect against a possible + * buffer overrun. + */ +static bool +parse_filename_for_nontemp_relation(const char *name, int *oidchars, + ForkNumber *fork) +{ + int pos; + + /* Look for a non-empty string of digits (that isn't too long). */ + for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) + ; + if (pos == 0 || pos > OIDCHARS) + return false; + *oidchars = pos; + + /* Check for a fork name.
*/ + if (name[pos] != '_') + *fork = MAIN_FORKNUM; + else + { + int forkchar; + + forkchar = forkname_chars(&name[pos+1], fork); + if (forkchar <= 0) + return false; + pos += forkchar + 1; + } + + /* Check for a segment number. */ + if (name[pos] == '.') + { + int segchar; + for (segchar = 1; isdigit((unsigned char) name[pos+segchar]); ++segchar) + ; + if (segchar <= 1) + return false; + pos += segchar; + } + + /* Now we should be at the end. */ + if (name[pos] != '\0') + return false; + return true; +} diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index e352cdafb3b..f33c29e4b21 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -615,6 +615,7 @@ pg_relation_filepath(PG_FUNCTION_ARGS) /* Determine owning backend. */ switch (relform->relpersistence) { + case RELPERSISTENCE_UNLOGGED: case RELPERSISTENCE_PERMANENT: backend = InvalidBackendId; break; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 1509686079b..fa9e9ca3a4e 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -851,6 +851,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) relation->rd_newRelfilenodeSubid = InvalidSubTransactionId; switch (relation->rd_rel->relpersistence) { + case RELPERSISTENCE_UNLOGGED: case RELPERSISTENCE_PERMANENT: relation->rd_backend = InvalidBackendId; break; @@ -2490,6 +2491,7 @@ RelationBuildLocalRelation(const char *relname, rel->rd_rel->relpersistence = relpersistence; switch (relpersistence) { + case RELPERSISTENCE_UNLOGGED: case RELPERSISTENCE_PERMANENT: rel->rd_backend = InvalidBackendId; break; |