author     Tom Lane <tgl@sss.pgh.pa.us>    2010-02-07 20:48:13 +0000
committer  Tom Lane <tgl@sss.pgh.pa.us>    2010-02-07 20:48:13 +0000
commit     b9b8831ad60f6e4bd580fe6dbe9749359298a3c4 (patch)
tree       af6948498f13a43edd982b05808ed89b5b8191ab /src/backend/utils/cache
parent     7fc30c488fc6e9674564206193c29b1a657a818f (diff)
Create a "relation mapping" infrastructure to support changing the relfilenodes
of shared or nailed system catalogs. This has two key benefits:
* The new CLUSTER-based VACUUM FULL can be applied safely to all catalogs.
* We no longer have to use an unsafe reindex-in-place approach for reindexing
shared catalogs.
CLUSTER on nailed catalogs now works too, although I left it disabled on
shared catalogs because the resulting pg_index.indisclustered update would
only be visible in one database.
Since reindexing shared system catalogs is now fully transactional and
crash-safe, the former special cases in REINDEX behavior have been removed;
shared catalogs are treated the same as non-shared.
This commit does not do anything about the recently-discussed problem of
deadlocks between VACUUM FULL/CLUSTER on a system catalog and other
concurrent queries; will address that in a separate patch. As a stopgap,
parallel_schedule has been tweaked to run vacuum.sql by itself, to avoid
such failures during the regression tests.
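
For orientation before the diff itself: the essential behavioral change is that a mapped catalog keeps relfilenode = 0 in pg_class, and the relcache asks the new relation mapper for the real filenode. Below is a minimal sketch of that lookup, condensed from the patch's RelationInitPhysicalAddr and RelationMapOidToFilenode; the wrapper function name is hypothetical, and the real code stores the result into rd_node rather than returning it.

```c
#include "postgres.h"

#include "utils/rel.h"
#include "utils/relmapper.h"

/*
 * Condensed sketch, not the patch verbatim: an invalid (zero) relfilenode
 * in pg_class now means "consult the relation map" instead of naming a file.
 */
static Oid
sketch_get_filenode(Relation relation)
{
    /* Ordinary relations still carry their filenode in pg_class. */
    if (OidIsValid(relation->rd_rel->relfilenode))
        return relation->rd_rel->relfilenode;

    /*
     * Mapped catalog: the in-memory copy of pg_filenode.map, plus any
     * not-yet-committed updates made by this backend, is authoritative.
     */
    return RelationMapOidToFilenode(RelationGetRelid(relation),
                                    relation->rd_rel->relisshared);
}
```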
Diffstat (limited to 'src/backend/utils/cache')
-rw-r--r--  src/backend/utils/cache/Makefile    |   4
-rw-r--r--  src/backend/utils/cache/catcache.c  |  62
-rw-r--r--  src/backend/utils/cache/inval.c     | 133
-rw-r--r--  src/backend/utils/cache/relcache.c  | 140
-rw-r--r--  src/backend/utils/cache/relmapper.c | 913
5 files changed, 1193 insertions, 59 deletions
diff --git a/src/backend/utils/cache/Makefile b/src/backend/utils/cache/Makefile index 617cb677f7a..d1caf8e4aeb 100644 --- a/src/backend/utils/cache/Makefile +++ b/src/backend/utils/cache/Makefile @@ -4,7 +4,7 @@ # Makefile for utils/cache # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/utils/cache/Makefile,v 1.25 2010/01/22 16:40:19 rhaas Exp $ +# $PostgreSQL: pgsql/src/backend/utils/cache/Makefile,v 1.26 2010/02/07 20:48:10 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/utils/cache top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = attoptcache.o catcache.o inval.o plancache.o relcache.o \ +OBJS = attoptcache.o catcache.o inval.o plancache.o relcache.o relmapper.o \ spccache.o syscache.o lsyscache.o typcache.o ts_cache.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 8b606a8da27..aac1e87d87e 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.148 2010/01/02 16:57:55 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.149 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,6 +28,7 @@ #endif #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/resowner.h" @@ -679,17 +680,6 @@ ResetCatalogCaches(void) * or a temp table being dropped at end of transaction, or a table created * during the current transaction that is being dropped because of abort.) * Remove all cache entries relevant to the specified relation OID. - * - * A special case occurs when relId is itself one of the cacheable system - * tables --- although those'll never be dropped, they can get flushed from - * the relcache (VACUUM causes this, for example). In that case we need - * to flush all cache entries that came from that table. (At one point we - * also tried to force re-execution of CatalogCacheInitializeCache for - * the cache(s) on that table. This is a bad idea since it leads to all - * kinds of trouble if a cache flush occurs while loading cache entries. - * We now avoid the need to do it by copying cc_tupdesc out of the relcache, - * rather than relying on the relcache to keep a tupdesc for us. Of course - * this assumes the tupdesc of a cachable system table will not change...) */ void CatalogCacheFlushRelation(Oid relId) @@ -706,14 +696,6 @@ CatalogCacheFlushRelation(Oid relId) if (cache->cc_tupdesc == NULL) continue; - /* Does this cache store tuples of the target relation itself? */ - if (cache->cc_tupdesc->attrs[0]->attrelid == relId) - { - /* Yes, so flush all its contents */ - ResetCatalogCache(cache); - continue; - } - /* Does this cache store tuples associated with relations at all? */ if (cache->cc_reloidattr == 0) continue; /* nope, leave it alone */ @@ -776,6 +758,46 @@ CatalogCacheFlushRelation(Oid relId) } /* + * CatalogCacheFlushCatalog + * + * Flush all catcache entries that came from the specified system catalog. + * This is needed after VACUUM FULL/CLUSTER on the catalog, since the + * tuples very likely now have different TIDs than before. (At one point + * we also tried to force re-execution of CatalogCacheInitializeCache for + * the cache(s) on that catalog. 
This is a bad idea since it leads to all + * kinds of trouble if a cache flush occurs while loading cache entries. + * We now avoid the need to do it by copying cc_tupdesc out of the relcache, + * rather than relying on the relcache to keep a tupdesc for us. Of course + * this assumes the tupdesc of a cachable system table will not change...) + */ +void +CatalogCacheFlushCatalog(Oid catId) +{ + CatCache *cache; + + CACHE2_elog(DEBUG2, "CatalogCacheFlushCatalog called for %u", catId); + + for (cache = CacheHdr->ch_caches; cache; cache = cache->cc_next) + { + /* We can ignore uninitialized caches, since they must be empty */ + if (cache->cc_tupdesc == NULL) + continue; + + /* Does this cache store tuples of the target catalog? */ + if (cache->cc_tupdesc->attrs[0]->attrelid == catId) + { + /* Yes, so flush all its contents */ + ResetCatalogCache(cache); + + /* Tell inval.c to call syscache callbacks for this cache */ + CallSyscacheCallbacks(cache->id, NULL); + } + } + + CACHE1_elog(DEBUG2, "end of CatalogCacheFlushCatalog call"); +} + +/* * InitCatCache * * This allocates and initializes a cache for a system catalog relation. diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 99aad752bb3..96439fda18a 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -80,7 +80,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.93 2010/02/03 01:14:17 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.94 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -96,6 +96,7 @@ #include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/relmapper.h" #include "utils/syscache.h" @@ -326,6 +327,21 @@ AddCatcacheInvalidationMessage(InvalidationListHeader *hdr, } /* + * Add a whole-catalog inval entry + */ +static void +AddCatalogInvalidationMessage(InvalidationListHeader *hdr, + Oid dbId, Oid catId) +{ + SharedInvalidationMessage msg; + + msg.cat.id = SHAREDINVALCATALOG_ID; + msg.cat.dbId = dbId; + msg.cat.catId = catId; + AddInvalidationMessage(&hdr->cclist, &msg); +} + +/* * Add a relcache inval entry */ static void @@ -407,6 +423,18 @@ RegisterCatcacheInvalidation(int cacheId, } /* + * RegisterCatalogInvalidation + * + * Register an invalidation event for all catcache entries from a catalog. + */ +static void +RegisterCatalogInvalidation(Oid dbId, Oid catId) +{ + AddCatalogInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs, + dbId, catId); +} + +/* * RegisterRelcacheInvalidation * * As above, but register a relcache invalidation event. 
@@ -443,30 +471,32 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId) static void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) { - int i; - if (msg->id >= 0) { - if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == 0) + if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == InvalidOid) { CatalogCacheIdInvalidate(msg->cc.id, msg->cc.hashValue, &msg->cc.tuplePtr); - for (i = 0; i < syscache_callback_count; i++) - { - struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i; + CallSyscacheCallbacks(msg->cc.id, &msg->cc.tuplePtr); + } + } + else if (msg->id == SHAREDINVALCATALOG_ID) + { + if (msg->cat.dbId == MyDatabaseId || msg->cat.dbId == InvalidOid) + { + CatalogCacheFlushCatalog(msg->cat.catId); - if (ccitem->id == msg->cc.id) - (*ccitem->function) (ccitem->arg, - msg->cc.id, &msg->cc.tuplePtr); - } + /* CatalogCacheFlushCatalog calls CallSyscacheCallbacks as needed */ } } else if (msg->id == SHAREDINVALRELCACHE_ID) { if (msg->rc.dbId == MyDatabaseId || msg->rc.dbId == InvalidOid) { + int i; + RelationCacheInvalidateEntry(msg->rc.relId); for (i = 0; i < relcache_callback_count; i++) @@ -485,6 +515,14 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) */ smgrclosenode(msg->sm.rnode); } + else if (msg->id == SHAREDINVALRELMAP_ID) + { + /* We only care about our own database and shared catalogs */ + if (msg->rm.dbId == InvalidOid) + RelationMapInvalidate(true); + else if (msg->rm.dbId == MyDatabaseId) + RelationMapInvalidate(false); + } else elog(FATAL, "unrecognized SI message id: %d", msg->id); } @@ -506,7 +544,7 @@ InvalidateSystemCaches(void) int i; ResetCatalogCaches(); - RelationCacheInvalidate(); /* gets smgr cache too */ + RelationCacheInvalidate(); /* gets smgr and relmap too */ for (i = 0; i < syscache_callback_count; i++) { @@ -874,7 +912,7 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, else { /* - * Invalidation message is a SHAREDINVALSMGR_ID + * Invalidation message is a catalog or nontransactional inval, * which never cause relcache file invalidation, * so we ignore them, no matter which db they're for. */ @@ -1183,6 +1221,30 @@ CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple) } /* + * CacheInvalidateCatalog + * Register invalidation of the whole content of a system catalog. + * + * This is normally used in VACUUM FULL/CLUSTER, where we haven't so much + * changed any tuples as moved them around. Some uses of catcache entries + * expect their TIDs to be correct, so we have to blow away the entries. + * + * Note: we expect caller to verify that the rel actually is a system + * catalog. If it isn't, no great harm is done, just a wasted sinval message. + */ +void +CacheInvalidateCatalog(Oid catalogId) +{ + Oid databaseId; + + if (IsSharedRelation(catalogId)) + databaseId = InvalidOid; + else + databaseId = MyDatabaseId; + + RegisterCatalogInvalidation(databaseId, catalogId); +} + +/* * CacheInvalidateRelcache * Register invalidation of the specified relation's relcache entry * at end of command. @@ -1277,6 +1339,31 @@ CacheInvalidateSmgr(RelFileNode rnode) SendSharedInvalidMessages(&msg, 1); } +/* + * CacheInvalidateRelmap + * Register invalidation of the relation mapping for a database, + * or for the shared catalogs if databaseId is zero. + * + * Sending this type of invalidation msg forces other backends to re-read + * the indicated relation mapping file. 
It is also necessary to send a + * relcache inval for the specific relations whose mapping has been altered, + * else the relcache won't get updated with the new filenode data. + * + * Note: because these messages are nontransactional, they won't be captured + * in commit/abort WAL entries. Instead, calls to CacheInvalidateRelmap() + * should happen in low-level relmapper.c routines, which are executed while + * replaying WAL as well as when creating it. + */ +void +CacheInvalidateRelmap(Oid databaseId) +{ + SharedInvalidationMessage msg; + + msg.rm.id = SHAREDINVALRELMAP_ID; + msg.rm.dbId = databaseId; + SendSharedInvalidMessages(&msg, 1); +} + /* * CacheRegisterSyscacheCallback @@ -1323,3 +1410,23 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func, ++relcache_callback_count; } + +/* + * CallSyscacheCallbacks + * + * This is exported so that CatalogCacheFlushCatalog can call it, saving + * this module from knowing which catcache IDs correspond to which catalogs. + */ +void +CallSyscacheCallbacks(int cacheid, ItemPointer tuplePtr) +{ + int i; + + for (i = 0; i < syscache_callback_count; i++) + { + struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i; + + if (ccitem->id == cacheid) + (*ccitem->function) (ccitem->arg, cacheid, tuplePtr); + } +} diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index e71416c0f70..ff85195ed13 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.302 2010/02/04 00:09:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.303 2010/02/07 20:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -72,6 +72,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/relcache.h" +#include "utils/relmapper.h" #include "utils/resowner.h" #include "utils/syscache.h" #include "utils/tqual.h" @@ -838,6 +839,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) */ relid = HeapTupleGetOid(pg_class_tuple); relp = (Form_pg_class) GETSTRUCT(pg_class_tuple); + Assert(relid == targetRelId); /* * allocate storage for the relation descriptor, and copy pg_class_tuple @@ -927,6 +929,10 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) /* * Initialize the physical addressing info (RelFileNode) for a relcache entry + * + * Note: at the physical level, relations in the pg_global tablespace must + * be treated as shared, even if relisshared isn't set. Hence we do not + * look at relisshared here. 
*/ static void RelationInitPhysicalAddr(Relation relation) @@ -935,11 +941,22 @@ RelationInitPhysicalAddr(Relation relation) relation->rd_node.spcNode = relation->rd_rel->reltablespace; else relation->rd_node.spcNode = MyDatabaseTableSpace; - if (relation->rd_rel->relisshared) + if (relation->rd_node.spcNode == GLOBALTABLESPACE_OID) relation->rd_node.dbNode = InvalidOid; else relation->rd_node.dbNode = MyDatabaseId; - relation->rd_node.relNode = relation->rd_rel->relfilenode; + if (relation->rd_rel->relfilenode) + relation->rd_node.relNode = relation->rd_rel->relfilenode; + else + { + /* Consult the relation mapper */ + relation->rd_node.relNode = + RelationMapOidToFilenode(relation->rd_id, + relation->rd_rel->relisshared); + if (!OidIsValid(relation->rd_node.relNode)) + elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", + RelationGetRelationName(relation), relation->rd_id); + } } /* @@ -1496,7 +1513,18 @@ formrdesc(const char *relationName, Oid relationReltype, * initialize relation id from info in att array (my, this is ugly) */ RelationGetRelid(relation) = relation->rd_att->attrs[0]->attrelid; - relation->rd_rel->relfilenode = RelationGetRelid(relation); + + /* + * All relations made with formrdesc are mapped. This is necessarily so + * because there is no other way to know what filenode they currently + * have. In bootstrap mode, add them to the initial relation mapper data, + * specifying that the initial filenode is the same as the OID. + */ + relation->rd_rel->relfilenode = InvalidOid; + if (IsBootstrapProcessingMode()) + RelationMapUpdateMap(RelationGetRelid(relation), + RelationGetRelid(relation), + isshared, true); /* * initialize the relation lock manager information @@ -1841,7 +1869,9 @@ RelationClearRelation(Relation relation, bool rebuild) * Never, never ever blow away a nailed-in system relation, because we'd * be unable to recover. However, we must reset rd_targblock, in case we * got called because of a relation cache flush that was triggered by - * VACUUM. Likewise reset the fsm and vm size info. + * VACUUM. Likewise reset the fsm and vm size info. Also, redo + * RelationInitPhysicalAddr in case it is a mapped relation whose mapping + * changed. * * If it's a nailed index, then we need to re-read the pg_class row to see * if its relfilenode changed. We can't necessarily do that here, because @@ -1855,6 +1885,9 @@ RelationClearRelation(Relation relation, bool rebuild) relation->rd_targblock = InvalidBlockNumber; relation->rd_fsm_nblocks = InvalidBlockNumber; relation->rd_vm_nblocks = InvalidBlockNumber; + /* We must recalculate physical address in case it changed */ + RelationInitPhysicalAddr(relation); + if (relation->rd_rel->relkind == RELKIND_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ @@ -1885,7 +1918,8 @@ RelationClearRelation(Relation relation, bool rebuild) /* * Clear out catcache's entries for this relation. This is a bit of - * a hack, but it's a convenient place to do it. + * a hack, but it's a convenient place to do it. (XXX do we really + * still need this?) */ CatalogCacheFlushRelation(RelationGetRelid(relation)); @@ -2104,7 +2138,7 @@ RelationCacheInvalidateEntry(Oid relationId) * RelationCacheInvalidate * Blow away cached relation descriptors that have zero reference counts, * and rebuild those with positive reference counts. Also reset the smgr - * relation cache. + * relation cache and re-read relation mapping data. 
* * This is currently used only to recover from SI message buffer overflow, * so we do not touch new-in-transaction relations; they cannot be targets @@ -2190,6 +2224,11 @@ RelationCacheInvalidate(void) */ smgrcloseall(); + /* + * Reload relation mapping data before starting to reconstruct cache. + */ + RelationMapInvalidateAll(); + /* Phase 2: rebuild the items found to need rebuild in phase 1 */ foreach(l, rebuildFirstList) { @@ -2206,6 +2245,25 @@ RelationCacheInvalidate(void) } /* + * RelationCloseSmgrByOid - close a relcache entry's smgr link + * + * Needed in some cases where we are changing a relation's physical mapping. + * The link will be automatically reopened on next use. + */ +void +RelationCloseSmgrByOid(Oid relationId) +{ + Relation relation; + + RelationIdCacheLookup(relationId, relation); + + if (!PointerIsValid(relation)) + return; /* not in cache, nothing to do */ + + RelationCloseSmgr(relation); +} + +/* * AtEOXact_RelationCache * * Clean up the relcache at main-transaction commit or abort. @@ -2393,7 +2451,8 @@ RelationBuildLocalRelation(const char *relname, TupleDesc tupDesc, Oid relid, Oid reltablespace, - bool shared_relation) + bool shared_relation, + bool mapped_relation) { Relation rel; MemoryContext oldcxt; @@ -2409,6 +2468,8 @@ RelationBuildLocalRelation(const char *relname, * * XXX this list had better match the relations specially handled in * RelationCacheInitializePhase2/3. + * + * XXX do we need this at all?? */ switch (relid) { @@ -2434,6 +2495,9 @@ RelationBuildLocalRelation(const char *relname, elog(ERROR, "shared_relation flag for \"%s\" does not match IsSharedRelation(%u)", relname, relid); + /* Shared relations had better be mapped, too */ + Assert(mapped_relation || !shared_relation); + /* * switch to the cache context to create the relcache entry. */ @@ -2512,7 +2576,9 @@ RelationBuildLocalRelation(const char *relname, /* * Insert relation physical and logical identifiers (OIDs) into the right * places. Note that the physical ID (relfilenode) is initially the same - * as the logical ID (OID). + * as the logical ID (OID); except that for a mapped relation, we set + * relfilenode to zero and rely on RelationInitPhysicalAddr to consult + * the map. 
*/ rel->rd_rel->relisshared = shared_relation; rel->rd_rel->relistemp = rel->rd_istemp; @@ -2522,9 +2588,17 @@ RelationBuildLocalRelation(const char *relname, for (i = 0; i < natts; i++) rel->rd_att->attrs[i]->attrelid = relid; - rel->rd_rel->relfilenode = relid; rel->rd_rel->reltablespace = reltablespace; + if (mapped_relation) + { + rel->rd_rel->relfilenode = InvalidOid; + /* Add it to the active mapping information */ + RelationMapUpdateMap(relid, relid, shared_relation, true); + } + else + rel->rd_rel->relfilenode = relid; + RelationInitLockInfo(rel); /* see lmgr.c */ RelationInitPhysicalAddr(rel); @@ -2577,24 +2651,16 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) HeapTuple tuple; Form_pg_class classform; - /* Can't change relfilenode for nailed tables (indexes ok though) */ - Assert(!relation->rd_isnailed || - relation->rd_rel->relkind == RELKIND_INDEX); - /* Can't change for shared tables or indexes */ - Assert(!relation->rd_rel->relisshared); /* Indexes must have Invalid frozenxid; other relations must not */ Assert((relation->rd_rel->relkind == RELKIND_INDEX && freezeXid == InvalidTransactionId) || TransactionIdIsNormal(freezeXid)); /* Allocate a new relfilenode */ - newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, - relation->rd_rel->relisshared, - NULL); + newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL); /* - * Find the pg_class tuple for the given relation. This is not used - * during bootstrap, so okay to use heap_update always. + * Get a writable copy of the pg_class tuple for the given relation. */ pg_class = heap_open(RelationRelationId, RowExclusiveLock); @@ -2623,12 +2689,23 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) RelationDropStorage(relation); /* - * Now update the pg_class row. + * Now update the pg_class row. However, if we're dealing with a mapped + * index, pg_class.relfilenode doesn't change; instead we have to send + * the update to the relation mapper. */ - classform->relfilenode = newrelfilenode; + if (RelationIsMapped(relation)) + RelationMapUpdateMap(RelationGetRelid(relation), + newrelfilenode, + relation->rd_rel->relisshared, + false); + else + classform->relfilenode = newrelfilenode; + + /* These changes are safe even for a mapped relation */ classform->relpages = 0; /* it's empty until further notice */ classform->reltuples = 0; classform->relfrozenxid = freezeXid; + simple_heap_update(pg_class, &tuple->t_self, tuple); CatalogUpdateIndexes(pg_class, tuple); @@ -2637,8 +2714,8 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid) heap_close(pg_class, RowExclusiveLock); /* - * Make the pg_class row change visible. This will cause the relcache - * entry to get updated, too. + * Make the pg_class row change visible, as well as the relation map + * change if any. This will cause the relcache entry to get updated, too. */ CommandCounterIncrement(); @@ -2687,6 +2764,11 @@ RelationCacheInitialize(void) ctl.hash = oid_hash; RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE, &ctl, HASH_ELEM | HASH_FUNCTION); + + /* + * relation mapper needs initialized too + */ + RelationMapInitialize(); } /* @@ -2705,6 +2787,11 @@ RelationCacheInitializePhase2(void) MemoryContext oldcxt; /* + * relation mapper needs initialized too + */ + RelationMapInitializePhase2(); + + /* * In bootstrap mode, pg_database isn't there yet anyway, so do nothing. 
*/ if (IsBootstrapProcessingMode()) @@ -2753,6 +2840,11 @@ RelationCacheInitializePhase3(void) bool needNewCacheFile = !criticalSharedRelcachesBuilt; /* + * relation mapper needs initialized too + */ + RelationMapInitializePhase3(); + + /* * switch to cache memory context */ oldcxt = MemoryContextSwitchTo(CacheMemoryContext); diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c new file mode 100644 index 00000000000..b22cadf6eb5 --- /dev/null +++ b/src/backend/utils/cache/relmapper.c @@ -0,0 +1,913 @@ +/*------------------------------------------------------------------------- + * + * relmapper.c + * Catalog-to-filenode mapping + * + * For most tables, the physical file underlying the table is specified by + * pg_class.relfilenode. However, that obviously won't work for pg_class + * itself, nor for the other "nailed" catalogs for which we have to be able + * to set up working Relation entries without access to pg_class. It also + * does not work for shared catalogs, since there is no practical way to + * update other databases' pg_class entries when relocating a shared catalog. + * Therefore, for these special catalogs (henceforth referred to as "mapped + * catalogs") we rely on a separately maintained file that shows the mapping + * from catalog OIDs to filenode numbers. Each database has a map file for + * its local mapped catalogs, and there is a separate map file for shared + * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries. + * + * Relocation of a normal table is committed (ie, the new physical file becomes + * authoritative) when the pg_class row update commits. For mapped catalogs, + * the act of updating the map file is effectively commit of the relocation. + * We postpone the file update till just before commit of the transaction + * doing the rewrite, but there is necessarily a window between. Therefore + * mapped catalogs can only be relocated by operations such as VACUUM FULL + * and CLUSTER, which make no transactionally-significant changes: it must be + * safe for the new file to replace the old, even if the transaction itself + * aborts. An important factor here is that the indexes and toast table of + * a mapped catalog must also be mapped, so that the rewrites/relocations of + * all these files commit in a single map file update rather than being tied + * to transaction commit. + * + * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/cache/relmapper.c,v 1.1 2010/02/07 20:48:10 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <unistd.h> + +#include "access/xact.h" +#include "catalog/catalog.h" +#include "catalog/pg_tablespace.h" +#include "catalog/storage.h" +#include "miscadmin.h" +#include "storage/fd.h" +#include "storage/lwlock.h" +#include "utils/inval.h" +#include "utils/pg_crc.h" +#include "utils/relmapper.h" + + +/* + * The map file is critical data: we have no automatic method for recovering + * from loss or corruption of it. We use a CRC so that we can detect + * corruption. To minimize the risk of failed updates, the map file should + * be kept to no more than one standard-size disk sector (ie 512 bytes), + * and we use overwrite-in-place rather than playing renaming games. 
+ * The struct layout below is designed to occupy exactly 512 bytes, which + * might make filesystem updates a bit more efficient. + * + * Entries in the mappings[] array are in no particular order. We could + * speed searching by insisting on OID order, but it really shouldn't be + * worth the trouble given the intended size of the mapping sets. + */ +#define RELMAPPER_FILENAME "pg_filenode.map" + +#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */ + +#define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */ + +typedef struct RelMapping +{ + Oid mapoid; /* OID of a catalog */ + Oid mapfilenode; /* its filenode number */ +} RelMapping; + +typedef struct RelMapFile +{ + int32 magic; /* always RELMAPPER_FILEMAGIC */ + int32 num_mappings; /* number of valid RelMapping entries */ + RelMapping mappings[MAX_MAPPINGS]; + int32 crc; /* CRC of all above */ + int32 pad; /* to make the struct size be 512 exactly */ +} RelMapFile; + +/* + * The currently known contents of the shared map file and our database's + * local map file are stored here. These can be reloaded from disk + * immediately whenever we receive an update sinval message. + */ +static RelMapFile shared_map; +static RelMapFile local_map; + +/* + * We use the same RelMapFile data structure to track uncommitted local + * changes in the mappings (but note the magic and crc fields are not made + * valid in these variables). Currently, map updates are not allowed within + * subtransactions, so one set of transaction-level changes is sufficient. + * + * The active_xxx variables contain updates that are valid in our transaction + * and should be honored by RelationMapOidToFilenode. The pending_xxx + * variables contain updates we have been told about that aren't active yet; + * they will become active at the next CommandCounterIncrement. This setup + * lets map updates act similarly to updates of pg_class rows, ie, they + * become visible only at the next CommandCounterIncrement boundary. + */ +static RelMapFile active_shared_updates; +static RelMapFile active_local_updates; +static RelMapFile pending_shared_updates; +static RelMapFile pending_local_updates; + + +/* non-export function prototypes */ +static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, + bool add_okay); +static void merge_map_updates(RelMapFile *map, const RelMapFile *updates, + bool add_okay); +static void load_relmap_file(bool shared); +static void write_relmap_file(bool shared, RelMapFile *newmap, + bool write_wal, bool send_sinval, bool preserve_files, + Oid dbid, Oid tsid, const char *dbpath); +static void perform_relmap_update(bool shared, const RelMapFile *updates); + + +/* + * RelationMapOidToFilenode + * + * The raison d' etre ... given a relation OID, look up its filenode. + * + * Although shared and local relation OIDs should never overlap, the caller + * always knows which we need --- so pass that information to avoid useless + * searching. + * + * Returns InvalidOid if the OID is not known (which should never happen, + * but the caller is in a better position to report a meaningful error). 
+ */ +Oid +RelationMapOidToFilenode(Oid relationId, bool shared) +{ + const RelMapFile *map; + int32 i; + + /* If there are active updates, believe those over the main maps */ + if (shared) + { + map = &active_shared_updates; + for (i = 0; i < map->num_mappings; i++) + { + if (relationId == map->mappings[i].mapoid) + return map->mappings[i].mapfilenode; + } + map = &shared_map; + for (i = 0; i < map->num_mappings; i++) + { + if (relationId == map->mappings[i].mapoid) + return map->mappings[i].mapfilenode; + } + } + else + { + map = &active_local_updates; + for (i = 0; i < map->num_mappings; i++) + { + if (relationId == map->mappings[i].mapoid) + return map->mappings[i].mapfilenode; + } + map = &local_map; + for (i = 0; i < map->num_mappings; i++) + { + if (relationId == map->mappings[i].mapoid) + return map->mappings[i].mapfilenode; + } + } + + return InvalidOid; +} + +/* + * RelationMapUpdateMap + * + * Install a new relfilenode mapping for the specified relation. + * + * If immediate is true (or we're bootstrapping), the mapping is activated + * immediately. Otherwise it is made pending until CommandCounterIncrement. + */ +void +RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, + bool immediate) +{ + RelMapFile *map; + + if (IsBootstrapProcessingMode()) + { + /* + * In bootstrap mode, the mapping gets installed in permanent map. + */ + if (shared) + map = &shared_map; + else + map = &local_map; + } + else + { + /* + * We don't currently support map changes within subtransactions. + * This could be done with more bookkeeping infrastructure, but it + * doesn't presently seem worth it. + */ + if (GetCurrentTransactionNestLevel() > 1) + elog(ERROR, "cannot change relation mapping within subtransaction"); + + if (immediate) + { + /* Make it active, but only locally */ + if (shared) + map = &active_shared_updates; + else + map = &active_local_updates; + } + else + { + /* Make it pending */ + if (shared) + map = &pending_shared_updates; + else + map = &pending_local_updates; + } + } + apply_map_update(map, relationId, fileNode, true); +} + +/* + * apply_map_update + * + * Insert a new mapping into the given map variable, replacing any existing + * mapping for the same relation. + * + * In some cases the caller knows there must be an existing mapping; pass + * add_okay = false to draw an error if not. + */ +static void +apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay) +{ + int32 i; + + /* Replace any existing mapping */ + for (i = 0; i < map->num_mappings; i++) + { + if (relationId == map->mappings[i].mapoid) + { + map->mappings[i].mapfilenode = fileNode; + return; + } + } + + /* Nope, need to add a new mapping */ + if (!add_okay) + elog(ERROR, "attempt to apply a mapping to unmapped relation %u", + relationId); + if (map->num_mappings >= MAX_MAPPINGS) + elog(ERROR, "ran out of space in relation map"); + map->mappings[map->num_mappings].mapoid = relationId; + map->mappings[map->num_mappings].mapfilenode = fileNode; + map->num_mappings++; +} + +/* + * merge_map_updates + * + * Merge all the updates in the given pending-update map into the target map. + * This is just a bulk form of apply_map_update. + */ +static void +merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay) +{ + int32 i; + + for (i = 0; i < updates->num_mappings; i++) + { + apply_map_update(map, + updates->mappings[i].mapoid, + updates->mappings[i].mapfilenode, + add_okay); + } +} + +/* + * RelationMapRemoveMapping + * + * Remove a relation's entry in the map. 
This is only allowed for "active" + * (but not committed) local mappings. We need it so we can back out the + * entry for the transient target file when doing VACUUM FULL/CLUSTER on + * a mapped relation. + */ +void +RelationMapRemoveMapping(Oid relationId) +{ + RelMapFile *map = &active_local_updates; + int32 i; + + for (i = 0; i < map->num_mappings; i++) + { + if (relationId == map->mappings[i].mapoid) + { + /* Found it, collapse it out */ + map->mappings[i] = map->mappings[map->num_mappings - 1]; + map->num_mappings--; + return; + } + } + elog(ERROR, "could not find temporary mapping for relation %u", + relationId); +} + +/* + * RelationMapInvalidate + * + * This routine is invoked for SI cache flush messages. We must re-read + * the indicated map file. However, we might receive a SI message in a + * process that hasn't yet, and might never, load the mapping files; + * for example the autovacuum launcher, which *must not* try to read + * a local map since it is attached to no particular database. + * So, re-read only if the map is valid now. + */ +void +RelationMapInvalidate(bool shared) +{ + if (shared) + { + if (shared_map.magic == RELMAPPER_FILEMAGIC) + load_relmap_file(true); + } + else + { + if (local_map.magic == RELMAPPER_FILEMAGIC) + load_relmap_file(false); + } +} + +/* + * RelationMapInvalidateAll + * + * Reload all map files. This is used to recover from SI message buffer + * overflow: we can't be sure if we missed an inval message. + * Again, reload only currently-valid maps. + */ +void +RelationMapInvalidateAll(void) +{ + if (shared_map.magic == RELMAPPER_FILEMAGIC) + load_relmap_file(true); + if (local_map.magic == RELMAPPER_FILEMAGIC) + load_relmap_file(false); +} + +/* + * AtCCI_RelationMap + * + * Activate any "pending" relation map updates at CommandCounterIncrement time. + */ +void +AtCCI_RelationMap(void) +{ + if (pending_shared_updates.num_mappings != 0) + { + merge_map_updates(&active_shared_updates, + &pending_shared_updates, + true); + pending_shared_updates.num_mappings = 0; + } + if (pending_local_updates.num_mappings != 0) + { + merge_map_updates(&active_local_updates, + &pending_local_updates, + true); + pending_local_updates.num_mappings = 0; + } +} + +/* + * AtEOXact_RelationMap + * + * Handle relation mapping at main-transaction commit or abort. + * + * During commit, this must be called as late as possible before the actual + * transaction commit, so as to minimize the window where the transaction + * could still roll back after committing map changes. Although nothing + * critically bad happens in such a case, we still would prefer that it + * not happen, since we'd possibly be losing useful updates to the relations' + * pg_class row(s). + * + * During abort, we just have to throw away any pending map changes. + * Normal post-abort cleanup will take care of fixing relcache entries. + */ +void +AtEOXact_RelationMap(bool isCommit) +{ + if (isCommit) + { + /* + * We should not get here with any "pending" updates. (We could + * logically choose to treat such as committed, but in the current + * code this should never happen.) + */ + Assert(pending_shared_updates.num_mappings == 0); + Assert(pending_local_updates.num_mappings == 0); + + /* + * Write any active updates to the actual map files, then reset them. 
+ */ + if (active_shared_updates.num_mappings != 0) + { + perform_relmap_update(true, &active_shared_updates); + active_shared_updates.num_mappings = 0; + } + if (active_local_updates.num_mappings != 0) + { + perform_relmap_update(false, &active_local_updates); + active_local_updates.num_mappings = 0; + } + } + else + { + /* Abort --- drop all local and pending updates */ + active_shared_updates.num_mappings = 0; + active_local_updates.num_mappings = 0; + pending_shared_updates.num_mappings = 0; + pending_local_updates.num_mappings = 0; + } +} + +/* + * AtPrepare_RelationMap + * + * Handle relation mapping at PREPARE. + * + * Currently, we don't support preparing any transaction that changes the map. + */ +void +AtPrepare_RelationMap(void) +{ + if (active_shared_updates.num_mappings != 0 || + active_local_updates.num_mappings != 0 || + pending_shared_updates.num_mappings != 0 || + pending_local_updates.num_mappings != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that modified relation mapping"))); +} + +/* + * CheckPointRelationMap + * + * This is called during a checkpoint. It must ensure that any relation map + * updates that were WAL-logged before the start of the checkpoint are + * securely flushed to disk and will not need to be replayed later. This + * seems unlikely to be a performance-critical issue, so we use a simple + * method: we just take and release the RelationMappingLock. This ensures + * that any already-logged map update is complete, because write_relmap_file + * will fsync the map file before the lock is released. + */ +void +CheckPointRelationMap(void) +{ + LWLockAcquire(RelationMappingLock, LW_SHARED); + LWLockRelease(RelationMappingLock); +} + +/* + * RelationMapFinishBootstrap + * + * Write out the initial relation mapping files at the completion of + * bootstrap. All the mapped files should have been made known to us + * via RelationMapUpdateMap calls. + */ +void +RelationMapFinishBootstrap(void) +{ + Assert(IsBootstrapProcessingMode()); + + /* Shouldn't be anything "pending" ... */ + Assert(active_shared_updates.num_mappings == 0); + Assert(active_local_updates.num_mappings == 0); + Assert(pending_shared_updates.num_mappings == 0); + Assert(pending_local_updates.num_mappings == 0); + + /* Write the files; no WAL or sinval needed */ + write_relmap_file(true, &shared_map, false, false, false, + InvalidOid, GLOBALTABLESPACE_OID, NULL); + write_relmap_file(false, &local_map, false, false, false, + MyDatabaseId, MyDatabaseTableSpace, DatabasePath); +} + +/* + * RelationMapInitialize + * + * This initializes the mapper module at process startup. We can't access the + * database yet, so just make sure the maps are empty. + */ +void +RelationMapInitialize(void) +{ + /* The static variables should initialize to zeroes, but let's be sure */ + shared_map.magic = 0; /* mark it not loaded */ + local_map.magic = 0; + shared_map.num_mappings = 0; + local_map.num_mappings = 0; + active_shared_updates.num_mappings = 0; + active_local_updates.num_mappings = 0; + pending_shared_updates.num_mappings = 0; + pending_local_updates.num_mappings = 0; +} + +/* + * RelationMapInitializePhase2 + * + * This is called to prepare for access to pg_database during startup. + * We should be able to read the shared map file now. + */ +void +RelationMapInitializePhase2(void) +{ + /* + * In bootstrap mode, the map file isn't there yet, so do nothing. 
+ */ + if (IsBootstrapProcessingMode()) + return; + + /* + * Load the shared map file, die on error. + */ + load_relmap_file(true); +} + +/* + * RelationMapInitializePhase3 + * + * This is called as soon as we have determined MyDatabaseId and set up + * DatabasePath. At this point we should be able to read the local map file. + */ +void +RelationMapInitializePhase3(void) +{ + /* + * In bootstrap mode, the map file isn't there yet, so do nothing. + */ + if (IsBootstrapProcessingMode()) + return; + + /* + * Load the local map file, die on error. + */ + load_relmap_file(false); +} + +/* + * load_relmap_file -- load data from the shared or local map file + * + * Because the map file is essential for access to core system catalogs, + * failure to read it is a fatal error. + * + * Note that the local case requires DatabasePath to be set up. + */ +static void +load_relmap_file(bool shared) +{ + RelMapFile *map; + char mapfilename[MAXPGPATH]; + pg_crc32 crc; + int fd; + + if (shared) + { + snprintf(mapfilename, sizeof(mapfilename), "global/%s", + RELMAPPER_FILENAME); + map = &shared_map; + } + else + { + snprintf(mapfilename, sizeof(mapfilename), "%s/%s", + DatabasePath, RELMAPPER_FILENAME); + map = &local_map; + } + + /* Read data ... */ + fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); + if (fd < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open relation mapping file \"%s\": %m", + mapfilename))); + + /* + * Note: we could take RelationMappingLock in shared mode here, but it + * seems unnecessary since our read() should be atomic against any + * concurrent updater's write(). If the file is updated shortly after + * we look, the sinval signaling mechanism will make us re-read it + * before we are able to access any relation that's affected by the + * change. + */ + if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read relation mapping file \"%s\": %m", + mapfilename))); + + close(fd); + + /* check for correct magic number, etc */ + if (map->magic != RELMAPPER_FILEMAGIC || + map->num_mappings < 0 || + map->num_mappings > MAX_MAPPINGS) + ereport(FATAL, + (errmsg("relation mapping file \"%s\" contains invalid data", + mapfilename))); + + /* verify the CRC */ + INIT_CRC32(crc); + COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc)); + FIN_CRC32(crc); + + if (!EQ_CRC32(crc, map->crc)) + ereport(FATAL, + (errmsg("relation mapping file \"%s\" contains incorrect checksum", + mapfilename))); +} + +/* + * Write out a new shared or local map file with the given contents. + * + * The magic number and CRC are automatically updated in *newmap. On + * success, we copy the data to the appropriate permanent static variable. + * + * If write_wal is TRUE then an appropriate WAL message is emitted. + * (It will be false for bootstrap and WAL replay cases.) + * + * If send_sinval is TRUE then a SI invalidation message is sent. + * (This should be true except in bootstrap case.) + * + * If preserve_files is TRUE then the storage manager is warned not to + * delete the files listed in the map. + * + * Because this may be called during WAL replay when MyDatabaseId, + * DatabasePath, etc aren't valid, we require the caller to pass in suitable + * values. The caller is also responsible for being sure no concurrent + * map update could be happening. 
+ */ +static void +write_relmap_file(bool shared, RelMapFile *newmap, + bool write_wal, bool send_sinval, bool preserve_files, + Oid dbid, Oid tsid, const char *dbpath) +{ + int fd; + RelMapFile *realmap; + char mapfilename[MAXPGPATH]; + + /* + * Fill in the overhead fields and update CRC. + */ + newmap->magic = RELMAPPER_FILEMAGIC; + if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS) + elog(ERROR, "attempt to write bogus relation mapping"); + + INIT_CRC32(newmap->crc); + COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc)); + FIN_CRC32(newmap->crc); + + /* + * Open the target file. We prefer to do this before entering the + * critical section, so that an open() failure need not force PANIC. + * + * Note: since we use BasicOpenFile, we are nominally responsible for + * ensuring the fd is closed on error. In practice, this isn't important + * because either an error happens inside the critical section, or we + * are in bootstrap or WAL replay; so an error past this point is always + * fatal anyway. + */ + if (shared) + { + snprintf(mapfilename, sizeof(mapfilename), "global/%s", + RELMAPPER_FILENAME); + realmap = &shared_map; + } + else + { + snprintf(mapfilename, sizeof(mapfilename), "%s/%s", + dbpath, RELMAPPER_FILENAME); + realmap = &local_map; + } + + fd = BasicOpenFile(mapfilename, + O_WRONLY | O_CREAT | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open relation mapping file \"%s\": %m", + mapfilename))); + + if (write_wal) + { + xl_relmap_update xlrec; + XLogRecData rdata[2]; + XLogRecPtr lsn; + + /* now errors are fatal ... */ + START_CRIT_SECTION(); + + xlrec.dbid = dbid; + xlrec.tsid = tsid; + xlrec.nbytes = sizeof(RelMapFile); + + rdata[0].data = (char *) (&xlrec); + rdata[0].len = MinSizeOfRelmapUpdate; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &(rdata[1]); + rdata[1].data = (char *) newmap; + rdata[1].len = sizeof(RelMapFile); + rdata[1].buffer = InvalidBuffer; + rdata[1].next = NULL; + + lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata); + + /* As always, WAL must hit the disk before the data update does */ + XLogFlush(lsn); + } + + errno = 0; + if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile)) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to relation mapping file \"%s\": %m", + mapfilename))); + } + + /* + * We choose to fsync the data to disk before considering the task done. + * It would be possible to relax this if it turns out to be a performance + * issue, but it would complicate checkpointing --- see notes for + * CheckPointRelationMap. + */ + if (pg_fsync(fd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync relation mapping file \"%s\": %m", + mapfilename))); + + if (close(fd)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close relation mapping file \"%s\": %m", + mapfilename))); + + /* + * Now that the file is safely on disk, send sinval message to let other + * backends know to re-read it. We must do this inside the critical + * section: if for some reason we fail to send the message, we have to + * force a database-wide PANIC. Otherwise other backends might continue + * execution with stale mapping information, which would be catastrophic + * as soon as others began to use the now-committed data. 
+ */ + if (send_sinval) + CacheInvalidateRelmap(dbid); + + /* + * Make sure that the files listed in the map are not deleted if the + * outer transaction aborts. This had better be within the critical + * section too: it's not likely to fail, but if it did, we'd arrive + * at transaction abort with the files still vulnerable. PANICing + * will leave things in a good state on-disk. + * + * Note: we're cheating a little bit here by assuming that mapped files + * are either in pg_global or the database's default tablespace. + */ + if (preserve_files) + { + int32 i; + + for (i = 0; i < newmap->num_mappings; i++) + { + RelFileNode rnode; + + rnode.spcNode = tsid; + rnode.dbNode = dbid; + rnode.relNode = newmap->mappings[i].mapfilenode; + RelationPreserveStorage(rnode); + } + } + + /* Success, update permanent copy */ + memcpy(realmap, newmap, sizeof(RelMapFile)); + + /* Critical section done */ + if (write_wal) + END_CRIT_SECTION(); +} + +/* + * Merge the specified updates into the appropriate "real" map, + * and write out the changes. This function must be used for committing + * updates during normal multiuser operation. + */ +static void +perform_relmap_update(bool shared, const RelMapFile *updates) +{ + RelMapFile newmap; + + /* + * Anyone updating a relation's mapping info should take exclusive lock + * on that rel and hold it until commit. This ensures that there will + * not be concurrent updates on the same mapping value; but there could + * easily be concurrent updates on different values in the same file. + * We cover that by acquiring the RelationMappingLock, re-reading the + * target file to ensure it's up to date, applying the updates, and + * writing the data before releasing RelationMappingLock. + * + * There is only one RelationMappingLock. In principle we could try to + * have one per mapping file, but it seems unlikely to be worth the + * trouble. + */ + LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE); + + /* Be certain we see any other updates just made */ + load_relmap_file(shared); + + /* Prepare updated data in a local variable */ + if (shared) + memcpy(&newmap, &shared_map, sizeof(RelMapFile)); + else + memcpy(&newmap, &local_map, sizeof(RelMapFile)); + + /* Apply the updates to newmap. No new mappings should appear. */ + merge_map_updates(&newmap, updates, false); + + /* Write out the updated map and do other necessary tasks */ + write_relmap_file(shared, &newmap, true, true, true, + (shared ? InvalidOid : MyDatabaseId), + (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace), + DatabasePath); + + /* Now we can release the lock */ + LWLockRelease(RelationMappingLock); +} + +/* + * RELMAP resource manager's routines + */ +void +relmap_redo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; + + /* Backup blocks are not used in relmap records */ + Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); + + if (info == XLOG_RELMAP_UPDATE) + { + xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record); + RelMapFile newmap; + char *dbpath; + + if (xlrec->nbytes != sizeof(RelMapFile)) + elog(PANIC, "relmap_redo: wrong size %u in relmap update record", + xlrec->nbytes); + memcpy(&newmap, xlrec->data, sizeof(newmap)); + + /* We need to construct the pathname for this database */ + dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid); + + /* + * Write out the new map and send sinval, but of course don't + * write a new WAL entry. There's no surrounding transaction + * to tell to preserve files, either. 
+ * + * There shouldn't be anyone else updating relmaps during WAL replay, + * so we don't bother to take the RelationMappingLock. We would + * need to do so if load_relmap_file needed to interlock against + * writers. + */ + write_relmap_file((xlrec->dbid == InvalidOid), &newmap, + false, true, false, + xlrec->dbid, xlrec->tsid, dbpath); + + pfree(dbpath); + } + else + elog(PANIC, "relmap_redo: unknown op code %u", info); +} + +void +relmap_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + if (info == XLOG_RELMAP_UPDATE) + { + xl_relmap_update *xlrec = (xl_relmap_update *) rec; + + appendStringInfo(buf, "update relmap: database %u tablespace %u size %u", + xlrec->dbid, xlrec->tsid, xlrec->nbytes); + } + else + appendStringInfo(buf, "UNKNOWN"); +} |
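
As a usage-level illustration of the on-disk format defined above (a 512-byte file with magic 0x592717, a mapping count, up to 62 OID/filenode pairs, a CRC, and a pad word), here is a hypothetical standalone inspector for pg_filenode.map. It is a sketch rather than anything shipped with the patch: it assumes the host has the same endianness and integer sizes as the cluster that wrote the file, and it skips CRC verification, which the backend performs with its pg_crc routines.

```c
/*
 * relmapdump.c -- hypothetical reader for pg_filenode.map (sketch only).
 *
 *   cc -o relmapdump relmapdump.c
 *   ./relmapdump $PGDATA/global/pg_filenode.map
 */
#include <stdio.h>
#include <stdint.h>

#define RELMAPPER_FILEMAGIC 0x592717
#define MAX_MAPPINGS        62      /* 62 * 8 + 16 = 512 bytes */

typedef struct
{
    uint32_t    mapoid;             /* OID of a mapped catalog */
    uint32_t    mapfilenode;        /* its current filenode */
} RelMapping;

typedef struct
{
    int32_t     magic;
    int32_t     num_mappings;
    RelMapping  mappings[MAX_MAPPINGS];
    int32_t     crc;                /* not verified here */
    int32_t     pad;
} RelMapFile;

int
main(int argc, char **argv)
{
    RelMapFile  map;
    FILE       *f;
    int         i;

    if (argc != 2 || (f = fopen(argv[1], "rb")) == NULL)
    {
        fprintf(stderr, "usage: relmapdump path/to/pg_filenode.map\n");
        return 1;
    }
    if (fread(&map, sizeof(map), 1, f) != 1 ||
        map.magic != RELMAPPER_FILEMAGIC ||
        map.num_mappings < 0 || map.num_mappings > MAX_MAPPINGS)
    {
        fprintf(stderr, "%s: not a valid relation mapping file\n", argv[1]);
        fclose(f);
        return 1;
    }
    for (i = 0; i < map.num_mappings; i++)
        printf("OID %u -> filenode %u\n",
               (unsigned) map.mappings[i].mapoid,
               (unsigned) map.mappings[i].mapfilenode);
    fclose(f);
    return 0;
}
```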