Diffstat (limited to 'src'): 220 files changed, 6622 insertions, 3710 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 04952b533de..8b1b357beaa 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -254,7 +254,7 @@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ PG_SYSROOT = @PG_SYSROOT@ -override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) ifdef PGXS override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS) diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 21f2f4af97e..926f1e4008a 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -25,11 +25,11 @@ /* GUC */ int default_toast_compression = TOAST_PGLZ_COMPRESSION; -#define NO_LZ4_SUPPORT() \ +#define NO_COMPRESSION_SUPPORT(method) \ ereport(ERROR, \ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("compression method lz4 not supported"), \ - errdetail("This functionality requires the server to be built with lz4 support."))) + errmsg("compression method %s not supported", method), \ + errdetail("This functionality requires the server to be built with %s support.", method))) /* * Compress a varlena using PGLZ. @@ -139,7 +139,7 @@ struct varlena * lz4_compress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 valsize; @@ -182,7 +182,7 @@ struct varlena * lz4_decompress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -215,7 +215,7 @@ struct varlena * lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -289,7 +289,7 @@ CompressionNameToMethod(const char *compression) else if (strcmp(compression, "lz4") == 0) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); #endif return TOAST_LZ4_COMPRESSION; } diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 745a04ef26e..8f918e00af7 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -364,7 +364,7 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { *vmbuf = vm_readbuf(rel, mapBlock, false); if (!BufferIsValid(*vmbuf)) - return false; + return (uint8) 0; } map = PageGetContents(BufferGetPage(*vmbuf)); diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index f0f4f974bce..60684c53422 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -42,6 +42,19 @@ GetIndexAmRoutine(Oid amhandler) elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct", amhandler); + /* Assert that all required callbacks are present. 
*/ + Assert(routine->ambuild != NULL); + Assert(routine->ambuildempty != NULL); + Assert(routine->aminsert != NULL); + Assert(routine->ambulkdelete != NULL); + Assert(routine->amvacuumcleanup != NULL); + Assert(routine->amcostestimate != NULL); + Assert(routine->amoptions != NULL); + Assert(routine->amvalidate != NULL); + Assert(routine->ambeginscan != NULL); + Assert(routine->amrescan != NULL); + Assert(routine->amendscan != NULL); + return routine; } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4af1ff1e9e5..d69798795b4 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -892,9 +892,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkey; int keysz = 0; - StrategyNumber strat_total; + StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; @@ -1034,7 +1034,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * need to be kept in sync. *---------- */ - strat_total = BTEqualStrategyNumber; if (so->numberOfKeys > 0) { AttrNumber curattr; @@ -1122,16 +1121,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanDirectionIsForward(dir) : ScanDirectionIsBackward(dir))) { - /* Yes, so build the key in notnullkeys[keysz] */ - bkey = ¬nullkeys[keysz]; + /* Final startKeys[] entry will be deduced NOT NULL key */ + bkey = ¬nullkey; ScanKeyEntryInitialize(bkey, (SK_SEARCHNOTNULL | SK_ISNULL | (impliesNN->sk_flags & (SK_BT_DESC | SK_BT_NULLS_FIRST))), curattr, - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - BTGreaterStrategyNumber : - BTLessStrategyNumber), + ScanDirectionIsForward(dir) ? + BTGreaterStrategyNumber : BTLessStrategyNumber, InvalidOid, InvalidOid, InvalidOid, diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 85cbe397cb2..7918176fc58 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1183,7 +1183,11 @@ EndPrepare(GlobalTransaction gxact) * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * RecordTransactionCommit uses DELAY_CHKPT_IN_COMMIT for; see notes + * there.) Note that DELAY_CHKPT_IN_COMMIT is used to find transactions in + * the critical commit section. We need to know about such transactions + * for conflict detection in logical replication. See + * GetOldestActiveTransactionId(true, false) and its use. * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. @@ -2298,7 +2302,7 @@ ProcessTwoPhaseBuffer(FullTransactionId fxid, * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit (q.v. if you change - * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * this function): in particular, we must set DELAY_CHKPT_IN_COMMIT to avoid a * race condition. 
* * We know the transaction made at least one XLOG entry (its PREPARE), @@ -2318,7 +2322,7 @@ RecordTransactionCommitPrepared(TransactionId xid, const char *gid) { XLogRecPtr recptr; - TimestampTz committs = GetCurrentTimestamp(); + TimestampTz committs; bool replorigin; /* @@ -2331,8 +2335,24 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before + * commit time is written. + */ + pg_write_barrier(); + + /* + * Note that it is important to set the committs value after marking + * ourselves as in the commit critical section (DELAY_CHKPT_IN_COMMIT). + * This is because we want to ensure that all transactions that have + * acquired a commit timestamp are finished before we allow the logical + * replication client to advance its xid, which is used to hold back dead + * rows for conflict detection. See comments atop worker.c. + */ + committs = GetCurrentTimestamp(); /* * Emit the XLOG commit record. Note that we mark 2PC commits as @@ -2381,7 +2401,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 41601fcb280..b46e7e9c2a6 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1431,10 +1431,22 @@ RecordTransactionCommit(void) * without holding the ProcArrayLock, since we're the only one * modifying it. This makes checkpoint's determination of which xacts * are delaying the checkpoint a bit fuzzy, but it doesn't matter. + * + * Note that it is important to get the commit timestamp after marking + * the transaction in the commit critical section. See + * RecordTransactionCommitPrepared. */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + Assert(xactStopTimestamp == 0); + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible + * before commit time is written. + */ + pg_write_barrier(); /* * Insert the commit XLOG record.
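The flag-then-timestamp ordering that both commit paths above now enforce deserves a concrete illustration. What follows is a minimal, self-contained C sketch of the protocol, not PostgreSQL code: DemoProc, DEMO_DELAY_CHKPT_IN_COMMIT, and the demo_* helpers are hypothetical stand-ins for PGPROC, the real flag bit, GetCurrentTimestamp(), and the barrier.h primitives, and the read side is assumed to pair the write barrier with a read fence (in the server, the observer additionally runs under ProcArrayLock).

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/* Hypothetical stand-ins for PGPROC and the barrier.h primitives. */
typedef struct DemoProc
{
	volatile uint32_t delayChkptFlags;
} DemoProc;

#define DEMO_DELAY_CHKPT_IN_COMMIT	0x2
#define demo_write_barrier()	__atomic_thread_fence(__ATOMIC_SEQ_CST)
#define demo_read_barrier()		__atomic_thread_fence(__ATOMIC_SEQ_CST)

/*
 * Committing backend: publish the in-commit flag first, then acquire the
 * commit timestamp, mirroring RecordTransactionCommit[Prepared] above.
 */
static int64_t
demo_begin_commit(DemoProc *proc)
{
	proc->delayChkptFlags |= DEMO_DELAY_CHKPT_IN_COMMIT;
	demo_write_barrier();		/* flag visible before the timestamp is taken */
	return (int64_t) time(NULL);	/* stands in for GetCurrentTimestamp() */
}

/*
 * Observer, e.g. the logic behind GetOldestActiveTransactionId(true, ...):
 * a backend observed without the flag cannot yet have acquired its commit
 * timestamp, so conflict detection cannot miss an in-commit transaction.
 */
static bool
demo_seen_in_commit(const DemoProc *proc)
{
	bool		in_commit = (proc->delayChkptFlags & DEMO_DELAY_CHKPT_IN_COMMIT) != 0;

	demo_read_barrier();		/* pairs with the writer-side barrier */
	return in_commit;
}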
@@ -1537,7 +1549,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8e7827c6ed9..5553c20fee8 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -96,6 +96,7 @@ #include "utils/guc_hooks.h" #include "utils/guc_tables.h" #include "utils/injection_point.h" +#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -702,7 +703,7 @@ static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version); static void WriteControlFile(void); static void ReadControlFile(void); static void UpdateControlFile(void); -static char *str_time(pg_time_t tnow); +static char *str_time(pg_time_t tnow, char *buf, size_t bufsize); static int get_sync_bit(int method); @@ -1091,6 +1092,9 @@ XLogInsertRecord(XLogRecData *rdata, pgWalUsage.wal_bytes += rechdr->xl_tot_len; pgWalUsage.wal_records++; pgWalUsage.wal_fpi += num_fpi; + + /* Required for the flush of pending stats WAL data */ + pgstat_report_fixed = true; } return EndPos; @@ -2108,6 +2112,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + + /* + * Required for the flush of pending stats WAL data, per + * update of pgWalUsage. + */ + pgstat_report_fixed = true; } } } @@ -5361,11 +5371,9 @@ BootStrapXLOG(uint32 data_checksum_version) } static char * -str_time(pg_time_t tnow) +str_time(pg_time_t tnow, char *buf, size_t bufsize) { - char *buf = palloc(128); - - pg_strftime(buf, 128, + pg_strftime(buf, bufsize, "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&tnow, log_timezone)); @@ -5608,6 +5616,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + char timebuf[128]; /* * We should have an aux process resource owner to use, and we should not @@ -5636,25 +5645,29 @@ StartupXLOG(void) */ ereport(IsPostmasterEnvironment ? 
LOG : NOTICE, (errmsg("database system was shut down at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNED_IN_RECOVERY: ereport(LOG, (errmsg("database system was shut down in recovery at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNING: ereport(LOG, (errmsg("database system shutdown was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_IN_CRASH_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at %s", - str_time(ControlFile->time)), + str_time(ControlFile->time, + timebuf, sizeof(timebuf))), errhint("This probably means that some data is corrupted and" " you will have to use the last backup for recovery."))); break; @@ -5662,7 +5675,8 @@ StartupXLOG(void) case DB_IN_ARCHIVE_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s", - str_time(ControlFile->checkPointCopy.time)), + str_time(ControlFile->checkPointCopy.time, + timebuf, sizeof(timebuf))), errhint("If this has occurred more than once some data might be corrupted" " and you might need to choose an earlier recovery target."))); break; @@ -5670,7 +5684,8 @@ StartupXLOG(void) case DB_IN_PRODUCTION: ereport(LOG, (errmsg("database system was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; default: @@ -6315,6 +6330,12 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */ + if (endOfRecoveryInfo->lastPage) + pfree(endOfRecoveryInfo->lastPage); + pfree(endOfRecoveryInfo->recoveryStopReason); + pfree(endOfRecoveryInfo); + /* * All done with end-of-recovery actions. * @@ -7121,7 +7142,7 @@ CreateCheckPoint(int flags) * starting snapshot of locks and transactions. */ if (!shutdown && XLogStandbyInfoActive()) - checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true); else checkPoint.oldestActiveXid = InvalidTransactionId; diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index ac1f801b1eb..dcc8d4f9c1b 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -723,11 +723,12 @@ restart: /* Calculate pointer to beginning of next page */ targetPagePtr += XLOG_BLCKSZ; - /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, - XLOG_BLCKSZ)); - + /* + * Read the page header before processing the record data, so we + * can handle the case where the previous record ended as being a + * partial one. 
+ */ + readOff = ReadPageInternal(state, targetPagePtr, SizeOfXLogShortPHD); if (readOff == XLREAD_WOULDBLOCK) return XLREAD_WOULDBLOCK; else if (readOff < 0) @@ -776,6 +777,15 @@ restart: goto err; } + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + /* Append the continuation from this page to the buffer */ pageHeaderSize = XLogPageHeaderSize(pageHeader); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 23878b2dd91..f23ec8969c2 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1626,6 +1626,7 @@ ShutdownWalRecovery(void) close(readFile); readFile = -1; } + pfree(xlogreader->private_data); XLogReaderFree(xlogreader); XLogPrefetcherFree(xlogprefetcher); @@ -4760,7 +4761,7 @@ bool check_primary_slot_name(char **newval, void **extra, GucSource source) { if (*newval && strcmp(*newval, "") != 0 && - !ReplicationSlotValidateName(*newval, WARNING)) + !ReplicationSlotValidateName(*newval, false, WARNING)) return false; return true; diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 1395032413e..244acf52f36 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -103,6 +103,7 @@ GetSubscription(Oid subid, bool missing_ok) sub->passwordrequired = subform->subpasswordrequired; sub->runasowner = subform->subrunasowner; sub->failover = subform->subfailover; + sub->retaindeadtuples = subform->subretaindeadtuples; /* Get conninfo */ datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, @@ -319,7 +320,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state, */ void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, - XLogRecPtr sublsn) + XLogRecPtr sublsn, bool already_locked) { Relation rel; HeapTuple tup; @@ -327,9 +328,24 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state, Datum values[Natts_pg_subscription_rel]; bool replaces[Natts_pg_subscription_rel]; - LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + if (already_locked) + { +#ifdef USE_ASSERT_CHECKING + LOCKTAG tag; - rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId, + RowExclusiveLock, true)); + SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0); + Assert(LockHeldByMe(&tag, AccessShareLock, true)); +#endif + + rel = table_open(SubscriptionRelRelationId, NoLock); + } + else + { + LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + } /* Try finding existing mapping. 
*/ tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index b2d5332effc..f6eca09ee15 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1386,7 +1386,8 @@ REVOKE ALL ON pg_subscription FROM public; GRANT SELECT (oid, subdbid, subskiplsn, subname, subowner, subenabled, subbinary, substream, subtwophasestate, subdisableonerr, subpasswordrequired, subrunasowner, subfailover, - subslotname, subsynccommit, subpublications, suborigin) + subretaindeadtuples, subslotname, subsynccommit, + subpublications, suborigin) ON pg_subscription TO public; CREATE VIEW pg_stat_subscription_stats AS diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7e2792ead71..8345bc0264b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3582,6 +3582,7 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) { Plan *plan = ((PlanState *) mstate)->plan; + Memoize *mplan = (Memoize *) plan; ListCell *lc; List *context; StringInfoData keystr; @@ -3602,7 +3603,7 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) plan, ancestors); - foreach(lc, ((Memoize *) plan)->param_exprs) + foreach(lc, mplan->param_exprs) { Node *expr = (Node *) lfirst(lc); @@ -3618,6 +3619,24 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) pfree(keystr.data); + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Estimates: capacity=%u distinct keys=%.0f lookups=%.0f hit percent=%.2f%%\n", + mplan->est_entries, mplan->est_unique_keys, + mplan->est_calls, mplan->est_hit_ratio * 100.0); + } + else + { + ExplainPropertyUInteger("Estimated Capacity", NULL, mplan->est_entries, es); + ExplainPropertyFloat("Estimated Distinct Lookup Keys", NULL, mplan->est_unique_keys, 0, es); + ExplainPropertyFloat("Estimated Lookups", NULL, mplan->est_calls, 0, es); + ExplainPropertyFloat("Estimated Hit Percent", NULL, mplan->est_hit_ratio * 100.0, 2, es); + } + } + if (!es->analyze) return; diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 8d2d7431544..77f8461f42e 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -1588,6 +1588,7 @@ ImportForeignSchema(ImportForeignSchemaStmt *stmt) pstmt->utilityStmt = (Node *) cstmt; pstmt->stmt_location = rs->stmt_location; pstmt->stmt_len = rs->stmt_len; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* Execute statement */ ProcessUtility(pstmt, cmd, false, diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 546160f0941..0f03d9743d2 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -215,6 +215,7 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, wrapper->utilityStmt = stmt; wrapper->stmt_location = stmt_location; wrapper->stmt_len = stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; /* do this step */ ProcessUtility(wrapper, diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index e23b0de7242..cd6c3684482 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/htup_details.h" #include "access/table.h" #include "access/twophase.h" @@ -71,8 +72,9 @@ 
#define SUBOPT_PASSWORD_REQUIRED 0x00000800 #define SUBOPT_RUN_AS_OWNER 0x00001000 #define SUBOPT_FAILOVER 0x00002000 -#define SUBOPT_LSN 0x00004000 -#define SUBOPT_ORIGIN 0x00008000 +#define SUBOPT_RETAIN_DEAD_TUPLES 0x00004000 +#define SUBOPT_LSN 0x00008000 +#define SUBOPT_ORIGIN 0x00010000 /* check if the 'val' has 'bits' set */ #define IsSet(val, bits) (((val) & (bits)) == (bits)) @@ -98,6 +100,7 @@ typedef struct SubOpts bool passwordrequired; bool runasowner; bool failover; + bool retaindeadtuples; char *origin; XLogRecPtr lsn; } SubOpts; @@ -105,8 +108,10 @@ typedef struct SubOpts static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); static void check_publications_origin(WalReceiverConn *wrconn, List *publications, bool copydata, - char *origin, Oid *subrel_local_oids, - int subrel_count, char *subname); + bool retain_dead_tuples, char *origin, + Oid *subrel_local_oids, int subrel_count, + char *subname); +static void check_pub_dead_tuple_retention(WalReceiverConn *wrconn); static void check_duplicates_in_publist(List *publist, Datum *datums); static List *merge_publications(List *oldpublist, List *newpublist, bool addpub, const char *subname); static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname, char *err); @@ -162,6 +167,8 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->runasowner = false; if (IsSet(supported_opts, SUBOPT_FAILOVER)) opts->failover = false; + if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + opts->retaindeadtuples = false; if (IsSet(supported_opts, SUBOPT_ORIGIN)) opts->origin = pstrdup(LOGICALREP_ORIGIN_ANY); @@ -210,7 +217,7 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, if (strcmp(opts->slot_name, "none") == 0) opts->slot_name = NULL; else - ReplicationSlotValidateName(opts->slot_name, ERROR); + ReplicationSlotValidateName(opts->slot_name, false, ERROR); } else if (IsSet(supported_opts, SUBOPT_COPY_DATA) && strcmp(defel->defname, "copy_data") == 0) @@ -307,6 +314,15 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->specified_opts |= SUBOPT_FAILOVER; opts->failover = defGetBoolean(defel); } + else if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES) && + strcmp(defel->defname, "retain_dead_tuples") == 0) + { + if (IsSet(opts->specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_RETAIN_DEAD_TUPLES; + opts->retaindeadtuples = defGetBoolean(defel); + } else if (IsSet(supported_opts, SUBOPT_ORIGIN) && strcmp(defel->defname, "origin") == 0) { @@ -563,7 +579,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, SUBOPT_SYNCHRONOUS_COMMIT | SUBOPT_BINARY | SUBOPT_STREAMING | SUBOPT_TWOPHASE_COMMIT | SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | - SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_ORIGIN); + SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); /* @@ -630,6 +647,10 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, stmt->subname))); } + /* Ensure that we can enable retain_dead_tuples */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !opts.enabled, WARNING); + if (!IsSet(opts.specified_opts, SUBOPT_SLOT_NAME) && opts.slot_name == NULL) opts.slot_name = stmt->subname; @@ -670,6 +691,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, 
values[Anum_pg_subscription_subpasswordrequired - 1] = BoolGetDatum(opts.passwordrequired); values[Anum_pg_subscription_subrunasowner - 1] = BoolGetDatum(opts.runasowner); values[Anum_pg_subscription_subfailover - 1] = BoolGetDatum(opts.failover); + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); values[Anum_pg_subscription_subconninfo - 1] = CStringGetTextDatum(conninfo); if (opts.slot_name) @@ -722,7 +745,11 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, { check_publications(wrconn, publications); check_publications_origin(wrconn, publications, opts.copy_data, - opts.origin, NULL, 0, stmt->subname); + opts.retaindeadtuples, opts.origin, + NULL, 0, stmt->subname); + + if (opts.retaindeadtuples) + check_pub_dead_tuple_retention(wrconn); /* * Set sync state based on if we were asked to do data copy or @@ -881,8 +908,8 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, sizeof(Oid), oid_cmp); check_publications_origin(wrconn, sub->publications, copy_data, - sub->origin, subrel_local_oids, - subrel_count, sub->name); + sub->retaindeadtuples, sub->origin, + subrel_local_oids, subrel_count, sub->name); /* * Rels that we want to remove from subscription and drop any slots @@ -1040,18 +1067,22 @@ } /* - * Common checks for altering failover and two_phase options. + * Common checks for altering failover, two_phase, and retain_dead_tuples + * options. */ static void CheckAlterSubOption(Subscription *sub, const char *option, bool slot_needs_update, bool isTopLevel) { + Assert(strcmp(option, "failover") == 0 || + strcmp(option, "two_phase") == 0 || + strcmp(option, "retain_dead_tuples") == 0); + /* - * The checks in this function are required only for failover and - * two_phase options. + * Altering the retain_dead_tuples option does not update the slot on the + * publisher. */ - Assert(strcmp(option, "failover") == 0 || - strcmp(option, "two_phase") == 0); + Assert(!slot_needs_update || strcmp(option, "retain_dead_tuples") != 0); /* * Do not allow changing the option if the subscription is enabled. This @@ -1063,6 +1094,39 @@ CheckAlterSubOption(Subscription *sub, const char *option, * the publisher by the existing walsender, so we could have allowed that * even when the subscription is enabled. But we kept this restriction for * the sake of consistency and simplicity. + * + * Additionally, do not allow changing the retain_dead_tuples option when + * the subscription is enabled to prevent race conditions arising from the + * new option value being acknowledged asynchronously by the launcher and + * apply workers. + * + * Without the restriction, a race condition may arise when a user + * disables and immediately re-enables the retain_dead_tuples option. In + * this case, the launcher might drop the slot upon noticing the disabled + * action, while the apply worker may keep maintaining + * oldest_nonremovable_xid without noticing the option change. During this + * period, a transaction ID wraparound could falsely make this ID appear + * as if it originates from the future w.r.t. the transaction ID stored in + * the slot maintained by the launcher. + * + * Similarly, if the user enables retain_dead_tuples concurrently with the + * launcher starting the worker, the apply worker may start calculating + * oldest_nonremovable_xid before the launcher notices the enable action.
+ * Consequently, the launcher may update slot.xmin to a newer value than + * that maintained by the worker. In subsequent cycles, upon integrating + * the worker's oldest_nonremovable_xid, the launcher might detect a + * retreat in the calculated xmin, necessitating additional handling. + * + * XXX To address the above race conditions, we can define + * oldest_nonremovable_xid as FullTransactionID and add a check to + * disallow retreating the conflict slot's xmin. For now, we kept the + * implementation simple by disallowing changes to retain_dead_tuples, + * but in the future we can change this after some more analysis. + * + * Note that we could restrict only the enabling of retain_dead_tuples to + * avoid the race conditions described above, but we maintain the + * restriction for both enable and disable operations for the sake of + * consistency. */ if (sub->enabled) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -1110,6 +1174,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, bool update_tuple = false; bool update_failover = false; bool update_two_phase = false; + bool check_pub_rdt = false; + bool retain_dead_tuples; + char *origin; Subscription *sub; Form_pg_subscription form; bits32 supported_opts; @@ -1137,6 +1204,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, sub = GetSubscription(subid, false); + retain_dead_tuples = sub->retaindeadtuples; + origin = sub->origin; + /* * Don't allow non-superuser modification of a subscription with * password_required=false. @@ -1165,7 +1235,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | - SUBOPT_ORIGIN); + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); @@ -1325,11 +1395,62 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, replaces[Anum_pg_subscription_subfailover - 1] = true; } + if (IsSet(opts.specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + { + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); + replaces[Anum_pg_subscription_subretaindeadtuples - 1] = true; + + CheckAlterSubOption(sub, "retain_dead_tuples", false, isTopLevel); + + /* + * Workers may continue running even after the + * subscription has been disabled. + * + * To prevent race conditions (as described in + * CheckAlterSubOption()), ensure that all worker + * processes have already exited before proceeding. + */ + if (logicalrep_workers_find(subid, true, true)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot alter retain_dead_tuples when logical replication worker is still running"), + errhint("Try again after some time."))); + + /* + * Remind the user that enabling the subscription will prevent + * the accumulation of dead tuples. + */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !sub->enabled, NOTICE); + + /* + * Notify the launcher to manage the replication slot for + * conflict detection. This ensures that the replication slot + * is efficiently handled (created, updated, or dropped) + * in response to any configuration changes.
+ */ + ApplyLauncherWakeupAtCommit(); + + check_pub_rdt = opts.retaindeadtuples; + retain_dead_tuples = opts.retaindeadtuples; + } + if (IsSet(opts.specified_opts, SUBOPT_ORIGIN)) { values[Anum_pg_subscription_suborigin - 1] = CStringGetTextDatum(opts.origin); replaces[Anum_pg_subscription_suborigin - 1] = true; + + /* + * Check if changes from different origins may be received + * from the publisher when the origin is changed to ANY + * and retain_dead_tuples is enabled. + */ + check_pub_rdt = retain_dead_tuples && + pg_strcasecmp(opts.origin, LOGICALREP_ORIGIN_ANY) == 0; + + origin = opts.origin; } update_tuple = true; @@ -1347,6 +1468,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot enable subscription that does not have a slot name"))); + /* + * Check track_commit_timestamp only when enabling the + * subscription in case it was disabled after creation. See + * comments atop CheckSubDeadTupleRetention() for details. + */ + if (sub->retaindeadtuples) + CheckSubDeadTupleRetention(opts.enabled, !opts.enabled, + WARNING); + values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(opts.enabled); replaces[Anum_pg_subscription_subenabled - 1] = true; @@ -1355,6 +1485,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, ApplyLauncherWakeupAtCommit(); update_tuple = true; + + /* + * The subscription might be initially created with + * connect=false and retain_dead_tuples=true, meaning the + * remote server's status may not be checked. Ensure this + * check is conducted now. + */ + check_pub_rdt = sub->retaindeadtuples && opts.enabled; break; } @@ -1369,6 +1507,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, CStringGetTextDatum(stmt->conninfo); replaces[Anum_pg_subscription_subconninfo - 1] = true; update_tuple = true; + + /* + * Since the remote server configuration might have changed, + * perform a check to ensure it permits enabling + * retain_dead_tuples. + */ + check_pub_rdt = sub->retaindeadtuples; break; case ALTER_SUBSCRIPTION_SET_PUBLICATION: @@ -1568,14 +1713,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, } /* - * Try to acquire the connection necessary for altering the slot, if - * needed. + * Try to acquire the connection necessary either for modifying the slot + * or for checking if the remote server permits enabling + * retain_dead_tuples. * * This has to be at the end because otherwise if there is an error while * doing the database operations we won't be able to rollback altered * slot. */ - if (update_failover || update_two_phase) + if (update_failover || update_two_phase || check_pub_rdt) { bool must_use_password; char *err; @@ -1584,10 +1730,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, /* Load the library providing us libpq calls. */ load_file("libpqwalreceiver", false); - /* Try to connect to the publisher. */ + /* + * Try to connect to the publisher, using the new connection string if + * available. + */ must_use_password = sub->passwordrequired && !sub->ownersuperuser; - wrconn = walrcv_connect(sub->conninfo, true, true, must_use_password, - sub->name, &err); + wrconn = walrcv_connect(stmt->conninfo ? 
stmt->conninfo : sub->conninfo, + true, true, must_use_password, sub->name, + &err); if (!wrconn) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -1596,9 +1746,17 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, PG_TRY(); { - walrcv_alter_slot(wrconn, sub->slotname, - update_failover ? &opts.failover : NULL, - update_two_phase ? &opts.twophase : NULL); + if (retain_dead_tuples) + check_pub_dead_tuple_retention(wrconn); + + check_publications_origin(wrconn, sub->publications, false, + retain_dead_tuples, origin, NULL, 0, + sub->name); + + if (update_failover || update_two_phase) + walrcv_alter_slot(wrconn, sub->slotname, + update_failover ? &opts.failover : NULL, + update_two_phase ? &opts.twophase : NULL); } PG_FINALLY(); { @@ -2086,20 +2244,29 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) * Check and log a warning if the publisher has subscribed to the same table, * its partition ancestors (if it's a partition), or its partition children (if * it's a partitioned table), from some other publishers. This check is - * required only if "copy_data = true" and "origin = none" for CREATE - * SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements to notify the - * user that data having origin might have been copied. + * required in the following scenarios: * - * This check need not be performed on the tables that are already added - * because incremental sync for those tables will happen through WAL and the - * origin of the data can be identified from the WAL records. + * 1) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "copy_data = true" and "origin = none": + * - Warn the user that data with an origin might have been copied. + * - This check is skipped for tables already added, as incremental sync via + * WAL allows origin tracking. The list of such tables is in + * subrel_local_oids. * - * subrel_local_oids contains the list of relation oids that are already - * present on the subscriber. + * 2) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "retain_dead_tuples = true" and "origin = any", and for ALTER + * SUBSCRIPTION statements that modify retain_dead_tuples or origin, or + * when the publisher's status changes (e.g., due to a connection string + * update): + * - Warn the user that only conflict detection info for local changes on + * the publisher is retained. Data from other origins may lack sufficient + * details for reliable conflict detection. + * - See comments atop worker.c for more details. */ static void check_publications_origin(WalReceiverConn *wrconn, List *publications, - bool copydata, char *origin, Oid *subrel_local_oids, + bool copydata, bool retain_dead_tuples, + char *origin, Oid *subrel_local_oids, int subrel_count, char *subname) { WalRcvExecResult *res; @@ -2108,9 +2275,29 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, Oid tableRow[1] = {TEXTOID}; List *publist = NIL; int i; + bool check_rdt; + bool check_table_sync; + bool origin_none = origin && + pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0; + + /* + * Enable retain_dead_tuples checks only when origin is set to 'any', + * since with origin='none' only local changes are replicated to the + * subscriber. + */ + check_rdt = retain_dead_tuples && !origin_none; + + /* + * Enable table synchronization checks only when origin is 'none', to + * ensure that data from other origins is not inadvertently copied. 
+ */ + check_table_sync = copydata && origin_none; - if (!copydata || !origin || - (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) != 0)) + /* retain_dead_tuples and table sync checks occur separately */ + Assert(!(check_rdt && check_table_sync)); + + /* Return if no checks are required */ + if (!check_rdt && !check_table_sync) return; initStringInfo(&cmd); @@ -2129,16 +2316,23 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, /* * In case of ALTER SUBSCRIPTION ... REFRESH, subrel_local_oids contains * the list of relation oids that are already present on the subscriber. - * This check should be skipped for these tables. + * This check should be skipped for these tables if checking for table + * sync scenario. However, when handling the retain_dead_tuples scenario, + * ensure all tables are checked, as some existing tables may now include + * changes from other origins due to newly created subscriptions on the + * publisher. */ - for (i = 0; i < subrel_count; i++) + if (check_table_sync) { - Oid relid = subrel_local_oids[i]; - char *schemaname = get_namespace_name(get_rel_namespace(relid)); - char *tablename = get_rel_name(relid); + for (i = 0; i < subrel_count; i++) + { + Oid relid = subrel_local_oids[i]; + char *schemaname = get_namespace_name(get_rel_namespace(relid)); + char *tablename = get_rel_name(relid); - appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", - schemaname, tablename); + appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", + schemaname, tablename); + } } res = walrcv_exec(wrconn, cmd.data, 1, tableRow); @@ -2173,22 +2367,37 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, * XXX: For simplicity, we don't check whether the table has any data or * not. If the table doesn't have any data then we don't need to * distinguish between data having origin and data not having origin so we - * can avoid logging a warning in that case. + * can avoid logging a warning for table sync scenario. */ if (publist) { StringInfo pubnames = makeStringInfo(); + StringInfo err_msg = makeStringInfo(); + StringInfo err_hint = makeStringInfo(); /* Prepare the list of publication(s) for warning message. 
*/ GetPublicationsStr(publist, pubnames, false); + + if (check_table_sync) + { + appendStringInfo(err_msg, _("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin"), + subname); + appendStringInfoString(err_hint, _("Verify that initial data copied from the publisher tables did not come from other origins.")); + } + else + { + appendStringInfo(err_msg, _("subscription \"%s\" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins"), + subname); + appendStringInfoString(err_hint, _("Consider using origin = NONE or disabling retain_dead_tuples.")); + } + ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin", - subname), - errdetail_plural("The subscription being created subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", "The subscription being created subscribes to publications (%s) that contain tables that are written to by other subscriptions.", + errmsg_internal("%s", err_msg->data), + errdetail_plural("The subscription subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", "The subscription subscribes to publications (%s) that contain tables that are written to by other subscriptions.", list_length(publist), pubnames->data), - errhint("Verify that initial data copied from the publisher tables did not come from other origins.")); + errhint_internal("%s", err_hint->data)); } ExecDropSingleTupleTableSlot(slot); @@ -2197,6 +2406,101 @@ } /* + * Determine whether retain_dead_tuples can be enabled based on the + * publisher's status. + * + * This option is disallowed if the publisher is running a version earlier + * than PostgreSQL 19, or if the publisher is in recovery (i.e., it is a + * standby server). + * + * See comments atop worker.c for a detailed explanation. + */ +static void +check_pub_dead_tuple_retention(WalReceiverConn *wrconn) +{ + WalRcvExecResult *res; + Oid RecoveryRow[1] = {BOOLOID}; + TupleTableSlot *slot; + bool isnull; + bool remote_in_recovery; + + if (walrcv_server_version(wrconn) < 19000) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot enable retain_dead_tuples if the publisher is running a version earlier than PostgreSQL 19")); + + res = walrcv_exec(wrconn, "SELECT pg_is_in_recovery()", 1, RecoveryRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not obtain recovery progress from the publisher: %s", + res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + elog(ERROR, "failed to fetch tuple for the recovery progress"); + + remote_in_recovery = DatumGetBool(slot_getattr(slot, 1, &isnull)); + + if (remote_in_recovery) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery")); + + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); +} + +/* + * Check if the subscriber's configuration is adequate to enable the + * retain_dead_tuples option. + * + * Issue an ERROR if the wal_level does not support the use of replication + * slots when check_guc is set to true.
+ * + * Issue a WARNING if track_commit_timestamp is not enabled when check_guc is + * set to true. This is only to highlight the importance of enabling + * track_commit_timestamp instead of catching all the misconfigurations, as + * this setting can be adjusted after subscription creation. Without it, the + * apply worker will simply skip conflict detection. + * + * Issue a WARNING or NOTICE if the subscription is disabled. Do not raise an + * ERROR since users can only modify retain_dead_tuples for disabled + * subscriptions. And as long as the subscription is enabled promptly, it will + * not pose issues. + */ +void +CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled) +{ + Assert(elevel_for_sub_disabled == NOTICE || + elevel_for_sub_disabled == WARNING); + + if (check_guc && wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("\"wal_level\" is insufficient to create the replication slot required by retain_dead_tuples"), + errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")); + + if (check_guc && !track_commit_timestamp) + ereport(WARNING, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("commit timestamp and origin data required for detecting conflicts won't be retained"), + errhint("Consider setting \"%s\" to true.", + "track_commit_timestamp")); + + if (sub_disabled) + ereport(elevel_for_sub_disabled, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("deleted rows to detect conflicts would not be removed until the subscription is enabled"), + (elevel_for_sub_disabled > NOTICE) + ? errhint("Consider setting %s to false.", + "retain_dead_tuples") : 0); +} + +/* * Get the list of tables which belong to specified publications on the * publisher connection. * diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 67f8e70f9c1..7dc121f73f1 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -80,6 +80,7 @@ static bool GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp); @@ -2693,7 +2694,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,9 +2710,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; + /* + * Get a copy of the on-disk tuple we are planning to delete. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE DELETE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). 
+ */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - LockTupleExclusive, slot, &epqslot_candidate, - tmresult, tmfd)) + LockTupleExclusive, slot, !is_merge_delete, + &epqslot_candidate, tmresult, tmfd)) return false; /* @@ -2800,6 +2810,7 @@ ExecARDeleteTriggers(EState *estate, tupleid, LockTupleExclusive, slot, + false, NULL, NULL, NULL); @@ -2944,7 +2955,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2965,10 +2977,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; - /* get a copy of the on-disk tuple we are planning to update */ + /* + * Get a copy of the on-disk tuple we are planning to update. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE UPDATE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - lockmode, oldslot, &epqslot_candidate, - tmresult, tmfd)) + lockmode, oldslot, !is_merge_update, + &epqslot_candidate, tmresult, tmfd)) return false; /* cancel the update action */ /* @@ -3142,6 +3161,7 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, tupleid, LockTupleExclusive, oldslot, + false, NULL, NULL, NULL); @@ -3298,6 +3318,7 @@ GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp) @@ -3357,29 +3378,30 @@ GetTupleForTrigger(EState *estate, if (tmfd.traversed) { /* - * Recheck the tuple using EPQ. For MERGE, we leave this - * to the caller (it must do additional rechecking, and - * might end up executing a different action entirely). + * Recheck the tuple using EPQ, if requested. Otherwise, + * just return that it was concurrently updated. */ - if (estate->es_plannedstmt->commandType == CMD_MERGE) + if (do_epq_recheck) { - if (tmresultp) - *tmresultp = TM_Updated; - return false; + *epqslot = EvalPlanQual(epqstate, + relation, + relinfo->ri_RangeTableIndex, + oldslot); + + /* + * If PlanQual failed for updated tuple - we must not + * process this tuple! + */ + if (TupIsNull(*epqslot)) + { + *epqslot = NULL; + return false; + } } - - *epqslot = EvalPlanQual(epqstate, - relation, - relinfo->ri_RangeTableIndex, - oldslot); - - /* - * If PlanQual failed for updated tuple - we must not - * process this tuple! - */ - if (TupIsNull(*epqslot)) + else { - *epqslot = NULL; + if (tmresultp) + *tmresultp = TM_Updated; return false; } } diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 8a72b5e70a4..1a37737d4a2 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op) * JsonBehavior expression. 
*/ jsestate->escontext.error_occurred = false; - jsestate->escontext.error_occurred = false; jsestate->escontext.details_wanted = true; } } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index f3e77bda279..f098a5557cf 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -189,6 +189,7 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->permInfos = estate->es_rteperminfos; pstmt->resultRelations = NIL; pstmt->appendRelations = NIL; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42d..f262e7a66f7 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -670,7 +670,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + tid, NULL, slot, NULL, NULL, false)) skip_tuple = true; /* "do nothing" */ } @@ -746,7 +746,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + tid, NULL, NULL, NULL, NULL, false); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 54da8e7995b..7c6c2c1f6e4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1474,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRDeleteTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, - epqreturnslot, result, &context->tmfd); + epqreturnslot, result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -2117,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRUpdateTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, slot, - result, &context->tmfd); + result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile index e8c12060b93..68677ba42e1 100644 --- a/src/backend/jit/llvm/Makefile +++ b/src/backend/jit/llvm/Makefile @@ -31,7 +31,7 @@ endif # All files in this directory use LLVM. 
CFLAGS += $(LLVM_CFLAGS) CXXFLAGS += $(LLVM_CXXFLAGS) -override CPPFLAGS := $(LLVM_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(LLVM_CPPFLAGS) SHLIB_LINK += $(LLVM_LIBS) # Because this module includes C++ files, we need to use a C++ diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 9f4d05ffbd4..4da46666439 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -94,8 +94,16 @@ static int auth_peer(hbaPort *port); #define PGSQL_PAM_SERVICE "postgresql" /* Service name passed to PAM */ +/* Work around original Solaris' lack of "const" in the conv_proc signature */ +#ifdef _PAM_LEGACY_NONCONST +#define PG_PAM_CONST +#else +#define PG_PAM_CONST const +#endif + static int CheckPAMAuth(Port *port, const char *user, const char *password); -static int pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +static int pam_passwd_conv_proc(int num_msg, + PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr); static struct pam_conv pam_passw_conv = { @@ -1917,7 +1925,7 @@ auth_peer(hbaPort *port) */ static int -pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) { const char *passwd; diff --git a/src/backend/libpq/pg_ident.conf.sample b/src/backend/libpq/pg_ident.conf.sample index f5225f26cdf..8ee6c0ba315 100644 --- a/src/backend/libpq/pg_ident.conf.sample +++ b/src/backend/libpq/pg_ident.conf.sample @@ -13,25 +13,25 @@ # user names to their corresponding PostgreSQL user names. Records # are of the form: # -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME # # (The uppercase quantities must be replaced by actual values.) # # MAPNAME is the (otherwise freely chosen) map name that was used in # pg_hba.conf. SYSTEM-USERNAME is the detected user name of the -# client. PG-USERNAME is the requested PostgreSQL user name. The -# existence of a record specifies that SYSTEM-USERNAME may connect as -# PG-USERNAME. +# client. DATABASE-USERNAME is the requested PostgreSQL user name. +# The existence of a record specifies that SYSTEM-USERNAME may connect +# as DATABASE-USERNAME. # -# If SYSTEM-USERNAME starts with a slash (/), it will be treated as a -# regular expression. Optionally this can contain a capture (a -# parenthesized subexpression). The substring matching the capture -# will be substituted for \1 (backslash-one) if present in -# PG-USERNAME. +# If SYSTEM-USERNAME starts with a slash (/), the rest of it will be +# treated as a regular expression. Optionally this can contain a capture +# (a parenthesized subexpression). The substring matching the capture +# will be substituted for \1 (backslash-one) if that appears in +# DATABASE-USERNAME. # -# PG-USERNAME can be "all", a user name, a group name prefixed with "+", or -# a regular expression (if it starts with a slash (/)). If it is a regular -# expression, the substring matching with \1 has no effect. +# DATABASE-USERNAME can be "all", a user name, a group name prefixed with "+", +# or a regular expression (if it starts with a slash (/)). If it is a regular +# expression, no substitution for \1 will occur. # # Multiple maps may be specified in this file and used by pg_hba.conf. 
# @@ -69,4 +69,4 @@ # Put your actual configuration here # ---------------------------------- -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index e5171467de1..25f739a6a17 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -858,7 +858,6 @@ RemoveSocketFiles(void) (void) unlink(sock_path); } /* Since we're about to exit, no need to reclaim storage */ - sock_paths = NIL; } diff --git a/src/backend/libpq/pqmq.c b/src/backend/libpq/pqmq.c index f1a08bc32ca..5f39949a367 100644 --- a/src/backend/libpq/pqmq.c +++ b/src/backend/libpq/pqmq.c @@ -23,7 +23,7 @@ #include "tcop/tcopprot.h" #include "utils/builtins.h" -static shm_mq_handle *pq_mq_handle; +static shm_mq_handle *pq_mq_handle = NULL; static bool pq_mq_busy = false; static pid_t pq_mq_parallel_leader_pid = 0; static ProcNumber pq_mq_parallel_leader_proc_number = INVALID_PROC_NUMBER; @@ -66,7 +66,11 @@ pq_redirect_to_shm_mq(dsm_segment *seg, shm_mq_handle *mqh) static void pq_cleanup_redirect_to_shm_mq(dsm_segment *seg, Datum arg) { - pq_mq_handle = NULL; + if (pq_mq_handle != NULL) + { + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } whereToSendOutput = DestNone; } @@ -131,8 +135,11 @@ mq_putmessage(char msgtype, const char *s, size_t len) if (pq_mq_busy) { if (pq_mq_handle != NULL) + { shm_mq_detach(pq_mq_handle); - pq_mq_handle = NULL; + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } return EOF; } @@ -152,8 +159,6 @@ mq_putmessage(char msgtype, const char *s, size_t len) iov[1].data = s; iov[1].len = len; - Assert(pq_mq_handle != NULL); - for (;;) { /* @@ -161,6 +166,7 @@ mq_putmessage(char msgtype, const char *s, size_t len) * that the shared memory value is updated before we send the parallel * message signal right after this. */ + Assert(pq_mq_handle != NULL); result = shm_mq_sendv(pq_mq_handle, iov, 2, true, true); if (pq_mq_parallel_leader_pid != 0) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7d63cf94a6b..bdcb5e4f261 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -125,13 +125,17 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("postgres")); /* - * In the postmaster, absorb the environment values for LC_COLLATE and - * LC_CTYPE. Individual backends will change these later to settings - * taken from pg_database, but the postmaster cannot do that. If we leave - * these set to "C" then message localization might not work well in the - * postmaster. + * Collation is handled by pg_locale.c, and the behavior is dependent on + * the provider. strcoll(), etc., should not be called directly. + */ + init_locale("LC_COLLATE", LC_COLLATE, "C"); + + /* + * In the postmaster, absorb the environment value for LC_CTYPE. + * Individual backends will change it later to pg_database.datctype, but + * the postmaster cannot do that. If we leave it set to "C" then message + * localization might not work well in the postmaster. 
*/ - init_locale("LC_COLLATE", LC_COLLATE, ""); init_locale("LC_CTYPE", LC_CTYPE, ""); /* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1f04a2c182c..344a3188317 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2572,13 +2572,13 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, Cost input_startup_cost = mpath->subpath->startup_cost; Cost input_total_cost = mpath->subpath->total_cost; double tuples = mpath->subpath->rows; - double calls = mpath->calls; + Cardinality est_calls = mpath->est_calls; int width = mpath->subpath->pathtarget->width; double hash_mem_bytes; double est_entry_bytes; - double est_cache_entries; - double ndistinct; + Cardinality est_cache_entries; + Cardinality ndistinct; double evict_ratio; double hit_ratio; Cost startup_cost; @@ -2604,7 +2604,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, est_cache_entries = floor(hash_mem_bytes / est_entry_bytes); /* estimate on the distinct number of parameter values */ - ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL, + ndistinct = estimate_num_groups(root, mpath->param_exprs, est_calls, NULL, &estinfo); /* @@ -2616,7 +2616,10 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * certainly mean a MemoizePath will never survive add_path(). */ if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0) - ndistinct = calls; + ndistinct = est_calls; + + /* Remember the ndistinct estimate for EXPLAIN */ + mpath->est_unique_keys = ndistinct; /* * Since we've already estimated the maximum number of entries we can @@ -2644,9 +2647,12 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * must look at how many scans are estimated in total for this node and * how many of those scans we expect to get a cache hit. 
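 * For example (hypothetical numbers, not from the patch): with
 * est_calls = 1000, ndistinct = 100 and est_cache_entries = 50, the
 * formula below gives ((1000 - 100) / 1000) * (50 / Max(100, 50)) =
 * 0.9 * 0.5 = 0.45, i.e. an expected cache hit ratio of 45%.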
*/ - hit_ratio = ((calls - ndistinct) / calls) * + hit_ratio = ((est_calls - ndistinct) / est_calls) * (est_cache_entries / Max(ndistinct, est_cache_entries)); + /* Remember the hit ratio estimate for EXPLAIN */ + mpath->est_hit_ratio = hit_ratio; + Assert(hit_ratio >= 0 && hit_ratio <= 1.0); /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 8a9f1d7a943..bfefc7dbea1 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -284,7 +284,10 @@ static Material *make_material(Plan *lefttree); static Memoize *make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids); + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, + Cardinality est_unique_keys, + double est_hit_ratio); static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, @@ -1753,7 +1756,8 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags) plan = make_memoize(subplan, operators, collations, param_exprs, best_path->singlerow, best_path->binary_mode, - best_path->est_entries, keyparamids); + best_path->est_entries, keyparamids, best_path->est_calls, + best_path->est_unique_keys, best_path->est_hit_ratio); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6749,7 +6753,9 @@ materialize_finished_plan(Plan *subplan) static Memoize * make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids) + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, Cardinality est_unique_keys, + double est_hit_ratio) { Memoize *node = makeNode(Memoize); Plan *plan = &node->plan; @@ -6767,6 +6773,9 @@ make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, node->binary_mode = binary_mode; node->est_entries = est_entries; node->keyparamids = keyparamids; + node->est_calls = est_calls; + node->est_unique_keys = est_unique_keys; + node->est_hit_ratio = est_hit_ratio; return node; } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 01804b085b3..3e3fec89252 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -3048,36 +3048,16 @@ add_base_clause_to_rel(PlannerInfo *root, Index relid, * expr_is_nonnullable * Check to see if the Expr cannot be NULL * - * If the Expr is a simple Var that is defined NOT NULL and meanwhile is not - * nulled by any outer joins, then we can know that it cannot be NULL. + * Currently we only support simple Vars. */ static bool expr_is_nonnullable(PlannerInfo *root, Expr *expr) { - RelOptInfo *rel; - Var *var; - /* For now only check simple Vars */ if (!IsA(expr, Var)) return false; - var = (Var *) expr; - - /* could the Var be nulled by any outer joins? */ - if (!bms_is_empty(var->varnullingrels)) - return false; - - /* system columns cannot be NULL */ - if (var->varattno < 0) - return true; - - /* is the column defined NOT NULL? 
*/ - rel = find_base_rel(root, var->varno); - if (var->varattno > 0 && - bms_is_member(var->varattno, rel->notnullattnums)) - return true; - - return false; + return var_is_nonnullable(root, (Var *) expr, true); } /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 549aedcfa99..d59d6e4c6a0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -342,6 +342,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->transientPlan = false; glob->dependsOnRole = false; glob->partition_directory = NULL; + glob->rel_notnullatts_hash = NULL; /* * Assess whether it's feasible to use parallel mode for this query. We @@ -557,6 +558,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->commandType = parse->commandType; result->queryId = parse->queryId; + result->planOrigin = PLAN_STMT_STANDARD; result->hasReturning = (parse->returningList != NIL); result->hasModifyingCTE = parse->hasModifyingCTE; result->canSetTag = parse->canSetTag; @@ -721,6 +723,18 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, transform_MERGE_to_join(parse); /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. Note that this step does not descend into sublinks and + * subqueries; if we pull up any sublinks or subqueries below, their + * relation RTEs are processed just before pulling them up. + */ + parse = root->parse = preprocess_relation_rtes(root); + + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. */ @@ -744,14 +758,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, preprocess_function_rtes(root); /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. Recursion issues here are handled in the - * same way as for SubLinks. - */ - parse = root->parse = expand_virtual_generated_columns(root); - - /* * Check to see if any subqueries in the jointree can be merged into this * query. */ @@ -787,23 +793,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, switch (rte->rtekind) { - case RTE_RELATION: - if (rte->inh) - { - /* - * Check to see if the relation actually has any children; - * if not, clear the inh flag so we can treat it as a - * plain base relation. - * - * Note: this could give a false-positive result, if the - * rel once had children but no longer does. We used to - * be able to clear rte->inh later on when we discovered - * that, but no more; we have to handle such cases as - * full-fledged inheritance. 
- */ - rte->inh = has_subclass(rte->relid); - } - break; case RTE_JOIN: root->hasJoinRTEs = true; if (IS_OUTER_JOIN(rte->jointype)) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index e7cb3fede66..d71ed958e31 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1454,6 +1454,7 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, Query *parse = root->parse; Query *subselect = (Query *) sublink->subselect; Node *whereClause; + PlannerInfo subroot; int rtoffset; int varno; Relids clause_varnos; @@ -1516,6 +1517,35 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, return NULL; /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + * + * Note: we construct an entirely dummy PlannerInfo for use here. This + * is fine because only the "glob" and "parse" links will be used in this + * case. + * + * Note: we temporarily assign back the WHERE clause so that any virtual + * generated column references within it can be expanded. It should be + * separated out again afterward. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + subselect->jointree->quals = whereClause; + subselect = preprocess_relation_rtes(&subroot); + + /* + * Now separate out the WHERE clause again. + */ + whereClause = subselect->jointree->quals; + subselect->jointree->quals = NULL; + + /* * The subquery must have a nonempty jointree, but we can make it so. */ replace_empty_jointree(subselect); @@ -1732,6 +1762,7 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds) { Node *whereClause; + PlannerInfo subroot; List *leftargs, *rightargs, *opids, @@ -1791,12 +1822,15 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, * parent aliases were flattened already, and we're not going to pull any * child Vars (of any description) into the parent. * - * Note: passing the parent's root to eval_const_expressions is - * technically wrong, but we can get away with it since only the - * boundParams (if any) are used, and those would be the same in a - * subroot. - */ - whereClause = eval_const_expressions(root, whereClause); + * Note: we construct an entirely dummy PlannerInfo to pass to + * eval_const_expressions. This is fine because only the "glob" and + * "parse" links are used by eval_const_expressions. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + whereClause = eval_const_expressions(&subroot, whereClause); whereClause = (Node *) canonicalize_qual((Expr *) whereClause, false); whereClause = (Node *) make_ands_implicit((Expr *) whereClause); diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 87dc6f56b57..35e8d3c183b 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -4,10 +4,10 @@ * Planner preprocessing for subqueries and join tree manipulation.
* * NOTE: the intended sequence for invoking these operations is + * preprocess_relation_rtes * replace_empty_jointree * pull_up_sublinks * preprocess_function_rtes - * expand_virtual_generated_columns * pull_up_subqueries * flatten_simple_union_all * do expression preprocessing (including flattening JOIN alias vars) @@ -36,6 +36,7 @@ #include "optimizer/clauses.h" #include "optimizer/optimizer.h" #include "optimizer/placeholder.h" +#include "optimizer/plancat.h" #include "optimizer/prep.h" #include "optimizer/subselect.h" #include "optimizer/tlist.h" @@ -102,6 +103,9 @@ typedef struct reduce_outer_joins_partial_state Relids unreduced_side; /* relids in its still-nullable side */ } reduce_outer_joins_partial_state; +static Query *expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation); static Node *pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, Relids *relids); static Node *pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, @@ -393,6 +397,181 @@ transform_MERGE_to_join(Query *parse) } /* + * preprocess_relation_rtes + * Do the preprocessing work for any relation RTEs in the FROM clause. + * + * This scans the rangetable for relation RTEs and retrieves the necessary + * catalog information for each relation. Using this information, it clears + * the inh flag for any relation that has no children, collects not-null + * attribute numbers for any relation that has column not-null constraints, and + * expands virtual generated columns for any relation that contains them. + * + * Note that expanding virtual generated columns may cause the query tree to + * have new copies of rangetable entries. Therefore, we have to use list_nth + * instead of foreach when iterating over the query's rangetable. + * + * Returns a modified copy of the query tree, if any relations with virtual + * generated columns are present. + */ +Query * +preprocess_relation_rtes(PlannerInfo *root) +{ + Query *parse = root->parse; + int rtable_size; + int rt_index; + + rtable_size = list_length(parse->rtable); + + for (rt_index = 0; rt_index < rtable_size; rt_index++) + { + RangeTblEntry *rte = rt_fetch(rt_index + 1, parse->rtable); + Relation relation; + + /* We only care about relation RTEs. */ + if (rte->rtekind != RTE_RELATION) + continue; + + /* + * We need not lock the relation since it was already locked by the + * rewriter. + */ + relation = table_open(rte->relid, NoLock); + + /* + * Check to see if the relation actually has any children; if not, + * clear the inh flag so we can treat it as a plain base relation. + * + * Note: this could give a false-positive result, if the rel once had + * children but no longer does. We used to be able to clear rte->inh + * later on when we discovered that, but no more; we have to handle + * such cases as full-fledged inheritance. + */ + if (rte->inh) + rte->inh = relation->rd_rel->relhassubclass; + + /* + * Check to see if the relation has any column not-null constraints; + * if so, retrieve the constraint information and store it in a + * relation OID based hash table. + */ + get_relation_notnullatts(root, relation); + + /* + * Check to see if the relation has any virtual generated columns; if + * so, replace all Var nodes in the query that reference these columns + * with the generation expressions. 
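+ *
+ * For example (hypothetical table, for illustration only): given
+ *   CREATE TABLE t (a int, b int GENERATED ALWAYS AS (a * 2) VIRTUAL);
+ * any Var referencing t.b in the query is replaced here with the
+ * generation expression (t.a * 2).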
+ */ + parse = expand_virtual_generated_columns(root, parse, + rte, rt_index + 1, + relation); + + table_close(relation, NoLock); + } + + return parse; +} + +/* + * expand_virtual_generated_columns + * Expand virtual generated columns for the given relation. + * + * This checks whether the given relation has any virtual generated columns, + * and if so, replaces all Var nodes in the query that reference those columns + * with their generation expressions. + * + * Returns a modified copy of the query tree if the relation contains virtual + * generated columns. + */ +static Query * +expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation) +{ + TupleDesc tupdesc; + + /* Only normal relations can have virtual generated columns */ + Assert(rte->rtekind == RTE_RELATION); + + tupdesc = RelationGetDescr(relation); + if (tupdesc->constr && tupdesc->constr->has_generated_virtual) + { + List *tlist = NIL; + pullup_replace_vars_context rvcontext; + + for (int i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + TargetEntry *tle; + + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + Node *defexpr; + + defexpr = build_generation_expression(relation, i + 1); + ChangeVarNodes(defexpr, 1, rt_index, 0); + + tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + else + { + Var *var; + + var = makeVar(rt_index, + i + 1, + attr->atttypid, + attr->atttypmod, + attr->attcollation, + 0); + + tle = makeTargetEntry((Expr *) var, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + } + + Assert(list_length(tlist) > 0); + Assert(!rte->lateral); + + /* + * The relation's targetlist items are now in the appropriate form to + * insert into the query, except that we may need to wrap them in + * PlaceHolderVars. Set up required context data for + * pullup_replace_vars. + */ + rvcontext.root = root; + rvcontext.targetlist = tlist; + rvcontext.target_rte = rte; + rvcontext.result_relation = parse->resultRelation; + /* won't need these values */ + rvcontext.relids = NULL; + rvcontext.nullinfo = NULL; + /* pass NULL for outer_hasSubLinks */ + rvcontext.outer_hasSubLinks = NULL; + rvcontext.varno = rt_index; + /* this flag will be set below, if needed */ + rvcontext.wrap_option = REPLACE_WRAP_NONE; + /* initialize cache array with indexes 0 .. length(tlist) */ + rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * + sizeof(Node *)); + + /* + * If the query uses grouping sets, we need a PlaceHolderVar for each + * expression of the relation's targetlist items. (See comments in + * pull_up_simple_subquery().) + */ + if (parse->groupingSets) + rvcontext.wrap_option = REPLACE_WRAP_ALL; + + /* + * Apply pullup variable replacement throughout the query tree. + */ + parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); + } + + return parse; +} + +/* * replace_empty_jointree * If the Query's jointree is empty, replace it with a dummy RTE_RESULT * relation. @@ -950,128 +1129,6 @@ preprocess_function_rtes(PlannerInfo *root) } /* - * expand_virtual_generated_columns - * Expand all virtual generated column references in a query. - * - * This scans the rangetable for relations with virtual generated columns, and - * replaces all Var nodes in the query that reference these columns with the - * generation expressions. Note that we do not descend into subqueries; that - * is taken care of when the subqueries are planned. 
- * - * This has to be done after we have pulled up any SubLinks within the query's - * quals; otherwise any virtual generated column references within the SubLinks - * that should be transformed into joins wouldn't get expanded. - * - * Returns a modified copy of the query tree, if any relations with virtual - * generated columns are present. - */ -Query * -expand_virtual_generated_columns(PlannerInfo *root) -{ - Query *parse = root->parse; - int rt_index; - ListCell *lc; - - rt_index = 0; - foreach(lc, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - Relation rel; - TupleDesc tupdesc; - - ++rt_index; - - /* - * Only normal relations can have virtual generated columns. - */ - if (rte->rtekind != RTE_RELATION) - continue; - - rel = table_open(rte->relid, NoLock); - - tupdesc = RelationGetDescr(rel); - if (tupdesc->constr && tupdesc->constr->has_generated_virtual) - { - List *tlist = NIL; - pullup_replace_vars_context rvcontext; - - for (int i = 0; i < tupdesc->natts; i++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, i); - TargetEntry *tle; - - if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - { - Node *defexpr; - - defexpr = build_generation_expression(rel, i + 1); - ChangeVarNodes(defexpr, 1, rt_index, 0); - - tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - else - { - Var *var; - - var = makeVar(rt_index, - i + 1, - attr->atttypid, - attr->atttypmod, - attr->attcollation, - 0); - - tle = makeTargetEntry((Expr *) var, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - } - - Assert(list_length(tlist) > 0); - Assert(!rte->lateral); - - /* - * The relation's targetlist items are now in the appropriate form - * to insert into the query, except that we may need to wrap them - * in PlaceHolderVars. Set up required context data for - * pullup_replace_vars. - */ - rvcontext.root = root; - rvcontext.targetlist = tlist; - rvcontext.target_rte = rte; - rvcontext.result_relation = parse->resultRelation; - /* won't need these values */ - rvcontext.relids = NULL; - rvcontext.nullinfo = NULL; - /* pass NULL for outer_hasSubLinks */ - rvcontext.outer_hasSubLinks = NULL; - rvcontext.varno = rt_index; - /* this flag will be set below, if needed */ - rvcontext.wrap_option = REPLACE_WRAP_NONE; - /* initialize cache array with indexes 0 .. length(tlist) */ - rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * - sizeof(Node *)); - - /* - * If the query uses grouping sets, we need a PlaceHolderVar for - * each expression of the relation's targetlist items. (See - * comments in pull_up_simple_subquery().) - */ - if (parse->groupingSets) - rvcontext.wrap_option = REPLACE_WRAP_ALL; - - /* - * Apply pullup variable replacement throughout the query tree. - */ - parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); - } - - table_close(rel, NoLock); - } - - return parse; -} - -/* * pull_up_subqueries * Look for subqueries in the rangetable that can be pulled up into * the parent query. If the subquery has no special features like @@ -1334,6 +1391,16 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, Assert(subquery->cteList == NIL); /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. 
Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + */ + subquery = subroot->parse = preprocess_relation_rtes(subroot); + + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. */ @@ -1353,13 +1420,6 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, preprocess_function_rtes(subroot); /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. - */ - subquery = subroot->parse = expand_virtual_generated_columns(subroot); - - /* * Recursively pull up the subquery's subqueries, so that * pull_up_subqueries' processing is complete for its jointree and * rangetable. diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index f45131c34c5..6f0b338d2cd 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/pg_class.h" #include "catalog/pg_language.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -36,6 +37,7 @@ #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "parser/analyze.h" @@ -43,6 +45,7 @@ #include "parser/parse_collate.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "parser/parsetree.h" #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "tcop/tcopprot.h" @@ -2242,7 +2245,8 @@ rowtype_field_matches(Oid rowtypeid, int fieldnum, * only operators and functions that are reasonable to try to execute. * * NOTE: "root" can be passed as NULL if the caller never wants to do any - * Param substitutions nor receive info about inlined functions. + * Param substitutions nor receive info about inlined functions nor reduce + * NullTest for Vars to constant true or constant false. * * NOTE: the planner assumes that this will always flatten nested AND and * OR clauses into N-argument form. See comments in prepqual.c. @@ -3544,6 +3548,31 @@ eval_const_expressions_mutator(Node *node, return makeBoolConst(result, false); } + if (!ntest->argisrow && arg && IsA(arg, Var) && context->root) + { + Var *varg = (Var *) arg; + bool result; + + if (var_is_nonnullable(context->root, varg, false)) + { + switch (ntest->nulltesttype) + { + case IS_NULL: + result = false; + break; + case IS_NOT_NULL: + result = true; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + result = false; /* keep compiler quiet */ + break; + } + + return makeBoolConst(result, false); + } + } newntest = makeNode(NullTest); newntest->arg = (Expr *) arg; @@ -4163,6 +4192,67 @@ simplify_function(Oid funcid, Oid result_type, int32 result_typmod, } /* + * var_is_nonnullable: check to see if the Var cannot be NULL + * + * If the Var is defined NOT NULL and meanwhile is not nulled by any outer + * joins or grouping sets, then we can know that it cannot be NULL. + * + * use_rel_info indicates whether the corresponding RelOptInfo is available for + * use. 
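+ *
+ * For example (hypothetical schema, not from the patch): if t.a is
+ * declared NOT NULL and t is not on the nullable side of any outer join,
+ * eval_const_expressions() can use this function to reduce the qual
+ * "t.a IS NOT NULL" to constant true and "t.a IS NULL" to constant
+ * false.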
+ */ +bool +var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info) +{ + Relids notnullattnums = NULL; + + Assert(IsA(var, Var)); + + /* skip upper-level Vars */ + if (var->varlevelsup != 0) + return false; + + /* could the Var be nulled by any outer joins or grouping sets? */ + if (!bms_is_empty(var->varnullingrels)) + return false; + + /* system columns cannot be NULL */ + if (var->varattno < 0) + return true; + + /* + * Check if the Var is defined as NOT NULL. We retrieve the column NOT + * NULL constraint information from the corresponding RelOptInfo if it is + * available; otherwise, we search the hash table for this information. + */ + if (use_rel_info) + { + RelOptInfo *rel = find_base_rel(root, var->varno); + + notnullattnums = rel->notnullattnums; + } + else + { + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + + /* + * We must skip inheritance parent tables, as some child tables may + * have a NOT NULL constraint for a column while others may not. This + * cannot happen with partitioned tables, though. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + notnullattnums = find_relation_notnullatts(root, rte->relid); + } + + if (var->varattno > 0 && + bms_is_member(var->varattno, notnullattnums)) + return true; + + return false; +} + +/* * expand_function_arguments: convert named-notation args to positional args * and/or insert default args, as needed * diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index 17e51cd75d7..30d158069e3 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -466,8 +466,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index *childRTindex_p) { Query *parse = root->parse; - Oid parentOID PG_USED_FOR_ASSERTS_ONLY = - RelationGetRelid(parentrel); + Oid parentOID = RelationGetRelid(parentrel); Oid childOID = RelationGetRelid(childrel); RangeTblEntry *childrte; Index childRTindex; @@ -514,6 +513,13 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, *childRTindex_p = childRTindex; /* + * Retrieve column not-null constraint information for the child relation + * if its relation OID is different from the parent's. + */ + if (childOID != parentOID) + get_relation_notnullatts(root, childrel); + + /* * Build an AppendRelInfo struct for each parent/child pair. */ appinfo = make_append_rel_info(parentrel, childrel, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9cc602788ea..a4c5867cdcb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1689,7 +1689,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) MemoizePath * create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, List *param_exprs, List *hash_operators, - bool singlerow, bool binary_mode, double calls) + bool singlerow, bool binary_mode, Cardinality est_calls) { MemoizePath *pathnode = makeNode(MemoizePath); @@ -1710,7 +1710,6 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->param_exprs = param_exprs; pathnode->singlerow = singlerow; pathnode->binary_mode = binary_mode; - pathnode->calls = clamp_row_est(calls); /* * For now we set est_entries to 0. 
cost_memoize_rescan() does all the @@ -1720,6 +1719,12 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ pathnode->est_entries = 0; + pathnode->est_calls = clamp_row_est(est_calls); + + /* These will also be set later in cost_memoize_rescan() */ + pathnode->est_unique_keys = 0.0; + pathnode->est_hit_ratio = 0.0; + /* we should not generate this path type when enable_memoize=false */ Assert(enable_memoize); pathnode->path.disabled_nodes = subpath->disabled_nodes; @@ -4259,7 +4264,7 @@ reparameterize_path(PlannerInfo *root, Path *path, mpath->hash_operators, mpath->singlerow, mpath->binary_mode, - mpath->calls); + mpath->est_calls); } default: break; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 59233b64730..c6a58afc5e5 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -59,6 +59,12 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +typedef struct NotnullHashEntry +{ + Oid relid; /* OID of the relation */ + Relids notnullattnums; /* attnums of NOT NULL columns */ +} NotnullHashEntry; + static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, Relation relation, bool inhparent); @@ -172,27 +178,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * RangeTblEntry does get populated. */ if (!inhparent || relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - for (int i = 0; i < relation->rd_att->natts; i++) - { - CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); - - Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); - - if (attr->attnullability == ATTNULLABLE_VALID) - { - rel->notnullattnums = bms_add_member(rel->notnullattnums, - i + 1); - - /* - * Per RemoveAttributeById(), dropped columns will have their - * attnotnull unset, so we needn't check for dropped columns - * in the above condition. - */ - Assert(!attr->attisdropped); - } - } - } + rel->notnullattnums = find_relation_notnullatts(root, relationObjectId); /* * Estimate relation size --- unless it's an inheritance parent, in which @@ -684,6 +670,105 @@ get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, } /* + * get_relation_notnullatts - + * Retrieves column not-null constraint information for a given relation. + * + * We do this while we have the relcache entry open, and store the column + * not-null constraint information in a hash table based on the relation OID. + */ +void +get_relation_notnullatts(PlannerInfo *root, Relation relation) +{ + Oid relid = RelationGetRelid(relation); + NotnullHashEntry *hentry; + bool found; + Relids notnullattnums = NULL; + + /* bail out if the relation has no not-null constraints */ + if (relation->rd_att->constr == NULL || + !relation->rd_att->constr->has_not_null) + return; + + /* create the hash table if it hasn't been created yet */ + if (root->glob->rel_notnullatts_hash == NULL) + { + HTAB *hashtab; + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(NotnullHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + hashtab = hash_create("Relation NOT NULL attnums", + 64L, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + root->glob->rel_notnullatts_hash = hashtab; + } + + /* + * Create a hash entry for this relation OID, if we don't have one + * already. 
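+ *
+ * (Illustration, not from the patch: after this function has run for a
+ * relation whose second and fourth columns have valid not-null
+ * constraints, find_relation_notnullatts() for that relation's OID will
+ * return a Bitmapset containing attnums 2 and 4.)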
+ */ + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_ENTER, + &found); + + /* bail out if a hash entry already exists for this relation OID */ + if (found) + return; + + /* collect the column not-null constraint information for this relation */ + for (int i = 0; i < relation->rd_att->natts; i++) + { + CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); + + Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); + + if (attr->attnullability == ATTNULLABLE_VALID) + { + notnullattnums = bms_add_member(notnullattnums, i + 1); + + /* + * Per RemoveAttributeById(), dropped columns will have their + * attnotnull unset, so we needn't check for dropped columns in + * the above condition. + */ + Assert(!attr->attisdropped); + } + } + + /* ... and initialize the new hash entry */ + hentry->notnullattnums = notnullattnums; +} + +/* + * find_relation_notnullatts - + * Searches the hash table and returns the column not-null constraint + * information for a given relation. + */ +Relids +find_relation_notnullatts(PlannerInfo *root, Oid relid) +{ + NotnullHashEntry *hentry; + bool found; + + if (root->glob->rel_notnullatts_hash == NULL) + return NULL; + + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_FIND, + &found); + if (!found) + return NULL; + + return hentry->notnullattnums; +} + +/* * infer_arbiter_indexes - * Determine the unique indexes used to arbitrate speculative insertion. * diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 73345bb3c70..db43034b9db 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -318,6 +318,11 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <list> opt_qualified_name %type <boolean> opt_concurrently %type <dbehavior> opt_drop_behavior +%type <list> opt_utility_option_list +%type <list> utility_option_list +%type <defelt> utility_option_elem +%type <str> utility_option_name +%type <node> utility_option_arg %type <node> alter_column_default opclass_item opclass_drop alter_using %type <ival> add_drop opt_asc_desc opt_nulls_order @@ -338,10 +343,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); create_extension_opt_item alter_extension_opt_item %type <ival> opt_lock lock_type cast_context -%type <str> utility_option_name -%type <defelt> utility_option_elem -%type <list> utility_option_list -%type <node> utility_option_arg %type <defelt> drop_option %type <boolean> opt_or_replace opt_no opt_grant_grant_option @@ -556,7 +557,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <list> generic_option_list alter_generic_option_list %type <ival> reindex_target_relation reindex_target_all -%type <list> opt_reindex_option_list %type <node> copy_generic_opt_arg copy_generic_opt_arg_list_item %type <defelt> copy_generic_opt_elem @@ -1141,6 +1141,41 @@ opt_drop_behavior: | /* EMPTY */ { $$ = DROP_RESTRICT; /* default */ } ; +opt_utility_option_list: + '(' utility_option_list ')' { $$ = $2; } + | /* EMPTY */ { $$ = NULL; } + ; + +utility_option_list: + utility_option_elem + { + $$ = list_make1($1); + } + | utility_option_list ',' utility_option_elem + { + $$ = lappend($1, $3); + } + ; + +utility_option_elem: + utility_option_name utility_option_arg + { + $$ = makeDefElem($1, $2, @1); + } + ; + +utility_option_name: + NonReservedWord { $$ = $1; } + | analyze_keyword { $$ = "analyze"; } + | FORMAT_LA { $$ = "format"; } + ; + 
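+/*
+ * Illustration (not part of the patch text): this shared option-list
+ * syntax is what already accepts forms such as REINDEX (VERBOSE) TABLE
+ * tab, CLUSTER (VERBOSE) tab, and ANALYZE (VERBOSE) tab, and it now also
+ * allows an optional parenthesized option list after CHECKPOINT.
+ */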
+utility_option_arg: + opt_boolean_or_string { $$ = (Node *) makeString($1); } + | NumericOnly { $$ = (Node *) $1; } + | /* EMPTY */ { $$ = NULL; } + ; + /***************************************************************************** * * CALL statement @@ -2028,18 +2063,12 @@ constraints_set_mode: * Checkpoint statement */ CheckPointStmt: - CHECKPOINT + CHECKPOINT opt_utility_option_list { CheckPointStmt *n = makeNode(CheckPointStmt); $$ = (Node *) n; - } - | CHECKPOINT '(' utility_option_list ')' - { - CheckPointStmt *n = makeNode(CheckPointStmt); - - $$ = (Node *) n; - n->options = $3; + n->options = $2; } ; @@ -9354,7 +9383,7 @@ DropTransformStmt: DROP TRANSFORM opt_if_exists FOR Typename LANGUAGE name opt_d *****************************************************************************/ ReindexStmt: - REINDEX opt_reindex_option_list reindex_target_relation opt_concurrently qualified_name + REINDEX opt_utility_option_list reindex_target_relation opt_concurrently qualified_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9367,7 +9396,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list SCHEMA opt_concurrently name + | REINDEX opt_utility_option_list SCHEMA opt_concurrently name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9380,7 +9409,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list reindex_target_all opt_concurrently opt_single_name + | REINDEX opt_utility_option_list reindex_target_all opt_concurrently opt_single_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9402,10 +9431,6 @@ reindex_target_all: SYSTEM_P { $$ = REINDEX_OBJECT_SYSTEM; } | DATABASE { $$ = REINDEX_OBJECT_DATABASE; } ; -opt_reindex_option_list: - '(' utility_option_list ')' { $$ = $2; } - | /* EMPTY */ { $$ = NULL; } - ; /***************************************************************************** * @@ -11903,13 +11928,13 @@ ClusterStmt: n->params = $3; $$ = (Node *) n; } - | CLUSTER '(' utility_option_list ')' + | CLUSTER opt_utility_option_list { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = $3; + n->params = $2; $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-14 compatibility */ @@ -11919,21 +11944,18 @@ ClusterStmt: n->relation = $3; n->indexname = $4; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-17 compatibility */ - | CLUSTER opt_verbose + | CLUSTER VERBOSE { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = NIL; - if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* kept for pre-8.3 compatibility */ @@ -11943,9 +11965,8 @@ ClusterStmt: n->relation = $5; n->indexname = $3; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } ; @@ -11996,64 +12017,31 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose opt_analyze opt_vacuum_relati } ; -AnalyzeStmt: analyze_keyword opt_verbose opt_vacuum_relation_list +AnalyzeStmt: analyze_keyword opt_utility_option_list opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = NIL; - if ($2) - n->options = lappend(n->options, - 
makeDefElem("verbose", NULL, @2)); + n->options = $2; n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } - | analyze_keyword '(' utility_option_list ')' opt_vacuum_relation_list + | analyze_keyword VERBOSE opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = $3; - n->rels = $5; + n->options = list_make1(makeDefElem("verbose", NULL, @2)); + n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } ; -utility_option_list: - utility_option_elem - { - $$ = list_make1($1); - } - | utility_option_list ',' utility_option_elem - { - $$ = lappend($1, $3); - } - ; - analyze_keyword: ANALYZE | ANALYSE /* British */ ; -utility_option_elem: - utility_option_name utility_option_arg - { - $$ = makeDefElem($1, $2, @1); - } - ; - -utility_option_name: - NonReservedWord { $$ = $1; } - | analyze_keyword { $$ = "analyze"; } - | FORMAT_LA { $$ = "format"; } - ; - -utility_option_arg: - opt_boolean_or_string { $$ = (Node *) makeString($1); } - | NumericOnly { $$ = (Node *) $1; } - | /* EMPTY */ { $$ = NULL; } - ; - opt_analyze: analyze_keyword { $$ = true; } | /*EMPTY*/ { $$ = false; } diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 4bdc2941efb..822cf4ec451 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -1007,9 +1007,6 @@ partition_bounds_copy(PartitionBoundInfo src, int ndatums; int nindexes; int partnatts; - bool hash_part; - int natts; - Datum *boundDatums; dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); @@ -1023,7 +1020,7 @@ partition_bounds_copy(PartitionBoundInfo src, dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); - if (src->kind != NULL) + if (src->kind != NULL && ndatums > 0) { PartitionRangeDatumKind *boundKinds; @@ -1058,36 +1055,40 @@ partition_bounds_copy(PartitionBoundInfo src, * For hash partitioning, datums array will have two elements - modulus * and remainder. */ - hash_part = (key->strategy == PARTITION_STRATEGY_HASH); - natts = hash_part ? 2 : partnatts; - boundDatums = palloc(ndatums * natts * sizeof(Datum)); - - for (i = 0; i < ndatums; i++) + if (ndatums > 0) { - int j; - - dest->datums[i] = &boundDatums[i * natts]; + bool hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + int natts = hash_part ? 
2 : partnatts; + Datum *boundDatums = palloc(ndatums * natts * sizeof(Datum)); - for (j = 0; j < natts; j++) + for (i = 0; i < ndatums; i++) { - bool byval; - int typlen; + int j; - if (hash_part) - { - typlen = sizeof(int32); /* Always int4 */ - byval = true; /* int4 is pass-by-value */ - } - else + dest->datums[i] = &boundDatums[i * natts]; + + for (j = 0; j < natts; j++) { - byval = key->parttypbyval[j]; - typlen = key->parttyplen[j]; - } + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + { + bool byval; + int typlen; - if (dest->kind == NULL || - dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) - dest->datums[i][j] = datumCopy(src->datums[i][j], - byval, typlen); + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + } + } } } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 9474095f271..ff96b36d710 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -310,6 +310,16 @@ static AutoVacuumShmemStruct *AutoVacuumShmem; static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); static MemoryContext DatabaseListCxt = NULL; +/* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * avl_dbase structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern avl_dbase *avl_dbase_array; +avl_dbase *avl_dbase_array; +#endif + /* Pointer to my own WorkerInfo, valid on each worker */ static WorkerInfo MyWorkerInfo = NULL; @@ -562,10 +572,10 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len) /* * Create the initial database list. The invariant we want this list to - * keep is that it's ordered by decreasing next_time. As soon as an entry - * is updated to a higher time, it will be moved to the front (which is - * correct because the only operation is to add autovacuum_naptime to the - * entry, and time always increases). + * keep is that it's ordered by decreasing next_worker. As soon as an + * entry is updated to a higher time, it will be moved to the front (which + * is correct because the only operation is to add autovacuum_naptime to + * the entry, and time always increases). */ rebuild_database_list(InvalidOid); @@ -1020,6 +1030,10 @@ rebuild_database_list(Oid newdb) /* put all the hash elements into an array */ dbary = palloc(nelems * sizeof(avl_dbase)); + /* keep Valgrind quiet */ +#ifdef USE_VALGRIND + avl_dbase_array = dbary; +#endif i = 0; hash_seq_init(&seq, dbhash); @@ -2565,8 +2579,18 @@ deleted: /* * We leak table_toast_map here (among other things), but since we're - * going away soon, it's not a problem. + * going away soon, it's not a problem normally. But when using Valgrind, + * release some stuff to reduce complaints about leaked storage. */ +#ifdef USE_VALGRIND + hash_destroy(table_toast_map); + FreeTupleDesc(pg_class_desc); + if (bstrategy) + pfree(bstrategy); +#endif + + /* Run the rest in xact context, mainly to avoid Valgrind leak warnings */ + MemoryContextSwitchTo(TopTransactionContext); /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. 
We diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 116ddf7b835..1ad65c237c3 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -613,6 +613,7 @@ ResetBackgroundWorkerCrashTimes(void) * resetting. */ rw->rw_crashed_at = 0; + rw->rw_pid = 0; /* * If there was anyone waiting for it, they're history. diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 2809e298a44..8490148a47d 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -130,6 +130,13 @@ typedef struct int num_requests; /* current # of requests */ int max_requests; /* allocated array size */ + + int head; /* Index of the first request in the ring + * buffer */ + int tail; /* Index of the last request in the ring + * buffer */ + + /* The ring buffer of pending checkpointer requests */ CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; } CheckpointerShmemStruct; @@ -138,6 +145,12 @@ static CheckpointerShmemStruct *CheckpointerShmem; /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ #define WRITES_PER_ABSORB 1000 +/* Maximum number of checkpointer requests to process in one batch */ +#define CKPT_REQ_BATCH_SIZE 10000 + +/* Max number of requests the checkpointer request queue can hold */ +#define MAX_CHECKPOINT_REQUESTS 10000000 + /* * GUC parameters */ @@ -973,7 +986,8 @@ CheckpointerShmemInit(void) */ MemSet(CheckpointerShmem, 0, size); SpinLockInit(&CheckpointerShmem->ckpt_lck); - CheckpointerShmem->max_requests = NBuffers; + CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS); + CheckpointerShmem->head = CheckpointerShmem->tail = 0; ConditionVariableInit(&CheckpointerShmem->start_cv); ConditionVariableInit(&CheckpointerShmem->done_cv); } @@ -1201,6 +1215,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) { CheckpointerRequest *request; bool too_full; + int insert_pos; if (!IsUnderPostmaster) return false; /* probably shouldn't even get here */ @@ -1224,10 +1239,14 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) } /* OK, insert request */ - request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + insert_pos = CheckpointerShmem->tail; + request = &CheckpointerShmem->requests[insert_pos]; request->ftag = *ftag; request->type = type; + CheckpointerShmem->tail = (CheckpointerShmem->tail + 1) % CheckpointerShmem->max_requests; + CheckpointerShmem->num_requests++; + /* If queue is more than half full, nudge the checkpointer to empty it */ too_full = (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests / 2); @@ -1269,12 +1288,16 @@ CompactCheckpointerRequestQueue(void) struct CheckpointerSlotMapping { CheckpointerRequest request; - int slot; + int ring_idx; }; - int n, - preserve_count; + int n; int num_skipped = 0; + int head; + int max_requests; + int num_requests; + int read_idx, + write_idx; HASHCTL ctl; HTAB *htab; bool *skip_slot; @@ -1286,8 +1309,13 @@ CompactCheckpointerRequestQueue(void) if (CritSectionCount > 0) return false; + max_requests = CheckpointerShmem->max_requests; + num_requests = CheckpointerShmem->num_requests; + /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + skip_slot = palloc0(sizeof(bool) * max_requests); + + head = CheckpointerShmem->head; /* Initialize temporary hash table */ ctl.keysize = sizeof(CheckpointerRequest); @@ -1311,7 +1339,8 @@ CompactCheckpointerRequestQueue(void) * 
away preceding entries that would end up being canceled anyhow), but * it's not clear that the extra complexity would buy us anything. */ - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = head; + for (n = 0; n < num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; @@ -1324,16 +1353,19 @@ CompactCheckpointerRequestQueue(void) * CheckpointerShmemInit. Note also that RelFileLocator had better * contain no pad bytes. */ - request = &CheckpointerShmem->requests[n]; + request = &CheckpointerShmem->requests[read_idx]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { /* Duplicate, so mark the previous occurrence as skippable */ - skip_slot[slotmap->slot] = true; + skip_slot[slotmap->ring_idx] = true; num_skipped++; } /* Remember slot containing latest occurrence of this request value */ - slotmap->slot = n; + slotmap->ring_idx = read_idx; + + /* Move to the next request in the ring buffer */ + read_idx = (read_idx + 1) % max_requests; } /* Done with the hash table. */ @@ -1347,17 +1379,34 @@ CompactCheckpointerRequestQueue(void) } /* We found some duplicates; remove them. */ - preserve_count = 0; - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = write_idx = head; + for (n = 0; n < num_requests; n++) { - if (skip_slot[n]) - continue; - CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; + /* If this slot is NOT skipped, keep it */ + if (!skip_slot[read_idx]) + { + /* If the read and write positions are different, copy the request */ + if (write_idx != read_idx) + CheckpointerShmem->requests[write_idx] = + CheckpointerShmem->requests[read_idx]; + + /* Advance the write position */ + write_idx = (write_idx + 1) % max_requests; + } + + read_idx = (read_idx + 1) % max_requests; } + + /* + * Update ring buffer state: head remains the same, tail moves, count + * decreases + */ + CheckpointerShmem->tail = write_idx; + CheckpointerShmem->num_requests -= num_skipped; + ereport(DEBUG1, (errmsg_internal("compacted fsync request queue from %d entries to %d entries", - CheckpointerShmem->num_requests, preserve_count))); - CheckpointerShmem->num_requests = preserve_count; + num_requests, CheckpointerShmem->num_requests))); /* Cleanup. */ pfree(skip_slot); @@ -1378,40 +1427,64 @@ AbsorbSyncRequests(void) { CheckpointerRequest *requests = NULL; CheckpointerRequest *request; - int n; + int n, + i; + bool loop; if (!AmCheckpointerProcess()) return; - LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array, and processing the requests after releasing the lock. - * - * Once we have cleared the requests from shared memory, we have to PANIC - * if we then fail to absorb them (eg, because our hashtable runs out of - * memory). This is because the system cannot run safely if we are unable - * to fsync what we have been told to fsync. Fortunately, the hashtable - * is so small that the problem is quite unlikely to arise in practice. - */ - n = CheckpointerShmem->num_requests; - if (n > 0) + do { - requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - } + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /*--- + * We try to avoid holding the lock for a long time by: + * 1. Copying the request array and processing the requests after + * releasing the lock; + * 2. 
Processing not the whole queue, but only batches of + * CKPT_REQ_BATCH_SIZE at once. + * + * Once we have cleared the requests from shared memory, we must + * PANIC if we then fail to absorb them (e.g., because our hashtable + * runs out of memory). This is because the system cannot run safely + * if we are unable to fsync what we have been told to fsync. + * Fortunately, the hashtable is so small that the problem is quite + * unlikely to arise in practice. + * + * Note: The maximum possible size of a ring buffer is + * MAX_CHECKPOINT_REQUESTS entries, which fit into a maximum palloc + * allocation size of 1Gb. Our maximum batch size, + * CKPT_REQ_BATCH_SIZE, is even smaller. + */ + n = Min(CheckpointerShmem->num_requests, CKPT_REQ_BATCH_SIZE); + if (n > 0) + { + if (!requests) + requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - START_CRIT_SECTION(); + for (i = 0; i < n; i++) + { + requests[i] = CheckpointerShmem->requests[CheckpointerShmem->head]; + CheckpointerShmem->head = (CheckpointerShmem->head + 1) % CheckpointerShmem->max_requests; + } - CheckpointerShmem->num_requests = 0; + CheckpointerShmem->num_requests -= n; - LWLockRelease(CheckpointerCommLock); + } + + START_CRIT_SECTION(); + + /* Are there any requests in the queue? If so, keep going. */ + loop = CheckpointerShmem->num_requests != 0; + + LWLockRelease(CheckpointerCommLock); - for (request = requests; n > 0; request++, n--) - RememberSyncRequest(&request->ftag, request->type); + for (request = requests; n > 0; request++, n--) + RememberSyncRequest(&request->ftag, request->type); - END_CRIT_SECTION(); + END_CRIT_SECTION(); + } while (loop); if (requests) pfree(requests); diff --git a/src/backend/postmaster/pmchild.c b/src/backend/postmaster/pmchild.c index cde1d23a4ca..584bb58c8ab 100644 --- a/src/backend/postmaster/pmchild.c +++ b/src/backend/postmaster/pmchild.c @@ -60,6 +60,17 @@ NON_EXEC_STATIC int num_pmchild_slots = 0; dlist_head ActiveChildList; /* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * PMChild structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern PMChild *pmchild_array; +PMChild *pmchild_array; +#endif + + +/* * MaxLivePostmasterChildren * * This reports the number of postmaster child processes that can be active. @@ -125,8 +136,13 @@ InitPostmasterChildSlots(void) for (int i = 0; i < BACKEND_NUM_TYPES; i++) num_pmchild_slots += pmchild_pools[i].size; - /* Initialize them */ + /* Allocate enough slots, and make sure Valgrind doesn't complain */ slots = palloc(num_pmchild_slots * sizeof(PMChild)); +#ifdef USE_VALGRIND + pmchild_array = slots; +#endif + + /* Initialize them */ slotno = 0; for (int btype = 0; btype < BACKEND_NUM_TYPES; btype++) { diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index cca9b946e53..e01d9f0cfe8 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2630,6 +2630,13 @@ CleanupBackend(PMChild *bp, } bp = NULL; + /* + * In a crash case, exit immediately without resetting background worker + * state. However, if restart_after_crash is enabled, the background + * worker state (e.g., rw_pid) still needs to be reset so the worker can + * restart after crash recovery. This reset is handled in + * ResetBackgroundWorkerCrashTimes(), not here.
+ */ if (crashed) { HandleChildCrash(bp_pid, exitstatus, procname); diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index f7b5d093681..239641bfbb6 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -232,6 +232,9 @@ libpqrcv_connect(const char *conninfo, bool replication, bool logical, errhint("Target server's authentication method must be changed, or set password_required=false in the subscription parameters."))); } + PQsetNoticeReceiver(conn->streamConn, libpqsrv_notice_receiver, + "received message via replication"); + /* * Set always-secure search path for the cases where the connection is * used to run SQL queries, so malicious users can't get control. @@ -418,31 +421,22 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) "IDENTIFY_SYSTEM", WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive database system identifier and timeline ID from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } /* * IDENTIFY_SYSTEM returns 3 columns in 9.3 and earlier, and 4 columns in * 9.4 and onwards. */ if (PQnfields(res) < 3 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", - ntuples, nfields, 1, 3))); - } + PQntuples(res), PQnfields(res), 1, 3))); primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); PQclear(res); @@ -604,13 +598,10 @@ libpqrcv_startstreaming(WalReceiverConn *conn, return false; } else if (PQresultStatus(res) != PGRES_COPY_BOTH) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not start WAL streaming: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } PQclear(res); return true; } @@ -718,26 +709,17 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, cmd, WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive timeline history file from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } if (PQnfields(res) != 2 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", - ntuples, nfields))); - } + PQntuples(res), PQnfields(res)))); *filename = pstrdup(PQgetvalue(res, 0, 0)); *len = PQgetlength(res, 0, 1); @@ -841,13 +823,10 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, return -1; } else - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive data from WAL stream: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } } if (rawlen < -1) ereport(ERROR, @@ -971,13 +950,10 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, pfree(cmd.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, 
(errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not create replication slot \"%s\": %s", slotname, pchomp(PQerrorMessage(conn->streamConn))))); - } if (lsn) *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index d25085d3515..1fa931a7422 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -441,7 +441,8 @@ pa_launch_parallel_worker(void) MySubscription->name, MyLogicalRepWorker->userid, InvalidOid, - dsm_segment_handle(winfo->dsm_seg)); + dsm_segment_handle(winfo->dsm_seg), + false); if (launched) { diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 4aed0dfcebb..37377f7eb63 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -32,6 +32,7 @@ #include "postmaster/interrupt.h" #include "replication/logicallauncher.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "storage/ipc.h" @@ -91,7 +92,6 @@ static dshash_table *last_start_times = NULL; static bool on_commit_launcher_wakeup = false; -static void ApplyLauncherWakeup(void); static void logicalrep_launcher_onexit(int code, Datum arg); static void logicalrep_worker_onexit(int code, Datum arg); static void logicalrep_worker_detach(void); @@ -100,6 +100,9 @@ static int logicalrep_pa_worker_count(Oid subid); static void logicalrep_launcher_attach_dshmem(void); static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); +static void compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin); +static bool acquire_conflict_slot_if_exists(void); +static void advance_conflict_slot_xmin(TransactionId new_xmin); /* @@ -148,6 +151,7 @@ get_subscription_list(void) sub->owner = subform->subowner; sub->enabled = subform->subenabled; sub->name = pstrdup(NameStr(subform->subname)); + sub->retaindeadtuples = subform->subretaindeadtuples; /* We don't fill fields we are not interested in. */ res = lappend(res, sub); @@ -309,7 +313,8 @@ logicalrep_workers_find(Oid subid, bool only_running, bool acquire_lock) bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, - Oid relid, dsm_handle subworker_dsm) + Oid relid, dsm_handle subworker_dsm, + bool retain_dead_tuples) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -328,10 +333,13 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, * - must be valid worker type * - tablesync workers are only ones to have relid * - parallel apply worker is the only kind of subworker + * - The replication slot used in conflict detection is created when + * retain_dead_tuples is enabled */ Assert(wtype != WORKERTYPE_UNKNOWN); Assert(is_tablesync_worker == OidIsValid(relid)); Assert(is_parallel_apply_worker == (subworker_dsm != DSM_HANDLE_INVALID)); + Assert(!retain_dead_tuples || MyReplicationSlot); ereport(DEBUG1, (errmsg_internal("starting logical replication worker for subscription \"%s\"", @@ -454,6 +462,9 @@ retry: worker->stream_fileset = NULL; worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; worker->parallel_apply = is_parallel_apply_worker; + worker->oldest_nonremovable_xid = retain_dead_tuples + ? 
MyReplicationSlot->data.xmin + : InvalidTransactionId; worker->last_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->last_send_time); TIMESTAMP_NOBEGIN(worker->last_recv_time); @@ -779,6 +790,8 @@ logicalrep_worker_detach(void) } LWLockRelease(LogicalRepWorkerLock); + + list_free(workers); } /* Block concurrent access. */ @@ -1118,7 +1131,10 @@ ApplyLauncherWakeupAtCommit(void) on_commit_launcher_wakeup = true; } -static void +/* + * Wakeup the launcher immediately. + */ +void ApplyLauncherWakeup(void) { if (LogicalRepCtx->launcher_pid != 0) @@ -1150,6 +1166,12 @@ ApplyLauncherMain(Datum main_arg) */ BackgroundWorkerInitializeConnection(NULL, NULL, 0); + /* + * Acquire the conflict detection slot at startup to ensure it can be + * dropped if no longer needed after a restart. + */ + acquire_conflict_slot_if_exists(); + /* Enter main loop */ for (;;) { @@ -1159,6 +1181,9 @@ ApplyLauncherMain(Datum main_arg) MemoryContext subctx; MemoryContext oldctx; long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + bool can_advance_xmin = true; + bool retain_dead_tuples = false; + TransactionId xmin = InvalidTransactionId; CHECK_FOR_INTERRUPTS(); @@ -1168,7 +1193,14 @@ ApplyLauncherMain(Datum main_arg) ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(subctx); - /* Start any missing workers for enabled subscriptions. */ + /* + * Start any missing workers for enabled subscriptions. + * + * Also, during the iteration through all subscriptions, we compute + * the minimum XID required to protect deleted tuples for conflict + * detection if one of the subscription enables retain_dead_tuples + * option. + */ sublist = get_subscription_list(); foreach(lc, sublist) { @@ -1178,6 +1210,38 @@ ApplyLauncherMain(Datum main_arg) TimestampTz now; long elapsed; + if (sub->retaindeadtuples) + { + retain_dead_tuples = true; + + /* + * Can't advance xmin of the slot unless all the subscriptions + * with retain_dead_tuples are enabled. This is required to + * ensure that we don't advance the xmin of + * CONFLICT_DETECTION_SLOT if one of the subscriptions is not + * enabled. Otherwise, we won't be able to detect conflicts + * reliably for such a subscription even though it has set the + * retain_dead_tuples option. + */ + can_advance_xmin &= sub->enabled; + + /* + * Create a replication slot to retain information necessary + * for conflict detection such as dead tuples, commit + * timestamps, and origins. + * + * The slot is created before starting the apply worker to + * prevent it from unnecessarily maintaining its + * oldest_nonremovable_xid. + * + * The slot is created even for a disabled subscription to + * ensure that conflict-related information is available when + * applying remote changes that occurred before the + * subscription was enabled. + */ + CreateConflictDetectionSlot(); + } + if (!sub->enabled) continue; @@ -1186,7 +1250,27 @@ ApplyLauncherMain(Datum main_arg) LWLockRelease(LogicalRepWorkerLock); if (w != NULL) - continue; /* worker is running already */ + { + /* + * Compute the minimum xmin required to protect dead tuples + * required for conflict detection among all running apply + * workers that enables retain_dead_tuples. + */ + if (sub->retaindeadtuples && can_advance_xmin) + compute_min_nonremovable_xid(w, &xmin); + + /* worker is running already */ + continue; + } + + /* + * Can't advance xmin of the slot unless all the workers + * corresponding to subscriptions with retain_dead_tuples are + * running, disabling the further computation of the minimum + * nonremovable xid. 
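[The folding step behind this minimum appears in full as compute_min_nonremovable_xid() further below; its core is just a wraparound-aware running minimum, restated here for clarity:]

    /* Fold one worker's value into the running minimum; an invalid *xmin
     * means no candidate has been seen yet. */
    if (!TransactionIdIsValid(*xmin) ||
        TransactionIdPrecedes(nonremovable_xid, *xmin))
        *xmin = nonremovable_xid;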
+ */ + if (sub->retaindeadtuples) + can_advance_xmin = false; /* * If the worker is eligible to start now, launch it. Otherwise, @@ -1210,7 +1294,8 @@ ApplyLauncherMain(Datum main_arg) if (!logicalrep_worker_launch(WORKERTYPE_APPLY, sub->dbid, sub->oid, sub->name, sub->owner, InvalidOid, - DSM_HANDLE_INVALID)) + DSM_HANDLE_INVALID, + sub->retaindeadtuples)) { /* * We get here either if we failed to launch a worker @@ -1230,6 +1315,20 @@ ApplyLauncherMain(Datum main_arg) } } + /* + * Drop the CONFLICT_DETECTION_SLOT slot if there is no subscription + * that requires us to retain dead tuples. Otherwise, if required, + * advance the slot's xmin to protect dead tuples required for the + * conflict detection. + */ + if (MyReplicationSlot) + { + if (!retain_dead_tuples) + ReplicationSlotDropAcquired(); + else if (can_advance_xmin) + advance_conflict_slot_xmin(xmin); + } + /* Switch back to original memory context. */ MemoryContextSwitchTo(oldctx); /* Clean the temporary memory. */ @@ -1258,6 +1357,125 @@ ApplyLauncherMain(Datum main_arg) } /* + * Determine the minimum non-removable transaction ID across all apply workers + * for subscriptions that have retain_dead_tuples enabled. Store the result + * in *xmin. + */ +static void +compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin) +{ + TransactionId nonremovable_xid; + + Assert(worker != NULL); + + /* + * The replication slot for conflict detection must be created before the + * worker starts. + */ + Assert(MyReplicationSlot); + + SpinLockAcquire(&worker->relmutex); + nonremovable_xid = worker->oldest_nonremovable_xid; + SpinLockRelease(&worker->relmutex); + + Assert(TransactionIdIsValid(nonremovable_xid)); + + if (!TransactionIdIsValid(*xmin) || + TransactionIdPrecedes(nonremovable_xid, *xmin)) + *xmin = nonremovable_xid; +} + +/* + * Acquire the replication slot used to retain information for conflict + * detection, if it exists. + * + * Return true if successfully acquired, otherwise return false. + */ +static bool +acquire_conflict_slot_if_exists(void) +{ + if (!SearchNamedReplicationSlot(CONFLICT_DETECTION_SLOT, true)) + return false; + + ReplicationSlotAcquire(CONFLICT_DETECTION_SLOT, true, false); + return true; +} + +/* + * Advance the xmin the replication slot used to retain information required + * for conflict detection. + */ +static void +advance_conflict_slot_xmin(TransactionId new_xmin) +{ + Assert(MyReplicationSlot); + Assert(TransactionIdIsValid(new_xmin)); + Assert(TransactionIdPrecedesOrEquals(MyReplicationSlot->data.xmin, new_xmin)); + + /* Return if the xmin value of the slot cannot be advanced */ + if (TransactionIdEquals(MyReplicationSlot->data.xmin, new_xmin)) + return; + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = new_xmin; + MyReplicationSlot->data.xmin = new_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + elog(DEBUG1, "updated xmin: %u", MyReplicationSlot->data.xmin); + + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + + /* + * Like PhysicalConfirmReceivedLocation(), do not save slot information + * each time. This is acceptable because all concurrent transactions on + * the publisher that require the data preceding the slot's xmin should + * have already been applied and flushed on the subscriber before the xmin + * is advanced. So, even if the slot's xmin regresses after a restart, it + * will be advanced again in the next cycle. Therefore, no data required + * for conflict detection will be prematurely removed. 
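[The TransactionIdPrecedesOrEquals() assertion above depends on circular, modulo-2^32 XID comparison rather than plain integer ordering; a minimal illustration, assuming two ordinary (non-permanent) XIDs:]

    TransactionId older = 0xFFFFFFF5;   /* assigned just before wraparound */
    TransactionId newer = 3;            /* first normal XID afterwards */

    /* Numerically older > newer, yet in circular XID space it precedes: */
    Assert(TransactionIdPrecedes(older, newer));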
+ */ + return; +} + +/* + * Create and acquire the replication slot used to retain information for + * conflict detection, if not yet. + */ +void +CreateConflictDetectionSlot(void) +{ + TransactionId xmin_horizon; + + /* Exit early, if the replication slot is already created and acquired */ + if (MyReplicationSlot) + return; + + ereport(LOG, + errmsg("creating replication conflict detection slot")); + + ReplicationSlotCreate(CONFLICT_DETECTION_SLOT, false, RS_PERSISTENT, false, + false, false); + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(false); + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = xmin_horizon; + MyReplicationSlot->data.xmin = xmin_horizon; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + +/* * Is current process the logical replication launcher? */ bool diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 7b4e8629553..34cf05668ae 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -2599,7 +2599,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, if (++changes_count >= CHANGES_THRESHOLD) { - rb->update_progress_txn(rb, txn, change->lsn); + rb->update_progress_txn(rb, txn, prev_lsn); changes_count = 0; } } @@ -4917,7 +4917,7 @@ StartupReorderBuffer(void) continue; /* if it cannot be a slot, skip the directory */ - if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2)) continue; /* diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index e4fd6347fd1..d3356bc84ee 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -316,7 +316,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); /* * End streaming so that LogRepWorkerWalRcvConn can be used to drop @@ -425,6 +426,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) ListCell *lc; bool started_tx = false; bool should_exit = false; + Relation rel = NULL; Assert(!IsTransactionState()); @@ -492,7 +494,17 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * worker to remove the origin tracking as if there is any * error while dropping we won't restart it to drop the * origin. So passing missing_ok = true. + * + * Lock the subscription and origin in the same order as we + * are doing during DDL commands to avoid deadlocks. See + * AlterSubscription_refresh. 
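[Restating the ordering this comment prescribes — it is applied in the hunk that follows, with subid standing in for MyLogicalRepWorker->subid:]

    /* First the shared lock on the subscription object ... */
    LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock);

    /* ... then pg_subscription_rel, matching AlterSubscription_refresh. */
    rel = table_open(SubscriptionRelRelationId, RowExclusiveLock);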
*/ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, + 0, AccessShareLock); + + if (!rel) + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, rstate->relid, originname, @@ -504,7 +516,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) */ UpdateSubscriptionRelState(MyLogicalRepWorker->subid, rstate->relid, rstate->state, - rstate->lsn); + rstate->lsn, true); } } else @@ -555,7 +567,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * This is required to avoid any undetected deadlocks * due to any existing lock as deadlock detector won't * be able to detect the waits on the latch. + * + * Also close any tables prior to the commit. */ + if (rel) + { + table_close(rel, NoLock); + rel = NULL; + } CommitTransactionCommand(); pgstat_report_stat(false); } @@ -615,13 +634,19 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) MySubscription->name, MyLogicalRepWorker->userid, rstate->relid, - DSM_HANDLE_INVALID); + DSM_HANDLE_INVALID, + false); } } } } } + /* Close table if opened */ + if (rel) + table_close(rel, NoLock); + + if (started_tx) { /* @@ -1413,7 +1438,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); pgstat_report_stat(true); @@ -1546,7 +1572,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, SUBREL_STATE_FINISHEDCOPY, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index c5fb627aa56..b59221c4d06 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -132,6 +132,96 @@ * failover = true when creating the subscription. Enabling failover allows us * to smoothly transition to the promoted standby, ensuring that we can * subscribe to the new primary without losing any data. + * + * RETAIN DEAD TUPLES + * ---------------------- + * Each apply worker that enabled retain_dead_tuples option maintains a + * non-removable transaction ID (oldest_nonremovable_xid) in shared memory to + * prevent dead rows from being removed prematurely when the apply worker still + * needs them to detect conflicts reliably. This helps to retain the required + * commit_ts module information, which further helps to detect + * update_origin_differs and delete_origin_differs conflicts reliably, as + * otherwise, vacuum freeze could remove the required information. + * + * The logical replication launcher manages an internal replication slot named + * "pg_conflict_detection". It asynchronously aggregates the non-removable + * transaction ID from all apply workers to determine the appropriate xmin for + * the slot, thereby retaining necessary tuples. + * + * The non-removable transaction ID in the apply worker is advanced to the + * oldest running transaction ID once all concurrent transactions on the + * publisher have been applied and flushed locally. The process involves: + * + * - RDT_GET_CANDIDATE_XID: + * Call GetOldestActiveTransactionId() to take oldestRunningXid as the + * candidate xid. 
+ * + * - RDT_REQUEST_PUBLISHER_STATUS: + * Send a message to the walsender requesting the publisher status, which + * includes the latest WAL write position and information about transactions + * that are in the commit phase. + * + * - RDT_WAIT_FOR_PUBLISHER_STATUS: + * Wait for the status from the walsender. After receiving the first status, + * do not proceed if there are concurrent remote transactions that are still + * in the commit phase. These transactions might have been assigned an + * earlier commit timestamp but have not yet written the commit WAL record. + * Continue to request the publisher status (RDT_REQUEST_PUBLISHER_STATUS) + * until all these transactions have completed. + * + * - RDT_WAIT_FOR_LOCAL_FLUSH: + * Advance the non-removable transaction ID if the current flush location has + * reached or surpassed the last received WAL position. + * + * The overall state progression is: GET_CANDIDATE_XID -> + * REQUEST_PUBLISHER_STATUS -> WAIT_FOR_PUBLISHER_STATUS -> (loop to + * REQUEST_PUBLISHER_STATUS till concurrent remote transactions end) -> + * WAIT_FOR_LOCAL_FLUSH -> loop back to GET_CANDIDATE_XID. + * + * Retaining the dead tuples for this period is sufficient for ensuring + * eventual consistency using last-update-wins strategy, as dead tuples are + * useful for detecting conflicts only during the application of concurrent + * transactions from remote nodes. After applying and flushing all remote + * transactions that occurred concurrently with the tuple DELETE, any + * subsequent UPDATE from a remote node should have a later timestamp. In such + * cases, it is acceptable to detect an update_missing scenario and convert the + * UPDATE to an INSERT when applying it. But, detecting concurrent remote + * transactions with earlier timestamps than the DELETE is necessary, as the + * UPDATEs in remote transactions should be ignored if their timestamp is + * earlier than that of the dead tuples. + * + * Note that advancing the non-removable transaction ID is not supported if the + * publisher is also a physical standby. This is because the logical walsender + * on the standby can only get the WAL replay position but there may be more + * WALs that are being replicated from the primary and those WALs could have + * earlier commit timestamp. + * + * Similarly, when the publisher has subscribed to another publisher, + * information necessary for conflict detection cannot be retained for + * changes from origins other than the publisher. This is because publisher + * lacks the information on concurrent transactions of other publishers to + * which it subscribes. As the information on concurrent transactions is + * unavailable beyond subscriber's immediate publishers, the non-removable + * transaction ID might be advanced prematurely before changes from other + * origins have been fully applied. + * + * XXX Retaining information for changes from other origins might be possible + * by requesting the subscription on that origin to enable retain_dead_tuples + * and fetching the conflict detection slot.xmin along with the publisher's + * status. In the RDT_WAIT_FOR_PUBLISHER_STATUS phase, the apply worker could + * wait for the remote slot's xmin to reach the oldest active transaction ID, + * ensuring that all transactions from other origins have been applied on the + * publisher, thereby getting the latest WAL position that includes all + * concurrent changes. However, this approach may impact performance, so it + * might not worth the effort. 
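[For reference, both ends of the status exchange described above appear later in this patch; the reply parsed during RDT_WAIT_FOR_PUBLISHER_STATUS is laid out as follows, summarized from ProcessStandbyPSRequestMessage() and the apply loop:]

    /*
     * Primary status reply ('s'):
     *   byte    's'                                   message type
     *   int64   WAL write position on the publisher   (XLogRecPtr)
     *   int64   oldest xid in the commit phase        (FullTransactionId)
     *   int64   next xid to be assigned               (FullTransactionId)
     *   int64   publisher clock at reply time         (TimestampTz)
     */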
+ * + * XXX It seems feasible to get the latest commit's WAL location from the + * publisher and wait till that is applied. However, we can't do that + * because commit timestamps can regress as a commit with a later LSN is not + * guaranteed to have a later timestamp than those with earlier LSNs. Having + * said that, even if that is possible, it won't improve performance much as + * the apply always lag and moves slowly as compared with the transactions + * on the publisher. *------------------------------------------------------------------------- */ @@ -140,6 +230,7 @@ #include <sys/stat.h> #include <unistd.h> +#include "access/commit_ts.h" #include "access/table.h" #include "access/tableam.h" #include "access/twophase.h" @@ -148,6 +239,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_subscription.h" #include "catalog/pg_subscription_rel.h" +#include "commands/subscriptioncmds.h" #include "commands/tablecmds.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -166,12 +258,14 @@ #include "replication/logicalrelation.h" #include "replication/logicalworker.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" #include "storage/buffile.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/dynahash.h" @@ -268,6 +362,78 @@ typedef enum TRANS_PARALLEL_APPLY, } TransApplyAction; +/* + * The phases involved in advancing the non-removable transaction ID. + * + * See comments atop worker.c for details of the transition between these + * phases. + */ +typedef enum +{ + RDT_GET_CANDIDATE_XID, + RDT_REQUEST_PUBLISHER_STATUS, + RDT_WAIT_FOR_PUBLISHER_STATUS, + RDT_WAIT_FOR_LOCAL_FLUSH +} RetainDeadTuplesPhase; + +/* + * Critical information for managing phase transitions within the + * RetainDeadTuplesPhase. + */ +typedef struct RetainDeadTuplesData +{ + RetainDeadTuplesPhase phase; /* current phase */ + XLogRecPtr remote_lsn; /* WAL write position on the publisher */ + + /* + * Oldest transaction ID that was in the commit phase on the publisher. + * Use FullTransactionId to prevent issues with transaction ID wraparound, + * where a new remote_oldestxid could falsely appear to originate from the + * past and block advancement. + */ + FullTransactionId remote_oldestxid; + + /* + * Next transaction ID to be assigned on the publisher. Use + * FullTransactionId for consistency and to allow straightforward + * comparisons with remote_oldestxid. + */ + FullTransactionId remote_nextxid; + + TimestampTz reply_time; /* when the publisher responds with status */ + + /* + * Publisher transaction ID that must be awaited to complete before + * entering the final phase (RDT_WAIT_FOR_LOCAL_FLUSH). Use + * FullTransactionId for the same reason as remote_nextxid. + */ + FullTransactionId remote_wait_for; + + TransactionId candidate_xid; /* candidate for the non-removable + * transaction ID */ + TimestampTz flushpos_update_time; /* when the remote flush position was + * updated in final phase + * (RDT_WAIT_FOR_LOCAL_FLUSH) */ + + /* + * The following fields are used to determine the timing for the next + * round of transaction ID advancement. 
+ */ + TimestampTz last_recv_time; /* when the last message was received */ + TimestampTz candidate_xid_time; /* when the candidate_xid is decided */ + int xid_advance_interval; /* how much time (ms) to wait before + * attempting to advance the + * non-removable transaction ID */ +} RetainDeadTuplesData; + +/* + * The minimum (100ms) and maximum (3 minutes) intervals for advancing + * non-removable transaction IDs. The maximum interval is a bit arbitrary but + * is sufficient to not cause any undue network traffic. + */ +#define MIN_XID_ADVANCE_INTERVAL 100 +#define MAX_XID_ADVANCE_INTERVAL 180000 + /* errcontext tracker */ static ApplyErrorCallbackArg apply_error_callback_arg = { @@ -332,6 +498,13 @@ static XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; /* BufFile handle of the current streaming file */ static BufFile *stream_fd = NULL; +/* + * The remote WAL position that has been applied and flushed locally. We record + * and use this information both while sending feedback to the server and + * advancing oldest_nonremovable_xid. + */ +static XLogRecPtr last_flushpos = InvalidXLogRecPtr; + typedef struct SubXactInfo { TransactionId xid; /* XID of the subxact */ @@ -372,6 +545,19 @@ static void stream_close_file(void); static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); +static void maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received); +static bool can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data); +static void process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received); +static void get_candidate_xid(RetainDeadTuplesData *rdt_data); +static void request_publisher_status(RetainDeadTuplesData *rdt_data); +static void wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received); +static void wait_for_local_flush(RetainDeadTuplesData *rdt_data); +static void adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, + bool new_xid_found); + static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); static void apply_handle_insert_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, @@ -3577,6 +3763,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received) bool ping_sent = false; TimeLineID tli; ErrorContextCallback errcallback; + RetainDeadTuplesData rdt_data = {0}; /* * Init the ApplyMessageContext which we clean up after each replication @@ -3655,6 +3842,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_recv_timestamp = GetCurrentTimestamp(); ping_sent = false; + rdt_data.last_recv_time = last_recv_timestamp; + /* Ensure we are reading the data into our memory context. 
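[One detail of the apply loop is easy to miss: the sleep time is clamped so that a pending advancement attempt is not delayed past its interval. Condensed from the wait-time computation further down:]

    if (rdt_data.phase == RDT_GET_CANDIDATE_XID &&
        rdt_data.xid_advance_interval)
        wait_time = Min(wait_time, rdt_data.xid_advance_interval);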
*/ MemoryContextSwitchTo(ApplyMessageContext); @@ -3681,6 +3870,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) UpdateWorkerStats(last_received, send_time, false); apply_dispatch(&s); + + maybe_advance_nonremovable_xid(&rdt_data, false); } else if (c == 'k') { @@ -3696,8 +3887,31 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_received = end_lsn; send_feedback(last_received, reply_requested, false); + + maybe_advance_nonremovable_xid(&rdt_data, false); + UpdateWorkerStats(last_received, timestamp, true); } + else if (c == 's') /* Primary status update */ + { + rdt_data.remote_lsn = pq_getmsgint64(&s); + rdt_data.remote_oldestxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.remote_nextxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.reply_time = pq_getmsgint64(&s); + + /* + * This should never happen, see + * ProcessStandbyPSRequestMessage. But if it happens + * due to a bug, we don't want to proceed as it can + * incorrectly advance oldest_nonremovable_xid. + */ + if (XLogRecPtrIsInvalid(rdt_data.remote_lsn)) + elog(ERROR, "cannot get the latest WAL position from the publisher"); + + maybe_advance_nonremovable_xid(&rdt_data, true); + + UpdateWorkerStats(last_received, rdt_data.reply_time, false); + } /* other message types are purposefully ignored */ MemoryContextReset(ApplyMessageContext); @@ -3710,6 +3924,11 @@ LogicalRepApplyLoop(XLogRecPtr last_received) /* confirm all writes so far */ send_feedback(last_received, false, false); + /* Reset the timestamp if no message was received */ + rdt_data.last_recv_time = 0; + + maybe_advance_nonremovable_xid(&rdt_data, false); + if (!in_remote_transaction && !in_streamed_transaction) { /* @@ -3744,6 +3963,14 @@ LogicalRepApplyLoop(XLogRecPtr last_received) else wait_time = NAPTIME_PER_CYCLE; + /* + * Ensure to wake up when it's possible to advance the non-removable + * transaction ID. + */ + if (rdt_data.phase == RDT_GET_CANDIDATE_XID && + rdt_data.xid_advance_interval) + wait_time = Min(wait_time, rdt_data.xid_advance_interval); + rc = WaitLatchOrSocket(MyLatch, WL_SOCKET_READABLE | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, @@ -3807,6 +4034,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) send_feedback(last_received, requestReply, requestReply); + maybe_advance_nonremovable_xid(&rdt_data, false); + /* * Force reporting to ensure long idle periods don't lead to * arbitrarily delayed stats. Stats can only be reported outside @@ -3842,7 +4071,6 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) static XLogRecPtr last_recvpos = InvalidXLogRecPtr; static XLogRecPtr last_writepos = InvalidXLogRecPtr; - static XLogRecPtr last_flushpos = InvalidXLogRecPtr; XLogRecPtr writepos; XLogRecPtr flushpos; @@ -3921,6 +4149,367 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) } /* + * Attempt to advance the non-removable transaction ID. + * + * See comments atop worker.c for details. + */ +static void +maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + if (!can_advance_nonremovable_xid(rdt_data)) + return; + + process_rdt_phase_transition(rdt_data, status_received); +} + +/* + * Preliminary check to determine if advancing the non-removable transaction ID + * is allowed. 
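[As the loop above shows, the entry point is deliberately cheap and is invoked opportunistically; summarizing its call sites in this hunk:]

    /* after each applied change or keepalive: */
    maybe_advance_nonremovable_xid(&rdt_data, false);

    /* once a primary status ('s') reply has been parsed: */
    maybe_advance_nonremovable_xid(&rdt_data, true);

    /* and when the receive queue runs dry, with the timestamp reset: */
    rdt_data.last_recv_time = 0;
    maybe_advance_nonremovable_xid(&rdt_data, false);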
+ */ +static bool +can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data) +{ + /* + * It is sufficient to manage non-removable transaction ID for a + * subscription by the main apply worker to detect conflicts reliably even + * for table sync or parallel apply workers. + */ + if (!am_leader_apply_worker()) + return false; + + /* No need to advance if retaining dead tuples is not required */ + if (!MySubscription->retaindeadtuples) + return false; + + return true; +} + +/* + * Process phase transitions during the non-removable transaction ID + * advancement. See comments atop worker.c for details of the transition. + */ +static void +process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + switch (rdt_data->phase) + { + case RDT_GET_CANDIDATE_XID: + get_candidate_xid(rdt_data); + break; + case RDT_REQUEST_PUBLISHER_STATUS: + request_publisher_status(rdt_data); + break; + case RDT_WAIT_FOR_PUBLISHER_STATUS: + wait_for_publisher_status(rdt_data, status_received); + break; + case RDT_WAIT_FOR_LOCAL_FLUSH: + wait_for_local_flush(rdt_data); + break; + } +} + +/* + * Workhorse for the RDT_GET_CANDIDATE_XID phase. + */ +static void +get_candidate_xid(RetainDeadTuplesData *rdt_data) +{ + TransactionId oldest_running_xid; + TimestampTz now; + + /* + * Use last_recv_time when applying changes in the loop to avoid + * unnecessary system time retrieval. If last_recv_time is not available, + * obtain the current timestamp. + */ + now = rdt_data->last_recv_time ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Compute the candidate_xid and request the publisher status at most once + * per xid_advance_interval. Refer to adjust_xid_advance_interval() for + * details on how this value is dynamically adjusted. This is to avoid + * using CPU and network resources without making much progress. + */ + if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now, + rdt_data->xid_advance_interval)) + return; + + /* + * Immediately update the timer, even if the function returns later + * without setting candidate_xid due to inactivity on the subscriber. This + * avoids frequent calls to GetOldestActiveTransactionId. + */ + rdt_data->candidate_xid_time = now; + + /* + * Consider transactions in the current database, as only dead tuples from + * this database are required for conflict detection. + */ + oldest_running_xid = GetOldestActiveTransactionId(false, false); + + /* + * Oldest active transaction ID (oldest_running_xid) can't be behind any + * of its previously computed value. + */ + Assert(TransactionIdPrecedesOrEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)); + + /* Return if the oldest_nonremovable_xid cannot be advanced */ + if (TransactionIdEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)) + { + adjust_xid_advance_interval(rdt_data, false); + return; + } + + adjust_xid_advance_interval(rdt_data, true); + + rdt_data->candidate_xid = oldest_running_xid; + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_REQUEST_PUBLISHER_STATUS phase. 
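[Before moving on from get_candidate_xid() above, a worked example of its throttling guard: with xid_advance_interval = 400, a call 250 ms after candidate_xid_time returns immediately, while one 450 ms after it proceeds and re-stamps the timer, even if it then finds no new candidate:]

    /* Proceed only once at least xid_advance_interval ms have elapsed. */
    if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now,
                                    rdt_data->xid_advance_interval))
        return;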
+ */ +static void +request_publisher_status(RetainDeadTuplesData *rdt_data) +{ + static StringInfo request_message = NULL; + + if (!request_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + request_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(request_message); + + /* + * Send the current time to update the remote walsender's latest reply + * message received time. + */ + pq_sendbyte(request_message, 'p'); + pq_sendint64(request_message, GetCurrentTimestamp()); + + elog(DEBUG2, "sending publisher status request message"); + + /* Send a request for the publisher status */ + walrcv_send(LogRepWorkerWalRcvConn, + request_message->data, request_message->len); + + rdt_data->phase = RDT_WAIT_FOR_PUBLISHER_STATUS; + + /* + * Skip calling maybe_advance_nonremovable_xid() since further transition + * is possible only once we receive the publisher status message. + */ +} + +/* + * Workhorse for the RDT_WAIT_FOR_PUBLISHER_STATUS phase. + */ +static void +wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + /* + * Return if we have requested but not yet received the publisher status. + */ + if (!status_received) + return; + + if (!FullTransactionIdIsValid(rdt_data->remote_wait_for)) + rdt_data->remote_wait_for = rdt_data->remote_nextxid; + + /* + * Check if all remote concurrent transactions that were active at the + * first status request have now completed. If completed, proceed to the + * next phase; otherwise, continue checking the publisher status until + * these transactions finish. + * + * It's possible that transactions in the commit phase during the last + * cycle have now finished committing, but remote_oldestxid remains older + * than remote_wait_for. This can happen if some old transaction came in + * the commit phase when we requested status in this cycle. We do not + * handle this case explicitly as it's rare and the benefit doesn't + * justify the required complexity. Tracking would require either caching + * all xids at the publisher or sending them to subscribers. The condition + * will resolve naturally once the remaining transactions are finished. + * + * Directly advancing the non-removable transaction ID is possible if + * there are no activities on the publisher since the last advancement + * cycle. However, it requires maintaining two fields, last_remote_nextxid + * and last_remote_lsn, within the structure for comparison with the + * current cycle's values. Considering the minimal cost of continuing in + * RDT_WAIT_FOR_LOCAL_FLUSH without awaiting changes, we opted not to + * advance the transaction ID here. + */ + if (FullTransactionIdPrecedesOrEquals(rdt_data->remote_wait_for, + rdt_data->remote_oldestxid)) + rdt_data->phase = RDT_WAIT_FOR_LOCAL_FLUSH; + else + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_WAIT_FOR_LOCAL_FLUSH phase. + */ +static void +wait_for_local_flush(RetainDeadTuplesData *rdt_data) +{ + Assert(!XLogRecPtrIsInvalid(rdt_data->remote_lsn) && + TransactionIdIsValid(rdt_data->candidate_xid)); + + /* + * We expect the publisher and subscriber clocks to be in sync using time + * sync service like NTP. Otherwise, we will advance this worker's + * oldest_nonremovable_xid prematurely, leading to the removal of rows + * required to detect conflicts reliably. 
This check primarily addresses + * scenarios where the publisher's clock falls behind; if the publisher's + * clock is ahead, subsequent transactions will naturally bear later + * commit timestamps, conforming to the design outlined atop worker.c. + * + * XXX Consider waiting for the publisher's clock to catch up with the + * subscriber's before proceeding to the next phase. + */ + if (TimestampDifferenceExceeds(rdt_data->reply_time, + rdt_data->candidate_xid_time, 0)) + ereport(ERROR, + errmsg_internal("oldest_nonremovable_xid transaction ID could be advanced prematurely"), + errdetail_internal("The clock on the publisher is behind that of the subscriber.")); + + /* + * Do not attempt to advance the non-removable transaction ID when table + * sync is in progress. During this time, changes from a single + * transaction may be applied by multiple table sync workers corresponding + * to the target tables. So, it's necessary for all table sync workers to + * apply and flush the corresponding changes before advancing the + * transaction ID, otherwise, dead tuples that are still needed for + * conflict detection in table sync workers could be removed prematurely. + * However, confirming the apply and flush progress across all table sync + * workers is complex and not worth the effort, so we simply return if not + * all tables are in the READY state. + * + * It is safe to add new tables with initial states to the subscription + * after this check because any changes applied to these tables should + * have a WAL position greater than the rdt_data->remote_lsn. + */ + if (!AllTablesyncsReady()) + return; + + /* + * Update and check the remote flush position if we are applying changes + * in a loop. This is done at most once per WalWriterDelay to avoid + * performing costly operations in get_flush_position() too frequently + * during change application. + */ + if (last_flushpos < rdt_data->remote_lsn && rdt_data->last_recv_time && + TimestampDifferenceExceeds(rdt_data->flushpos_update_time, + rdt_data->last_recv_time, WalWriterDelay)) + { + XLogRecPtr writepos; + XLogRecPtr flushpos; + bool have_pending_txes; + + /* Fetch the latest remote flush position */ + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + if (flushpos > last_flushpos) + last_flushpos = flushpos; + + rdt_data->flushpos_update_time = rdt_data->last_recv_time; + } + + /* Return to wait for the changes to be applied */ + if (last_flushpos < rdt_data->remote_lsn) + return; + + /* + * Reaching here means the remote WAL position has been received, and all + * transactions up to that position on the publisher have been applied and + * flushed locally. So, we can advance the non-removable transaction ID. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->oldest_nonremovable_xid = rdt_data->candidate_xid; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + elog(DEBUG2, "confirmed flush up to remote lsn %X/%X: new oldest_nonremovable_xid %u", + LSN_FORMAT_ARGS(rdt_data->remote_lsn), + rdt_data->candidate_xid); + + /* Notify launcher to update the xmin of the conflict slot */ + ApplyLauncherWakeup(); + + /* + * Reset all data fields except those used to determine the timing for the + * next round of transaction ID advancement. We can even use + * flushpos_update_time in the next round to decide whether to get the + * latest flush position. 
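[To make the flush condition above concrete, with illustrative LSNs not taken from the source: if the publisher reported remote_lsn 0/3000158 and a later get_flush_position() call yields a flush position of 0/3000200, then last_flushpos < remote_lsn turns false and candidate_xid is published as oldest_nonremovable_xid; with a flush position of 0/3000100 the worker simply returns and retries on a later call.]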
+ */ + rdt_data->phase = RDT_GET_CANDIDATE_XID; + rdt_data->remote_lsn = InvalidXLogRecPtr; + rdt_data->remote_oldestxid = InvalidFullTransactionId; + rdt_data->remote_nextxid = InvalidFullTransactionId; + rdt_data->reply_time = 0; + rdt_data->remote_wait_for = InvalidFullTransactionId; + rdt_data->candidate_xid = InvalidTransactionId; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Adjust the interval for advancing non-removable transaction IDs. + * + * We double the interval to try advancing the non-removable transaction IDs + * if there is no activity on the node. The maximum value of the interval is + * capped by wal_receiver_status_interval if it is not zero, otherwise to a + * 3 minutes which should be sufficient to avoid using CPU or network + * resources without much benefit. + * + * The interval is reset to a minimum value of 100ms once there is some + * activity on the node. + * + * XXX The use of wal_receiver_status_interval is a bit arbitrary so we can + * consider the other interval or a separate GUC if the need arises. + */ +static void +adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, bool new_xid_found) +{ + if (!new_xid_found && rdt_data->xid_advance_interval) + { + int max_interval = wal_receiver_status_interval + ? wal_receiver_status_interval * 1000 + : MAX_XID_ADVANCE_INTERVAL; + + /* + * No new transaction ID has been assigned since the last check, so + * double the interval, but not beyond the maximum allowable value. + */ + rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval * 2, + max_interval); + } + else + { + /* + * A new transaction ID was found or the interval is not yet + * initialized, so set the interval to the minimum value. + */ + rdt_data->xid_advance_interval = MIN_XID_ADVANCE_INTERVAL; + } +} + +/* * Exit routine for apply workers due to subscription parameter changes. */ static void @@ -4708,6 +5297,30 @@ InitializeLogRepWorker(void) apply_worker_exit(); } + /* + * Restart the worker if retain_dead_tuples was enabled during startup. + * + * At this point, the replication slot used for conflict detection might + * not exist yet, or could be dropped soon if the launcher perceives + * retain_dead_tuples as disabled. To avoid unnecessary tracking of + * oldest_nonremovable_xid when the slot is absent or at risk of being + * dropped, a restart is initiated. + * + * The oldest_nonremovable_xid should be initialized only when the + * retain_dead_tuples is enabled before launching the worker. See + * logicalrep_worker_launch. + */ + if (am_leader_apply_worker() && + MySubscription->retaindeadtuples && + !TransactionIdIsValid(MyLogicalRepWorker->oldest_nonremovable_xid)) + { + ereport(LOG, + errmsg("logical replication worker for subscription \"%s\" will restart because the option %s was enabled during startup", + MySubscription->name, "retain_dead_tuples")); + + apply_worker_exit(); + } + /* Setup synchronous commit according to the user's wishes */ SetConfigOption("synchronous_commit", MySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); @@ -4864,6 +5477,14 @@ DisableSubscriptionAndExit(void) errmsg("subscription \"%s\" has been disabled because of an error", MySubscription->name)); + /* + * Skip the track_commit_timestamp check when disabling the worker due to + * an error, as verifying commit timestamps is unnecessary in this + * context. 
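[Looping back to adjust_xid_advance_interval() above: on an idle node with wal_receiver_status_interval = 10s, the doubling policy produces the following progression:]

    /* Idle-cycle progression (max_interval = 10000 ms in this example):
     *   100 -> 200 -> 400 -> 800 -> 1600 -> 3200 -> 6400 -> 10000 -> 10000 ...
     * Any new XID (or an uninitialized interval) snaps back to
     * MIN_XID_ADVANCE_INTERVAL, i.e. 100 ms. */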
+ */ + if (MySubscription->retaindeadtuples) + CheckSubDeadTupleRetention(false, true, WARNING); + proc_exit(0); } diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 082b4d9d327..f4c977262c5 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -297,10 +297,12 @@ parse_output_parameters(List *options, PGOutputData *data) bool two_phase_option_given = false; bool origin_option_given = false; + /* Initialize optional parameters to defaults */ data->binary = false; data->streaming = LOGICALREP_STREAM_OFF; data->messages = false; data->two_phase = false; + data->publish_no_origin = false; foreach(lc, options) { diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index e44ad576bc7..8605776ad86 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" #include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender_private.h" @@ -172,6 +173,7 @@ static SyncStandbySlotsConfigData *synchronized_standby_slots_config; static XLogRecPtr ss_oldest_flush_lsn = InvalidXLogRecPtr; static void ReplicationSlotShmemExit(int code, Datum arg); +static bool IsSlotForConflictCheck(const char *name); static void ReplicationSlotDropPtr(ReplicationSlot *slot); /* internal persistency functions */ @@ -258,13 +260,17 @@ ReplicationSlotShmemExit(int code, Datum arg) /* * Check whether the passed slot name is valid and report errors at elevel. * + * An error will be reported for a reserved replication slot name if + * allow_reserved_name is set to false. + * * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow * the name to be used as a directory name on every supported OS. * * Returns whether the directory name is valid or not if elevel < ERROR. */ bool -ReplicationSlotValidateName(const char *name, int elevel) +ReplicationSlotValidateName(const char *name, bool allow_reserved_name, + int elevel) { const char *cp; @@ -300,10 +306,32 @@ ReplicationSlotValidateName(const char *name, int elevel) return false; } } + + if (!allow_reserved_name && IsSlotForConflictCheck(name)) + { + ereport(elevel, + errcode(ERRCODE_RESERVED_NAME), + errmsg("replication slot name \"%s\" is reserved", + name), + errdetail("The name \"%s\" is reserved for the conflict detection slot.", + CONFLICT_DETECTION_SLOT)); + + return false; + } + return true; } /* + * Return true if the replication slot name is "pg_conflict_detection". + */ +static bool +IsSlotForConflictCheck(const char *name) +{ + return (strcmp(name, CONFLICT_DETECTION_SLOT) == 0); +} + +/* * Create a new replication slot and mark it as used by this backend. * * name: Name of the slot @@ -330,7 +358,12 @@ ReplicationSlotCreate(const char *name, bool db_specific, Assert(MyReplicationSlot == NULL); - ReplicationSlotValidateName(name, ERROR); + /* + * The logical launcher or pg_upgrade may create or migrate an internal + * slot, so using a reserved name is allowed in these cases. + */ + ReplicationSlotValidateName(name, IsBinaryUpgrade || IsLogicalLauncher(), + ERROR); if (failover) { @@ -582,6 +615,17 @@ retry: } /* + * Do not allow users to acquire the reserved slot. This scenario may + * occur if the launcher that owns the slot has terminated unexpectedly + * due to an error, and a backend process attempts to reuse the slot. 
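[A practical consequence, inferred from the check that follows: client operations that must first acquire the slot — e.g. pg_drop_replication_slot('pg_conflict_detection') — now fail with this error, and the slot goes away only when the launcher itself drops it after the last retain_dead_tuples subscription is gone.]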
+ */ + if (!IsLogicalLauncher() && IsSlotForConflictCheck(name)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cannot acquire replication slot \"%s\"", name), + errdetail("The slot is reserved for conflict detection and can only be acquired by logical replication launcher.")); + + /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. */ diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 7dec1f869c7..02004d621e7 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -157,17 +157,16 @@ syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse { struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext * macro */ - char *syncrep_parse_error_msg = *syncrep_parse_error_msg_p; /* report only the first error in a parse operation */ - if (syncrep_parse_error_msg) + if (*syncrep_parse_error_msg_p) return; if (yytext[0]) - syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", - message, yytext); + *syncrep_parse_error_msg_p = psprintf("%s at or near \"%s\"", + message, yytext); else - syncrep_parse_error_msg = psprintf("%s at end of input", - message); + *syncrep_parse_error_msg_p = psprintf("%s at end of input", + message); } void diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 28b8591efa5..ee911394a23 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -65,6 +65,7 @@ #include "funcapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" +#include "libpq/protocol.h" #include "miscadmin.h" #include "nodes/replnodes.h" #include "pgstat.h" @@ -84,6 +85,7 @@ #include "storage/ipc.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procarray.h" #include "tcop/dest.h" #include "tcop/tcopprot.h" #include "utils/acl.h" @@ -258,6 +260,7 @@ static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); static void ProcessStandbyHSFeedbackMessage(void); +static void ProcessStandbyPSRequestMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr); @@ -733,13 +736,13 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: maxmsglen = PQ_LARGE_MESSAGE_LIMIT; break; - case 'c': /* CopyDone */ - case 'f': /* CopyFail */ - case 'H': /* Flush */ - case 'S': /* Sync */ + case PqMsg_CopyDone: + case PqMsg_CopyFail: + case PqMsg_Flush: + case PqMsg_Sync: maxmsglen = PQ_SMALL_MESSAGE_LIMIT; break; default: @@ -761,19 +764,19 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, /* Process the message */ switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: AppendIncrementalManifestData(ib, buf->data, buf->len); return true; - case 'c': /* CopyDone */ + case PqMsg_CopyDone: return false; - case 'H': /* Sync */ - case 'S': /* Flush */ + case PqMsg_Sync: + case PqMsg_Flush: /* Ignore these while in CopyOut mode as we do elsewhere. 
*/ return true; - case 'f': + case PqMsg_CopyFail: ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED), errmsg("COPY from stdin failed: %s", @@ -1567,7 +1570,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, tmpbuf.data, sizeof(int64)); /* output previously gathered data in a CopyData packet */ - pq_putmessage_noblock('d', ctx->out->data, ctx->out->len); + pq_putmessage_noblock(PqMsg_CopyData, ctx->out->data, ctx->out->len); CHECK_FOR_INTERRUPTS(); @@ -2303,7 +2306,7 @@ ProcessRepliesIfAny(void) case PqMsg_CopyDone: if (!streamingDoneSending) { - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; } @@ -2355,6 +2358,10 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; + case 'p': + ProcessStandbyPSRequestMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2702,6 +2709,60 @@ ProcessStandbyHSFeedbackMessage(void) } /* + * Process the request for a primary status update message. + */ +static void +ProcessStandbyPSRequestMessage(void) +{ + XLogRecPtr lsn = InvalidXLogRecPtr; + TransactionId oldestXidInCommit; + FullTransactionId nextFullXid; + FullTransactionId fullOldestXidInCommit; + WalSnd *walsnd = MyWalSnd; + TimestampTz replyTime; + + /* + * This shouldn't happen because we don't support getting primary status + * message from standby. + */ + if (RecoveryInProgress()) + elog(ERROR, "the primary status is unavailable during recovery"); + + replyTime = pq_getmsgint64(&reply_message); + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + SpinLockAcquire(&walsnd->mutex); + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + + /* + * Consider transactions in the current database, as only these are the + * ones replicated. + */ + oldestXidInCommit = GetOldestActiveTransactionId(true, false); + nextFullXid = ReadNextFullTransactionId(); + fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, + oldestXidInCommit); + lsn = GetXLogWriteRecPtr(); + + elog(DEBUG2, "sending primary status"); + + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 's'); + pq_sendint64(&output_message, lsn); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(fullOldestXidInCommit)); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(nextFullXid)); + pq_sendint64(&output_message, GetCurrentTimestamp()); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); +} + +/* * Compute how long send/receive loops should sleep. * * If wal_sender_timeout is enabled we want to wake up in time to send @@ -3246,7 +3307,7 @@ XLogSendPhysical(void) wal_segment_close(xlogreader); /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; WalSndCaughtUp = true; @@ -3374,7 +3435,7 @@ retry: memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], tmpbuf.data, sizeof(int64)); - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); sentPtr = endptr; @@ -4080,7 +4141,7 @@ WalSndKeepalive(bool requestReply, XLogRecPtr writePtr) pq_sendbyte(&output_message, requestReply ? 1 : 0); /* ... 
and send it wrapped in CopyData */ - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); /* Set local flag */ if (requestReply) diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md index f10b5c7e31e..72ae3b3737d 100644 --- a/src/backend/storage/aio/README.md +++ b/src/backend/storage/aio/README.md @@ -94,7 +94,7 @@ pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); * * In this example we're reading only a single buffer, hence the 1. */ -pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); +pgaio_io_set_handle_data_32(ioh, (uint32 *) &buffer, 1); /* * Pass the AIO handle to lower-level function. When operating on the level of @@ -119,8 +119,9 @@ pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); * e.g. due to reaching a limit on the number of unsubmitted IOs, and even * complete before smgrstartreadv() returns. */ +void *page = BufferGetBlock(buffer); smgrstartreadv(ioh, operation->smgr, forknum, blkno, - BufferGetBlock(buffer), 1); + &page, 1); /* * To benefit from AIO, it is beneficial to perform other work, including diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6afdd28dba6..67431208e7f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2743,12 +2743,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * because mdread doesn't complain about reads beyond EOF (when * zero_damaged_pages is ON) and so a previous attempt to read a block * beyond EOF could have left a "valid" zero-filled buffer. - * Unfortunately, we have also seen this case occurring because of - * buggy Linux kernels that sometimes return an lseek(SEEK_END) result - * that doesn't account for a recent write. In that situation, the - * pre-existing buffer would contain valid data that we don't want to - * overwrite. Since the legitimate cases should always have left a - * zero-filled buffer, complain if not PageIsNew. + * + * This has also been observed when relation was overwritten by + * external process. Since the legitimate cases should always have + * left a zero-filled buffer, complain if not PageIsNew. */ if (existing_id >= 0) { @@ -2778,8 +2776,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", existing_hdr->tag.blockNum, - relpath(bmr.smgr->smgr_rlocator, fork).str), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); + relpath(bmr.smgr->smgr_rlocator, fork).str))); /* * We *must* do smgr[zero]extend before succeeding, else the page diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3da9c41ee1d..3c0d20f4659 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -932,10 +932,11 @@ GetLocalBufferStorage(void) num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); /* Buffers should be I/O aligned. 
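[The hunk below replaces the manual over-allocate-and-round idiom with the allocator's native alignment support. A minimal usage sketch of MemoryContextAllocAligned(), kept separate from the patched code:]

    /* Returns memory whose address is a multiple of the requested
     * alignment; with PG_IO_ALIGN_SIZE this satisfies direct-I/O
     * requirements without manual pointer rounding. */
    char *block = MemoryContextAllocAligned(CurrentMemoryContext,
                                            BLCKSZ, PG_IO_ALIGN_SIZE, 0);

    Assert((uintptr_t) block % PG_IO_ALIGN_SIZE == 0);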
*/ - cur_block = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, - MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); + cur_block = MemoryContextAllocAligned(LocalBufferContext, + num_bufs * BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); + next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index c6aefd2f688..beadeb5e46a 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -187,9 +187,11 @@ WaitLatch(Latch *latch, int wakeEvents, long timeout, if (!(wakeEvents & WL_LATCH_SET)) latch = NULL; ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); - ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, - (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), - NULL); + + if (IsUnderPostmaster) + ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, + (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), + NULL); if (WaitEventSetWait(LatchWaitSet, (wakeEvents & WL_TIMEOUT) ? timeout : -1, diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 2418967def6..bf987aed8d3 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2814,8 +2814,10 @@ GetRunningTransactionData(void) * * Similar to GetSnapshotData but returns just oldestActiveXid. We include * all PGPROCs with an assigned TransactionId, even VACUUM processes. - * We look at all databases, though there is no need to include WALSender - * since this has no effect on hot standby conflicts. + * + * If allDbs is true, we look at all databases, though there is no need to + * include WALSender since this has no effect on hot standby conflicts. If + * allDbs is false, skip processes attached to other databases. * * This is never executed during recovery so there is no need to look at * KnownAssignedXids. @@ -2823,9 +2825,12 @@ GetRunningTransactionData(void) * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * inCommitOnly indicates getting the oldestActiveXid among the transactions + * in the commit critical section. 
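[The two flags compose as follows in the callers this patch adds, collected from the hunks elsewhere in this section; pre-existing callers presumably keep the old behavior by passing inCommitOnly = false, allDbs = true:]

    /* subscriber, candidate xid: any running xact, current database only */
    oldest_running_xid = GetOldestActiveTransactionId(false, false);

    /* publisher status reply: commit-phase xacts, current database only */
    oldestXidInCommit = GetOldestActiveTransactionId(true, false);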
*/ TransactionId -GetOldestActiveTransactionId(void) +GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs) { ProcArrayStruct *arrayP = procArray; TransactionId *other_xids = ProcGlobal->xids; @@ -2852,6 +2857,8 @@ GetOldestActiveTransactionId(void) for (index = 0; index < arrayP->numProcs; index++) { TransactionId xid; + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(other_xids[index]); @@ -2859,6 +2866,13 @@ GetOldestActiveTransactionId(void) if (!TransactionIdIsNormal(xid)) continue; + if (inCommitOnly && + (proc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + if (!allDbs && proc->databaseId != MyDatabaseId) + continue; + if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index a9bb540b55a..087821311cc 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -728,7 +728,11 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) void SendCancelRequest(int backendPID, const uint8 *cancel_key, int cancel_key_len) { - Assert(backendPID != 0); + if (backendPID == 0) + { + ereport(LOG, (errmsg("invalid cancel request with PID 0"))); + return; + } /* * See if we have a matching backend. Reading the pss_pid and diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl index 4441b7cba0c..cd3e43c448a 100644 --- a/src/backend/storage/lmgr/generate-lwlocknames.pl +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -10,7 +10,6 @@ use Getopt::Long; my $output_path = '.'; my $lastlockidx = -1; -my $continue = "\n"; GetOptions('outdir:s' => \$output_path); @@ -28,18 +27,24 @@ print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n"; # -# First, record the predefined LWLocks listed in wait_event_names.txt. We'll -# cross-check those with the ones in lwlocklist.h. +# First, record the predefined LWLocks and built-in tranches listed in +# wait_event_names.txt. We'll cross-check those with the ones in lwlocklist.h. # +my @wait_event_tranches; my @wait_event_lwlocks; my $record_lwlocks = 0; +my $in_tranches = 0; while (<$wait_event_names>) { chomp; # Check for end marker. - last if /^# END OF PREDEFINED LWLOCKS/; + if (/^# END OF PREDEFINED LWLOCKS/) + { + $in_tranches = 1; + next; + } # Skip comments and empty lines. next if /^#/; @@ -55,13 +60,29 @@ while (<$wait_event_names>) # Go to the next line if we are not yet recording LWLocks. next if not $record_lwlocks; + # Stop recording if we reach another section. + last if /^Section:/; + # Record the LWLock. (my $waiteventname, my $waitevendocsentence) = split(/\t/, $_); - push(@wait_event_lwlocks, $waiteventname); + + if ($in_tranches) + { + push(@wait_event_tranches, $waiteventname); + } + else + { + push(@wait_event_lwlocks, $waiteventname); + } } +# +# While gathering the list of predefined LWLocks, cross-check the lists in +# lwlocklist.h with the wait events we just recorded. +# my $in_comment = 0; -my $i = 0; +my $lwlock_count = 0; +my $tranche_count = 0; while (<$lwlocklist>) { chomp; @@ -82,40 +103,72 @@ while (<$lwlocklist>) next; } - die "unable to parse lwlocklist.h line \"$_\"" - unless /^PG_LWLOCK\((\d+),\s+(\w+)\)$/; + # + # Gather list of predefined LWLocks and cross-check with the wait events. 
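For orientation, a sketch of what generate-lwlocknames.pl emits into lwlocknames.h from the PG_LWLOCK entries (the lock names below are real, but treat the index values as illustrative):

#define ShmemIndexLock                   (&MainLWLockArray[1].lock)
#define OidGenLock                       (&MainLWLockArray[2].lock)
#define XidGenLock                       (&MainLWLockArray[3].lock)

#define NUM_INDIVIDUAL_LWLOCKS 54	/* $lastlockidx + 1; value illustrative */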
+ # + if (/^PG_LWLOCK\((\d+),\s+(\w+)\)$/) + { + my ($lockidx, $lockname) = ($1, $2); - (my $lockidx, my $lockname) = ($1, $2); + die "lwlocklist.h not in order" if $lockidx < $lastlockidx; + die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; - die "lwlocklist.h not in order" if $lockidx < $lastlockidx; - die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + die "$lockname defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $lwlock_count >= scalar @wait_event_lwlocks; + die "lists of predefined LWLocks do not match (first mismatch at " + . "$wait_event_lwlocks[$lwlock_count] in wait_event_names.txt and " + . "$lockname in lwlocklist.h)" + if $wait_event_lwlocks[$lwlock_count] ne $lockname; - die "$lockname defined in lwlocklist.h but missing from " - . "wait_event_names.txt" - if $i >= scalar @wait_event_lwlocks; - die "lists of predefined LWLocks do not match (first mismatch at " - . "$wait_event_lwlocks[$i] in wait_event_names.txt and $lockname in " - . "lwlocklist.h)" - if $wait_event_lwlocks[$i] ne $lockname; - $i++; + $lwlock_count++; - while ($lastlockidx < $lockidx - 1) + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + } + $lastlockidx = $lockidx; + + # Add a "Lock" suffix to each lock name, as the C code depends on that. + printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", + $lockname . "Lock"; + + next; + } + + # + # Cross-check the built-in LWLock tranches with the wait events. + # + if (/^PG_LWLOCKTRANCHE\((\w+),\s+(\w+)\)$/) { - ++$lastlockidx; - $continue = ",\n"; + my ($tranche_id, $tranche_name) = ($1, $2); + + die "$tranche_name defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $tranche_count >= scalar @wait_event_tranches; + die + "lists of built-in LWLock tranches do not match (first mismatch at " + . "$wait_event_tranches[$tranche_count] in wait_event_names.txt and " + . "$tranche_name in lwlocklist.h)" + if $wait_event_tranches[$tranche_count] ne $tranche_name; + + $tranche_count++; + + next; } - $lastlockidx = $lockidx; - $continue = ",\n"; - # Add a "Lock" suffix to each lock name, as the C code depends on that - printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", - $lockname . "Lock"; + die "unable to parse lwlocklist.h line \"$_\""; } die - "$wait_event_lwlocks[$i] defined in wait_event_names.txt but missing from " - . "lwlocklist.h" - if $i < scalar @wait_event_lwlocks; + "$wait_event_lwlocks[$lwlock_count] defined in wait_event_names.txt but " + . " missing from lwlocklist.h" + if $lwlock_count < scalar @wait_event_lwlocks; + +die + "$wait_event_tranches[$tranche_count] defined in wait_event_names.txt but " + . "missing from lwlocklist.h" + if $tranche_count < scalar @wait_event_tranches; print $h "\n"; printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 46f44bc4511..ec9c345ffdf 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -122,9 +122,8 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, * own tranche. We absorb the names of these tranches from there into * BuiltinTrancheNames here. * - * 2. There are some predefined tranches for built-in groups of locks. - * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names - * appear in BuiltinTrancheNames[] below. + * 2. There are some predefined tranches for built-in groups of locks defined + * in lwlocklist.h. 
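A sketch of how the two macros now cooperate to populate BuiltinTrancheNames[]: lwlocklist.h carries entries of both kinds, and the #define/#include/#undef sequence turns them into designated initializers. Given (real) entries such as

PG_LWLOCK(1, ShmemIndex)
PG_LWLOCKTRANCHE(XACT_BUFFER, XactBuffer)

the include expands to roughly

static const char *const BuiltinTrancheNames[] = {
	[1] = "ShmemIndex",
	[LWTRANCHE_XACT_BUFFER] = "XactBuffer",
};

which is why the long hand-maintained initializer list in lwlock.c can be deleted.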
We absorb the names of these tranches, too. * * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche * or LWLockRegisterTranche. The names of these that are known in the current @@ -135,49 +134,10 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, */ static const char *const BuiltinTrancheNames[] = { #define PG_LWLOCK(id, lockname) [id] = CppAsString(lockname), +#define PG_LWLOCKTRANCHE(id, lockname) [LWTRANCHE_##id] = CppAsString(lockname), #include "storage/lwlocklist.h" #undef PG_LWLOCK - [LWTRANCHE_XACT_BUFFER] = "XactBuffer", - [LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer", - [LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer", - [LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer", - [LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer", - [LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer", - [LWTRANCHE_SERIAL_BUFFER] = "SerialBuffer", - [LWTRANCHE_WAL_INSERT] = "WALInsert", - [LWTRANCHE_BUFFER_CONTENT] = "BufferContent", - [LWTRANCHE_REPLICATION_ORIGIN_STATE] = "ReplicationOriginState", - [LWTRANCHE_REPLICATION_SLOT_IO] = "ReplicationSlotIO", - [LWTRANCHE_LOCK_FASTPATH] = "LockFastPath", - [LWTRANCHE_BUFFER_MAPPING] = "BufferMapping", - [LWTRANCHE_LOCK_MANAGER] = "LockManager", - [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager", - [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin", - [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan", - [LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA", - [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA", - [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType", - [LWTRANCHE_PER_SESSION_RECORD_TYPMOD] = "PerSessionRecordTypmod", - [LWTRANCHE_SHARED_TUPLESTORE] = "SharedTupleStore", - [LWTRANCHE_SHARED_TIDBITMAP] = "SharedTidBitmap", - [LWTRANCHE_PARALLEL_APPEND] = "ParallelAppend", - [LWTRANCHE_PER_XACT_PREDICATE_LIST] = "PerXactPredicateList", - [LWTRANCHE_PGSTATS_DSA] = "PgStatsDSA", - [LWTRANCHE_PGSTATS_HASH] = "PgStatsHash", - [LWTRANCHE_PGSTATS_DATA] = "PgStatsData", - [LWTRANCHE_LAUNCHER_DSA] = "LogicalRepLauncherDSA", - [LWTRANCHE_LAUNCHER_HASH] = "LogicalRepLauncherHash", - [LWTRANCHE_DSM_REGISTRY_DSA] = "DSMRegistryDSA", - [LWTRANCHE_DSM_REGISTRY_HASH] = "DSMRegistryHash", - [LWTRANCHE_COMMITTS_SLRU] = "CommitTsSLRU", - [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultixactOffsetSLRU", - [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultixactMemberSLRU", - [LWTRANCHE_NOTIFY_SLRU] = "NotifySLRU", - [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", - [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", - [LWTRANCHE_XACT_SLRU] = "XactSLRU", - [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", - [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", +#undef PG_LWLOCKTRANCHE }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index ad0af5edc1f..14d5fc0b196 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -492,7 +492,7 @@ static int ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { int32 len; - char *buf; + char *buf = NULL; ProtocolVersion proto; MemoryContext oldcontext; @@ -516,7 +516,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) * scanners, which may be less benign, but it's not really our job to * notice those.) 
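The surrounding rewrite funnels every early exit through a single label so that "buf" (initialized to NULL) is freed exactly once on failure. A minimal sketch of the pattern, with hypothetical helpers standing in for the real packet-reading steps:

static int
startup_sketch(void)
{
	char	   *buf = NULL;				/* stays NULL until allocated */

	if (read_packet_length() == EOF)	/* hypothetical helper */
		goto fail;						/* nothing allocated yet: still safe */

	buf = palloc(MAX_STARTUP_PACKET_LENGTH);	/* hypothetical size limit */

	if (read_packet_body(buf) == EOF)	/* hypothetical helper */
		goto fail;						/* buf released below */

	pfree(buf);
	return STATUS_OK;

fail:
	/* be tidy, just to avoid Valgrind complaints */
	if (buf)
		pfree(buf);
	return STATUS_ERROR;
}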
*/ - return STATUS_ERROR; + goto fail; } if (pq_getbytes(((char *) &len) + 1, 3) == EOF) @@ -526,7 +526,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } len = pg_ntoh32(len); @@ -538,7 +538,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid length of startup packet"))); - return STATUS_ERROR; + goto fail; } /* @@ -554,7 +554,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } pq_endmsgread(); @@ -568,7 +568,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { ProcessCancelRequestPacket(port, buf, len); /* Not really an error, but we don't want to proceed further */ - return STATUS_ERROR; + goto fail; } if (proto == NEGOTIATE_SSL_CODE && !ssl_done) @@ -607,14 +607,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send SSL negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef USE_SSL if (SSLok == 'S' && secure_open_server(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. If we do, * it was received before we performed the SSL handshake, so it wasn't @@ -661,14 +663,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send GSSAPI negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef ENABLE_GSS if (GSSok == 'G' && secure_open_gssapi(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. If we do, * it was received before we performed the GSS handshake, so it wasn't @@ -863,7 +867,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) */ MemoryContextSwitchTo(oldcontext); + pfree(buf); + return STATUS_OK; + +fail: + /* be tidy, just to avoid Valgrind complaints */ + if (buf) + pfree(buf); + + return STATUS_ERROR; } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 2f8c3d5f918..0cecd464902 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -988,6 +988,7 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions, stmt->stmt_location = query->stmt_location; stmt->stmt_len = query->stmt_len; stmt->queryId = query->queryId; + stmt->planOrigin = PLAN_STMT_INTERNAL; } else { diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index d1593f38b35..08791b8f75e 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1350,24 +1350,15 @@ PortalRunMulti(Portal portal, PopActiveSnapshot(); /* - * If a query completion data was supplied, use it. Otherwise use the - * portal's query completion data. - * - * Exception: Clients expect INSERT/UPDATE/DELETE tags to have counts, so - * fake them with zeros. This can happen with DO INSTEAD rules if there - * is no replacement query of the same type as the original. 
We print "0 - * 0" here because technically there is no query of the matching tag type, - * and printing a non-zero count for a different query type seems wrong, - * e.g. an INSERT that does an UPDATE instead should not print "0 1" if - * one row was updated. See QueryRewrite(), step 3, for details. + * If a command tag was requested and we did not fill in a run-time- + * determined tag above, copy the parse-time tag from the Portal. (There + * might not be any tag there either, in edge cases such as empty prepared + * statements. That's OK.) */ - if (qc && qc->commandTag == CMDTAG_UNKNOWN) - { - if (portal->qc.commandTag != CMDTAG_UNKNOWN) - CopyQueryCompletion(qc, &portal->qc); - /* If the caller supplied a qc, we should have set it by now. */ - Assert(qc->commandTag != CMDTAG_UNKNOWN); - } + if (qc && + qc->commandTag == CMDTAG_UNKNOWN && + portal->qc.commandTag != CMDTAG_UNKNOWN) + CopyQueryCompletion(qc, &portal->qc); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4c1faf5575c..4f4191b0ea6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1234,6 +1234,7 @@ ProcessUtilitySlow(ParseState *pstate, wrapper->utilityStmt = stmt; wrapper->stmt_location = pstmt->stmt_location; wrapper->stmt_len = pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, queryString, @@ -1964,6 +1965,7 @@ ProcessUtilityForAlterTable(Node *stmt, AlterTableUtilityContext *context) wrapper->utilityStmt = stmt; wrapper->stmt_location = context->pstmt->stmt_location; wrapper->stmt_len = context->pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, context->queryString, diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c index 63bd193a78a..debfbf956cc 100644 --- a/src/backend/tsearch/dict_ispell.c +++ b/src/backend/tsearch/dict_ispell.c @@ -47,24 +47,30 @@ dispell_init(PG_FUNCTION_ARGS) if (strcmp(defel->defname, "dictfile") == 0) { + char *filename; + if (dictloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple DictFile parameters"))); - NIImportDictionary(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "dict")); + filename = get_tsearch_config_filename(defGetString(defel), + "dict"); + NIImportDictionary(&(d->obj), filename); + pfree(filename); dictloaded = true; } else if (strcmp(defel->defname, "afffile") == 0) { + char *filename; + if (affloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple AffFile parameters"))); - NIImportAffixes(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "affix")); + filename = get_tsearch_config_filename(defGetString(defel), + "affix"); + NIImportAffixes(&(d->obj), filename); + pfree(filename); affloaded = true; } else if (strcmp(defel->defname, "stopwords") == 0) diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 0da5a9d6868..c2773eb01ad 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -199,6 +199,7 @@ skipline: } tsearch_readline_end(&trst); + pfree(filename); d->len = cur; qsort(d->syn, d->len, sizeof(Syn), compareSyn); diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 1bebe36a691..1e6bbde1ca7 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -167,17 +167,17 @@ addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 p static void thesaurusRead(const 
char *filename, DictThesaurus *d) { + char *real_filename = get_tsearch_config_filename(filename, "ths"); tsearch_readline_state trst; uint32 idsubst = 0; bool useasis = false; char *line; - filename = get_tsearch_config_filename(filename, "ths"); - if (!tsearch_readline_begin(&trst, filename)) + if (!tsearch_readline_begin(&trst, real_filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open thesaurus file \"%s\": %m", - filename))); + real_filename))); while ((line = tsearch_readline(&trst)) != NULL) { @@ -297,6 +297,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) d->nsubst = idsubst; tsearch_readline_end(&trst); + pfree(real_filename); } static TheLexeme * diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 8b57845e870..6bc91ce0dad 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -212,6 +212,11 @@ int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; PgStat_LocalState pgStatLocal; +/* + * Track pending reports for fixed-numbered stats, used by + * pgstat_report_stat(). + */ +bool pgstat_report_fixed = false; /* ---------- * Local data @@ -370,7 +375,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_off = offsetof(PgStatShared_Backend, stats), .shared_data_len = sizeof(((PgStatShared_Backend *) 0)->stats), - .have_static_pending_cb = pgstat_backend_have_pending_cb, .flush_static_cb = pgstat_backend_flush_cb, .reset_timestamp_cb = pgstat_backend_reset_timestamp_cb, }, @@ -437,7 +441,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats), .flush_static_cb = pgstat_io_flush_cb, - .have_static_pending_cb = pgstat_io_have_pending_cb, .init_shmem_cb = pgstat_io_init_shmem_cb, .reset_all_cb = pgstat_io_reset_all_cb, .snapshot_cb = pgstat_io_snapshot_cb, @@ -455,7 +458,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_SLRU *) 0)->stats), .flush_static_cb = pgstat_slru_flush_cb, - .have_static_pending_cb = pgstat_slru_have_pending_cb, .init_shmem_cb = pgstat_slru_init_shmem_cb, .reset_all_cb = pgstat_slru_reset_all_cb, .snapshot_cb = pgstat_slru_snapshot_cb, @@ -474,7 +476,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .init_backend_cb = pgstat_wal_init_backend_cb, .flush_static_cb = pgstat_wal_flush_cb, - .have_static_pending_cb = pgstat_wal_have_pending_cb, .init_shmem_cb = pgstat_wal_init_shmem_cb, .reset_all_cb = pgstat_wal_reset_all_cb, .snapshot_cb = pgstat_wal_snapshot_cb, @@ -708,29 +709,10 @@ pgstat_report_stat(bool force) } /* Don't expend a clock check if nothing to do */ - if (dlist_is_empty(&pgStatPending)) + if (dlist_is_empty(&pgStatPending) && + !pgstat_report_fixed) { - bool do_flush = false; - - /* Check for pending stats */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->have_static_pending_cb) - continue; - - if (kind_info->have_static_pending_cb()) - { - do_flush = true; - break; - } - } - - if (!do_flush) - return 0; + return 0; } /* @@ -784,16 +766,19 @@ pgstat_report_stat(bool force) partial_flush |= pgstat_flush_pending_entries(nowait); /* flush of other stats kinds */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + 
if (pgstat_report_fixed) { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; + if (!kind_info) + continue; + if (!kind_info->flush_static_cb) + continue; - partial_flush |= kind_info->flush_static_cb(nowait); + partial_flush |= kind_info->flush_static_cb(nowait); + } } last_flush = now; @@ -815,6 +800,7 @@ pgstat_report_stat(bool force) } pending_since = 0; + pgstat_report_fixed = false; return 0; } diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 51256277e8d..8714a85e2d9 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -66,6 +66,7 @@ pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, io_time); backend_has_iostats = true; + pgstat_report_fixed = true; } void @@ -81,6 +82,7 @@ pgstat_count_backend_io_op(IOObject io_object, IOContext io_context, PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes; backend_has_iostats = true; + pgstat_report_fixed = true; } /* @@ -302,18 +304,6 @@ pgstat_flush_backend(bool nowait, bits32 flags) } /* - * Check if there are any backend stats waiting for flush. - */ -bool -pgstat_backend_have_pending_cb(void) -{ - if (!pgstat_tracks_backend_bktype(MyBackendType)) - return false; - - return (backend_has_iostats || pgstat_backend_wal_have_pending()); -} - -/* * Callback to flush out locally pending backend statistics. * * If some stats could not be flushed due to lock contention, return true. diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index d8d26379a57..13ae57ed649 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -80,6 +80,7 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes); have_iostats = true; + pgstat_report_fixed = true; } /* @@ -168,15 +169,6 @@ pgstat_fetch_stat_io(void) } /* - * Check if there any IO stats waiting for flush. - */ -bool -pgstat_io_have_pending_cb(void) -{ - return have_iostats; -} - -/* * Simpler wrapper of pgstat_io_flush_cb() */ void diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index b9e940dde45..7bd8744accb 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -144,15 +144,6 @@ pgstat_get_slru_index(const char *name) } /* - * Check if there are any SLRU stats entries waiting for flush. - */ -bool -pgstat_slru_have_pending_cb(void) -{ - return have_slrustats; -} - -/* * Flush out locally pending SLRU stats entries * * If nowait is true, this function returns false on lock failure. Otherwise @@ -247,6 +238,7 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); have_slrustats = true; + pgstat_report_fixed = true; return &pending_SLRUStats[slru_idx]; } diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 16a1ecb4d90..0d04480d2f6 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -72,6 +72,15 @@ pgstat_fetch_stat_wal(void) } /* + * To determine whether WAL usage happened. 
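Pulling the pieces of this change together, a hedged sketch of the new contract: code that accumulates pending fixed-numbered stats raises the global flag at count time, and pgstat_report_stat() tests that single flag instead of polling a have_static_pending_cb for every stats kind (fragments, not compilable as-is):

/* producer side, e.g. when an IO counter is bumped: */
have_iostats = true;
pgstat_report_fixed = true;		/* remember that something needs flushing */

/* consumer side, early in pgstat_report_stat(): */
if (dlist_is_empty(&pgStatPending) && !pgstat_report_fixed)
	return 0;					/* fast path: nothing pending at all */

/* and after a successful flush: */
pgstat_report_fixed = false;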
+ */ +static inline bool +pgstat_wal_have_pending(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records; +} + +/* * Calculate how much WAL usage counters have increased by subtracting the * previous counters from the current ones. * @@ -92,7 +101,7 @@ pgstat_wal_flush_cb(bool nowait) * This function can be called even if nothing at all has happened. Avoid * taking lock for nothing in that case. */ - if (!pgstat_wal_have_pending_cb()) + if (!pgstat_wal_have_pending()) return false; /* @@ -136,15 +145,6 @@ pgstat_wal_init_backend_cb(void) prevWalUsage = pgWalUsage; } -/* - * To determine whether WAL usage happened. - */ -bool -pgstat_wal_have_pending_cb(void) -{ - return pgWalUsage.wal_records != prevWalUsage.wal_records; -} - void pgstat_wal_init_shmem_cb(void *stats) { diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4da68312b5f..0be307d2ca0 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -356,9 +356,13 @@ AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) # -# Predefined LWLocks (i.e., those declared in lwlocknames.h) must be listed -# in the section above and must be listed in the same order as in -# lwlocknames.h. Other LWLocks must be listed in the section below. +# Predefined LWLocks (i.e., those declared at the top of lwlocknames.h) must be +# listed in the section above and must be listed in the same order as in +# lwlocknames.h. +# +# Likewise, the built-in LWLock tranches (i.e., those declared at the bottom of +# lwlocknames.h) must be listed in the section below and must be listed in the +# same order as in lwlocknames.h. # XactBuffer "Waiting for I/O on a transaction status SLRU buffer." diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index 2e539c2504e..6e7b914c563 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -182,27 +182,21 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) * * Non-printable characters must be passed as '\nnn' (octal) and are * converted to internal form. '\' must be passed as '\\'. - * ereport(ERROR, ...) if bad form. - * - * BUGS: - * The input is scanned twice. - * The error checking of input is minimal. 
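A worked example of the octal decoding done by the rewritten single-pass loop: for the escaped input "\101", VAL() maps each digit character to its numeric value and two 3-bit shifts assemble the byte:

int		v;

v = VAL('1');			/* v = 1 */
v <<= 3;				/* v = 8 */
v += VAL('0');			/* v = 8 */
v <<= 3;				/* v = 64 */
v += VAL('1');			/* v = 65, i.e. 'A' */

Since every escape sequence ("\\" or "\nnn") decodes to a single byte, the output can only be shorter than the input; that is why palloc(len + VARHDRSZ) is a safe upper bound and the true size can be computed afterward from rp - VARDATA(result), eliminating the old pre-scan.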
*/ Datum byteain(PG_FUNCTION_ARGS) { char *inputText = PG_GETARG_CSTRING(0); Node *escontext = fcinfo->context; + size_t len = strlen(inputText); + size_t bc; char *tp; char *rp; - int bc; bytea *result; /* Recognize hex input */ if (inputText[0] == '\\' && inputText[1] == 'x') { - size_t len = strlen(inputText); - bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ result = palloc(bc); bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), @@ -213,33 +207,7 @@ byteain(PG_FUNCTION_ARGS) } /* Else, it's the traditional escaped style */ - for (bc = 0, tp = inputText; *tp != '\0'; bc++) - { - if (tp[0] != '\\') - tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - tp += 4; - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - tp += 2; - else - { - /* - * one backslash, not followed by another or ### valid octal - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - bc += VARHDRSZ; - - result = (bytea *) palloc(bc); - SET_VARSIZE(result, bc); + result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */ tp = inputText; rp = VARDATA(result); @@ -247,21 +215,21 @@ byteain(PG_FUNCTION_ARGS) { if (tp[0] != '\\') *rp++ = *tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && + else if ((tp[1] >= '0' && tp[1] <= '3') && (tp[2] >= '0' && tp[2] <= '7') && (tp[3] >= '0' && tp[3] <= '7')) { - bc = VAL(tp[1]); - bc <<= 3; - bc += VAL(tp[2]); - bc <<= 3; - *rp++ = bc + VAL(tp[3]); + int v; + + v = VAL(tp[1]); + v <<= 3; + v += VAL(tp[2]); + v <<= 3; + *rp++ = v + VAL(tp[3]); tp += 4; } - else if ((tp[0] == '\\') && - (tp[1] == '\\')) + else if (tp[1] == '\\') { *rp++ = '\\'; tp += 2; @@ -269,7 +237,7 @@ byteain(PG_FUNCTION_ARGS) else { /* - * We should never get here. The first pass should not allow it. + * one backslash, not followed by another or ### valid octal */ ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), @@ -277,6 +245,9 @@ byteain(PG_FUNCTION_ARGS) } } + bc = rp - VARDATA(result); /* actual length */ + SET_VARSIZE(result, bc + VARHDRSZ); + PG_RETURN_BYTEA_P(result); } diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index c8b6c15e059..82b807d067a 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -277,22 +277,16 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) else { /* - * It's safe to assume that the types differed, and that the va - * and vb values passed were set. - * - * If the two values were of the same container type, then there'd - * have been a chance to observe the variation in the number of - * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're - * either two heterogeneously-typed containers, or a container and - * some scalar type. - * - * We don't have to consider the WJB_END_ARRAY and WJB_END_OBJECT - * cases here, because we would have seen the corresponding - * WJB_BEGIN_ARRAY and WJB_BEGIN_OBJECT tokens first, and - * concluded that they don't match. + * It's not possible for one iterator to report end of array or + * object while the other one reports something else, because we + * would have detected a length mismatch when we processed the + * container-start tokens above. Likewise we can't see WJB_DONE + * from one but not the other. 
So we have two different-type + * containers, or a container and some scalar type, or two + * different scalar types. Sort on the basis of the type code. */ - Assert(ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); - Assert(rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); + Assert(ra != WJB_DONE && ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); + Assert(rb != WJB_DONE && rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); Assert(va.type != vb.type); Assert(va.type != jbvBinary); @@ -852,15 +846,20 @@ JsonbIteratorInit(JsonbContainer *container) * It is our job to expand the jbvBinary representation without bothering them * with it. However, clients should not take it upon themselves to touch array * or Object element/pair buffers, since their element/pair pointers are - * garbage. Also, *val will not be set when returning WJB_END_ARRAY or - * WJB_END_OBJECT, on the assumption that it's only useful to access values - * when recursing in. + * garbage. + * + * *val is not meaningful when the result is WJB_DONE, WJB_END_ARRAY or + * WJB_END_OBJECT. However, we set val->type = jbvNull in those cases, + * so that callers may assume that val->type is always well-defined. */ JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) { if (*it == NULL) + { + val->type = jbvNull; return WJB_DONE; + } /* * When stepping into a nested container, we jump back here to start @@ -898,6 +897,7 @@ recurse: * nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_ARRAY; } @@ -951,6 +951,7 @@ recurse: * of nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_OBJECT; } else @@ -995,8 +996,10 @@ recurse: return WJB_VALUE; } - elog(ERROR, "invalid iterator state"); - return -1; + elog(ERROR, "invalid jsonb iterator state"); + /* satisfy compilers that don't know that elog(ERROR) doesn't return */ + val->type = jbvNull; + return WJB_DONE; } /* diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index d44f8c262ba..a4f8b4faa90 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -21,6 +21,7 @@ #include "commands/extension.h" #include "miscadmin.h" #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" @@ -410,3 +411,21 @@ binary_upgrade_replorigin_advance(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * binary_upgrade_create_conflict_detection_slot + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins. + */ +Datum +binary_upgrade_create_conflict_detection_slot(PG_FUNCTION_ARGS) +{ + CHECK_IS_BINARY_UPGRADE; + + CreateConflictDetectionSlot(); + + ReplicationSlotRelease(); + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index ce6a626eba2..17fbfa9b410 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3798,18 +3798,25 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, List *hashclauses, Selectivity *innerbucketsize) { - List *clauses = list_copy(hashclauses); - List *otherclauses = NIL; - double ndistinct = 1.0; + List *clauses; + List *otherclauses; + double ndistinct; if (list_length(hashclauses) <= 1) - + { /* * Nothing to do for a single clause. Could we employ univariate * extended stat here? 
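Stepping back to the JsonbIteratorNext() change earlier in this group: with val->type now guaranteed to be set (jbvNull at WJB_DONE, WJB_END_ARRAY and WJB_END_OBJECT), callers may inspect it unconditionally. A hedged caller sketch ("process_string" is a hypothetical consumer):

JsonbIterator *it = JsonbIteratorInit(container);
JsonbValue	v;
JsonbIteratorToken tok;

while ((tok = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
{
	/*
	 * Safe even at WJB_END_ARRAY / WJB_END_OBJECT: v.type is jbvNull
	 * there rather than stale garbage, so this test is always
	 * well-defined.
	 */
	if (tok == WJB_ELEM && v.type == jbvString)
		process_string(&v);
}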
*/ return hashclauses; + } + /* "clauses" is the list of hashclauses we've not dealt with yet */ + clauses = list_copy(hashclauses); + /* "otherclauses" holds clauses we are going to return to caller */ + otherclauses = NIL; + /* current estimate of ndistinct */ + ndistinct = 1.0; while (clauses != NIL) { ListCell *lc; @@ -3874,12 +3881,13 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, group_rel = root->simple_rel_array[relid]; } else if (group_relid != relid) - + { /* * Being in the group forming state we don't need other * clauses. */ continue; + } /* * We're going to add the new clause to the varinfos list. We diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 1b0df111717..39dab3e42df 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -84,7 +84,7 @@ tidin(PG_FUNCTION_ARGS) /* * Cope with possibility that unsigned long is wider than BlockNumber, in * which case strtoul will not raise an error for some values that are out - * of the range of BlockNumber. (See similar code in oidin().) + * of the range of BlockNumber. (See similar code in uint32in_subr().) */ #if SIZEOF_LONG > 4 if (cvt != (unsigned long) blockNumber && diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index f7b731825fc..182e8f75db7 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1769,7 +1769,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list - * of parsed nodes from the xmlParseInNodeContext call will be returned + * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * to *parsed_nodes. (It is caller's responsibility to free that.) * * Errors normally result in ereport(ERROR), but if escontext is an @@ -1795,6 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PgXmlErrorContext *xmlerrcxt; volatile xmlParserCtxtPtr ctxt = NULL; volatile xmlDocPtr doc = NULL; + volatile int save_keep_blanks = -1; /* * This step looks annoyingly redundant, but we must do it to have a @@ -1822,7 +1823,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PG_TRY(); { bool parse_as_document = false; - int options; int res_code; size_t count = 0; xmlChar *version = NULL; @@ -1853,18 +1853,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, parse_as_document = true; } - /* - * Select parse options. - * - * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) - * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by - * internal DTD are applied'. As for external DTDs, we try to support - * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really - * happen because xmlPgEntityLoader prevents it. - */ - options = XML_PARSE_NOENT | XML_PARSE_DTDATTR - | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); - /* initialize output parameters */ if (parsed_xmloptiontype != NULL) *parsed_xmloptiontype = parse_as_document ? XMLOPTION_DOCUMENT : @@ -1874,11 +1862,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg, if (parse_as_document) { + int options; + + /* set up parser context used by xmlCtxtReadDoc */ ctxt = xmlNewParserCtxt(); if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* + * Select parse options. 
+ * + * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) + * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined + * by internal DTD are applied'. As for external DTDs, we try to + * support them too (see SQL/XML:2008 GR 10.16.7.e), but that + * doesn't really happen because xmlPgEntityLoader prevents it. + */ + options = XML_PARSE_NOENT | XML_PARSE_DTDATTR + | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); + doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, /* no URL */ "UTF-8", @@ -1900,10 +1903,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } else { - xmlNodePtr root; - xmlNodePtr oldroot PG_USED_FOR_ASSERTS_ONLY; - - /* set up document with empty root node to be the context node */ + /* set up document that xmlParseBalancedChunkMemory will add to */ doc = xmlNewDoc(version); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, @@ -1916,43 +1916,22 @@ xml_parse(text *data, XmlOptionType xmloption_arg, "could not allocate XML document"); doc->standalone = standalone; - root = xmlNewNode(NULL, (const xmlChar *) "content-root"); - if (root == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, - "could not allocate xml node"); - - /* - * This attaches root to doc, so we need not free it separately; - * and there can't yet be any old root to free. - */ - oldroot = xmlDocSetRootElement(doc, root); - Assert(oldroot == NULL); + /* set parse options --- have to do this the ugly way */ + save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 1 : 0); /* allow empty content */ if (*(utf8string + count)) { - xmlNodePtr node_list = NULL; - xmlParserErrors res; - - res = xmlParseInNodeContext(root, - (char *) utf8string + count, - strlen((char *) utf8string + count), - options, - &node_list); - - if (res != XML_ERR_OK || xmlerrcxt->err_occurred) + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, + utf8string + count, + parsed_nodes); + if (res_code != 0 || xmlerrcxt->err_occurred) { - xmlFreeNodeList(node_list); xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); goto fail; } - - if (parsed_nodes != NULL) - *parsed_nodes = node_list; - else - xmlFreeNodeList(node_list); } } @@ -1961,6 +1940,8 @@ fail: } PG_CATCH(); { + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); if (doc != NULL) xmlFreeDoc(doc); if (ctxt != NULL) @@ -1972,6 +1953,9 @@ fail: } PG_END_TRY(); + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); + if (ctxt != NULL) xmlFreeParserCtxt(ctxt); diff --git a/src/backend/utils/cache/evtcache.c b/src/backend/utils/cache/evtcache.c index ce596bf5638..b9d5a5998be 100644 --- a/src/backend/utils/cache/evtcache.c +++ b/src/backend/utils/cache/evtcache.c @@ -78,7 +78,6 @@ BuildEventTriggerCache(void) { HASHCTL ctl; HTAB *cache; - MemoryContext oldcontext; Relation rel; Relation irel; SysScanDesc scan; @@ -110,9 +109,6 @@ BuildEventTriggerCache(void) (Datum) 0); } - /* Switch to correct memory context. */ - oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); - /* Prevent the memory context from being nuked while we're rebuilding. */ EventTriggerCacheState = ETCS_REBUILD_STARTED; @@ -145,6 +141,7 @@ BuildEventTriggerCache(void) bool evttags_isnull; EventTriggerCacheEntry *entry; bool found; + MemoryContext oldcontext; /* Get next tuple. 
*/ tup = systable_getnext_ordered(scan, ForwardScanDirection); @@ -171,6 +168,9 @@ BuildEventTriggerCache(void) else continue; + /* Switch to correct memory context. */ + oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); + /* Allocate new cache item. */ item = palloc0(sizeof(EventTriggerCacheItem)); item->fnoid = form->evtfoid; @@ -188,6 +188,9 @@ BuildEventTriggerCache(void) entry->triggerlist = lappend(entry->triggerlist, item); else entry->triggerlist = list_make1(item); + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); } /* Done with pg_event_trigger scan. */ @@ -195,9 +198,6 @@ BuildEventTriggerCache(void) index_close(irel, AccessShareLock); relation_close(rel, AccessShareLock); - /* Restore previous memory context. */ - MemoryContextSwitchTo(oldcontext); - /* Install new cache. */ EventTriggerCache = cache; @@ -240,6 +240,8 @@ DecodeTextArrayToBitmapset(Datum array) } pfree(elems); + if ((Pointer) arr != DatumGetPointer(array)) + pfree(arr); return bms; } diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 89a1c79e984..6661d2c6b73 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -463,8 +463,7 @@ CompleteCachedPlan(CachedPlanSource *plansource, /* * Save the final parameter types (or other parameter specification data) - * into the source_context, as well as our other parameters. Also save - * the result tuple descriptor. + * into the source_context, as well as our other parameters. */ MemoryContextSwitchTo(source_context); @@ -480,9 +479,25 @@ CompleteCachedPlan(CachedPlanSource *plansource, plansource->parserSetupArg = parserSetupArg; plansource->cursor_options = cursor_options; plansource->fixed_result = fixed_result; - plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + /* + * Also save the result tuple descriptor. PlanCacheComputeResultDesc may + * leak some cruft; normally we just accept that to save a copy step, but + * in USE_VALGRIND mode be tidy by running it in the caller's context. + */ +#ifdef USE_VALGRIND + MemoryContextSwitchTo(oldcxt); + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + if (plansource->resultDesc) + { + MemoryContextSwitchTo(source_context); + plansource->resultDesc = CreateTupleDescCopy(plansource->resultDesc); + MemoryContextSwitchTo(oldcxt); + } +#else + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); MemoryContextSwitchTo(oldcxt); +#endif plansource->is_complete = true; plansource->is_valid = true; @@ -1283,6 +1298,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, CachedPlan *plan = NULL; List *qlist; bool customplan; + ListCell *lc; /* Assert caller is doing things in a sane order */ Assert(plansource->magic == CACHEDPLANSOURCE_MAGIC); @@ -1385,6 +1401,13 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, plan->is_saved = true; } + foreach(lc, plan->stmt_list) + { + PlannedStmt *pstmt = (PlannedStmt *) lfirst(lc); + + pstmt->planOrigin = customplan ? 
PLAN_STMT_CACHE_CUSTOM : PLAN_STMT_CACHE_GENERIC; + } + return plan; } diff --git a/src/backend/utils/cache/ts_cache.c b/src/backend/utils/cache/ts_cache.c index 18cccd778fd..e8ae53238d0 100644 --- a/src/backend/utils/cache/ts_cache.c +++ b/src/backend/utils/cache/ts_cache.c @@ -321,7 +321,9 @@ lookup_ts_dictionary_cache(Oid dictId) /* * Init method runs in dictionary's private memory context, and we - * make sure the options are stored there too + * make sure the options are stored there too. This typically + * results in a small amount of memory leakage, but it's not worth + * complicating the API for tmplinit functions to avoid it. */ oldcontext = MemoryContextSwitchTo(entry->dictCtx); diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index f9aec38a11f..6a347698edf 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -1171,9 +1171,6 @@ load_domaintype_info(TypeCacheEntry *typentry) elog(ERROR, "domain \"%s\" constraint \"%s\" has NULL conbin", NameStr(typTup->typname), NameStr(c->conname)); - /* Convert conbin to C string in caller context */ - constring = TextDatumGetCString(val); - /* Create the DomainConstraintCache object and context if needed */ if (dcc == NULL) { @@ -1189,9 +1186,8 @@ load_domaintype_info(TypeCacheEntry *typentry) dcc->dccRefCount = 0; } - /* Create node trees in DomainConstraintCache's context */ - oldcxt = MemoryContextSwitchTo(dcc->dccContext); - + /* Convert conbin to a node tree, still in caller's context */ + constring = TextDatumGetCString(val); check_expr = (Expr *) stringToNode(constring); /* @@ -1206,10 +1202,13 @@ load_domaintype_info(TypeCacheEntry *typentry) */ check_expr = expression_planner(check_expr); + /* Create only the minimally needed stuff in dccContext */ + oldcxt = MemoryContextSwitchTo(dcc->dccContext); + r = makeNode(DomainConstraintState); r->constrainttype = DOM_CONSTRAINT_CHECK; r->name = pstrdup(NameStr(c->conname)); - r->check_expr = check_expr; + r->check_expr = copyObject(check_expr); r->check_exprstate = NULL; MemoryContextSwitchTo(oldcxt); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1ad155d446e..81da03629f0 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -22,10 +22,11 @@ * lookup key's hash value as a partition number --- this will work because * of the way calc_bucket() maps hash values to bucket numbers. * - * For hash tables in shared memory, the memory allocator function should - * match malloc's semantics of returning NULL on failure. For hash tables - * in local memory, we typically use palloc() which will throw error on - * failure. The code in this file has to cope with both cases. + * The memory allocator function should match malloc's semantics of returning + * NULL on failure. (This is essential for hash tables in shared memory. + * For hash tables in local memory, we used to use palloc() which will throw + * error on failure; but we no longer do, so it's untested whether this + * module will still cope with that behavior.) 
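A sketch of an allocator satisfying the malloc-like contract this comment now requires, i.e. returning NULL on failure rather than throwing; MemoryContextAllocExtended with MCXT_ALLOC_NO_OOM is the standard way to get that behavior from a memory context (this mirrors what dynahash's default allocator does, though the default itself is not shown in this hunk):

static void *
no_oom_alloc(Size size)
{
	/* returns NULL on allocation failure instead of ereport(ERROR) */
	return MemoryContextAllocExtended(CurrentDynaHashCxt, size,
									  MCXT_ALLOC_NO_OOM);
}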
* * dynahash.c provides support for these types of lookup keys: * @@ -98,6 +99,7 @@ #include "access/xact.h" #include "common/hashfn.h" +#include "lib/ilist.h" #include "port/pg_bitutils.h" #include "storage/shmem.h" #include "storage/spin.h" @@ -195,6 +197,7 @@ struct HASHHDR long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ int nelem_alloc; /* number of entries to allocate at once */ + bool isfixed; /* if true, don't enlarge */ #ifdef HASH_STATISTICS @@ -227,7 +230,6 @@ struct HTAB MemoryContext hcxt; /* memory context if default allocator used */ char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ - bool isfixed; /* if true, don't enlarge */ /* freezing a shared table isn't allowed, so we can keep state here */ bool frozen; /* true = no more inserts allowed */ @@ -236,6 +238,16 @@ struct HTAB Size keysize; /* hash key length in bytes */ long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ + + /* + * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of + * all the element blocks they have allocated. This pacifies Valgrind, + * which would otherwise often claim that the element blocks are "possibly + * lost" for lack of any non-interior pointers to their starts. + */ +#ifdef USE_VALGRIND + slist_head element_blocks; +#endif }; /* @@ -618,8 +630,10 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags) } } + /* Set isfixed if requested, but not till after we build initial entries */ if (flags & HASH_FIXED_SIZE) - hashp->isfixed = true; + hctl->isfixed = true; + return hashp; } @@ -644,6 +658,8 @@ hdefault(HTAB *hashp) hctl->ssize = DEF_SEGSIZE; hctl->sshift = DEF_SEGSIZE_SHIFT; + hctl->isfixed = false; /* can be enlarged */ + #ifdef HASH_STATISTICS hctl->accesses = hctl->collisions = 0; #endif @@ -1708,23 +1724,51 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx) { HASHHDR *hctl = hashp->hctl; Size elementSize; + Size requestSize; + char *allocedBlock; HASHELEMENT *firstElement; HASHELEMENT *tmpElement; HASHELEMENT *prevElement; int i; - if (hashp->isfixed) + if (hctl->isfixed) return false; /* Each element has a HASHELEMENT header plus user data. */ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); + requestSize = nelem * elementSize; + + /* Add space for slist_node list link if we need one. */ +#ifdef USE_VALGRIND + if (!hashp->isshared) + requestSize += MAXALIGN(sizeof(slist_node)); +#endif + + /* Allocate the memory. */ CurrentDynaHashCxt = hashp->hcxt; - firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); + allocedBlock = hashp->alloc(requestSize); - if (!firstElement) + if (!allocedBlock) return false; + /* + * If USE_VALGRIND, each allocated block of elements of a non-shared + * hashtable is chained into a list, so that Valgrind won't think it's + * been leaked. 
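The resulting layout of one non-shared element block in a USE_VALGRIND build, sketched below; the list link occupies one MAXALIGN'd slot at the front, and firstElement starts just past it (shared hashtables skip the link, so there firstElement == allocedBlock):

/*
 *   allocedBlock                         firstElement
 *   |                                    |
 *   v                                    v
 *   +-----------------------------------+---------------------------+
 *   | slist_node link,                  | HASHELEMENTs plus user    |
 *   | MAXALIGN(sizeof(slist_node)) wide | data, nelem entries       |
 *   +-----------------------------------+---------------------------+
 */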
+ */ +#ifdef USE_VALGRIND + if (hashp->isshared) + firstElement = (HASHELEMENT *) allocedBlock; + else + { + slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock); + firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node))); + } +#else + firstElement = (HASHELEMENT *) allocedBlock; +#endif + /* prepare to link all the new entries into the freelist */ prevElement = NULL; tmpElement = firstElement; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 43b4dbccc3d..65d8cbfaed5 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1183,7 +1183,6 @@ UnlinkLockFiles(int status, Datum arg) /* Should we complain if the unlink fails? */ } /* Since we're about to exit, no need to reclaim storage */ - lock_files = NIL; /* * Lock file removal should always be the last externally visible action diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index c86ceefda94..641e535a73c 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -417,12 +417,11 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); ctype = TextDatumGetCString(datum); - if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) - ereport(FATAL, - (errmsg("database locale is incompatible with operating system"), - errdetail("The database was initialized with LC_COLLATE \"%s\", " - " which is not recognized by setlocale().", collate), - errhint("Recreate the database with another locale or install the missing locale."))); + /* + * Historically, we set LC_COLLATE from datcollate, as well. That's no + * longer necessary because all collation behavior is handled through + * pg_locale_t. + */ if (pg_perm_setlocale(LC_CTYPE, ctype) == NULL) ereport(FATAL, diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 667df448732..e404c345e6e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -249,6 +249,7 @@ static void reapply_stacked_values(struct config_generic *variable, const char *curvalue, GucContext curscontext, GucSource cursource, Oid cursrole); +static void free_placeholder(struct config_string *pHolder); static bool validate_option_array_item(const char *name, const char *value, bool skipIfNoPermissions); static void write_auto_conf_file(int fd, const char *filename, ConfigVariable *head); @@ -4722,8 +4723,13 @@ AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt) * the config file cannot cause postmaster start to fail, so we * don't have to be too tense about possibly installing a bad * value.) + * + * As an exception, we skip this check if this is a RESET command + * for an unknown custom GUC, else there'd be no way for users to + * remove such settings with reserved prefixes. */ - (void) assignable_custom_variable_name(name, false, ERROR); + if (value || !valid_custom_variable_name(name)) + (void) assignable_custom_variable_name(name, false, ERROR); } /* @@ -5018,16 +5024,8 @@ define_custom_variable(struct config_generic *variable) set_config_sourcefile(name, pHolder->gen.sourcefile, pHolder->gen.sourceline); - /* - * Free up as much as we conveniently can of the placeholder structure. - * (This neglects any stack items, so it's possible for some memory to be - * leaked. Since this can only happen once per session per variable, it - * doesn't seem worth spending much code on.)
- */ - set_string_field(pHolder, pHolder->variable, NULL); - set_string_field(pHolder, &pHolder->reset_val, NULL); - - guc_free(pHolder); + /* Now we can free the no-longer-referenced placeholder variable */ + free_placeholder(pHolder); } /* @@ -5127,6 +5125,25 @@ reapply_stacked_values(struct config_generic *variable, } /* + * Free up a no-longer-referenced placeholder GUC variable. + * + * This neglects any stack items, so it's possible for some memory to be + * leaked. Since this can only happen once per session per variable, it + * doesn't seem worth spending much code on. + */ +static void +free_placeholder(struct config_string *pHolder) +{ + /* Placeholders are always STRING type, so free their values */ + Assert(pHolder->gen.vartype == PGC_STRING); + set_string_field(pHolder, pHolder->variable, NULL); + set_string_field(pHolder, &pHolder->reset_val, NULL); + + guc_free(unconstify(char *, pHolder->gen.name)); + guc_free(pHolder); +} + +/* * Functions for extensions to call to define their custom GUC variables. */ void @@ -5286,9 +5303,7 @@ MarkGUCPrefixReserved(const char *className) /* * Check for existing placeholders. We must actually remove invalid - * placeholders, else future parallel worker startups will fail. (We - * don't bother trying to free associated memory, since this shouldn't - * happen often.) + * placeholders, else future parallel worker startups will fail. */ hash_seq_init(&status, guc_hashtab); while ((hentry = (GUCHashEntry *) hash_seq_search(&status)) != NULL) @@ -5312,6 +5327,8 @@ MarkGUCPrefixReserved(const char *className) NULL); /* Remove it from any lists it's in, too */ RemoveGUCFromLists(var); + /* And free it */ + free_placeholder((struct config_string *) var); } } @@ -6711,6 +6728,7 @@ validate_option_array_item(const char *name, const char *value, { struct config_generic *gconf; + bool reset_custom; /* * There are three cases to consider: @@ -6729,16 +6747,21 @@ validate_option_array_item(const char *name, const char *value, * it's assumed to be fully validated.) * * name is not known and can't be created as a placeholder. Throw error, - * unless skipIfNoPermissions is true, in which case return false. + * unless skipIfNoPermissions or reset_custom is true. If reset_custom is + * true, this is a RESET or RESET ALL operation for an unknown custom GUC + * with a reserved prefix, in which case we want to fall through to the + * placeholder case described in the preceding paragraph (else there'd be + * no way for users to remove them). Otherwise, return false. */ - gconf = find_option(name, true, skipIfNoPermissions, ERROR); - if (!gconf) + reset_custom = (!value && valid_custom_variable_name(name)); + gconf = find_option(name, true, skipIfNoPermissions || reset_custom, ERROR); + if (!gconf && !reset_custom) { /* not known, failed to make a placeholder */ return false; } - if (gconf->flags & GUC_CUSTOM_PLACEHOLDER) + if (!gconf || gconf->flags & GUC_CUSTOM_PLACEHOLDER) { /* * We cannot do any meaningful check on the value, so only permissions diff --git a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index e08b26e8c14..4df25944deb 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -100,6 +100,17 @@ static void flush_ps_display(void); static int save_argc; static char **save_argv; +/* + * Valgrind seems not to consider the global "environ" variable as a valid + * root pointer; so when we allocate a new environment array, it claims that + * data is leaked. 
To fix that, keep our own statically-allocated copy of the + * pointer. (Oddly, this doesn't seem to be a problem for "argv".) + */ +#if defined(PS_USE_CLOBBER_ARGV) && defined(USE_VALGRIND) +extern char **ps_status_new_environ; +char **ps_status_new_environ; +#endif + /* * Call this early in startup to save the original argc/argv values. @@ -206,6 +217,11 @@ save_ps_display_args(int argc, char **argv) } new_environ[i] = NULL; environ = new_environ; + + /* See notes about Valgrind above. */ +#ifdef USE_VALGRIND + ps_status_new_environ = new_environ; +#endif } /* diff --git a/src/backend/utils/mmgr/alignedalloc.c b/src/backend/utils/mmgr/alignedalloc.c index 7eea695de62..b1be7426914 100644 --- a/src/backend/utils/mmgr/alignedalloc.c +++ b/src/backend/utils/mmgr/alignedalloc.c @@ -45,6 +45,15 @@ AlignedAllocFree(void *pointer) GetMemoryChunkContext(unaligned)->name, chunk); #endif + /* + * Create a dummy vchunk covering the start of the unaligned chunk, but + * not overlapping the aligned chunk. This will be freed while pfree'ing + * the unaligned chunk, keeping Valgrind happy. Then when we return to + * the outer pfree, that will clean up the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(GetMemoryChunkContext(unaligned), unaligned, + (char *) pointer - (char *) unaligned); + /* Recursively pfree the unaligned chunk */ pfree(unaligned); } @@ -123,6 +132,15 @@ AlignedAllocRealloc(void *pointer, Size size, int flags) VALGRIND_MAKE_MEM_DEFINED(pointer, old_size); memcpy(newptr, pointer, Min(size, old_size)); + /* + * Create a dummy vchunk covering the start of the old unaligned chunk, + * but not overlapping the aligned chunk. This will be freed while + * pfree'ing the old unaligned chunk, keeping Valgrind happy. Then when + * we return to repalloc, it will move the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(ctx, unaligned, + (char *) pointer - (char *) unaligned); + pfree(unaligned); return newptr; diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 666ecd8f78d..9ef109ca586 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -103,6 +103,8 @@ #define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData)) #define ALLOC_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(AllocSetContext)) + \ + ALLOC_BLOCKHDRSZ) typedef struct AllocBlockData *AllocBlock; /* forward reference */ @@ -458,6 +460,21 @@ AllocSetContextCreateInternal(MemoryContext parent, * we'd leak the header/initial block if we ereport in this stretch. */ + /* Create a vpool associated with the context */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + + /* + * Create a vchunk covering both the AllocSetContext struct and the keeper + * block's header. (Perhaps it would be more sensible for these to be two + * separate vchunks, but doing that seems to tickle bugs in some versions + * of Valgrind.) We must have these vchunks, and also a vchunk for each + * subsequently-added block header, so that Valgrind considers the + * pointers within them while checking for leaked memory. Note that + * Valgrind doesn't distinguish between these vchunks and those created by + * mcxt.c for the user-accessible-data chunks we allocate. 
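Taken together, the Valgrind client requests used in this file map onto the context life cycle like so (all of them are no-ops unless built with USE_VALGRIND; the macro spellings below appear in the surrounding hunks):

VALGRIND_CREATE_MEMPOOL(set, 0, false);				/* context created */
VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ);	/* block header born */
VALGRIND_MEMPOOL_CHANGE(set, block, newblock, ALLOC_BLOCKHDRSZ);	/* block moved by realloc */
VALGRIND_MEMPOOL_FREE(set, block);					/* block freed */
VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ);	/* context reset; keeper kept */
VALGRIND_DESTROY_MEMPOOL(set);						/* context deleted */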
+ */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + /* Fill in the initial block's block header */ block = KeeperBlock(set); block->aset = set; @@ -585,6 +602,14 @@ AllocSetReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* + * We need to free the block header's vchunk explicitly, although + * the user-data vchunks within will go away in the TRIM below. + * Otherwise Valgrind complains about leaked allocations. + */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } block = next; @@ -592,6 +617,14 @@ AllocSetReset(MemoryContext context) Assert(context->mem_allocated == keepersize); + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the AllocSetContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; } @@ -648,6 +681,9 @@ AllocSetDelete(MemoryContext context) freelist->first_free = (AllocSetContext *) oldset->header.nextchild; freelist->num_free--; + /* Destroy the context's vpool --- see notes below */ + VALGRIND_DESTROY_MEMPOOL(oldset); + /* All that remains is to free the header/initial block */ free(oldset); } @@ -675,13 +711,24 @@ AllocSetDelete(MemoryContext context) #endif if (!IsKeeperBlock(set, block)) + { + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); free(block); + } block = next; } Assert(context->mem_allocated == keepersize); + /* + * Destroy the vpool. We don't seem to need to explicitly free the + * initial block's header vchunk, nor any user-data vchunks that Valgrind + * still knows about; they'll all go away automatically. + */ + VALGRIND_DESTROY_MEMPOOL(set); + /* Finally, free the context header, including the keeper block */ free(set); } @@ -716,6 +763,9 @@ AllocSetAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -922,6 +972,9 @@ AllocSetAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -1104,6 +1157,10 @@ AllocSetFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } else @@ -1184,6 +1241,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) * realloc() to make the containing block bigger, or smaller, with * minimum space wastage. 
*/ + AllocBlock newblock; Size chksize; Size blksize; Size oldblksize; @@ -1223,14 +1281,21 @@ AllocSetRealloc(void *pointer, Size size, int flags) blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; oldblksize = block->endptr - ((char *) block); - block = (AllocBlock) realloc(block, blksize); - if (block == NULL) + newblock = (AllocBlock) realloc(block, blksize); + if (newblock == NULL) { /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return MemoryContextAllocationFailure(&set->header, size, flags); } + /* + * Move the block-header vchunk explicitly. (mcxt.c will take care of + * moving the vchunk for the user data.) + */ + VALGRIND_MEMPOOL_CHANGE(set, block, newblock, ALLOC_BLOCKHDRSZ); + block = newblock; + /* updated separately, not to underflow when (oldblksize > blksize) */ set->header.mem_allocated -= oldblksize; set->header.mem_allocated += blksize; @@ -1294,7 +1359,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) /* Ensure any padding bytes are marked NOACCESS. */ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size); - /* Disallow access to the chunk header . */ + /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return pointer; diff --git a/src/backend/utils/mmgr/bump.c b/src/backend/utils/mmgr/bump.c index f7a37d1b3e8..2805d55a2ec 100644 --- a/src/backend/utils/mmgr/bump.c +++ b/src/backend/utils/mmgr/bump.c @@ -45,7 +45,9 @@ #include "utils/memutils_memorychunk.h" #include "utils/memutils_internal.h" -#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(BumpContext)) + \ + Bump_BLOCKHDRSZ) /* No chunk header unless built with MEMORY_CONTEXT_CHECKING */ #ifdef MEMORY_CONTEXT_CHECKING @@ -189,6 +191,12 @@ BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header and initial block if we ereport in this stretch. */ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the BumpContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -262,6 +270,14 @@ BumpReset(MemoryContext context) BumpBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the BumpContext and keeper-block + * header. This gets rid of the vchunks for whatever user data is getting + * discarded by the context reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; @@ -279,6 +295,10 @@ BumpDelete(MemoryContext context) { /* Reset to release all releasable BumpBlocks */ BumpReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -318,6 +338,9 @@ BumpAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* the block is completely full */ @@ -455,6 +478,9 @@ BumpAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -606,6 +632,9 @@ BumpBlockFree(BumpContext *set, BumpBlock *block) wipe_mem(block, ((char *) block->endptr - (char *) block)); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c index 18679ad4f1e..cfafc9bf082 100644 --- a/src/backend/utils/mmgr/generation.c +++ b/src/backend/utils/mmgr/generation.c @@ -45,6 +45,8 @@ #define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock)) #define Generation_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(GenerationContext)) + \ + Generation_BLOCKHDRSZ) #define Generation_CHUNK_FRACTION 8 @@ -221,6 +223,12 @@ GenerationContextCreate(MemoryContext parent, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header if we ereport in this stretch. */ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the GenerationContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -309,6 +317,14 @@ GenerationReset(MemoryContext context) GenerationBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the GenerationContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* set it so new allocations to make use of the keeper block */ set->block = KeeperBlock(set); @@ -329,6 +345,10 @@ GenerationDelete(MemoryContext context) { /* Reset to release all releasable GenerationBlocks */ GenerationReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -365,6 +385,9 @@ GenerationAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* block with a single (used) chunk */ @@ -487,6 +510,9 @@ GenerationAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -677,6 +703,9 @@ GenerationBlockFree(GenerationContext *set, GenerationBlock *block) wipe_mem(block, block->blksize); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 15fa4d0a55e..47fd774c7d2 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -8,6 +8,23 @@ * context-type-specific operations via the function pointers in a * context's MemoryContextMethods struct. * + * A note about Valgrind support: when USE_VALGRIND is defined, we provide + * support for memory leak tracking at the allocation-unit level. Valgrind + * does leak detection by tracking allocated "chunks", which can be grouped + * into "pools". The "chunk" terminology is overloaded, since we use that + * word for our allocation units, and it's sometimes important to distinguish + * those from the Valgrind objects that describe them. To reduce confusion, + * let's use the terms "vchunk" and "vpool" for the Valgrind objects. + * + * We use a separate vpool for each memory context. The context-type-specific + * code is responsible for creating and deleting the vpools, and also for + * creating vchunks to cover its management data structures such as block + * headers. (There must be a vchunk that includes every pointer we want + * Valgrind to consider for leak-tracking purposes.) This module creates + * and deletes the vchunks that cover the caller-visible allocated chunks. + * However, the context-type-specific code must handle cleaning up those + * vchunks too during memory context reset operations. + * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -418,8 +435,6 @@ MemoryContextResetOnly(MemoryContext context) context->methods->reset(context); context->isReset = true; - VALGRIND_DESTROY_MEMPOOL(context); - VALGRIND_CREATE_MEMPOOL(context, 0, false); } } @@ -526,8 +541,6 @@ MemoryContextDeleteOnly(MemoryContext context) context->ident = NULL; context->methods->delete_context(context); - - VALGRIND_DESTROY_MEMPOOL(context); } /* @@ -560,9 +573,7 @@ MemoryContextDeleteChildren(MemoryContext context) * the specified context, since that means it will automatically be freed * when no longer needed. 
* - * There is no API for deregistering a callback once registered. If you - * want it to not do anything anymore, adjust the state pointed to by its - * "arg" to indicate that. + * Note that callers can assume this cannot fail. */ void MemoryContextRegisterResetCallback(MemoryContext context, @@ -578,6 +589,41 @@ MemoryContextRegisterResetCallback(MemoryContext context, } /* + * MemoryContextUnregisterResetCallback + * Undo the effects of MemoryContextRegisterResetCallback. + * + * This can be used if a callback's effects are no longer required + * at some point before the context has been reset/deleted. It is the + * caller's responsibility to pfree the callback struct (if needed). + * + * An assertion failure occurs if the callback was not registered. + * We could alternatively define that case as a no-op, but that seems too + * likely to mask programming errors such as passing the wrong context. + */ +void +MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb) +{ + MemoryContextCallback *prev, + *cur; + + Assert(MemoryContextIsValid(context)); + + for (prev = NULL, cur = context->reset_cbs; cur != NULL; + prev = cur, cur = cur->next) + { + if (cur != cb) + continue; + if (prev) + prev->next = cur->next; + else + context->reset_cbs = cur->next; + return; + } + Assert(false); +} + +/* * MemoryContextCallResetCallbacks * Internal function to call all registered callbacks for context. */ @@ -1137,8 +1183,6 @@ MemoryContextCreate(MemoryContext node, node->nextchild = NULL; node->allowInCritSection = false; } - - VALGRIND_CREATE_MEMPOOL(node, 0, false); } /* @@ -1421,7 +1465,13 @@ MemoryContextAllocAligned(MemoryContext context, void *unaligned; void *aligned; - /* wouldn't make much sense to waste that much space */ + /* + * Restrict alignto to ensure that it can fit into the "value" field of + * the redirection MemoryChunk, and that the distance back to the start of + * the unaligned chunk will fit into the space available for that. This + * isn't a limitation in practice, since it wouldn't make much sense to + * waste that much space. + */ Assert(alignto < (128 * 1024 * 1024)); /* ensure alignto is a power of 2 */ @@ -1458,10 +1508,15 @@ MemoryContextAllocAligned(MemoryContext context, alloc_size += 1; #endif - /* perform the actual allocation */ - unaligned = MemoryContextAllocExtended(context, alloc_size, flags); + /* + * Perform the actual allocation, but do not pass down MCXT_ALLOC_ZERO. + * This ensures that wasted bytes beyond the aligned chunk do not become + * DEFINED. + */ + unaligned = MemoryContextAllocExtended(context, alloc_size, + flags & ~MCXT_ALLOC_ZERO); - /* set the aligned pointer */ + /* compute the aligned pointer */ aligned = (void *) TYPEALIGN(alignto, (char *) unaligned + sizeof(MemoryChunk)); @@ -1489,12 +1544,23 @@ MemoryContextAllocAligned(MemoryContext context, set_sentinel(aligned, size); #endif - /* Mark the bytes before the redirection header as noaccess */ - VALGRIND_MAKE_MEM_NOACCESS(unaligned, - (char *) alignedchunk - (char *) unaligned); + /* + * MemoryContextAllocExtended marked the whole unaligned chunk as a + * vchunk. Undo that, instead making just the aligned chunk be a vchunk. + * This prevents Valgrind from complaining that the vchunk is possibly + * leaked, since only pointers to the aligned chunk will exist. 
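+	 *
+	 * A rough editorial picture of the chunk layout (widths not to scale):
+	 *
+	 *	unaligned                       aligned
+	 *	|<- padding ->|<- MemoryChunk ->|<------ size ------>|<- waste ->|
+	 *
+	 * Only the "size" span keeps a vchunk, so Valgrind tracks the
+	 * allocation by the pointer callers actually hold.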
+ * + * After these calls, the aligned chunk will be marked UNDEFINED, and all + * the rest of the unaligned chunk (the redirection chunk header, the + * padding bytes before it, and any wasted trailing bytes) will be marked + * NOACCESS, which is what we want. + */ + VALGRIND_MEMPOOL_FREE(context, unaligned); + VALGRIND_MEMPOOL_ALLOC(context, aligned, size); - /* Disallow access to the redirection chunk header. */ - VALGRIND_MAKE_MEM_NOACCESS(alignedchunk, sizeof(MemoryChunk)); + /* Now zero (and make DEFINED) just the aligned chunk, if requested */ + if ((flags & MCXT_ALLOC_ZERO) != 0) + MemSetAligned(aligned, 0, size); return aligned; } @@ -1528,16 +1594,12 @@ void pfree(void *pointer) { #ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); MemoryContext context = GetMemoryChunkContext(pointer); #endif MCXT_METHOD(pointer, free_p) (pointer); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_FREE(context, pointer); -#endif + VALGRIND_MEMPOOL_FREE(context, pointer); } /* @@ -1547,9 +1609,6 @@ pfree(void *pointer) void * repalloc(void *pointer, Size size) { -#ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); -#endif #if defined(USE_ASSERT_CHECKING) || defined(USE_VALGRIND) MemoryContext context = GetMemoryChunkContext(pointer); #endif @@ -1572,10 +1631,7 @@ repalloc(void *pointer, Size size) */ ret = MCXT_METHOD(pointer, realloc) (pointer, size, 0); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); -#endif + VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); return ret; } diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c index d32c0d318fb..0e35abcf5a0 100644 --- a/src/backend/utils/mmgr/slab.c +++ b/src/backend/utils/mmgr/slab.c @@ -377,6 +377,11 @@ SlabContextCreate(MemoryContext parent, * we'd leak the header if we ereport in this stretch. */ + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(slab, 0, false); + /* This vchunk covers the SlabContext only */ + VALGRIND_MEMPOOL_ALLOC(slab, slab, sizeof(SlabContext)); + /* Fill in SlabContext-specific header fields */ slab->chunkSize = (uint32) chunkSize; slab->fullChunkSize = (uint32) fullChunkSize; @@ -451,6 +456,10 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } @@ -467,11 +476,23 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the SlabContext. This gets rid of + * the vchunks for whatever user data is getting discarded by the context + * reset. 
+	 */
+	VALGRIND_MEMPOOL_TRIM(slab, slab, sizeof(SlabContext));
+
	slab->curBlocklistIndex = 0;

	Assert(context->mem_allocated == 0);
@@ -486,6 +507,10 @@ SlabDelete(MemoryContext context)
{
	/* Reset to release all the SlabBlocks */
	SlabReset(context);
+
+	/* Destroy the vpool -- see notes in aset.c */
+	VALGRIND_DESTROY_MEMPOOL(context);
+
	/* And free the context header */
	free(context);
}
@@ -567,6 +592,9 @@ SlabAllocFromNewBlock(MemoryContext context, Size size, int flags)
	if (unlikely(block == NULL))
		return MemoryContextAllocationFailure(context, size, flags);

+	/* Make a vchunk covering the new block's header */
+	VALGRIND_MEMPOOL_ALLOC(slab, block, Slab_BLOCKHDRSZ);
+
	block->slab = slab;
	context->mem_allocated += slab->blockSize;
@@ -795,6 +823,10 @@ SlabFree(void *pointer)
#ifdef CLOBBER_FREED_MEMORY
	wipe_mem(block, slab->blockSize);
#endif
+
+	/* As in aset.c, free block-header vchunks explicitly */
+	VALGRIND_MEMPOOL_FREE(slab, block);
+
	free(block);
	slab->header.mem_allocated -= slab->blockSize;
}
diff --git a/src/bin/initdb/Makefile b/src/bin/initdb/Makefile
index 997e0a013e9..c0470efda92 100644
--- a/src/bin/initdb/Makefile
+++ b/src/bin/initdb/Makefile
@@ -20,7 +20,7 @@ include $(top_builddir)/src/Makefile.global
# from libpq, else we have risks of version skew if we run with a libpq
# shared library from a different PG version.  Define
# USE_PRIVATE_ENCODING_FUNCS to ensure that that happens.
-override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS)
+override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS) $(ICU_CFLAGS)

# We need libpq only because fe_utils does.
LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS)
diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c
index 025b893a41e..3986882f042 100644
--- a/src/bin/pg_basebackup/pg_createsubscriber.c
+++ b/src/bin/pg_basebackup/pg_createsubscriber.c
@@ -1250,8 +1250,17 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c
	appendPQExpBufferStr(recoveryconfcontents, "recovery_target = ''\n");
	appendPQExpBufferStr(recoveryconfcontents,
						 "recovery_target_timeline = 'latest'\n");
+
+	/*
+	 * Set recovery_target_inclusive = false to avoid reapplying the
+	 * transaction committed at 'lsn' after the subscription is enabled.
+	 * This is because the provided 'lsn' is also used as the replication
+	 * start point for the subscription.  So, the server can send the
+	 * transaction committed at that 'lsn' after replication is started,
+	 * which can lead to applying the same transaction twice if we keep
+	 * recovery_target_inclusive = true.
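+	 *
+	 * For illustration only, with a made-up consistent point of 0/1234568
+	 * (assuming that LSN is also written into recovery_target_lsn elsewhere
+	 * in this function), the generated settings amount to:
+	 *
+	 *		recovery_target_lsn = '0/1234568'
+	 *		recovery_target_inclusive = false
+	 *		recovery_target_action = 'promote'
+	 *
+	 * so recovery stops just short of that commit, and the subscription,
+	 * which starts replication at the same LSN, applies it exactly once.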
+ */ appendPQExpBufferStr(recoveryconfcontents, - "recovery_target_inclusive = true\n"); + "recovery_target_inclusive = false\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_action = promote\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_name = ''\n"); diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index aa1589e3331..a1976fae607 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -17,6 +17,7 @@ #include <ctype.h> +#include "catalog/pg_am_d.h" #include "catalog/pg_class_d.h" #include "catalog/pg_collation_d.h" #include "catalog/pg_extension_d.h" @@ -945,6 +946,24 @@ findOprByOid(Oid oid) } /* + * findAccessMethodByOid + * finds the DumpableObject for the access method with the given oid + * returns NULL if not found + */ +AccessMethodInfo * +findAccessMethodByOid(Oid oid) +{ + CatalogId catId; + DumpableObject *dobj; + + catId.tableoid = AccessMethodRelationId; + catId.oid = oid; + dobj = findObjectByCatalogId(catId); + Assert(dobj == NULL || dobj->objType == DO_ACCESS_METHOD); + return (AccessMethodInfo *) dobj; +} + +/* * findCollationByOid * finds the DumpableObject for the collation with the given oid * returns NULL if not found diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build index 4a4ebbd8ec9..a2233b0a1b4 100644 --- a/src/bin/pg_dump/meson.build +++ b/src/bin/pg_dump/meson.build @@ -102,7 +102,6 @@ tests += { 't/003_pg_dump_with_server.pl', 't/004_pg_dump_parallel.pl', 't/005_pg_dump_filterfile.pl', - 't/006_pg_dumpall.pl', 't/010_dump_connstr.pl', ], }, diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c index 5974d6706fd..086adcdc502 100644 --- a/src/bin/pg_dump/parallel.c +++ b/src/bin/pg_dump/parallel.c @@ -334,16 +334,6 @@ on_exit_close_archive(Archive *AHX) } /* - * When pg_restore restores multiple databases, then update already added entry - * into array for cleanup. - */ -void -replace_on_exit_close_archive(Archive *AHX) -{ - shutdown_info.AHX = AHX; -} - -/* * on_exit_nicely handler for shutting down database connections and * worker processes cleanly. 
*/ diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index af0007fb6d2..4ebef1e8644 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -308,7 +308,7 @@ extern void SetArchiveOptions(Archive *AH, DumpOptions *dopt, RestoreOptions *ro extern void ProcessArchiveRestoreOptions(Archive *AHX); -extern void RestoreArchive(Archive *AHX, bool append_data); +extern void RestoreArchive(Archive *AHX); /* Open an existing archive */ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index 197c1295d93..dce88f040ac 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -31,6 +31,8 @@ #endif #include "catalog/pg_class_d.h" +#include "catalog/pg_largeobject_metadata_d.h" +#include "catalog/pg_shdepend_d.h" #include "common/string.h" #include "compress_io.h" #include "dumputils.h" @@ -85,7 +87,7 @@ static int RestoringToDB(ArchiveHandle *AH); static void dump_lo_buf(ArchiveHandle *AH); static void dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim); static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, bool append_data); + const pg_compress_specification compression_spec); static CompressFileHandle *SaveOutput(ArchiveHandle *AH); static void RestoreOutput(ArchiveHandle *AH, CompressFileHandle *savedOutput); @@ -337,14 +339,9 @@ ProcessArchiveRestoreOptions(Archive *AHX) StrictNamesCheck(ropt); } -/* - * RestoreArchive - * - * If append_data is set, then append data into file as we are restoring dump - * of multiple databases which was taken by pg_dumpall. - */ +/* Public */ void -RestoreArchive(Archive *AHX, bool append_data) +RestoreArchive(Archive *AHX) { ArchiveHandle *AH = (ArchiveHandle *) AHX; RestoreOptions *ropt = AH->public.ropt; @@ -461,7 +458,7 @@ RestoreArchive(Archive *AHX, bool append_data) */ sav = SaveOutput(AH); if (ropt->filename || ropt->compression_spec.algorithm != PG_COMPRESSION_NONE) - SetOutput(AH, ropt->filename, ropt->compression_spec, append_data); + SetOutput(AH, ropt->filename, ropt->compression_spec); ahprintf(AH, "--\n-- PostgreSQL database dump\n--\n\n"); @@ -1300,7 +1297,7 @@ PrintTOCSummary(Archive *AHX) sav = SaveOutput(AH); if (ropt->filename) - SetOutput(AH, ropt->filename, out_compression_spec, false); + SetOutput(AH, ropt->filename, out_compression_spec); if (strftime(stamp_str, sizeof(stamp_str), PGDUMP_STRFTIME_FMT, localtime(&AH->createDate)) == 0) @@ -1679,8 +1676,7 @@ archprintf(Archive *AH, const char *fmt,...) static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, - bool append_data) + const pg_compress_specification compression_spec) { CompressFileHandle *CFH; const char *mode; @@ -1700,7 +1696,7 @@ SetOutput(ArchiveHandle *AH, const char *filename, else fn = fileno(stdout); - if (append_data || AH->mode == archModeAppend) + if (AH->mode == archModeAppend) mode = PG_BINARY_A; else mode = PG_BINARY_W; @@ -2974,6 +2970,19 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) int res = REQ_SCHEMA | REQ_DATA; RestoreOptions *ropt = AH->public.ropt; + /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. 
We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 && + strcmp(te->desc, "TABLE DATA") == 0 && + (te->catalogId.oid == LargeObjectMetadataRelationId || + te->catalogId.oid == SharedDependRelationId)) + return REQ_DATA; + /* These items are treated specially */ if (strcmp(te->desc, "ENCODING") == 0 || strcmp(te->desc, "STDSTRINGS") == 0 || diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 365073b3eae..325b53fc9bd 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -394,7 +394,6 @@ struct _tocEntry extern int parallel_restore(ArchiveHandle *AH, TocEntry *te); extern void on_exit_close_archive(Archive *AHX); -extern void replace_on_exit_close_archive(Archive *AHX); extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *fmt,...) pg_attribute_printf(2, 3); diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index d94d0de2a5d..b5ba3b46dd9 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -826,7 +826,7 @@ _CloseArchive(ArchiveHandle *AH) savVerbose = AH->public.verbose; AH->public.verbose = 0; - RestoreArchive((Archive *) AH, false); + RestoreArchive((Archive *) AH); SetArchiveOptions((Archive *) AH, savDopt, savRopt); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 1937997ea67..f3a353a61a5 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -47,10 +47,13 @@ #include "catalog/pg_authid_d.h" #include "catalog/pg_cast_d.h" #include "catalog/pg_class_d.h" +#include "catalog/pg_constraint_d.h" #include "catalog/pg_default_acl_d.h" #include "catalog/pg_largeobject_d.h" +#include "catalog/pg_largeobject_metadata_d.h" #include "catalog/pg_proc_d.h" #include "catalog/pg_publication_d.h" +#include "catalog/pg_shdepend_d.h" #include "catalog/pg_subscription_d.h" #include "catalog/pg_type_d.h" #include "common/connect.h" @@ -209,6 +212,12 @@ static int nbinaryUpgradeClassOids = 0; static SequenceItem *sequences = NULL; static int nsequences = 0; +/* + * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use + * as a dependency for pg_shdepend and any large object comments/seclabels. + */ +static DumpId lo_metadata_dumpId; + /* Maximum number of relations to fetch in a fetchAttributeStats() call. 
*/ #define MAX_ATTR_STATS_RELS 64 @@ -440,8 +449,6 @@ main(int argc, char **argv) bool data_only = false; bool schema_only = false; bool statistics_only = false; - bool with_data = false; - bool with_schema = false; bool with_statistics = false; bool no_data = false; bool no_schema = false; @@ -505,6 +512,7 @@ main(int argc, char **argv) {"section", required_argument, NULL, 5}, {"serializable-deferrable", no_argument, &dopt.serializable_deferrable, 1}, {"snapshot", required_argument, NULL, 6}, + {"statistics", no_argument, NULL, 22}, {"statistics-only", no_argument, NULL, 18}, {"strict-names", no_argument, &strict_names, 1}, {"use-set-session-authorization", no_argument, &dopt.use_setsessauth, 1}, @@ -519,9 +527,6 @@ main(int argc, char **argv) {"no-toast-compression", no_argument, &dopt.no_toast_compression, 1}, {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1}, {"no-sync", no_argument, NULL, 7}, - {"with-data", no_argument, NULL, 22}, - {"with-schema", no_argument, NULL, 23}, - {"with-statistics", no_argument, NULL, 24}, {"on-conflict-do-nothing", no_argument, &dopt.do_nothing, 1}, {"rows-per-insert", required_argument, NULL, 10}, {"include-foreign-data", required_argument, NULL, 11}, @@ -789,14 +794,6 @@ main(int argc, char **argv) break; case 22: - with_data = true; - break; - - case 23: - with_schema = true; - break; - - case 24: with_statistics = true; break; @@ -843,13 +840,17 @@ main(int argc, char **argv) if (statistics_only && no_statistics) pg_fatal("options --statistics-only and --no-statistics cannot be used together"); - /* reject conflicting "with-" and "no-" options */ - if (with_data && no_data) - pg_fatal("options --with-data and --no-data cannot be used together"); - if (with_schema && no_schema) - pg_fatal("options --with-schema and --no-schema cannot be used together"); + /* reject conflicting "no-" options */ if (with_statistics && no_statistics) - pg_fatal("options --with-statistics and --no-statistics cannot be used together"); + pg_fatal("options --statistics and --no-statistics cannot be used together"); + + /* reject conflicting "-only" options */ + if (data_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--statistics"); + if (schema_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--statistics"); if (schema_only && foreign_servers_include_patterns.head != NULL) pg_fatal("options -s/--schema-only and --include-foreign-data cannot be used together"); @@ -864,16 +865,14 @@ main(int argc, char **argv) pg_fatal("option --if-exists requires option -c/--clean"); /* - * Set derivative flags. An "-only" option may be overridden by an - * explicit "with-" option; e.g. "--schema-only --with-statistics" will - * include schema and statistics. Other ambiguous or nonsensical - * combinations, e.g. "--schema-only --no-schema", will have already - * caused an error in one of the checks above. + * Set derivative flags. Ambiguous or nonsensical combinations, e.g. + * "--schema-only --no-schema", will have already caused an error in one + * of the checks above. 
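+	 *
+	 * Editorial examples of the resulting behavior (not part of the patch):
+	 * plain "pg_dump" keeps schema and data but not statistics;
+	 * "pg_dump --statistics" dumps statistics as well;
+	 * "pg_dump --statistics-only" dumps statistics alone; and
+	 * "pg_dump --schema-only --statistics" now fails outright, per the
+	 * conflicting-options checks above.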
*/ dopt.dumpData = ((dopt.dumpData && !schema_only && !statistics_only) || - (data_only || with_data)) && !no_data; + data_only) && !no_data; dopt.dumpSchema = ((dopt.dumpSchema && !data_only && !statistics_only) || - (schema_only || with_schema)) && !no_schema; + schema_only) && !no_schema; dopt.dumpStatistics = ((dopt.dumpStatistics && !schema_only && !data_only) || (statistics_only || with_statistics)) && !no_statistics; @@ -1086,6 +1085,36 @@ main(int argc, char **argv) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (dopt.binary_upgrade && fout->remoteVersion >= 120000) + { + TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId); + TableInfo *shdepend = findTableByOid(SharedDependRelationId); + + makeTableDataInfo(&dopt, lo_metadata); + makeTableDataInfo(&dopt, shdepend); + + /* + * Save pg_largeobject_metadata's dump ID for use as a dependency for + * pg_shdepend and any large object comments/seclabels. + */ + lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId; + addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId); + + /* + * Only dump large object shdepend rows for this database. + */ + shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " + "AND dbid = (SELECT oid FROM pg_database " + " WHERE datname = current_database())"; + } + + /* * In binary-upgrade mode, we do not have to worry about the actual LO * data or the associated metadata that resides in the pg_largeobject and * pg_largeobject_metadata tables, respectively. @@ -1226,7 +1255,7 @@ main(int argc, char **argv) * right now. */ if (plainText) - RestoreArchive(fout, false); + RestoreArchive(fout); CloseArchive(fout); @@ -1316,6 +1345,7 @@ help(const char *progname) printf(_(" --sequence-data include sequence data in dump\n")); printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n")); printf(_(" --snapshot=SNAPSHOT use given snapshot for the dump\n")); + printf(_(" --statistics dump the statistics\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); printf(_(" --strict-names require table and/or schema include patterns to\n" " match at least one entity each\n")); @@ -1324,9 +1354,6 @@ help(const char *progname) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data dump the data\n")); - printf(_(" --with-schema dump the schema\n")); - printf(_(" --with-statistics dump the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=DBNAME database to dump\n")); @@ -2168,6 +2195,13 @@ selectDumpableProcLang(ProcLangInfo *plang, Archive *fout) static void selectDumpableAccessMethod(AccessMethodInfo *method, Archive *fout) { + /* see getAccessMethods() comment about v9.6. */ + if (fout->remoteVersion < 90600) + { + method->dobj.dump = DUMP_COMPONENT_NONE; + return; + } + if (checkExtensionMembership(&method->dobj, fout)) return; /* extension membership overrides all else */ @@ -3924,10 +3958,37 @@ getLOs(Archive *fout) * as it will be copied by pg_upgrade, which simply copies the * pg_largeobject table. 
We *do* however dump out anything but the
	 * data, as pg_upgrade copies just pg_largeobject, but not
-	 * pg_largeobject_metadata, after the dump is restored.
+	 * pg_largeobject_metadata, after the dump is restored.  In versions
+	 * before v12, this is done via proper large object commands.  In
+	 * newer versions, we dump the content of pg_largeobject_metadata and
+	 * any associated pg_shdepend rows, which is faster to restore.  (On
+	 * <v12, pg_largeobject_metadata was created WITH OIDS, so the OID
+	 * column is hidden and won't be dumped.)
	 */
	if (dopt->binary_upgrade)
-		loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
+	{
+		if (fout->remoteVersion >= 120000)
+		{
+			/*
+			 * We should've saved pg_largeobject_metadata's dump ID before
+			 * this point.
+			 */
+			Assert(lo_metadata_dumpId);
+
+			loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION);
+
+			/*
+			 * Mark the large object as dependent on
+			 * pg_largeobject_metadata so that any large object
+			 * comments/seclabels are dumped after it.
+			 */
+			loinfo->dobj.dependencies = (DumpId *) pg_malloc(sizeof(DumpId));
+			loinfo->dobj.dependencies[0] = lo_metadata_dumpId;
+			loinfo->dobj.nDeps = loinfo->dobj.allocDeps = 1;
+		}
+		else
+			loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA;
+	}

	/*
	 * Create a "BLOBS" data item for the group, too.  This is just a
@@ -4962,6 +5023,7 @@ getSubscriptions(Archive *fout)
	int			i_suboriginremotelsn;
	int			i_subenabled;
	int			i_subfailover;
+	int			i_subretaindeadtuples;
	int			i,
				ntups;
@@ -5034,10 +5096,17 @@ getSubscriptions(Archive *fout)

	if (fout->remoteVersion >= 170000)
		appendPQExpBufferStr(query,
-							 " s.subfailover\n");
+							 " s.subfailover,\n");
	else
		appendPQExpBufferStr(query,
-							 " false AS subfailover\n");
+							 " false AS subfailover,\n");
+
+	if (fout->remoteVersion >= 190000)
+		appendPQExpBufferStr(query,
+							 " s.subretaindeadtuples\n");
+	else
+		appendPQExpBufferStr(query,
+							 " false AS subretaindeadtuples\n");

	appendPQExpBufferStr(query,
						 "FROM pg_subscription s\n");
@@ -5071,6 +5140,7 @@ getSubscriptions(Archive *fout)
	i_subpasswordrequired = PQfnumber(res, "subpasswordrequired");
	i_subrunasowner = PQfnumber(res, "subrunasowner");
	i_subfailover = PQfnumber(res, "subfailover");
+	i_subretaindeadtuples = PQfnumber(res, "subretaindeadtuples");
	i_subconninfo = PQfnumber(res, "subconninfo");
	i_subslotname = PQfnumber(res, "subslotname");
	i_subsynccommit = PQfnumber(res, "subsynccommit");
@@ -5104,6 +5174,8 @@ getSubscriptions(Archive *fout)
			(strcmp(PQgetvalue(res, i, i_subrunasowner), "t") == 0);
		subinfo[i].subfailover =
			(strcmp(PQgetvalue(res, i, i_subfailover), "t") == 0);
+		subinfo[i].subretaindeadtuples =
+			(strcmp(PQgetvalue(res, i, i_subretaindeadtuples), "t") == 0);
		subinfo[i].subconninfo =
			pg_strdup(PQgetvalue(res, i, i_subconninfo));
		if (PQgetisnull(res, i, i_subslotname))
@@ -5362,6 +5434,9 @@ dumpSubscription(Archive *fout, const SubscriptionInfo *subinfo)
	if (subinfo->subfailover)
		appendPQExpBufferStr(query, ", failover = true");

+	if (subinfo->subretaindeadtuples)
+		appendPQExpBufferStr(query, ", retain_dead_tuples = true");
+
	if (strcmp(subinfo->subsynccommit, "off") != 0)
		appendPQExpBuffer(query, ", synchronous_commit = %s",
						  fmtId(subinfo->subsynccommit));
@@ -6122,6 +6197,7 @@ getTypes(Archive *fout)
		 */
		tyinfo[i].nDomChecks = 0;
		tyinfo[i].domChecks = NULL;
+		tyinfo[i].notnull = NULL;
		if ((tyinfo[i].dobj.dump & DUMP_COMPONENT_DEFINITION) &&
			tyinfo[i].typtype == TYPTYPE_DOMAIN)
			getDomainConstraints(fout, &(tyinfo[i]));
@@ -6181,6 +6257,8 @@ getOperators(Archive *fout)
	int			i_oprnamespace;
	int
i_oprowner; int i_oprkind; + int i_oprleft; + int i_oprright; int i_oprcode; /* @@ -6192,6 +6270,8 @@ getOperators(Archive *fout) "oprnamespace, " "oprowner, " "oprkind, " + "oprleft, " + "oprright, " "oprcode::oid AS oprcode " "FROM pg_operator"); @@ -6207,6 +6287,8 @@ getOperators(Archive *fout) i_oprnamespace = PQfnumber(res, "oprnamespace"); i_oprowner = PQfnumber(res, "oprowner"); i_oprkind = PQfnumber(res, "oprkind"); + i_oprleft = PQfnumber(res, "oprleft"); + i_oprright = PQfnumber(res, "oprright"); i_oprcode = PQfnumber(res, "oprcode"); for (i = 0; i < ntups; i++) @@ -6220,6 +6302,8 @@ getOperators(Archive *fout) findNamespace(atooid(PQgetvalue(res, i, i_oprnamespace))); oprinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_oprowner)); oprinfo[i].oprkind = (PQgetvalue(res, i, i_oprkind))[0]; + oprinfo[i].oprleft = atooid(PQgetvalue(res, i, i_oprleft)); + oprinfo[i].oprright = atooid(PQgetvalue(res, i, i_oprright)); oprinfo[i].oprcode = atooid(PQgetvalue(res, i, i_oprcode)); /* Decide whether we want to dump it */ @@ -6248,6 +6332,7 @@ getCollations(Archive *fout) int i_collname; int i_collnamespace; int i_collowner; + int i_collencoding; query = createPQExpBuffer(); @@ -6258,7 +6343,8 @@ getCollations(Archive *fout) appendPQExpBufferStr(query, "SELECT tableoid, oid, collname, " "collnamespace, " - "collowner " + "collowner, " + "collencoding " "FROM pg_collation"); res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); @@ -6272,6 +6358,7 @@ getCollations(Archive *fout) i_collname = PQfnumber(res, "collname"); i_collnamespace = PQfnumber(res, "collnamespace"); i_collowner = PQfnumber(res, "collowner"); + i_collencoding = PQfnumber(res, "collencoding"); for (i = 0; i < ntups; i++) { @@ -6283,6 +6370,7 @@ getCollations(Archive *fout) collinfo[i].dobj.namespace = findNamespace(atooid(PQgetvalue(res, i, i_collnamespace))); collinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_collowner)); + collinfo[i].collencoding = atoi(PQgetvalue(res, i, i_collencoding)); /* Decide whether we want to dump it */ selectDumpableObject(&(collinfo[i].dobj), fout); @@ -6373,16 +6461,28 @@ getAccessMethods(Archive *fout) int i_amhandler; int i_amtype; - /* Before 9.6, there are no user-defined access methods */ - if (fout->remoteVersion < 90600) - return; - query = createPQExpBuffer(); - /* Select all access methods from pg_am table */ - appendPQExpBufferStr(query, "SELECT tableoid, oid, amname, amtype, " - "amhandler::pg_catalog.regproc AS amhandler " - "FROM pg_am"); + /* + * Select all access methods from pg_am table. v9.6 introduced CREATE + * ACCESS METHOD, so earlier versions usually have only built-in access + * methods. v9.6 also changed the access method API, replacing dozens of + * pg_am columns with amhandler. Even if a user created an access method + * by "INSERT INTO pg_am", we have no way to translate pre-v9.6 pg_am + * columns to a v9.6+ CREATE ACCESS METHOD. Hence, before v9.6, read + * pg_am just to facilitate findAccessMethodByOid() providing the + * OID-to-name mapping. 
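+	 *
+	 * Editorial illustration (not part of the patch): against a pre-9.6
+	 * server the assembled query becomes
+	 *
+	 *		SELECT tableoid, oid, amname,
+	 *			   'i'::pg_catalog."char" AS amtype,
+	 *			   '-'::pg_catalog.regproc AS amhandler
+	 *		FROM pg_am
+	 *
+	 * so every row comes back as a built-in index AM with a dummy handler,
+	 * which selectDumpableAccessMethod() then marks as not-to-be-dumped.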
+ */ + appendPQExpBufferStr(query, "SELECT tableoid, oid, amname, "); + if (fout->remoteVersion >= 90600) + appendPQExpBufferStr(query, + "amtype, " + "amhandler::pg_catalog.regproc AS amhandler "); + else + appendPQExpBufferStr(query, + "'i'::pg_catalog.\"char\" AS amtype, " + "'-'::pg_catalog.regproc AS amhandler "); + appendPQExpBufferStr(query, "FROM pg_am"); res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); @@ -6431,6 +6531,7 @@ getOpclasses(Archive *fout) OpclassInfo *opcinfo; int i_tableoid; int i_oid; + int i_opcmethod; int i_opcname; int i_opcnamespace; int i_opcowner; @@ -6440,7 +6541,7 @@ getOpclasses(Archive *fout) * system-defined opclasses at dump-out time. */ - appendPQExpBufferStr(query, "SELECT tableoid, oid, opcname, " + appendPQExpBufferStr(query, "SELECT tableoid, oid, opcmethod, opcname, " "opcnamespace, " "opcowner " "FROM pg_opclass"); @@ -6453,6 +6554,7 @@ getOpclasses(Archive *fout) i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); + i_opcmethod = PQfnumber(res, "opcmethod"); i_opcname = PQfnumber(res, "opcname"); i_opcnamespace = PQfnumber(res, "opcnamespace"); i_opcowner = PQfnumber(res, "opcowner"); @@ -6466,6 +6568,7 @@ getOpclasses(Archive *fout) opcinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_opcname)); opcinfo[i].dobj.namespace = findNamespace(atooid(PQgetvalue(res, i, i_opcnamespace))); + opcinfo[i].opcmethod = atooid(PQgetvalue(res, i, i_opcmethod)); opcinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_opcowner)); /* Decide whether we want to dump it */ @@ -6491,6 +6594,7 @@ getOpfamilies(Archive *fout) OpfamilyInfo *opfinfo; int i_tableoid; int i_oid; + int i_opfmethod; int i_opfname; int i_opfnamespace; int i_opfowner; @@ -6502,7 +6606,7 @@ getOpfamilies(Archive *fout) * system-defined opfamilies at dump-out time. */ - appendPQExpBufferStr(query, "SELECT tableoid, oid, opfname, " + appendPQExpBufferStr(query, "SELECT tableoid, oid, opfmethod, opfname, " "opfnamespace, " "opfowner " "FROM pg_opfamily"); @@ -6516,6 +6620,7 @@ getOpfamilies(Archive *fout) i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); i_opfname = PQfnumber(res, "opfname"); + i_opfmethod = PQfnumber(res, "opfmethod"); i_opfnamespace = PQfnumber(res, "opfnamespace"); i_opfowner = PQfnumber(res, "opfowner"); @@ -6528,6 +6633,7 @@ getOpfamilies(Archive *fout) opfinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_opfname)); opfinfo[i].dobj.namespace = findNamespace(atooid(PQgetvalue(res, i, i_opfnamespace))); + opfinfo[i].opfmethod = atooid(PQgetvalue(res, i, i_opfmethod)); opfinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_opfowner)); /* Decide whether we want to dump it */ @@ -8247,27 +8353,33 @@ addConstrChildIdxDeps(DumpableObject *dobj, const IndxInfo *refidx) static void getDomainConstraints(Archive *fout, TypeInfo *tyinfo) { - int i; ConstraintInfo *constrinfo; PQExpBuffer query = createPQExpBuffer(); PGresult *res; int i_tableoid, i_oid, i_conname, - i_consrc; + i_consrc, + i_convalidated, + i_contype; int ntups; if (!fout->is_prepared[PREPQUERY_GETDOMAINCONSTRAINTS]) { - /* Set up query for constraint-specific details */ - appendPQExpBufferStr(query, - "PREPARE getDomainConstraints(pg_catalog.oid) AS\n" - "SELECT tableoid, oid, conname, " - "pg_catalog.pg_get_constraintdef(oid) AS consrc, " - "convalidated " - "FROM pg_catalog.pg_constraint " - "WHERE contypid = $1 AND contype = 'c' " - "ORDER BY conname"); + /* + * Set up query for constraint-specific details. 
For servers 17 and + * up, domains have constraints of type 'n' as well as 'c', otherwise + * just the latter. + */ + appendPQExpBuffer(query, + "PREPARE getDomainConstraints(pg_catalog.oid) AS\n" + "SELECT tableoid, oid, conname, " + "pg_catalog.pg_get_constraintdef(oid) AS consrc, " + "convalidated, contype " + "FROM pg_catalog.pg_constraint " + "WHERE contypid = $1 AND contype IN (%s) " + "ORDER BY conname", + fout->remoteVersion < 170000 ? "'c'" : "'c', 'n'"); ExecuteSqlStatement(fout, query->data); @@ -8286,33 +8398,50 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) i_oid = PQfnumber(res, "oid"); i_conname = PQfnumber(res, "conname"); i_consrc = PQfnumber(res, "consrc"); + i_convalidated = PQfnumber(res, "convalidated"); + i_contype = PQfnumber(res, "contype"); constrinfo = (ConstraintInfo *) pg_malloc(ntups * sizeof(ConstraintInfo)); - - tyinfo->nDomChecks = ntups; tyinfo->domChecks = constrinfo; - for (i = 0; i < ntups; i++) + /* 'i' tracks result rows; 'j' counts CHECK constraints */ + for (int i = 0, j = 0; i < ntups; i++) { - bool validated = PQgetvalue(res, i, 4)[0] == 't'; - - constrinfo[i].dobj.objType = DO_CONSTRAINT; - constrinfo[i].dobj.catId.tableoid = atooid(PQgetvalue(res, i, i_tableoid)); - constrinfo[i].dobj.catId.oid = atooid(PQgetvalue(res, i, i_oid)); - AssignDumpId(&constrinfo[i].dobj); - constrinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_conname)); - constrinfo[i].dobj.namespace = tyinfo->dobj.namespace; - constrinfo[i].contable = NULL; - constrinfo[i].condomain = tyinfo; - constrinfo[i].contype = 'c'; - constrinfo[i].condef = pg_strdup(PQgetvalue(res, i, i_consrc)); - constrinfo[i].confrelid = InvalidOid; - constrinfo[i].conindex = 0; - constrinfo[i].condeferrable = false; - constrinfo[i].condeferred = false; - constrinfo[i].conislocal = true; - - constrinfo[i].separate = !validated; + bool validated = PQgetvalue(res, i, i_convalidated)[0] == 't'; + char contype = (PQgetvalue(res, i, i_contype))[0]; + ConstraintInfo *constraint; + + if (contype == CONSTRAINT_CHECK) + { + constraint = &constrinfo[j++]; + tyinfo->nDomChecks++; + } + else + { + Assert(contype == CONSTRAINT_NOTNULL); + Assert(tyinfo->notnull == NULL); + /* use last item in array for the not-null constraint */ + tyinfo->notnull = &(constrinfo[ntups - 1]); + constraint = tyinfo->notnull; + } + + constraint->dobj.objType = DO_CONSTRAINT; + constraint->dobj.catId.tableoid = atooid(PQgetvalue(res, i, i_tableoid)); + constraint->dobj.catId.oid = atooid(PQgetvalue(res, i, i_oid)); + AssignDumpId(&(constraint->dobj)); + constraint->dobj.name = pg_strdup(PQgetvalue(res, i, i_conname)); + constraint->dobj.namespace = tyinfo->dobj.namespace; + constraint->contable = NULL; + constraint->condomain = tyinfo; + constraint->contype = contype; + constraint->condef = pg_strdup(PQgetvalue(res, i, i_consrc)); + constraint->confrelid = InvalidOid; + constraint->conindex = 0; + constraint->condeferrable = false; + constraint->condeferred = false; + constraint->conislocal = true; + + constraint->separate = !validated; /* * Make the domain depend on the constraint, ensuring it won't be @@ -8321,8 +8450,7 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) * anyway, so this doesn't matter. 
 */
		if (validated)
-			addObjectDependency(&tyinfo->dobj,
-								constrinfo[i].dobj.dumpId);
+			addObjectDependency(&tyinfo->dobj, constraint->dobj.dumpId);
	}

	PQclear(res);
@@ -9039,8 +9167,20 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
		if (tbinfo->relkind == RELKIND_SEQUENCE)
			continue;

-		/* Don't bother with uninteresting tables, either */
-		if (!tbinfo->interesting)
+		/*
+		 * Don't bother with uninteresting tables, either.  For binary
+		 * upgrades, this is bypassed for pg_largeobject_metadata and
+		 * pg_shdepend so that the column names are collected for the
+		 * corresponding COPY commands.  Restoring the data for those
+		 * catalogs is faster than restoring the equivalent set of large
+		 * object commands.  We can only do this for upgrades from v12 and
+		 * newer; in older versions, pg_largeobject_metadata was created
+		 * WITH OIDS, so the OID column is hidden and won't be dumped.
+		 */
+		if (!tbinfo->interesting &&
+			!(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+			  (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
+			   tbinfo->dobj.catId.oid == SharedDependRelationId)))
			continue;

		/* OK, we need info for this table */
@@ -9244,7 +9384,10 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
			pg_fatal("unrecognized table OID %u", attrelid);
		/* cross-check that we only got requested tables */
		if (tbinfo->relkind == RELKIND_SEQUENCE ||
-			!tbinfo->interesting)
+			(!tbinfo->interesting &&
+			 !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 &&
+			   (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId ||
+				tbinfo->dobj.catId.oid == SharedDependRelationId))))
			pg_fatal("unexpected column data for table \"%s\"",
					 tbinfo->dobj.name);
@@ -12517,8 +12660,36 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo)
		appendPQExpBuffer(q, " COLLATE %s", fmtQualifiedDumpable(coll));
	}

+	/*
+	 * Print a not-null constraint if there's one.  In servers older than 17
+	 * these don't have names, so just print it unadorned; in newer ones they
+	 * do, but most of the time it's going to be the standard generated one,
+	 * so omit the name in that case also.
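+	 *
+	 * An editorial example with hypothetical names: for CREATE DOMAIN d AS
+	 * int NOT NULL, the generated constraint name is "d_not_null", so a
+	 * bare NOT NULL is printed; for CREATE DOMAIN d AS int CONSTRAINT d_nn
+	 * NOT NULL, the name differs from the default, so it is printed along
+	 * with the constraint definition.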
+ */ if (typnotnull[0] == 't') - appendPQExpBufferStr(q, " NOT NULL"); + { + if (fout->remoteVersion < 170000 || tyinfo->notnull == NULL) + appendPQExpBufferStr(q, " NOT NULL"); + else + { + ConstraintInfo *notnull = tyinfo->notnull; + + if (!notnull->separate) + { + char *default_name; + + /* XXX should match ChooseConstraintName better */ + default_name = psprintf("%s_not_null", tyinfo->dobj.name); + + if (strcmp(default_name, notnull->dobj.name) == 0) + appendPQExpBufferStr(q, " NOT NULL"); + else + appendPQExpBuffer(q, " CONSTRAINT %s %s", + fmtId(notnull->dobj.name), notnull->condef); + free(default_name); + } + } + } if (typdefault != NULL) { @@ -12538,7 +12709,7 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) { ConstraintInfo *domcheck = &(tyinfo->domChecks[i]); - if (!domcheck->separate) + if (!domcheck->separate && domcheck->contype == 'c') appendPQExpBuffer(q, "\n\tCONSTRAINT %s %s", fmtId(domcheck->dobj.name), domcheck->condef); } @@ -12583,8 +12754,13 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) for (i = 0; i < tyinfo->nDomChecks; i++) { ConstraintInfo *domcheck = &(tyinfo->domChecks[i]); - PQExpBuffer conprefix = createPQExpBuffer(); + PQExpBuffer conprefix; + + /* but only if the constraint itself was dumped here */ + if (domcheck->separate) + continue; + conprefix = createPQExpBuffer(); appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", fmtId(domcheck->dobj.name)); @@ -12597,6 +12773,25 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) destroyPQExpBuffer(conprefix); } + /* + * And a comment on the not-null constraint, if there's one -- but only if + * the constraint itself was dumped here + */ + if (tyinfo->notnull != NULL && !tyinfo->notnull->separate) + { + PQExpBuffer conprefix = createPQExpBuffer(); + + appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", + fmtId(tyinfo->notnull->dobj.name)); + + if (tyinfo->notnull->dobj.dump & DUMP_COMPONENT_COMMENT) + dumpComment(fout, conprefix->data, qtypname, + tyinfo->dobj.namespace->dobj.name, + tyinfo->rolname, + tyinfo->notnull->dobj.catId, 0, tyinfo->dobj.dumpId); + destroyPQExpBuffer(conprefix); + } + destroyPQExpBuffer(q); destroyPQExpBuffer(delq); destroyPQExpBuffer(query); @@ -18458,14 +18653,23 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) .dropStmt = delq->data)); } } - else if (coninfo->contype == 'c' && tbinfo == NULL) + else if (tbinfo == NULL) { - /* CHECK constraint on a domain */ + /* CHECK, NOT NULL constraint on a domain */ TypeInfo *tyinfo = coninfo->condomain; + Assert(coninfo->contype == 'c' || coninfo->contype == 'n'); + /* Ignore if not to be dumped separately */ if (coninfo->separate) { + const char *keyword; + + if (coninfo->contype == 'c') + keyword = "CHECK CONSTRAINT"; + else + keyword = "CONSTRAINT"; + appendPQExpBuffer(q, "ALTER DOMAIN %s\n", fmtQualifiedDumpable(tyinfo)); appendPQExpBuffer(q, " ADD CONSTRAINT %s %s;\n", @@ -18484,10 +18688,26 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) ARCHIVE_OPTS(.tag = tag, .namespace = tyinfo->dobj.namespace->dobj.name, .owner = tyinfo->rolname, - .description = "CHECK CONSTRAINT", + .description = keyword, .section = SECTION_POST_DATA, .createStmt = q->data, .dropStmt = delq->data)); + + if (coninfo->dobj.dump & DUMP_COMPONENT_COMMENT) + { + PQExpBuffer conprefix = createPQExpBuffer(); + char *qtypname = pg_strdup(fmtId(tyinfo->dobj.name)); + + appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", + fmtId(coninfo->dobj.name)); + + dumpComment(fout, conprefix->data, qtypname, + 
tyinfo->dobj.namespace->dobj.name, + tyinfo->rolname, + coninfo->dobj.catId, 0, tyinfo->dobj.dumpId); + destroyPQExpBuffer(conprefix); + free(qtypname); + } } } else diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 39eef1d6617..dde85ed156c 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -222,7 +222,9 @@ typedef struct _typeInfo bool isDefined; /* true if typisdefined */ /* If needed, we'll create a "shell type" entry for it; link that here: */ struct _shellTypeInfo *shellType; /* shell-type entry, or NULL */ - /* If it's a domain, we store links to its constraints here: */ + /* If it's a domain, its not-null constraint is here: */ + struct _constraintInfo *notnull; + /* If it's a domain, we store links to its CHECK constraints here: */ int nDomChecks; struct _constraintInfo *domChecks; } TypeInfo; @@ -258,6 +260,8 @@ typedef struct _oprInfo DumpableObject dobj; const char *rolname; char oprkind; + Oid oprleft; + Oid oprright; Oid oprcode; } OprInfo; @@ -271,12 +275,14 @@ typedef struct _accessMethodInfo typedef struct _opclassInfo { DumpableObject dobj; + Oid opcmethod; const char *rolname; } OpclassInfo; typedef struct _opfamilyInfo { DumpableObject dobj; + Oid opfmethod; const char *rolname; } OpfamilyInfo; @@ -284,6 +290,7 @@ typedef struct _collInfo { DumpableObject dobj; const char *rolname; + int collencoding; } CollInfo; typedef struct _convInfo @@ -709,6 +716,7 @@ typedef struct _SubscriptionInfo bool subpasswordrequired; bool subrunasowner; bool subfailover; + bool subretaindeadtuples; char *subconninfo; char *subslotname; char *subsynccommit; @@ -757,6 +765,7 @@ extern TableInfo *findTableByOid(Oid oid); extern TypeInfo *findTypeByOid(Oid oid); extern FuncInfo *findFuncByOid(Oid oid); extern OprInfo *findOprByOid(Oid oid); +extern AccessMethodInfo *findAccessMethodByOid(Oid oid); extern CollInfo *findCollationByOid(Oid oid); extern NamespaceInfo *findNamespaceByOid(Oid oid); extern ExtensionInfo *findExtensionByOid(Oid oid); diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 538e7dcb493..a02da3e9652 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -162,6 +162,8 @@ static DumpId postDataBoundId; static int DOTypeNameCompare(const void *p1, const void *p2); +static int pgTypeNameCompare(Oid typid1, Oid typid2); +static int accessMethodNameCompare(Oid am1, Oid am2); static bool TopoSort(DumpableObject **objs, int numObjs, DumpableObject **ordering, @@ -228,12 +230,39 @@ DOTypeNameCompare(const void *p1, const void *p2) else if (obj2->namespace) return 1; - /* Sort by name */ + /* + * Sort by name. With a few exceptions, names here are single catalog + * columns. To get a fuller picture, grep pg_dump.c for "dobj.name = ". + * Names here don't match "Name:" in plain format output, which is a + * _tocEntry.tag. For example, DumpableObject.name of a constraint is + * pg_constraint.conname, but _tocEntry.tag of a constraint is relname and + * conname joined with a space. + */ cmpval = strcmp(obj1->name, obj2->name); if (cmpval != 0) return cmpval; - /* To have a stable sort order, break ties for some object types */ + /* + * Sort by type. This helps types that share a type priority without + * sharing a unique name constraint, e.g. opclass and opfamily. + */ + cmpval = obj1->objType - obj2->objType; + if (cmpval != 0) + return cmpval; + + /* + * To have a stable sort order, break ties for some object types. Most + * catalogs have a natural key, e.g. 
pg_proc_proname_args_nsp_index. Where + * the above "namespace" and "name" comparisons don't cover all natural + * key columns, compare the rest here. + * + * The natural key usually refers to other catalogs by surrogate keys. + * Hence, this translates each of those references to the natural key of + * the referenced catalog. That may descend through multiple levels of + * catalog references. For example, to sort by pg_proc.proargtypes, + * descend to each pg_type and then further to its pg_namespace, for an + * overall sort by (nspname, typname). + */ if (obj1->objType == DO_FUNC || obj1->objType == DO_AGG) { FuncInfo *fobj1 = *(FuncInfo *const *) p1; @@ -246,22 +275,10 @@ DOTypeNameCompare(const void *p1, const void *p2) return cmpval; for (i = 0; i < fobj1->nargs; i++) { - TypeInfo *argtype1 = findTypeByOid(fobj1->argtypes[i]); - TypeInfo *argtype2 = findTypeByOid(fobj2->argtypes[i]); - - if (argtype1 && argtype2) - { - if (argtype1->dobj.namespace && argtype2->dobj.namespace) - { - cmpval = strcmp(argtype1->dobj.namespace->dobj.name, - argtype2->dobj.namespace->dobj.name); - if (cmpval != 0) - return cmpval; - } - cmpval = strcmp(argtype1->dobj.name, argtype2->dobj.name); - if (cmpval != 0) - return cmpval; - } + cmpval = pgTypeNameCompare(fobj1->argtypes[i], + fobj2->argtypes[i]); + if (cmpval != 0) + return cmpval; } } else if (obj1->objType == DO_OPERATOR) @@ -273,6 +290,57 @@ DOTypeNameCompare(const void *p1, const void *p2) cmpval = (oobj2->oprkind - oobj1->oprkind); if (cmpval != 0) return cmpval; + /* Within an oprkind, sort by argument type names */ + cmpval = pgTypeNameCompare(oobj1->oprleft, oobj2->oprleft); + if (cmpval != 0) + return cmpval; + cmpval = pgTypeNameCompare(oobj1->oprright, oobj2->oprright); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_OPCLASS) + { + OpclassInfo *opcobj1 = *(OpclassInfo *const *) p1; + OpclassInfo *opcobj2 = *(OpclassInfo *const *) p2; + + /* Sort by access method name, per pg_opclass_am_name_nsp_index */ + cmpval = accessMethodNameCompare(opcobj1->opcmethod, + opcobj2->opcmethod); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_OPFAMILY) + { + OpfamilyInfo *opfobj1 = *(OpfamilyInfo *const *) p1; + OpfamilyInfo *opfobj2 = *(OpfamilyInfo *const *) p2; + + /* Sort by access method name, per pg_opfamily_am_name_nsp_index */ + cmpval = accessMethodNameCompare(opfobj1->opfmethod, + opfobj2->opfmethod); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_COLLATION) + { + CollInfo *cobj1 = *(CollInfo *const *) p1; + CollInfo *cobj2 = *(CollInfo *const *) p2; + + /* + * Sort by encoding, per pg_collation_name_enc_nsp_index. Technically, + * this is not necessary, because wherever this changes dump order, + * restoring the dump fails anyway. CREATE COLLATION can't create a + * tie for this to break, because it imposes restrictions to make + * (nspname, collname) uniquely identify a collation within a given + * DatabaseEncoding. While pg_import_system_collations() can create a + * tie, pg_dump+restore fails after + * pg_import_system_collations('my_schema') does so. However, there's + * little to gain by ignoring one natural key column on the basis of + * those limitations elsewhere, so respect the full natural key like + * we do for other object types. 
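A standalone sketch of the natural-key descent described above, with plain string pairs standing in for the TypeInfo and namespace lookups (the names here are illustrative, not pg_dump's API):

    #include <string.h>

    /* Compare by (namespace name, object name), outer key first. */
    static int
    natural_key_cmp(const char *nsp1, const char *obj1,
                    const char *nsp2, const char *obj2)
    {
        int cmpval = strcmp(nsp1, nsp2);

        if (cmpval != 0)
            return cmpval;
        return strcmp(obj1, obj2);
    }

Each per-type comparator added in this patch follows that shape: translate the surrogate OID to the referenced row, then compare the row's natural-key columns in order.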
+ */ + cmpval = cobj1->collencoding - cobj2->collencoding; + if (cmpval != 0) + return cmpval; } else if (obj1->objType == DO_ATTRDEF) { @@ -317,11 +385,143 @@ DOTypeNameCompare(const void *p1, const void *p2) if (cmpval != 0) return cmpval; } + else if (obj1->objType == DO_CONSTRAINT) + { + ConstraintInfo *robj1 = *(ConstraintInfo *const *) p1; + ConstraintInfo *robj2 = *(ConstraintInfo *const *) p2; + + /* + * Sort domain constraints before table constraints, for consistency + * with our decision to sort CREATE DOMAIN before CREATE TABLE. + */ + if (robj1->condomain) + { + if (robj2->condomain) + { + /* Sort by domain name (domain namespace was considered) */ + cmpval = strcmp(robj1->condomain->dobj.name, + robj2->condomain->dobj.name); + if (cmpval != 0) + return cmpval; + } + else + return PRIO_TYPE - PRIO_TABLE; + } + else if (robj2->condomain) + return PRIO_TABLE - PRIO_TYPE; + else + { + /* Sort by table name (table namespace was considered already) */ + cmpval = strcmp(robj1->contable->dobj.name, + robj2->contable->dobj.name); + if (cmpval != 0) + return cmpval; + } + } + else if (obj1->objType == DO_PUBLICATION_REL) + { + PublicationRelInfo *probj1 = *(PublicationRelInfo *const *) p1; + PublicationRelInfo *probj2 = *(PublicationRelInfo *const *) p2; + + /* Sort by publication name, since (namespace, name) match the rel */ + cmpval = strcmp(probj1->publication->dobj.name, + probj2->publication->dobj.name); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_PUBLICATION_TABLE_IN_SCHEMA) + { + PublicationSchemaInfo *psobj1 = *(PublicationSchemaInfo *const *) p1; + PublicationSchemaInfo *psobj2 = *(PublicationSchemaInfo *const *) p2; + + /* Sort by publication name, since ->name is just nspname */ + cmpval = strcmp(psobj1->publication->dobj.name, + psobj2->publication->dobj.name); + if (cmpval != 0) + return cmpval; + } - /* Usually shouldn't get here, but if we do, sort by OID */ + /* + * Shouldn't get here except after catalog corruption, but if we do, sort + * by OID. This may make logically-identical databases differ in the + * order of objects in dump output. Users will get spurious schema diffs. + * Expect flaky failures of 002_pg_upgrade.pl test 'dump outputs from + * original and restored regression databases match' if the regression + * database contains objects allowing that test to reach here. That's a + * consequence of the test using "pg_restore -j", which doesn't fully + * constrain OID assignment order. + */ + Assert(false); return oidcmp(obj1->catId.oid, obj2->catId.oid); } +/* Compare two OID-identified pg_type values by nspname, then by typname. */ +static int +pgTypeNameCompare(Oid typid1, Oid typid2) +{ + TypeInfo *typobj1; + TypeInfo *typobj2; + int cmpval; + + if (typid1 == typid2) + return 0; + + typobj1 = findTypeByOid(typid1); + typobj2 = findTypeByOid(typid2); + + if (!typobj1 || !typobj2) + { + /* + * getTypes() didn't find some OID. Assume catalog corruption, e.g. + * an oprright value without the corresponding OID in a pg_type row. + * Report as "equal", so the caller uses the next available basis for + * comparison, e.g. the next function argument. + * + * Unary operators have InvalidOid in oprleft (if oprkind='r') or in + * oprright (if oprkind='l'). Caller already sorted by oprkind, + * calling us only for like-kind operators. Hence, "typid1 == typid2" + * took care of InvalidOid. (v14 removed postfix operator support. + * Hence, when dumping from v14+, only oprleft can be InvalidOid.) 
+ */ + Assert(false); + return 0; + } + + if (!typobj1->dobj.namespace || !typobj2->dobj.namespace) + Assert(false); /* catalog corruption */ + else + { + cmpval = strcmp(typobj1->dobj.namespace->dobj.name, + typobj2->dobj.namespace->dobj.name); + if (cmpval != 0) + return cmpval; + } + return strcmp(typobj1->dobj.name, typobj2->dobj.name); +} + +/* Compare two OID-identified pg_am values by amname. */ +static int +accessMethodNameCompare(Oid am1, Oid am2) +{ + AccessMethodInfo *amobj1; + AccessMethodInfo *amobj2; + + if (am1 == am2) + return 0; + + amobj1 = findAccessMethodByOid(am1); + amobj2 = findAccessMethodByOid(am2); + + if (!amobj1 || !amobj2) + { + /* catalog corruption: handle like pgTypeNameCompare() does */ + Assert(false); + return 0; + } + + return strcmp(amobj1->dobj.name, amobj2->dobj.name); +} + /* * Sort the given objects into a safe dump order using dependency @@ -907,7 +1107,7 @@ repairTableAttrDefMultiLoop(DumpableObject *tableobj, } /* - * CHECK constraints on domains work just like those on tables ... + * CHECK, NOT NULL constraints on domains work just like those on tables ... */ static void repairDomainConstraintLoop(DumpableObject *domainobj, @@ -1173,11 +1373,12 @@ repairDependencyLoop(DumpableObject **loop, } } - /* Domain and CHECK constraint */ + /* Domain and CHECK or NOT NULL constraint */ if (nLoop == 2 && loop[0]->objType == DO_TYPE && loop[1]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[1])->contype == 'c' && + (((ConstraintInfo *) loop[1])->contype == 'c' || + ((ConstraintInfo *) loop[1])->contype == 'n') && ((ConstraintInfo *) loop[1])->condomain == (TypeInfo *) loop[0]) { repairDomainConstraintLoop(loop[0], loop[1]); @@ -1186,14 +1387,15 @@ repairDependencyLoop(DumpableObject **loop, if (nLoop == 2 && loop[1]->objType == DO_TYPE && loop[0]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[0])->contype == 'c' && + (((ConstraintInfo *) loop[0])->contype == 'c' || + ((ConstraintInfo *) loop[0])->contype == 'n') && ((ConstraintInfo *) loop[0])->condomain == (TypeInfo *) loop[1]) { repairDomainConstraintLoop(loop[1], loop[0]); return; } - /* Indirect loop involving domain and CHECK constraint */ + /* Indirect loop involving domain and CHECK or NOT NULL constraint */ if (nLoop > 2) { for (i = 0; i < nLoop; i++) @@ -1203,7 +1405,8 @@ repairDependencyLoop(DumpableObject **loop, for (j = 0; j < nLoop; j++) { if (loop[j]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[j])->contype == 'c' && + (((ConstraintInfo *) loop[j])->contype == 'c' || + ((ConstraintInfo *) loop[j])->contype == 'n') && ((ConstraintInfo *) loop[j])->condomain == (TypeInfo *) loop[i]) { repairDomainConstraintMultiLoop(loop[i], loop[j]); diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 3cbcad65c5f..27aa1b65698 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -65,10 +65,9 @@ static void dropTablespaces(PGconn *conn); static void dumpTablespaces(PGconn *conn); static void dropDBs(PGconn *conn); static void dumpUserConfig(PGconn *conn, const char *username); -static void dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat); +static void dumpDatabases(PGconn *conn); static void dumpTimestamp(const char *msg); -static int runPgDump(const char *dbname, const char *create_opts, - char *dbfile, ArchiveFormat archDumpFormat); +static int runPgDump(const char *dbname, const char *create_opts); static void buildShSecLabels(PGconn *conn, const char *catalog_name, Oid objectId, const char *objtype, const char 
*objname, @@ -77,7 +76,6 @@ static void executeCommand(PGconn *conn, const char *query); static void expand_dbname_patterns(PGconn *conn, SimpleStringList *patterns, SimpleStringList *names); static void read_dumpall_filters(const char *filename, SimpleStringList *pattern); -static ArchiveFormat parseDumpFormat(const char *format); static char pg_dump_bin[MAXPGPATH]; static PQExpBuffer pgdumpopts; @@ -107,8 +105,6 @@ static int no_subscriptions = 0; static int no_toast_compression = 0; static int no_unlogged_table_data = 0; static int no_role_passwords = 0; -static int with_data = 0; -static int with_schema = 0; static int with_statistics = 0; static int server_version; static int load_via_partition_root = 0; @@ -150,7 +146,6 @@ main(int argc, char *argv[]) {"password", no_argument, NULL, 'W'}, {"no-privileges", no_argument, NULL, 'x'}, {"no-acl", no_argument, NULL, 'x'}, - {"format", required_argument, NULL, 'F'}, /* * the following options don't have an equivalent short option letter @@ -183,11 +178,9 @@ main(int argc, char *argv[]) {"no-sync", no_argument, NULL, 4}, {"no-toast-compression", no_argument, &no_toast_compression, 1}, {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1}, - {"with-data", no_argument, &with_data, 1}, - {"with-schema", no_argument, &with_schema, 1}, - {"with-statistics", no_argument, &with_statistics, 1}, {"on-conflict-do-nothing", no_argument, &on_conflict_do_nothing, 1}, {"rows-per-insert", required_argument, NULL, 7}, + {"statistics", no_argument, &with_statistics, 1}, {"statistics-only", no_argument, &statistics_only, 1}, {"filter", required_argument, NULL, 8}, {"sequence-data", no_argument, &sequence_data, 1}, @@ -201,8 +194,6 @@ main(int argc, char *argv[]) char *pgdb = NULL; char *use_role = NULL; const char *dumpencoding = NULL; - ArchiveFormat archDumpFormat = archNull; - const char *formatName = "p"; trivalue prompt_password = TRI_DEFAULT; bool data_only = false; bool globals_only = false; @@ -252,7 +243,7 @@ main(int argc, char *argv[]) pgdumpopts = createPQExpBuffer(); - while ((c = getopt_long(argc, argv, "acd:E:f:F:gh:l:Op:rsS:tU:vwWx", long_options, &optindex)) != -1) + while ((c = getopt_long(argc, argv, "acd:E:f:gh:l:Op:rsS:tU:vwWx", long_options, &optindex)) != -1) { switch (c) { @@ -280,9 +271,7 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " -f "); appendShellString(pgdumpopts, filename); break; - case 'F': - formatName = pg_strdup(optarg); - break; + case 'g': globals_only = true; break; @@ -431,21 +420,6 @@ main(int argc, char *argv[]) exit_nicely(1); } - /* Get format for dump. */ - archDumpFormat = parseDumpFormat(formatName); - - /* - * If a non-plain format is specified, a file name is also required as the - * path to the main directory. 
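The option tables touched above rely on getopt_long's flag form: when the third member of a struct option entry is a non-NULL int pointer, getopt_long stores the fourth member into it and returns 0 instead of an option character. A minimal standalone illustration (the option name is borrowed from the table; nothing else here is pg_dumpall code):

    #include <getopt.h>
    #include <stdio.h>

    static int with_statistics = 0;    /* written directly by getopt_long */

    int
    main(int argc, char *argv[])
    {
        static struct option long_options[] = {
            {"statistics", no_argument, &with_statistics, 1},
            {NULL, 0, NULL, 0}
        };

        while (getopt_long(argc, argv, "", long_options, NULL) != -1)
            ;                          /* flag entries need no case label */

        printf("with_statistics = %d\n", with_statistics);
        return 0;
    }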
- */ - if (archDumpFormat != archNull && - (!filename || strcmp(filename, "") == 0)) - { - pg_log_error("option -F/--format=d|c|t requires option -f/--file"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } - /* * If password values are not required in the dump, switch to using * pg_roles which is equally useful, just more likely to have unrestricted @@ -497,12 +471,8 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " --no-toast-compression"); if (no_unlogged_table_data) appendPQExpBufferStr(pgdumpopts, " --no-unlogged-table-data"); - if (with_data) - appendPQExpBufferStr(pgdumpopts, " --with-data"); - if (with_schema) - appendPQExpBufferStr(pgdumpopts, " --with-schema"); if (with_statistics) - appendPQExpBufferStr(pgdumpopts, " --with-statistics"); + appendPQExpBufferStr(pgdumpopts, " --statistics"); if (on_conflict_do_nothing) appendPQExpBufferStr(pgdumpopts, " --on-conflict-do-nothing"); if (statistics_only) @@ -511,33 +481,6 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " --sequence-data"); /* - * Open the output file if required, otherwise use stdout. If required, - * then create new directory and global.dat file. - */ - if (archDumpFormat != archNull) - { - char global_path[MAXPGPATH]; - - /* Create new directory or accept the empty existing directory. */ - create_or_open_dir(filename); - - snprintf(global_path, MAXPGPATH, "%s/global.dat", filename); - - OPF = fopen(global_path, PG_BINARY_W); - if (!OPF) - pg_fatal("could not open file \"%s\": %m", global_path); - } - else if (filename) - { - OPF = fopen(filename, PG_BINARY_W); - if (!OPF) - pg_fatal("could not open output file \"%s\": %m", - filename); - } - else - OPF = stdout; - - /* * If there was a database specified on the command line, use that, * otherwise try to connect to database "postgres", and failing that * "template1". @@ -577,6 +520,19 @@ main(int argc, char *argv[]) &database_exclude_names); /* + * Open the output file if required, otherwise use stdout + */ + if (filename) + { + OPF = fopen(filename, PG_BINARY_W); + if (!OPF) + pg_fatal("could not open output file \"%s\": %m", + filename); + } + else + OPF = stdout; + + /* * Set the client encoding if requested. */ if (dumpencoding) @@ -632,7 +588,7 @@ main(int argc, char *argv[]) fprintf(OPF, "SET escape_string_warning = off;\n"); fprintf(OPF, "\n"); - if (!data_only) + if (!data_only && !statistics_only && !no_schema) { /* * If asked to --clean, do that first. 
We can avoid detailed @@ -675,7 +631,7 @@ main(int argc, char *argv[]) } if (!globals_only && !roles_only && !tablespaces_only) - dumpDatabases(conn, archDumpFormat); + dumpDatabases(conn); PQfinish(conn); @@ -688,7 +644,7 @@ main(int argc, char *argv[]) fclose(OPF); /* sync the resulting file, errors are not fatal */ - if (dosync && (archDumpFormat == archNull)) + if (dosync) (void) fsync_fname(filename, false); } @@ -699,14 +655,12 @@ main(int argc, char *argv[]) static void help(void) { - printf(_("%s exports a PostgreSQL database cluster as an SQL script or to other formats.\n\n"), progname); + printf(_("%s exports a PostgreSQL database cluster as an SQL script.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]...\n"), progname); printf(_("\nGeneral options:\n")); printf(_(" -f, --file=FILENAME output file name\n")); - printf(_(" -F, --format=c|d|t|p output file format (custom, directory, tar,\n" - " plain text (default))\n")); printf(_(" -v, --verbose verbose mode\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" --lock-wait-timeout=TIMEOUT fail after waiting TIMEOUT for a table lock\n")); @@ -750,13 +704,11 @@ help(void) printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); printf(_(" --rows-per-insert=NROWS number of rows per INSERT; implies --inserts\n")); printf(_(" --sequence-data include sequence data in dump\n")); + printf(_(" --statistics dump the statistics\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data dump the data\n")); - printf(_(" --with-schema dump the schema\n")); - printf(_(" --with-statistics dump the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=CONNSTR connect using connection string\n")); @@ -1013,6 +965,9 @@ dumpRoles(PGconn *conn) * We do it this way because config settings for roles could mention the * names of other roles. */ + if (PQntuples(res) > 0) + fprintf(OPF, "\n--\n-- User Configurations\n--\n"); + for (i = 0; i < PQntuples(res); i++) dumpUserConfig(conn, PQgetvalue(res, i, i_rolname)); @@ -1526,7 +1481,6 @@ dumpUserConfig(PGconn *conn, const char *username) { PQExpBuffer buf = createPQExpBuffer(); PGresult *res; - static bool header_done = false; printfPQExpBuffer(buf, "SELECT unnest(setconfig) FROM pg_db_role_setting " "WHERE setdatabase = 0 AND setrole = " @@ -1538,13 +1492,7 @@ dumpUserConfig(PGconn *conn, const char *username) res = executeQuery(conn, buf->data); if (PQntuples(res) > 0) - { - if (!header_done) - fprintf(OPF, "\n--\n-- User Configurations\n--\n"); - header_done = true; - fprintf(OPF, "\n--\n-- User Config \"%s\"\n--\n\n", username); - } for (int i = 0; i < PQntuples(res); i++) { @@ -1618,13 +1566,10 @@ expand_dbname_patterns(PGconn *conn, * Dump contents of databases. */ static void -dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) +dumpDatabases(PGconn *conn) { PGresult *res; int i; - char db_subdir[MAXPGPATH]; - char dbfilepath[MAXPGPATH]; - FILE *map_file = NULL; /* * Skip databases marked not datallowconn, since we'd be unable to connect @@ -1638,42 +1583,18 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) * doesn't have some failure mode with --clean. 
*/ res = executeQuery(conn, - "SELECT datname, oid " + "SELECT datname " "FROM pg_database d " "WHERE datallowconn AND datconnlimit != -2 " "ORDER BY (datname <> 'template1'), datname"); - if (archDumpFormat == archNull && PQntuples(res) > 0) + if (PQntuples(res) > 0) fprintf(OPF, "--\n-- Databases\n--\n\n"); - /* - * If directory/tar/custom format is specified, create a subdirectory - * under the main directory and each database dump file or subdirectory - * will be created in that subdirectory by pg_dump. - */ - if (archDumpFormat != archNull) - { - char map_file_path[MAXPGPATH]; - - snprintf(db_subdir, MAXPGPATH, "%s/databases", filename); - - /* Create a subdirectory with 'databases' name under main directory. */ - if (mkdir(db_subdir, pg_dir_create_mode) != 0) - pg_fatal("could not create directory \"%s\": %m", db_subdir); - - snprintf(map_file_path, MAXPGPATH, "%s/map.dat", filename); - - /* Create a map file (to store dboid and dbname) */ - map_file = fopen(map_file_path, PG_BINARY_W); - if (!map_file) - pg_fatal("could not open file \"%s\": %m", map_file_path); - } - for (i = 0; i < PQntuples(res); i++) { char *dbname = PQgetvalue(res, i, 0); - char *oid = PQgetvalue(res, i, 1); - const char *create_opts = ""; + const char *create_opts; int ret; /* Skip template0, even if it's not marked !datallowconn. */ @@ -1687,27 +1608,9 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) continue; } - /* - * If this is not a plain format dump, then append dboid and dbname to - * the map.dat file. - */ - if (archDumpFormat != archNull) - { - if (archDumpFormat == archCustom) - snprintf(dbfilepath, MAXPGPATH, "\"%s\"/\"%s\".dmp", db_subdir, oid); - else if (archDumpFormat == archTar) - snprintf(dbfilepath, MAXPGPATH, "\"%s\"/\"%s\".tar", db_subdir, oid); - else - snprintf(dbfilepath, MAXPGPATH, "\"%s\"/\"%s\"", db_subdir, oid); - - /* Put one line entry for dboid and dbname in map file. 
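The ORDER BY in the query above exploits boolean ordering: false sorts before true, so the key (datname <> 'template1') is false only for template1, which therefore comes first, while every other database follows in plain name order. For the set {alpha, postgres, template1} the computed keys are (t, alpha), (t, postgres), (f, template1), giving the output order template1, alpha, postgres.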
*/ - fprintf(map_file, "%s %s\n", oid, dbname); - } - pg_log_info("dumping database \"%s\"", dbname); - if (archDumpFormat == archNull) - fprintf(OPF, "--\n-- Database \"%s\" dump\n--\n\n", dbname); + fprintf(OPF, "--\n-- Database \"%s\" dump\n--\n\n", dbname); /* * We assume that "template1" and "postgres" already exist in the @@ -1721,9 +1624,12 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) { if (output_clean) create_opts = "--clean --create"; - /* Since pg_dump won't emit a \connect command, we must */ - else if (archDumpFormat == archNull) + else + { + create_opts = ""; + /* Since pg_dump won't emit a \connect command, we must */ fprintf(OPF, "\\connect %s\n\n", dbname); + } } else create_opts = "--create"; @@ -1731,30 +1637,19 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) if (filename) fclose(OPF); - ret = runPgDump(dbname, create_opts, dbfilepath, archDumpFormat); + ret = runPgDump(dbname, create_opts); if (ret != 0) pg_fatal("pg_dump failed on database \"%s\", exiting", dbname); if (filename) { - char global_path[MAXPGPATH]; - - if (archDumpFormat != archNull) - snprintf(global_path, MAXPGPATH, "%s/global.dat", filename); - else - snprintf(global_path, MAXPGPATH, "%s", filename); - - OPF = fopen(global_path, PG_BINARY_A); + OPF = fopen(filename, PG_BINARY_A); if (!OPF) pg_fatal("could not re-open the output file \"%s\": %m", - global_path); + filename); } } - /* Close map file */ - if (archDumpFormat != archNull) - fclose(map_file); - PQclear(res); } @@ -1764,8 +1659,7 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) * Run pg_dump on dbname, with specified options. */ static int -runPgDump(const char *dbname, const char *create_opts, char *dbfile, - ArchiveFormat archDumpFormat) +runPgDump(const char *dbname, const char *create_opts) { PQExpBufferData connstrbuf; PQExpBufferData cmd; @@ -1774,36 +1668,17 @@ runPgDump(const char *dbname, const char *create_opts, char *dbfile, initPQExpBuffer(&connstrbuf); initPQExpBuffer(&cmd); + printfPQExpBuffer(&cmd, "\"%s\" %s %s", pg_dump_bin, + pgdumpopts->data, create_opts); + /* - * If this is not a plain format dump, then append file name and dump - * format to the pg_dump command to get archive dump. + * If we have a filename, use the undocumented plain-append pg_dump + * format. */ - if (archDumpFormat != archNull) - { - printfPQExpBuffer(&cmd, "\"%s\" -f %s %s", pg_dump_bin, - dbfile, create_opts); - - if (archDumpFormat == archDirectory) - appendPQExpBufferStr(&cmd, " --format=directory "); - else if (archDumpFormat == archCustom) - appendPQExpBufferStr(&cmd, " --format=custom "); - else if (archDumpFormat == archTar) - appendPQExpBufferStr(&cmd, " --format=tar "); - } + if (filename) + appendPQExpBufferStr(&cmd, " -Fa "); else - { - printfPQExpBuffer(&cmd, "\"%s\" %s %s", pg_dump_bin, - pgdumpopts->data, create_opts); - - /* - * If we have a filename, use the undocumented plain-append pg_dump - * format. - */ - if (filename) - appendPQExpBufferStr(&cmd, " -Fa "); - else - appendPQExpBufferStr(&cmd, " -Fp "); - } + appendPQExpBufferStr(&cmd, " -Fp "); /* * Append the database name to the already-constructed stem of connection @@ -1948,36 +1823,3 @@ read_dumpall_filters(const char *filename, SimpleStringList *pattern) filter_free(&fstate); } - -/* - * parseDumpFormat - * - * This will validate dump formats. 
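The surviving runPgDump above assembles the child command in two steps: the preformatted option text plus the create options first, then the format switch (-Fa when appending to a named output file, -Fp otherwise). A rough standalone sketch of that two-step assembly using plain libc string calls instead of PQExpBuffer (the path and options are invented for illustration):

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        char        cmd[1024];
        const char *pg_dump_bin = "/usr/local/pgsql/bin/pg_dump"; /* assumed */
        const char *opts = "--no-sync --statistics";
        int         have_filename = 1;

        snprintf(cmd, sizeof(cmd), "\"%s\" %s", pg_dump_bin, opts);
        strncat(cmd, have_filename ? " -Fa " : " -Fp ",
                sizeof(cmd) - strlen(cmd) - 1);
        puts(cmd);
        return 0;
    }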
- */ -static ArchiveFormat -parseDumpFormat(const char *format) -{ - ArchiveFormat archDumpFormat; - - if (pg_strcasecmp(format, "c") == 0) - archDumpFormat = archCustom; - else if (pg_strcasecmp(format, "custom") == 0) - archDumpFormat = archCustom; - else if (pg_strcasecmp(format, "d") == 0) - archDumpFormat = archDirectory; - else if (pg_strcasecmp(format, "directory") == 0) - archDumpFormat = archDirectory; - else if (pg_strcasecmp(format, "p") == 0) - archDumpFormat = archNull; - else if (pg_strcasecmp(format, "plain") == 0) - archDumpFormat = archNull; - else if (pg_strcasecmp(format, "t") == 0) - archDumpFormat = archTar; - else if (pg_strcasecmp(format, "tar") == 0) - archDumpFormat = archTar; - else - pg_fatal("unrecognized output format \"%s\"; please specify \"c\", \"d\", \"p\", or \"t\"", - format); - - return archDumpFormat; -} diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index 6ef789cb06d..6c129278bc5 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -2,7 +2,7 @@ * * pg_restore.c * pg_restore is an utility extracting postgres database definitions - * from a backup archive created by pg_dump/pg_dumpall using the archiver + * from a backup archive created by pg_dump using the archiver * interface. * * pg_restore will read the backup archive and @@ -41,15 +41,11 @@ #include "postgres_fe.h" #include <ctype.h> -#include <sys/stat.h> #ifdef HAVE_TERMIOS_H #include <termios.h> #endif -#include "common/string.h" -#include "connectdb.h" #include "fe_utils/option_utils.h" -#include "fe_utils/string_utils.h" #include "filter.h" #include "getopt_long.h" #include "parallel.h" @@ -57,43 +53,18 @@ static void usage(const char *progname); static void read_restore_filters(const char *filename, RestoreOptions *opts); -static bool file_exists_in_directory(const char *dir, const char *filename); -static int restore_one_database(const char *inputFileSpec, RestoreOptions *opts, - int numWorkers, bool append_data, int num); -static int read_one_statement(StringInfo inBuf, FILE *pfile); -static int restore_all_databases(PGconn *conn, const char *dumpdirpath, - SimpleStringList db_exclude_patterns, RestoreOptions *opts, int numWorkers); -static int process_global_sql_commands(PGconn *conn, const char *dumpdirpath, - const char *outfile); -static void copy_or_print_global_file(const char *outfile, FILE *pfile); -static int get_dbnames_list_to_restore(PGconn *conn, - SimplePtrList *dbname_oid_list, - SimpleStringList db_exclude_patterns); -static int get_dbname_oid_list_from_mfile(const char *dumpdirpath, - SimplePtrList *dbname_oid_list); - -/* - * Stores a database OID and the corresponding name. 
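The DbOidName struct removed just below uses the flexible-array-member idiom: the struct ends with an unsized array, and a single allocation of offsetof(struct, member) plus the string length carries both the OID and the name. A self-contained sketch of the same pattern (the type is renamed; this is not the removed code itself):

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct NamedOid
    {
        unsigned int oid;
        char         str[];        /* C99 flexible array member */
    } NamedOid;

    int
    main(void)
    {
        const char *name = "template1";
        NamedOid   *obj = malloc(offsetof(NamedOid, str) + strlen(name) + 1);

        obj->oid = 1;
        strcpy(obj->str, name);
        printf("%u %s\n", obj->oid, obj->str);
        free(obj);
        return 0;
    }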
- */ -typedef struct DbOidName -{ - Oid oid; - char str[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated string here */ -} DbOidName; - int main(int argc, char **argv) { RestoreOptions *opts; int c; + int exit_code; int numWorkers = 1; + Archive *AH; char *inputFileSpec; bool data_only = false; bool schema_only = false; - int n_errors = 0; - bool globals_only = false; - SimpleStringList db_exclude_patterns = {NULL, NULL}; static int disable_triggers = 0; static int enable_row_security = 0; static int if_exists = 0; @@ -111,15 +82,12 @@ main(int argc, char **argv) static int no_subscriptions = 0; static int strict_names = 0; static int statistics_only = 0; - static int with_data = 0; - static int with_schema = 0; static int with_statistics = 0; struct option cmdopts[] = { {"clean", 0, NULL, 'c'}, {"create", 0, NULL, 'C'}, {"data-only", 0, NULL, 'a'}, - {"globals-only", 0, NULL, 'g'}, {"dbname", 1, NULL, 'd'}, {"exit-on-error", 0, NULL, 'e'}, {"exclude-schema", 1, NULL, 'N'}, @@ -169,12 +137,9 @@ main(int argc, char **argv) {"no-security-labels", no_argument, &no_security_labels, 1}, {"no-subscriptions", no_argument, &no_subscriptions, 1}, {"no-statistics", no_argument, &no_statistics, 1}, - {"with-data", no_argument, &with_data, 1}, - {"with-schema", no_argument, &with_schema, 1}, - {"with-statistics", no_argument, &with_statistics, 1}, + {"statistics", no_argument, &with_statistics, 1}, {"statistics-only", no_argument, &statistics_only, 1}, {"filter", required_argument, NULL, 4}, - {"exclude-database", required_argument, NULL, 6}, {NULL, 0, NULL, 0} }; @@ -203,7 +168,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "acCd:ef:F:gh:I:j:lL:n:N:Op:P:RsS:t:T:U:vwWx1", + while ((c = getopt_long(argc, argv, "acCd:ef:F:h:I:j:lL:n:N:Op:P:RsS:t:T:U:vwWx1", cmdopts, NULL)) != -1) { switch (c) @@ -230,14 +195,11 @@ main(int argc, char **argv) if (strlen(optarg) != 0) opts->formatName = pg_strdup(optarg); break; - case 'g': - /* restore only global.dat file from directory */ - globals_only = true; - break; case 'h': if (strlen(optarg) != 0) opts->cparams.pghost = pg_strdup(optarg); break; + case 'j': /* number of restore jobs */ if (!option_parse_int(optarg, "-j/--jobs", 1, PG_MAX_JOBS, @@ -352,9 +314,6 @@ main(int argc, char **argv) exit(1); opts->exit_on_error = true; break; - case 6: /* database patterns to skip */ - simple_string_list_append(&db_exclude_patterns, optarg); - break; default: /* getopt_long already emitted a complaint */ @@ -382,13 +341,6 @@ main(int argc, char **argv) if (!opts->cparams.dbname && !opts->filename && !opts->tocSummary) pg_fatal("one of -d/--dbname and -f/--file must be specified"); - if (db_exclude_patterns.head != NULL && globals_only) - { - pg_log_error("option --exclude-database cannot be used together with -g/--globals-only"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } - /* Should get at most one of -d and -f, else user is confused */ if (opts->cparams.dbname) { @@ -417,13 +369,17 @@ main(int argc, char **argv) if (statistics_only && no_statistics) pg_fatal("options --statistics-only and --no-statistics cannot be used together"); - /* reject conflicting "with-" and "no-" options */ - if (with_data && no_data) - pg_fatal("options --with-data and --no-data cannot be used together"); - if (with_schema && no_schema) - pg_fatal("options --with-schema and --no-schema cannot be used together"); + /* reject conflicting "no-" options */ if (with_statistics && no_statistics) - pg_fatal("options 
--with-statistics and --no-statistics cannot be used together"); + pg_fatal("options --statistics and --no-statistics cannot be used together"); + + /* reject conflicting "only-" options */ + if (data_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--statistics"); + if (schema_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--statistics"); if (data_only && opts->dropSchema) pg_fatal("options -c/--clean and -a/--data-only cannot be used together"); @@ -443,16 +399,14 @@ main(int argc, char **argv) pg_fatal("cannot specify both --single-transaction and multiple jobs"); /* - * Set derivative flags. An "-only" option may be overridden by an - * explicit "with-" option; e.g. "--schema-only --with-statistics" will - * include schema and statistics. Other ambiguous or nonsensical - * combinations, e.g. "--schema-only --no-schema", will have already - * caused an error in one of the checks above. + * Set derivative flags. Ambiguous or nonsensical combinations, e.g. + * "--schema-only --no-schema", will have already caused an error in one + * of the checks above. */ opts->dumpData = ((opts->dumpData && !schema_only && !statistics_only) || - (data_only || with_data)) && !no_data; + data_only) && !no_data; opts->dumpSchema = ((opts->dumpSchema && !data_only && !statistics_only) || - (schema_only || with_schema)) && !no_schema; + schema_only) && !no_schema; opts->dumpStatistics = ((opts->dumpStatistics && !schema_only && !data_only) || (statistics_only || with_statistics)) && !no_statistics; @@ -496,114 +450,6 @@ main(int argc, char **argv) opts->formatName); } - /* - * If toc.dat file is not present in the current path, then check for - * global.dat. If global.dat file is present, then restore all the - * databases from map.dat (if it exists), but skip restoring those - * matching --exclude-database patterns. - */ - if (inputFileSpec != NULL && !file_exists_in_directory(inputFileSpec, "toc.dat") && - file_exists_in_directory(inputFileSpec, "global.dat")) - { - PGconn *conn = NULL; /* Connection to restore global sql - * commands. */ - - /* - * Can only use --list or --use-list options with a single database - * dump. - */ - if (opts->tocSummary) - pg_fatal("option -l/--list cannot be used when restoring an archive created by pg_dumpall"); - else if (opts->tocFile) - pg_fatal("option -L/--use-list cannot be used when restoring an archive created by pg_dumpall"); - - /* - * To restore from a pg_dumpall archive, -C (create database) option - * must be specified unless we are only restoring globals. - */ - if (!globals_only && opts->createDB != 1) - { - pg_log_error("option -C/--create must be specified when restoring an archive created by pg_dumpall"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - pg_log_error_hint("Individual databases can be restored using their specific archives."); - exit_nicely(1); - } - - /* - * Connect to the database to execute global sql commands from - * global.dat file. - */ - if (opts->cparams.dbname) - { - conn = ConnectDatabase(opts->cparams.dbname, NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - - - if (!conn) - pg_fatal("could not connect to database \"%s\"", opts->cparams.dbname); - } - - /* If globals-only, then return from here. */ - if (globals_only) - { - /* - * Open global.dat file and execute/append all the global sql - * commands. 
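The derived-flag assignments above compress the option interactions into one boolean expression per category: the default survives only if no competing "-only" switch was given, an explicit "-only" switch forces the category on, and the matching "no-" switch vetoes it last. The same rule as a standalone function with two spot checks (the names are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    static bool
    derive_dump_data(bool base, bool data_only, bool schema_only,
                     bool statistics_only, bool no_data)
    {
        return ((base && !schema_only && !statistics_only) || data_only)
            && !no_data;
    }

    int
    main(void)
    {
        /* --schema-only turns data off: prints 0 */
        printf("%d\n", derive_dump_data(true, false, true, false, false));
        /* -a/--data-only forces data on: prints 1 */
        printf("%d\n", derive_dump_data(false, true, false, false, false));
        return 0;
    }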
- */ - n_errors = process_global_sql_commands(conn, inputFileSpec, - opts->filename); - - if (conn) - PQfinish(conn); - - pg_log_info("database restoring skipped because option -g/--globals-only was specified"); - } - else - { - /* Now restore all the databases from map.dat */ - n_errors = restore_all_databases(conn, inputFileSpec, db_exclude_patterns, - opts, numWorkers); - } - - /* Free db pattern list. */ - simple_string_list_destroy(&db_exclude_patterns); - } - else /* process if global.dat file does not exist. */ - { - if (db_exclude_patterns.head != NULL) - pg_fatal("option --exclude-database can be used only when restoring an archive created by pg_dumpall"); - - if (globals_only) - pg_fatal("option -g/--globals-only can be used only when restoring an archive created by pg_dumpall"); - - n_errors = restore_one_database(inputFileSpec, opts, numWorkers, false, 0); - } - - /* Done, print a summary of ignored errors during restore. */ - if (n_errors) - { - pg_log_warning("errors ignored on restore: %d", n_errors); - return 1; - } - - return 0; -} - -/* - * restore_one_database - * - * This will restore one database using toc.dat file. - * - * returns the number of errors while doing restore. - */ -static int -restore_one_database(const char *inputFileSpec, RestoreOptions *opts, - int numWorkers, bool append_data, int num) -{ - Archive *AH; - int n_errors; - AH = OpenArchive(inputFileSpec, opts->format); SetArchiveOptions(AH, NULL, opts); @@ -611,15 +457,9 @@ restore_one_database(const char *inputFileSpec, RestoreOptions *opts, /* * We don't have a connection yet but that doesn't matter. The connection * is initialized to NULL and if we terminate through exit_nicely() while - * it's still NULL, the cleanup function will just be a no-op. If we are - * restoring multiple databases, then only update AX handle for cleanup as - * the previous entry was already in the array and we had closed previous - * connection, so we can use the same array slot. + * it's still NULL, the cleanup function will just be a no-op. */ - if (!append_data || num == 0) - on_exit_close_archive(AH); - else - replace_on_exit_close_archive(AH); + on_exit_close_archive(AH); /* Let the archiver know how noisy to be */ AH->verbose = opts->verbose; @@ -639,21 +479,25 @@ restore_one_database(const char *inputFileSpec, RestoreOptions *opts, else { ProcessArchiveRestoreOptions(AH); - RestoreArchive(AH, append_data); + RestoreArchive(AH); } - n_errors = AH->n_errors; + /* done, print a summary of ignored errors */ + if (AH->n_errors) + pg_log_warning("errors ignored on restore: %d", AH->n_errors); /* AH may be freed in CloseArchive? */ + exit_code = AH->n_errors ? 1 : 0; + CloseArchive(AH); - return n_errors; + return exit_code; } static void usage(const char *progname) { - printf(_("%s restores PostgreSQL databases from archives created by pg_dump or pg_dumpall.\n\n"), progname); + printf(_("%s restores a PostgreSQL database from an archive created by pg_dump.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]... 
[FILE]\n"), progname); @@ -671,7 +515,6 @@ usage(const char *progname) printf(_(" -c, --clean clean (drop) database objects before recreating\n")); printf(_(" -C, --create create the target database\n")); printf(_(" -e, --exit-on-error exit on error, default is to continue\n")); - printf(_(" -g, --globals-only restore only global objects, no databases\n")); printf(_(" -I, --index=NAME restore named index\n")); printf(_(" -j, --jobs=NUM use this many parallel jobs to restore\n")); printf(_(" -L, --use-list=FILENAME use table of contents from this file for\n" @@ -688,7 +531,6 @@ usage(const char *progname) printf(_(" -1, --single-transaction restore as a single transaction\n")); printf(_(" --disable-triggers disable triggers during data-only restore\n")); printf(_(" --enable-row-security enable row security\n")); - printf(_(" --exclude-database=PATTERN do not restore the specified database(s)\n")); printf(_(" --filter=FILENAME restore or skip objects based on expressions\n" " in FILENAME\n")); printf(_(" --if-exists use IF EXISTS when dropping objects\n")); @@ -705,6 +547,7 @@ usage(const char *progname) printf(_(" --no-table-access-method do not restore table access methods\n")); printf(_(" --no-tablespaces do not restore tablespace assignments\n")); printf(_(" --section=SECTION restore named section (pre-data, data, or post-data)\n")); + printf(_(" --statistics restore the statistics\n")); printf(_(" --statistics-only restore only the statistics, not schema or data\n")); printf(_(" --strict-names require table and/or schema include patterns to\n" " match at least one entity each\n")); @@ -712,9 +555,6 @@ usage(const char *progname) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data restore the data\n")); - printf(_(" --with-schema restore the schema\n")); - printf(_(" --with-statistics restore the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); @@ -725,8 +565,8 @@ usage(const char *progname) printf(_(" --role=ROLENAME do SET ROLE before restore\n")); printf(_("\n" - "The options -I, -n, -N, -P, -t, -T, --section, and --exclude-database can be\n" - "combined and specified multiple times to select multiple objects.\n")); + "The options -I, -n, -N, -P, -t, -T, and --section can be combined and specified\n" + "multiple times to select multiple objects.\n")); printf(_("\nIf no input file name is supplied, then standard input is used.\n\n")); printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT); printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); @@ -831,585 +671,3 @@ read_restore_filters(const char *filename, RestoreOptions *opts) filter_free(&fstate); } - -/* - * file_exists_in_directory - * - * Returns true if the file exists in the given directory. - */ -static bool -file_exists_in_directory(const char *dir, const char *filename) -{ - struct stat st; - char buf[MAXPGPATH]; - - if (snprintf(buf, MAXPGPATH, "%s/%s", dir, filename) >= MAXPGPATH) - pg_fatal("directory name too long: \"%s\"", dir); - - return (stat(buf, &st) == 0 && S_ISREG(st.st_mode)); -} - -/* - * read_one_statement - * - * This will start reading from passed file pointer using fgetc and read till - * semicolon(sql statement terminator for global.dat file) - * - * EOF is returned if end-of-file input is seen; time to shut down. 
- */ - -static int -read_one_statement(StringInfo inBuf, FILE *pfile) -{ - int c; /* character read from getc() */ - int m; - - StringInfoData q; - - initStringInfo(&q); - - resetStringInfo(inBuf); - - /* - * Read characters until EOF or the appropriate delimiter is seen. - */ - while ((c = fgetc(pfile)) != EOF) - { - if (c != '\'' && c != '"' && c != '\n' && c != ';') - { - appendStringInfoChar(inBuf, (char) c); - while ((c = fgetc(pfile)) != EOF) - { - if (c != '\'' && c != '"' && c != ';' && c != '\n') - appendStringInfoChar(inBuf, (char) c); - else - break; - } - } - - if (c == '\'' || c == '"') - { - appendStringInfoChar(&q, (char) c); - m = c; - - while ((c = fgetc(pfile)) != EOF) - { - appendStringInfoChar(&q, (char) c); - - if (c == m) - { - appendStringInfoString(inBuf, q.data); - resetStringInfo(&q); - break; - } - } - } - - if (c == ';') - { - appendStringInfoChar(inBuf, (char) ';'); - break; - } - - if (c == '\n') - appendStringInfoChar(inBuf, (char) '\n'); - } - - pg_free(q.data); - - /* No input before EOF signal means time to quit. */ - if (c == EOF && inBuf->len == 0) - return EOF; - - /* return something that's not EOF */ - return 'Q'; -} - -/* - * get_dbnames_list_to_restore - * - * This will mark for skipping any entries from dbname_oid_list that pattern match an - * entry in the db_exclude_patterns list. - * - * Returns the number of database to be restored. - * - */ -static int -get_dbnames_list_to_restore(PGconn *conn, - SimplePtrList *dbname_oid_list, - SimpleStringList db_exclude_patterns) -{ - int count_db = 0; - PQExpBuffer query; - PGresult *res; - - query = createPQExpBuffer(); - - if (!conn) - pg_log_info("considering PATTERN as NAME for --exclude-database option as no database connection while doing pg_restore"); - - /* - * Process one by one all dbnames and if specified to skip restoring, then - * remove dbname from list. 
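The matching loop that follows tests each database name in two tiers: an exact case-insensitive comparison first, then, only when a server connection is available, a pattern match evaluated server-side through processSQLNamePattern. A reduced sketch of that control flow (should_skip and has_conn are stand-ins, and the server-side branch is stubbed out):

    #include <stdio.h>
    #include <strings.h>

    static int
    should_skip(const char *dbname, const char *pattern, int has_conn)
    {
        if (strcasecmp(dbname, pattern) == 0)
            return 1;              /* exact match always excludes */
        if (!has_conn)
            return 0;              /* no connection: pattern acts as a name */
        /* real code: build "SELECT 1" + processSQLNamePattern() and run it */
        return 0;
    }

    int
    main(void)
    {
        printf("%d\n", should_skip("mydb", "MYDB", 0));   /* prints 1 */
        return 0;
    }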
- */ - for (SimplePtrListCell *db_cell = dbname_oid_list->head; - db_cell; db_cell = db_cell->next) - { - DbOidName *dbidname = (DbOidName *) db_cell->ptr; - bool skip_db_restore = false; - PQExpBuffer db_lit = createPQExpBuffer(); - - appendStringLiteralConn(db_lit, dbidname->str, conn); - - for (SimpleStringListCell *pat_cell = db_exclude_patterns.head; pat_cell; pat_cell = pat_cell->next) - { - /* - * If there is an exact match then we don't need to try a pattern - * match - */ - if (pg_strcasecmp(dbidname->str, pat_cell->val) == 0) - skip_db_restore = true; - /* Otherwise, try a pattern match if there is a connection */ - else if (conn) - { - int dotcnt; - - appendPQExpBufferStr(query, "SELECT 1 "); - processSQLNamePattern(conn, query, pat_cell->val, false, - false, NULL, db_lit->data, - NULL, NULL, NULL, &dotcnt); - - if (dotcnt > 0) - { - pg_log_error("improper qualified name (too many dotted names): %s", - dbidname->str); - PQfinish(conn); - exit_nicely(1); - } - - res = executeQuery(conn, query->data); - - if ((PQresultStatus(res) == PGRES_TUPLES_OK) && PQntuples(res)) - { - skip_db_restore = true; - pg_log_info("database name \"%s\" matches exclude pattern \"%s\"", dbidname->str, pat_cell->val); - } - - PQclear(res); - resetPQExpBuffer(query); - } - - if (skip_db_restore) - break; - } - - destroyPQExpBuffer(db_lit); - - /* - * Mark db to be skipped or increment the counter of dbs to be - * restored - */ - if (skip_db_restore) - { - pg_log_info("excluding database \"%s\"", dbidname->str); - dbidname->oid = InvalidOid; - } - else - { - count_db++; - } - } - - destroyPQExpBuffer(query); - - return count_db; -} - -/* - * get_dbname_oid_list_from_mfile - * - * Open map.dat file and read line by line and then prepare a list of database - * names and corresponding db_oid. - * - * Returns, total number of database names in map.dat file. - */ -static int -get_dbname_oid_list_from_mfile(const char *dumpdirpath, SimplePtrList *dbname_oid_list) -{ - StringInfoData linebuf; - FILE *pfile; - char map_file_path[MAXPGPATH]; - int count = 0; - - - /* - * If there is only global.dat file in dump, then return from here as - * there is no database to restore. - */ - if (!file_exists_in_directory(dumpdirpath, "map.dat")) - { - pg_log_info("database restoring is skipped because file \"%s\" does not exist in directory \"%s\"", "map.dat", dumpdirpath); - return 0; - } - - snprintf(map_file_path, MAXPGPATH, "%s/map.dat", dumpdirpath); - - /* Open map.dat file. */ - pfile = fopen(map_file_path, PG_BINARY_R); - - if (pfile == NULL) - pg_fatal("could not open file \"%s\": %m", map_file_path); - - initStringInfo(&linebuf); - - /* Append all the dbname/db_oid combinations to the list. */ - while (pg_get_line_buf(pfile, &linebuf)) - { - Oid db_oid = InvalidOid; - char *dbname; - DbOidName *dbidname; - int namelen; - char *p = linebuf.data; - - /* Extract dboid. */ - while (isdigit((unsigned char) *p)) - p++; - if (p > linebuf.data && *p == ' ') - { - sscanf(linebuf.data, "%u", &db_oid); - p++; - } - - /* dbname is the rest of the line */ - dbname = p; - namelen = strlen(dbname); - - /* Report error and exit if the file has any corrupted data. 
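The extraction above reads each map.dat line as "<oid> <dbname>": the digits up to the first space are the OID and the remainder of the line is the name, with anything else rejected as corrupt. A standalone parser with the same validation rules (the helper name is invented):

    #include <ctype.h>
    #include <stdio.h>

    /* Returns 0 and fills oid/dbname on success, -1 on a corrupt line. */
    static int
    parse_map_line(const char *line, unsigned int *oid, const char **dbname)
    {
        const char *p = line;

        while (isdigit((unsigned char) *p))
            p++;
        if (p == line || *p != ' ')
            return -1;             /* no leading OID or no separator */
        sscanf(line, "%u", oid);
        *dbname = p + 1;
        return (**dbname == '\0') ? -1 : 0;
    }

    int
    main(void)
    {
        unsigned int oid;
        const char *name;

        if (parse_map_line("16384 mydb", &oid, &name) == 0)
            printf("%u -> %s\n", oid, name);
        return 0;
    }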
*/ - if (!OidIsValid(db_oid) || namelen <= 1) - pg_fatal("invalid entry in file \"%s\" on line %d", map_file_path, - count + 1); - - pg_log_info("found database \"%s\" (OID: %u) in file \"%s\"", - dbname, db_oid, map_file_path); - - dbidname = pg_malloc(offsetof(DbOidName, str) + namelen + 1); - dbidname->oid = db_oid; - strlcpy(dbidname->str, dbname, namelen); - - simple_ptr_list_append(dbname_oid_list, dbidname); - count++; - } - - /* Close map.dat file. */ - fclose(pfile); - - return count; -} - -/* - * restore_all_databases - * - * This will restore databases those dumps are present in - * directory based on map.dat file mapping. - * - * This will skip restoring for databases that are specified with - * exclude-database option. - * - * returns, number of errors while doing restore. - */ -static int -restore_all_databases(PGconn *conn, const char *dumpdirpath, - SimpleStringList db_exclude_patterns, RestoreOptions *opts, - int numWorkers) -{ - SimplePtrList dbname_oid_list = {NULL, NULL}; - int num_db_restore = 0; - int num_total_db; - int n_errors_total; - int count = 0; - char *connected_db = NULL; - bool dumpData = opts->dumpData; - bool dumpSchema = opts->dumpSchema; - bool dumpStatistics = opts->dumpSchema; - - /* Save db name to reuse it for all the database. */ - if (opts->cparams.dbname) - connected_db = opts->cparams.dbname; - - num_total_db = get_dbname_oid_list_from_mfile(dumpdirpath, &dbname_oid_list); - - /* If map.dat has no entries, return after processing global.dat */ - if (dbname_oid_list.head == NULL) - return process_global_sql_commands(conn, dumpdirpath, opts->filename); - - pg_log_info(ngettext("found %d database name in \"%s\"", - "found %d database names in \"%s\"", - num_total_db), - num_total_db, "map.dat"); - - if (!conn) - { - pg_log_info("trying to connect to database \"%s\"", "postgres"); - - conn = ConnectDatabase("postgres", NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - - /* Try with template1. */ - if (!conn) - { - pg_log_info("trying to connect to database \"%s\"", "template1"); - - conn = ConnectDatabase("template1", NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - } - } - - /* - * filter the db list according to the exclude patterns - */ - num_db_restore = get_dbnames_list_to_restore(conn, &dbname_oid_list, - db_exclude_patterns); - - /* Open global.dat file and execute/append all the global sql commands. */ - n_errors_total = process_global_sql_commands(conn, dumpdirpath, opts->filename); - - /* Close the db connection as we are done with globals and patterns. */ - if (conn) - PQfinish(conn); - - /* Exit if no db needs to be restored. */ - if (dbname_oid_list.head == NULL || num_db_restore == 0) - { - pg_log_info(ngettext("no database needs restoring out of %d database", - "no database needs restoring out of %d databases", num_total_db), - num_total_db); - return n_errors_total; - } - - pg_log_info("need to restore %d databases out of %d databases", num_db_restore, num_total_db); - - /* - * We have a list of databases to restore after processing the - * exclude-database switch(es). Now we can restore them one by one. 
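For each surviving entry, the loop below locates the database's dump under <dumpdir>/databases by probing <oid>.tar first, then <oid>.dmp, then falling back to a directory named <oid>. The same probe order as a standalone helper (file_exists mirrors the removed file_exists_in_directory check):

    #include <stdio.h>
    #include <sys/stat.h>

    static int
    file_exists(const char *path)
    {
        struct stat st;

        return stat(path, &st) == 0 && S_ISREG(st.st_mode);
    }

    static void
    resolve_db_archive(const char *dumpdir, unsigned int oid,
                       char *out, size_t outlen)
    {
        snprintf(out, outlen, "%s/databases/%u.tar", dumpdir, oid);
        if (file_exists(out))
            return;
        snprintf(out, outlen, "%s/databases/%u.dmp", dumpdir, oid);
        if (file_exists(out))
            return;
        /* neither file: assume a directory-format dump named after the OID */
        snprintf(out, outlen, "%s/databases/%u", dumpdir, oid);
    }

    int
    main(void)
    {
        char path[1024];

        resolve_db_archive("/tmp/dumpdir", 16384, path, sizeof(path));
        puts(path);
        return 0;
    }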
- */ - for (SimplePtrListCell *db_cell = dbname_oid_list.head; - db_cell; db_cell = db_cell->next) - { - DbOidName *dbidname = (DbOidName *) db_cell->ptr; - char subdirpath[MAXPGPATH]; - char subdirdbpath[MAXPGPATH]; - char dbfilename[MAXPGPATH]; - int n_errors; - - /* ignore dbs marked for skipping */ - if (dbidname->oid == InvalidOid) - continue; - - /* - * We need to reset override_dbname so that objects can be restored - * into an already created database. (used with -d/--dbname option) - */ - if (opts->cparams.override_dbname) - { - pfree(opts->cparams.override_dbname); - opts->cparams.override_dbname = NULL; - } - - snprintf(subdirdbpath, MAXPGPATH, "%s/databases", dumpdirpath); - - /* - * Look for the database dump file/dir. If there is an {oid}.tar or - * {oid}.dmp file, use it. Otherwise try to use a directory called - * {oid} - */ - snprintf(dbfilename, MAXPGPATH, "%u.tar", dbidname->oid); - if (file_exists_in_directory(subdirdbpath, dbfilename)) - snprintf(subdirpath, MAXPGPATH, "%s/databases/%u.tar", dumpdirpath, dbidname->oid); - else - { - snprintf(dbfilename, MAXPGPATH, "%u.dmp", dbidname->oid); - - if (file_exists_in_directory(subdirdbpath, dbfilename)) - snprintf(subdirpath, MAXPGPATH, "%s/databases/%u.dmp", dumpdirpath, dbidname->oid); - else - snprintf(subdirpath, MAXPGPATH, "%s/databases/%u", dumpdirpath, dbidname->oid); - } - - pg_log_info("restoring database \"%s\"", dbidname->str); - - /* If database is already created, then don't set createDB flag. */ - if (opts->cparams.dbname) - { - PGconn *test_conn; - - test_conn = ConnectDatabase(dbidname->str, NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - if (test_conn) - { - PQfinish(test_conn); - - /* Use already created database for connection. */ - opts->createDB = 0; - opts->cparams.dbname = dbidname->str; - } - else - { - /* we'll have to create it */ - opts->createDB = 1; - opts->cparams.dbname = connected_db; - } - } - - /* - * Reset flags - might have been reset in pg_backup_archiver.c by the - * previous restore. - */ - opts->dumpData = dumpData; - opts->dumpSchema = dumpSchema; - opts->dumpStatistics = dumpStatistics; - - /* Restore the single database. */ - n_errors = restore_one_database(subdirpath, opts, numWorkers, true, count); - - /* Print a summary of ignored errors during single database restore. */ - if (n_errors) - { - n_errors_total += n_errors; - pg_log_warning("errors ignored on database \"%s\" restore: %d", dbidname->str, n_errors); - } - - count++; - } - - /* Log number of processed databases. */ - pg_log_info("number of restored databases is %d", num_db_restore); - - /* Free dbname and dboid list. */ - simple_ptr_list_destroy(&dbname_oid_list); - - return n_errors_total; -} - -/* - * process_global_sql_commands - * - * Open global.dat and execute or copy the sql commands one by one. - * - * If outfile is not NULL, copy all sql commands into outfile rather than - * executing them. - * - * Returns the number of errors while processing global.dat - */ -static int -process_global_sql_commands(PGconn *conn, const char *dumpdirpath, const char *outfile) -{ - char global_file_path[MAXPGPATH]; - PGresult *result; - StringInfoData sqlstatement, - user_create; - FILE *pfile; - int n_errors = 0; - - snprintf(global_file_path, MAXPGPATH, "%s/global.dat", dumpdirpath); - - /* Open global.dat file. 
*/ - pfile = fopen(global_file_path, PG_BINARY_R); - - if (pfile == NULL) - pg_fatal("could not open file \"%s\": %m", global_file_path); - - /* - * If outfile is given, then just copy all global.dat file data into - * outfile. - */ - if (outfile) - { - copy_or_print_global_file(outfile, pfile); - return 0; - } - - /* Init sqlstatement to append commands. */ - initStringInfo(&sqlstatement); - - /* creation statement for our current role */ - initStringInfo(&user_create); - appendStringInfoString(&user_create, "CREATE ROLE "); - /* should use fmtId here, but we don't know the encoding */ - appendStringInfoString(&user_create, PQuser(conn)); - appendStringInfoChar(&user_create, ';'); - - /* Process file till EOF and execute sql statements. */ - while (read_one_statement(&sqlstatement, pfile) != EOF) - { - /* don't try to create the role we are connected as */ - if (strstr(sqlstatement.data, user_create.data)) - continue; - - pg_log_info("executing query: %s", sqlstatement.data); - result = PQexec(conn, sqlstatement.data); - - switch (PQresultStatus(result)) - { - case PGRES_COMMAND_OK: - case PGRES_TUPLES_OK: - case PGRES_EMPTY_QUERY: - break; - default: - n_errors++; - pg_log_error("could not execute query: %s", PQerrorMessage(conn)); - pg_log_error_detail("Command was: %s", sqlstatement.data); - } - PQclear(result); - } - - /* Print a summary of ignored errors during global.dat. */ - if (n_errors) - pg_log_warning(ngettext("ignored %d error in file \"%s\"", - "ignored %d errors in file \"%s\"", n_errors), - n_errors, global_file_path); - fclose(pfile); - - return n_errors; -} - -/* - * copy_or_print_global_file - * - * Copy global.dat into the output file. If "-" is used as outfile, - * then print commands to stdout. - */ -static void -copy_or_print_global_file(const char *outfile, FILE *pfile) -{ - char out_file_path[MAXPGPATH]; - FILE *OPF; - int c; - - /* "-" is used for stdout. */ - if (strcmp(outfile, "-") == 0) - OPF = stdout; - else - { - snprintf(out_file_path, MAXPGPATH, "%s", outfile); - OPF = fopen(out_file_path, PG_BINARY_W); - - if (OPF == NULL) - { - fclose(pfile); - pg_fatal("could not open file: \"%s\"", outfile); - } - } - - /* Append global.dat into output file or print to stdout. */ - while ((c = fgetc(pfile)) != EOF) - fputc(c, OPF); - - fclose(pfile); - - /* Close output file. 
*/ - if (strcmp(outfile, "-") != 0) - fclose(OPF); -} diff --git a/src/bin/pg_dump/t/001_basic.pl b/src/bin/pg_dump/t/001_basic.pl index c3c5fae11ea..37d893d5e6a 100644 --- a/src/bin/pg_dump/t/001_basic.pl +++ b/src/bin/pg_dump/t/001_basic.pl @@ -237,24 +237,6 @@ command_fails_like( 'pg_restore: options -C\/--create and -1\/--single-transaction cannot be used together' ); -command_fails_like( - [ 'pg_restore', '--exclude-database=foo', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: option --exclude-database cannot be used together with -g\/--globals-only\E/, - 'pg_restore: option --exclude-database cannot be used together with -g/--globals-only' -); - -command_fails_like( - [ 'pg_restore', '--exclude-database=foo', '-d', 'xxx', 'dumpdir' ], - qr/\Qpg_restore: error: option --exclude-database can be used only when restoring an archive created by pg_dumpall\E/, - 'When option --exclude-database is used in pg_restore with dump of pg_dump' -); - -command_fails_like( - [ 'pg_restore', '--globals-only', '-d', 'xxx', 'dumpdir' ], - qr/\Qpg_restore: error: option -g\/--globals-only can be used only when restoring an archive created by pg_dumpall\E/, - 'When option --globals-only is not used in pg_restore with dump of pg_dump' -); - # also fails for -r and -t, but it seems pointless to add more tests for those. command_fails_like( [ 'pg_dumpall', '--exclude-database=foo', '--globals-only' ], @@ -262,8 +244,4 @@ command_fails_like( 'pg_dumpall: option --exclude-database cannot be used together with -g/--globals-only' ); -command_fails_like( - [ 'pg_dumpall', '--format', 'x' ], - qr/\Qpg_dumpall: error: unrecognized output format "x";\E/, - 'pg_dumpall: unrecognized output format'); done_testing(); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 2485d8f360e..a86b38466de 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -68,7 +68,7 @@ my %pgdump_runs = ( '--no-data', '--sequence-data', '--binary-upgrade', - '--with-statistics', + '--statistics', '--dbname' => 'postgres', # alternative way to specify database ], restore_cmd => [ @@ -76,7 +76,7 @@ my %pgdump_runs = ( '--format' => 'custom', '--verbose', '--file' => "$tempdir/binary_upgrade.sql", - '--with-statistics', + '--statistics', "$tempdir/binary_upgrade.dump", ], }, @@ -90,13 +90,13 @@ my %pgdump_runs = ( '--format' => 'custom', '--compress' => '1', '--file' => "$tempdir/compression_gzip_custom.dump", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/compression_gzip_custom.sql", - '--with-statistics', + '--statistics', "$tempdir/compression_gzip_custom.dump", ], command_like => { @@ -119,7 +119,7 @@ my %pgdump_runs = ( '--format' => 'directory', '--compress' => 'gzip:1', '--file' => "$tempdir/compression_gzip_dir", - '--with-statistics', + '--statistics', 'postgres', ], # Give coverage for manually compressed blobs.toc files during @@ -137,7 +137,7 @@ my %pgdump_runs = ( 'pg_restore', '--jobs' => '2', '--file' => "$tempdir/compression_gzip_dir.sql", - '--with-statistics', + '--statistics', "$tempdir/compression_gzip_dir", ], }, @@ -150,7 +150,7 @@ my %pgdump_runs = ( '--format' => 'plain', '--compress' => '1', '--file' => "$tempdir/compression_gzip_plain.sql.gz", - '--with-statistics', + '--statistics', 'postgres', ], # Decompress the generated file to run through the tests. 
@@ -169,13 +169,13 @@ my %pgdump_runs = ( '--format' => 'custom', '--compress' => 'lz4', '--file' => "$tempdir/compression_lz4_custom.dump", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/compression_lz4_custom.sql", - '--with-statistics', + '--statistics', "$tempdir/compression_lz4_custom.dump", ], command_like => { @@ -198,7 +198,7 @@ my %pgdump_runs = ( '--format' => 'directory', '--compress' => 'lz4:1', '--file' => "$tempdir/compression_lz4_dir", - '--with-statistics', + '--statistics', 'postgres', ], # Verify that data files were compressed @@ -210,7 +210,7 @@ my %pgdump_runs = ( 'pg_restore', '--jobs' => '2', '--file' => "$tempdir/compression_lz4_dir.sql", - '--with-statistics', + '--statistics', "$tempdir/compression_lz4_dir", ], }, @@ -223,7 +223,7 @@ my %pgdump_runs = ( '--format' => 'plain', '--compress' => 'lz4', '--file' => "$tempdir/compression_lz4_plain.sql.lz4", - '--with-statistics', + '--statistics', 'postgres', ], # Decompress the generated file to run through the tests. @@ -245,13 +245,13 @@ my %pgdump_runs = ( '--format' => 'custom', '--compress' => 'zstd', '--file' => "$tempdir/compression_zstd_custom.dump", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/compression_zstd_custom.sql", - '--with-statistics', + '--statistics', "$tempdir/compression_zstd_custom.dump", ], command_like => { @@ -273,7 +273,7 @@ my %pgdump_runs = ( '--format' => 'directory', '--compress' => 'zstd:1', '--file' => "$tempdir/compression_zstd_dir", - '--with-statistics', + '--statistics', 'postgres', ], # Give coverage for manually compressed blobs.toc files during @@ -294,7 +294,7 @@ my %pgdump_runs = ( 'pg_restore', '--jobs' => '2', '--file' => "$tempdir/compression_zstd_dir.sql", - '--with-statistics', + '--statistics', "$tempdir/compression_zstd_dir", ], }, @@ -308,7 +308,7 @@ my %pgdump_runs = ( '--format' => 'plain', '--compress' => 'zstd:long', '--file' => "$tempdir/compression_zstd_plain.sql.zst", - '--with-statistics', + '--statistics', 'postgres', ], # Decompress the generated file to run through the tests. 
@@ -327,7 +327,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/clean.sql", '--clean', - '--with-statistics', + '--statistics', '--dbname' => 'postgres', # alternative way to specify database ], }, @@ -338,7 +338,7 @@ my %pgdump_runs = ( '--clean', '--if-exists', '--encoding' => 'UTF8', # no-op, just for testing - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -357,7 +357,7 @@ my %pgdump_runs = ( '--create', '--no-reconnect', # no-op, just for testing '--verbose', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -376,7 +376,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/defaults.sql", - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -385,7 +385,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/defaults_no_public.sql", - '--with-statistics', + '--statistics', 'regress_pg_dump_test', ], }, @@ -395,7 +395,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--clean', '--file' => "$tempdir/defaults_no_public_clean.sql", - '--with-statistics', + '--statistics', 'regress_pg_dump_test', ], }, @@ -404,7 +404,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/defaults_public_owner.sql", - '--with-statistics', + '--statistics', 'regress_public_owner', ], }, @@ -419,14 +419,14 @@ my %pgdump_runs = ( 'pg_dump', '--format' => 'custom', '--file' => "$tempdir/defaults_custom_format.dump", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--format' => 'custom', '--file' => "$tempdir/defaults_custom_format.sql", - '--with-statistics', + '--statistics', "$tempdir/defaults_custom_format.dump", ], command_like => { @@ -451,14 +451,14 @@ my %pgdump_runs = ( 'pg_dump', '--format' => 'directory', '--file' => "$tempdir/defaults_dir_format", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--format' => 'directory', '--file' => "$tempdir/defaults_dir_format.sql", - '--with-statistics', + '--statistics', "$tempdir/defaults_dir_format", ], command_like => { @@ -484,13 +484,13 @@ my %pgdump_runs = ( '--format' => 'directory', '--jobs' => 2, '--file' => "$tempdir/defaults_parallel", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/defaults_parallel.sql", - '--with-statistics', + '--statistics', "$tempdir/defaults_parallel", ], }, @@ -502,14 +502,14 @@ my %pgdump_runs = ( 'pg_dump', '--format' => 'tar', '--file' => "$tempdir/defaults_tar_format.tar", - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--format' => 'tar', '--file' => "$tempdir/defaults_tar_format.sql", - '--with-statistics', + '--statistics', "$tempdir/defaults_tar_format.tar", ], }, @@ -518,7 +518,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/exclude_dump_test_schema.sql", '--exclude-schema' => 'dump_test', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -527,7 +527,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/exclude_test_table.sql", '--exclude-table' => 'dump_test.test_table', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -536,7 +536,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/exclude_measurement.sql", '--exclude-table-and-children' => 'dump_test.measurement', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -546,7 +546,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/exclude_measurement_data.sql", 
'--exclude-table-data-and-children' => 'dump_test.measurement', '--no-unlogged-table-data', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -556,7 +556,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/exclude_test_table_data.sql", '--exclude-table-data' => 'dump_test.test_table', '--no-unlogged-table-data', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -575,7 +575,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/pg_dumpall_globals.sql", '--globals-only', '--no-sync', - '--with-statistics', + '--statistics', ], }, pg_dumpall_globals_clean => { @@ -585,14 +585,14 @@ my %pgdump_runs = ( '--globals-only', '--clean', '--no-sync', - '--with-statistics', + '--statistics', ], }, pg_dumpall_dbprivs => { dump_cmd => [ 'pg_dumpall', '--no-sync', '--file' => "$tempdir/pg_dumpall_dbprivs.sql", - '--with-statistics', + '--statistics', ], }, pg_dumpall_exclude => { @@ -602,7 +602,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/pg_dumpall_exclude.sql", '--exclude-database' => '*dump_test*', '--no-sync', - '--with-statistics', + '--statistics', ], }, no_toast_compression => { @@ -610,7 +610,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_toast_compression.sql", '--no-toast-compression', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -619,7 +619,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_large_objects.sql", '--no-large-objects', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -628,7 +628,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_policies.sql", '--no-policies', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -637,7 +637,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_privs.sql", '--no-privileges', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -646,7 +646,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_owner.sql", '--no-owner', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -655,7 +655,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_table_access_method.sql", '--no-table-access-method', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -664,7 +664,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/only_dump_test_schema.sql", '--schema' => 'dump_test', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -675,7 +675,7 @@ my %pgdump_runs = ( '--table' => 'dump_test.test_table', '--lock-wait-timeout' => (1000 * $PostgreSQL::Test::Utils::timeout_default), - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -686,7 +686,7 @@ my %pgdump_runs = ( '--table-and-children' => 'dump_test.measurement', '--lock-wait-timeout' => (1000 * $PostgreSQL::Test::Utils::timeout_default), - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -696,7 +696,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/role.sql", '--role' => 'regress_dump_test_role', '--schema' => 'dump_test_second_schema', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -709,13 +709,13 @@ my %pgdump_runs = ( '--file' => "$tempdir/role_parallel", '--role' => 'regress_dump_test_role', '--schema' => 'dump_test_second_schema', - '--with-statistics', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/role_parallel.sql", - '--with-statistics', + '--statistics', "$tempdir/role_parallel", ], }, @@ -744,7 +744,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/section_pre_data.sql", 
'--section' => 'pre-data', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -753,7 +753,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/section_data.sql", '--section' => 'data', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -762,7 +762,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/section_post_data.sql", '--section' => 'post-data', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -773,7 +773,7 @@ my %pgdump_runs = ( '--schema' => 'dump_test', '--large-objects', '--no-large-objects', - '--with-statistics', + '--statistics', 'postgres', ], }, @@ -789,7 +789,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', "--file=$tempdir/no_data_no_schema.sql", '--no-data', '--no-schema', 'postgres', - '--with-statistics', + '--statistics', ], }, statistics_only => { @@ -799,18 +799,11 @@ my %pgdump_runs = ( 'postgres', ], }, - schema_only_with_statistics => { - dump_cmd => [ - 'pg_dump', '--no-sync', - "--file=$tempdir/schema_only_with_statistics.sql", - '--schema-only', '--with-statistics', 'postgres', - ], - }, no_schema => { dump_cmd => [ 'pg_dump', '--no-sync', "--file=$tempdir/no_schema.sql", '--no-schema', - '--with-statistics', 'postgres', + '--statistics', 'postgres', ], },); @@ -1087,6 +1080,7 @@ my %tests = ( test_schema_plus_large_objects => 1, }, unlike => { + binary_upgrade => 1, no_large_objects => 1, no_owner => 1, schema_only => 1, @@ -1605,6 +1599,7 @@ my %tests = ( test_schema_plus_large_objects => 1, }, unlike => { + binary_upgrade => 1, schema_only => 1, schema_only_with_statistics => 1, no_large_objects => 1, @@ -2377,17 +2372,19 @@ my %tests = ( create_sql => 'CREATE DOMAIN dump_test.us_postal_code AS TEXT COLLATE "C" DEFAULT \'10014\' + CONSTRAINT nn NOT NULL CHECK(VALUE ~ \'^\d{5}$\' OR VALUE ~ \'^\d{5}-\d{4}$\'); + COMMENT ON CONSTRAINT nn + ON DOMAIN dump_test.us_postal_code IS \'not null\'; COMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS \'check it\';', regexp => qr/^ - \QCREATE DOMAIN dump_test.us_postal_code AS text COLLATE pg_catalog."C" DEFAULT '10014'::text\E\n\s+ + \QCREATE DOMAIN dump_test.us_postal_code AS text COLLATE pg_catalog."C" CONSTRAINT nn NOT NULL DEFAULT '10014'::text\E\n\s+ \QCONSTRAINT us_postal_code_check CHECK \E \Q(((VALUE ~ '^\d{5}\E \$\Q'::text) OR (VALUE ~ '^\d{5}-\d{4}\E\$ \Q'::text)));\E(.|\n)* - \QCOMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS 'check it';\E /xm, like => { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, @@ -2397,6 +2394,30 @@ my %tests = ( }, }, + 'COMMENT ON CONSTRAINT ON DOMAIN (1)' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT nn ON DOMAIN dump_test.us_postal_code IS 'not null';\E + /xm, + like => + { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + + 'COMMENT ON CONSTRAINT ON DOMAIN (2)' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS 'check it';\E + /xm, + like => + { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + 'CREATE FUNCTION dump_test.pltestlang_call_handler' => { create_order => 17, create_sql => 'CREATE FUNCTION dump_test.pltestlang_call_handler() @@ -4612,9 +4633,9 @@ my %tests = ( no_schema => 1, section_data => 1, test_schema_plus_large_objects => 1, - binary_upgrade => 1, }, unlike => { + 
binary_upgrade => 1, no_large_objects => 1, no_privs => 1, schema_only => 1, @@ -5184,6 +5205,17 @@ command_fails_like( 'pg_dump', '--port' => $port, '--strict-names', + '--schema-only', + '--statistics', + ], + qr/\Qpg_dump: error: options -s\/--schema-only and --statistics cannot be used together\E/, + 'cannot use --schema-only and --statistics together'); + +command_fails_like( + [ + 'pg_dump', + '--port' => $port, + '--strict-names', '--table' => 'nonexistent*' ], qr/\Qpg_dump: error: no matching tables were found for pattern\E/, diff --git a/src/bin/pg_dump/t/006_pg_dumpall.pl b/src/bin/pg_dump/t/006_pg_dumpall.pl deleted file mode 100644 index c274b777586..00000000000 --- a/src/bin/pg_dump/t/006_pg_dumpall.pl +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2021-2025, PostgreSQL Global Development Group - -use strict; -use warnings FATAL => 'all'; - -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -my $tempdir = PostgreSQL::Test::Utils::tempdir; -my $run_db = 'postgres'; -my $sep = $windows_os ? "\\" : "/"; - -# Tablespace locations used by "restore_tablespace" test case. -my $tablespace1 = "${tempdir}${sep}tbl1"; -my $tablespace2 = "${tempdir}${sep}tbl2"; -mkdir($tablespace1) || die "mkdir $tablespace1 $!"; -mkdir($tablespace2) || die "mkdir $tablespace2 $!"; - -# Scape tablespace locations on Windows. -$tablespace1 = $windows_os ? ($tablespace1 =~ s/\\/\\\\/gr) : $tablespace1; -$tablespace2 = $windows_os ? ($tablespace2 =~ s/\\/\\\\/gr) : $tablespace2; - -# Where pg_dumpall will be executed. -my $node = PostgreSQL::Test::Cluster->new('node'); -$node->init; -$node->start; - - -############################################################### -# Definition of the pg_dumpall test cases to run. -# -# Each of these test cases are named and those names are used for fail -# reporting and also to save the dump and restore information needed for the -# test to assert. -# -# The "setup_sql" is a psql valid script that contains SQL commands to execute -# before of actually execute the tests. The setups are all executed before of -# any test execution. -# -# The "dump_cmd" and "restore_cmd" are the commands that will be executed. The -# "restore_cmd" must have the --file flag to save the restore output so that we -# can assert on it. -# -# The "like" and "unlike" is a regexp that is used to match the pg_restore -# output. It must have at least one of then filled per test cases but it also -# can have both. See "excluding_databases" test case for example. 
-my %pgdumpall_runs = ( - restore_roles => { - setup_sql => ' - CREATE ROLE dumpall WITH ENCRYPTED PASSWORD \'admin\' SUPERUSER; - CREATE ROLE dumpall2 WITH REPLICATION CONNECTION LIMIT 10;', - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/restore_roles", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/restore_roles.sql", - "$tempdir/restore_roles", - ], - like => qr/ - ^\s*\QCREATE ROLE dumpall;\E\s*\n - \s*\QALTER ROLE dumpall WITH SUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS PASSWORD 'SCRAM-SHA-256\E - [^']+';\s*\n - \s*\QCREATE ROLE dumpall2;\E - \s*\QALTER ROLE dumpall2 WITH NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOLOGIN REPLICATION NOBYPASSRLS CONNECTION LIMIT 10;\E - /xm - }, - - restore_tablespace => { - setup_sql => " - CREATE ROLE tap; - CREATE TABLESPACE tbl1 OWNER tap LOCATION '$tablespace1'; - CREATE TABLESPACE tbl2 OWNER tap LOCATION '$tablespace2' WITH (seq_page_cost=1.0);", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/restore_tablespace", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/restore_tablespace.sql", - "$tempdir/restore_tablespace", - ], - # Match "E" as optional since it is added on LOCATION when running on - # Windows. - like => qr/^ - \n\QCREATE TABLESPACE tbl1 OWNER tap LOCATION \E(?:E)?\Q'$tablespace1';\E - \n\QCREATE TABLESPACE tbl2 OWNER tap LOCATION \E(?:E)?\Q'$tablespace2';\E - \n\QALTER TABLESPACE tbl2 SET (seq_page_cost=1.0);\E - /xm, - }, - - restore_grants => { - setup_sql => " - CREATE DATABASE tapgrantsdb; - CREATE SCHEMA private; - CREATE SEQUENCE serial START 101; - CREATE FUNCTION fn() RETURNS void AS \$\$ - BEGIN - END; - \$\$ LANGUAGE plpgsql; - CREATE ROLE super; - CREATE ROLE grant1; - CREATE ROLE grant2; - CREATE ROLE grant3; - CREATE ROLE grant4; - CREATE ROLE grant5; - CREATE ROLE grant6; - CREATE ROLE grant7; - CREATE ROLE grant8; - - CREATE TABLE t (id int); - INSERT INTO t VALUES (1), (2), (3), (4); - - GRANT SELECT ON TABLE t TO grant1; - GRANT INSERT ON TABLE t TO grant2; - GRANT ALL PRIVILEGES ON TABLE t to grant3; - GRANT CONNECT, CREATE ON DATABASE tapgrantsdb TO grant4; - GRANT USAGE, CREATE ON SCHEMA private TO grant5; - GRANT USAGE, SELECT, UPDATE ON SEQUENCE serial TO grant6; - GRANT super TO grant7; - GRANT EXECUTE ON FUNCTION fn() TO grant8; - ", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/restore_grants", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/restore_grants.sql", - "$tempdir/restore_grants", - ], - like => qr/^ - \n\QGRANT super TO grant7 WITH INHERIT TRUE GRANTED BY\E - (.*\n)* - \n\QGRANT ALL ON SCHEMA private TO grant5;\E - (.*\n)* - \n\QGRANT ALL ON FUNCTION public.fn() TO grant8;\E - (.*\n)* - \n\QGRANT ALL ON SEQUENCE public.serial TO grant6;\E - (.*\n)* - \n\QGRANT SELECT ON TABLE public.t TO grant1;\E - \n\QGRANT INSERT ON TABLE public.t TO grant2;\E - \n\QGRANT ALL ON TABLE public.t TO grant3;\E - (.*\n)* - \n\QGRANT CREATE,CONNECT ON DATABASE tapgrantsdb TO grant4;\E - /xm, - }, - - excluding_databases => { - setup_sql => 'CREATE DATABASE db1; - \c db1 - CREATE TABLE t1 (id int); - INSERT INTO t1 VALUES (1), (2), (3), (4); - CREATE TABLE t2 (id int); - INSERT INTO t2 VALUES (1), (2), (3), (4); - - CREATE DATABASE db2; - \c db2 - CREATE TABLE t3 (id int); - INSERT INTO t3 VALUES (1), (2), (3), (4); - 
CREATE TABLE t4 (id int); - INSERT INTO t4 VALUES (1), (2), (3), (4); - - CREATE DATABASE dbex3; - \c dbex3 - CREATE TABLE t5 (id int); - INSERT INTO t5 VALUES (1), (2), (3), (4); - CREATE TABLE t6 (id int); - INSERT INTO t6 VALUES (1), (2), (3), (4); - - CREATE DATABASE dbex4; - \c dbex4 - CREATE TABLE t7 (id int); - INSERT INTO t7 VALUES (1), (2), (3), (4); - CREATE TABLE t8 (id int); - INSERT INTO t8 VALUES (1), (2), (3), (4); - - CREATE DATABASE db5; - \c db5 - CREATE TABLE t9 (id int); - INSERT INTO t9 VALUES (1), (2), (3), (4); - CREATE TABLE t10 (id int); - INSERT INTO t10 VALUES (1), (2), (3), (4); - ', - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/excluding_databases", - '--exclude-database' => 'dbex*', - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/excluding_databases.sql", - '--exclude-database' => 'db5', - "$tempdir/excluding_databases", - ], - like => qr/^ - \n\QCREATE DATABASE db1\E - (.*\n)* - \n\QCREATE TABLE public.t1 (\E - (.*\n)* - \n\QCREATE TABLE public.t2 (\E - (.*\n)* - \n\QCREATE DATABASE db2\E - (.*\n)* - \n\QCREATE TABLE public.t3 (\E - (.*\n)* - \n\QCREATE TABLE public.t4 (/xm, - unlike => qr/^ - \n\QCREATE DATABASE db3\E - (.*\n)* - \n\QCREATE TABLE public.t5 (\E - (.*\n)* - \n\QCREATE TABLE public.t6 (\E - (.*\n)* - \n\QCREATE DATABASE db4\E - (.*\n)* - \n\QCREATE TABLE public.t7 (\E - (.*\n)* - \n\QCREATE TABLE public.t8 (\E - \n\QCREATE DATABASE db5\E - (.*\n)* - \n\QCREATE TABLE public.t9 (\E - (.*\n)* - \n\QCREATE TABLE public.t10 (\E - /xm, - }, - - format_directory => { - setup_sql => "CREATE TABLE format_directory(a int, b boolean, c text); - INSERT INTO format_directory VALUES (1, true, 'name1'), (2, false, 'name2');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/format_directory", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/format_directory.sql", - "$tempdir/format_directory", - ], - like => qr/^\n\QCOPY public.format_directory (a, b, c) FROM stdin;/xm - }, - - format_tar => { - setup_sql => "CREATE TABLE format_tar(a int, b boolean, c text); - INSERT INTO format_tar VALUES (1, false, 'name3'), (2, true, 'name4');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'tar', - '--file' => "$tempdir/format_tar", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'tar', - '--file' => "$tempdir/format_tar.sql", - "$tempdir/format_tar", - ], - like => qr/^\n\QCOPY public.format_tar (a, b, c) FROM stdin;/xm - }, - - format_custom => { - setup_sql => "CREATE TABLE format_custom(a int, b boolean, c text); - INSERT INTO format_custom VALUES (1, false, 'name5'), (2, true, 'name6');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'custom', - '--file' => "$tempdir/format_custom", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'custom', - '--file' => "$tempdir/format_custom.sql", - "$tempdir/format_custom", - ], - like => qr/^ \n\QCOPY public.format_custom (a, b, c) FROM stdin;/xm - }, - - dump_globals_only => { - setup_sql => "CREATE TABLE format_dir(a int, b boolean, c text); - INSERT INTO format_dir VALUES (1, false, 'name5'), (2, true, 'name6');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--globals-only', - '--file' => "$tempdir/dump_globals_only", - ], - restore_cmd => [ - 'pg_restore', '-C', '--globals-only', - '--format' => 'directory', - '--file' => "$tempdir/dump_globals_only.sql", - "$tempdir/dump_globals_only", - ], - like => 
qr/ - ^\s*\QCREATE ROLE dumpall;\E\s*\n - /xm - },); - -# First execute the setup_sql -foreach my $run (sort keys %pgdumpall_runs) -{ - if ($pgdumpall_runs{$run}->{setup_sql}) - { - $node->safe_psql($run_db, $pgdumpall_runs{$run}->{setup_sql}); - } -} - -# Execute the tests -foreach my $run (sort keys %pgdumpall_runs) -{ - # Create a new target cluster to pg_restore each test case run so that we - # don't need to take care of the cleanup from the target cluster after each - # run. - my $target_node = PostgreSQL::Test::Cluster->new("target_$run"); - $target_node->init; - $target_node->start; - - # Dumpall from node cluster. - $node->command_ok(\@{ $pgdumpall_runs{$run}->{dump_cmd} }, - "$run: pg_dumpall runs"); - - # Restore the dump on "target_node" cluster. - my @restore_cmd = ( - @{ $pgdumpall_runs{$run}->{restore_cmd} }, - '--host', $target_node->host, '--port', $target_node->port); - - my ($stdout, $stderr) = run_command(\@restore_cmd); - - # pg_restore --file output file. - my $output_file = slurp_file("$tempdir/${run}.sql"); - - if ( !($pgdumpall_runs{$run}->{like}) - && !($pgdumpall_runs{$run}->{unlike})) - { - die "missing \"like\" or \"unlike\" in test \"$run\""; - } - - if ($pgdumpall_runs{$run}->{like}) - { - like($output_file, $pgdumpall_runs{$run}->{like}, "should dump $run"); - } - - if ($pgdumpall_runs{$run}->{unlike}) - { - unlike( - $output_file, - $pgdumpall_runs{$run}->{unlike}, - "should not dump $run"); - } -} - -# Some negative test case with dump of pg_dumpall and restore using pg_restore -# test case 1: when -C is not used in pg_restore with dump of pg_dumpall -$node->command_fails_like( - [ - 'pg_restore', - "$tempdir/format_custom", - '--format' => 'custom', - '--file' => "$tempdir/error_test.sql", - ], - qr/\Qpg_restore: error: option -C\/--create must be specified when restoring an archive created by pg_dumpall\E/, - 'When -C is not used in pg_restore with dump of pg_dumpall'); - -# test case 2: When --list option is used with dump of pg_dumpall -$node->command_fails_like( - [ - 'pg_restore', - "$tempdir/format_custom", '-C', - '--format' => 'custom', - '--list', - '--file' => "$tempdir/error_test.sql", - ], - qr/\Qpg_restore: error: option -l\/--list cannot be used when restoring an archive created by pg_dumpall\E/, - 'When --list is used in pg_restore with dump of pg_dumpall'); - -# test case 3: When non-exist database is given with -d option -$node->command_fails_like( - [ - 'pg_restore', - "$tempdir/format_custom", '-C', - '--format' => 'custom', - '-d' => 'dbpq', - ], - qr/\Qpg_restore: error: could not connect to database "dbpq"\E/, - 'When non-existent database is given with -d option in pg_restore with dump of pg_dumpall' -); - -$node->stop('fast'); - -done_testing(); diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 30579ef2051..310f53c5577 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -28,7 +28,7 @@ static void check_for_pg_role_prefix(ClusterInfo *cluster); static void check_for_new_tablespace_dir(void); static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); static void check_for_unicode_update(ClusterInfo *cluster); -static void check_new_cluster_logical_replication_slots(void); +static void check_new_cluster_replication_slots(void); static void check_new_cluster_subscription_configuration(void); static void check_old_cluster_for_valid_slots(void); static void check_old_cluster_subscription_state(void); @@ -631,7 +631,7 @@ check_and_dump_old_cluster(void) * Before that the 
logical slots are not upgraded, so we will not be * able to upgrade the logical replication clusters completely. */ - get_subscription_count(&old_cluster); + get_subscription_info(&old_cluster); check_old_cluster_subscription_state(); } @@ -764,7 +764,7 @@ check_new_cluster(void) check_for_new_tablespace_dir(); - check_new_cluster_logical_replication_slots(); + check_new_cluster_replication_slots(); check_new_cluster_subscription_configuration(); } @@ -956,12 +956,12 @@ check_for_new_tablespace_dir(void) prep_status("Checking for new cluster tablespace directories"); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < new_cluster.num_tablespaces; tblnum++) { struct stat statbuf; snprintf(new_tablespace_dir, MAXPGPATH, "%s%s", - os_info.old_tablespaces[tblnum], + new_cluster.tablespaces[tblnum], new_cluster.tablespace_suffix); if (stat(new_tablespace_dir, &statbuf) == 0 || errno != ENOENT) @@ -1013,17 +1013,17 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name) * directory. We can't create a proper old cluster delete script in that * case. */ - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < new_cluster.num_tablespaces; tblnum++) { - char old_tablespace_dir[MAXPGPATH]; + char new_tablespace_dir[MAXPGPATH]; - strlcpy(old_tablespace_dir, os_info.old_tablespaces[tblnum], MAXPGPATH); - canonicalize_path(old_tablespace_dir); - if (path_is_prefix_of_path(old_cluster_pgdata, old_tablespace_dir)) + strlcpy(new_tablespace_dir, new_cluster.tablespaces[tblnum], MAXPGPATH); + canonicalize_path(new_tablespace_dir); + if (path_is_prefix_of_path(old_cluster_pgdata, new_tablespace_dir)) { /* reproduce warning from CREATE TABLESPACE that is in the log */ pg_log(PG_WARNING, - "\nWARNING: user-defined tablespace locations should not be inside the data directory, i.e. %s", old_tablespace_dir); + "\nWARNING: user-defined tablespace locations should not be inside the data directory, i.e. %s", new_tablespace_dir); /* Unlink file in case it is left over from a previous run. */ unlink(*deletion_script_file_name); @@ -1051,9 +1051,9 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name) /* delete old cluster's alternate tablespaces */ old_tblspc_suffix = pg_strdup(old_cluster.tablespace_suffix); fix_path_separator(old_tblspc_suffix); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) fprintf(script, RMDIR_CMD " %c%s%s%c\n", PATH_QUOTE, - fix_path_separator(os_info.old_tablespaces[tblnum]), + fix_path_separator(old_cluster.tablespaces[tblnum]), old_tblspc_suffix, PATH_QUOTE); pfree(old_tblspc_suffix); @@ -2040,48 +2040,80 @@ check_for_unicode_update(ClusterInfo *cluster) } /* - * check_new_cluster_logical_replication_slots() + * check_new_cluster_replication_slots() * - * Verify that there are no logical replication slots on the new cluster and - * that the parameter settings necessary for creating slots are sufficient. + * Validate the new cluster's readiness for migrating replication slots: + * - Ensures no existing logical replication slots on the new cluster when + * migrating logical slots. + * - Ensures the conflict detection slot does not exist on the new cluster when + * migrating subscriptions with retain_dead_tuples enabled. + * - Ensures that the parameter settings on the new cluster necessary for + * creating slots are sufficient.
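[Note: the combined query this renamed check builds can be exercised standalone. A libpq sketch, assuming a reachable template1 database on the new cluster; connection parameters and error handling are simplified, and the query text comes from the hunk below.]

#include <stdio.h>
#include <stdlib.h>
#include <libpq-fe.h>

int
main(void)
{
	/* Connection parameters are illustrative. */
	PGconn	   *conn = PQconnectdb("dbname=template1");
	PGresult   *res;

	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
		PQfinish(conn);
		return EXIT_FAILURE;
	}

	/* One round trip answers both questions, as in the patched check. */
	res = PQexec(conn,
				 "SELECT COUNT(*) FILTER (WHERE slot_type = 'logical' AND "
				 "temporary IS FALSE) AS nslots_on_new, "
				 "COUNT(*) FILTER (WHERE slot_name = 'pg_conflict_detection') "
				 "AS rdt_slot_on_new "
				 "FROM pg_catalog.pg_replication_slots");

	if (PQresultStatus(res) == PGRES_TUPLES_OK)
		printf("logical slots: %s, conflict detection slot: %s\n",
			   PQgetvalue(res, 0, PQfnumber(res, "nslots_on_new")),
			   PQgetvalue(res, 0, PQfnumber(res, "rdt_slot_on_new")));

	PQclear(res);
	PQfinish(conn);
	return EXIT_SUCCESS;
}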
*/ static void -check_new_cluster_logical_replication_slots(void) +check_new_cluster_replication_slots(void) { PGresult *res; PGconn *conn; int nslots_on_old; int nslots_on_new; + int rdt_slot_on_new; int max_replication_slots; char *wal_level; + int i_nslots_on_new; + int i_rdt_slot_on_new; - /* Logical slots can be migrated since PG17. */ + /* + * Logical slots can be migrated since PG17 and a physical slot + * CONFLICT_DETECTION_SLOT can be migrated since PG19. + */ if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1600) return; nslots_on_old = count_old_cluster_logical_slots(); - /* Quick return if there are no logical slots to be migrated. */ - if (nslots_on_old == 0) + /* + * Quick return if there are no slots to be migrated and no subscriptions + * have the retain_dead_tuples option enabled. + */ + if (nslots_on_old == 0 && !old_cluster.sub_retain_dead_tuples) return; conn = connectToServer(&new_cluster, "template1"); - prep_status("Checking for new cluster logical replication slots"); + prep_status("Checking for new cluster replication slots"); - res = executeQueryOrDie(conn, "SELECT count(*) " - "FROM pg_catalog.pg_replication_slots " - "WHERE slot_type = 'logical' AND " - "temporary IS FALSE;"); + res = executeQueryOrDie(conn, "SELECT %s AS nslots_on_new, %s AS rdt_slot_on_new " + "FROM pg_catalog.pg_replication_slots", + nslots_on_old > 0 + ? "COUNT(*) FILTER (WHERE slot_type = 'logical' AND temporary IS FALSE)" + : "0", + old_cluster.sub_retain_dead_tuples + ? "COUNT(*) FILTER (WHERE slot_name = 'pg_conflict_detection')" + : "0"); if (PQntuples(res) != 1) - pg_fatal("could not count the number of logical replication slots"); + pg_fatal("could not count the number of replication slots"); - nslots_on_new = atoi(PQgetvalue(res, 0, 0)); + i_nslots_on_new = PQfnumber(res, "nslots_on_new"); + i_rdt_slot_on_new = PQfnumber(res, "rdt_slot_on_new"); + + nslots_on_new = atoi(PQgetvalue(res, 0, i_nslots_on_new)); if (nslots_on_new) + { + Assert(nslots_on_old); pg_fatal("expected 0 logical replication slots but found %d", nslots_on_new); + } + + rdt_slot_on_new = atoi(PQgetvalue(res, 0, i_rdt_slot_on_new)); + + if (rdt_slot_on_new) + { + Assert(old_cluster.sub_retain_dead_tuples); + pg_fatal("The replication slot \"pg_conflict_detection\" already exists on the new cluster"); + } PQclear(res); @@ -2094,12 +2126,24 @@ check_new_cluster_logical_replication_slots(void) wal_level = PQgetvalue(res, 0, 0); - if (strcmp(wal_level, "logical") != 0) + if (nslots_on_old > 0 && strcmp(wal_level, "logical") != 0) pg_fatal("\"wal_level\" must be \"logical\" but is set to \"%s\"", wal_level); + if (old_cluster.sub_retain_dead_tuples && + strcmp(wal_level, "minimal") == 0) + pg_fatal("\"wal_level\" must be \"replica\" or \"logical\" but is set to \"%s\"", + wal_level); + max_replication_slots = atoi(PQgetvalue(res, 1, 0)); + if (old_cluster.sub_retain_dead_tuples && + nslots_on_old + 1 > max_replication_slots) + pg_fatal("\"max_replication_slots\" (%d) must be greater than or equal to the number of " + "logical replication slots on the old cluster plus one additional slot required " + "for retaining conflict detection information (%d)", + max_replication_slots, nslots_on_old + 1); + if (nslots_on_old > max_replication_slots) pg_fatal("\"max_replication_slots\" (%d) must be greater than or equal to the number of " "logical replication slots (%d) on the old cluster", @@ -2211,6 +2255,22 @@ check_old_cluster_for_valid_slots(void) "The slot \"%s\" has not consumed the WAL yet\n", slot->slotname); } + + /* + 
* The name "pg_conflict_detection" (defined as + * CONFLICT_DETECTION_SLOT) has been reserved for logical + * replication conflict detection slot since PG19. + */ + if (strcmp(slot->slotname, "pg_conflict_detection") == 0) + { + if (script == NULL && + (script = fopen_priv(output_path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %m", output_path); + + fprintf(script, + "The slot name \"%s\" is reserved\n", + slot->slotname); + } } } diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 183f08ce1e8..55f6e7b4d9c 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -58,7 +58,7 @@ generate_old_dump(void) (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? "" : "--sequence-data", log_opts.verbose ? "--verbose" : "", - user_opts.do_statistics ? "--with-statistics" : "--no-statistics", + user_opts.do_statistics ? "--statistics" : "--no-statistics", log_opts.dumpdir, sql_file_name, escaped_connstr.data); diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 4b7a56f5b3b..c39eb077c2f 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -443,10 +443,26 @@ get_db_infos(ClusterInfo *cluster) for (tupnum = 0; tupnum < ntups; tupnum++) { + char *spcloc = PQgetvalue(res, tupnum, i_spclocation); + bool inplace = spcloc[0] && !is_absolute_path(spcloc); + dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid)); dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname)); - snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s", - PQgetvalue(res, tupnum, i_spclocation)); + + /* + * The tablespace location might be "", meaning the cluster default + * location, i.e. pg_default or pg_global. For in-place tablespaces, + * pg_tablespace_location() returns a path relative to the data + * directory. + */ + if (inplace) + snprintf(dbinfos[tupnum].db_tablespace, + sizeof(dbinfos[tupnum].db_tablespace), + "%s/%s", cluster->pgdata, spcloc); + else + snprintf(dbinfos[tupnum].db_tablespace, + sizeof(dbinfos[tupnum].db_tablespace), + "%s", spcloc); } PQclear(res); @@ -616,11 +632,21 @@ process_rel_infos(DbInfo *dbinfo, PGresult *res, void *arg) /* Is the tablespace oid non-default? */ if (atooid(PQgetvalue(res, relnum, i_reltablespace)) != 0) { + char *spcloc = PQgetvalue(res, relnum, i_spclocation); + bool inplace = spcloc[0] && !is_absolute_path(spcloc); + /* * The tablespace location might be "", meaning the cluster - * default location, i.e. pg_default or pg_global. + * default location, i.e. pg_default or pg_global. For in-place + * tablespaces, pg_tablespace_location() returns a path relative + * to the data directory. */ - tablespace = PQgetvalue(res, relnum, i_spclocation); + if (inplace) + tablespace = psprintf("%s/%s", + os_info.running_cluster->pgdata, + spcloc); + else + tablespace = spcloc; /* Can we reuse the previous string allocation? */ if (last_tablespace && strcmp(tablespace, last_tablespace) == 0) @@ -630,6 +656,10 @@ process_rel_infos(DbInfo *dbinfo, PGresult *res, void *arg) last_tablespace = curr->tablespace = pg_strdup(tablespace); curr->tblsp_alloc = true; } + + /* Free palloc'd string for in-place tablespaces. */ + if (inplace) + pfree(tablespace); } else /* A zero reltablespace oid indicates the database tablespace. */ @@ -752,20 +782,33 @@ count_old_cluster_logical_slots(void) } /* - * get_subscription_count() + * get_subscription_info() * - * Gets the number of subscriptions in the cluster. + * Gets the information of subscriptions in the cluster. 
*/ void -get_subscription_count(ClusterInfo *cluster) +get_subscription_info(ClusterInfo *cluster) { PGconn *conn; PGresult *res; + int i_nsub; + int i_retain_dead_tuples; conn = connectToServer(cluster, "template1"); - res = executeQueryOrDie(conn, "SELECT count(*) " - "FROM pg_catalog.pg_subscription"); - cluster->nsubs = atoi(PQgetvalue(res, 0, 0)); + if (GET_MAJOR_VERSION(cluster->major_version) >= 1900) + res = executeQueryOrDie(conn, "SELECT count(*) AS nsub," + "COUNT(CASE WHEN subretaindeadtuples THEN 1 END) > 0 AS retain_dead_tuples " + "FROM pg_catalog.pg_subscription"); + else + res = executeQueryOrDie(conn, "SELECT count(*) AS nsub," + "'f' AS retain_dead_tuples " + "FROM pg_catalog.pg_subscription"); + + i_nsub = PQfnumber(res, "nsub"); + i_retain_dead_tuples = PQfnumber(res, "retain_dead_tuples"); + + cluster->nsubs = atoi(PQgetvalue(res, 0, i_nsub)); + cluster->sub_retain_dead_tuples = (strcmp(PQgetvalue(res, 0, i_retain_dead_tuples), "t") == 0); PQclear(res); PQfinish(conn); diff --git a/src/bin/pg_upgrade/parallel.c b/src/bin/pg_upgrade/parallel.c index 056aa2edaee..6d7941844a7 100644 --- a/src/bin/pg_upgrade/parallel.c +++ b/src/bin/pg_upgrade/parallel.c @@ -40,6 +40,7 @@ typedef struct char *old_pgdata; char *new_pgdata; char *old_tablespace; + char *new_tablespace; } transfer_thread_arg; static exec_thread_arg **exec_thread_args; @@ -171,7 +172,7 @@ win32_exec_prog(exec_thread_arg *args) void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata, - char *old_tablespace) + char *old_tablespace, char *new_tablespace) { #ifndef WIN32 pid_t child; @@ -181,7 +182,7 @@ parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, #endif if (user_opts.jobs <= 1) - transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, NULL); + transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, NULL, NULL); else { /* parallel */ @@ -225,7 +226,7 @@ parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, if (child == 0) { transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, - old_tablespace); + old_tablespace, new_tablespace); /* if we take another exit path, it will be non-zero */ /* use _exit to skip atexit() functions */ _exit(0); @@ -246,6 +247,7 @@ parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, new_arg->new_pgdata = pg_strdup(new_pgdata); pg_free(new_arg->old_tablespace); new_arg->old_tablespace = old_tablespace ? pg_strdup(old_tablespace) : NULL; + new_arg->new_tablespace = new_tablespace ? 
pg_strdup(new_tablespace) : NULL; child = (HANDLE) _beginthreadex(NULL, 0, (void *) win32_transfer_all_new_dbs, new_arg, 0, NULL); @@ -263,7 +265,8 @@ DWORD win32_transfer_all_new_dbs(transfer_thread_arg *args) { transfer_all_new_dbs(args->old_db_arr, args->new_db_arr, args->old_pgdata, - args->new_pgdata, args->old_tablespace); + args->new_pgdata, args->old_tablespace, + args->new_tablespace); /* terminates thread */ return 0; diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 536e49d2616..d5cd5bf0b3a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -67,6 +67,7 @@ static void set_frozenxids(bool minmxid_only); static void make_outputdirs(char *pgdata); static void setup(char *argv0); static void create_logical_replication_slots(void); +static void create_conflict_detection_slot(void); ClusterInfo old_cluster, new_cluster; @@ -88,6 +89,7 @@ int main(int argc, char **argv) { char *deletion_script_file_name = NULL; + bool migrate_logical_slots; /* * pg_upgrade doesn't currently use common/logging.c, but initialize it @@ -198,18 +200,39 @@ main(int argc, char **argv) new_cluster.pgdata); check_ok(); + migrate_logical_slots = count_old_cluster_logical_slots(); + /* - * Migrate the logical slots to the new cluster. Note that we need to do - * this after resetting WAL because otherwise the required WAL would be - * removed and slots would become unusable. There is a possibility that - * background processes might generate some WAL before we could create the - * slots in the new cluster but we can ignore that WAL as that won't be - * required downstream. + * Migrate replication slots to the new cluster. + * + * Note that we must migrate logical slots after resetting WAL because + * otherwise the required WAL would be removed and slots would become + * unusable. There is a possibility that background processes might + * generate some WAL before we could create the slots in the new cluster + * but we can ignore that WAL as that won't be required downstream. + * + * The conflict detection slot is not affected by concerns related to WALs + * as it only retains the dead tuples. It is created here for consistency. + * Note that the new conflict detection slot uses the latest transaction + * ID as xmin, so it cannot protect dead tuples that existed before the + * upgrade. Additionally, commit timestamps and origin data are not + * preserved during the upgrade. So, even after creating the slot, the + * upgraded subscriber may be unable to detect conflicts or log relevant + * commit timestamps and origins when applying changes from the publisher + * that occurred before the upgrade, especially if those changes were not + * replicated. It can only protect tuples that might be deleted after the + * new cluster starts. */ - if (count_old_cluster_logical_slots()) + if (migrate_logical_slots || old_cluster.sub_retain_dead_tuples) { start_postmaster(&new_cluster, true); - create_logical_replication_slots(); + + if (migrate_logical_slots) + create_logical_replication_slots(); + + if (old_cluster.sub_retain_dead_tuples) + create_conflict_detection_slot(); + stop_postmaster(false); } @@ -1025,3 +1048,24 @@ create_logical_replication_slots(void) return; } + +/* + * create_conflict_detection_slot() + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins, for migrated + * subscriptions with retain_dead_tuples enabled.
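[Note: once created, the slot can be observed from SQL. A small libpq probe mirroring the check the TAP test performs further down; the connection string is illustrative.]

#include <stdio.h>
#include <stdlib.h>
#include <libpq-fe.h>

int
main(void)
{
	/* Point this at the upgraded cluster; parameters are illustrative. */
	PGconn	   *conn = PQconnectdb("dbname=postgres");
	PGresult   *res;

	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "%s", PQerrorMessage(conn));
		PQfinish(conn);
		return EXIT_FAILURE;
	}

	res = PQexec(conn,
				 "SELECT xmin IS NOT NULL "
				 "FROM pg_catalog.pg_replication_slots "
				 "WHERE slot_name = 'pg_conflict_detection'");

	if (PQresultStatus(res) == PGRES_TUPLES_OK && PQntuples(res) == 1)
		printf("conflict detection slot retains an xmin: %s\n",
			   PQgetvalue(res, 0, 0));
	else
		printf("conflict detection slot not found\n");

	PQclear(res);
	PQfinish(conn);
	return EXIT_SUCCESS;
}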
+ */ +static void +create_conflict_detection_slot(void) +{ + PGconn *conn_new_template1; + + prep_status("Creating the replication conflict detection slot"); + + conn_new_template1 = connectToServer(&new_cluster, "template1"); + PQclear(executeQueryOrDie(conn_new_template1, "SELECT pg_catalog.binary_upgrade_create_conflict_detection_slot()")); + PQfinish(conn_new_template1); + + check_ok(); +} diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 69c965bb7d0..0ef47be0dc1 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -300,8 +300,12 @@ typedef struct uint32 major_version; /* PG_VERSION of cluster */ char major_version_str[64]; /* string PG_VERSION of cluster */ uint32 bin_version; /* version returned from pg_ctl */ + char **tablespaces; /* tablespace directories */ + int num_tablespaces; const char *tablespace_suffix; /* directory specification */ int nsubs; /* number of subscriptions */ + bool sub_retain_dead_tuples; /* whether a subscription enables + * retain_dead_tuples. */ } ClusterInfo; @@ -354,8 +358,6 @@ typedef struct const char *progname; /* complete pathname for this program */ char *user; /* username for clusters */ bool user_specified; /* user specified on command-line */ - char **old_tablespaces; /* tablespaces */ - int num_old_tablespaces; LibraryInfo *libraries; /* loadable libraries */ int num_libraries; ClusterInfo *running_cluster; @@ -441,7 +443,7 @@ FileNameMap *gen_db_file_maps(DbInfo *old_db, const char *new_pgdata); void get_db_rel_and_slot_infos(ClusterInfo *cluster); int count_old_cluster_logical_slots(void); -void get_subscription_count(ClusterInfo *cluster); +void get_subscription_info(ClusterInfo *cluster); /* option.c */ @@ -455,7 +457,7 @@ void transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata); void transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata, - char *old_tablespace); + char *old_tablespace, char *new_tablespace); /* tablespace.c */ @@ -503,7 +505,7 @@ void parallel_exec_prog(const char *log_file, const char *opt_log_file, const char *fmt,...) 
pg_attribute_printf(3, 4); void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata, - char *old_tablespace); + char *old_tablespace, char *new_tablespace); bool reap_child(bool wait_for_child); /* task.c */ diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 8d8e816a01f..38c17ceabf2 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -17,7 +17,7 @@ #include "common/logging.h" #include "pg_upgrade.h" -static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); +static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace, char *new_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); /* @@ -136,21 +136,22 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, */ if (user_opts.jobs <= 1) parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, - new_pgdata, NULL); + new_pgdata, NULL, NULL); else { int tblnum; /* transfer default tablespace */ parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, - new_pgdata, old_pgdata); + new_pgdata, old_pgdata, new_pgdata); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum], + new_cluster.tablespaces[tblnum]); /* reap all children */ while (reap_child(true) == true) ; @@ -169,7 +170,8 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, */ void transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, - char *old_pgdata, char *new_pgdata, char *old_tablespace) + char *old_pgdata, char *new_pgdata, + char *old_tablespace, char *new_tablespace) { int old_dbnum, new_dbnum; @@ -204,7 +206,7 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, new_pgdata); if (n_maps) { - transfer_single_new_db(mappings, n_maps, old_tablespace); + transfer_single_new_db(mappings, n_maps, old_tablespace, new_tablespace); } /* We allocate something even for n_maps == 0 */ pg_free(mappings); @@ -234,10 +236,10 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, * moved_db_dir: Destination for the pg_restore-generated database directory. */ static bool -prepare_for_swap(const char *old_tablespace, Oid db_oid, - char *old_catalog_dir, char *new_db_dir, char *moved_db_dir) +prepare_for_swap(const char *old_tablespace, const char *new_tablespace, + Oid db_oid, char *old_catalog_dir, char *new_db_dir, + char *moved_db_dir) { - const char *new_tablespace; const char *old_tblspc_suffix; const char *new_tblspc_suffix; char old_tblspc[MAXPGPATH]; @@ -247,24 +249,14 @@ prepare_for_swap(const char *old_tablespace, Oid db_oid, struct stat st; if (strcmp(old_tablespace, old_cluster.pgdata) == 0) - { - new_tablespace = new_cluster.pgdata; - new_tblspc_suffix = "/base"; old_tblspc_suffix = "/base"; - } else - { - /* - * XXX: The below line is a hack to deal with the fact that we - * presently don't have an easy way to find the corresponding new - * tablespace's path. This will need to be fixed if/when we add - * pg_upgrade support for in-place tablespaces. 
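[Note: this rework passes the new cluster's tablespace path explicitly instead of reusing the old one. A condensed model of the path selection that prepare_for_swap() now performs; the directory names and version suffixes are made up for illustration.]

#include <stdio.h>
#include <string.h>

#define MAXPGPATH 1024

/*
 * Each side independently picks "/base" for its default tablespace and its
 * own catalog-version suffix otherwise, so old and new paths can differ.
 */
static void
build_tblspc_path(char *dst, size_t len, const char *tablespace,
				  const char *pgdata, const char *version_suffix)
{
	const char *suffix = (strcmp(tablespace, pgdata) == 0) ?
		"/base" : version_suffix;

	snprintf(dst, len, "%s%s", tablespace, suffix);
}

int
main(void)
{
	char		old_tblspc[MAXPGPATH];
	char		new_tblspc[MAXPGPATH];

	/* Default tablespace on the old side... */
	build_tblspc_path(old_tblspc, sizeof(old_tblspc),
					  "/srv/old/pgdata", "/srv/old/pgdata", "/PG_18_202406281");
	/* ...and a user tablespace on the new side. */
	build_tblspc_path(new_tblspc, sizeof(new_tblspc),
					  "/srv/tblspc/ts1", "/srv/new/pgdata", "/PG_19_202507091");

	printf("%s\n%s\n", old_tblspc, new_tblspc);
	return 0;
}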
- */ - new_tablespace = old_tablespace; + old_tblspc_suffix = old_cluster.tablespace_suffix; + if (strcmp(new_tablespace, new_cluster.pgdata) == 0) + new_tblspc_suffix = "/base"; + else new_tblspc_suffix = new_cluster.tablespace_suffix; - old_tblspc_suffix = old_cluster.tablespace_suffix; - } /* Old and new cluster paths. */ snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix); @@ -450,7 +442,7 @@ swap_catalog_files(FileNameMap *maps, int size, const char *old_catalog_dir, * during pg_restore. */ static void -do_swap(FileNameMap *maps, int size, char *old_tablespace) +do_swap(FileNameMap *maps, int size, char *old_tablespace, char *new_tablespace) { char old_catalog_dir[MAXPGPATH]; char new_db_dir[MAXPGPATH]; @@ -470,21 +462,23 @@ do_swap(FileNameMap *maps, int size, char *old_tablespace) */ if (old_tablespace) { - if (prepare_for_swap(old_tablespace, maps[0].db_oid, + if (prepare_for_swap(old_tablespace, new_tablespace, maps[0].db_oid, old_catalog_dir, new_db_dir, moved_db_dir)) swap_catalog_files(maps, size, old_catalog_dir, new_db_dir, moved_db_dir); } else { - if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid, + if (prepare_for_swap(old_cluster.pgdata, new_cluster.pgdata, maps[0].db_oid, old_catalog_dir, new_db_dir, moved_db_dir)) swap_catalog_files(maps, size, old_catalog_dir, new_db_dir, moved_db_dir); - for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (int tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) { - if (prepare_for_swap(os_info.old_tablespaces[tblnum], maps[0].db_oid, + if (prepare_for_swap(old_cluster.tablespaces[tblnum], + new_cluster.tablespaces[tblnum], + maps[0].db_oid, old_catalog_dir, new_db_dir, moved_db_dir)) swap_catalog_files(maps, size, old_catalog_dir, new_db_dir, moved_db_dir); @@ -498,7 +492,8 @@ do_swap(FileNameMap *maps, int size, char *old_tablespace) * create links for mappings stored in "maps" array. 
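[Note: for orientation, the per-file work underneath transfer_relfile() boils down to link-or-copy. A toy stand-in under that assumption; real pg_upgrade also supports clone, copy_file_range, and the swap mode handled above.]

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>

/* Hard-link when the transfer mode allows it, else fall back to a byte copy. */
static int
transfer_file(const char *oldfile, const char *newfile, int use_link)
{
	char		buf[65536];
	ssize_t		nread;
	int			src,
				dst;

	if (use_link)
		return link(oldfile, newfile);

	if ((src = open(oldfile, O_RDONLY, 0)) < 0)
		return -1;
	if ((dst = open(newfile, O_WRONLY | O_CREAT | O_EXCL, 0600)) < 0)
	{
		close(src);
		return -1;
	}
	while ((nread = read(src, buf, sizeof(buf))) > 0)
	{
		if (write(dst, buf, (size_t) nread) != nread)
		{
			nread = -1;
			break;
		}
	}
	close(src);
	close(dst);
	return (nread < 0) ? -1 : 0;
}

int
main(int argc, char **argv)
{
	if (argc != 3)
	{
		fprintf(stderr, "usage: %s oldfile newfile\n", argv[0]);
		return EXIT_FAILURE;
	}
	/* Try link mode first, as pg_upgrade --link would. */
	if (transfer_file(argv[1], argv[2], 1) != 0 &&
		transfer_file(argv[1], argv[2], 0) != 0)
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}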
*/ static void -transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) +transfer_single_new_db(FileNameMap *maps, int size, + char *old_tablespace, char *new_tablespace) { int mapnum; bool vm_must_add_frozenbit = false; @@ -520,7 +515,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) */ Assert(!vm_must_add_frozenbit); - do_swap(maps, size, old_tablespace); + do_swap(maps, size, old_tablespace, new_tablespace); return; } diff --git a/src/bin/pg_upgrade/t/004_subscription.pl b/src/bin/pg_upgrade/t/004_subscription.pl index e46f02c6cc6..77387be0f9d 100644 --- a/src/bin/pg_upgrade/t/004_subscription.pl +++ b/src/bin/pg_upgrade/t/004_subscription.pl @@ -22,13 +22,13 @@ $publisher->start; # Initialize the old subscriber node my $old_sub = PostgreSQL::Test::Cluster->new('old_sub'); -$old_sub->init; +$old_sub->init(allows_streaming => 'physical'); $old_sub->start; my $oldbindir = $old_sub->config_data('--bindir'); # Initialize the new subscriber my $new_sub = PostgreSQL::Test::Cluster->new('new_sub'); -$new_sub->init; +$new_sub->init(allows_streaming => 'physical'); my $newbindir = $new_sub->config_data('--bindir'); # In a VPATH build, we'll be started in the source directory, but we want @@ -90,6 +90,54 @@ $old_sub->start; $old_sub->safe_psql('postgres', "DROP SUBSCRIPTION regress_sub1;"); # ------------------------------------------------------ +# Check that pg_upgrade fails when max_replication_slots configured in the new +# cluster is less than the number of logical slots in the old cluster + 1 when +# subscription's retain_dead_tuples option is enabled. +# ------------------------------------------------------ +# It is sufficient to use disabled subscription to test upgrade failure. + +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1"); +$old_sub->safe_psql('postgres', + "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr' PUBLICATION regress_pub1 WITH (enabled = false, retain_dead_tuples = true)" +); + +$old_sub->stop; + +$new_sub->append_conf('postgresql.conf', 'max_replication_slots = 0'); + +# pg_upgrade will fail because the new cluster has insufficient +# max_replication_slots. +command_checks_all( + [ + 'pg_upgrade', + '--no-sync', + '--old-datadir' => $old_sub->data_dir, + '--new-datadir' => $new_sub->data_dir, + '--old-bindir' => $oldbindir, + '--new-bindir' => $newbindir, + '--socketdir' => $new_sub->host, + '--old-port' => $old_sub->port, + '--new-port' => $new_sub->port, + $mode, + '--check', + ], + 1, + [ + qr/"max_replication_slots" \(0\) must be greater than or equal to the number of logical replication slots on the old cluster plus one additional slot required for retaining conflict detection information \(1\)/ + ], + [qr//], + 'run of pg_upgrade where the new cluster has insufficient max_replication_slots' +); + +# Reset max_replication_slots +$new_sub->append_conf('postgresql.conf', 'max_replication_slots = 10'); + +# Cleanup +$publisher->safe_psql('postgres', "DROP PUBLICATION regress_pub1"); +$old_sub->start; +$old_sub->safe_psql('postgres', "DROP SUBSCRIPTION regress_sub1;"); + +# ------------------------------------------------------ # Check that pg_upgrade refuses to run if: # a) there's a subscription with tables in a state other than 'r' (ready) or # 'i' (init) and/or @@ -200,8 +248,9 @@ $old_sub->safe_psql( rmtree($new_sub->data_dir . 
"/pg_upgrade_output.d"); # Verify that the upgrade should be successful with tables in 'ready'/'init' -# state along with retaining the replication origin's remote lsn, subscription's -# running status, and failover option. +# state along with retaining the replication origin's remote lsn, +# subscription's running status, failover option, and retain_dead_tuples +# option. $publisher->safe_psql( 'postgres', qq[ CREATE TABLE tab_upgraded1(id int); @@ -211,7 +260,7 @@ $publisher->safe_psql( $old_sub->safe_psql( 'postgres', qq[ CREATE TABLE tab_upgraded1(id int); - CREATE SUBSCRIPTION regress_sub4 CONNECTION '$connstr' PUBLICATION regress_pub4 WITH (failover = true); + CREATE SUBSCRIPTION regress_sub4 CONNECTION '$connstr' PUBLICATION regress_pub4 WITH (failover = true, retain_dead_tuples = true); ]); # Wait till the table tab_upgraded1 reaches 'ready' state @@ -270,7 +319,8 @@ $new_sub->append_conf('postgresql.conf', # Check that pg_upgrade is successful when all tables are in ready or in # init state (tab_upgraded1 table is in ready state and tab_upgraded2 table is # in init state) along with retaining the replication origin's remote lsn, -# subscription's running status, and failover option. +# subscription's running status, failover option, and retain_dead_tuples +# option. # ------------------------------------------------------ command_ok( [ @@ -293,7 +343,8 @@ ok( !-d $new_sub->data_dir . "/pg_upgrade_output.d", # ------------------------------------------------------ # Check that the data inserted to the publisher when the new subscriber is down # will be replicated once it is started. Also check that the old subscription -# states and relations origins are all preserved. +# states and relations origins are all preserved, and that the conflict +# detection slot is created. # ------------------------------------------------------ $publisher->safe_psql( 'postgres', qq[ @@ -303,15 +354,16 @@ $publisher->safe_psql( $new_sub->start; -# The subscription's running status and failover option should be preserved -# in the upgraded instance. So regress_sub4 should still have subenabled and -# subfailover set to true, while regress_sub5 should have both set to false. +# The subscription's running status, failover option, and retain_dead_tuples +# option should be preserved in the upgraded instance. So regress_sub4 should +# still have subenabled, subfailover, and subretaindeadtuples set to true, +# while regress_sub5 should have both set to false. 
$result = $new_sub->safe_psql('postgres', - "SELECT subname, subenabled, subfailover FROM pg_subscription ORDER BY subname" + "SELECT subname, subenabled, subfailover, subretaindeadtuples FROM pg_subscription ORDER BY subname" ); -is( $result, qq(regress_sub4|t|t -regress_sub5|f|f), - "check that the subscription's running status and failover are preserved" +is( $result, qq(regress_sub4|t|t|t +regress_sub5|f|f|f), + "check that the subscription's running status, failover, and retain_dead_tuples are preserved" ); # Subscription relations should be preserved @@ -330,6 +382,11 @@ $result = $new_sub->safe_psql('postgres', ); is($result, qq($remote_lsn), "remote_lsn should have been preserved"); +# The conflict detection slot should be created +$result = $new_sub->safe_psql('postgres', + "SELECT xmin IS NOT NULL from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'"); +is($result, qq(t), "conflict detection slot exists"); + # Resume the initial sync and wait until all tables of subscription # 'regress_sub5' are synchronized $new_sub->append_conf('postgresql.conf', diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl index 58fe8a8c7dc..348f4021462 100644 --- a/src/bin/pg_upgrade/t/006_transfer_modes.pl +++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl @@ -38,6 +38,13 @@ sub test_mode } $new->init(); + # allow_in_place_tablespaces is available as far back as v10. + if ($old->pg_version >= 10) + { + $new->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); + $old->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); + } + # Create a small variety of simple test objects on the old cluster. We'll # check that these reach the new version after upgrading. $old->start; @@ -49,8 +56,7 @@ sub test_mode $old->safe_psql('testdb1', "VACUUM FULL test2"); $old->safe_psql('testdb1', "CREATE SEQUENCE testseq START 5432"); - # For cross-version tests, we can also check that pg_upgrade handles - # tablespaces. + # If an old installation is provided, we can test non-in-place tablespaces. if (defined($ENV{oldinstall})) { my $tblspc = PostgreSQL::Test::Utils::tempdir_short(); @@ -64,6 +70,19 @@ sub test_mode $old->safe_psql('testdb2', "CREATE TABLE test4 AS SELECT generate_series(400, 502)"); } + + # If the old cluster is >= v10, we can test in-place tablespaces. + if ($old->pg_version >= 10) + { + $old->safe_psql('postgres', + "CREATE TABLESPACE inplc_tblspc LOCATION ''"); + $old->safe_psql('postgres', + "CREATE DATABASE testdb3 TABLESPACE inplc_tblspc"); + $old->safe_psql('postgres', + "CREATE TABLE test5 TABLESPACE inplc_tblspc AS SELECT generate_series(503, 606)"); + $old->safe_psql('testdb3', + "CREATE TABLE test6 AS SELECT generate_series(607, 711)"); + } $old->stop; my $result = command_ok_or_fails_like( @@ -94,8 +113,7 @@ sub test_mode $result = $new->safe_psql('testdb1', "SELECT nextval('testseq')"); is($result, '5432', "sequence data after pg_upgrade $mode"); - # For cross-version tests, we should have some objects in a non-default - # tablespace. + # Tests for non-in-place tablespaces. if (defined($ENV{oldinstall})) { $result = @@ -105,6 +123,15 @@ sub test_mode $new->safe_psql('testdb2', "SELECT COUNT(*) FROM test4"); is($result, '103', "test4 data after pg_upgrade $mode"); } + + # Tests for in-place tablespaces. 
+ if ($old->pg_version >= 10) + { + $result = $new->safe_psql('postgres', "SELECT COUNT(*) FROM test5"); + is($result, '104', "test5 data after pg_upgrade $mode"); + $result = $new->safe_psql('testdb3', "SELECT COUNT(*) FROM test6"); + is($result, '105', "test6 data after pg_upgrade $mode"); + } $new->stop; } diff --git a/src/bin/pg_upgrade/tablespace.c b/src/bin/pg_upgrade/tablespace.c index 3520a75ba31..151d74e1734 100644 --- a/src/bin/pg_upgrade/tablespace.c +++ b/src/bin/pg_upgrade/tablespace.c @@ -23,10 +23,20 @@ init_tablespaces(void) set_tablespace_directory_suffix(&old_cluster); set_tablespace_directory_suffix(&new_cluster); - if (os_info.num_old_tablespaces > 0 && + if (old_cluster.num_tablespaces > 0 && strcmp(old_cluster.tablespace_suffix, new_cluster.tablespace_suffix) == 0) - pg_fatal("Cannot upgrade to/from the same system catalog version when\n" - "using tablespaces."); + { + for (int i = 0; i < old_cluster.num_tablespaces; i++) + { + /* + * In-place tablespaces are okay for same-version upgrades because + * their paths will differ between clusters. + */ + if (strcmp(old_cluster.tablespaces[i], new_cluster.tablespaces[i]) == 0) + pg_fatal("Cannot upgrade to/from the same system catalog version when\n" + "using tablespaces."); + } + } } @@ -53,19 +63,48 @@ get_tablespace_paths(void) res = executeQueryOrDie(conn, "%s", query); - if ((os_info.num_old_tablespaces = PQntuples(res)) != 0) - os_info.old_tablespaces = - (char **) pg_malloc(os_info.num_old_tablespaces * sizeof(char *)); + old_cluster.num_tablespaces = PQntuples(res); + new_cluster.num_tablespaces = PQntuples(res); + + if (PQntuples(res) != 0) + { + old_cluster.tablespaces = + (char **) pg_malloc(old_cluster.num_tablespaces * sizeof(char *)); + new_cluster.tablespaces = + (char **) pg_malloc(new_cluster.num_tablespaces * sizeof(char *)); + } else - os_info.old_tablespaces = NULL; + { + old_cluster.tablespaces = NULL; + new_cluster.tablespaces = NULL; + } i_spclocation = PQfnumber(res, "spclocation"); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) { struct stat statBuf; + char *spcloc = PQgetvalue(res, tblnum, i_spclocation); - os_info.old_tablespaces[tblnum] = pg_strdup(PQgetvalue(res, tblnum, i_spclocation)); + /* + * For now, we do not expect non-in-place tablespaces to move during + * upgrade. If that changes, it will likely become necessary to run + * the above query on the new cluster, too. + * + * pg_tablespace_location() returns absolute paths for non-in-place + * tablespaces and relative paths for in-place ones, so we use + * is_absolute_path() to distinguish between them. + */ + if (is_absolute_path(spcloc)) + { + old_cluster.tablespaces[tblnum] = pg_strdup(spcloc); + new_cluster.tablespaces[tblnum] = old_cluster.tablespaces[tblnum]; + } + else + { + old_cluster.tablespaces[tblnum] = psprintf("%s/%s", old_cluster.pgdata, spcloc); + new_cluster.tablespaces[tblnum] = psprintf("%s/%s", new_cluster.pgdata, spcloc); + } /* * Check that the tablespace path exists and is a directory. @@ -76,21 +115,21 @@ * that contains user tablespaces is moved as part of pg_upgrade * preparation and the symbolic links are not updated.
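 *
 * As an illustration of the in-place handling above (paths invented), the
 * same spclocation yields different per-cluster paths only in the relative
 * case:
 *
 *     spcloc = "pg_tblspc/16385"  ->  old: <old pgdata>/pg_tblspc/16385
 *                                     new: <new pgdata>/pg_tblspc/16385
 *     spcloc = "/mnt/ts1"         ->  the same absolute path in both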
*/ - if (stat(os_info.old_tablespaces[tblnum], &statBuf) != 0) + if (stat(old_cluster.tablespaces[tblnum], &statBuf) != 0) { if (errno == ENOENT) report_status(PG_FATAL, "tablespace directory \"%s\" does not exist", - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum]); else report_status(PG_FATAL, "could not stat tablespace directory \"%s\": %m", - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum]); } if (!S_ISDIR(statBuf.st_mode)) report_status(PG_FATAL, "tablespace path \"%s\" is not a directory", - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum]); } PQclear(res); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 497a936c141..125f3c7bbbe 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -3495,6 +3495,8 @@ doRetry(CState *st, pg_time_usec_t *now) static int discardUntilSync(CState *st) { + bool received_sync = false; + /* send a sync */ if (!PQpipelineSync(st->con)) { @@ -3509,10 +3511,21 @@ discardUntilSync(CState *st) PGresult *res = PQgetResult(st->con); if (PQresultStatus(res) == PGRES_PIPELINE_SYNC) + received_sync = true; + else if (received_sync) { - PQclear(res); - res = PQgetResult(st->con); + /* + * PGRES_PIPELINE_SYNC must be followed by another + * PGRES_PIPELINE_SYNC or NULL; otherwise, assert failure. + */ Assert(res == NULL); + + /* + * Reset ongoing sync count to 0 since all PGRES_PIPELINE_SYNC + * results have been discarded. + */ + st->num_syncs = 0; + PQclear(res); break; } PQclear(res); diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index dd25d2fe7b8..7a06af48842 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -6746,7 +6746,7 @@ describeSubscriptions(const char *pattern, bool verbose) printQueryOpt myopt = pset.popt; static const bool translate_columns[] = {false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false}; + false, false}; if (pset.sversion < 100000) { @@ -6814,6 +6814,10 @@ describeSubscriptions(const char *pattern, bool verbose) appendPQExpBuffer(&buf, ", subfailover AS \"%s\"\n", gettext_noop("Failover")); + if (pset.sversion >= 190000) + appendPQExpBuffer(&buf, + ", subretaindeadtuples AS \"%s\"\n", + gettext_noop("Retain dead tuples")); appendPQExpBuffer(&buf, ", subsynccommit AS \"%s\"\n" diff --git a/src/bin/psql/help.c b/src/bin/psql/help.c index a2e009ab9be..8c62729a0d1 100644 --- a/src/bin/psql/help.c +++ b/src/bin/psql/help.c @@ -748,7 +748,7 @@ void print_copyright(void) { puts("PostgreSQL Database Management System\n" - "(formerly known as Postgres, then as Postgres95)\n\n" + "(also known as Postgres, formerly known as Postgres95)\n\n" "Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group\n\n" "Portions Copyright (c) 1994, The Regents of the University of California\n\n" "Permission to use, copy, modify, and distribute this software and its\n" diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c index 6872653c6c8..1f2ca946fc5 100644 --- a/src/bin/psql/tab-complete.in.c +++ b/src/bin/psql/tab-complete.in.c @@ -1010,7 +1010,7 @@ static const SchemaQuery Query_for_trigger_of_table = { #define Query_for_list_of_database_vars \ "SELECT conf FROM ("\ -" SELECT setdatabase, pg_catalog.split_part(unnest(setconfig),'=',1) conf"\ +" SELECT setdatabase, pg_catalog.split_part(pg_catalog.unnest(setconfig),'=',1) conf"\ " FROM pg_db_role_setting "\ " ) s, pg_database d "\ " WHERE s.setdatabase = d.oid "\ @@ -1086,9 +1086,12 @@ 
Keywords_for_list_of_owner_roles, "PUBLIC" " WHERE usename LIKE '%s'" #define Query_for_list_of_user_vars \ -" SELECT pg_catalog.split_part(pg_catalog.unnest(rolconfig),'=',1) "\ -" FROM pg_catalog.pg_roles "\ -" WHERE rolname LIKE '%s'" +"SELECT conf FROM ("\ +" SELECT rolname, pg_catalog.split_part(pg_catalog.unnest(rolconfig),'=',1) conf"\ +" FROM pg_catalog.pg_roles"\ +" ) s"\ +" WHERE s.conf like '%s' "\ +" AND s.rolname LIKE '%s'" #define Query_for_list_of_access_methods \ " SELECT amname "\ @@ -2319,8 +2322,9 @@ match_previous_words(int pattern_id, /* ALTER SUBSCRIPTION <name> SET ( */ else if (Matches("ALTER", "SUBSCRIPTION", MatchAny, MatchAnyN, "SET", "(")) COMPLETE_WITH("binary", "disable_on_error", "failover", "origin", - "password_required", "run_as_owner", "slot_name", - "streaming", "synchronous_commit", "two_phase"); + "password_required", "retain_dead_tuples", + "run_as_owner", "slot_name", "streaming", + "synchronous_commit", "two_phase"); /* ALTER SUBSCRIPTION <name> SKIP ( */ else if (Matches("ALTER", "SUBSCRIPTION", MatchAny, MatchAnyN, "SKIP", "(")) COMPLETE_WITH("lsn"); @@ -2516,7 +2520,10 @@ match_previous_words(int pattern_id, /* ALTER USER,ROLE <name> RESET */ else if (Matches("ALTER", "USER|ROLE", MatchAny, "RESET")) + { + set_completion_reference(prev2_wd); COMPLETE_WITH_QUERY_PLUS(Query_for_list_of_user_vars, "ALL"); + } /* ALTER USER,ROLE <name> WITH */ else if (Matches("ALTER", "USER|ROLE", MatchAny, "WITH")) @@ -3774,8 +3781,9 @@ match_previous_words(int pattern_id, else if (Matches("CREATE", "SUBSCRIPTION", MatchAnyN, "WITH", "(")) COMPLETE_WITH("binary", "connect", "copy_data", "create_slot", "disable_on_error", "enabled", "failover", "origin", - "password_required", "run_as_owner", "slot_name", - "streaming", "synchronous_commit", "two_phase"); + "password_required", "retain_dead_tuples", + "run_as_owner", "slot_name", "streaming", + "synchronous_commit", "two_phase"); /* CREATE TRIGGER --- is allowed inside CREATE SCHEMA, so use TailMatches */ @@ -4619,10 +4627,14 @@ match_previous_words(int pattern_id, else if (Matches("ALTER", "DEFAULT", "PRIVILEGES", MatchAnyN, "TO", MatchAny)) COMPLETE_WITH("WITH GRANT OPTION"); /* Complete "GRANT/REVOKE ... ON * *" with TO/FROM */ - else if (Matches("GRANT", MatchAnyN, "ON", MatchAny, MatchAny)) - COMPLETE_WITH("TO"); - else if (Matches("REVOKE", MatchAnyN, "ON", MatchAny, MatchAny)) - COMPLETE_WITH("FROM"); + else if (Matches("GRANT|REVOKE", MatchAnyN, "ON", MatchAny, MatchAny) && + !TailMatches("FOREIGN", "SERVER") && !TailMatches("LARGE", "OBJECT")) + { + if (Matches("GRANT", MatchAnyN, "ON", MatchAny, MatchAny)) + COMPLETE_WITH("TO"); + else + COMPLETE_WITH("FROM"); + } /* Complete "GRANT/REVOKE * ON ALL * IN SCHEMA *" with TO/FROM */ else if (TailMatches("GRANT|REVOKE", MatchAny, "ON", "ALL", MatchAny, "IN", "SCHEMA", MatchAny) || @@ -5009,7 +5021,7 @@ match_previous_words(int pattern_id, /* Complete with a variable name */ else if (TailMatches("SET|RESET") && !TailMatches("UPDATE", MatchAny, "SET") && - !TailMatches("ALTER", "DATABASE", MatchAny, "RESET")) + !TailMatches("ALTER", "DATABASE|USER|ROLE", MatchAny, "RESET")) COMPLETE_WITH_QUERY_VERBATIM_PLUS(Query_for_list_of_set_vars, "CONSTRAINTS", "TRANSACTION", diff --git a/src/common/Makefile b/src/common/Makefile index 1e2b91c83c4..2c720caa509 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -163,7 +163,7 @@ libpgcommon_shlib.a: $(OBJS_SHLIB) # The JSON API normally exits on out-of-memory; disable that behavior for shared # library builds. 
This requires libpq's pqexpbuffer.h. jsonapi_shlib.o: override CPPFLAGS += -DJSONAPI_USE_PQEXPBUFFER -jsonapi_shlib.o: override CPPFLAGS += -I$(libpq_srcdir) +jsonapi_shlib.o: override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) # Because this uses its own compilation rule, it doesn't use the # dependency tracking logic from Makefile.global. To make sure that diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 52916bab7a3..70949de56ac 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -293,7 +293,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; - aminsertcleanup_function aminsertcleanup; + aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 2cf8d55d706..cc06fc29ab2 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -316,16 +316,6 @@ typedef struct XLogRecData uint32 len; /* length of rmgr data to include */ } XLogRecData; -/* - * Recovery target action. - */ -typedef enum -{ - RECOVERY_TARGET_ACTION_PAUSE, - RECOVERY_TARGET_ACTION_PROMOTE, - RECOVERY_TARGET_ACTION_SHUTDOWN, -} RecoveryTargetAction; - struct LogicalDecodingContext; struct XLogRecordBuffer; diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 91446303024..8e475e266d1 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -40,6 +40,16 @@ typedef enum RECOVERY_TARGET_TIMELINE_NUMERIC, } RecoveryTargetTimeLineGoal; +/* + * Recovery target action. + */ +typedef enum +{ + RECOVERY_TARGET_ACTION_PAUSE, + RECOVERY_TARGET_ACTION_PROMOTE, + RECOVERY_TARGET_ACTION_SHUTDOWN, +} RecoveryTargetAction; + /* Recovery pause states */ typedef enum RecoveryPauseState { diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index a3f3315fed9..5173d422d46 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202507091 +#define CATALOG_VERSION_NO 202507231 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 1fc19146f46..3ee8fed7e53 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -11801,6 +11801,10 @@ proname => 'binary_upgrade_replorigin_advance', proisstrict => 'f', provolatile => 'v', proparallel => 'u', prorettype => 'void', proargtypes => 'text pg_lsn', prosrc => 'binary_upgrade_replorigin_advance' }, +{ oid => '9159', descr => 'for use by pg_upgrade (conflict detection slot)', + proname => 'binary_upgrade_create_conflict_detection_slot', proisstrict => 'f', + provolatile => 'v', proparallel => 'u', prorettype => 'void', + proargtypes => '', prosrc => 'binary_upgrade_create_conflict_detection_slot' }, # conversion functions { oid => '4302', diff --git a/src/include/catalog/pg_subscription.h b/src/include/catalog/pg_subscription.h index 20fc329992d..231ef84ec9a 100644 --- a/src/include/catalog/pg_subscription.h +++ b/src/include/catalog/pg_subscription.h @@ -78,6 +78,9 @@ CATALOG(pg_subscription,6100,SubscriptionRelationId) BKI_SHARED_RELATION BKI_ROW * slots) in the upstream database are enabled * to be synchronized to the standbys. 
*/ + bool subretaindeadtuples; /* True if dead tuples useful for + * conflict detection are retained */ + #ifdef CATALOG_VARLEN /* variable-length fields start here */ /* Connection string to the publisher */ text subconninfo BKI_FORCE_NOT_NULL; @@ -131,6 +134,8 @@ typedef struct Subscription * (i.e. the main slot and the table sync * slots) in the upstream database are enabled * to be synchronized to the standbys. */ + bool retaindeadtuples; /* True if dead tuples useful for conflict + * detection are retained */ char *conninfo; /* Connection string to the publisher */ char *slotname; /* Name of the replication slot */ char *synccommit; /* Synchronous commit setting for worker */ diff --git a/src/include/catalog/pg_subscription_rel.h b/src/include/catalog/pg_subscription_rel.h index c91797c869c..f458447a0e5 100644 --- a/src/include/catalog/pg_subscription_rel.h +++ b/src/include/catalog/pg_subscription_rel.h @@ -85,7 +85,7 @@ typedef struct SubscriptionRelState extern void AddSubscriptionRelState(Oid subid, Oid relid, char state, XLogRecPtr sublsn, bool retain_lock); extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, - XLogRecPtr sublsn); + XLogRecPtr sublsn, bool already_locked); extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn); extern void RemoveSubscriptionRel(Oid subid, Oid relid); diff --git a/src/include/commands/subscriptioncmds.h b/src/include/commands/subscriptioncmds.h index c2262e46a7f..9b288ad22a6 100644 --- a/src/include/commands/subscriptioncmds.h +++ b/src/include/commands/subscriptioncmds.h @@ -28,4 +28,9 @@ extern void AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId); extern char defGetStreamingMode(DefElem *def); +extern ObjectAddress AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, bool isTopLevel); + +extern void CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled); + #endif /* SUBSCRIPTIONCMDS_H */ diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 2ed2c4bb378..cfd7daa20ed 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -213,7 +213,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd); + TM_FailureData *tmfd, + bool is_merge_delete); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, ItemPointer tupleid, @@ -235,7 +236,8 @@ extern bool ExecBRUpdateTriggers(EState *estate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd); + TM_FailureData *tmfd, + bool is_merge_update); extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, diff --git a/src/include/libpq/libpq-be-fe-helpers.h b/src/include/libpq/libpq-be-fe-helpers.h index 16205b824fa..1c4a342090c 100644 --- a/src/include/libpq/libpq-be-fe-helpers.h +++ b/src/include/libpq/libpq-be-fe-helpers.h @@ -30,17 +30,7 @@ #ifndef LIBPQ_BE_FE_HELPERS_H #define LIBPQ_BE_FE_HELPERS_H -/* - * Despite the name, BUILDING_DLL is set only when building code directly part - * of the backend. Which also is where libpq isn't allowed to be - * used. Obviously this doesn't protect against libpq-fe.h getting included - * otherwise, but perhaps still protects against a few mistakes... 
- */ -#ifdef BUILDING_DLL -#error "libpq may not be used code directly built into the backend" -#endif - -#include "libpq-fe.h" +#include "libpq/libpq-be-fe.h" #include "miscadmin.h" #include "storage/fd.h" #include "storage/latch.h" @@ -289,41 +279,30 @@ libpqsrv_exec_params(PGconn *conn, static inline PGresult * libpqsrv_get_result_last(PGconn *conn, uint32 wait_event_info) { - PGresult *volatile lastResult = NULL; + PGresult *lastResult = NULL; - /* In what follows, do not leak any PGresults on an error. */ - PG_TRY(); + for (;;) { - for (;;) - { - /* Wait for, and collect, the next PGresult. */ - PGresult *result; - - result = libpqsrv_get_result(conn, wait_event_info); - if (result == NULL) - break; /* query is complete, or failure */ + /* Wait for, and collect, the next PGresult. */ + PGresult *result; - /* - * Emulate PQexec()'s behavior of returning the last result when - * there are many. - */ - PQclear(lastResult); - lastResult = result; + result = libpqsrv_get_result(conn, wait_event_info); + if (result == NULL) + break; /* query is complete, or failure */ - if (PQresultStatus(lastResult) == PGRES_COPY_IN || - PQresultStatus(lastResult) == PGRES_COPY_OUT || - PQresultStatus(lastResult) == PGRES_COPY_BOTH || - PQstatus(conn) == CONNECTION_BAD) - break; - } - } - PG_CATCH(); - { + /* + * Emulate PQexec()'s behavior of returning the last result when there + * are many. + */ PQclear(lastResult); - PG_RE_THROW(); - } - PG_END_TRY(); + lastResult = result; + if (PQresultStatus(lastResult) == PGRES_COPY_IN || + PQresultStatus(lastResult) == PGRES_COPY_OUT || + PQresultStatus(lastResult) == PGRES_COPY_BOTH || + PQstatus(conn) == CONNECTION_BAD) + break; + } return lastResult; } @@ -454,4 +433,45 @@ exit: ; return error; } +/* + * libpqsrv_notice_receiver + * + * Custom notice receiver for libpq connections. + * + * This function is intended to be set via PQsetNoticeReceiver() so that + * NOTICE, WARNING, and similar messages from the connection are reported via + * ereport(), instead of being printed to stderr. + * + * Because this will be called from libpq with a "real" (not wrapped) + * PGresult, we need to temporarily ignore libpq-be-fe.h's wrapper macros + * for PGresult and also PQresultErrorMessage, and put back the wrappers + * afterwards. That's not pretty, but there seems no better alternative. + */ +#undef PGresult +#undef PQresultErrorMessage + +static inline void +libpqsrv_notice_receiver(void *arg, const PGresult *res) +{ + const char *message; + int len; + const char *prefix = (const char *) arg; + + /* + * Trim the trailing newline from the message text returned from + * PQresultErrorMessage(), as it always includes one, to produce cleaner + * log output. + */ + message = PQresultErrorMessage(res); + len = strlen(message); + if (len > 0 && message[len - 1] == '\n') + len--; + + ereport(LOG, + errmsg_internal("%s: %.*s", prefix, len, message)); +} + +#define PGresult libpqsrv_PGresult +#define PQresultErrorMessage libpqsrv_PQresultErrorMessage + #endif /* LIBPQ_BE_FE_HELPERS_H */ diff --git a/src/include/libpq/libpq-be-fe.h b/src/include/libpq/libpq-be-fe.h new file mode 100644 index 00000000000..e3f796b0230 --- /dev/null +++ b/src/include/libpq/libpq-be-fe.h @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * libpq-be-fe.h + * Wrapper functions for using libpq in extensions + * + * Code built directly into the backend is not allowed to link to libpq + * directly. Extension code is allowed to use libpq however. 
One of the + * main risks in doing so is leaking the malloc-allocated structures + * returned by libpq, causing a process-lifespan memory leak. + * + * This file provides wrapper objects to help in building memory-safe code. + * A PGresult object wrapped this way acts much as if it were palloc'd: + * it will go away when the specified context is reset or deleted. + * We might later extend the concept to other objects such as PGconns. + * + * See also the libpq-be-fe-helpers.h file, which provides additional + * facilities built on top of this one. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/libpq/libpq-be-fe.h + * + *------------------------------------------------------------------------- + */ +#ifndef LIBPQ_BE_FE_H +#define LIBPQ_BE_FE_H + +/* + * Despite the name, BUILDING_DLL is set only when building code directly part + * of the backend. Which also is where libpq isn't allowed to be + * used. Obviously this doesn't protect against libpq-fe.h getting included + * otherwise, but perhaps still protects against a few mistakes... + */ +#ifdef BUILDING_DLL +#error "libpq may not be used in code directly built into the backend" +#endif + +#include "libpq-fe.h" + +/* + * Memory-context-safe wrapper object for a PGresult. + */ +typedef struct libpqsrv_PGresult +{ + PGresult *res; /* the wrapped PGresult */ + MemoryContext ctx; /* the MemoryContext it's attached to */ + MemoryContextCallback cb; /* the callback that implements freeing */ +} libpqsrv_PGresult; + + +/* + * Wrap the given PGresult in a libpqsrv_PGresult object, so that it will + * go away automatically if the current memory context is reset or deleted. + * + * To avoid potential memory leaks, backend code must always apply this + * immediately to the output of any PGresult-yielding libpq function. + */ +static inline libpqsrv_PGresult * +libpqsrv_PQwrap(PGresult *res) +{ + libpqsrv_PGresult *bres; + MemoryContext ctx = CurrentMemoryContext; + + /* We pass through a NULL result as-is, since there's nothing to free */ + if (res == NULL) + return NULL; + /* Attempt to allocate the wrapper ... this had better not throw error */ + bres = (libpqsrv_PGresult *) + MemoryContextAllocExtended(ctx, + sizeof(libpqsrv_PGresult), + MCXT_ALLOC_NO_OOM); + /* If we failed to allocate a wrapper, free the PGresult before failing */ + if (bres == NULL) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + /* Okay, set up the wrapper */ + bres->res = res; + bres->ctx = ctx; + bres->cb.func = (MemoryContextCallbackFunction) PQclear; + bres->cb.arg = res; + MemoryContextRegisterResetCallback(ctx, &bres->cb); + return bres; +} + +/* + * Free a wrapped PGresult, after detaching it from the memory context. + * Like PQclear(), allow the argument to be NULL. + */ +static inline void +libpqsrv_PQclear(libpqsrv_PGresult *bres) +{ + if (bres) + { + MemoryContextUnregisterResetCallback(bres->ctx, &bres->cb); + PQclear(bres->res); + pfree(bres); + } +} + +/* + * Move a wrapped PGresult to have a different parent context. 
+ */ +static inline libpqsrv_PGresult * +libpqsrv_PGresultSetParent(libpqsrv_PGresult *bres, MemoryContext ctx) +{ + libpqsrv_PGresult *newres; + + /* We pass through a NULL result as-is */ + if (bres == NULL) + return NULL; + /* Make a new wrapper in the target context, raising error on OOM */ + newres = (libpqsrv_PGresult *) + MemoryContextAlloc(ctx, sizeof(libpqsrv_PGresult)); + /* Okay, set up the new wrapper */ + newres->res = bres->res; + newres->ctx = ctx; + newres->cb.func = (MemoryContextCallbackFunction) PQclear; + newres->cb.arg = bres->res; + MemoryContextRegisterResetCallback(ctx, &newres->cb); + /* Disarm and delete the old wrapper */ + MemoryContextUnregisterResetCallback(bres->ctx, &bres->cb); + pfree(bres); + return newres; +} + +/* + * Convenience wrapper for PQgetResult. + * + * We could supply wrappers for other PGresult-returning functions too, + * but at present there's no need. + */ +static inline libpqsrv_PGresult * +libpqsrv_PQgetResult(PGconn *conn) +{ + return libpqsrv_PQwrap(PQgetResult(conn)); +} + +/* + * Accessor functions for libpqsrv_PGresult. While it's not necessary to use + * these, they emulate the behavior of the underlying libpq functions when + * passed a NULL pointer. This is particularly important for PQresultStatus, + * which is often the first check on a result. + */ + +static inline ExecStatusType +libpqsrv_PQresultStatus(const libpqsrv_PGresult *res) +{ + if (!res) + return PGRES_FATAL_ERROR; + return PQresultStatus(res->res); +} + +static inline const char * +libpqsrv_PQresultErrorMessage(const libpqsrv_PGresult *res) +{ + if (!res) + return ""; + return PQresultErrorMessage(res->res); +} + +static inline char * +libpqsrv_PQresultErrorField(const libpqsrv_PGresult *res, int fieldcode) +{ + if (!res) + return NULL; + return PQresultErrorField(res->res, fieldcode); +} + +static inline char * +libpqsrv_PQcmdStatus(const libpqsrv_PGresult *res) +{ + if (!res) + return NULL; + return PQcmdStatus(res->res); +} + +static inline int +libpqsrv_PQntuples(const libpqsrv_PGresult *res) +{ + if (!res) + return 0; + return PQntuples(res->res); +} + +static inline int +libpqsrv_PQnfields(const libpqsrv_PGresult *res) +{ + if (!res) + return 0; + return PQnfields(res->res); +} + +static inline char * +libpqsrv_PQgetvalue(const libpqsrv_PGresult *res, int tup_num, int field_num) +{ + if (!res) + return NULL; + return PQgetvalue(res->res, tup_num, field_num); +} + +static inline int +libpqsrv_PQgetlength(const libpqsrv_PGresult *res, int tup_num, int field_num) +{ + if (!res) + return 0; + return PQgetlength(res->res, tup_num, field_num); +} + +static inline int +libpqsrv_PQgetisnull(const libpqsrv_PGresult *res, int tup_num, int field_num) +{ + if (!res) + return 1; /* pretend it is null */ + return PQgetisnull(res->res, tup_num, field_num); +} + +static inline char * +libpqsrv_PQfname(const libpqsrv_PGresult *res, int field_num) +{ + if (!res) + return NULL; + return PQfname(res->res, field_num); +} + +static inline const char * +libpqsrv_PQcmdTuples(const libpqsrv_PGresult *res) +{ + if (!res) + return ""; + return PQcmdTuples(res->res); +} + +/* + * Redefine these libpq entry point names concerned with PGresults so that + * they will operate on libpqsrv_PGresults instead. This avoids needing to + * convert a lot of pre-existing code, and reduces the notational differences + * between frontend and backend libpq-using code. 
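+ *
+ * For example, hypothetical extension code can then be written just like
+ * frontend libpq code (this sketch assumes libpqsrv_exec() from
+ * libpq-be-fe-helpers.h and the generic PG_WAIT_EXTENSION wait event):
+ *
+ *     PGresult *res = libpqsrv_exec(conn, "SELECT 1", PG_WAIT_EXTENSION);
+ *
+ *     if (PQresultStatus(res) != PGRES_TUPLES_OK)
+ *         elog(ERROR, "remote query failed: %s", PQresultErrorMessage(res));
+ *
+ * Here res is released automatically when the current memory context is
+ * reset or deleted, while an explicit PQclear(res), really
+ * libpqsrv_PQclear(), is still allowed for early release.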
+ */ +#define PGresult libpqsrv_PGresult +#define PQclear libpqsrv_PQclear +#define PQgetResult libpqsrv_PQgetResult +#define PQresultStatus libpqsrv_PQresultStatus +#define PQresultErrorMessage libpqsrv_PQresultErrorMessage +#define PQresultErrorField libpqsrv_PQresultErrorField +#define PQcmdStatus libpqsrv_PQcmdStatus +#define PQntuples libpqsrv_PQntuples +#define PQnfields libpqsrv_PQnfields +#define PQgetvalue libpqsrv_PQgetvalue +#define PQgetlength libpqsrv_PQgetlength +#define PQgetisnull libpqsrv_PQgetisnull +#define PQfname libpqsrv_PQfname +#define PQcmdTuples libpqsrv_PQcmdTuples + +#endif /* LIBPQ_BE_FE_H */ diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 6567759595d..ad2726f026f 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -179,6 +179,9 @@ typedef struct PlannerGlobal /* partition descriptors */ PartitionDirectory partition_directory pg_node_attr(read_write_ignore); + + /* hash table for NOT NULL attnums of relations */ + struct HTAB *rel_notnullatts_hash pg_node_attr(read_write_ignore); } PlannerGlobal; /* macro for fetching the Plan associated with a SubPlan node */ @@ -719,6 +722,9 @@ typedef struct PartitionSchemeData *PartitionScheme; * the attribute is needed as part of final targetlist * attr_widths - cache space for per-attribute width estimates; * zero means not computed yet + * notnullattnums - zero-based set containing attnums of NOT NULL + * columns (not populated for rels corresponding to + * non-partitioned inh==true RTEs) * nulling_relids - relids of outer joins that can null this rel * lateral_vars - lateral cross-references of rel, if any (list of * Vars and PlaceHolderVars) @@ -952,11 +958,7 @@ typedef struct RelOptInfo Relids *attr_needed pg_node_attr(read_write_ignore); /* array indexed [min_attr .. max_attr] */ int32 *attr_widths pg_node_attr(read_write_ignore); - - /* - * Zero-based set containing attnums of NOT NULL columns. Not populated - * for rels corresponding to non-partitioned inh==true RTEs. - */ + /* zero-based set containing attnums of NOT NULL columns */ Bitmapset *notnullattnums; /* relids of outer joins that can null this baserel */ Relids nulling_relids; @@ -2131,10 +2133,12 @@ typedef struct MemoizePath * complete after caching the first record. */ bool binary_mode; /* true when cache key should be compared bit * by bit, false when using hash equality ops */ - Cardinality calls; /* expected number of rescans */ uint32 est_entries; /* The maximum number of entries that the * planner expects will fit in the cache, or 0 * if unknown */ + Cardinality est_calls; /* expected number of rescans */ + Cardinality est_unique_keys; /* estimated unique keys, for EXPLAIN */ + double est_hit_ratio; /* estimated cache hit ratio, for EXPLAIN */ } MemoizePath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4f59e30d62d..29d7732d6a0 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -29,6 +29,21 @@ */ /* ---------------- + * PlannedStmtOrigin + * + * PlannedStmtOrigin identifies where a PlannedStmt comes from.
+ * ---------------- + */ +typedef enum PlannedStmtOrigin +{ + PLAN_STMT_UNKNOWN = 0, /* plan origin is not yet known */ + PLAN_STMT_INTERNAL, /* generated internally by a query */ + PLAN_STMT_STANDARD, /* standard planned statement */ + PLAN_STMT_CACHE_GENERIC, /* Generic cached plan */ + PLAN_STMT_CACHE_CUSTOM, /* Custom cached plan */ +} PlannedStmtOrigin; + +/* ---------------- * PlannedStmt node * * The output of the planner is a Plan tree headed by a PlannedStmt node. @@ -58,6 +73,9 @@ typedef struct PlannedStmt /* plan identifier (can be set by plugins) */ int64 planId; + /* origin of plan */ + PlannedStmtOrigin planOrigin; + /* is it insert|update|delete|merge RETURNING? */ bool hasReturning; @@ -1056,6 +1074,16 @@ typedef struct Memoize /* paramids from param_exprs */ Bitmapset *keyparamids; + + /* Estimated number of rescans, for EXPLAIN */ + Cardinality est_calls; + + /* Estimated number of distinct lookup keys, for EXPLAIN */ + Cardinality est_unique_keys; + + /* Estimated cache hit ratio, for EXPLAIN */ + double est_hit_ratio; + } Memoize; /* ---------------- diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 546828b54bd..37bc13c2cbd 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -154,6 +154,8 @@ extern Node *estimate_expression_value(PlannerInfo *root, Node *node); extern Expr *evaluate_expr(Expr *expr, Oid result_type, int32 result_typmod, Oid result_collation); +extern bool var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info); + extern List *expand_function_arguments(List *args, bool include_out_arguments, Oid result_type, struct HeapTupleData *func_tuple); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 60dcdb77e41..58936e963cb 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -90,7 +90,7 @@ extern MemoizePath *create_memoize_path(PlannerInfo *root, List *hash_operators, bool singlerow, bool binary_mode, - double calls); + Cardinality est_calls); extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, SpecialJoinInfo *sjinfo); extern GatherPath *create_gather_path(PlannerInfo *root, diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index cd74e4b1e8b..d6f6f4ad2d7 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -28,6 +28,10 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); +extern void get_relation_notnullatts(PlannerInfo *root, Relation relation); + +extern Relids find_relation_notnullatts(PlannerInfo *root, Oid relid); + extern List *infer_arbiter_indexes(PlannerInfo *root); extern void estimate_rel_size(Relation rel, int32 *attr_widths, diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index df56202777c..4fbecdb4462 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -22,10 +22,10 @@ * prototypes for prepjointree.c */ extern void transform_MERGE_to_join(Query *parse); +extern Query *preprocess_relation_rtes(PlannerInfo *root); extern void replace_empty_jointree(Query *parse); extern void pull_up_sublinks(PlannerInfo *root); extern void preprocess_function_rtes(PlannerInfo *root); -extern Query *expand_virtual_generated_columns(PlannerInfo *root); extern void pull_up_subqueries(PlannerInfo *root); extern void 
flatten_simple_union_all(PlannerInfo *root); extern void reduce_outer_joins(PlannerInfo *root); diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h index e63a3bd824d..8ff40007c7f 100644 --- a/src/include/port/solaris.h +++ b/src/include/port/solaris.h @@ -24,3 +24,12 @@ #if defined(__i386__) #include <sys/isa_defs.h> #endif + +/* + * On original Solaris, PAM conversation procs lack a "const" in their + * declaration; but recent OpenIndiana versions put it there by default. + * The least messy way to deal with this is to define _PAM_LEGACY_NONCONST, + * which causes OpenIndiana to declare pam_conv per the Solaris tradition, + * and also use that symbol to control omitting the "const" in our own code. + */ +#define _PAM_LEGACY_NONCONST 1 diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index 82b202f3305..b29453e8e4f 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -25,8 +25,11 @@ extern void ApplyLauncherShmemInit(void); extern void ApplyLauncherForgetWorkerStartTime(Oid subid); extern void ApplyLauncherWakeupAtCommit(void); +extern void ApplyLauncherWakeup(void); extern void AtEOXact_ApplyLauncher(bool isCommit); +extern void CreateConflictDetectionSlot(void); + extern bool IsLogicalLauncher(void); extern pid_t GetLeaderApplyWorkerPid(pid_t pid); diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index 76aeeb92242..e8fc342d1a9 100644 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -21,6 +21,13 @@ #define PG_REPLSLOT_DIR "pg_replslot" /* + * The reserved name for a replication slot used to retain dead tuples for + * conflict detection in logical replication. See + * maybe_advance_nonremovable_xid() for detail. + */ +#define CONFLICT_DETECTION_SLOT "pg_conflict_detection" + +/* * Behaviour of replication slots, upon release or crash. * * Slots marked as PERSISTENT are crash-safe and will not be dropped when @@ -220,6 +227,25 @@ typedef struct ReplicationSlot * Latest restart_lsn that has been flushed to disk. For persistent slots * the flushed LSN should be taken into account when calculating the * oldest LSN for WAL segments removal. + * + * Do not assume that restart_lsn will always move forward, i.e., that the + * previously flushed restart_lsn is always behind data.restart_lsn. In + * streaming replication using a physical slot, the restart_lsn is updated + * based on the flushed WAL position reported by the walreceiver. + * + * This replication mode allows duplicate WAL records to be received and + * overwritten. If the walreceiver receives older WAL records and then + * reports them as flushed to the walsender, the restart_lsn may appear to + * move backward. + * + * This typically occurs at the beginning of replication. One reason is + * that streaming replication starts at the beginning of a segment, so, if + * restart_lsn is in the middle of a segment, it will be updated to an + * earlier LSN, see RequestXLogStreaming. Another reason is that the + * walreceiver chooses its startpoint based on the replayed LSN, so, if + * some records have been received but not yet applied, they will be + * received again, which leads to updating the restart_lsn to an earlier + * position.
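+ *
+ * Consequently, code computing the oldest LSN that WAL removal must
+ * preserve cannot rely on data.restart_lsn alone; a sketch (not the
+ * actual checkpointer code) of the required logic:
+ *
+ *     XLogRecPtr keep = slot->data.restart_lsn;
+ *
+ *     if (!XLogRecPtrIsInvalid(slot->last_saved_restart_lsn) &&
+ *         slot->last_saved_restart_lsn < keep)
+ *         keep = slot->last_saved_restart_lsn;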
*/ XLogRecPtr last_saved_restart_lsn; @@ -292,7 +318,9 @@ extern void ReplicationSlotMarkDirty(void); /* misc stuff */ extern void ReplicationSlotInitialize(void); -extern bool ReplicationSlotValidateName(const char *name, int elevel); +extern bool ReplicationSlotValidateName(const char *name, + bool allow_reserved_name, + int elevel); extern void ReplicationSlotReserveWal(void); extern void ReplicationSlotsComputeRequiredXmin(bool already_locked); extern void ReplicationSlotsComputeRequiredLSN(void); diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 30b2775952c..0c7b8440a61 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -86,6 +86,16 @@ typedef struct LogicalRepWorker /* Indicates whether apply can be performed in parallel. */ bool parallel_apply; + /* + * The changes made by this and later transactions must be retained to + * ensure reliable conflict detection during the apply phase. + * + * The logical replication launcher manages an internal replication slot + * named "pg_conflict_detection". It asynchronously collects this ID to + * decide when to advance the xmin value of the slot. + */ + TransactionId oldest_nonremovable_xid; + /* Stats. */ XLogRecPtr last_lsn; TimestampTz last_send_time; @@ -245,7 +255,8 @@ extern List *logicalrep_workers_find(Oid subid, bool only_running, extern bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, Oid relid, - dsm_handle subworker_dsm); + dsm_handle subworker_dsm, + bool retain_dead_tuples); extern void logicalrep_worker_stop(Oid subid, Oid relid); extern void logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo); extern void logicalrep_worker_wakeup(Oid subid, Oid relid); diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index e7a0a234b6c..2933eea0649 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -201,7 +201,7 @@ typedef enum PgAioHandleCallbackID } PgAioHandleCallbackID; #define PGAIO_HCB_MAX PGAIO_HCB_LOCAL_BUFFER_READV -StaticAssertDecl(PGAIO_HCB_MAX <= (1 << PGAIO_RESULT_ID_BITS), +StaticAssertDecl(PGAIO_HCB_MAX < (1 << PGAIO_RESULT_ID_BITS), "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS"); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 08a72569ae5..5e717765764 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -176,51 +176,23 @@ extern void LWLockInitialize(LWLock *lock, int tranche_id); * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also, * we reserve additional tranche IDs for builtin tranches not included in * the set of individual LWLocks. A call to LWLockNewTrancheId will never - * return a value less than LWTRANCHE_FIRST_USER_DEFINED. + * return a value less than LWTRANCHE_FIRST_USER_DEFINED. The actual list of + * built-in tranches is kept in lwlocklist.h. 
*/ typedef enum BuiltinTrancheIds { - LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS, - LWTRANCHE_COMMITTS_BUFFER, - LWTRANCHE_SUBTRANS_BUFFER, - LWTRANCHE_MULTIXACTOFFSET_BUFFER, - LWTRANCHE_MULTIXACTMEMBER_BUFFER, - LWTRANCHE_NOTIFY_BUFFER, - LWTRANCHE_SERIAL_BUFFER, - LWTRANCHE_WAL_INSERT, - LWTRANCHE_BUFFER_CONTENT, - LWTRANCHE_REPLICATION_ORIGIN_STATE, - LWTRANCHE_REPLICATION_SLOT_IO, - LWTRANCHE_LOCK_FASTPATH, - LWTRANCHE_BUFFER_MAPPING, - LWTRANCHE_LOCK_MANAGER, - LWTRANCHE_PREDICATE_LOCK_MANAGER, - LWTRANCHE_PARALLEL_HASH_JOIN, - LWTRANCHE_PARALLEL_BTREE_SCAN, - LWTRANCHE_PARALLEL_QUERY_DSA, - LWTRANCHE_PER_SESSION_DSA, - LWTRANCHE_PER_SESSION_RECORD_TYPE, - LWTRANCHE_PER_SESSION_RECORD_TYPMOD, - LWTRANCHE_SHARED_TUPLESTORE, - LWTRANCHE_SHARED_TIDBITMAP, - LWTRANCHE_PARALLEL_APPEND, - LWTRANCHE_PER_XACT_PREDICATE_LIST, - LWTRANCHE_PGSTATS_DSA, - LWTRANCHE_PGSTATS_HASH, - LWTRANCHE_PGSTATS_DATA, - LWTRANCHE_LAUNCHER_DSA, - LWTRANCHE_LAUNCHER_HASH, - LWTRANCHE_DSM_REGISTRY_DSA, - LWTRANCHE_DSM_REGISTRY_HASH, - LWTRANCHE_COMMITTS_SLRU, - LWTRANCHE_MULTIXACTMEMBER_SLRU, - LWTRANCHE_MULTIXACTOFFSET_SLRU, - LWTRANCHE_NOTIFY_SLRU, - LWTRANCHE_SERIAL_SLRU, - LWTRANCHE_SUBTRANS_SLRU, - LWTRANCHE_XACT_SLRU, - LWTRANCHE_PARALLEL_VACUUM_DSA, - LWTRANCHE_AIO_URING_COMPLETION, + /* + * LWTRANCHE_INVALID is an unused value that only exists to initialize the + * rest of the tranches to appropriate values. + */ + LWTRANCHE_INVALID = NUM_INDIVIDUAL_LWLOCKS - 1, + +#define PG_LWLOCK(id, name) +#define PG_LWLOCKTRANCHE(id, name) LWTRANCHE_##id, +#include "storage/lwlocklist.h" +#undef PG_LWLOCK +#undef PG_LWLOCKTRANCHE + LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index a9681738146..208d2e3a8ed 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -2,9 +2,10 @@ * * lwlocklist.h * - * The predefined LWLock list is kept in its own source file for use by - * automatic tools. The exact representation of a keyword is determined by - * the PG_LWLOCK macro, which is not defined in this file; it can be + * The list of predefined LWLocks and built-in LWLock tranches is kept in + * its own source file for use by automatic tools. The exact + * representation of a keyword is determined by the PG_LWLOCK and + * PG_LWLOCKTRANCHE macros, which are not defined in this file; they can be * defined by the caller for special purposes. * * Also, generate-lwlocknames.pl processes this file to create lwlocknames.h. @@ -84,3 +85,53 @@ PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) + +/* + * There also exist several built-in LWLock tranches. As with the predefined + * LWLocks, be sure to update the WaitEventLWLock section of + * src/backend/utils/activity/wait_event_names.txt when modifying this list. + * + * Note that the IDs here (the first value) don't include the LWTRANCHE_ + * prefix. It's added elsewhere. 
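+ *
+ * A consumer defines both macros before including this file; for example,
+ * a table of tranche names could be built with something close to the
+ * following sketch (the exact code in lwlock.c may differ):
+ *
+ *     static const char *const BuiltinTrancheNames[] = {
+ *     #define PG_LWLOCK(id, name)
+ *     #define PG_LWLOCKTRANCHE(id, name) \
+ *         [LWTRANCHE_##id - NUM_INDIVIDUAL_LWLOCKS] = CppAsString(name),
+ *     #include "storage/lwlocklist.h"
+ *     #undef PG_LWLOCK
+ *     #undef PG_LWLOCKTRANCHE
+ *     };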
+ */ +PG_LWLOCKTRANCHE(XACT_BUFFER, XactBuffer) +PG_LWLOCKTRANCHE(COMMITTS_BUFFER, CommitTsBuffer) +PG_LWLOCKTRANCHE(SUBTRANS_BUFFER, SubtransBuffer) +PG_LWLOCKTRANCHE(MULTIXACTOFFSET_BUFFER, MultiXactOffsetBuffer) +PG_LWLOCKTRANCHE(MULTIXACTMEMBER_BUFFER, MultiXactMemberBuffer) +PG_LWLOCKTRANCHE(NOTIFY_BUFFER, NotifyBuffer) +PG_LWLOCKTRANCHE(SERIAL_BUFFER, SerialBuffer) +PG_LWLOCKTRANCHE(WAL_INSERT, WALInsert) +PG_LWLOCKTRANCHE(BUFFER_CONTENT, BufferContent) +PG_LWLOCKTRANCHE(REPLICATION_ORIGIN_STATE, ReplicationOriginState) +PG_LWLOCKTRANCHE(REPLICATION_SLOT_IO, ReplicationSlotIO) +PG_LWLOCKTRANCHE(LOCK_FASTPATH, LockFastPath) +PG_LWLOCKTRANCHE(BUFFER_MAPPING, BufferMapping) +PG_LWLOCKTRANCHE(LOCK_MANAGER, LockManager) +PG_LWLOCKTRANCHE(PREDICATE_LOCK_MANAGER, PredicateLockManager) +PG_LWLOCKTRANCHE(PARALLEL_HASH_JOIN, ParallelHashJoin) +PG_LWLOCKTRANCHE(PARALLEL_BTREE_SCAN, ParallelBtreeScan) +PG_LWLOCKTRANCHE(PARALLEL_QUERY_DSA, ParallelQueryDSA) +PG_LWLOCKTRANCHE(PER_SESSION_DSA, PerSessionDSA) +PG_LWLOCKTRANCHE(PER_SESSION_RECORD_TYPE, PerSessionRecordType) +PG_LWLOCKTRANCHE(PER_SESSION_RECORD_TYPMOD, PerSessionRecordTypmod) +PG_LWLOCKTRANCHE(SHARED_TUPLESTORE, SharedTupleStore) +PG_LWLOCKTRANCHE(SHARED_TIDBITMAP, SharedTidBitmap) +PG_LWLOCKTRANCHE(PARALLEL_APPEND, ParallelAppend) +PG_LWLOCKTRANCHE(PER_XACT_PREDICATE_LIST, PerXactPredicateList) +PG_LWLOCKTRANCHE(PGSTATS_DSA, PgStatsDSA) +PG_LWLOCKTRANCHE(PGSTATS_HASH, PgStatsHash) +PG_LWLOCKTRANCHE(PGSTATS_DATA, PgStatsData) +PG_LWLOCKTRANCHE(LAUNCHER_DSA, LogicalRepLauncherDSA) +PG_LWLOCKTRANCHE(LAUNCHER_HASH, LogicalRepLauncherHash) +PG_LWLOCKTRANCHE(DSM_REGISTRY_DSA, DSMRegistryDSA) +PG_LWLOCKTRANCHE(DSM_REGISTRY_HASH, DSMRegistryHash) +PG_LWLOCKTRANCHE(COMMITTS_SLRU, CommitTsSLRU) +PG_LWLOCKTRANCHE(MULTIXACTOFFSET_SLRU, MultiXactOffsetSLRU) +PG_LWLOCKTRANCHE(MULTIXACTMEMBER_SLRU, MultiXactMemberSLRU) +PG_LWLOCKTRANCHE(NOTIFY_SLRU, NotifySLRU) +PG_LWLOCKTRANCHE(SERIAL_SLRU, SerialSLRU) +PG_LWLOCKTRANCHE(SUBTRANS_SLRU, SubtransSLRU) +PG_LWLOCKTRANCHE(XACT_SLRU, XactSLRU) +PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA) +PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion) diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f9b3fcfbf1..c6f5ebceefd 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -130,9 +130,17 @@ extern PGDLLIMPORT int FastPathLockGroupsPerBackend; * the checkpoint are actually destroyed on disk. Replay can cope with a file * or block that doesn't exist, but not with a block that has the wrong * contents. + * + * Setting DELAY_CHKPT_IN_COMMIT is similar to setting DELAY_CHKPT_START, but + * it explicitly indicates that the reason for delaying the checkpoint is due + * to a transaction being within a critical commit section. We need this new + * flag to ensure all the transactions that have acquired commit timestamp are + * finished before we allow the logical replication client to advance its xid + * which is used to hold back dead rows for conflict detection. 
*/ #define DELAY_CHKPT_START (1<<0) #define DELAY_CHKPT_COMPLETE (1<<1) +#define DELAY_CHKPT_IN_COMMIT (DELAY_CHKPT_START | 1<<2) typedef enum { diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index e4877d88e8f..2f4ae06c279 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -55,7 +55,8 @@ extern RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsInProgress(TransactionId xid); extern TransactionId GetOldestNonRemovableTransactionId(Relation rel); extern TransactionId GetOldestTransactionIdConsideredRunning(void); -extern TransactionId GetOldestActiveTransactionId(void); +extern TransactionId GetOldestActiveTransactionId(bool inCommitOnly, + bool allDbs); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); extern void GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin); diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index 277ec33c00b..00808e23f49 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -87,6 +87,14 @@ typedef struct catcache typedef struct catctup { + /* + * Each tuple in a cache is a member of a dlist that stores the elements + * of its hash bucket. We keep each dlist in LRU order to speed repeated + * lookups. Keep the dlist_node field first so that Valgrind understands + * the struct is reachable. + */ + dlist_node cache_elem; /* list member of per-bucket list */ + int ct_magic; /* for identifying CatCTup entries */ #define CT_MAGIC 0x57261502 @@ -99,13 +107,6 @@ typedef struct catctup Datum keys[CATCACHE_MAXKEYS]; /* - * Each tuple in a cache is a member of a dlist that stores the elements - * of its hash bucket. We keep each dlist in LRU order to speed repeated - * lookups. - */ - dlist_node cache_elem; /* list member of per-bucket list */ - - /* * A tuple marked "dead" must not be returned by subsequent searches. * However, it won't be physically deleted from the cache until its * refcount goes to zero. (If it's a member of a CatCList, the list's @@ -158,13 +159,17 @@ typedef struct catctup */ typedef struct catclist { + /* + * Keep the dlist_node field first so that Valgrind understands the struct + * is reachable. + */ + dlist_node cache_elem; /* list member of per-catcache list */ + int cl_magic; /* for identifying CatCList entries */ #define CL_MAGIC 0x52765103 uint32 hash_value; /* hash value for lookup keys */ - dlist_node cache_elem; /* list member of per-catcache list */ - /* * Lookup keys for the entry, with the first nkeys elements being valid. * All by-reference are separately allocated. 
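The DELAY_CHKPT_IN_COMMIT definition above is deliberately a composite rather than a single bit: it includes DELAY_CHKPT_START, so existing checks for delayed checkpoint starts keep covering committing backends, while the extra bit lets GetOldestActiveTransactionId(true, false) single out transactions inside the critical commit section. A minimal sketch (not actual backend code) of the resulting semantics:

    int		flags = DELAY_CHKPT_IN_COMMIT;

    /* being "in commit" implies delaying checkpoint start ... */
    Assert((flags & DELAY_CHKPT_START) != 0);
    /* ... yet remains distinguishable from plain DELAY_CHKPT_START */
    Assert(flags != DELAY_CHKPT_START);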
diff --git a/src/include/utils/memdebug.h b/src/include/utils/memdebug.h index 7309271834b..80692dcef93 100644 --- a/src/include/utils/memdebug.h +++ b/src/include/utils/memdebug.h @@ -29,6 +29,7 @@ #define VALGRIND_MEMPOOL_ALLOC(context, addr, size) do {} while (0) #define VALGRIND_MEMPOOL_FREE(context, addr) do {} while (0) #define VALGRIND_MEMPOOL_CHANGE(context, optr, nptr, size) do {} while (0) +#define VALGRIND_MEMPOOL_TRIM(context, addr, size) do {} while (0) #endif diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h index e1b42267b22..039b9cba61a 100644 --- a/src/include/utils/palloc.h +++ b/src/include/utils/palloc.h @@ -133,6 +133,8 @@ MemoryContextSwitchTo(MemoryContext context) /* Registration of memory context reset/delete callbacks */ extern void MemoryContextRegisterResetCallback(MemoryContext context, MemoryContextCallback *cb); +extern void MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb); /* * These are like standard strdup() except the copied string is diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index d5557e6e998..6cf00008f63 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -295,18 +295,11 @@ typedef struct PgStat_KindInfo * * Returns true if some of the stats could not be flushed, due to lock * contention for example. Optional. - */ - bool (*flush_static_cb) (bool nowait); - - /* - * For fixed-numbered or variable-numbered statistics: Check for pending - * stats in need of flush with flush_static_cb, when these do not use - * PgStat_EntryRef->pending. * - * Returns true if there are any stats pending for flush, triggering - * flush_static_cb. Optional. + * "pgstat_report_fixed" needs to be set to trigger the flush of pending + * stats. */ - bool (*have_static_pending_cb) (void); + bool (*flush_static_cb) (bool nowait); /* * For fixed-numbered statistics: Reset All. @@ -627,7 +620,6 @@ extern void pgstat_archiver_snapshot_cb(void); extern bool pgstat_flush_backend(bool nowait, bits32 flags); extern bool pgstat_backend_flush_cb(bool nowait); -extern bool pgstat_backend_have_pending_cb(void); extern void pgstat_backend_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); @@ -676,7 +668,6 @@ extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); extern void pgstat_flush_io(bool nowait); -extern bool pgstat_io_have_pending_cb(void); extern bool pgstat_io_flush_cb(bool nowait); extern void pgstat_io_init_shmem_cb(void *stats); extern void pgstat_io_reset_all_cb(TimestampTz ts); @@ -738,7 +729,6 @@ extern PgStatShared_Common *pgstat_init_entry(PgStat_Kind kind, * Functions in pgstat_slru.c */ -extern bool pgstat_slru_have_pending_cb(void); extern bool pgstat_slru_flush_cb(bool nowait); extern void pgstat_slru_init_shmem_cb(void *stats); extern void pgstat_slru_reset_all_cb(TimestampTz ts); @@ -750,7 +740,6 @@ extern void pgstat_slru_snapshot_cb(void); */ extern void pgstat_wal_init_backend_cb(void); -extern bool pgstat_wal_have_pending_cb(void); extern bool pgstat_wal_flush_cb(bool nowait); extern void pgstat_wal_init_shmem_cb(void *stats); extern void pgstat_wal_reset_all_cb(TimestampTz ts); @@ -778,8 +767,23 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, uint64 obji * Variables in pgstat.c */ -extern PGDLLIMPORT PgStat_LocalState pgStatLocal; +/* + * Track if *any* pending fixed-numbered statistics should be flushed to + * shared memory. 
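+ *
+ * A fixed-numbered statistics kind would typically do something like the
+ * following when accumulating pending data (names invented):
+ *
+ *     mykind_pending.events++;
+ *     pgstat_report_fixed = true;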
+ * + * This flag can be switched to true by fixed-numbered statistics to let + * pgstat_report_stat() know if it needs to go through one round of + * reports, calling flush_static_cb for each fixed-numbered statistics + * kind. When this flag is not set, pgstat_report_stat() is able to do + * a fast exit, knowing that there are no pending fixed-numbered statistics. + * + * Statistics callbacks should never reset this flag; pgstat_report_stat() + * is in charge of doing that. + */ +extern PGDLLIMPORT bool pgstat_report_fixed; +/* Backend-local stats state */ +extern PGDLLIMPORT PgStat_LocalState pgStatLocal; /* * Implementation of inline functions declared above. diff --git a/src/include/utils/pgstat_kind.h b/src/include/utils/pgstat_kind.h index f44169fd5a3..eb5f0b3ae6d 100644 --- a/src/include/utils/pgstat_kind.h +++ b/src/include/utils/pgstat_kind.h @@ -18,7 +18,7 @@ /* Range of IDs allowed, for built-in and custom kinds */ #define PGSTAT_KIND_MIN 1 /* Minimum ID allowed */ -#define PGSTAT_KIND_MAX 256 /* Maximum ID allowed */ +#define PGSTAT_KIND_MAX 32 /* Maximum ID allowed */ /* use 0 for INVALID, to catch zero-initialized data */ #define PGSTAT_KIND_INVALID 0 @@ -46,7 +46,7 @@ /* Custom stats kinds */ /* Range of IDs allowed for custom stats kinds */ -#define PGSTAT_KIND_CUSTOM_MIN 128 +#define PGSTAT_KIND_CUSTOM_MIN 24 #define PGSTAT_KIND_CUSTOM_MAX PGSTAT_KIND_MAX #define PGSTAT_KIND_CUSTOM_SIZE (PGSTAT_KIND_CUSTOM_MAX - PGSTAT_KIND_CUSTOM_MIN + 1) @@ -55,7 +55,7 @@ * development and have not reserved their own unique kind ID yet. See: * https://wiki.postgresql.org/wiki/CustomCumulativeStats */ -#define PGSTAT_KIND_EXPERIMENTAL 128 +#define PGSTAT_KIND_EXPERIMENTAL 24 static inline bool pgstat_is_kind_builtin(PgStat_Kind kind) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index 2bbb70333dc..78de9f298ba 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -58,7 +58,12 @@ ecpg_get_connection_nr(const char *connection_name) for (con = all_connections; con != NULL; con = con->next) { - if (strcmp(connection_name, con->name) == 0) + /* + * Check for the case of a NULL connection name, stored as such in + * the connection information by ECPGconnect() when the database + * name is not specified by its caller. + */ + if (con->name != NULL && strcmp(connection_name, con->name) == 0) break; } ret = con; @@ -259,7 +264,8 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p struct connection *this; int i, connect_params = 0; - char *dbname = name ? ecpg_strdup(name, lineno) : NULL, + bool alloc_failed = (sqlca == NULL); + char *dbname = name ? 
ecpg_strdup(name, lineno, &alloc_failed) : NULL, *host = NULL, *tmp, *port = NULL, @@ -268,11 +274,12 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p const char **conn_keywords; const char **conn_values; - if (sqlca == NULL) + if (alloc_failed) { ecpg_raise(lineno, ECPG_OUT_OF_MEMORY, ECPG_SQLSTATE_ECPG_OUT_OF_MEMORY, NULL); - ecpg_free(dbname); + if (dbname) + ecpg_free(dbname); return false; } @@ -297,7 +304,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (envname) { ecpg_free(dbname); - dbname = ecpg_strdup(envname, lineno); + dbname = ecpg_strdup(envname, lineno, &alloc_failed); } } @@ -349,7 +356,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p tmp = strrchr(dbname + offset, '?'); if (tmp != NULL) /* options given */ { - options = ecpg_strdup(tmp + 1, lineno); + options = ecpg_strdup(tmp + 1, lineno, &alloc_failed); *tmp = '\0'; } @@ -358,7 +365,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p { if (tmp[1] != '\0') /* non-empty database name */ { - realname = ecpg_strdup(tmp + 1, lineno); + realname = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; } *tmp = '\0'; @@ -368,7 +375,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (tmp != NULL) /* port number given */ { *tmp = '\0'; - port = ecpg_strdup(tmp + 1, lineno); + port = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; } @@ -402,7 +409,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p { if (*(dbname + offset) != '\0') { - host = ecpg_strdup(dbname + offset, lineno); + host = ecpg_strdup(dbname + offset, lineno, &alloc_failed); connect_params++; } } @@ -414,7 +421,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p tmp = strrchr(dbname, ':'); if (tmp != NULL) /* port number given */ { - port = ecpg_strdup(tmp + 1, lineno); + port = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; *tmp = '\0'; } @@ -422,14 +429,14 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p tmp = strrchr(dbname, '@'); if (tmp != NULL) /* host name given */ { - host = ecpg_strdup(tmp + 1, lineno); + host = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; *tmp = '\0'; } if (strlen(dbname) > 0) { - realname = ecpg_strdup(dbname, lineno); + realname = ecpg_strdup(dbname, lineno, &alloc_failed); connect_params++; } else @@ -460,7 +467,18 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p */ conn_keywords = (const char **) ecpg_alloc((connect_params + 1) * sizeof(char *), lineno); conn_values = (const char **) ecpg_alloc(connect_params * sizeof(char *), lineno); - if (conn_keywords == NULL || conn_values == NULL) + + /* Decide on a connection name */ + if (connection_name != NULL || realname != NULL) + { + this->name = ecpg_strdup(connection_name ? 
connection_name : realname, + lineno, &alloc_failed); + } + else + this->name = NULL; + + /* Deal with any failed allocations above */ + if (conn_keywords == NULL || conn_values == NULL || alloc_failed) { if (host) ecpg_free(host); @@ -476,6 +494,8 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p ecpg_free(conn_keywords); if (conn_values) ecpg_free(conn_values); + if (this->name) + ecpg_free(this->name); free(this); return false; } @@ -510,17 +530,14 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p ecpg_free(conn_keywords); if (conn_values) ecpg_free(conn_values); + if (this->name) + ecpg_free(this->name); free(this); return false; } } #endif - if (connection_name != NULL) - this->name = ecpg_strdup(connection_name, lineno); - else - this->name = ecpg_strdup(realname, lineno); - this->cache_head = NULL; this->prep_stmts = NULL; diff --git a/src/interfaces/ecpg/ecpglib/descriptor.c b/src/interfaces/ecpg/ecpglib/descriptor.c index 651d5c8b2ed..466428edfeb 100644 --- a/src/interfaces/ecpg/ecpglib/descriptor.c +++ b/src/interfaces/ecpg/ecpglib/descriptor.c @@ -240,8 +240,9 @@ ECPGget_desc(int lineno, const char *desc_name, int index,...) act_tuple; struct variable data_var; struct sqlca_t *sqlca = ECPGget_sqlca(); + bool alloc_failed = (sqlca == NULL); - if (sqlca == NULL) + if (alloc_failed) { ecpg_raise(lineno, ECPG_OUT_OF_MEMORY, ECPG_SQLSTATE_ECPG_OUT_OF_MEMORY, NULL); @@ -493,7 +494,14 @@ ECPGget_desc(int lineno, const char *desc_name, int index,...) #ifdef WIN32 stmt.oldthreadlocale = _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); #endif - stmt.oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), lineno); + stmt.oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), + lineno, &alloc_failed); + if (alloc_failed) + { + va_end(args); + return false; + } + setlocale(LC_NUMERIC, "C"); #endif diff --git a/src/interfaces/ecpg/ecpglib/ecpglib_extern.h b/src/interfaces/ecpg/ecpglib/ecpglib_extern.h index 75cc68275bd..949ff66cefc 100644 --- a/src/interfaces/ecpg/ecpglib/ecpglib_extern.h +++ b/src/interfaces/ecpg/ecpglib/ecpglib_extern.h @@ -175,7 +175,7 @@ void ecpg_free(void *ptr); bool ecpg_init(const struct connection *con, const char *connection_name, const int lineno); -char *ecpg_strdup(const char *string, int lineno); +char *ecpg_strdup(const char *string, int lineno, bool *alloc_failed); const char *ecpg_type_name(enum ECPGttype typ); int ecpg_dynamic_type(Oid type); int sqlda_dynamic_type(Oid type, enum COMPAT_MODE compat); diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c index f52da06de9a..84a4a9fc578 100644 --- a/src/interfaces/ecpg/ecpglib/execute.c +++ b/src/interfaces/ecpg/ecpglib/execute.c @@ -860,9 +860,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari numeric *nval; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -923,9 +923,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari int slen; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -970,9 +970,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct 
vari int slen; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -1017,9 +1017,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari int slen; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -2001,7 +2001,8 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, return false; } #endif - stmt->oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), lineno); + stmt->oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), lineno, + NULL); if (stmt->oldlocale == NULL) { ecpg_do_epilogue(stmt); @@ -2030,7 +2031,14 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, statement_type = ECPGst_execute; } else - stmt->command = ecpg_strdup(query, lineno); + { + stmt->command = ecpg_strdup(query, lineno, NULL); + if (!stmt->command) + { + ecpg_do_epilogue(stmt); + return false; + } + } stmt->name = NULL; @@ -2042,7 +2050,12 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, if (command) { stmt->name = stmt->command; - stmt->command = ecpg_strdup(command, lineno); + stmt->command = ecpg_strdup(command, lineno, NULL); + if (!stmt->command) + { + ecpg_do_epilogue(stmt); + return false; + } } else { @@ -2175,7 +2188,12 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, if (!is_prepared_name_set && stmt->statement_type == ECPGst_prepare) { - stmt->name = ecpg_strdup(var->value, lineno); + stmt->name = ecpg_strdup(var->value, lineno, NULL); + if (!stmt->name) + { + ecpg_do_epilogue(stmt); + return false; + } is_prepared_name_set = true; } } diff --git a/src/interfaces/ecpg/ecpglib/memory.c b/src/interfaces/ecpg/ecpglib/memory.c index 6979be2c988..2112e55b6e4 100644 --- a/src/interfaces/ecpg/ecpglib/memory.c +++ b/src/interfaces/ecpg/ecpglib/memory.c @@ -43,8 +43,15 @@ ecpg_realloc(void *ptr, long size, int lineno) return new; } +/* + * Wrapper for strdup(), with NULL in input treated as a correct case. + * + * "alloc_failed" can be optionally specified by the caller to check for + * allocation failures. The caller is responsible for its initialization, + * as ecpg_strdup() may be called repeatedly across multiple allocations. 
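Note: the ecpg_strdup() contract described above (a caller-owned failure flag that the helper sets but never resets) enables an accumulate-then-check error style. A minimal self-contained sketch of that pattern, using a hypothetical strdup_latched() stand-in rather than the ecpg internals:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical stand-in for ecpg_strdup(): NULL input is passed through as
 * a valid case, and *alloc_failed is latched to true on out-of-memory but
 * never reset here.
 */
static char *
strdup_latched(const char *string, bool *alloc_failed)
{
    char       *copy;

    if (string == NULL)
        return NULL;
    copy = strdup(string);
    if (copy == NULL && alloc_failed)
        *alloc_failed = true;
    return copy;
}

/*
 * The caller initializes the flag once, performs all allocations, and then
 * checks the flag a single time, which is the shape ECPGconnect() now has.
 */
static bool
build_conn_strings(const char *host_in, const char *port_in,
                   char **host_out, char **port_out)
{
    bool        alloc_failed = false;

    *host_out = strdup_latched(host_in, &alloc_failed);
    *port_out = strdup_latched(port_in, &alloc_failed);

    if (alloc_failed)
    {
        free(*host_out);        /* free(NULL) is a no-op */
        free(*port_out);
        *host_out = *port_out = NULL;
        return false;
    }
    return true;
}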
+ */ char * -ecpg_strdup(const char *string, int lineno) +ecpg_strdup(const char *string, int lineno, bool *alloc_failed) { char *new; @@ -54,6 +61,8 @@ ecpg_strdup(const char *string, int lineno) new = strdup(string); if (!new) { + if (alloc_failed) + *alloc_failed = true; ecpg_raise(lineno, ECPG_OUT_OF_MEMORY, ECPG_SQLSTATE_ECPG_OUT_OF_MEMORY, NULL); return NULL; } diff --git a/src/interfaces/ecpg/ecpglib/prepare.c b/src/interfaces/ecpg/ecpglib/prepare.c index ea1146f520f..06f0135813b 100644 --- a/src/interfaces/ecpg/ecpglib/prepare.c +++ b/src/interfaces/ecpg/ecpglib/prepare.c @@ -85,9 +85,22 @@ ecpg_register_prepared_stmt(struct statement *stmt) /* create statement */ prep_stmt->lineno = lineno; prep_stmt->connection = con; - prep_stmt->command = ecpg_strdup(stmt->command, lineno); + prep_stmt->command = ecpg_strdup(stmt->command, lineno, NULL); + if (!prep_stmt->command) + { + ecpg_free(prep_stmt); + ecpg_free(this); + return false; + } prep_stmt->inlist = prep_stmt->outlist = NULL; - this->name = ecpg_strdup(stmt->name, lineno); + this->name = ecpg_strdup(stmt->name, lineno, NULL); + if (!this->name) + { + ecpg_free(prep_stmt->command); + ecpg_free(prep_stmt); + ecpg_free(this); + return false; + } this->stmt = prep_stmt; this->prepared = true; @@ -177,14 +190,27 @@ prepare_common(int lineno, struct connection *con, const char *name, const char /* create statement */ stmt->lineno = lineno; stmt->connection = con; - stmt->command = ecpg_strdup(variable, lineno); + stmt->command = ecpg_strdup(variable, lineno, NULL); + if (!stmt->command) + { + ecpg_free(stmt); + ecpg_free(this); + return false; + } stmt->inlist = stmt->outlist = NULL; /* if we have C variables in our statement replace them with '?' */ replace_variables(&(stmt->command), lineno); /* add prepared statement to our list */ - this->name = ecpg_strdup(name, lineno); + this->name = ecpg_strdup(name, lineno, NULL); + if (!this->name) + { + ecpg_free(stmt->command); + ecpg_free(stmt); + ecpg_free(this); + return false; + } this->stmt = stmt; /* and finally really prepare the statement */ @@ -540,7 +566,9 @@ AddStmtToCache(int lineno, /* line # of statement */ /* add the query to the entry */ entry = &stmtCacheEntries[entNo]; entry->lineno = lineno; - entry->ecpgQuery = ecpg_strdup(ecpgQuery, lineno); + entry->ecpgQuery = ecpg_strdup(ecpgQuery, lineno, NULL); + if (!entry->ecpgQuery) + return -1; entry->connection = connection; entry->execs = 0; memcpy(entry->stmtID, stmtID, sizeof(entry->stmtID)); @@ -567,14 +595,19 @@ ecpg_auto_prepare(int lineno, const char *connection_name, const int compat, cha ecpg_log("ecpg_auto_prepare on line %d: statement found in cache; entry %d\n", lineno, entNo); stmtID = stmtCacheEntries[entNo].stmtID; + *name = ecpg_strdup(stmtID, lineno, NULL); + if (*name == NULL) + return false; con = ecpg_get_connection(connection_name); prep = ecpg_find_prepared_statement(stmtID, con, NULL); /* This prepared name doesn't exist on this connection. 
*/ if (!prep && !prepare_common(lineno, con, stmtID, query)) + { + ecpg_free(*name); return false; + } - *name = ecpg_strdup(stmtID, lineno); } else { @@ -584,15 +617,22 @@ ecpg_auto_prepare(int lineno, const char *connection_name, const int compat, cha /* generate a statement ID */ sprintf(stmtID, "ecpg%d", nextStmtID++); + *name = ecpg_strdup(stmtID, lineno, NULL); + if (*name == NULL) + return false; if (!ECPGprepare(lineno, connection_name, 0, stmtID, query)) + { + ecpg_free(*name); return false; + } entNo = AddStmtToCache(lineno, stmtID, connection_name, compat, query); if (entNo < 0) + { + ecpg_free(*name); return false; - - *name = ecpg_strdup(stmtID, lineno); + } } /* increase usage counter */ diff --git a/src/interfaces/libpq-oauth/Makefile b/src/interfaces/libpq-oauth/Makefile index 270fc0cf2d9..682f17413b3 100644 --- a/src/interfaces/libpq-oauth/Makefile +++ b/src/interfaces/libpq-oauth/Makefile @@ -24,7 +24,7 @@ NAME = pq-oauth-$(MAJORVERSION) override shlib := lib$(NAME)$(DLSUFFIX) override stlib := libpq-oauth.a -override CPPFLAGS := -I$(libpq_srcdir) -I$(top_builddir)/src/port $(LIBCURL_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS := -I$(libpq_srcdir) -I$(top_builddir)/src/port $(CPPFLAGS) $(LIBCURL_CPPFLAGS) OBJS = \ $(WIN32RES) diff --git a/src/interfaces/libpq/Makefile b/src/interfaces/libpq/Makefile index 47d67811509..da6650066d4 100644 --- a/src/interfaces/libpq/Makefile +++ b/src/interfaces/libpq/Makefile @@ -24,7 +24,7 @@ NAME= pq SO_MAJOR_VERSION= 5 SO_MINOR_VERSION= $(MAJORVERSION) -override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) -I$(top_builddir)/src/port -I$(top_srcdir)/src/port +override CPPFLAGS := -I$(srcdir) -I$(top_builddir)/src/port -I$(top_srcdir)/src/port $(CPPFLAGS) ifneq ($(PORTNAME), win32) override CFLAGS += $(PTHREAD_CFLAGS) endif diff --git a/src/interfaces/libpq/fe-cancel.c b/src/interfaces/libpq/fe-cancel.c index 65517c5703b..c872a0267f0 100644 --- a/src/interfaces/libpq/fe-cancel.c +++ b/src/interfaces/libpq/fe-cancel.c @@ -379,7 +379,24 @@ PQgetCancel(PGconn *conn) /* Check that we have received a cancellation key */ if (conn->be_cancel_key_len == 0) - return NULL; + { + /* + * In case there is no cancel key, return an all-zero PGcancel object. + * Actually calling PQcancel on this will fail, but we allow creating + * the PGcancel object anyway. Arguably it would be better to return NULL + * to indicate that cancellation is not possible, but there'd be no + * way for the caller to distinguish "out of memory" from "server did + * not send a cancel key". Also, this is how PQgetCancel() has always + * behaved, and if we changed it, some clients would stop working + * altogether with servers that don't support cancellation. (The + * modern PQcancelCreate() function returns a failed connection object + * instead.) + * + * The returned dummy object has cancel_pkt_len == 0; we check for + * that in PQcancel() to identify it as a dummy.
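Note: from the client side, the dummy object only surfaces when a cancellation is actually attempted. A short sketch using nothing but the public API (PQgetCancel(), PQcancel() and PQfreeCancel() are the real entry points; the error text comes from the new code path below):

#include <stdio.h>
#include <libpq-fe.h>

/*
 * Request cancellation of the current query. With this patch, a server
 * that never sent a cancel key yields a dummy PGcancel whose PQcancel()
 * call fails with a descriptive message, instead of PQgetCancel()
 * returning NULL and being indistinguishable from out-of-memory.
 */
static void
cancel_current_query(PGconn *conn)
{
    PGcancel   *cancel = PQgetCancel(conn);
    char        errbuf[256];

    if (cancel == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return;
    }
    if (!PQcancel(cancel, errbuf, sizeof(errbuf)))
        fprintf(stderr, "cancel failed: %s\n", errbuf);
    PQfreeCancel(cancel);
}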
+ */ + return calloc(1, sizeof(PGcancel)); + } cancel_req_len = offsetof(CancelRequestPacket, cancelAuthCode) + conn->be_cancel_key_len; cancel = malloc(offsetof(PGcancel, cancel_req) + cancel_req_len); @@ -544,6 +561,15 @@ PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) return false; } + if (cancel->cancel_pkt_len == 0) + { + /* This is a dummy PGcancel object, see PQgetCancel */ + strlcpy(errbuf, "PQcancel() -- no cancellation key received", errbufsize); + /* strlcpy probably doesn't change errno, but be paranoid */ + SOCK_ERRNO_SET(save_errno); + return false; + } + /* * We need to open a temporary connection to the postmaster. Do this with * only kernel calls. diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 2a2b10d5a29..afa85d9fca9 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -7574,10 +7574,12 @@ PQport(const PGconn *conn) if (!conn) return NULL; - if (conn->connhost != NULL) + if (conn->connhost != NULL && + conn->connhost[conn->whichhost].port != NULL && + conn->connhost[conn->whichhost].port[0] != '\0') return conn->connhost[conn->whichhost].port; - return ""; + return DEF_PGPORT_STR; } /* diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index 70c28f2ffca..a701c25038a 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -357,7 +357,8 @@ typedef struct pg_conn_host pg_conn_host_type type; /* type of host address */ char *host; /* host name or socket path */ char *hostaddr; /* host numeric IP address */ - char *port; /* port number (always provided) */ + char *port; /* port number (if NULL or empty, use + * DEF_PGPORT[_STR]) */ char *password; /* password for this host, read from the * password file; NULL if not sought or not * found in password file. */ diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c index ee961425a5b..f6976689a69 100644 --- a/src/pl/plpgsql/src/pl_comp.c +++ b/src/pl/plpgsql/src/pl_comp.c @@ -177,6 +177,7 @@ plpgsql_compile_callback(FunctionCallInfo fcinfo, yyscan_t scanner; Datum prosrcdatum; char *proc_source; + char *proc_signature; HeapTuple typeTup; Form_pg_type typeStruct; PLpgSQL_variable *var; @@ -223,6 +224,9 @@ plpgsql_compile_callback(FunctionCallInfo fcinfo, plpgsql_check_syntax = forValidator; plpgsql_curr_compile = function; + /* format_procedure leaks memory, so run it in temp context */ + proc_signature = format_procedure(fcinfo->flinfo->fn_oid); + /* * All the permanent output of compilation (e.g. parse tree) is kept in a * per-function memory context, so it can be reclaimed easily. @@ -237,7 +241,7 @@ plpgsql_compile_callback(FunctionCallInfo fcinfo, ALLOCSET_DEFAULT_SIZES); plpgsql_compile_tmp_cxt = MemoryContextSwitchTo(func_cxt); - function->fn_signature = format_procedure(fcinfo->flinfo->fn_oid); + function->fn_signature = pstrdup(proc_signature); MemoryContextSetIdentifier(func_cxt, function->fn_signature); function->fn_oid = fcinfo->flinfo->fn_oid; function->fn_input_collation = fcinfo->fncollation; @@ -1673,6 +1677,11 @@ plpgsql_parse_wordrowtype(char *ident) { Oid classOid; Oid typOid; + TypeName *typName; + MemoryContext oldCxt; + + /* Avoid memory leaks in long-term function context */ + oldCxt = MemoryContextSwitchTo(plpgsql_compile_tmp_cxt); /* * Look up the relation. 
Note that because relation rowtypes have the @@ -1695,9 +1704,12 @@ plpgsql_parse_wordrowtype(char *ident) errmsg("relation \"%s\" does not have a composite type", ident))); + typName = makeTypeName(ident); + + MemoryContextSwitchTo(oldCxt); + /* Build and return the row type struct */ - return plpgsql_build_datatype(typOid, -1, InvalidOid, - makeTypeName(ident)); + return plpgsql_build_datatype(typOid, -1, InvalidOid, typName); } /* ---------- @@ -1711,6 +1723,7 @@ plpgsql_parse_cwordrowtype(List *idents) Oid classOid; Oid typOid; RangeVar *relvar; + TypeName *typName; MemoryContext oldCxt; /* @@ -1733,11 +1746,12 @@ plpgsql_parse_cwordrowtype(List *idents) errmsg("relation \"%s\" does not have a composite type", relvar->relname))); + typName = makeTypeNameFromNameList(idents); + MemoryContextSwitchTo(oldCxt); /* Build and return the row type struct */ - return plpgsql_build_datatype(typOid, -1, InvalidOid, - makeTypeNameFromNameList(idents)); + return plpgsql_build_datatype(typOid, -1, InvalidOid, typName); } /* @@ -1952,6 +1966,8 @@ plpgsql_build_recfield(PLpgSQL_rec *rec, const char *fldname) * origtypname is the parsed form of what the user wrote as the type name. * It can be NULL if the type could not be a composite type, or if it was * identified by OID to begin with (e.g., it's a function argument type). + * origtypname is in short-lived storage and must be copied if we choose + * to incorporate it into the function's parse tree. */ PLpgSQL_type * plpgsql_build_datatype(Oid typeOid, int32 typmod, @@ -2070,7 +2086,7 @@ build_datatype(HeapTuple typeTup, int32 typmod, errmsg("type %s is not composite", format_type_be(typ->typoid)))); - typ->origtypname = origtypname; + typ->origtypname = copyObject(origtypname); typ->tcache = typentry; typ->tupdesc_id = typentry->tupDesc_identifier; } diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y index 7b672ea5179..17568d82554 100644 --- a/src/pl/plpgsql/src/pl_gram.y +++ b/src/pl/plpgsql/src/pl_gram.y @@ -3853,6 +3853,7 @@ parse_datatype(const char *string, int location, yyscan_t yyscanner) int32 typmod; sql_error_callback_arg cbarg; ErrorContextCallback syntax_errcontext; + MemoryContext oldCxt; cbarg.location = location; cbarg.yyscanner = yyscanner; @@ -3862,9 +3863,14 @@ parse_datatype(const char *string, int location, yyscan_t yyscanner) syntax_errcontext.previous = error_context_stack; error_context_stack = &syntax_errcontext; - /* Let the main parser try to parse it under standard SQL rules */ + /* + * Let the main parser try to parse it under standard SQL rules. The + * parser leaks memory, so run it in temp context. + */ + oldCxt = MemoryContextSwitchTo(plpgsql_compile_tmp_cxt); typeName = typeStringToTypeName(string, NULL); typenameTypeIdAndMod(NULL, typeName, &type_id, &typmod); + MemoryContextSwitchTo(oldCxt); /* Restore former ereport callback */ error_context_stack = syntax_errcontext.previous; diff --git a/src/pl/plpython/Makefile b/src/pl/plpython/Makefile index f959083a0bd..25f295c3709 100644 --- a/src/pl/plpython/Makefile +++ b/src/pl/plpython/Makefile @@ -11,7 +11,7 @@ ifeq ($(PORTNAME), win32) override python_libspec = endif -override CPPFLAGS := -I. -I$(srcdir) $(python_includespec) $(CPPFLAGS) +override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) $(python_includespec) rpathdir = $(python_libdir) diff --git a/src/pl/tcl/Makefile b/src/pl/tcl/Makefile index ea52a2efc22..dd57f7d694c 100644 --- a/src/pl/tcl/Makefile +++ b/src/pl/tcl/Makefile @@ -11,7 +11,7 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -override CPPFLAGS := -I. -I$(srcdir) $(TCL_INCLUDE_SPEC) $(CPPFLAGS) +override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) $(TCL_INCLUDE_SPEC) # On Windows, we don't link directly with the Tcl library; see below ifneq ($(PORTNAME), win32) diff --git a/src/port/pgmkdirp.c b/src/port/pgmkdirp.c index d943559760d..7d7cea4dd0e 100644 --- a/src/port/pgmkdirp.c +++ b/src/port/pgmkdirp.c @@ -73,7 +73,7 @@ pg_mkdir_p(char *path, int omode) if (p[0] == '/' && p[1] == '/') { /* network drive */ - p = strstr(p + 2, "/"); + p = strchr(p + 2, '/'); if (p == NULL) { errno = EINVAL; diff --git a/src/test/isolation/expected/merge-match-recheck.out b/src/test/isolation/expected/merge-match-recheck.out index 9a44a595927..90300f1db5a 100644 --- a/src/test/isolation/expected/merge-match-recheck.out +++ b/src/test/isolation/expected/merge-match-recheck.out @@ -241,19 +241,28 @@ starting permutation: update_bal1_tg merge_bal_tg c2 select1_tg c1 s2: NOTICE: Update: (1,160,s1,setup) -> (1,50,s1,"setup updated by update_bal1_tg") step update_bal1_tg: UPDATE target_tg t SET balance = 50, val = t.val || ' updated by update_bal1_tg' WHERE t.key = 1; step merge_bal_tg: - MERGE INTO target_tg t - USING (SELECT 1 as key) s - ON s.key = t.key - WHEN MATCHED AND balance < 100 THEN - UPDATE SET balance = balance * 2, val = t.val || ' when1' - WHEN MATCHED AND balance < 200 THEN - UPDATE SET balance = balance * 4, val = t.val || ' when2' - WHEN MATCHED AND balance < 300 THEN - UPDATE SET balance = balance * 8, val = t.val || ' when3'; + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; <waiting ...> step c2: COMMIT; s1: NOTICE: Update: (1,50,s1,"setup updated by update_bal1_tg") -> (1,100,s1,"setup updated by update_bal1_tg when1") step merge_bal_tg: <... 
completed> +key|balance|status|val +---+-------+------+------------------------------------- + 1| 100|s1 |setup updated by update_bal1_tg when1 +(1 row) + step select1_tg: SELECT * FROM target_tg; key|balance|status|val ---+-------+------+------------------------------------- diff --git a/src/test/isolation/specs/merge-match-recheck.spec b/src/test/isolation/specs/merge-match-recheck.spec index 26266b8c297..15226e40c9e 100644 --- a/src/test/isolation/specs/merge-match-recheck.spec +++ b/src/test/isolation/specs/merge-match-recheck.spec @@ -99,15 +99,19 @@ step "merge_bal_pa" } step "merge_bal_tg" { - MERGE INTO target_tg t - USING (SELECT 1 as key) s - ON s.key = t.key - WHEN MATCHED AND balance < 100 THEN - UPDATE SET balance = balance * 2, val = t.val || ' when1' - WHEN MATCHED AND balance < 200 THEN - UPDATE SET balance = balance * 4, val = t.val || ' when2' - WHEN MATCHED AND balance < 300 THEN - UPDATE SET balance = balance * 8, val = t.val || ' when3'; + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; } step "merge_delete" diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index aa1d27bbed3..7d3d3d52b45 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -15,6 +15,7 @@ SUBDIRS = \ plsample \ spgist_name_ops \ test_aio \ + test_binaryheap \ test_bloomfilter \ test_copy_callbacks \ test_custom_rmgrs \ diff --git a/src/test/modules/injection_points/injection_stats.c b/src/test/modules/injection_points/injection_stats.c index 14903c629e0..e3947b23ba5 100644 --- a/src/test/modules/injection_points/injection_stats.c +++ b/src/test/modules/injection_points/injection_stats.c @@ -59,7 +59,7 @@ static const PgStat_KindInfo injection_stats = { /* * Kind ID reserved for statistics of injection points. */ -#define PGSTAT_KIND_INJECTION 129 +#define PGSTAT_KIND_INJECTION 25 /* Track if stats are loaded */ static bool inj_stats_loaded = false; diff --git a/src/test/modules/injection_points/injection_stats_fixed.c b/src/test/modules/injection_points/injection_stats_fixed.c index 3d0c01bdd05..bc54c79d190 100644 --- a/src/test/modules/injection_points/injection_stats_fixed.c +++ b/src/test/modules/injection_points/injection_stats_fixed.c @@ -64,7 +64,7 @@ static const PgStat_KindInfo injection_stats_fixed = { /* * Kind ID reserved for statistics of injection points. 
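Note: this renumbering, and the matching one in injection_stats_fixed.c just below, follows the shrunken ranges in pgstat_kind.h above: custom kind IDs now live in 24..32 instead of 128..256. A compile-time guard an extension might carry, with MY_PGSTAT_KIND as a hypothetical reserved ID:

#include "postgres.h"
#include "utils/pgstat_kind.h"

/* Hypothetical ID reserved via the Custom Cumulative Stats wiki page. */
#define MY_PGSTAT_KIND 27

/* An ID picked under the old 128..256 scheme now fails this check. */
StaticAssertDecl(MY_PGSTAT_KIND >= PGSTAT_KIND_CUSTOM_MIN &&
                 MY_PGSTAT_KIND <= PGSTAT_KIND_CUSTOM_MAX,
                 "custom cumulative stats kind ID out of range");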
*/ -#define PGSTAT_KIND_INJECTION_FIXED 130 +#define PGSTAT_KIND_INJECTION_FIXED 26 /* Track if fixed-numbered stats are loaded */ static bool inj_fixed_loaded = false; diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 9de0057bd1d..dd5cd065ba1 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -14,6 +14,7 @@ subdir('plsample') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') subdir('test_aio') +subdir('test_binaryheap') subdir('test_bloomfilter') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') diff --git a/src/test/modules/test_binaryheap/.gitignore b/src/test/modules/test_binaryheap/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/src/test/modules/test_binaryheap/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_binaryheap/Makefile b/src/test/modules/test_binaryheap/Makefile new file mode 100644 index 00000000000..d310fbc9e88 --- /dev/null +++ b/src/test/modules/test_binaryheap/Makefile @@ -0,0 +1,24 @@ +# src/test/modules/test_binaryheap/Makefile + +MODULE_big = test_binaryheap +OBJS = \ + $(WIN32RES) \ + test_binaryheap.o + +PGFILEDESC = "test_binaryheap - test code for binaryheap" + +EXTENSION = test_binaryheap +DATA = test_binaryheap--1.0.sql + +REGRESS = test_binaryheap + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_binaryheap +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_binaryheap/expected/test_binaryheap.out b/src/test/modules/test_binaryheap/expected/test_binaryheap.out new file mode 100644 index 00000000000..16ce07875e3 --- /dev/null +++ b/src/test/modules/test_binaryheap/expected/test_binaryheap.out @@ -0,0 +1,12 @@ +CREATE EXTENSION test_binaryheap; +-- +-- These tests don't produce any interesting output. We're checking that +-- the operations complete without crashing or hanging and that none of their +-- internal sanity tests fail. 
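Note: for orientation before the new module's source below, a minimal backend-style sketch of the binaryheap API under test, using the same Datum-based max-heap comparator convention the module adopts (the function names are the real lib/binaryheap.h entry points):

#include "postgres.h"
#include "common/int.h"
#include "lib/binaryheap.h"

/* The node for which the comparator reports the largest value wins. */
static int
max_cmp(Datum a, Datum b, void *arg)
{
    return pg_cmp_s32(DatumGetInt32(a), DatumGetInt32(b));
}

/* Drain values in descending order, i.e. a heapsort. */
static void
drain_descending(void)
{
    binaryheap *heap = binaryheap_allocate(3, max_cmp, NULL);

    binaryheap_add(heap, Int32GetDatum(2));
    binaryheap_add(heap, Int32GetDatum(7));
    binaryheap_add(heap, Int32GetDatum(4));

    while (!binaryheap_empty(heap))
        elog(NOTICE, "%d", DatumGetInt32(binaryheap_remove_first(heap)));

    binaryheap_free(heap);
}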
+-- +SELECT test_binaryheap(); + test_binaryheap +----------------- + +(1 row) + diff --git a/src/test/modules/test_binaryheap/meson.build b/src/test/modules/test_binaryheap/meson.build new file mode 100644 index 00000000000..816a43c93e9 --- /dev/null +++ b/src/test/modules/test_binaryheap/meson.build @@ -0,0 +1,33 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +test_binaryheap_sources = files( + 'test_binaryheap.c', +) + +if host_system == 'windows' + test_binaryheap_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_binaryheap', + '--FILEDESC', 'test_binaryheap - test code for binaryheap',]) +endif + +test_binaryheap = shared_module('test_binaryheap', + test_binaryheap_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_binaryheap + +test_install_data += files( + 'test_binaryheap.control', + 'test_binaryheap--1.0.sql', +) + +tests += { + 'name': 'test_binaryheap', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_binaryheap', + ], + }, +} diff --git a/src/test/modules/test_binaryheap/sql/test_binaryheap.sql b/src/test/modules/test_binaryheap/sql/test_binaryheap.sql new file mode 100644 index 00000000000..8439545815b --- /dev/null +++ b/src/test/modules/test_binaryheap/sql/test_binaryheap.sql @@ -0,0 +1,8 @@ +CREATE EXTENSION test_binaryheap; + +-- +-- These tests don't produce any interesting output. We're checking that +-- the operations complete without crashing or hanging and that none of their +-- internal sanity tests fail. +-- +SELECT test_binaryheap(); diff --git a/src/test/modules/test_binaryheap/test_binaryheap--1.0.sql b/src/test/modules/test_binaryheap/test_binaryheap--1.0.sql new file mode 100644 index 00000000000..cddceeee603 --- /dev/null +++ b/src/test/modules/test_binaryheap/test_binaryheap--1.0.sql @@ -0,0 +1,7 @@ +/* src/test/modules/test_binaryheap/test_binaryheap--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_binaryheap" to load this file. \quit + +CREATE FUNCTION test_binaryheap() RETURNS VOID + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_binaryheap/test_binaryheap.c b/src/test/modules/test_binaryheap/test_binaryheap.c new file mode 100644 index 00000000000..583dae1da30 --- /dev/null +++ b/src/test/modules/test_binaryheap/test_binaryheap.c @@ -0,0 +1,275 @@ +/*-------------------------------------------------------------------------- + * + * test_binaryheap.c + * Test correctness of binary heap implementation. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_binaryheap/test_binaryheap.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/int.h" +#include "common/pg_prng.h" +#include "fmgr.h" +#include "lib/binaryheap.h" + +PG_MODULE_MAGIC; + +/* + * Test binaryheap_comparator for max-heap of integers. + */ +static int +int_cmp(Datum a, Datum b, void *arg) +{ + return pg_cmp_s32(DatumGetInt32(a), DatumGetInt32(b)); +} + +/* + * Loops through all nodes and returns the maximum value. + */ +static int +get_max_from_heap(binaryheap *heap) +{ + int max = -1; + + for (int i = 0; i < binaryheap_size(heap); i++) + max = Max(max, DatumGetInt32(binaryheap_get_node(heap, i))); + + return max; +} + +/* + * Generate a random permutation of the integers 0..size-1. 
+ */ +static int * +get_permutation(int size) +{ + int *permutation = (int *) palloc(size * sizeof(int)); + + permutation[0] = 0; + + /* + * This is the "inside-out" variant of the Fisher-Yates shuffle algorithm. + * Notionally, we append each new value to the array and then swap it with + * a randomly-chosen array element (possibly including itself, else we + * fail to generate permutations with the last integer last). The swap + * step can be optimized by combining it with the insertion. + */ + for (int i = 1; i < size; i++) + { + int j = pg_prng_uint64_range(&pg_global_prng_state, 0, i); + + if (j < i) /* avoid fetching undefined data if j=i */ + permutation[i] = permutation[j]; + permutation[j] = i; + } + + return permutation; +} + +/* + * Ensure that the heap property holds for the given heap, i.e., each parent is + * greater than or equal to its children. + */ +static void +verify_heap_property(binaryheap *heap) +{ + for (int i = 0; i < binaryheap_size(heap); i++) + { + int left = 2 * i + 1; + int right = 2 * i + 2; + int parent_val = DatumGetInt32(binaryheap_get_node(heap, i)); + + if (left < binaryheap_size(heap) && + parent_val < DatumGetInt32(binaryheap_get_node(heap, left))) + elog(ERROR, "parent node less than left child"); + + if (right < binaryheap_size(heap) && + parent_val < DatumGetInt32(binaryheap_get_node(heap, right))) + elog(ERROR, "parent node less than right child"); + } +} + +/* + * Check correctness of basic operations. + */ +static void +test_basic(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int *permutation = get_permutation(size); + + if (!binaryheap_empty(heap)) + elog(ERROR, "new heap not empty"); + if (binaryheap_size(heap) != 0) + elog(ERROR, "wrong size for new heap"); + + for (int i = 0; i < size; i++) + { + binaryheap_add(heap, Int32GetDatum(permutation[i])); + verify_heap_property(heap); + } + + if (binaryheap_empty(heap)) + elog(ERROR, "heap empty after adding values"); + if (binaryheap_size(heap) != size) + elog(ERROR, "wrong size for heap after adding values"); + + if (DatumGetInt32(binaryheap_first(heap)) != get_max_from_heap(heap)) + elog(ERROR, "incorrect root node after adding values"); + + for (int i = 0; i < size; i++) + { + int expected = get_max_from_heap(heap); + int actual = DatumGetInt32(binaryheap_remove_first(heap)); + + if (actual != expected) + elog(ERROR, "incorrect root node after removing root"); + verify_heap_property(heap); + } + + if (!binaryheap_empty(heap)) + elog(ERROR, "heap not empty after removing all nodes"); +} + +/* + * Test building heap after unordered additions. + */ +static void +test_build(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int *permutation = get_permutation(size); + + for (int i = 0; i < size; i++) + binaryheap_add_unordered(heap, Int32GetDatum(permutation[i])); + + if (binaryheap_size(heap) != size) + elog(ERROR, "wrong size for heap after unordered additions"); + + binaryheap_build(heap); + verify_heap_property(heap); +} + +/* + * Test removing nodes. 
+ */ +static void +test_remove_node(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int *permutation = get_permutation(size); + int remove_count = pg_prng_uint64_range(&pg_global_prng_state, + 0, size - 1); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(permutation[i])); + + for (int i = 0; i < remove_count; i++) + { + int idx = pg_prng_uint64_range(&pg_global_prng_state, + 0, binaryheap_size(heap) - 1); + + binaryheap_remove_node(heap, idx); + verify_heap_property(heap); + } + + if (binaryheap_size(heap) != size - remove_count) + elog(ERROR, "wrong size after removing nodes"); +} + +/* + * Test replacing the root node. + */ +static void +test_replace_first(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(i)); + + /* + * Replace root with a value smaller than everything in the heap. + */ + binaryheap_replace_first(heap, Int32GetDatum(-1)); + verify_heap_property(heap); + + /* + * Replace root with a value in the middle of the heap. + */ + binaryheap_replace_first(heap, Int32GetDatum(size / 2)); + verify_heap_property(heap); + + /* + * Replace root with a larger value than everything in the heap. + */ + binaryheap_replace_first(heap, Int32GetDatum(size + 1)); + verify_heap_property(heap); +} + +/* + * Test duplicate values. + */ +static void +test_duplicates(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int dup = pg_prng_uint64_range(&pg_global_prng_state, 0, size - 1); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(dup)); + + for (int i = 0; i < size; i++) + { + if (DatumGetInt32(binaryheap_remove_first(heap)) != dup) + elog(ERROR, "unexpected value in heap with duplicates"); + } +} + +/* + * Test resetting. + */ +static void +test_reset(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(i)); + + binaryheap_reset(heap); + + if (!binaryheap_empty(heap)) + elog(ERROR, "heap not empty after resetting"); +} + +/* + * SQL-callable entry point to perform all tests. + */ +PG_FUNCTION_INFO_V1(test_binaryheap); + +Datum +test_binaryheap(PG_FUNCTION_ARGS) +{ + static const int test_sizes[] = {1, 2, 3, 10, 100, 1000}; + + for (int i = 0; i < sizeof(test_sizes) / sizeof(int); i++) + { + int size = test_sizes[i]; + + test_basic(size); + test_build(size); + test_remove_node(size); + test_replace_first(size); + test_duplicates(size); + test_reset(size); + } + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_binaryheap/test_binaryheap.control b/src/test/modules/test_binaryheap/test_binaryheap.control new file mode 100644 index 00000000000..dd0785e05bd --- /dev/null +++ b/src/test/modules/test_binaryheap/test_binaryheap.control @@ -0,0 +1,5 @@ +# test_binaryheap extension +comment = 'Test code for binaryheap' +default_version = '1.0' +module_pathname = '$libdir/test_binaryheap' +relocatable = true diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 301766d2ed9..35413f14019 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -290,6 +290,33 @@ sub connstr =pod +=item $node->is_alive() + +Check if the node is alive, using pg_isready. +Returns 1 if successful, 0 on failure. 
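Note: is_alive() shells out to pg_isready, which performs a server ping without authenticating (it is built on PQpingParams()). The equivalent check, sketched directly against libpq with an assumed conninfo string:

#include <libpq-fe.h>

/* Returns 1 if the server at conninfo accepts connections, else 0. */
static int
node_is_alive(const char *conninfo)
{
    return PQping(conninfo) == PQPING_OK;
}

For example, node_is_alive("host=localhost port=5432 connect_timeout=180") mirrors the pg_isready invocation above with the default test timeout.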
+ +=cut + +sub is_alive +{ + my ($self) = @_; + local %ENV = $self->_get_env(); + + my $ret = PostgreSQL::Test::Utils::system_log( + 'pg_isready', + '--timeout' => $PostgreSQL::Test::Utils::timeout_default, + '--host' => $self->host, + '--port' => $self->port); + + if ($ret != 0) + { + return 0; + } + return 1; +} + +=pod + =item $node->raw_connect() Open a raw TCP or Unix domain socket connection to the server. This is diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 6e78ff1a030..52993c32dbb 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -54,6 +54,7 @@ tests += { 't/043_no_contrecord_switch.pl', 't/044_invalidate_inactive_slots.pl', 't/045_archive_restartpoint.pl', + 't/046_checkpoint_logical_slot.pl', 't/047_checkpoint_physical_slot.pl', 't/048_vacuum_horizon_floor.pl' ], diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl index debfa635c36..4c5af018ee4 100644 --- a/src/test/recovery/t/013_crash_restart.pl +++ b/src/test/recovery/t/013_crash_restart.pl @@ -228,6 +228,13 @@ is( $node->safe_psql( 'before-orderly-restart', 'can still write after crash restart'); +# Confirm that the logical replication launcher, a background worker +# without the never-restart flag, has also restarted successfully. +is($node->poll_query_until('postgres', + "SELECT count(*) = 1 FROM pg_stat_activity WHERE backend_type = 'logical replication launcher'"), + '1', + 'logical replication launcher restarted after crash'); + # Just to be sure, check that an orderly restart now still works $node->restart(); diff --git a/src/test/recovery/t/027_stream_regress.pl b/src/test/recovery/t/027_stream_regress.pl index 83def062d11..5d2c06ba06e 100644 --- a/src/test/recovery/t/027_stream_regress.pl +++ b/src/test/recovery/t/027_stream_regress.pl @@ -81,7 +81,14 @@ my $rc = . "--max-concurrent-tests=20 " . "--inputdir=../regress " . "--outputdir=\"$outputdir\""); -if ($rc != 0) + +# Regression diffs are only meaningful if both the primary and the standby +# are still alive after a regression test failure. A crash would cause a +# useless increase in the log quantity, mostly filled with information +# related to queries that could not run. +my $primary_alive = $node_primary->is_alive; +my $standby_alive = $node_standby_1->is_alive; +if ($rc != 0 && $primary_alive && $standby_alive) { # Dump out the regression diffs file, if there is one my $diffs = "$outputdir/regression.diffs"; @@ -93,6 +100,8 @@ if ($rc != 0) } } is($rc, 0, 'regression tests pass'); +is($primary_alive, 1, 'primary alive after regression test run'); +is($standby_alive, 1, 'standby alive after regression test run'); # Clobber all sequences with their next value, so that we don't have # differences between nodes due to caching. diff --git a/src/test/recovery/t/046_checkpoint_logical_slot.pl b/src/test/recovery/t/046_checkpoint_logical_slot.pl new file mode 100644 index 00000000000..4fd709e3a03 --- /dev/null +++ b/src/test/recovery/t/046_checkpoint_logical_slot.pl @@ -0,0 +1,142 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group +# +# This test verifies the case when the logical slot is advanced during +# checkpoint. The test checks that the logical slot's restart_lsn still refers +# to an existing WAL segment after immediate restart.
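Note: the invariant this new test exercises can be stated concretely: after the immediate restart, the slot's restart_lsn must still map to a WAL segment file present on disk. The test itself just re-runs logical decoding and expects no error; the helper below is illustrative only, built from the real XLByteToSeg() and XLogFilePath() macros:

#include "postgres.h"

#include <sys/stat.h>

#include "access/xlog_internal.h"

/*
 * Illustrative check: map restart_lsn to its segment file name (relative
 * to the data directory) and test for existence. A successful
 * pg_logical_slot_get_changes() after restart implies exactly this.
 */
static bool
restart_lsn_segment_exists(XLogRecPtr restart_lsn, TimeLineID tli,
                           int wal_segment_size)
{
    XLogSegNo   segno;
    char        path[MAXPGPATH];
    struct stat st;

    XLByteToSeg(restart_lsn, segno, wal_segment_size);
    XLogFilePath(path, tli, segno, wal_segment_size);
    return stat(path, &st) == 0;
}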
+# +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my ($node, $result); + +$node = PostgreSQL::Test::Cluster->new('mike'); +$node->init; +$node->append_conf('postgresql.conf', "wal_level = 'logical'"); +$node->start; + +# Check if the extension injection_points is available, as it may be +# possible that this script is run with installcheck, where the module +# would not be installed by default. +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$node->safe_psql('postgres', q(CREATE EXTENSION injection_points)); + +# Create the two slots we'll need. +$node->safe_psql('postgres', + q{select pg_create_logical_replication_slot('slot_logical', 'test_decoding')} +); +$node->safe_psql('postgres', + q{select pg_create_physical_replication_slot('slot_physical', true)}); + +# Advance both slots to the current position just to have everything "valid". +$node->safe_psql('postgres', + q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null)} +); +$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Run checkpoint to flush current state to disk and set a baseline. +$node->safe_psql('postgres', q{checkpoint}); + +# Generate some transactions to get RUNNING_XACTS. +my $xacts = $node->background_psql('postgres'); +$xacts->query_until( + qr/run_xacts/, + q(\echo run_xacts +SELECT 1 \watch 0.1 +\q +)); + +$node->advance_wal(20); + +# Run another checkpoint to set a new restore LSN. +$node->safe_psql('postgres', q{checkpoint}); + +$node->advance_wal(20); + +# Run another checkpoint, this time in the background, and make it wait +# on the injection point so that the checkpoint stops right before +# removing old WAL segments. +note('starting checkpoint'); + +my $checkpoint = $node->background_psql('postgres'); +$checkpoint->query_safe( + q(select injection_points_attach('checkpoint-before-old-wal-removal','wait')) +); +$checkpoint->query_until( + qr/starting_checkpoint/, + q(\echo starting_checkpoint +checkpoint; +\q +)); + +# Wait until the checkpoint stops right before removing WAL segments. +note('waiting for injection_point'); +$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal'); +note('injection_point is reached'); + +# Try to advance the logical slot, but make it stop when it moves to the next +# WAL segment (this has to happen in the background, too). +my $logical = $node->background_psql('postgres'); +$logical->query_safe( + q{select injection_points_attach('logical-replication-slot-advance-segment','wait');} +); +$logical->query_until( + qr/get_changes/, + q( +\echo get_changes +select count(*) from pg_logical_slot_get_changes('slot_logical', null, null) \watch 1 +\q +)); + +# Wait until the slot's restart_lsn points to the next WAL segment. +note('waiting for injection_point'); +$node->wait_for_event('client backend', + 'logical-replication-slot-advance-segment'); +note('injection_point is reached'); + +# OK, we're in the right situation: time to advance the physical slot, which +# recalculates the required LSN, and then unblock the checkpoint, which +# removes the WAL still needed by the logical slot.
$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Generate a long WAL record, spanning at least two pages for the follow-up +# post-recovery check. +$node->safe_psql('postgres', + q{select pg_logical_emit_message(false, '', repeat('123456789', 1000))}); + +# Continue the checkpoint and wait for its completion. +my $log_offset = -s $node->logfile; +$node->safe_psql('postgres', + q{select injection_points_wakeup('checkpoint-before-old-wal-removal')}); +$node->wait_for_log(qr/checkpoint complete/, $log_offset); + +# Abruptly stop the server. +$node->stop('immediate'); + +$node->start; + +eval { + $node->safe_psql('postgres', + q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null);} + ); +}; +is($@, '', "Logical slot still valid"); + +done_testing(); diff --git a/src/test/recovery/t/047_checkpoint_physical_slot.pl b/src/test/recovery/t/047_checkpoint_physical_slot.pl index a1332b5d44c..9e98383e30e 100644 --- a/src/test/recovery/t/047_checkpoint_physical_slot.pl +++ b/src/test/recovery/t/047_checkpoint_physical_slot.pl @@ -94,9 +94,11 @@ $node->safe_psql('postgres', q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} ); -# Continue the checkpoint. +# Continue the checkpoint and wait for its completion. +my $log_offset = -s $node->logfile; $node->safe_psql('postgres', q{select injection_points_wakeup('checkpoint-before-old-wal-removal')}); +$node->wait_for_log(qr/checkpoint complete/, $log_offset); my $restart_lsn_old = $node->safe_psql('postgres', q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'} ); chomp($restart_lsn_old); note("restart lsn before stop: $restart_lsn_old"); -# Abruptly stop the server (1 second should be enough for the checkpoint -# to finish; it would be better). +# Abruptly stop the server. $node->stop('immediate'); $node->start; diff --git a/src/test/regress/expected/compression.out b/src/test/regress/expected/compression.out index 4dd9ee7200d..09f198149aa 100644 --- a/src/test/regress/expected/compression.out +++ b/src/test/regress/expected/compression.out @@ -1,3 +1,7 @@ +-- Default set of tests for TOAST compression, independent of compression +-- methods supported by the build.
+CREATE SCHEMA pglz; +SET search_path TO pglz, public; \set HIDE_TOAST_COMPRESSION false -- ensure we get stable results regardless of installation's default SET default_toast_compression = 'pglz'; @@ -6,21 +10,13 @@ CREATE TABLE cmdata(f1 text COMPRESSION pglz); CREATE INDEX idx ON cmdata(f1); INSERT INTO cmdata VALUES(repeat('1234567890', 1000)); \d+ cmdata - Table "public.cmdata" + Table "pglz.cmdata" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | text | | | | extended | pglz | | Indexes: "idx" btree (f1) -CREATE TABLE cmdata1(f1 TEXT COMPRESSION lz4); -INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); -\d+ cmdata1 - Table "public.cmdata1" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | lz4 | | - -- verify stored compression method in the data SELECT pg_column_compression(f1) FROM cmdata; pg_column_compression @@ -28,12 +24,6 @@ SELECT pg_column_compression(f1) FROM cmdata; pglz (1 row) -SELECT pg_column_compression(f1) FROM cmdata1; - pg_column_compression ------------------------ - lz4 -(1 row) - -- decompress data slice SELECT SUBSTR(f1, 200, 5) FROM cmdata; substr @@ -41,16 +31,10 @@ SELECT SUBSTR(f1, 200, 5) FROM cmdata; 01234 (1 row) -SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; - substr ----------------------------------------------------- - 01234567890123456789012345678901234567890123456789 -(1 row) - -- copy with table creation SELECT * INTO cmmove1 FROM cmdata; \d+ cmmove1 - Table "public.cmmove1" + Table "pglz.cmmove1" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | text | | | | extended | | | @@ -61,45 +45,9 @@ SELECT pg_column_compression(f1) FROM cmmove1; pglz (1 row) --- copy to existing table -CREATE TABLE cmmove3(f1 text COMPRESSION pglz); -INSERT INTO cmmove3 SELECT * FROM cmdata; -INSERT INTO cmmove3 SELECT * FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove3; - pg_column_compression ------------------------ - pglz - lz4 -(2 rows) - --- test LIKE INCLUDING COMPRESSION -CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | lz4 | | - -DROP TABLE cmdata2; -- try setting compression for incompressible data type CREATE TABLE cmdata2 (f1 int COMPRESSION pglz); ERROR: column data type integer does not support compression --- update using datum from different table -CREATE TABLE cmmove2(f1 text COMPRESSION pglz); -INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - pglz -(1 row) - -UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - lz4 -(1 row) - -- test externally stored compressed data CREATE OR REPLACE FUNCTION large_val() RETURNS TEXT LANGUAGE SQL AS 'select 
array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; @@ -111,21 +59,6 @@ SELECT pg_column_compression(f1) FROM cmdata2; pglz (1 row) -INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); -SELECT pg_column_compression(f1) FROM cmdata1; - pg_column_compression ------------------------ - lz4 - lz4 -(2 rows) - -SELECT SUBSTR(f1, 200, 5) FROM cmdata1; - substr --------- - 01234 - 79026 -(2 rows) - SELECT SUBSTR(f1, 200, 5) FROM cmdata2; substr -------- @@ -136,21 +69,21 @@ DROP TABLE cmdata2; --test column type update varlena/non-varlena CREATE TABLE cmdata2 (f1 int); \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+---------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | integer | | | | plain | | | ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | character varying | | | | extended | | | ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE int USING f1::integer; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+---------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | integer | | | | plain | | | @@ -160,14 +93,14 @@ ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE int USING f1::integer; ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION pglz; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | character varying | | | | extended | pglz | | ALTER TABLE cmdata2 ALTER COLUMN f1 SET STORAGE plain; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | character varying | | | | plain | pglz | | @@ -179,164 +112,47 @@ SELECT pg_column_compression(f1) FROM cmdata2; (1 row) --- test compression with materialized view -CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; -\d+ compressmv - Materialized view "public.compressmv" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - x | text | | | | extended | | | -View definition: - SELECT f1 AS x - FROM cmdata1; - -SELECT pg_column_compression(f1) FROM cmdata1; - pg_column_compression ------------------------ - lz4 - lz4 -(2 rows) - -SELECT pg_column_compression(x) FROM compressmv; - pg_column_compression ------------------------ - lz4 - lz4 -(2 rows) - --- test compression with partition -CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); -CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); -CREATE TABLE cmpart2(f1 text COMPRESSION pglz); -ALTER TABLE 
cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; - pg_column_compression ------------------------ - lz4 -(1 row) - -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression ------------------------ - pglz -(1 row) - -- test compression with inheritance -CREATE TABLE cminh() INHERITS(cmdata, cmdata1); -- error -NOTICE: merging multiple inherited definitions of column "f1" -ERROR: column "f1" has a compression method conflict -DETAIL: pglz versus lz4 -CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata); -- error -NOTICE: merging column "f1" with inherited definition -ERROR: column "f1" has a compression method conflict -DETAIL: pglz versus lz4 CREATE TABLE cmdata3(f1 text); CREATE TABLE cminh() INHERITS (cmdata, cmdata3); NOTICE: merging multiple inherited definitions of column "f1" -- test default_toast_compression GUC +-- suppress machine-dependent details +\set VERBOSITY terse SET default_toast_compression = ''; ERROR: invalid value for parameter "default_toast_compression": "" -HINT: Available values: pglz, lz4. SET default_toast_compression = 'I do not exist compression'; ERROR: invalid value for parameter "default_toast_compression": "I do not exist compression" -HINT: Available values: pglz, lz4. -SET default_toast_compression = 'lz4'; SET default_toast_compression = 'pglz'; --- test alter compression method -ALTER TABLE cmdata ALTER COLUMN f1 SET COMPRESSION lz4; -INSERT INTO cmdata VALUES (repeat('123456789', 4004)); -\d+ cmdata - Table "public.cmdata" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | lz4 | | -Indexes: - "idx" btree (f1) -Child tables: cminh - -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - lz4 -(2 rows) - +\set VERBOSITY default ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION default; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | character varying | | | | plain | | | --- test alter compression method for materialized views -ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; -\d+ compressmv - Materialized view "public.compressmv" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - x | text | | | | extended | lz4 | | -View definition: - SELECT f1 AS x - FROM cmdata1; - --- test alter compression method for partitioned tables -ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; -ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; --- new data should be compressed with the current compression method -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; - pg_column_compression ------------------------ - lz4 - pglz -(2 rows) - -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression 
------------------------ - pglz - lz4 -(2 rows) - +DROP TABLE cmdata2; -- VACUUM FULL does not recompress SELECT pg_column_compression(f1) FROM cmdata; pg_column_compression ----------------------- pglz - lz4 -(2 rows) +(1 row) VACUUM FULL cmdata; SELECT pg_column_compression(f1) FROM cmdata; pg_column_compression ----------------------- pglz - lz4 -(2 rows) +(1 row) --- test expression index -DROP TABLE cmdata2; -CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); -CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); -INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM -generate_series(1, 50) g), VERSION()); -- check data is ok SELECT length(f1) FROM cmdata; length -------- 10000 - 36036 -(2 rows) - -SELECT length(f1) FROM cmdata1; - length --------- - 10040 - 12449 -(2 rows) +(1 row) SELECT length(f1) FROM cmmove1; length @@ -344,19 +160,6 @@ SELECT length(f1) FROM cmmove1; 10000 (1 row) -SELECT length(f1) FROM cmmove2; - length --------- - 10040 -(1 row) - -SELECT length(f1) FROM cmmove3; - length --------- - 10000 - 10040 -(2 rows) - CREATE TABLE badcompresstbl (a text COMPRESSION I_Do_Not_Exist_Compression); -- fails ERROR: invalid compression method "i_do_not_exist_compression" CREATE TABLE badcompresstbl (a text); diff --git a/src/test/regress/expected/compression_1.out b/src/test/regress/expected/compression_1.out deleted file mode 100644 index 7bd7642b4b9..00000000000 --- a/src/test/regress/expected/compression_1.out +++ /dev/null @@ -1,360 +0,0 @@ -\set HIDE_TOAST_COMPRESSION false --- ensure we get stable results regardless of installation's default -SET default_toast_compression = 'pglz'; --- test creating table with compression method -CREATE TABLE cmdata(f1 text COMPRESSION pglz); -CREATE INDEX idx ON cmdata(f1); -INSERT INTO cmdata VALUES(repeat('1234567890', 1000)); -\d+ cmdata - Table "public.cmdata" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | pglz | | -Indexes: - "idx" btree (f1) - -CREATE TABLE cmdata1(f1 TEXT COMPRESSION lz4); -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. 
-INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); -ERROR: relation "cmdata1" does not exist -LINE 1: INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); - ^ -\d+ cmdata1 --- verify stored compression method in the data -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz -(1 row) - -SELECT pg_column_compression(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmdata1; - ^ --- decompress data slice -SELECT SUBSTR(f1, 200, 5) FROM cmdata; - substr --------- - 01234 -(1 row) - -SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; - ^ --- copy with table creation -SELECT * INTO cmmove1 FROM cmdata; -\d+ cmmove1 - Table "public.cmmove1" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | | | - -SELECT pg_column_compression(f1) FROM cmmove1; - pg_column_compression ------------------------ - pglz -(1 row) - --- copy to existing table -CREATE TABLE cmmove3(f1 text COMPRESSION pglz); -INSERT INTO cmmove3 SELECT * FROM cmdata; -INSERT INTO cmmove3 SELECT * FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: INSERT INTO cmmove3 SELECT * FROM cmdata1; - ^ -SELECT pg_column_compression(f1) FROM cmmove3; - pg_column_compression ------------------------ - pglz -(1 row) - --- test LIKE INCLUDING COMPRESSION -CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); -ERROR: relation "cmdata1" does not exist -LINE 1: CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); - ^ -\d+ cmdata2 -DROP TABLE cmdata2; -ERROR: table "cmdata2" does not exist --- try setting compression for incompressible data type -CREATE TABLE cmdata2 (f1 int COMPRESSION pglz); -ERROR: column data type integer does not support compression --- update using datum from different table -CREATE TABLE cmmove2(f1 text COMPRESSION pglz); -INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - pglz -(1 row) - -UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; - ^ -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - pglz -(1 row) - --- test externally stored compressed data -CREATE OR REPLACE FUNCTION large_val() RETURNS TEXT LANGUAGE SQL AS -'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; -CREATE TABLE cmdata2 (f1 text COMPRESSION pglz); -INSERT INTO cmdata2 SELECT large_val() || repeat('a', 4000); -SELECT pg_column_compression(f1) FROM cmdata2; - pg_column_compression ------------------------ - pglz -(1 row) - -INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); -ERROR: relation "cmdata1" does not exist -LINE 1: INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); - ^ -SELECT pg_column_compression(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmdata1; - ^ -SELECT SUBSTR(f1, 200, 5) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT SUBSTR(f1, 200, 5) FROM cmdata1; - ^ -SELECT SUBSTR(f1, 200, 5) FROM cmdata2; - substr --------- - 79026 -(1 row) - -DROP TABLE cmdata2; ---test 
column type update varlena/non-varlena -CREATE TABLE cmdata2 (f1 int); -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+---------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | integer | | | | plain | | | - -ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | character varying | | | | extended | | | - -ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE int USING f1::integer; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+---------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | integer | | | | plain | | | - ---changing column storage should not impact the compression method ---but the data should not be compressed -ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; -ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION pglz; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | character varying | | | | extended | pglz | | - -ALTER TABLE cmdata2 ALTER COLUMN f1 SET STORAGE plain; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | character varying | | | | plain | pglz | | - -INSERT INTO cmdata2 VALUES (repeat('123456789', 800)); -SELECT pg_column_compression(f1) FROM cmdata2; - pg_column_compression ------------------------ - -(1 row) - --- test compression with materialized view -CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: ...TE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; - ^ -\d+ compressmv -SELECT pg_column_compression(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmdata1; - ^ -SELECT pg_column_compression(x) FROM compressmv; -ERROR: relation "compressmv" does not exist -LINE 1: SELECT pg_column_compression(x) FROM compressmv; - ^ --- test compression with partition -CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. 
-CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); -ERROR: relation "cmpart" does not exist -CREATE TABLE cmpart2(f1 text COMPRESSION pglz); -ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); -ERROR: relation "cmpart" does not exist -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 1004)); - ^ -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 4004)); - ^ -SELECT pg_column_compression(f1) FROM cmpart1; -ERROR: relation "cmpart1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmpart1; - ^ -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression ------------------------ -(0 rows) - --- test compression with inheritance -CREATE TABLE cminh() INHERITS(cmdata, cmdata1); -- error -ERROR: relation "cmdata1" does not exist -CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata); -- error -NOTICE: merging column "f1" with inherited definition -ERROR: column "f1" has a compression method conflict -DETAIL: pglz versus lz4 -CREATE TABLE cmdata3(f1 text); -CREATE TABLE cminh() INHERITS (cmdata, cmdata3); -NOTICE: merging multiple inherited definitions of column "f1" --- test default_toast_compression GUC -SET default_toast_compression = ''; -ERROR: invalid value for parameter "default_toast_compression": "" -HINT: Available values: pglz. -SET default_toast_compression = 'I do not exist compression'; -ERROR: invalid value for parameter "default_toast_compression": "I do not exist compression" -HINT: Available values: pglz. -SET default_toast_compression = 'lz4'; -ERROR: invalid value for parameter "default_toast_compression": "lz4" -HINT: Available values: pglz. -SET default_toast_compression = 'pglz'; --- test alter compression method -ALTER TABLE cmdata ALTER COLUMN f1 SET COMPRESSION lz4; -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. -INSERT INTO cmdata VALUES (repeat('123456789', 4004)); -\d+ cmdata - Table "public.cmdata" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | pglz | | -Indexes: - "idx" btree (f1) -Child tables: cminh - -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - pglz -(2 rows) - -ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION default; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | character varying | | | | plain | | | - --- test alter compression method for materialized views -ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; -ERROR: relation "compressmv" does not exist -\d+ compressmv --- test alter compression method for partitioned tables -ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; -ERROR: relation "cmpart1" does not exist -ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. 
--- new data should be compressed with the current compression method -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 1004)); - ^ -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 4004)); - ^ -SELECT pg_column_compression(f1) FROM cmpart1; -ERROR: relation "cmpart1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmpart1; - ^ -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression ------------------------ -(0 rows) - --- VACUUM FULL does not recompress -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - pglz -(2 rows) - -VACUUM FULL cmdata; -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - pglz -(2 rows) - --- test expression index -DROP TABLE cmdata2; -CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. -CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); -ERROR: relation "cmdata2" does not exist -INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM -generate_series(1, 50) g), VERSION()); -ERROR: relation "cmdata2" does not exist -LINE 1: INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEX... - ^ --- check data is ok -SELECT length(f1) FROM cmdata; - length --------- - 10000 - 36036 -(2 rows) - -SELECT length(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT length(f1) FROM cmdata1; - ^ -SELECT length(f1) FROM cmmove1; - length --------- - 10000 -(1 row) - -SELECT length(f1) FROM cmmove2; - length --------- - 10040 -(1 row) - -SELECT length(f1) FROM cmmove3; - length --------- - 10000 -(1 row) - -CREATE TABLE badcompresstbl (a text COMPRESSION I_Do_Not_Exist_Compression); -- fails -ERROR: invalid compression method "i_do_not_exist_compression" -CREATE TABLE badcompresstbl (a text); -ALTER TABLE badcompresstbl ALTER a SET COMPRESSION I_Do_Not_Exist_Compression; -- fails -ERROR: invalid compression method "i_do_not_exist_compression" -DROP TABLE badcompresstbl; -\set HIDE_TOAST_COMPRESSION true diff --git a/src/test/regress/expected/compression_lz4.out b/src/test/regress/expected/compression_lz4.out new file mode 100644 index 00000000000..068dd7c3674 --- /dev/null +++ b/src/test/regress/expected/compression_lz4.out @@ -0,0 +1,249 @@ +-- Tests for TOAST compression with lz4 +SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE + name = 'default_toast_compression' \gset +\if :skip_test + \echo '*** skipping TOAST tests with lz4 (not supported) ***' + \quit +\endif +CREATE SCHEMA lz4; +SET search_path TO lz4, public; +\set HIDE_TOAST_COMPRESSION false +-- Ensure we get stable results regardless of the installation's default. +-- We rely on this GUC value for a few tests. 
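The guard at the top of compression_lz4.sql is a reusable psql pattern: query pg_settings for the enum members of default_toast_compression, capture the result into a psql variable with \gset, and abandon the script with \if/\quit when the build lacks lz4. A minimal sketch of the same pattern, with placeholder names (feature_missing is invented for this illustration, not part of the suite):

    -- Probe the server for a capability, store the boolean in a psql
    -- variable, and skip the rest of the script when it is absent.
    -- Sketch only; the variable and message are placeholders.
    SELECT NOT (enumvals @> '{lz4}') AS feature_missing
    FROM pg_settings
    WHERE name = 'default_toast_compression' \gset
    \if :feature_missing
        \echo 'skipping: this build has no lz4 support'
        \quit
    \endif

Because pg_regress compares output files verbatim, a skipped run needs its own alternate expected file; compression_lz4_1.out below holds exactly the probe plus the skip message.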
+SET default_toast_compression = 'pglz'; +-- test creating table with compression method +CREATE TABLE cmdata_pglz(f1 text COMPRESSION pglz); +CREATE INDEX idx ON cmdata_pglz(f1); +INSERT INTO cmdata_pglz VALUES(repeat('1234567890', 1000)); +\d+ cmdata_pglz + Table "lz4.cmdata_pglz" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | pglz | | +Indexes: + "idx" btree (f1) + +CREATE TABLE cmdata_lz4(f1 TEXT COMPRESSION lz4); +INSERT INTO cmdata_lz4 VALUES(repeat('1234567890', 1004)); +\d+ cmdata_lz4 + Table "lz4.cmdata_lz4" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | lz4 | | + +-- verify stored compression method in the data +SELECT pg_column_compression(f1) FROM cmdata_lz4; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- decompress data slice +SELECT SUBSTR(f1, 200, 5) FROM cmdata_pglz; + substr +-------- + 01234 +(1 row) + +SELECT SUBSTR(f1, 2000, 50) FROM cmdata_lz4; + substr +---------------------------------------------------- + 01234567890123456789012345678901234567890123456789 +(1 row) + +-- copy with table creation +SELECT * INTO cmmove1 FROM cmdata_lz4; +\d+ cmmove1 + Table "lz4.cmmove1" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | | | + +SELECT pg_column_compression(f1) FROM cmmove1; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- test LIKE INCLUDING COMPRESSION. The GUC default_toast_compression +-- has no effect; the compression method is taken from the table being copied. +CREATE TABLE cmdata2 (LIKE cmdata_lz4 INCLUDING COMPRESSION); +\d+ cmdata2 + Table "lz4.cmdata2" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | lz4 | | + +DROP TABLE cmdata2; +-- copy to existing table +CREATE TABLE cmmove3(f1 text COMPRESSION pglz); +INSERT INTO cmmove3 SELECT * FROM cmdata_pglz; +INSERT INTO cmmove3 SELECT * FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove3; + pg_column_compression +----------------------- + pglz + lz4 +(2 rows) + +-- update using a datum from a different table with LZ4 data.
+CREATE TABLE cmmove2(f1 text COMPRESSION pglz); +INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); +SELECT pg_column_compression(f1) FROM cmmove2; + pg_column_compression +----------------------- + pglz +(1 row) + +UPDATE cmmove2 SET f1 = cmdata_lz4.f1 FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove2; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- test externally stored compressed data +CREATE OR REPLACE FUNCTION large_val_lz4() RETURNS TEXT LANGUAGE SQL AS +'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; +CREATE TABLE cmdata2 (f1 text COMPRESSION lz4); +INSERT INTO cmdata2 SELECT large_val_lz4() || repeat('a', 4000); +SELECT pg_column_compression(f1) FROM cmdata2; + pg_column_compression +----------------------- + lz4 +(1 row) + +SELECT SUBSTR(f1, 200, 5) FROM cmdata2; + substr +-------- + 79026 +(1 row) + +DROP TABLE cmdata2; +DROP FUNCTION large_val_lz4; +-- test compression with materialized view +CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata_lz4; +\d+ compressmv + Materialized view "lz4.compressmv" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + x | text | | | | extended | | | +View definition: + SELECT f1 AS x + FROM cmdata_lz4; + +SELECT pg_column_compression(f1) FROM cmdata_lz4; + pg_column_compression +----------------------- + lz4 +(1 row) + +SELECT pg_column_compression(x) FROM compressmv; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- test compression with partition +CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); +CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); +CREATE TABLE cmpart2(f1 text COMPRESSION pglz); +ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; + pg_column_compression +----------------------- + lz4 +(1 row) + +SELECT pg_column_compression(f1) FROM cmpart2; + pg_column_compression +----------------------- + pglz +(1 row) + +-- test compression with inheritance +CREATE TABLE cminh() INHERITS(cmdata_pglz, cmdata_lz4); -- error +NOTICE: merging multiple inherited definitions of column "f1" +ERROR: column "f1" has a compression method conflict +DETAIL: pglz versus lz4 +CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata_pglz); -- error +NOTICE: merging column "f1" with inherited definition +ERROR: column "f1" has a compression method conflict +DETAIL: pglz versus lz4 +CREATE TABLE cmdata3(f1 text); +CREATE TABLE cminh() INHERITS (cmdata_pglz, cmdata3); +NOTICE: merging multiple inherited definitions of column "f1" +-- test default_toast_compression GUC +SET default_toast_compression = 'lz4'; +-- test alter compression method +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION lz4; +INSERT INTO cmdata_pglz VALUES (repeat('123456789', 4004)); +\d+ cmdata_pglz + Table "lz4.cmdata_pglz" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | lz4 | | +Indexes: + "idx" btree (f1) +Child tables: cminh + +SELECT pg_column_compression(f1) FROM cmdata_pglz; + pg_column_compression +----------------------- + pglz + lz4 +(2 rows) + +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION pglz; +-- test alter compression method for materialized views +ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; +\d+ compressmv + Materialized view "lz4.compressmv" + Column | Type | 
Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + x | text | | | | extended | lz4 | | +View definition: + SELECT f1 AS x + FROM cmdata_lz4; + +-- test alter compression method for partitioned tables +ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; +ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; +-- new data should be compressed with the current compression method +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; + pg_column_compression +----------------------- + lz4 + pglz +(2 rows) + +SELECT pg_column_compression(f1) FROM cmpart2; + pg_column_compression +----------------------- + pglz + lz4 +(2 rows) + +-- test expression index +CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); +CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); +INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM +generate_series(1, 50) g), VERSION()); +-- check data is ok +SELECT length(f1) FROM cmdata_pglz; + length +-------- + 10000 + 36036 +(2 rows) + +SELECT length(f1) FROM cmdata_lz4; + length +-------- + 10040 +(1 row) + +SELECT length(f1) FROM cmmove1; + length +-------- + 10040 +(1 row) + +SELECT length(f1) FROM cmmove2; + length +-------- + 10040 +(1 row) + +SELECT length(f1) FROM cmmove3; + length +-------- + 10000 + 10040 +(2 rows) + +\set HIDE_TOAST_COMPRESSION true diff --git a/src/test/regress/expected/compression_lz4_1.out b/src/test/regress/expected/compression_lz4_1.out new file mode 100644 index 00000000000..198056fa224 --- /dev/null +++ b/src/test/regress/expected/compression_lz4_1.out @@ -0,0 +1,7 @@ +-- Tests for TOAST compression with lz4 +SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE + name = 'default_toast_compression' \gset +\if :skip_test + \echo '*** skipping TOAST tests with lz4 (not supported) ***' +*** skipping TOAST tests with lz4 (not supported) *** + \quit diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index ccea883cffd..3590d3274f0 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -1701,3 +1701,7 @@ DROP TABLE constraint_comments_tbl; DROP DOMAIN constraint_comments_dom; DROP ROLE regress_constraint_comments; DROP ROLE regress_constraint_comments_noaccess; +-- Leave some constraints for the pg_upgrade test to pick up +CREATE DOMAIN constraint_comments_dom AS int; +ALTER DOMAIN constraint_comments_dom ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID; +COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom IS 'comment on invalid constraint'; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index f9bd252444f..dc541d61adf 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1750,7 +1750,7 @@ Indexes: Referenced by: TABLE "fk_partitioned_fk" CONSTRAINT "fk_partitioned_fk_a_b_fkey" FOREIGN KEY (a, b) REFERENCES fk_notpartitioned_pk(a, b) --- Check the exsting FK trigger +-- Check the existing FK trigger SELECT conname, tgrelid::regclass as tgrel, regexp_replace(tgname, '[0-9]+', 'N') as tgname, tgtype FROM pg_trigger t JOIN pg_constraint c ON (t.tgconstraint = c.oid) WHERE tgrelid IN (SELECT relid FROM 
pg_partition_tree('fk_partitioned_fk'::regclass) diff --git a/src/test/regress/expected/generated_virtual.out b/src/test/regress/expected/generated_virtual.out index 3b40e15a95a..aca6347babe 100644 --- a/src/test/regress/expected/generated_virtual.out +++ b/src/test/regress/expected/generated_virtual.out @@ -1550,11 +1550,11 @@ where coalesce(t2.b, 1) = 2; explain (costs off) select t1.a from gtest32 t1 left join gtest32 t2 on t1.a = t2.a where coalesce(t2.b, 1) = 2 or t1.a is null; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +----------------------------------------- Hash Left Join Hash Cond: (t1.a = t2.a) - Filter: ((COALESCE((t2.a * 2), 1) = 2) OR (t1.a IS NULL)) + Filter: (COALESCE((t2.a * 2), 1) = 2) -> Seq Scan on gtest32 t1 -> Hash -> Seq Scan on gtest32 t2 @@ -1613,4 +1613,26 @@ select * from gtest32 t group by grouping sets (a, b, c, d, e) having c = 20; -- Ensure that the virtual generated columns in ALTER COLUMN TYPE USING expression are expanded alter table gtest32 alter column e type bigint using b; +-- Ensure that virtual generated column references within SubLinks that should +-- be transformed into joins can get expanded +explain (costs off) +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + QUERY PLAN +------------------------------------- + Nested Loop Semi Join + Join Filter: (t1.a > t2.a) + -> Seq Scan on gtest32 t1 + -> Materialize + -> Seq Scan on gtest32 t2 + Filter: ((a * 2) = 2) +(6 rows) + +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + ?column? +---------- + 1 +(1 row) + drop table gtest32; diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 46ddfa844c5..4d5d35d0727 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3639,8 +3639,8 @@ from nt3 as nt3 ) as ss2 on ss2.id = nt3.nt2_id where nt3.id = 1 and ss2.b3; - QUERY PLAN ------------------------------------------------ + QUERY PLAN +---------------------------------------------- Nested Loop -> Nested Loop -> Index Scan using nt3_pkey on nt3 @@ -3649,7 +3649,7 @@ where nt3.id = 1 and ss2.b3; Index Cond: (id = nt3.nt2_id) -> Index Only Scan using nt1_pkey on nt1 Index Cond: (id = nt2.nt1_id) - Filter: (nt2.b1 AND (id IS NOT NULL)) + Filter: (nt2.b1 AND true) (9 rows) select nt3.id diff --git a/src/test/regress/expected/predicate.out b/src/test/regress/expected/predicate.out index b79037748b7..59bfe33bb1c 100644 --- a/src/test/regress/expected/predicate.out +++ b/src/test/regress/expected/predicate.out @@ -84,10 +84,10 @@ SELECT * FROM pred_tab t WHERE t.a IS NULL OR t.c IS NULL; -- are provably false EXPLAIN (COSTS OFF) SELECT * FROM pred_tab t WHERE t.b IS NULL OR t.c IS NULL; - QUERY PLAN ----------------------------------------- + QUERY PLAN +------------------------ Seq Scan on pred_tab t - Filter: ((b IS NULL) OR (c IS NULL)) + Filter: (b IS NULL) (2 rows) -- @@ -231,6 +231,54 @@ SELECT * FROM pred_tab t1 -> Seq Scan on pred_tab t3 (9 rows) +-- +-- Tests for NullTest reduction in EXISTS sublink +-- +-- Ensure the IS_NOT_NULL qual is ignored +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NOT NULL); + QUERY PLAN +--------------------------------------------------------- + Nested Loop Left Join + Join Filter: EXISTS(SubPlan 1) + -> Seq Scan on pred_tab 
t1 + -> Materialize + -> Seq Scan on pred_tab t2 + SubPlan 1 + -> Nested Loop + -> Nested Loop + -> Nested Loop + -> Seq Scan on pred_tab t4 + -> Materialize + -> Seq Scan on pred_tab t3 + Filter: (t1.a = a) + -> Materialize + -> Seq Scan on pred_tab t5 + -> Materialize + -> Seq Scan on pred_tab t6 +(17 rows) + +-- Ensure the IS_NULL qual is reduced to constant-FALSE +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NULL); + QUERY PLAN +------------------------------------- + Nested Loop Left Join + Join Filter: (InitPlan 1).col1 + InitPlan 1 + -> Result + One-Time Filter: false + -> Seq Scan on pred_tab t1 + -> Materialize + -> Seq Scan on pred_tab t2 +(8 rows) + DROP TABLE pred_tab; -- Validate we handle IS NULL and IS NOT NULL quals correctly with inheritance -- parents. diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out index 3a2eacd793f..1ec3fa34a2d 100644 --- a/src/test/regress/expected/publication.out +++ b/src/test/regress/expected/publication.out @@ -1934,3 +1934,24 @@ RESET client_min_messages; RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; +-- stage objects for pg_dump tests +CREATE SCHEMA pubme CREATE TABLE t0 (c int, d int) CREATE TABLE t1 (c int); +CREATE SCHEMA pubme2 CREATE TABLE t0 (c int, d int); +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION dump_pub_qual_1ct FOR + TABLE ONLY pubme.t0 (c, d) WHERE (c > 0); +CREATE PUBLICATION dump_pub_qual_2ct FOR + TABLE ONLY pubme.t0 (c) WHERE (c > 0), + TABLE ONLY pubme.t1 (c); +CREATE PUBLICATION dump_pub_nsp_1ct FOR + TABLES IN SCHEMA pubme; +CREATE PUBLICATION dump_pub_nsp_2ct FOR + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2; +CREATE PUBLICATION dump_pub_all FOR + TABLE ONLY pubme.t0, + TABLE ONLY pubme.t1 WHERE (c < 0), + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2 + WITH (publish_via_partition_root = true); +RESET client_min_messages; diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 788844abd20..1bfd33de3f3 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -236,6 +236,12 @@ SELECT E'De\\678dBeEf'::bytea; ERROR: invalid input syntax for type bytea LINE 1: SELECT E'De\\678dBeEf'::bytea; ^ +SELECT E'DeAd\\\\BeEf'::bytea; + bytea +---------------------- + \x446541645c42654566 +(1 row) + SELECT reverse(''::bytea); reverse --------- @@ -291,6 +297,12 @@ SELECT E'De\\123dBeEf'::bytea; DeSdBeEf (1 row) +SELECT E'DeAd\\\\BeEf'::bytea; + bytea +------------ + DeAd\\BeEf +(1 row) + -- Test non-error-throwing API too SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea'); pg_input_is_valid diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index 529b2241731..a98c97f7616 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -116,18 +116,18 @@ CREATE SUBSCRIPTION regress_testsub4 CONNECTION 'dbname=regress_doesnotexist' PU WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. 
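The subscription.out hunks that follow all stem from one change: \dRs+ now prints a "Retain dead tuples" column, so every expected listing is rewrapped. The option itself is exercised near the end of this file; as a hedged illustration (subscription, publication, and connection-string names below are placeholders), it is set like any other boolean subscription parameter:

    -- Placeholder names throughout; connect = false avoids contacting
    -- a publisher, the same trick the regression tests use, and
    -- slot_name = NONE lets the disconnected subscription be dropped.
    CREATE SUBSCRIPTION demo_sub
        CONNECTION 'dbname=demo_publisher'
        PUBLICATION demo_pub
        WITH (connect = false, retain_dead_tuples = false);
    ALTER SUBSCRIPTION demo_sub SET (slot_name = NONE);
    DROP SUBSCRIPTION demo_sub;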
\dRs+ regress_testsub4 - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN -------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | none | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | none | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub4 SET (origin = any); \dRs+ regress_testsub4 - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN -------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub3; @@ -145,10 +145,10 @@ ALTER SUBSCRIPTION regress_testsub CONNECTION 'foobar'; ERROR: invalid connection string syntax: missing "=" after "foobar" in connection info string \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET PUBLICATION testpub2, testpub3 WITH (refresh = false); @@ -157,10 +157,10 @@ ALTER SUBSCRIPTION regress_testsub SET (slot_name = 'newname'); ALTER SUBSCRIPTION regress_testsub SET (password_required = false); ALTER SUBSCRIPTION regress_testsub SET (run_as_owner = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | f | t | f | off | dbname=regress_doesnotexist2 | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | f | t | f | f | off | dbname=regress_doesnotexist2 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (password_required = true); @@ -176,10 +176,10 @@ ERROR: unrecognized subscription parameter: "create_slot" -- ok ALTER SUBSCRIPTION regress_testsub SKIP (lsn = '0/12345'); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist2 | 0/00012345 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist2 | 0/00012345 (1 row) -- ok - with lsn = NONE @@ -188,10 +188,10 @@ ALTER SUBSCRIPTION regress_testsub SKIP (lsn = NONE); ALTER SUBSCRIPTION regress_testsub SKIP (lsn = '0/0'); ERROR: invalid WAL location (LSN): 0/0 \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist2 | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist2 | 0/00000000 (1 row) BEGIN; @@ -223,10 +223,10 @@ ALTER SUBSCRIPTION regress_testsub_foo SET (synchronous_commit = foobar); ERROR: invalid value for parameter "synchronous_commit": "foobar" HINT: Available values: local, remote_write, remote_apply, on, off. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ----------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+------------ - regress_testsub_foo | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | local | dbname=regress_doesnotexist2 | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +---------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub_foo | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | local | dbname=regress_doesnotexist2 | 0/00000000 (1 row) -- rename back to keep the rest simple @@ -255,19 +255,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | t | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | t | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (binary = false); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub; @@ -279,27 +279,27 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | on | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (streaming = parallel); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (streaming = false); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) -- fail - publication already exists @@ -314,10 +314,10 @@ ALTER SUBSCRIPTION regress_testsub ADD PUBLICATION testpub1, testpub2 WITH (refr ALTER SUBSCRIPTION regress_testsub ADD PUBLICATION testpub1, testpub2 WITH (refresh = false); ERROR: publication "testpub1" is already in subscription "regress_testsub" \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-----------------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub,testpub1,testpub2} | f | off | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-----------------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub,testpub1,testpub2} | f | off | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) -- fail - publication used more than once @@ -332,10 +332,10 @@ ERROR: publication "testpub3" is not in subscription "regress_testsub" -- ok - delete publications ALTER SUBSCRIPTION regress_testsub DROP PUBLICATION testpub1, testpub2 WITH (refresh = false); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub; @@ -371,19 +371,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | p | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | p | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) -- we can alter streaming when two_phase enabled ALTER SUBSCRIPTION regress_testsub SET (streaming = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -393,10 +393,10 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -409,18 +409,34 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (disable_on_error = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+------------ - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | t | any | t | f | f | off | dbname=regress_doesnotexist | 0/00000000 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | t | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 +(1 row) + +ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); +DROP SUBSCRIPTION regress_testsub; +-- fail - retain_dead_tuples must be boolean +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = foo); +ERROR: retain_dead_tuples requires a Boolean value +-- ok +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = false); +WARNING: subscription was created, but is not connected +HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. +\dRs+ + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index a424be2a6bf..fbffc67ae60 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -123,7 +123,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. 
# ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression compression_lz4 memoize stats predicate numa # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/compression.sql b/src/test/regress/sql/compression.sql index 490595fcfb2..ce5ea37a660 100644 --- a/src/test/regress/sql/compression.sql +++ b/src/test/regress/sql/compression.sql @@ -1,3 +1,8 @@ +-- Default set of tests for TOAST compression, independent of the compression +-- methods supported by the build. + +CREATE SCHEMA pglz; +SET search_path TO pglz, public; \set HIDE_TOAST_COMPRESSION false -- ensure we get stable results regardless of installation's default @@ -8,53 +13,27 @@ CREATE TABLE cmdata(f1 text COMPRESSION pglz); CREATE INDEX idx ON cmdata(f1); INSERT INTO cmdata VALUES(repeat('1234567890', 1000)); \d+ cmdata -CREATE TABLE cmdata1(f1 TEXT COMPRESSION lz4); -INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); -\d+ cmdata1 -- verify stored compression method in the data SELECT pg_column_compression(f1) FROM cmdata; -SELECT pg_column_compression(f1) FROM cmdata1; -- decompress data slice SELECT SUBSTR(f1, 200, 5) FROM cmdata; -SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; -- copy with table creation SELECT * INTO cmmove1 FROM cmdata; \d+ cmmove1 SELECT pg_column_compression(f1) FROM cmmove1; --- copy to existing table -CREATE TABLE cmmove3(f1 text COMPRESSION pglz); -INSERT INTO cmmove3 SELECT * FROM cmdata; -INSERT INTO cmmove3 SELECT * FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove3; - --- test LIKE INCLUDING COMPRESSION -CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); -\d+ cmdata2 -DROP TABLE cmdata2; - -- try setting compression for incompressible data type CREATE TABLE cmdata2 (f1 int COMPRESSION pglz); --- update using datum from different table -CREATE TABLE cmmove2(f1 text COMPRESSION pglz); -INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); -SELECT pg_column_compression(f1) FROM cmmove2; -UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove2; - -- test externally stored compressed data CREATE OR REPLACE FUNCTION large_val() RETURNS TEXT LANGUAGE SQL AS 'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; CREATE TABLE cmdata2 (f1 text COMPRESSION pglz); INSERT INTO cmdata2 SELECT large_val() || repeat('a', 4000); SELECT pg_column_compression(f1) FROM cmdata2; -INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); -SELECT pg_column_compression(f1) FROM cmdata1; -SELECT SUBSTR(f1, 200, 5) FROM cmdata1; SELECT SUBSTR(f1, 200, 5) FROM cmdata2; DROP TABLE cmdata2; @@ -76,76 +55,31 @@ ALTER TABLE cmdata2 ALTER COLUMN f1 SET STORAGE plain; INSERT INTO cmdata2 VALUES (repeat('123456789', 800)); SELECT pg_column_compression(f1) FROM cmdata2; --- test compression with materialized view -CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; -\d+ compressmv -SELECT pg_column_compression(f1) FROM cmdata1; -SELECT pg_column_compression(x) FROM compressmv; - --- test compression with partition -CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); -CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); -CREATE TABLE cmpart2(f1 text COMPRESSION pglz); - -ALTER 
TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; -SELECT pg_column_compression(f1) FROM cmpart2; - -- test compression with inheritance -CREATE TABLE cminh() INHERITS(cmdata, cmdata1); -- error -CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata); -- error CREATE TABLE cmdata3(f1 text); CREATE TABLE cminh() INHERITS (cmdata, cmdata3); -- test default_toast_compression GUC +-- suppress machine-dependent details +\set VERBOSITY terse SET default_toast_compression = ''; SET default_toast_compression = 'I do not exist compression'; -SET default_toast_compression = 'lz4'; SET default_toast_compression = 'pglz'; - --- test alter compression method -ALTER TABLE cmdata ALTER COLUMN f1 SET COMPRESSION lz4; -INSERT INTO cmdata VALUES (repeat('123456789', 4004)); -\d+ cmdata -SELECT pg_column_compression(f1) FROM cmdata; +\set VERBOSITY default ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION default; \d+ cmdata2 --- test alter compression method for materialized views -ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; -\d+ compressmv - --- test alter compression method for partitioned tables -ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; -ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; - --- new data should be compressed with the current compression method -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; -SELECT pg_column_compression(f1) FROM cmpart2; +DROP TABLE cmdata2; -- VACUUM FULL does not recompress SELECT pg_column_compression(f1) FROM cmdata; VACUUM FULL cmdata; SELECT pg_column_compression(f1) FROM cmdata; --- test expression index -DROP TABLE cmdata2; -CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); -CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); -INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM -generate_series(1, 50) g), VERSION()); - -- check data is ok SELECT length(f1) FROM cmdata; -SELECT length(f1) FROM cmdata1; SELECT length(f1) FROM cmmove1; -SELECT length(f1) FROM cmmove2; -SELECT length(f1) FROM cmmove3; CREATE TABLE badcompresstbl (a text COMPRESSION I_Do_Not_Exist_Compression); -- fails CREATE TABLE badcompresstbl (a text); diff --git a/src/test/regress/sql/compression_lz4.sql b/src/test/regress/sql/compression_lz4.sql new file mode 100644 index 00000000000..3849f8618de --- /dev/null +++ b/src/test/regress/sql/compression_lz4.sql @@ -0,0 +1,129 @@ +-- Tests for TOAST compression with lz4 + +SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE + name = 'default_toast_compression' \gset +\if :skip_test + \echo '*** skipping TOAST tests with lz4 (not supported) ***' + \quit +\endif + +CREATE SCHEMA lz4; +SET search_path TO lz4, public; + +\set HIDE_TOAST_COMPRESSION false + +-- Ensure we get stable results regardless of the installation's default. +-- We rely on this GUC value for a few tests. 
+SET default_toast_compression = 'pglz'; + +-- test creating table with compression method +CREATE TABLE cmdata_pglz(f1 text COMPRESSION pglz); +CREATE INDEX idx ON cmdata_pglz(f1); +INSERT INTO cmdata_pglz VALUES(repeat('1234567890', 1000)); +\d+ cmdata_pglz +CREATE TABLE cmdata_lz4(f1 TEXT COMPRESSION lz4); +INSERT INTO cmdata_lz4 VALUES(repeat('1234567890', 1004)); +\d+ cmdata_lz4 + +-- verify stored compression method in the data +SELECT pg_column_compression(f1) FROM cmdata_lz4; + +-- decompress data slice +SELECT SUBSTR(f1, 200, 5) FROM cmdata_pglz; +SELECT SUBSTR(f1, 2000, 50) FROM cmdata_lz4; + +-- copy with table creation +SELECT * INTO cmmove1 FROM cmdata_lz4; +\d+ cmmove1 +SELECT pg_column_compression(f1) FROM cmmove1; + +-- test LIKE INCLUDING COMPRESSION. The GUC default_toast_compression +-- has no effect; the compression method is taken from the table being copied. +CREATE TABLE cmdata2 (LIKE cmdata_lz4 INCLUDING COMPRESSION); +\d+ cmdata2 +DROP TABLE cmdata2; + +-- copy to existing table +CREATE TABLE cmmove3(f1 text COMPRESSION pglz); +INSERT INTO cmmove3 SELECT * FROM cmdata_pglz; +INSERT INTO cmmove3 SELECT * FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove3; + +-- update using a datum from a different table with LZ4 data. +CREATE TABLE cmmove2(f1 text COMPRESSION pglz); +INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); +SELECT pg_column_compression(f1) FROM cmmove2; +UPDATE cmmove2 SET f1 = cmdata_lz4.f1 FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove2; + +-- test externally stored compressed data +CREATE OR REPLACE FUNCTION large_val_lz4() RETURNS TEXT LANGUAGE SQL AS +'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; +CREATE TABLE cmdata2 (f1 text COMPRESSION lz4); +INSERT INTO cmdata2 SELECT large_val_lz4() || repeat('a', 4000); +SELECT pg_column_compression(f1) FROM cmdata2; +SELECT SUBSTR(f1, 200, 5) FROM cmdata2; +DROP TABLE cmdata2; +DROP FUNCTION large_val_lz4; + +-- test compression with materialized view +CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata_lz4; +\d+ compressmv +SELECT pg_column_compression(f1) FROM cmdata_lz4; +SELECT pg_column_compression(x) FROM compressmv; + +-- test compression with partition +CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); +CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); +CREATE TABLE cmpart2(f1 text COMPRESSION pglz); + +ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; +SELECT pg_column_compression(f1) FROM cmpart2; + +-- test compression with inheritance +CREATE TABLE cminh() INHERITS(cmdata_pglz, cmdata_lz4); -- error +CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata_pglz); -- error +CREATE TABLE cmdata3(f1 text); +CREATE TABLE cminh() INHERITS (cmdata_pglz, cmdata3); + +-- test default_toast_compression GUC +SET default_toast_compression = 'lz4'; + +-- test alter compression method +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION lz4; +INSERT INTO cmdata_pglz VALUES (repeat('123456789', 4004)); +\d+ cmdata_pglz +SELECT pg_column_compression(f1) FROM cmdata_pglz; +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION pglz; + +-- test alter compression method for materialized views +ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; +\d+ compressmv + +-- test alter compression method 
for partitioned tables +ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; +ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; + +-- new data should be compressed with the current compression method +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; +SELECT pg_column_compression(f1) FROM cmpart2; + +-- test expression index +CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); +CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); +INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM +generate_series(1, 50) g), VERSION()); + +-- check data is ok +SELECT length(f1) FROM cmdata_pglz; +SELECT length(f1) FROM cmdata_lz4; +SELECT length(f1) FROM cmmove1; +SELECT length(f1) FROM cmmove2; +SELECT length(f1) FROM cmmove3; + +\set HIDE_TOAST_COMPRESSION true diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index 7487723ab84..1f6dc8fd69f 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -1043,3 +1043,9 @@ DROP DOMAIN constraint_comments_dom; DROP ROLE regress_constraint_comments; DROP ROLE regress_constraint_comments_noaccess; + +-- Leave some constraints for the pg_upgrade test to pick up +CREATE DOMAIN constraint_comments_dom AS int; + +ALTER DOMAIN constraint_comments_dom ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID; +COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom IS 'comment on invalid constraint'; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index cfcecb4e911..39174ad1eb9 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1296,7 +1296,7 @@ UPDATE fk_notpartitioned_pk SET b = 2504 WHERE a = 2500; -- check psql behavior \d fk_notpartitioned_pk --- Check the exsting FK trigger +-- Check the existing FK trigger SELECT conname, tgrelid::regclass as tgrel, regexp_replace(tgname, '[0-9]+', 'N') as tgname, tgtype FROM pg_trigger t JOIN pg_constraint c ON (t.tgconstraint = c.oid) WHERE tgrelid IN (SELECT relid FROM pg_partition_tree('fk_partitioned_fk'::regclass) diff --git a/src/test/regress/sql/generated_virtual.sql b/src/test/regress/sql/generated_virtual.sql index e2b31853e01..ba19bc4c701 100644 --- a/src/test/regress/sql/generated_virtual.sql +++ b/src/test/regress/sql/generated_virtual.sql @@ -858,4 +858,13 @@ select * from gtest32 t group by grouping sets (a, b, c, d, e) having c = 20; -- Ensure that the virtual generated columns in ALTER COLUMN TYPE USING expression are expanded alter table gtest32 alter column e type bigint using b; +-- Ensure that virtual generated column references within SubLinks that should +-- be transformed into joins can get expanded +explain (costs off) +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + drop table gtest32; diff --git a/src/test/regress/sql/predicate.sql b/src/test/regress/sql/predicate.sql index 9dcb81b1bc5..d92277353a0 100644 --- a/src/test/regress/sql/predicate.sql +++ b/src/test/regress/sql/predicate.sql @@ -115,6 +115,24 @@ SELECT * FROM pred_tab t1 LEFT JOIN pred_tab t2 ON t1.a = 1 LEFT JOIN pred_tab t3 ON t2.a IS NULL OR t2.c IS NULL; +-- +-- Tests for NullTest reduction in EXISTS sublink +-- + +-- Ensure the IS_NOT_NULL qual is ignored +EXPLAIN 
(COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NOT NULL); + +-- Ensure the IS_NULL qual is reduced to constant-FALSE +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NULL); + DROP TABLE pred_tab; -- Validate we handle IS NULL and IS NOT NULL quals correctly with inheritance diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql index c9e309190df..2585f083181 100644 --- a/src/test/regress/sql/publication.sql +++ b/src/test/regress/sql/publication.sql @@ -1229,3 +1229,25 @@ RESET client_min_messages; RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; + +-- stage objects for pg_dump tests +CREATE SCHEMA pubme CREATE TABLE t0 (c int, d int) CREATE TABLE t1 (c int); +CREATE SCHEMA pubme2 CREATE TABLE t0 (c int, d int); +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION dump_pub_qual_1ct FOR + TABLE ONLY pubme.t0 (c, d) WHERE (c > 0); +CREATE PUBLICATION dump_pub_qual_2ct FOR + TABLE ONLY pubme.t0 (c) WHERE (c > 0), + TABLE ONLY pubme.t1 (c); +CREATE PUBLICATION dump_pub_nsp_1ct FOR + TABLES IN SCHEMA pubme; +CREATE PUBLICATION dump_pub_nsp_2ct FOR + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2; +CREATE PUBLICATION dump_pub_all FOR + TABLE ONLY pubme.t0, + TABLE ONLY pubme.t1 WHERE (c < 0), + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2 + WITH (publish_via_partition_root = true); +RESET client_min_messages; diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 2577a42987d..92c445c2439 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -76,6 +76,7 @@ SELECT E'De\\000dBeEf'::bytea; SELECT E'De\123dBeEf'::bytea; SELECT E'De\\123dBeEf'::bytea; SELECT E'De\\678dBeEf'::bytea; +SELECT E'DeAd\\\\BeEf'::bytea; SELECT reverse(''::bytea); SELECT reverse('\xaa'::bytea); @@ -88,6 +89,7 @@ SELECT E'\\xDe00BeEf'::bytea; SELECT E'DeAdBeEf'::bytea; SELECT E'De\\000dBeEf'::bytea; SELECT E'De\\123dBeEf'::bytea; +SELECT E'DeAd\\\\BeEf'::bytea; -- Test non-error-throwing API too SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea'); diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql index 007c9e70374..f0f714fe747 100644 --- a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -287,6 +287,17 @@ ALTER SUBSCRIPTION regress_testsub SET (disable_on_error = true); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); DROP SUBSCRIPTION regress_testsub; +-- fail - retain_dead_tuples must be boolean +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = foo); + +-- ok +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = false); + +\dRs+ + +ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); +DROP SUBSCRIPTION regress_testsub; + -- let's do some tests with pg_create_subscription rather than superuser SET SESSION AUTHORIZATION regress_subscription_user3; diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index d78a6bac16a..976d53a870e 100644 --- 
a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -1,6 +1,6 @@ # Copyright (c) 2025, PostgreSQL Global Development Group -# Test the conflict detection of conflict type 'multiple_unique_conflicts'. +# Test conflicts in logical replication use strict; use warnings FATAL => 'all'; use PostgreSQL::Test::Cluster; @@ -18,7 +18,7 @@ $node_publisher->start; # Create a subscriber node my $node_subscriber = PostgreSQL::Test::Cluster->new('subscriber'); -$node_subscriber->init; +$node_subscriber->init(allows_streaming => 'logical'); $node_subscriber->start; # Create a table on publisher @@ -146,4 +146,205 @@ $node_subscriber->wait_for_log( pass('multiple_unique_conflicts detected on a leaf partition during insert'); +############################################################################### +# Set up bidirectional logical replication between node_A & node_B +############################################################################### + +# Initialize nodes. + +# node_A. Increase the log_min_messages setting to DEBUG2 to debug test +# failures. Disable autovacuum to avoid generating xids that could affect the +# replication slot's xmin value. +my $node_A = $node_publisher; +$node_A->append_conf( + 'postgresql.conf', + qq{autovacuum = off + log_min_messages = 'debug2'}); +$node_A->restart; + +# node_B +my $node_B = $node_subscriber; +$node_B->append_conf('postgresql.conf', "track_commit_timestamp = on"); +$node_B->restart; + +# Create table on node_A +$node_A->safe_psql('postgres', "CREATE TABLE tab (a int PRIMARY KEY, b int)"); + +# Create the same table on node_B +$node_B->safe_psql('postgres', "CREATE TABLE tab (a int PRIMARY KEY, b int)"); + +my $subname_AB = 'tap_sub_a_b'; +my $subname_BA = 'tap_sub_b_a'; + +# Set up logical replication +# node_A (pub) -> node_B (sub) +my $node_A_connstr = $node_A->connstr . ' dbname=postgres'; +$node_A->safe_psql('postgres', "CREATE PUBLICATION tap_pub_A FOR TABLE tab"); +$node_B->safe_psql( + 'postgres', " + CREATE SUBSCRIPTION $subname_BA + CONNECTION '$node_A_connstr application_name=$subname_BA' + PUBLICATION tap_pub_A + WITH (origin = none, retain_dead_tuples = true)"); + +# node_B (pub) -> node_A (sub) +my $node_B_connstr = $node_B->connstr . ' dbname=postgres'; +$node_B->safe_psql('postgres', "CREATE PUBLICATION tap_pub_B FOR TABLE tab"); +$node_A->safe_psql( + 'postgres', " + CREATE SUBSCRIPTION $subname_AB + CONNECTION '$node_B_connstr application_name=$subname_AB' + PUBLICATION tap_pub_B + WITH (origin = none, copy_data = off)"); + +# Wait for initial table sync to finish +$node_A->wait_for_subscription_sync($node_B, $subname_AB); +$node_B->wait_for_subscription_sync($node_A, $subname_BA); + +is(1, 1, 'Bidirectional replication setup is complete'); + +# Confirm that the conflict detection slot is created on Node B and the xmin +# value is valid. +ok( $node_B->poll_query_until( + 'postgres', + "SELECT xmin IS NOT NULL from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is valid on Node B"); + +################################################## +# Check that the retain_dead_tuples option can be enabled only for disabled +# subscriptions. Validate the NOTICE message during the subscription DDL, and +# ensure the conflict detection slot is created upon enabling the +# retain_dead_tuples option. 
+################################################## + +# Alter retain_dead_tuples for enabled subscription +my ($cmdret, $stdout, $stderr) = $node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (retain_dead_tuples = true)"); +ok( $stderr =~ + /ERROR: cannot set option \"retain_dead_tuples\" for enabled subscription/, + "altering retain_dead_tuples is not allowed for enabled subscription"); + +# Disable the subscription +$node_A->psql('postgres', "ALTER SUBSCRIPTION $subname_AB DISABLE;"); + +# Wait for the apply worker to stop +$node_A->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" +); + +# Enable retain_dead_tuples for disabled subscription +($cmdret, $stdout, $stderr) = $node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (retain_dead_tuples = true);"); +ok( $stderr =~ + /NOTICE: deleted rows to detect conflicts would not be removed until the subscription is enabled/, + "altering retain_dead_tuples is allowed for disabled subscription"); + +# Re-enable the subscription +$node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB ENABLE;"); + +# Confirm that the conflict detection slot is created on Node A and the xmin +# value is valid. +ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin IS NOT NULL from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is valid on Node A"); + +################################################## +# Check the WARNING when changing the origin to ANY, if retain_dead_tuples is +# enabled. This warns of the possibility of receiving changes from origins +# other than the publisher. +################################################## + +($cmdret, $stdout, $stderr) = $node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (origin = any);"); +ok( $stderr =~ + /WARNING: subscription "tap_sub_a_b" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins/, + "warn of the possibility of receiving changes from origins other than the publisher"); + +# Reset the origin to none +$node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (origin = none);"); + +############################################################################### +# Check that dead tuples on node A cannot be cleaned by VACUUM until the +# concurrent transactions on Node B have been applied and flushed on Node A. 
+############################################################################### + +# Insert two records +$node_A->safe_psql('postgres', "INSERT INTO tab VALUES (1, 1), (2, 2);"); +$node_A->wait_for_catchup($subname_BA); + +my $result = $node_B->safe_psql('postgres', "SELECT * FROM tab;"); +is($result, qq(1|1 +2|2), 'check replicated insert on node B'); + +# Disable the logical replication from node B to node A +$node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB DISABLE"); + +# Wait for the apply worker to stop +$node_A->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" +); + +$node_B->safe_psql('postgres', "UPDATE tab SET b = 3 WHERE a = 1;"); +$node_A->safe_psql('postgres', "DELETE FROM tab WHERE a = 1;"); + +($cmdret, $stdout, $stderr) = $node_A->psql( + 'postgres', qq(VACUUM (verbose) public.tab;) +); + +ok( $stderr =~ + qr/1 are dead but not yet removable/, + 'the deleted row is non-removable'); + +$node_A->safe_psql( + 'postgres', "ALTER SUBSCRIPTION $subname_AB ENABLE;"); +$node_B->wait_for_catchup($subname_AB); + +# Remember the next transaction ID to be assigned +my $next_xid = $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + +# Confirm that the xmin value is advanced to the latest nextXid. If no +# transactions are running, the apply worker selects nextXid as the candidate +# for the non-removable xid. See GetOldestActiveTransactionId(). +ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is updated on Node A"); + +# Confirm that the dead tuple can be removed now +($cmdret, $stdout, $stderr) = $node_A->psql( + 'postgres', qq(VACUUM (verbose) public.tab;) +); + +ok( $stderr =~ + qr/1 removed, 1 remain, 0 are dead but not yet removable/, + 'the deleted row is removed'); + +############################################################################### +# Check that the replication slot pg_conflict_detection is dropped after +# removing all the subscriptions. 
+############################################################################### + +$node_B->safe_psql( + 'postgres', "DROP SUBSCRIPTION $subname_BA"); + +ok( $node_B->poll_query_until( + 'postgres', + "SELECT count(*) = 0 FROM pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the slot 'pg_conflict_detection' has been dropped on Node B"); + +$node_A->safe_psql( + 'postgres', "DROP SUBSCRIPTION $subname_AB"); + +ok( $node_A->poll_query_until( + 'postgres', + "SELECT count(*) = 0 FROM pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the slot 'pg_conflict_detection' has been dropped on Node A"); + done_testing(); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ff050e93a50..e6f2e93b2d6 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1759,6 +1759,7 @@ NonEmptyRange Notification NotificationList NotifyStmt +NotnullHashEntry Nsrt NtDllRoutine NtFlushBuffersFileEx_t @@ -2274,6 +2275,7 @@ PlanInvalItem PlanRowMark PlanState PlannedStmt +PlannedStmtOrigin PlannerGlobal PlannerInfo PlannerParamItem @@ -2565,6 +2567,8 @@ RestrictInfo Result ResultRelInfo ResultState +RetainDeadTuplesData +RetainDeadTuplesPhase ReturnSetInfo ReturnStmt ReturningClause @@ -3753,6 +3757,7 @@ leafSegmentInfo leaf_item libpq_gettext_func libpq_source +libpqsrv_PGresult line_t lineno_t list_sort_comparator diff --git a/src/tools/valgrind.supp b/src/tools/valgrind.supp index 2ad5b81526d..3880007dfb3 100644 --- a/src/tools/valgrind.supp +++ b/src/tools/valgrind.supp @@ -194,3 +194,36 @@ Memcheck:Addr8 fun:pg_numa_touch_mem_if_required } + + +# Memory-leak suppressions +# Note that a suppression rule will silence complaints about memory blocks +# allocated in matching places, but it won't prevent "indirectly lost" +# complaints about blocks that are only reachable via the suppressed blocks. + +# Suppress complaints about stuff leaked during function cache loading. +# Both the PL/pgSQL and SQL-function parsing processes generate some cruft +# within the function's cache context, which doesn't seem worth the trouble +# to get rid of. Moreover, there are cases where CachedFunction structs +# are intentionally leaked because we're unsure if any fn_extra pointers +# remain. +{ + hide_function_cache_leaks + Memcheck:Leak + match-leak-kinds: definite,possible,indirect + + ... + fun:cached_function_compile +} + +# Suppress complaints about stuff leaked during TS dictionary loading. +# Not very much is typically lost there, and preventing it would +# require a risky API change for TS tmplinit functions. +{ + hide_ts_dictionary_leaks + Memcheck:Leak + match-leak-kinds: definite,possible,indirect + + ... + fun:lookup_ts_dictionary_cache +} |
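The subscription-related hunks above collectively pin down the user-visible contract of the new retain_dead_tuples option. A minimal SQL sketch of that contract, distilled from the regression and TAP tests in this patch (the subscription, publication, and connection names here are illustrative, not taken from the tests):

-- The option is a Boolean and may be set at CREATE SUBSCRIPTION time;
-- a non-Boolean value fails with "retain_dead_tuples requires a Boolean value".
CREATE SUBSCRIPTION mysub CONNECTION 'dbname=example' PUBLICATION mypub
    WITH (origin = none, retain_dead_tuples = true);

-- Toggling the option afterwards is allowed only while the subscription is
-- disabled: on an enabled subscription the ALTER raises an ERROR, while on a
-- disabled one it succeeds with a NOTICE that deleted rows are not removed
-- until the subscription is re-enabled.
ALTER SUBSCRIPTION mysub DISABLE;
ALTER SUBSCRIPTION mysub SET (retain_dead_tuples = true);
ALTER SUBSCRIPTION mysub ENABLE;

-- While at least one subscription retains dead tuples, a conflict-detection
-- replication slot with a valid xmin holds back VACUUM from removing them;
-- the slot is dropped once the last such subscription goes away.
SELECT slot_name, xmin FROM pg_replication_slots
WHERE slot_name = 'pg_conflict_detection';

As the additions to 035_conflicts.pl verify, the slot's xmin advances only after concurrent transactions from the remote node have been applied and flushed locally, at which point VACUUM can reclaim the retained dead tuples.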