Diffstat (limited to 'src')
663 files changed, 19859 insertions, 12434 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 04952b533de..8b1b357beaa 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -254,7 +254,7 @@ CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 PG_SYSROOT = @PG_SYSROOT@
 
-override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS)
+override CPPFLAGS += $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS)
 
 ifdef PGXS
 override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS)
diff --git a/src/Makefile.shlib b/src/Makefile.shlib
index fa81f6ffdd6..3825af5b228 100644
--- a/src/Makefile.shlib
+++ b/src/Makefile.shlib
@@ -112,7 +112,7 @@ ifeq ($(PORTNAME), darwin)
   ifneq ($(SO_MAJOR_VERSION), 0)
     version_link = -compatibility_version $(SO_MAJOR_VERSION) -current_version $(SO_MAJOR_VERSION).$(SO_MINOR_VERSION)
   endif
-  LINK.shared = $(COMPILER) -dynamiclib -install_name '$(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)' $(version_link) $(exported_symbols_list)
+  LINK.shared = $(COMPILER) -dynamiclib -install_name '$(libdir)/lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)' $(version_link)
   shlib = lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)
   shlib_major = lib$(NAME).$(SO_MAJOR_VERSION)$(DLSUFFIX)
 else
@@ -122,7 +122,7 @@ ifeq ($(PORTNAME), darwin)
   BUILD.exports = $(AWK) '/^[^\#]/ {printf "_%s\n",$$1}' $< >$@
   exports_file = $(SHLIB_EXPORTS:%.txt=%.list)
   ifneq (,$(exports_file))
-    exported_symbols_list = -exported_symbols_list $(exports_file)
+    LINK.shared += -exported_symbols_list $(exports_file)
   endif
 endif
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 01e1db7f856..4204088fa0d 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -68,7 +68,7 @@ typedef struct BrinShared
     int         scantuplesortstates;
 
     /* Query ID, for report in worker processes */
-    uint64      queryid;
+    int64       queryid;
 
     /*
     * workersdonecv is used to monitor the progress of workers.  All parallel
diff --git a/src/backend/access/brin/brin_minmax_multi.c b/src/backend/access/brin/brin_minmax_multi.c
index 0d1507a2a36..a5a414182ca 100644
--- a/src/backend/access/brin/brin_minmax_multi.c
+++ b/src/backend/access/brin/brin_minmax_multi.c
@@ -624,7 +624,7 @@ brin_range_serialize(Ranges *range)
             for (i = 0; i < nvalues; i++)
             {
-                len += VARSIZE_ANY(range->values[i]);
+                len += VARSIZE_ANY(DatumGetPointer(range->values[i]));
             }
         }
         else if (typlen == -2)  /* cstring */
@@ -2032,7 +2032,7 @@ brin_minmax_multi_distance_numeric(PG_FUNCTION_ARGS)
 
     d = DirectFunctionCall2(numeric_sub, a2, a1);   /* a2 - a1 */
 
-    PG_RETURN_FLOAT8(DirectFunctionCall1(numeric_float8, d));
+    PG_RETURN_DATUM(DirectFunctionCall1(numeric_float8, d));
 }
 
 /*
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index 969d1028cae..a410b5eb99b 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -189,7 +189,7 @@ getmissingattr(TupleDesc tupleDesc,
         if (att->attlen > 0)
             key.len = att->attlen;
         else
-            key.len = VARSIZE_ANY(attrmiss->am_value);
+            key.len = VARSIZE_ANY(DatumGetPointer(attrmiss->am_value));
         key.value = attrmiss->am_value;
 
         entry = hash_search(missing_cache, &key, HASH_ENTER, &found);
@@ -901,9 +901,9 @@ expand_tuple(HeapTuple *targetHeapTuple,
                                       att->attlen,
                                       attrmiss[attnum].am_value);
 
-                targetDataLen = att_addlength_pointer(targetDataLen,
-                                                      att->attlen,
-                                                      attrmiss[attnum].am_value);
+                targetDataLen = att_addlength_datum(targetDataLen,
+                                                    att->attlen,
+                                                    attrmiss[attnum].am_value);
             }
             else
             {
diff --git a/src/backend/access/common/printsimple.c b/src/backend/access/common/printsimple.c
index f346ab3e812..a09c8fcd332 100644
--- a/src/backend/access/common/printsimple.c
+++ b/src/backend/access/common/printsimple.c
@@ -123,7 +123,7 @@ printsimple(TupleTableSlot *slot, DestReceiver *self)
 
             case OIDOID:
                 {
-                    Oid         num = ObjectIdGetDatum(value);
+                    Oid         num = DatumGetObjectId(value);
                     char        str[10];    /* 10 digits */
                     int         len;
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index 830a3d883aa..6d3045e2332 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -350,7 +350,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
         */
        if (thisState->typisvarlena)
            VALGRIND_CHECK_MEM_IS_DEFINED(DatumGetPointer(attr),
-                                         VARSIZE_ANY(attr));
+                                         VARSIZE_ANY(DatumGetPointer(attr)));
 
        if (thisState->format == 0)
        {
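Several hunks above (brin_minmax_multi.c, heaptuple.c, printtup.c) replace direct use of varlena macros on Datum values with an explicit DatumGetPointer() unwrap. A minimal sketch of the rule being enforced, assuming only core PostgreSQL headers; the helper name varlena_datum_size is illustrative, not from the patch:

    #include "postgres.h"
    #include "varatt.h"

    /* Datum is an opaque pass-by-value integer type.  VARSIZE_ANY() wants a
     * pointer to a varlena, so the Datum must be unwrapped explicitly rather
     * than relying on an implicit integer-to-pointer conversion. */
    static Size
    varlena_datum_size(Datum d)
    {
        struct varlena *vlena = (struct varlena *) DatumGetPointer(d);

        return VARSIZE_ANY(vlena);
    }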
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 46c1dce222d..0af3fea68fa 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -1164,7 +1164,7 @@ add_local_string_reloption(local_relopts *relopts, const char *name,
 * but we declare them as Datums to avoid including array.h in reloptions.h.
 */
 Datum
-transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
+transformRelOptions(Datum oldOptions, List *defList, const char *nameSpace,
                    const char *const validnsps[], bool acceptOidsOff, bool isReset)
 {
     Datum       result;
@@ -1190,8 +1190,8 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
 
     for (i = 0; i < noldoptions; i++)
     {
-        char       *text_str = VARDATA(oldoptions[i]);
-        int         text_len = VARSIZE(oldoptions[i]) - VARHDRSZ;
+        char       *text_str = VARDATA(DatumGetPointer(oldoptions[i]));
+        int         text_len = VARSIZE(DatumGetPointer(oldoptions[i])) - VARHDRSZ;
 
         /* Search for a match in defList */
         foreach(cell, defList)
@@ -1200,14 +1200,14 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
             int         kw_len;
 
             /* ignore if not in the same namespace */
-            if (namspace == NULL)
+            if (nameSpace == NULL)
             {
                 if (def->defnamespace != NULL)
                     continue;
             }
             else if (def->defnamespace == NULL)
                 continue;
-            else if (strcmp(def->defnamespace, namspace) != 0)
+            else if (strcmp(def->defnamespace, nameSpace) != 0)
                 continue;
 
             kw_len = strlen(def->defname);
@@ -1243,8 +1243,9 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
         }
         else
         {
-            text       *t;
+            const char *name;
             const char *value;
+            text       *t;
             Size        len;
 
             /*
@@ -1276,14 +1277,14 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
             }
 
             /* ignore if not in the same namespace */
-            if (namspace == NULL)
+            if (nameSpace == NULL)
             {
                 if (def->defnamespace != NULL)
                     continue;
             }
             else if (def->defnamespace == NULL)
                 continue;
-            else if (strcmp(def->defnamespace, namspace) != 0)
+            else if (strcmp(def->defnamespace, nameSpace) != 0)
                 continue;
 
             /*
@@ -1291,11 +1292,19 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
             * have just "name", assume "name=true" is meant.  Note: the
             * namespace is not output.
             */
+            name = def->defname;
             if (def->arg != NULL)
                 value = defGetString(def);
             else
                 value = "true";
 
+            /* Insist that name not contain "=", else "a=b=c" is ambiguous */
+            if (strchr(name, '=') != NULL)
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                         errmsg("invalid option name \"%s\": must not contain \"=\"",
+                                name)));
+
             /*
             * This is not a great place for this test, but there's no other
             * convenient place to filter the option out.  As WITH (oids =
@@ -1303,7 +1312,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
             * amount of ugly.
             */
             if (acceptOidsOff && def->defnamespace == NULL &&
-                strcmp(def->defname, "oids") == 0)
+                strcmp(name, "oids") == 0)
             {
                 if (defGetBoolean(def))
                     ereport(ERROR,
@@ -1313,11 +1322,11 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
                 continue;
             }
 
-            len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value);
+            len = VARHDRSZ + strlen(name) + 1 + strlen(value);
             /* +1 leaves room for sprintf's trailing null */
             t = (text *) palloc(len + 1);
             SET_VARSIZE(t, len);
-            sprintf(VARDATA(t), "%s=%s", def->defname, value);
+            sprintf(VARDATA(t), "%s=%s", name, value);
 
             astate = accumArrayResult(astate, PointerGetDatum(t),
                                       false, TEXTOID,
@@ -1447,8 +1456,8 @@ parseRelOptionsInternal(Datum options, bool validate,
 
     for (i = 0; i < noptions; i++)
     {
-        char       *text_str = VARDATA(optiondatums[i]);
-        int         text_len = VARSIZE(optiondatums[i]) - VARHDRSZ;
+        char       *text_str = VARDATA(DatumGetPointer(optiondatums[i]));
+        int         text_len = VARSIZE(DatumGetPointer(optiondatums[i])) - VARHDRSZ;
         int         j;
 
         /* Search for a match in reloptions */
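The new strchr() check in transformRelOptions() exists because options are flattened into "name=value" text datums before storage; a name containing "=" could never be split apart again. A hypothetical helper (split_reloption is not part of the patch) showing why the first "=" must be unambiguous:

    #include "postgres.h"
    #include <string.h>

    /* Hypothetical inverse of the "%s=%s" flattening done by
     * transformRelOptions().  If option names could contain '=', a stored
     * value such as "a=b=c" would have two equally valid splits. */
    static void
    split_reloption(const char *flat, char **name, char **value)
    {
        const char *eq = strchr(flat, '=');     /* first '=' wins */

        Assert(eq != NULL);
        *name = pnstrdup(flat, eq - flat);
        *value = pstrdup(eq + 1);
    }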
diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c
index 21f2f4af97e..926f1e4008a 100644
--- a/src/backend/access/common/toast_compression.c
+++ b/src/backend/access/common/toast_compression.c
@@ -25,11 +25,11 @@
 /* GUC */
 int         default_toast_compression = TOAST_PGLZ_COMPRESSION;
 
-#define NO_LZ4_SUPPORT() \
+#define NO_COMPRESSION_SUPPORT(method) \
     ereport(ERROR, \
             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \
-             errmsg("compression method lz4 not supported"), \
-             errdetail("This functionality requires the server to be built with lz4 support.")))
+             errmsg("compression method %s not supported", method), \
+             errdetail("This functionality requires the server to be built with %s support.", method)))
 
 /*
 * Compress a varlena using PGLZ.
@@ -139,7 +139,7 @@ struct varlena *
 lz4_compress_datum(const struct varlena *value)
 {
 #ifndef USE_LZ4
-    NO_LZ4_SUPPORT();
+    NO_COMPRESSION_SUPPORT("lz4");
     return NULL;                /* keep compiler quiet */
 #else
     int32       valsize;
@@ -182,7 +182,7 @@ struct varlena *
 lz4_decompress_datum(const struct varlena *value)
 {
 #ifndef USE_LZ4
-    NO_LZ4_SUPPORT();
+    NO_COMPRESSION_SUPPORT("lz4");
     return NULL;                /* keep compiler quiet */
 #else
     int32       rawsize;
@@ -215,7 +215,7 @@ struct varlena *
 lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength)
 {
 #ifndef USE_LZ4
-    NO_LZ4_SUPPORT();
+    NO_COMPRESSION_SUPPORT("lz4");
     return NULL;                /* keep compiler quiet */
 #else
     int32       rawsize;
@@ -289,7 +289,7 @@ CompressionNameToMethod(const char *compression)
     else if (strcmp(compression, "lz4") == 0)
     {
 #ifndef USE_LZ4
-        NO_LZ4_SUPPORT();
+        NO_COMPRESSION_SUPPORT("lz4");
 #endif
         return TOAST_LZ4_COMPRESSION;
     }
diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c
index 7d8be8346ce..196e06115e9 100644
--- a/src/backend/access/common/toast_internals.c
+++ b/src/backend/access/common/toast_internals.c
@@ -144,7 +144,7 @@ toast_save_datum(Relation rel, Datum value,
     int         num_indexes;
     int         validIndex;
 
-    Assert(!VARATT_IS_EXTERNAL(value));
+    Assert(!VARATT_IS_EXTERNAL(dval));
 
     /*
     * Open the toast relation and its indexes.  We can use the index to check
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index ffd0c78f905..be60005ae46 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -142,11 +142,18 @@ void
 verify_compact_attribute(TupleDesc tupdesc, int attnum)
 {
 #ifdef USE_ASSERT_CHECKING
-    CompactAttribute *cattr = &tupdesc->compact_attrs[attnum];
+    CompactAttribute cattr;
     Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum);
     CompactAttribute tmp;
 
     /*
+     * Make a temp copy of the TupleDesc's CompactAttribute.  This may be a
+     * shared TupleDesc and the attcacheoff might get changed by another
+     * backend.
+     */
+    memcpy(&cattr, &tupdesc->compact_attrs[attnum], sizeof(CompactAttribute));
+
+    /*
     * Populate the temporary CompactAttribute from the corresponding
     * Form_pg_attribute
     */
@@ -156,11 +163,11 @@ verify_compact_attribute(TupleDesc tupdesc, int attnum)
     * Make the attcacheoff match since it's been reset to -1 by
     * populate_compact_attribute_internal.  Same with attnullability.
     */
-    tmp.attcacheoff = cattr->attcacheoff;
-    tmp.attnullability = cattr->attnullability;
+    tmp.attcacheoff = cattr.attcacheoff;
+    tmp.attnullability = cattr.attnullability;
 
     /* Check the freshly populated CompactAttribute matches the TupleDesc's */
-    Assert(memcmp(&tmp, cattr, sizeof(CompactAttribute)) == 0);
+    Assert(memcmp(&tmp, &cattr, sizeof(CompactAttribute)) == 0);
 #endif
 }
 
@@ -808,10 +815,10 @@ hashRowType(TupleDesc desc)
     uint32      s;
     int         i;
 
-    s = hash_combine(0, hash_uint32(desc->natts));
-    s = hash_combine(s, hash_uint32(desc->tdtypeid));
+    s = hash_combine(0, hash_bytes_uint32(desc->natts));
+    s = hash_combine(s, hash_bytes_uint32(desc->tdtypeid));
     for (i = 0; i < desc->natts; ++i)
-        s = hash_combine(s, hash_uint32(TupleDescAttr(desc, i)->atttypid));
+        s = hash_combine(s, hash_bytes_uint32(TupleDescAttr(desc, i)->atttypid));
 
     return s;
 }
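The hashRowType() hunk swaps hash_uint32() (which returns a Datum) for hash_bytes_uint32() (which returns a plain uint32), so values feed hash_combine() directly. A short sketch of the same combining pattern, using only common/hashfn.h; hash_type_pair is an illustrative name, not from the patch:

    #include "postgres.h"
    #include "common/hashfn.h"

    /* Combine per-field hashes into a single hash, as hashRowType() does. */
    static uint32
    hash_type_pair(Oid typid1, Oid typid2)
    {
        uint32      s;

        s = hash_combine(0, hash_bytes_uint32((uint32) typid1));
        s = hash_combine(s, hash_bytes_uint32((uint32) typid2));
        return s;
    }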
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index a65acd89104..47b1898a064 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -2233,7 +2233,7 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
     else if (typlen > 0)
         keylen = typlen;
     else if (typlen == -1)
-        keylen = VARSIZE_ANY(key);
+        keylen = VARSIZE_ANY(DatumGetPointer(key));
     else if (typlen == -2)
         keylen = strlen(DatumGetPointer(key)) + 1;
     else
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index a6b701943d3..c0aa7d0222f 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -1058,11 +1058,11 @@ gistGetFakeLSN(Relation rel)
 }
 
 /*
- * This is a stratnum support function for GiST opclasses that use the
- * RT*StrategyNumber constants.
+ * This is a stratnum translation support function for GiST opclasses that use
+ * the RT*StrategyNumber constants.
 */
 Datum
-gist_stratnum_common(PG_FUNCTION_ARGS)
+gist_translate_cmptype_common(PG_FUNCTION_ARGS)
 {
     CompareType cmptype = PG_GETARG_INT32(0);
 
@@ -1090,9 +1090,9 @@ gist_stratnum_common(PG_FUNCTION_ARGS)
 /*
 * Returns the opclass's private stratnum used for the given compare type.
 *
- * Calls the opclass's GIST_STRATNUM_PROC support function, if any,
- * and returns the result.
- * Returns InvalidStrategy if the function is not defined.
+ * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any,
+ * and returns the result.  Returns InvalidStrategy if the function is not
+ * defined.
 */
 StrategyNumber
 gisttranslatecmptype(CompareType cmptype, Oid opfamily)
@@ -1101,7 +1101,7 @@ gisttranslatecmptype(CompareType cmptype, Oid opfamily)
     Datum       result;
 
     /* Check whether the function is provided. */
-    funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_STRATNUM_PROC);
+    funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_TRANSLATE_CMPTYPE_PROC);
     if (!OidIsValid(funcid))
         return InvalidStrategy;
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
index 2a49e6d20f0..2ed6f74fce9 100644
--- a/src/backend/access/gist/gistvalidate.c
+++ b/src/backend/access/gist/gistvalidate.c
@@ -138,7 +138,7 @@ gistvalidate(Oid opclassoid)
                 ok = check_amproc_signature(procform->amproc, VOIDOID, true,
                                             1, 1, INTERNALOID);
                 break;
-            case GIST_STRATNUM_PROC:
+            case GIST_TRANSLATE_CMPTYPE_PROC:
                 ok = check_amproc_signature(procform->amproc, INT2OID, true,
                                             1, 1, INT4OID) &&
                     procform->amproclefttype == ANYOID &&
@@ -265,7 +265,7 @@ gistvalidate(Oid opclassoid)
             if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC ||
                 i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC ||
                 i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC ||
-                i == GIST_STRATNUM_PROC)
+                i == GIST_TRANSLATE_CMPTYPE_PROC)
                 continue;       /* optional methods */
             ereport(INFO,
                     (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
@@ -336,7 +336,7 @@ gistadjustmembers(Oid opfamilyoid,
         case GIST_FETCH_PROC:
         case GIST_OPTIONS_PROC:
         case GIST_SORTSUPPORT_PROC:
-        case GIST_STRATNUM_PROC:
+        case GIST_TRANSLATE_CMPTYPE_PROC:
             /* Optional, so force it to be a soft family dependency */
             op->ref_is_hard = false;
             op->ref_is_family = true;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9ec8cda1c68..0dcd6ee817e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -213,6 +213,27 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
 #define TUPLOCK_from_mxstatus(status) \
     (MultiXactStatusLock[(status)])
 
+/*
+ * Check that we have a valid snapshot if we might need TOAST access.
+ */
+static inline void
+AssertHasSnapshotForToast(Relation rel)
+{
+#ifdef USE_ASSERT_CHECKING
+
+    /* bootstrap mode in particular breaks this rule */
+    if (!IsNormalProcessingMode())
+        return;
+
+    /* if the relation doesn't have a TOAST table, we are good */
+    if (!OidIsValid(rel->rd_rel->reltoastrelid))
+        return;
+
+    Assert(HaveRegisteredOrActiveSnapshot());
+
+#endif                          /* USE_ASSERT_CHECKING */
+}
+
 /* ----------------------------------------------------------------
 *                       heap support routines
 * ----------------------------------------------------------------
@@ -2066,6 +2087,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
     Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
            RelationGetNumberOfAttributes(relation));
 
+    AssertHasSnapshotForToast(relation);
+
     /*
     * Fill in tuple header fields and toast the tuple if necessary.
     *
@@ -2343,6 +2366,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
     /* currently not needed (thus unsupported) for heap_multi_insert() */
     Assert(!(options & HEAP_INSERT_NO_LOGICAL));
 
+    AssertHasSnapshotForToast(relation);
+
     needwal = RelationNeedsWAL(relation);
     saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                    HEAP_DEFAULT_FILLFACTOR);
@@ -2765,6 +2790,8 @@ heap_delete(Relation relation, ItemPointer tid,
 
     Assert(ItemPointerIsValid(tid));
 
+    AssertHasSnapshotForToast(relation);
+
     /*
     * Forbid this during a parallel operation, lest it allocate a combo CID.
     * Other workers might need that combo CID for visibility checks, and we
@@ -3260,6 +3287,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
     Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
            RelationGetNumberOfAttributes(relation));
 
+    AssertHasSnapshotForToast(relation);
+
     /*
     * Forbid this during a parallel operation, lest it allocate a combo CID.
     * Other workers might need that combo CID for visibility checks, and we
@@ -4953,7 +4982,7 @@ l3:
                 case LockWaitError:
                     if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
                                                     status, infomask, relation,
-                                                    NULL, log_lock_failure))
+                                                    NULL, log_lock_failures))
                         ereport(ERROR,
                                 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                                  errmsg("could not obtain lock on row in relation \"%s\"",
@@ -4991,7 +5020,7 @@ l3:
                     }
                     break;
                 case LockWaitError:
-                    if (!ConditionalXactLockTableWait(xwait, log_lock_failure))
+                    if (!ConditionalXactLockTableWait(xwait, log_lock_failures))
                         ereport(ERROR,
                                 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                                  errmsg("could not obtain lock on row in relation \"%s\"",
@@ -5256,7 +5285,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
             break;
 
         case LockWaitError:
-            if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failure))
+            if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures))
                 ereport(ERROR,
                         (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                          errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index ac082fefa77..cb4bc35c93e 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -464,7 +464,7 @@ tuple_lock_retry:
                     return TM_WouldBlock;
                 break;
             case LockWaitError:
-                if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure))
+                if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures))
                     ereport(ERROR,
                             (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                              errmsg("could not obtain lock on row in relation \"%s\"",
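The new AssertHasSnapshotForToast() calls mean that any code path reaching heap_insert(), heap_multi_insert(), heap_delete() or heap_update() on a relation with a TOAST table must already hold a registered or active snapshot. A hedged sketch of a conforming caller (rel and tup are assumed to be set up elsewhere; this is not code from the patch):

    /* Hold an active snapshot across a heap modification that may need
     * TOAST access; otherwise the new assertion fires in assert builds. */
    PushActiveSnapshot(GetTransactionSnapshot());
    heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);
    PopActiveSnapshot();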
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index 30f4c2d3c67..eb4bd3d6ae3 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -438,6 +438,9 @@ heap_xlog_insert(XLogReaderState *record)
     ItemPointerSetBlockNumber(&target_tid, blkno);
     ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
 
+    /* No freezing in the heap_insert() code path */
+    Assert(!(xlrec->flags & XLH_INSERT_ALL_FROZEN_SET));
+
     /*
     * The visibility map may need to be fixed even if the heap page is
     * already up-to-date.
@@ -508,10 +511,6 @@ heap_xlog_insert(XLogReaderState *record)
         if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
             PageClearAllVisible(page);
 
-        /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
-        if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
-            PageSetAllVisible(page);
-
         MarkBufferDirty(buffer);
     }
     if (BufferIsValid(buffer))
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index f28326bad09..14036c27e87 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -423,7 +423,7 @@ typedef struct LVSavedErrInfo
 /* non-export function prototypes */
 static void lazy_scan_heap(LVRelState *vacrel);
 static void heap_vacuum_eager_scan_setup(LVRelState *vacrel,
-                                         VacuumParams *params);
+                                         const VacuumParams params);
 static BlockNumber heap_vac_scan_next_block(ReadStream *stream,
                                             void *callback_private_data,
                                             void *per_buffer_data);
@@ -431,7 +431,7 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
                                    BlockNumber blkno, Page page,
                                    bool sharelock, Buffer vmbuffer);
-static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
+static int  lazy_scan_prune(LVRelState *vacrel, Buffer buf,
                             BlockNumber blkno, Page page,
                             Buffer vmbuffer, bool all_visible_according_to_vm,
                             bool *has_lpdead_items, bool *vm_page_frozen);
@@ -485,7 +485,7 @@ static void restore_vacuum_error_info(LVRelState *vacrel,
 * vacuum options or for relfrozenxid/relminmxid advancement.
 */
 static void
-heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
+heap_vacuum_eager_scan_setup(LVRelState *vacrel, const VacuumParams params)
 {
     uint32      randseed;
     BlockNumber allvisible;
@@ -504,7 +504,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
     vacrel->eager_scan_remaining_successes = 0;
 
     /* If eager scanning is explicitly disabled, just return. */
-    if (params->max_eager_freeze_failure_rate == 0)
+    if (params.max_eager_freeze_failure_rate == 0)
         return;
 
     /*
@@ -581,11 +581,11 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
     vacrel->next_eager_scan_region_start = randseed % EAGER_SCAN_REGION_SIZE;
 
-    Assert(params->max_eager_freeze_failure_rate > 0 &&
-           params->max_eager_freeze_failure_rate <= 1);
+    Assert(params.max_eager_freeze_failure_rate > 0 &&
+           params.max_eager_freeze_failure_rate <= 1);
 
     vacrel->eager_scan_max_fails_per_region =
-        params->max_eager_freeze_failure_rate *
+        params.max_eager_freeze_failure_rate *
         EAGER_SCAN_REGION_SIZE;
 
     /*
@@ -612,7 +612,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
 * and locked the relation.
 */
 void
-heap_vacuum_rel(Relation rel, VacuumParams *params,
+heap_vacuum_rel(Relation rel, const VacuumParams params,
                 BufferAccessStrategy bstrategy)
 {
     LVRelState *vacrel;
@@ -634,9 +634,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     ErrorContextCallback errcallback;
     char      **indnames = NULL;
 
-    verbose = (params->options & VACOPT_VERBOSE) != 0;
+    verbose = (params.options & VACOPT_VERBOSE) != 0;
     instrument = (verbose || (AmAutoVacuumWorkerProcess() &&
-                              params->log_min_duration >= 0));
+                              params.log_min_duration >= 0));
     if (instrument)
     {
         pg_rusage_init(&ru0);
@@ -699,9 +699,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     * The truncate param allows user to avoid attempting relation truncation,
     * though it can't force truncation to happen.
     */
-    Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
-    Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
-           params->truncate != VACOPTVALUE_AUTO);
+    Assert(params.index_cleanup != VACOPTVALUE_UNSPECIFIED);
+    Assert(params.truncate != VACOPTVALUE_UNSPECIFIED &&
+           params.truncate != VACOPTVALUE_AUTO);
 
     /*
     * While VacuumFailSafeActive is reset to false before calling this, we
@@ -711,14 +711,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     vacrel->consider_bypass_optimization = true;
     vacrel->do_index_vacuuming = true;
     vacrel->do_index_cleanup = true;
-    vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
-    if (params->index_cleanup == VACOPTVALUE_DISABLED)
+    vacrel->do_rel_truncate = (params.truncate != VACOPTVALUE_DISABLED);
+    if (params.index_cleanup == VACOPTVALUE_DISABLED)
     {
         /* Force disable index vacuuming up-front */
         vacrel->do_index_vacuuming = false;
         vacrel->do_index_cleanup = false;
     }
-    else if (params->index_cleanup == VACOPTVALUE_ENABLED)
+    else if (params.index_cleanup == VACOPTVALUE_ENABLED)
     {
         /* Force index vacuuming.  Note that failsafe can still bypass. */
         vacrel->consider_bypass_optimization = false;
@@ -726,7 +726,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     else
     {
         /* Default/auto, make all decisions dynamically */
-        Assert(params->index_cleanup == VACOPTVALUE_AUTO);
+        Assert(params.index_cleanup == VACOPTVALUE_AUTO);
     }
 
     /* Initialize page counters explicitly (be tidy) */
@@ -757,7 +757,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     vacrel->vm_new_visible_pages = 0;
     vacrel->vm_new_visible_frozen_pages = 0;
     vacrel->vm_new_frozen_pages = 0;
-    vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
 
     /*
     * Get cutoffs that determine which deleted tuples are considered DEAD,
@@ -776,7 +775,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     * to increase the number of dead tuples it can prune away.)
     */
     vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs);
+    vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
     vacrel->vistest = GlobalVisTestFor(rel);
+
     /* Initialize state used to track oldest extant XID/MXID */
     vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin;
     vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact;
@@ -788,7 +789,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     */
     vacrel->skippedallvis = false;
     skipwithvm = true;
-    if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
+    if (params.options & VACOPT_DISABLE_PAGE_SKIPPING)
     {
         /*
         * Force aggressive mode, and disable skipping blocks using the
@@ -829,7 +830,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     * is already dangerously old.)
     */
     lazy_check_wraparound_failsafe(vacrel);
-    dead_items_alloc(vacrel, params->nworkers);
+    dead_items_alloc(vacrel, params.nworkers);
 
     /*
     * Call lazy_scan_heap to perform all required heap pruning, index
@@ -946,9 +947,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
     {
         TimestampTz endtime = GetCurrentTimestamp();
 
-        if (verbose || params->log_min_duration == 0 ||
+        if (verbose || params.log_min_duration == 0 ||
             TimestampDifferenceExceeds(starttime, endtime,
-                                       params->log_min_duration))
+                                       params.log_min_duration))
         {
             long        secs_dur;
             int         usecs_dur;
@@ -983,10 +984,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
                 * Aggressiveness already reported earlier, in dedicated
                 * VACUUM VERBOSE ereport
                 */
-                Assert(!params->is_wraparound);
+                Assert(!params.is_wraparound);
                 msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n");
             }
-            else if (params->is_wraparound)
+            else if (params.is_wraparound)
             {
                 /*
                 * While it's possible for a VACUUM to be both is_wraparound
@@ -1244,6 +1245,7 @@ lazy_scan_heap(LVRelState *vacrel)
         Buffer      buf;
         Page        page;
         uint8       blk_info = 0;
+        int         ndeleted = 0;
         bool        has_lpdead_items;
         void       *per_buffer_data = NULL;
         bool        vm_page_frozen = false;
@@ -1386,10 +1388,10 @@ lazy_scan_heap(LVRelState *vacrel)
         * line pointers previously marked LP_DEAD.
         */
         if (got_cleanup_lock)
-            lazy_scan_prune(vacrel, buf, blkno, page,
-                            vmbuffer,
-                            blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM,
-                            &has_lpdead_items, &vm_page_frozen);
+            ndeleted = lazy_scan_prune(vacrel, buf, blkno, page,
+                                       vmbuffer,
+                                       blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM,
+                                       &has_lpdead_items, &vm_page_frozen);
 
         /*
         * Count an eagerly scanned page as a failure or a success.
@@ -1413,12 +1415,26 @@ lazy_scan_heap(LVRelState *vacrel)
             if (vm_page_frozen)
             {
-                Assert(vacrel->eager_scan_remaining_successes > 0);
-                vacrel->eager_scan_remaining_successes--;
+                if (vacrel->eager_scan_remaining_successes > 0)
+                    vacrel->eager_scan_remaining_successes--;
 
                 if (vacrel->eager_scan_remaining_successes == 0)
                 {
                     /*
+                     * Report only once that we disabled eager scanning.  We
+                     * may eagerly read ahead blocks in excess of the success
+                     * or failure caps before attempting to freeze them, so we
+                     * could reach here even after disabling additional eager
+                     * scanning.
+                     */
+                    if (vacrel->eager_scan_max_fails_per_region > 0)
+                        ereport(vacrel->verbose ? INFO : DEBUG2,
+                                (errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of relation \"%s.%s.%s\"",
+                                        orig_eager_scan_success_limit,
+                                        vacrel->dbname, vacrel->relnamespace,
+                                        vacrel->relname)));
+
+                    /*
                     * If we hit our success cap, permanently disable eager
                     * scanning by setting the other eager scan management
                     * fields to their disabled values.
@@ -1426,19 +1442,10 @@ lazy_scan_heap(LVRelState *vacrel)
                     vacrel->eager_scan_remaining_fails = 0;
                     vacrel->next_eager_scan_region_start = InvalidBlockNumber;
                     vacrel->eager_scan_max_fails_per_region = 0;
-
-                    ereport(vacrel->verbose ? INFO : DEBUG2,
-                            (errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of \"%s.%s.%s\"",
-                                    orig_eager_scan_success_limit,
-                                    vacrel->dbname, vacrel->relnamespace,
-                                    vacrel->relname)));
                 }
             }
-            else
-            {
-                Assert(vacrel->eager_scan_remaining_fails > 0);
+            else if (vacrel->eager_scan_remaining_fails > 0)
                 vacrel->eager_scan_remaining_fails--;
-            }
         }
 
         /*
@@ -1475,7 +1482,7 @@ lazy_scan_heap(LVRelState *vacrel)
         * table has indexes.  There will only be newly-freed space if we
         * held the cleanup lock and lazy_scan_prune() was called.
         */
-        if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items &&
+        if (got_cleanup_lock && vacrel->nindexes == 0 && ndeleted > 0 &&
             blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
         {
             FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
@@ -1866,8 +1873,6 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
         */
         if (!PageIsAllVisible(page))
         {
-            uint8       old_vmbits;
-
             START_CRIT_SECTION();
 
             /* mark buffer dirty before writing a WAL record */
@@ -1887,24 +1892,16 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
                 log_newpage_buffer(buf, true);
 
             PageSetAllVisible(page);
-            old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf,
-                                           InvalidXLogRecPtr,
-                                           vmbuffer, InvalidTransactionId,
-                                           VISIBILITYMAP_ALL_VISIBLE |
-                                           VISIBILITYMAP_ALL_FROZEN);
+            visibilitymap_set(vacrel->rel, blkno, buf,
+                              InvalidXLogRecPtr,
+                              vmbuffer, InvalidTransactionId,
+                              VISIBILITYMAP_ALL_VISIBLE |
+                              VISIBILITYMAP_ALL_FROZEN);
             END_CRIT_SECTION();
 
-            /*
-             * If the page wasn't already set all-visible and/or all-frozen in
-             * the VM, count it as newly set for logging.
-             */
-            if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
-            {
-                vacrel->vm_new_visible_pages++;
-                vacrel->vm_new_visible_frozen_pages++;
-            }
-            else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0)
-                vacrel->vm_new_frozen_pages++;
+            /* Count the newly all-frozen pages for logging */
+            vacrel->vm_new_visible_pages++;
+            vacrel->vm_new_visible_frozen_pages++;
         }
 
         freespace = PageGetHeapFreeSpace(page);
@@ -1940,8 +1937,10 @@ cmpOffsetNumbers(const void *a, const void *b)
 * *vm_page_frozen is set to true if the page is newly set all-frozen in the
 * VM.  The caller currently only uses this for determining whether an eagerly
 * scanned page was successfully set all-frozen.
+ *
+ * Returns the number of tuples deleted from the page during HOT pruning.
 */
-static void
+static int
 lazy_scan_prune(LVRelState *vacrel,
                 Buffer buf,
                 BlockNumber blkno,
@@ -2212,6 +2211,8 @@ lazy_scan_prune(LVRelState *vacrel,
             *vm_page_frozen = true;
         }
     }
+
+    return presult.ndeleted;
 }
 
 /*
@@ -2909,7 +2910,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
     if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
                                  &all_frozen))
     {
-        uint8       old_vmbits;
         uint8       flags = VISIBILITYMAP_ALL_VISIBLE;
 
         if (all_frozen)
@@ -2919,25 +2919,15 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
         }
 
         PageSetAllVisible(page);
-        old_vmbits = visibilitymap_set(vacrel->rel, blkno, buffer,
-                                       InvalidXLogRecPtr,
-                                       vmbuffer, visibility_cutoff_xid,
-                                       flags);
-
-        /*
-         * If the page wasn't already set all-visible and/or all-frozen in the
-         * VM, count it as newly set for logging.
-         */
-        if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
-        {
-            vacrel->vm_new_visible_pages++;
-            if (all_frozen)
-                vacrel->vm_new_visible_frozen_pages++;
-        }
+        visibilitymap_set(vacrel->rel, blkno, buffer,
+                          InvalidXLogRecPtr,
+                          vmbuffer, visibility_cutoff_xid,
+                          flags);
 
-        else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
-                 all_frozen)
-            vacrel->vm_new_frozen_pages++;
+        /* Count the newly set VM page for logging */
+        vacrel->vm_new_visible_pages++;
+        if (all_frozen)
+            vacrel->vm_new_visible_frozen_pages++;
     }
 
     /* Revert to the previous phase information for error traceback */
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 745a04ef26e..8f918e00af7 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -364,7 +364,7 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
     {
         *vmbuf = vm_readbuf(rel, mapBlock, false);
         if (!BufferIsValid(*vmbuf))
-            return false;
+            return (uint8) 0;
     }
 
     map = PageGetContents(BufferGetPage(*vmbuf));
diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c
index f0f4f974bce..60684c53422 100644
--- a/src/backend/access/index/amapi.c
+++ b/src/backend/access/index/amapi.c
@@ -42,6 +42,19 @@ GetIndexAmRoutine(Oid amhandler)
         elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct",
              amhandler);
 
+    /* Assert that all required callbacks are present. */
+    Assert(routine->ambuild != NULL);
+    Assert(routine->ambuildempty != NULL);
+    Assert(routine->aminsert != NULL);
+    Assert(routine->ambulkdelete != NULL);
+    Assert(routine->amvacuumcleanup != NULL);
+    Assert(routine->amcostestimate != NULL);
+    Assert(routine->amoptions != NULL);
+    Assert(routine->amvalidate != NULL);
+    Assert(routine->ambeginscan != NULL);
+    Assert(routine->amrescan != NULL);
+    Assert(routine->amendscan != NULL);
+
     return routine;
 }
diff --git a/src/backend/access/nbtree/nbtcompare.c b/src/backend/access/nbtree/nbtcompare.c
index 4da5a3c1d16..e1b52acd20d 100644
--- a/src/backend/access/nbtree/nbtcompare.c
+++ b/src/backend/access/nbtree/nbtcompare.c
@@ -555,7 +555,7 @@ btcharcmp(PG_FUNCTION_ARGS)
 static Datum
 char_decrement(Relation rel, Datum existing, bool *underflow)
 {
-    uint8       cexisting = UInt8GetDatum(existing);
+    uint8       cexisting = DatumGetUInt8(existing);
 
     if (cexisting == 0)
     {
@@ -571,7 +571,7 @@ char_decrement(Relation rel, Datum existing, bool *overflow)
 static Datum
 char_increment(Relation rel, Datum existing, bool *overflow)
 {
-    uint8       cexisting = UInt8GetDatum(existing);
+    uint8       cexisting = DatumGetUInt8(existing);
 
     if (cexisting == UCHAR_MAX)
     {
diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index a136e4bbfdf..21c519cd108 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -16,6 +16,7 @@
 #include "postgres.h"
 
 #include "access/nbtree.h"
+#include "common/int.h"
 #include "lib/qunique.h"
 #include "utils/array.h"
 #include "utils/lsyscache.h"
@@ -56,6 +57,8 @@ static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk,
                                           BTArrayKeyInfo *array);
 static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
                                           BTArrayKeyInfo *array);
+static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap);
+static int  _bt_reorder_array_cmp(const void *a, const void *b);
 static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan,
                                          int *new_numberOfKeys);
 static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
 static int  _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out,
@@ -96,7 +99,7 @@ static int  _bt_compare_array_elements(const void *a, const void *b, void *arg);
 * incomplete sets of cross-type operators, we may fail to detect redundant
 * or contradictory keys, but we can survive that.)
 *
- * The output keys must be sorted by index attribute.  Presently we expect
+ * Required output keys are sorted by index attribute.  Presently we expect
 * (but verify) that the input keys are already so sorted --- this is done
 * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
 * within each attribute may be done as a byproduct of the processing here.
@@ -127,29 +130,36 @@ static int  _bt_compare_array_elements(const void *a, const void *b, void *arg);
 * This has the potential to be much more efficient than a full index scan
 * (though it behaves like a full scan when there's many distinct "x" values).
 *
- * If possible, redundant keys are eliminated: we keep only the tightest
+ * Typically, redundant keys are eliminated: we keep only the tightest
 * >/>= bound and the tightest </<= bound, and if there's an = key then
 * that's the only one returned.  (So, we return either a single = key,
 * or one or two boundary-condition keys for each attr.)  However, if we
 * cannot compare two keys for lack of a suitable cross-type operator,
- * we cannot eliminate either.  If there are two such keys of the same
- * operator strategy, the second one is just pushed into the output array
- * without further processing here.  We may also emit both >/>= or both
- * </<= keys if we can't compare them.  The logic about required keys still
- * works if we don't eliminate redundant keys.
- *
- * Note that one reason we need direction-sensitive required-key flags is
- * precisely that we may not be able to eliminate redundant keys.  Suppose
- * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
- * which key is more restrictive for lack of a suitable cross-type operator.
- * _bt_first will arbitrarily pick one of the keys to do the initial
- * positioning with.  If it picks x > 4, then the x > 10 condition will fail
- * until we reach index entries > 10; but we can't stop the scan just because
- * x > 10 is failing.  On the other hand, if we are scanning backwards, then
- * failure of either key is indeed enough to stop the scan.  (In general, when
- * inequality keys are present, the initial-positioning code only promises to
- * position before the first possible match, not exactly at the first match,
- * for a forward scan; or after the last match for a backward scan.)
+ * we cannot eliminate either key.
+ *
+ * When all redundant keys could not be eliminated, we'll output a key array
+ * that can more or less be treated as if it had no redundant keys.  Suppose
+ * we have "x > 4::int AND x > 10::bigint AND x < 70", and we are unable to
+ * determine which > key is more restrictive for lack of a suitable cross-type
+ * operator.  We'll arbitrarily pick one of the > keys; the other > key won't
+ * be marked required.  Obviously, the scan will be less efficient if we
+ * choose x > 4 over x > 10 -- but it can still largely proceed as if there
+ * was only a single > condition.  "x > 10" will be placed at the end of the
+ * so->keyData[] output array.  It'll always be evaluated last, after the keys
+ * that could be marked required in the usual way (after "x > 4 AND x < 70").
+ * This can sometimes result in so->keyData[] keys that aren't even in index
+ * attribute order (if the qual involves multiple attributes).  The scan's
+ * required keys will still be in attribute order, though, so it can't matter.
+ *
+ * This scheme ensures that _bt_first always uses the same set of keys at the
+ * start of a forwards scan as those _bt_checkkeys uses to determine when to
+ * end a similar backwards scan (and vice-versa).  _bt_advance_array_keys
+ * depends on this: it expects to be able to reliably predict what the next
+ * _bt_first call will do by testing whether _bt_checkkeys' routines report
+ * that the final tuple on the page is past the end of matches for the scan's
+ * keys with the scan direction flipped.  If it is (if continuescan=false),
+ * then it follows that calling _bt_first will, at a minimum, relocate the
+ * scan to the very next leaf page (in the current scan direction).
 *
 * As a byproduct of this work, we can detect contradictory quals such
 * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
@@ -188,7 +198,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
     int         numberOfEqualCols;
     ScanKey     inkeys;
     BTScanKeyPreproc xform[BTMaxStrategyNumber];
-    bool        test_result;
+    bool        test_result,
+                redundant_key_kept = false;
     AttrNumber  attno;
     ScanKey     arrayKeyData;
     int        *keyDataMap = NULL;
@@ -388,7 +399,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
                     xform[j].inkey = NULL;
                     xform[j].inkeyi = -1;
                 }
-                /* else, cannot determine redundancy, keep both keys */
+                else
+                    redundant_key_kept = true;
             }
             /* track number of attrs for which we have "=" keys */
             numberOfEqualCols++;
@@ -409,6 +421,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
                 else
                     xform[BTLessStrategyNumber - 1].inkey = NULL;
             }
+            else
+                redundant_key_kept = true;
         }
 
         /* try to keep only one of >, >= */
@@ -426,6 +440,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
                 else
                     xform[BTGreaterStrategyNumber - 1].inkey = NULL;
             }
+            else
+                redundant_key_kept = true;
         }
 
         /*
@@ -466,25 +482,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
         /* check strategy this key's operator corresponds to */
         j = inkey->sk_strategy - 1;
 
-        /* if row comparison, push it directly to the output array */
-        if (inkey->sk_flags & SK_ROW_HEADER)
-        {
-            ScanKey     outkey = &so->keyData[new_numberOfKeys++];
-
-            memcpy(outkey, inkey, sizeof(ScanKeyData));
-            if (arrayKeyData)
-                keyDataMap[new_numberOfKeys - 1] = i;
-            if (numberOfEqualCols == attno - 1)
-                _bt_mark_scankey_required(outkey);
-
-            /*
-             * We don't support RowCompare using equality; such a qual would
-             * mess up the numberOfEqualCols tracking.
-             */
-            Assert(j != (BTEqualStrategyNumber - 1));
-            continue;
-        }
-
         if (inkey->sk_strategy == BTEqualStrategyNumber &&
             (inkey->sk_flags & SK_SEARCHARRAY))
         {
@@ -593,9 +590,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
                 * the new scan key.
                 *
                 * Note: We do things this way around so that our arrays are
-                * always in the same order as their corresponding scan keys,
-                * even with incomplete opfamilies.  _bt_advance_array_keys
-                * depends on this.
+                * always in the same order as their corresponding scan keys.
+                * _bt_preprocess_array_keys_final expects this.
                 */
                 ScanKey     outkey = &so->keyData[new_numberOfKeys++];
 
@@ -607,6 +603,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
                     xform[j].inkey = inkey;
                     xform[j].inkeyi = i;
                     xform[j].arrayidx = arrayidx;
+                    redundant_key_kept = true;
                 }
             }
         }
@@ -622,6 +619,15 @@ _bt_preprocess_keys(IndexScanDesc scan)
     if (arrayKeyData)
         _bt_preprocess_array_keys_final(scan, keyDataMap);
 
+    /*
+     * If there are remaining redundant inequality keys, we must make sure
+     * that each index attribute has no more than one required >/>= key, and
+     * no more than one required </<= key.  Attributes that have one or more
+     * required = keys now must keep only one required key (the first = key).
+     */
+    if (unlikely(redundant_key_kept) && so->qual_ok)
+        _bt_unmark_keys(scan, keyDataMap);
+
     /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
 }
 
@@ -746,9 +752,12 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
 *
 * Depending on the operator type, the key may be required for both scan
 * directions or just one.  Also, if the key is a row comparison header,
- * we have to mark its first subsidiary ScanKey as required.  (Subsequent
- * subsidiary ScanKeys are normally for lower-order columns, and thus
- * cannot be required, since they're after the first non-equality scankey.)
+ * we have to mark the appropriate subsidiary ScanKeys as required.  In such
+ * cases, the first subsidiary key is required, but subsequent ones are
+ * required only as long as they correspond to successive index columns and
+ * match the leading column as to sort direction.  Otherwise the row
+ * comparison ordering is different from the index ordering and so we can't
+ * stop the scan on the basis of those lower-order columns.
 *
 * Note: when we set required-key flag bits in a subsidiary scankey, we are
 * scribbling on a data structure belonging to the index AM's caller, not on
@@ -786,12 +795,25 @@ _bt_mark_scankey_required(ScanKey skey)
     if (skey->sk_flags & SK_ROW_HEADER)
     {
         ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+        AttrNumber  attno = skey->sk_attno;
 
         /* First subkey should be same column/operator as the header */
-        Assert(subkey->sk_flags & SK_ROW_MEMBER);
-        Assert(subkey->sk_attno == skey->sk_attno);
+        Assert(subkey->sk_attno == attno);
         Assert(subkey->sk_strategy == skey->sk_strategy);
-        subkey->sk_flags |= addflags;
+
+        for (;;)
+        {
+            Assert(subkey->sk_flags & SK_ROW_MEMBER);
+            if (subkey->sk_attno != attno)
+                break;          /* non-adjacent key, so not required */
+            if (subkey->sk_strategy != skey->sk_strategy)
+                break;          /* wrong direction, so not required */
+            subkey->sk_flags |= addflags;
+            if (subkey->sk_flags & SK_ROW_END)
+                break;
+            subkey++;
+            attno++;
+        }
     }
 }
 
@@ -847,8 +869,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                 cmp_op;
     StrategyNumber strat;
 
-    Assert(!((leftarg->sk_flags | rightarg->sk_flags) &
-             (SK_ROW_HEADER | SK_ROW_MEMBER)));
+    Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_MEMBER));
 
     /*
     * First, deal with cases where one or both args are NULL.  This should
@@ -925,6 +946,16 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
     }
 
     /*
+     * We don't yet know how to determine redundancy when it involves a row
+     * compare key (barring simple cases involving IS NULL/IS NOT NULL)
+     */
+    if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_HEADER)
+    {
+        Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP));
+        return false;
+    }
+
+    /*
     * If either leftarg or rightarg are equality-type array scankeys, we need
     * specialized handling (since by now we know that IS NULL wasn't used)
     */
@@ -1468,6 +1499,283 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
 }
 
 /*
+ * _bt_unmark_keys() -- make superfluous required keys nonrequired after all
+ *
+ * When _bt_preprocess_keys fails to eliminate one or more redundant keys, it
+ * calls here to make sure that no index attribute has more than one > or >=
+ * key marked required, and no more than one required < or <= key.  Attributes
+ * with = keys will always get one = key as their required key.  All other
+ * keys that were initially marked required get "unmarked" here.  That way,
+ * _bt_first and _bt_checkkeys will reliably agree on which keys to use to
+ * start and/or to end the scan.
+ *
+ * We also relocate keys that become/started out nonrequired to the end of
+ * so->keyData[].  That way, _bt_first and _bt_checkkeys cannot fail to reach
+ * a required key due to some earlier nonrequired key getting in the way.
+ *
+ * Only call here when _bt_compare_scankey_args returned false at least once
+ * (otherwise, calling here will just waste cycles).
+ */
+static void
+_bt_unmark_keys(IndexScanDesc scan, int *keyDataMap)
+{
+    BTScanOpaque so = (BTScanOpaque) scan->opaque;
+    AttrNumber  attno;
+    bool       *unmarkikey;
+    int         nunmark,
+                nunmarked,
+                nkept,
+                firsti;
+    ScanKey     keepKeys,
+                unmarkKeys;
+    FmgrInfo   *keepOrderProcs = NULL,
+               *unmarkOrderProcs = NULL;
+    bool        haveReqEquals,
+                haveReqForward,
+                haveReqBackward;
+
+    /*
+     * Do an initial pass over so->keyData[] that determines which keys to
+     * keep as required.  We expect so->keyData[] to still be in attribute
+     * order when we're called (though we don't expect any particular order
+     * among each attribute's keys).
+     *
+     * When both equality and inequality keys remain on a single attribute, we
+     * *must* make sure that exactly one of the equalities remains required.
+     * Any requiredness markings that we might leave on later keys/attributes
+     * are predicated on there being required = keys on all prior columns.
+     */
+    unmarkikey = palloc0(so->numberOfKeys * sizeof(bool));
+    nunmark = 0;
+
+    /* Set things up for first key's attribute */
+    attno = so->keyData[0].sk_attno;
+    firsti = 0;
+    haveReqEquals = false;
+    haveReqForward = false;
+    haveReqBackward = false;
+    for (int i = 0; i < so->numberOfKeys; i++)
+    {
+        ScanKey     origkey = &so->keyData[i];
+
+        if (origkey->sk_attno != attno)
+        {
+            /* Reset for next attribute */
+            attno = origkey->sk_attno;
+            firsti = i;
+
+            haveReqEquals = false;
+            haveReqForward = false;
+            haveReqBackward = false;
+        }
+
+        /* Equalities get priority over inequalities */
+        if (haveReqEquals)
+        {
+            /*
+             * We already found the first "=" key for this attribute.  We've
+             * already decided that all its other keys will be unmarked.
+             */
+            Assert(!(origkey->sk_flags & SK_SEARCHNULL));
+            unmarkikey[i] = true;
+            nunmark++;
+            continue;
+        }
+        else if ((origkey->sk_flags & SK_BT_REQFWD) &&
+                 (origkey->sk_flags & SK_BT_REQBKWD))
+        {
+            /*
+             * Found the first "=" key for attno.  All other attno keys will
+             * be unmarked.
+             */
+            Assert(origkey->sk_strategy == BTEqualStrategyNumber);
+
+            haveReqEquals = true;
+            for (int j = firsti; j < i; j++)
+            {
+                /* Unmark any prior inequality keys on attno after all */
+                if (!unmarkikey[j])
+                {
+                    unmarkikey[j] = true;
+                    nunmark++;
+                }
+            }
+            continue;
+        }
+
+        /* Deal with inequalities next */
+        if ((origkey->sk_flags & SK_BT_REQFWD) && !haveReqForward)
+        {
+            haveReqForward = true;
+            continue;
+        }
+        else if ((origkey->sk_flags & SK_BT_REQBKWD) && !haveReqBackward)
+        {
+            haveReqBackward = true;
+            continue;
+        }
+
+        /*
+         * We have either a redundant inequality key that will be unmarked, or
+         * we have a key that wasn't marked required in the first place
+         */
+        unmarkikey[i] = true;
+        nunmark++;
+    }
+
+    /* Should only be called when _bt_compare_scankey_args reported failure */
+    Assert(nunmark > 0);
+
+    /*
+     * Next, allocate temp arrays: one for required keys that'll remain
+     * required, the other for all remaining keys
+     */
+    unmarkKeys = palloc(nunmark * sizeof(ScanKeyData));
+    keepKeys = palloc((so->numberOfKeys - nunmark) * sizeof(ScanKeyData));
+    nunmarked = 0;
+    nkept = 0;
+    if (so->numArrayKeys)
+    {
+        unmarkOrderProcs = palloc(nunmark * sizeof(FmgrInfo));
+        keepOrderProcs = palloc((so->numberOfKeys - nunmark) * sizeof(FmgrInfo));
+    }
+
+    /*
+     * Next, copy the contents of so->keyData[] into the appropriate temp
+     * array.
+     *
+     * Scans with = array keys need us to maintain invariants around the order
+     * of so->orderProcs[] and so->arrayKeys[] relative to so->keyData[].  See
+     * _bt_preprocess_array_keys_final for a full explanation.
+     */
+    for (int i = 0; i < so->numberOfKeys; i++)
+    {
+        ScanKey     origkey = &so->keyData[i];
+        ScanKey     unmark;
+
+        if (!unmarkikey[i])
+        {
+            /*
+             * Key gets to keep its original requiredness markings.
+             *
+             * Key will stay in its original position, unless we're going to
+             * unmark an earlier key (in which case this key gets moved back).
+             */
+            memcpy(keepKeys + nkept, origkey, sizeof(ScanKeyData));
+
+            if (so->numArrayKeys)
+            {
+                keyDataMap[i] = nkept;
+                memcpy(keepOrderProcs + nkept, &so->orderProcs[i],
+                       sizeof(FmgrInfo));
+            }
+
+            nkept++;
+            continue;
+        }
+
+        /*
+         * Key will be unmarked as needed, and moved to the end of the array,
+         * next to other keys that will become (or always were) nonrequired
+         */
+        unmark = unmarkKeys + nunmarked;
+        memcpy(unmark, origkey, sizeof(ScanKeyData));
+
+        if (so->numArrayKeys)
+        {
+            keyDataMap[i] = (so->numberOfKeys - nunmark) + nunmarked;
+            memcpy(&unmarkOrderProcs[nunmarked], &so->orderProcs[i],
+                   sizeof(FmgrInfo));
+        }
+
+        /*
+         * Preprocessing only generates skip arrays when it knows that they'll
+         * be the only required = key on the attr.  We'll never unmark them.
+         */
+        Assert(!(unmark->sk_flags & SK_BT_SKIP));
+
+        /*
+         * Also shouldn't have to unmark an IS NULL or an IS NOT NULL key.
+         * They aren't cross-type, so an incomplete opfamily can't matter.
+         */
+        Assert(!(unmark->sk_flags & SK_ISNULL) ||
+               !(unmark->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)));
+
+        /* Clear requiredness flags on redundant key (and on any subkeys) */
+        unmark->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD);
+        if (unmark->sk_flags & SK_ROW_HEADER)
+        {
+            ScanKey     subkey = (ScanKey) DatumGetPointer(unmark->sk_argument);
+
+            Assert(subkey->sk_strategy == unmark->sk_strategy);
+            for (;;)
+            {
+                Assert(subkey->sk_flags & SK_ROW_MEMBER);
+                subkey->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD);
+                if (subkey->sk_flags & SK_ROW_END)
+                    break;
+                subkey++;
+            }
+        }
+
+        nunmarked++;
+    }
+
+    /* Copy both temp arrays back into so->keyData[] to reorder */
+    Assert(nkept == so->numberOfKeys - nunmark);
+    Assert(nunmarked == nunmark);
+    memcpy(so->keyData, keepKeys, sizeof(ScanKeyData) * nkept);
+    memcpy(so->keyData + nkept, unmarkKeys, sizeof(ScanKeyData) * nunmarked);
+
+    /* Done with temp arrays */
+    pfree(unmarkikey);
+    pfree(keepKeys);
+    pfree(unmarkKeys);
+
+    /*
+     * Now copy so->orderProcs[] temp entries needed by scans with = array
+     * keys back (just like with the so->keyData[] temp arrays)
+     */
+    if (so->numArrayKeys)
+    {
+        memcpy(so->orderProcs, keepOrderProcs, sizeof(FmgrInfo) * nkept);
+        memcpy(so->orderProcs + nkept, unmarkOrderProcs,
+               sizeof(FmgrInfo) * nunmarked);
+
+        /* Also fix-up array->scan_key references */
+        for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
+        {
+            BTArrayKeyInfo *array = &so->arrayKeys[arridx];
+
+            array->scan_key = keyDataMap[array->scan_key];
+        }
+
+        /*
+         * Sort so->arrayKeys[] based on its new BTArrayKeyInfo.scan_key
+         * offsets, so that its order matches so->keyData[] order as expected
+         */
+        qsort(so->arrayKeys, so->numArrayKeys, sizeof(BTArrayKeyInfo),
+              _bt_reorder_array_cmp);
+
+        /* Done with temp arrays */
+        pfree(unmarkOrderProcs);
+        pfree(keepOrderProcs);
+    }
+}
+
+/*
+ * qsort comparator for reordering so->arrayKeys[] BTArrayKeyInfo entries
+ */
+static int
+_bt_reorder_array_cmp(const void *a, const void *b)
+{
+    BTArrayKeyInfo *arraya = (BTArrayKeyInfo *) a;
+    BTArrayKeyInfo *arrayb = (BTArrayKeyInfo *) b;
+
+    return pg_cmp_s32(arraya->scan_key, arrayb->scan_key);
+}
+
+/*
 * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
 *
 * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
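The new _bt_reorder_array_cmp() relies on pg_cmp_s32() from common/int.h (hence the new include at the top of the file). A short sketch of why that helper is preferred over the classic "return a - b" comparator; cmp_int32 is an illustrative name:

    #include "postgres.h"
    #include "common/int.h"

    /* Subtraction can overflow for int32 inputs; pg_cmp_s32() instead
     * computes (a > b) - (a < b), which is always well defined. */
    static int
    cmp_int32(const void *a, const void *b)
    {
        int32       va = *(const int32 *) a;
        int32       vb = *(const int32 *) b;

        return pg_cmp_s32(va, vb);
    }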
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..fdff960c130 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     bool        res;
 
+    Assert(scan->heapRelation != NULL);
+
     /* btree indexes are never lossy */
     scan->xs_recheck = false;
 
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
     int64       ntids = 0;
     ItemPointer heapTid;
 
+    Assert(scan->heapRelation == NULL);
+
     /* Each loop iteration performs another primitive index scan */
     do
     {
@@ -393,6 +397,34 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         BTScanPosInvalidate(so->currPos);
     }
 
+    /*
+     * We prefer to eagerly drop leaf page pins before btgettuple returns.
+     * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+     *
+     * We cannot safely drop leaf page pins during index-only scans due to a
+     * race condition involving VACUUM setting pages all-visible in the VM.
+     * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+     *
+     * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+     * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+     * The scheme used to detect unsafe TID recycling won't work when scanning
+     * unlogged relations (since it involves saving an affected page's LSN).
+     * Opt out of eager pin dropping during unlogged relation scans for now
+     * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+     *
+     * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+     * Pins cannot be held for more than an instant during bitmap scans either
+     * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+     *
+     * See nbtree/README section on making concurrent TID recycling safe.
+     *
+     * Note: so->dropPin should never change across rescans.
+     */
+    so->dropPin = (!scan->xs_want_itup &&
+                   IsMVCCSnapshot(scan->xs_snapshot) &&
+                   RelationNeedsWAL(scan->indexRelation) &&
+                   scan->heapRelation != NULL);
+
     so->markItemIndex = -1;
     so->needPrimScan = false;
     so->scanBehind = false;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..d69798795b4 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
                             Buffer buf, bool forupdate, BTStack stack,
                             int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 /*
 * _bt_drop_lock_and_maybe_pin()
 *
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf.  If scan is so->dropPin, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
 */
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 {
-    _bt_unlockbuf(scan->indexRelation, sp->buf);
-
-    if (IsMVCCSnapshot(scan->xs_snapshot) &&
-        RelationNeedsWAL(scan->indexRelation) &&
-        !scan->xs_want_itup)
+    if (!so->dropPin)
     {
-        ReleaseBuffer(sp->buf);
-        sp->buf = InvalidBuffer;
+        /* Just drop the lock (not the pin) */
+        _bt_unlockbuf(rel, so->currPos.buf);
+        return;
     }
+
+    /*
+     * Drop both the lock and the pin.
+     *
+     * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+     * when concurrent heap TID recycling by VACUUM might have taken place.
+     */
+    Assert(RelationNeedsWAL(rel));
+    so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+    _bt_relbuf(rel, so->currPos.buf);
+    so->currPos.buf = InvalidBuffer;
 }
 
 /*
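In so->dropPin mode, the page LSN saved above is what later lets _bt_killitems detect that VACUUM may have recycled TIDs while no pin was held. A hypothetical condensed form of that safety test (the patch's real check lives in _bt_killitems, which is not part of this hunk):

    /* If the leaf page's LSN moved while we held no pin, VACUUM may have
     * recycled TIDs, so it is no longer safe to mark tuples LP_DEAD. */
    static bool
    leaf_page_unchanged(BTScanOpaque so, Buffer buf)
    {
        return BufferGetLSNAtomic(buf) == so->currPos.lsn;
    }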
* _bt_returnitem sets the next item to return to scan on success exit. * * If there are no matching items in the index, we return false, with no @@ -887,9 +892,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkey; int keysz = 0; - StrategyNumber strat_total; + StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; @@ -955,46 +960,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /*---------- * Examine the scan keys to discover where we need to start the scan. + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. The final + * startKeys[] entry's strategy is set in strat_total. (Actually, there + * are a couple of cases where we force a less/more restrictive strategy.) * - * We want to identify the keys that can be used as starting boundaries; - * these are =, >, or >= keys for a forward scan or =, <, <= keys for - * a backwards scan. We can use keys for multiple attributes so long as - * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept - * a > or < boundary or find an attribute with no boundary (which can be - * thought of as the same as "> -infinity"), we can't use keys for any - * attributes to its right, because it would break our simplistic notion - * of what initial positioning strategy to use. + * We must use the key that was marked required (in the direction opposite + * our own scan's) during preprocessing. Each index attribute can only + * have one such required key. In general, the keys that we use to find + * an initial position when scanning forwards are the same keys that end + * the scan on the leaf level when scanning backwards (and vice-versa). * * When the scan keys include cross-type operators, _bt_preprocess_keys - * may not be able to eliminate redundant keys; in such cases we will - * arbitrarily pick a usable one for each attribute. This is correct - * but possibly not optimal behavior. (For example, with keys like - * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when - * x=5 would be more efficient.) Since the situation only arises given - * a poorly-worded query plus an incomplete opfamily, live with it. + * may not be able to eliminate redundant keys; in such cases it will + * arbitrarily pick a usable key for each attribute (and scan direction), + * ensuring that there is no more than one key required in each direction. + * We stop considering further keys once we reach the first nonrequired + * key (which must come after all required keys), so this can't affect us. + * + * The required keys that we use as starting boundaries have to be =, >, + * or >= keys for a forward scan or =, <, <= keys for a backwards scan. + * We can use keys for multiple attributes so long as the prior attributes + * had only =, >= (resp. =, <=) keys. These rules are very similar to the + * rules that preprocessing used to determine which keys to mark required. + * We cannot always use every required key as a positioning key, though. + * Skip arrays necessitate independently applying our own rules here. + * Skip arrays are always generally considered = array keys, but we'll + * nevertheless treat them as inequalities at certain points of the scan. 
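The positioning rules described above boil down to a small decision table over strategy and scan direction. A sketch (the enum mirrors the btree strategy numbers 1..5; the function name is mine):

    #include <stdbool.h>

    typedef enum { BT_LT = 1, BT_LE, BT_EQ, BT_GE, BT_GT } BtStrategy;
    typedef enum { DIR_FORWARD, DIR_BACKWARD } Direction;

    /* May a required key with this strategy help position the scan's start? */
    static bool
    usable_start_key(BtStrategy strat, Direction dir)
    {
        switch (strat)
        {
            case BT_EQ:
                return true;                    /* usable in either direction */
            case BT_GE:
            case BT_GT:
                return dir == DIR_FORWARD;      /* lower bounds seed forward scans */
            case BT_LT:
            case BT_LE:
                return dir == DIR_BACKWARD;     /* upper bounds seed backward scans */
        }
        return false;
    }

Keys on later attributes stay usable only while every earlier attribute contributed an = or >= key (forward case; = or <= when backward); a strict > or < ends the usable prefix, which is why strat_total stops the selection loop below.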
+ * When that happens, it _might_ have implications for the number of + * required keys that we can safely use for initial positioning purposes. * - * When both equality and inequality keys appear for a single attribute - * (again, only possible when cross-type operators appear), we *must* - * select one of the equality keys for the starting point, because - * _bt_checkkeys() will stop the scan as soon as an equality qual fails. - * For example, if we have keys like "x >= 4 AND x = 10" and we elect to - * start at x=4, we will fail and stop before reaching x=10. If multiple - * equality quals survive preprocessing, however, it doesn't matter which - * one we use --- by definition, they are either redundant or - * contradictory. + * For example, a forward scan with a skip array on its leading attribute + * (with no low_compare/high_compare) will have at least two required scan + * keys, but we won't use any of them as boundary keys during the scan's + * initial call here. Our positioning key during the first call here can + * be thought of as representing "> -infinity". Similarly, if such a skip + * array's low_compare is "a > 'foo'", then we position using "a > 'foo'" + * during the scan's initial call here; a lower-order key such as "b = 42" + * can't be used until the "a" array advances beyond MINVAL/low_compare. * - * In practice we rarely see any "attribute boundary key gaps" here. - * Preprocessing can usually backfill skip array keys for any attributes - * that were omitted from the original scan->keyData[] input keys. All - * array keys are always considered = keys, but we'll sometimes need to - * treat the current key value as if we were using an inequality strategy. - * This happens with range skip arrays, which store inequality keys in the - * array's low_compare/high_compare fields (used to find the first/last - * set of matches, when = key will lack a usable sk_argument value). - * These are always preferred over any redundant "standard" inequality - * keys on the same column (per the usual rule about preferring = keys). - * Note also that any column with an = skip array key can never have an - * additional, contradictory = key. + * On the other hand, if such a skip array's low_compare was "a >= 'foo'", + * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here. + * A subsequent call here might have us use "a = 'fop' AND b = 42". Note + * that we treat = and >= as equivalent when scanning forwards (just as we + * treat = and <= as equivalent when scanning backwards). We effectively + * do the same thing (though with a distinct "a" element/value) each time. * * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. @@ -1006,41 +1016,38 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * traversing a lot of null entries at the start of the scan. * * In this loop, row-comparison keys are treated the same as keys on their - * first (leftmost) columns. We'll add on lower-order columns of the row - * comparison below, if possible. + * first (leftmost) columns. We'll add all lower-order columns of the row + * comparison that were marked required during preprocessing below. * - * The selected scan keys (at most one per index column) are remembered by - * storing their addresses into the local startKeys[] array. 
- * - * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start - * the next primitive index scan (for scans with array keys) based in part - * on an understanding of how it'll enable us to reposition the scan. - * They're directly aware of how we'll sometimes cons up an explicit - * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a - * symmetric "deduce NOT NULL" rule of their own. This allows top-level - * scans to skip large groups of NULLs through repeated deductions about - * key strictness (for a required inequality key) and whether NULLs in the - * key's index column are stored last or first (relative to non-NULLs). + * _bt_advance_array_keys needs to know exactly how we'll reposition the + * scan (should it opt to schedule another primitive index scan). It is + * critical that primscans only be scheduled when they'll definitely make + * some useful progress. _bt_advance_array_keys does this by calling + * _bt_checkkeys routines that report whether a tuple is past the end of + * matches for the scan's keys (given the scan's current array elements). + * If the page's final tuple is "after the end of matches" for a scan that + * uses the *opposite* scan direction, then it must follow that it's also + * "before the start of matches" for the actual current scan direction. + * It is therefore essential that all of our initial positioning rules are + * symmetric with _bt_checkkeys's corresponding continuescan=false rule. * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might * need to be kept in sync. *---------- */ - strat_total = BTEqualStrategyNumber; if (so->numberOfKeys > 0) { AttrNumber curattr; - ScanKey chosen; + ScanKey bkey; ScanKey impliesNN; ScanKey cur; /* - * chosen is the so-far-chosen key for the current attribute, if any. - * We don't cast the decision in stone until we reach keys for the - * next attribute. + * bkey will be set to the key that preprocessing left behind as the + * boundary key for this attribute, in this scan direction (if any) */ cur = so->keyData; curattr = 1; - chosen = NULL; + bkey = NULL; /* Also remember any scankey that implies a NOT NULL constraint */ impliesNN = NULL; @@ -1053,23 +1060,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { + /* Done looking for the curattr boundary key */ + Assert(bkey == NULL || + (bkey->sk_attno == curattr && + (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + Assert(impliesNN == NULL || + (impliesNN->sk_attno == curattr && + (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + /* - * Done looking at keys for curattr. - * * If this is a scan key for a skip array whose current * element is MINVAL, choose low_compare (when scanning * backwards it'll be MAXVAL, and we'll choose high_compare). * - * Note: if the array's low_compare key makes 'chosen' NULL, + * Note: if the array's low_compare key makes 'bkey' NULL, * then we behave as if the array's first element is -inf, * except when !array->null_elem implies a usable NOT NULL * constraint. 
*/ - if (chosen != NULL && - (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + if (bkey != NULL && + (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) { - int ikey = chosen - so->keyData; - ScanKey skipequalitykey = chosen; + int ikey = bkey - so->keyData; + ScanKey skipequalitykey = bkey; BTArrayKeyInfo *array = NULL; for (int arridx = 0; arridx < so->numArrayKeys; arridx++) @@ -1082,42 +1095,41 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (ScanDirectionIsForward(dir)) { Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); - chosen = array->low_compare; + bkey = array->low_compare; } else { Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); - chosen = array->high_compare; + bkey = array->high_compare; } - Assert(chosen == NULL || - chosen->sk_attno == skipequalitykey->sk_attno); + Assert(bkey == NULL || + bkey->sk_attno == skipequalitykey->sk_attno); if (!array->null_elem) impliesNN = skipequalitykey; else - Assert(chosen == NULL && impliesNN == NULL); + Assert(bkey == NULL && impliesNN == NULL); } /* * If we didn't find a usable boundary key, see if we can * deduce a NOT NULL key */ - if (chosen == NULL && impliesNN != NULL && + if (bkey == NULL && impliesNN != NULL && ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? ScanDirectionIsForward(dir) : ScanDirectionIsBackward(dir))) { - /* Yes, so build the key in notnullkeys[keysz] */ - chosen = ¬nullkeys[keysz]; - ScanKeyEntryInitialize(chosen, + /* Final startKeys[] entry will be deduced NOT NULL key */ + bkey = ¬nullkey; + ScanKeyEntryInitialize(bkey, (SK_SEARCHNOTNULL | SK_ISNULL | (impliesNN->sk_flags & (SK_BT_DESC | SK_BT_NULLS_FIRST))), curattr, - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - BTGreaterStrategyNumber : - BTLessStrategyNumber), + ScanDirectionIsForward(dir) ? + BTGreaterStrategyNumber : BTLessStrategyNumber, InvalidOid, InvalidOid, InvalidOid, @@ -1125,12 +1137,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * If we still didn't find a usable boundary key, quit; else - * save the boundary key pointer in startKeys. + * If preprocessing didn't leave a usable boundary key, quit; + * else save the boundary key pointer in startKeys[] */ - if (chosen == NULL) + if (bkey == NULL) break; - startKeys[keysz++] = chosen; + startKeys[keysz++] = bkey; /* * We can only consider adding more boundary keys when the one @@ -1138,7 +1150,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * (during backwards scans we can only do so when the key that * we just added to startKeys[] uses the = or <= strategy) */ - strat_total = chosen->sk_strategy; + strat_total = bkey->sk_strategy; if (strat_total == BTGreaterStrategyNumber || strat_total == BTLessStrategyNumber) break; @@ -1149,19 +1161,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * make strat_total > or < (and stop adding boundary keys). * This can only happen with opclasses that lack skip support. 
*/ - if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) { - Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(bkey->sk_flags & SK_BT_SKIP); Assert(strat_total == BTEqualStrategyNumber); if (ScanDirectionIsForward(dir)) { - Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + Assert(!(bkey->sk_flags & SK_BT_PRIOR)); strat_total = BTGreaterStrategyNumber; } else { - Assert(!(chosen->sk_flags & SK_BT_NEXT)); + Assert(!(bkey->sk_flags & SK_BT_NEXT)); strat_total = BTLessStrategyNumber; } @@ -1175,24 +1187,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* * Done if that was the last scan key output by preprocessing. - * Also done if there is a gap index attribute that lacks a - * usable key (only possible when preprocessing was unable to - * generate a skip array key to "fill in the gap"). + * Also done if we've now examined all keys marked required. */ if (i >= so->numberOfKeys || - cur->sk_attno != curattr + 1) + !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) break; /* * Reset for next attr. */ + Assert(cur->sk_attno == curattr + 1); curattr = cur->sk_attno; - chosen = NULL; + bkey = NULL; impliesNN = NULL; } /* - * Can we use this key as a starting boundary for this attr? + * If we've located the starting boundary key for curattr, we have + * no interest in curattr's other required key + */ + if (bkey != NULL) + continue; + + /* + * Is this key the starting boundary key for curattr? * * If not, does it imply a NOT NULL constraint? (Because * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, @@ -1202,27 +1220,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsBackward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsBackward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; case BTEqualStrategyNumber: - /* override any non-equality choice */ - chosen = cur; + bkey = cur; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsForward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsForward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; } } @@ -1248,16 +1259,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(keysz <= INDEX_MAX_KEYS); for (int i = 0; i < keysz; i++) { - ScanKey cur = startKeys[i]; + ScanKey bkey = startKeys[i]; - Assert(cur->sk_attno == i + 1); + Assert(bkey->sk_attno == i + 1); - if (cur->sk_flags & SK_ROW_HEADER) + if (bkey->sk_flags & SK_ROW_HEADER) { /* * Row comparison header: look to the first row member instead */ - ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument); + bool loosen_strat = false, + tighten_strat = false; /* * Cannot be a NULL in the first row member: _bt_preprocess_keys @@ -1265,122 +1278,160 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * ever getting this far */ Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == cur->sk_attno); + Assert(subkey->sk_attno == bkey->sk_attno); Assert(!(subkey->sk_flags & SK_ISNULL)); /* + * This is either a > or >= key (during backwards scans it is + * either < or <=) that was marked required during preprocessing. + * Later so->keyData[] keys can't have been marked required, so + * our row compare header key must be the final startKeys[] entry. 
+ */ + Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)); + Assert(i == keysz - 1); + + /* * The member scankeys are already in insertion format (ie, they * have sk_func = 3-way-comparison function) */ memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); /* - * If the row comparison is the last positioning key we accepted, - * try to add additional keys from the lower-order row members. - * (If we accepted independent conditions on additional index - * columns, we use those instead --- doesn't seem worth trying to - * determine which is more restrictive.) Note that this is OK - * even if the row comparison is of ">" or "<" type, because the - * condition applied to all but the last row member is effectively - * ">=" or "<=", and so the extra keys don't break the positioning - * scheme. But, by the same token, if we aren't able to use all - * the row members, then the part of the row comparison that we - * did use has to be treated as just a ">=" or "<=" condition, and - * so we'd better adjust strat_total accordingly. + * Now look to later row compare members. + * + * If there's an "index attribute gap" between two row compare + * members, the second member won't have been marked required, and + * so can't be used as a starting boundary key here. The part of + * the row comparison that we do still use has to be treated as a + * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)" + * with an omitted intervening index attribute "b" will use an + * insertion scan key "a >= 1". Even the first "a = 1" tuple on + * the leaf level might satisfy the row compare qual. + * + * We're able to use a _more_ restrictive strategy when we reach a + * NULL row compare member, since they're always unsatisfiable. + * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an + * insertion scan key "a > 1". All tuples where "a = 1" cannot + * possibly satisfy the row compare qual, so this is safe. */ - if (i == keysz - 1) + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) { - bool used_all_subkeys = false; + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(!(subkey->sk_flags & SK_ROW_END)); - for (;;) + if (subkey->sk_flags & SK_ISNULL) { - subkey++; - Assert(subkey->sk_flags & SK_ROW_MEMBER); - if (subkey->sk_attno != keysz + 1) - break; /* out-of-sequence, can't use it */ - if (subkey->sk_strategy != cur->sk_strategy) - break; /* wrong direction, can't use it */ - if (subkey->sk_flags & SK_ISNULL) - break; /* can't use null keys */ - Assert(keysz < INDEX_MAX_KEYS); - memcpy(inskey.scankeys + keysz, subkey, - sizeof(ScanKeyData)); - keysz++; - if (subkey->sk_flags & SK_ROW_END) - { - used_all_subkeys = true; - break; - } + /* + * NULL member key, can only use earlier keys. + * + * We deliberately avoid checking if this key is marked + * required. All earlier keys are required, and this key + * is unsatisfiable either way, so we can't miss anything. 
+ */ + tighten_strat = true; + break; } - if (!used_all_subkeys) + + if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { - switch (strat_total) - { - case BTLessStrategyNumber: - strat_total = BTLessEqualStrategyNumber; - break; - case BTGreaterStrategyNumber: - strat_total = BTGreaterEqualStrategyNumber; - break; - } + /* nonrequired member key, can only use earlier keys */ + loosen_strat = true; + break; } - break; /* done with outer loop */ + + Assert(subkey->sk_attno == keysz + 1); + Assert(subkey->sk_strategy == bkey->sk_strategy); + Assert(keysz < INDEX_MAX_KEYS); + + memcpy(inskey.scankeys + keysz, subkey, + sizeof(ScanKeyData)); + keysz++; + if (subkey->sk_flags & SK_ROW_END) + break; } - } - else - { - /* - * Ordinary comparison key. Transform the search-style scan key - * to an insertion scan key by replacing the sk_func with the - * appropriate btree comparison function. - * - * If scankey operator is not a cross-type comparison, we can use - * the cached comparison function; otherwise gotta look it up in - * the catalogs. (That can't lead to infinite recursion, since no - * indexscan initiated by syscache lookup will use cross-data-type - * operators.) - * - * We support the convention that sk_subtype == InvalidOid means - * the opclass input type; this is a hack to simplify life for - * ScanKeyInit(). - */ - if (cur->sk_subtype == rel->rd_opcintype[i] || - cur->sk_subtype == InvalidOid) + Assert(!(loosen_strat && tighten_strat)); + if (loosen_strat) { - FmgrInfo *procinfo; - - procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); - ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - procinfo, - cur->sk_argument); + /* Use less restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } } - else + if (tighten_strat) { - RegProcedure cmp_proc; - - cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], - rel->rd_opcintype[i], - cur->sk_subtype, - BTORDER_PROC); - if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", - BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, - cur->sk_attno, RelationGetRelationName(rel)); - ScanKeyEntryInitialize(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - cmp_proc, - cur->sk_argument); + /* Use more restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessEqualStrategyNumber: + strat_total = BTLessStrategyNumber; + break; + case BTGreaterEqualStrategyNumber: + strat_total = BTGreaterStrategyNumber; + break; + } } + + /* done adding to inskey (row comparison keys always come last) */ + break; + } + + /* + * Ordinary comparison key/search-style key. + * + * Transform the search-style scan key to an insertion scan key by + * replacing the sk_func with the appropriate btree 3-way-comparison + * function. + * + * If scankey operator is not a cross-type comparison, we can use the + * cached comparison function; otherwise gotta look it up in the + * catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) 
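The loosen_strat/tighten_strat adjustments are easiest to follow against the two example quals from the comment above; a compact restatement (toy enum, not PostgreSQL's stratnum constants):

    typedef enum { S_LT = 1, S_LE, S_EQ, S_GE, S_GT } Strat;

    /*
     * "(a, c) > (1, 42)" with a gap on "b": the "c" member isn't required,
     * so only "a" positions the scan, and the strategy must loosen to >=
     * (the very first "a = 1" tuple might still satisfy the row compare).
     */
    static Strat
    loosen(Strat s)
    {
        return s == S_GT ? S_GE : (s == S_LT ? S_LE : s);
    }

    /*
     * "(a, b, c) >= (1, NULL, 77)": the NULL member can never be satisfied,
     * so no "a = 1" tuple can match and the strategy may tighten to >.
     */
    static Strat
    tighten(Strat s)
    {
        return s == S_GE ? S_GT : (s == S_LE ? S_LT : s);
    }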
+ * + * We support the convention that sk_subtype == InvalidOid means the + * opclass input type; this hack simplifies life for ScanKeyInit(). + */ + if (bkey->sk_subtype == rel->rd_opcintype[i] || + bkey->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + procinfo, + bkey->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + bkey->sk_subtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype, + bkey->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + cmp_proc, + bkey->sk_argument); } } @@ -1469,6 +1520,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { + Assert(!so->needPrimScan); + /* * We only get here if the index is completely empty. Lock relation * because nothing finer to lock exists. Without a buffer lock, it's @@ -1487,7 +1540,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { - Assert(!so->needPrimScan); _bt_parallel_done(scan); return false; } @@ -1610,7 +1662,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); so->currPos.prevPage = opaque->btpo_prev; so->currPos.nextPage = opaque->btpo_next; + /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */ + so->currPos.dir = dir; + so->currPos.nextTupleOffset = 0; + /* either moreRight or moreLeft should be set now (may be unset later) */ + Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : + so->currPos.moreLeft); Assert(!P_IGNORE(opaque)); Assert(BTScanPosIsPinned(so->currPos)); Assert(!so->needPrimScan); @@ -1626,14 +1684,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.currPage); } - /* initialize remaining currPos fields related to current page */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - so->currPos.dir = dir; - so->currPos.nextTupleOffset = 0; - /* either moreLeft or moreRight should be set now (may be unset later) */ - Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight : - so->currPos.moreLeft); - PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot); /* initialize local variables */ @@ -2107,10 +2157,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) * * Wrapper on _bt_readnextpage that performs final steps for the current page. * - * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. - * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped - * the pin eagerly earlier on. The scan must have so->currPos.currPage set to - * a valid block, in any case. + * On entry, so->currPos must be valid. Its buffer will be pinned, though + * never locked. (Actually, when so->dropPin there won't even be a pin held, + * though so->currPos.currPage must still be set to a valid block number.) 
*/ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) @@ -2251,12 +2300,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) */ if (_bt_readpage(scan, dir, offnum, true)) { + Relation rel = scan->indexRelation; + /* * _bt_readpage succeeded. Drop the lock (and maybe the pin) on * so->currPos.buf in preparation for btgettuple returning tuples. */ Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(rel, so); return true; } @@ -2278,9 +2329,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * previously-saved right link or left link. lastcurrblkno is the page that * was current at the point where the blkno link was saved, which we use to * reason about concurrent page splits/page deletions during backwards scans. + * In the common case where seized=false, blkno is either so->currPos.nextPage + * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage. * - * On entry, caller shouldn't hold any locks or pins on any page (we work - * directly off of blkno and lastcurrblkno instead). Parallel scan callers + * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must + * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno + * won't need to be read again in almost all cases). Parallel scan callers * that seized the scan before calling here should pass seized=true; such a * caller's blkno and lastcurrblkno arguments come from the seized scan. * seized=false callers just pass us the blkno/lastcurrblkno taken from their @@ -2294,11 +2348,11 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * * On success exit, so->currPos is updated to contain data from the next * interesting page, and we return true. We hold a pin on the buffer on - * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe - * to eagerly drop the pin (to avoid blocking VACUUM). + * success exit (except during so->dropPin index scans, when we drop the pin + * eagerly to avoid blocking VACUUM). * - * If there are no more matching records in the given direction, we drop all - * locks and pins, invalidate so->currPos, and return false. + * If there are no more matching records in the given direction, we invalidate + * so->currPos (while ensuring it retains no locks or pins), and return false. * * We always release the scan for a parallel scan caller, regardless of * success or failure; we'll call _bt_parallel_release as soon as possible. @@ -2413,7 +2467,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, */ Assert(so->currPos.currPage == blkno); Assert(BTScanPosIsPinned(so->currPos)); - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(rel, so); return true; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 3794cc924ad..9d70e89c1f3 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -105,7 +105,7 @@ typedef struct BTShared int scantuplesortstates; /* Query ID, for report in worker processes */ - uint64 queryid; + int64 queryid; /* * workersdonecv is used to monitor the progress of workers. 
All parallel diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1a15dfcb7d3..9aed207995f 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -44,7 +44,6 @@ static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *arra static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, bool *skip_array_set); -static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind); @@ -52,7 +51,6 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required); #ifdef USE_ASSERT_CHECKING -static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); #endif static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, @@ -1035,73 +1033,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, } /* - * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required - * - * Called when _bt_advance_array_keys decides to start a new primitive index - * scan on the basis of the current scan position being before the position - * that _bt_first is capable of repositioning the scan to by applying an - * inequality operator required in the opposite-to-scan direction only. - * - * Although equality strategy scan keys (for both arrays and non-arrays alike) - * are either marked required in both directions or in neither direction, - * there is a sense in which non-required arrays behave like required arrays. - * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)", - * the scan key on "c" is non-required, but nevertheless enables positioning - * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the - * first descent of the tree by _bt_first. Later on, there could also be a - * second descent, that places the scan right before tuples >= "(200, 3, 5)". - * _bt_first must never be allowed to build an insertion scan key whose "c" - * entry is set to a value other than 5, the "c" array's first element/value. - * (Actually, it's the first in the current scan direction. This example uses - * a forward scan.) - * - * Calling here resets the array scan key elements for the scan's non-required - * arrays. This is strictly necessary for correctness in a subset of cases - * involving "required in opposite direction"-triggered primitive index scans. - * Not all callers are at risk of _bt_first using a non-required array like - * this, but advancement always resets the arrays when another primitive scan - * is scheduled, just to keep things simple. Array advancement even makes - * sure to reset non-required arrays during scans that have no inequalities. - * (Advancement still won't call here when there are no inequalities, though - * that's just because it's all handled indirectly instead.) - * - * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that - * everybody got this right. - * - * Note: In practice almost all SAOP arrays are marked required during - * preprocessing (if necessary by generating skip arrays). 
It is hardly ever - * truly necessary to call here, but consistently doing so is simpler. - */ -static void -_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - Assert(array->scan_key == ikey); - - if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) - continue; - - Assert(array->num_elems != -1); /* No non-required skip arrays */ - - _bt_array_set_low_or_high(rel, cur, array, - ScanDirectionIsForward(dir)); - } -} - -/* * _bt_tuple_before_array_skeys() -- too early to advance required arrays? * * We always compare the tuple using the current array keys (which we assume @@ -1380,8 +1311,6 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) */ if (so->needPrimScan) { - Assert(_bt_verify_arrays_bt_first(scan, dir)); - /* * Flag was set -- must call _bt_first again, which will reset the * scan's needPrimScan flag @@ -2007,14 +1936,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ else if (has_required_opposite_direction_only && pstate->finaltup && unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) - { - /* - * Make sure that any SAOP arrays that were not marked required by - * preprocessing are reset to their first element for this direction - */ - _bt_rewind_nonrequired_arrays(scan, dir); goto new_prim_scan; - } continue_scan: @@ -2045,8 +1967,6 @@ continue_scan: */ so->oppositeDirCheck = has_required_opposite_direction_only; - _bt_rewind_nonrequired_arrays(scan, dir); - /* * skip by setting "look ahead" mechanism's offnum for forwards scans * (backwards scans check scanBehind flag directly instead) @@ -2143,48 +2063,6 @@ end_toplevel_scan: #ifdef USE_ASSERT_CHECKING /* - * Verify that the scan's qual state matches what we expect at the point that - * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. - * - * We enforce a rule against non-required array scan keys: they must start out - * with whatever element is the first for the scan's current scan direction. - * See _bt_rewind_nonrequired_arrays comments for an explanation. 
- */ -static bool -_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - int first_elem_dir; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - - if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) - continue; - - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array->cur_elem != first_elem_dir) - return false; - } - - return _bt_verify_keys_with_arraykeys(scan); -} - -/* * Verify that the scan's "so->keyData[]" scan keys are in agreement with * its array key state */ @@ -2194,6 +2072,7 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) BTScanOpaque so = (BTScanOpaque) scan->opaque; int last_sk_attno = InvalidAttrNumber, arrayidx = 0; + bool nonrequiredseen = false; if (!so->qual_ok) return false; @@ -2217,8 +2096,16 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->num_elems != -1 && cur->sk_argument != array->elem_values[array->cur_elem]) return false; - if (last_sk_attno > cur->sk_attno) - return false; + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + if (last_sk_attno > cur->sk_attno) + return false; + if (nonrequiredseen) + return false; + } + else + nonrequiredseen = true; + last_sk_attno = cur->sk_attno; } @@ -2551,37 +2438,12 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { /* Scan key isn't marked required (corner case) */ - Assert(!(key->sk_flags & SK_ROW_HEADER)); break; /* unsafe */ } if (key->sk_flags & SK_ROW_HEADER) { - /* - * RowCompare inequality. - * - * Only the first subkey from a RowCompare can ever be marked - * required (that happens when the row header is marked required). - * There is no simple, general way for us to transitively deduce - * whether or not every tuple on the page satisfies a RowCompare - * key based only on firsttup and lasttup -- so we just give up. - */ - if (!start_past_saop_eq && !so->skipScan) - break; /* unsafe to go further */ - - /* - * We have to be even more careful with RowCompares that come - * after an array: we assume it's unsafe to even bypass the array. - * Calling _bt_start_array_keys to recover the scan's arrays - * following use of forcenonrequired mode isn't compatible with - * _bt_check_rowcompare's continuescan=false behavior with NULL - * row compare members. _bt_advance_array_keys must not make a - * decision on the basis of a key not being satisfied in the - * opposite-to-scan direction until the scan reaches a leaf page - * where the same key begins to be satisfied in scan direction. - * The _bt_first !used_all_subkeys behavior makes this limitation - * hard to work around some other way. 
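The extra nonrequiredseen bookkeeping above enforces the layout that the key-reordering pass near the top of this section established: required keys form a prefix of so->keyData[], in attribute order, with all unmarked keys moved to the tail. A standalone checker for that shape (simplified key struct of my own, not the assert code itself):

    #include <stdbool.h>

    typedef struct ToyKey
    {
        int  attno;         /* index attribute, 1-based */
        bool required;      /* SK_BT_REQFWD/SK_BT_REQBKWD equivalent */
    } ToyKey;

    static bool
    keys_well_ordered(const ToyKey *keys, int nkeys)
    {
        bool seen_nonrequired = false;
        int  last_attno = 0;

        for (int i = 0; i < nkeys; i++)
        {
            if (!keys[i].required)
            {
                seen_nonrequired = true;
                continue;
            }
            if (seen_nonrequired)
                return false;       /* required key after a non-required one */
            if (keys[i].attno < last_attno)
                return false;       /* required prefix out of attribute order */
            last_attno = keys[i].attno;
        }
        return true;
    }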
- */ - return; /* completely unsafe to set pstate.startikey */ + /* RowCompare inequalities currently aren't supported */ + break; /* "unsafe" */ } if (key->sk_strategy != BTEqualStrategyNumber) { @@ -3078,6 +2940,31 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, Assert(subkey->sk_flags & SK_ROW_MEMBER); + /* When a NULL row member is compared, the row never matches */ + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case + * (except when it's the first row element that has the NULL arg). + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. + */ + Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); + subkey--; + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + if (subkey->sk_attno > tupnatts) { /* @@ -3087,11 +2974,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(BTreeTupleIsPivot(tuple)); - cmpresult = 0; - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - continue; + return true; } datum = index_getattr(tuple, @@ -3101,6 +2984,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { + int reqflags; + if (forcenonrequired) { /* treating scan's keys as non-required */ @@ -3111,15 +2996,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * Since NULLs are sorted before non-NULLs, we know we have * reached the lower limit of the range of values for this * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - * (_bt_advance_array_keys also relies on this behavior during - * forward scans.) + * is one of the "must match" subset. However, on a forwards + * scan, we must keep going, because we may have initially + * positioned to the start of the index. + * + * All required NULLS FIRST > row members can use NULL tuple + * values to end backwards scans, just like with other values. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a + * backwards scan upon reaching the index's rightmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42. */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + reqflags = SK_BT_REQBKWD; + + /* + * When a most significant required NULLS FIRST < row compare + * member sees NULL tuple values during a backwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a + * backwards scan upon reaching the rightmost tuple whose "a" + * column has a NULL. The "a" NULL value is "<" 9, and yet + * our < row compare will still end the scan. (This isn't + * safe with later/lower-order row members. 
Notice that it + * can only happen with an "a" NULL some time after the scan + * completely stops needing to use its "b" and "c" members.) + */ + if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument)) + reqflags |= SK_BT_REQFWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && ScanDirectionIsBackward(dir)) *continuescan = false; } @@ -3129,15 +3034,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * Since NULLs are sorted after non-NULLs, we know we have * reached the upper limit of the range of values for this * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. - * (_bt_advance_array_keys also relies on this behavior during - * backward scans.) + * one of the "must match" subset. However, on a backward + * scan, we must keep going, because we may have initially + * positioned to the end of the index. + * + * All required NULLS LAST < row members can use NULL tuple + * values to end forwards scans, just like with other values. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a + * forwards scan upon reaching the index's leftmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42. + */ + reqflags = SK_BT_REQFWD; + + /* + * When a most significant required NULLS LAST > row compare + * member sees NULL tuple values during a forwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a + * forwards scan upon reaching the leftmost tuple whose "a" + * column has a NULL. The "a" NULL value is ">" 9, and yet + * our > row compare will end the scan. (This isn't safe with + * later/lower-order row members. Notice that it can only + * happen with an "a" NULL some time after the scan completely + * stops needing to use its "b" and "c" members.) */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument)) + reqflags |= SK_BT_REQBKWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -3148,30 +3073,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, return false; } - if (subkey->sk_flags & SK_ISNULL) - { - /* - * Unlike the simple-scankey case, this isn't a disallowed case - * (except when it's the first row element that has the NULL arg). - * But it can never match. If all the earlier row comparison - * columns are required for the scan direction, we can stop the - * scan, because there can't be another tuple that will succeed. 
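The two reqflags blocks are symmetric; condensing them into a single predicate makes the symmetry easier to see (toy types and names; ">" row members are the ones marked required-backward, "<" members required-forward):

    #include <stdbool.h>

    typedef enum { DIR_FORWARD, DIR_BACKWARD } Direction;

    /* Can a NULL tuple value in this row-compare member end the scan? */
    static bool
    null_ends_scan(bool nulls_first, bool first_member,
                   bool req_fwd, bool req_bkwd, Direction dir)
    {
        if (nulls_first)
        {
            /* NULLs sort lowest: only a backward scan can run off the end */
            if (dir != DIR_BACKWARD)
                return false;
            /* ">" members always qualify; a "<" member only as first column */
            return req_bkwd || (first_member && req_fwd);
        }
        else
        {
            /* NULLs sort highest: only a forward scan can run off the end */
            if (dir != DIR_FORWARD)
                return false;
            /* "<" members always qualify; a ">" member only as first column */
            return req_fwd || (first_member && req_bkwd);
        }
    }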
- */ - Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); - subkey--; - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - return false; - } - /* Perform the test --- three-way comparison not bool operator */ cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, subkey->sk_collation, @@ -3330,87 +3231,85 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, * current page and killed tuples thereon (generally, this should only be * called if so->numKilled > 0). * - * The caller does not have a lock on the page and may or may not have the - * page pinned in a buffer. Note that read-lock is sufficient for setting - * LP_DEAD status (which is only a hint). - * - * We match items by heap TID before assuming they are the right ones to - * delete. We cope with cases where items have moved right due to insertions. - * If an item has moved off the current page due to a split, we'll fail to - * find it and do nothing (this is not an error case --- we assume the item - * will eventually get marked in a future indexscan). + * Caller should not have a lock on the so->currPos page, but must hold a + * buffer pin when !so->dropPin. When we return, it still won't be locked. + * It'll continue to hold whatever pins were held before calling here. * - * Note that if we hold a pin on the target page continuously from initially - * reading the items until applying this function, VACUUM cannot have deleted - * any items from the page, and so there is no need to search left from the - * recorded offset. (This observation also guarantees that the item is still - * the right one to delete, which might otherwise be questionable since heap - * TIDs can get recycled.) This holds true even if the page has been modified - * by inserts and page splits, so there is no need to consult the LSN. + * We match items by heap TID before assuming they are the right ones to set + * LP_DEAD. If the scan is one that holds a buffer pin on the target page + * continuously from initially reading the items until applying this function + * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the + * page, so the page's TIDs can't have been recycled by now. There's no risk + * that we'll confuse a new index tuple that happens to use a recycled TID + * with a now-removed tuple with the same TID (that used to be on this same + * page). We can't rely on that during scans that drop buffer pins eagerly + * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on + * the page LSN having not changed since back when _bt_readpage saw the page. + * We totally give up on setting LP_DEAD bits when the page LSN changed. * - * If the pin was released after reading the page, then we re-read it. If it - * has been modified since we read it (as determined by the LSN), we dare not - * flag any entries because it is possible that the old entry was vacuumed - * away and the TID was re-used by a completely different heap tuple. + * We give up much less often during !so->dropPin scans, but it still happens. + * We cope with cases where items have moved right due to insertions. If an + * item has moved off the current page due to a split, we'll fail to find it + * and just give up on it. 
*/ void _bt_killitems(IndexScanDesc scan) { + Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int i; int numKilled = so->numKilled; bool killedsomething = false; - bool droppedpin PG_USED_FOR_ASSERTS_ONLY; + Buffer buf; + Assert(numKilled > 0); Assert(BTScanPosIsValid(so->currPos)); + Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */ - /* - * Always reset the scan state, so we don't look for same items on other - * pages. - */ + /* Always invalidate so->killedItems[] before leaving so->currPos */ so->numKilled = 0; - if (BTScanPosIsPinned(so->currPos)) + if (!so->dropPin) { /* * We have held the pin on this page since we read the index tuples, * so all we need to do is lock it. The pin will have prevented - * re-use of any TID on the page, so there is no need to check the - * LSN. + * concurrent VACUUMs from recycling any of the TIDs on the page. */ - droppedpin = false; - _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ); - - page = BufferGetPage(so->currPos.buf); + Assert(BTScanPosIsPinned(so->currPos)); + buf = so->currPos.buf; + _bt_lockbuf(rel, buf, BT_READ); } else { - Buffer buf; + XLogRecPtr latestlsn; - droppedpin = true; - /* Attempt to re-read the buffer, getting pin and lock. */ - buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + Assert(!BTScanPosIsPinned(so->currPos)); + Assert(RelationNeedsWAL(rel)); + buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); - page = BufferGetPage(buf); - if (BufferGetLSNAtomic(buf) == so->currPos.lsn) - so->currPos.buf = buf; - else + latestlsn = BufferGetLSNAtomic(buf); + Assert(!XLogRecPtrIsInvalid(so->currPos.lsn)); + Assert(so->currPos.lsn <= latestlsn); + if (so->currPos.lsn != latestlsn) { - /* Modified while not pinned means hinting is not safe. */ - _bt_relbuf(scan->indexRelation, buf); + /* Modified, give up on hinting */ + _bt_relbuf(rel, buf); return; } + + /* Unmodified, hinting is safe */ } + page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); - for (i = 0; i < numKilled; i++) + for (int i = 0; i < numKilled; i++) { int itemIndex = so->killedItems[i]; BTScanPosItem *kitem = &so->currPos.items[itemIndex]; @@ -3442,7 +3341,7 @@ _bt_killitems(IndexScanDesc scan) * correctness. * * Note that the page may have been modified in almost any way - * since we first read it (in the !droppedpin case), so it's + * since we first read it (in the !so->dropPin case), so it's * possible that this posting list tuple wasn't a posting list * tuple when we first encountered its heap TIDs. */ @@ -3458,7 +3357,7 @@ _bt_killitems(IndexScanDesc scan) * though only in the common case where the page can't * have been concurrently modified */ - Assert(kitem->indexOffset == offnum || !droppedpin); + Assert(kitem->indexOffset == offnum || !so->dropPin); /* * Read-ahead to later kitems here. 
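The heart of the revised _bt_killitems contract is an optimistic version check: a dropPin scan remembers the page LSN on the way out and refuses to set LP_DEAD hints if it has moved. A self-contained toy of that protocol (a bare counter stands in for the LSN; all names are mine):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct ToyPage
    {
        uint64_t lsn;       /* bumped by every WAL-logged modification */
    } ToyPage;

    typedef struct ToyScanPos
    {
        uint64_t saved_lsn; /* LSN observed before dropping the pin */
    } ToyScanPos;

    /* leaving the page: remember its LSN instead of holding a pin */
    static void
    leave_page(ToyScanPos *pos, const ToyPage *page)
    {
        pos->saved_lsn = page->lsn;
    }

    /* coming back to set hint bits: only safe if nothing changed meanwhile */
    static bool
    safe_to_set_hints(const ToyScanPos *pos, const ToyPage *page)
    {
        return page->lsn == pos->saved_lsn;
    }

    int
    main(void)
    {
        ToyPage    page = {.lsn = 100};
        ToyScanPos pos;

        leave_page(&pos, &page);
        printf("unchanged page: %s\n", safe_to_set_hints(&pos, &page) ? "hint" : "skip");

        page.lsn = 101;     /* concurrent VACUUM modified the page */
        printf("modified page:  %s\n", safe_to_set_hints(&pos, &page) ? "hint" : "skip");
        return 0;
    }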
@@ -3522,10 +3421,13 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + MarkBufferDirtyHint(buf, true); } - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!so->dropPin) + _bt_unlockbuf(rel, buf); + else + _bt_relbuf(rel, buf); } diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c index 5dd74233996..35e3af2903e 100644 --- a/src/backend/access/rmgrdesc/replorigindesc.c +++ b/src/backend/access/rmgrdesc/replorigindesc.c @@ -29,7 +29,7 @@ replorigin_desc(StringInfo buf, XLogReaderState *record) xlrec = (xl_replorigin_set *) rec; - appendStringInfo(buf, "set %u; lsn %X/%X; force: %d", + appendStringInfo(buf, "set %u; lsn %X/%08X; force: %d", xlrec->node_id, LSN_FORMAT_ARGS(xlrec->remote_lsn), xlrec->force); diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 715cc1f7bad..f0f696855b9 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -252,6 +252,8 @@ ParsePrepareRecord(uint8 info, xl_xact_prepare *xlrec, xl_xact_parsed_prepare *p parsed->nsubxacts = xlrec->nsubxacts; parsed->nrels = xlrec->ncommitrels; parsed->nabortrels = xlrec->nabortrels; + parsed->nstats = xlrec->ncommitstats; + parsed->nabortstats = xlrec->nabortstats; parsed->nmsgs = xlrec->ninvalmsgs; strncpy(parsed->twophase_gid, bufptr, xlrec->gidlen); @@ -357,7 +359,7 @@ xact_desc_commit(StringInfo buf, uint8 info, xl_xact_commit *xlrec, RepOriginId if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) { - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); @@ -382,7 +384,7 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec, RepOriginId or if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN) { - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); @@ -416,7 +418,7 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI * way as PrepareRedoAdd(). 
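The rmgr desc changes in this area all make one mechanical substitution: %X/%X becomes %X/%08X, zero-padding the low half of the LSN to a fixed eight hex digits. A quick demo of what that buys (LSN_ARGS below is my stand-in for LSN_FORMAT_ARGS, assuming the usual high-word/low-word split):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t XLogRecPtr;

    /* stand-in for LSN_FORMAT_ARGS(): high 32 bits, then low 32 bits */
    #define LSN_ARGS(lsn) (uint32_t) ((lsn) >> 32), (uint32_t) (lsn)

    int
    main(void)
    {
        XLogRecPtr a = 0x0FFFFFFF;      /* numerically smaller */
        XLogRecPtr b = 0x10000000;      /* numerically larger  */

        printf("%X/%X\n%X/%X\n", LSN_ARGS(a), LSN_ARGS(b));
        /* "0/FFFFFFF" vs "0/10000000": variable width, misorders as text */

        printf("%X/%08X\n%X/%08X\n", LSN_ARGS(a), LSN_ARGS(b));
        /* "0/0FFFFFFF" vs "0/10000000": fixed width, textual order matches */
        return 0;
    }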
*/ if (origin_id != InvalidRepOriginId) - appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s", + appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s", origin_id, LSN_FORMAT_ARGS(parsed.origin_lsn), timestamptz_to_str(parsed.origin_timestamp)); diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 58040f28656..cd6c2a2f650 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -65,7 +65,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) { CheckPoint *checkpoint = (CheckPoint *) rec; - appendStringInfo(buf, "redo %X/%X; " + appendStringInfo(buf, "redo %X/%08X; " "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; " "oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " @@ -111,7 +111,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) XLogRecPtr startpoint; memcpy(&startpoint, rec, sizeof(XLogRecPtr)); - appendStringInfo(buf, "%X/%X", LSN_FORMAT_ARGS(startpoint)); + appendStringInfo(buf, "%X/%08X", LSN_FORMAT_ARGS(startpoint)); } else if (info == XLOG_PARAMETER_CHANGE) { @@ -156,7 +156,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) xl_overwrite_contrecord xlrec; memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord)); - appendStringInfo(buf, "lsn %X/%X; time %s", + appendStringInfo(buf, "lsn %X/%08X; time %s", LSN_FORMAT_ARGS(xlrec.overwritten_lsn), timestamptz_to_str(xlrec.overwrite_time)); } diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 95fea74e296..9b86c016acb 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -785,7 +785,7 @@ SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum) else if (att->attlen > 0) size = att->attlen; else - size = VARSIZE_ANY(datum); + size = VARSIZE_ANY(DatumGetPointer(datum)); return MAXALIGN(size); } @@ -804,7 +804,7 @@ memcpyInnerDatum(void *target, SpGistTypeDesc *att, Datum datum) } else { - size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum); + size = (att->attlen > 0) ? 
att->attlen : VARSIZE_ANY(DatumGetPointer(datum)); memcpy(target, DatumGetPointer(datum), size); } } diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c index b60fab0a4d2..11f97d65367 100644 --- a/src/backend/access/table/toast_helper.c +++ b/src/backend/access/table/toast_helper.c @@ -330,7 +330,7 @@ toast_delete_external(Relation rel, const Datum *values, const bool *isnull, if (isnull[i]) continue; - else if (VARATT_IS_EXTERNAL_ONDISK(value)) + else if (VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(value))) toast_delete_datum(rel, value, is_speculative); } } diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 48f10bec91e..e80fbe109cf 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -110,9 +110,7 @@ static SlruCtlData XactCtlData; #define XactCtl (&XactCtlData) -static int ZeroCLOGPage(int64 pageno, bool writeXlog); static bool CLOGPagePrecedes(int64 page1, int64 page2); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, @@ -832,41 +830,8 @@ check_transaction_buffers(int *newval, void **extra, GucSource source) void BootStrapCLOG(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(XactCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the commit log */ - slotno = ZeroCLOGPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of CLOG to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. 
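The spgutils and toast_helper hunks apply the same mechanical fix that recurs throughout this patch set: varlena macros such as VARSIZE_ANY and VARATT_IS_EXTERNAL_ONDISK operate on pointers, so call sites now spell out the Datum conversion instead of relying on an implicit one. In spirit (a two-line sketch, assuming the usual definition of Datum as an integer-width scalar, not a pointer type):

    /* the varlena macros dereference their argument, so decode it first */
    struct varlena *vl = (struct varlena *) DatumGetPointer(datum);
    Size len = VARSIZE_ANY(vl);         /* rather than VARSIZE_ANY(datum) */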
- */ -static int -ZeroCLOGPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(XactCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(XactCtl, 0); } /* @@ -974,8 +939,9 @@ ExtendCLOG(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCLOGPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(XactCtl, pageno); + XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -1068,17 +1034,6 @@ CLOGPagePrecedes(int64 page1, int64 page2) /* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); -} - -/* * Write a TRUNCATE xlog record * * We must flush the xlog record to disk before returning --- see notes @@ -1114,19 +1069,9 @@ clog_redo(XLogReaderState *record) if (info == CLOG_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(XactCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(XactCtl, pageno); } else if (info == CLOG_TRUNCATE) { diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 113fae1437a..370b38e048b 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -114,11 +114,9 @@ static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, RepOriginId nodeid, int slotno); static void error_commit_ts_disabled(void); -static int ZeroCommitTsPage(int64 pageno, bool writeXlog); static bool CommitTsPagePrecedes(int64 page1, int64 page2); static void ActivateCommitTs(void); static void DeactivateCommitTs(void); -static void WriteZeroPageXlogRec(int64 pageno); static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid); /* @@ -603,28 +601,6 @@ BootStrapCommitTs(void) } /* - * Initialize (or reinitialize) a page of CommitTs to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroCommitTsPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(CommitTsCtl, pageno); - - if (writeXlog) - WriteZeroPageXlogRec(pageno); - - return slotno; -} - -/* * This must be called ONCE during postmaster or standalone-backend startup, * after StartupXLOG has initialized TransamVariables->nextXid. */ @@ -707,6 +683,13 @@ ActivateCommitTs(void) TransactionId xid; int64 pageno; + /* + * During bootstrap, we should not register commit timestamps, so skip the + * activation in this case.
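A note on the new helper used above: XLogSimpleInsertInt64() is not defined anywhere in this diff. Judging from the WriteZeroPageXlogRec() bodies it replaces, it presumably wraps the standard three-call WAL insertion sequence for a single int64 payload. A minimal sketch, assuming the usual xloginsert.h API:

/*
 * Sketch only, not part of this diff: presumed shape of
 * XLogSimpleInsertInt64(), inferred from the deleted
 * WriteZeroPageXlogRec() implementations.
 */
#include "postgres.h"
#include "access/xloginsert.h"

XLogRecPtr
XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value)
{
    XLogBeginInsert();
    XLogRegisterData(&value, sizeof(value));
    return XLogInsert(rmid, info);
}

Note how the redo side stays symmetric: clog_redo() above still memcpy()s the int64 page number straight out of XLogRecGetData().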
+ */ + if (IsBootstrapProcessingMode()) + return; + /* If we've done this already, there's nothing to do */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); if (commitTsShared->commitTsActive) @@ -747,16 +730,7 @@ ActivateCommitTs(void) /* Create the current segment file, if necessary */ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) - { - LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - int slotno; - - LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(lock); - } + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); /* Change the activation status in shared memory. */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); @@ -867,8 +841,12 @@ ExtendCommitTs(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCommitTsPage(pageno, !InRecovery); + /* Zero the page ... */ + SimpleLruZeroPage(CommitTsCtl, pageno); + + /* and make a WAL entry about that, unless we're in REDO */ + if (!InRecovery) + XLogSimpleInsertInt64(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE, pageno); LWLockRelease(lock); } @@ -983,17 +961,6 @@ CommitTsPagePrecedes(int64 page1, int64 page2) /* - * Write a ZEROPAGE xlog record - */ -static void -WriteZeroPageXlogRec(int64 pageno) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); -} - -/* * Write a TRUNCATE xlog record */ static void @@ -1023,19 +990,9 @@ commit_ts_redo(XLogReaderState *record) if (info == COMMIT_TS_ZEROPAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(CommitTsCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(CommitTsCtl, pageno); } else if (info == COMMIT_TS_TRUNCATE) { diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 3c06ac45532..3cb09c3d598 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -401,8 +401,6 @@ static void mXactCachePut(MultiXactId multi, int nmembers, static char *mxstatus_to_string(MultiXactStatus status); /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog); -static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, @@ -413,7 +411,6 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); -static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, @@ -1847,7 +1844,7 @@ AtPrepare_MultiXact(void) * Clean up after successful PREPARE TRANSACTION */ void -PostPrepare_MultiXact(TransactionId xid) +PostPrepare_MultiXact(FullTransactionId fxid) { MultiXactId myOldestMember; @@ -1858,7 +1855,7 @@ PostPrepare_MultiXact(TransactionId xid) myOldestMember = 
OldestMemberMXactId[MyProcNumber]; if (MultiXactIdIsValid(myOldestMember)) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false); /* * Even though storing MultiXactId is atomic, acquire lock to make @@ -1896,10 +1893,10 @@ PostPrepare_MultiXact(TransactionId xid) * Recover the state of a prepared transaction at startup */ void -multixact_twophase_recover(TransactionId xid, uint16 info, +multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false); MultiXactId oldestMember; /* @@ -1917,10 +1914,10 @@ multixact_twophase_recover(TransactionId xid, uint16 info, * Similar to AtEOXact_MultiXact but for COMMIT PREPARED */ void -multixact_twophase_postcommit(TransactionId xid, uint16 info, +multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true); + ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true); Assert(len == sizeof(MultiXactId)); @@ -1932,10 +1929,10 @@ multixact_twophase_postcommit(TransactionId xid, uint16 info, * This is actually just the same as the COMMIT case. */ void -multixact_twophase_postabort(TransactionId xid, uint16 info, +multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - multixact_twophase_postcommit(xid, info, recdata, len); + multixact_twophase_postcommit(fxid, info, recdata, len); } /* @@ -2033,70 +2030,9 @@ check_multixact_member_buffers(int *newval, void **extra, GucSource source) void BootStrapMultiXact(void) { - int slotno; - LWLock *lock; - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the offsets log */ - slotno = ZeroMultiXactOffsetPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0); - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the members log */ - slotno = ZeroMultiXactMemberPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. - * If writeXlog is true, also emit an XLOG record saying we did this. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. 
- */ -static int -ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); - - return slotno; -} - -/* - * Ditto, for MultiXactMember - */ -static int -ZeroMultiXactMemberPage(int64 pageno, bool writeXlog) -{ - int slotno; - - slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); - - if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); - - return slotno; + /* Zero the initial pages and flush them to disk */ + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, 0); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, 0); } /* @@ -2134,7 +2070,7 @@ MaybeExtendOffsetSlru(void) * with creating a new segment file even if the page we're writing is * not the first in it, so this is enough. */ - slotno = ZeroMultiXactOffsetPage(pageno, false); + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); SimpleLruWritePage(MultiXactOffsetCtl, slotno); } @@ -2568,8 +2504,10 @@ ExtendMultiXactOffset(MultiXactId multi) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactOffsetPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, XLOG_MULTIXACT_ZERO_OFF_PAGE, + pageno); LWLockRelease(lock); } @@ -2611,8 +2549,10 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockAcquire(lock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactMemberPage(pageno, true); + /* Zero the page and make a WAL entry about it */ + SimpleLruZeroPage(MultiXactMemberCtl, pageno); + XLogSimpleInsertInt64(RM_MULTIXACT_ID, + XLOG_MULTIXACT_ZERO_MEM_PAGE, pageno); LWLockRelease(lock); } @@ -3348,18 +3288,6 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) } /* - * Write an xlog record reflecting the zeroing of either a MEMBERs or - * OFFSETs page (info shows which) - */ -static void -WriteMZeroPageXlogRec(int64 pageno, uint8 info) -{ - XLogBeginInsert(); - XLogRegisterData(&pageno, sizeof(pageno)); - (void) XLogInsert(RM_MULTIXACT_ID, info); -} - -/* * Write a TRUNCATE xlog record * * We must flush the xlog record to disk before returning --- see notes in @@ -3401,36 +3329,16 @@ multixact_redo(XLogReaderState *record) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactOffsetCtl, pageno); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { int64 pageno; - int slotno; - LWLock *lock; memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - - lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactMemberPage(pageno, false); - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); + SimpleLruZeroAndWritePage(MultiXactMemberCtl, pageno); } else if (info == XLOG_MULTIXACT_CREATE_ID) { diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 
fe56286d9a9..10ec259f382 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -434,6 +434,31 @@ SimpleLruZeroLSNs(SlruCtl ctl, int slotno) } /* + * This is a convenience wrapper for the common case of zeroing a page and + * immediately flushing it to disk. + * + * Control lock is acquired and released here. + */ +void +SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno) +{ + int slotno; + LWLock *lock; + + lock = SimpleLruGetBankLock(ctl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(ctl, pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(ctl, slotno); + Assert(!ctl->shared->page_dirty[slotno]); + + LWLockRelease(lock); +} + +/* * Wait for any active I/O on a page slot to finish. (This does not * guarantee that new I/O hasn't been started before we return, though. * In fact the slot might not even contain the same page anymore.) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 15153618fad..09aace9e09f 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -74,7 +74,6 @@ static SlruCtlData SubTransCtlData; #define SubTransCtl (&SubTransCtlData) -static int ZeroSUBTRANSPage(int64 pageno); static bool SubTransPagePrecedes(int64 page1, int64 page2); @@ -269,33 +268,8 @@ check_subtrans_buffers(int *newval, void **extra, GucSource source) void BootStrapSUBTRANS(void) { - int slotno; - LWLock *lock = SimpleLruGetBankLock(SubTransCtl, 0); - - LWLockAcquire(lock, LW_EXCLUSIVE); - - /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); - - /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno); - Assert(!SubTransCtl->shared->page_dirty[slotno]); - - LWLockRelease(lock); -} - -/* - * Initialize (or reinitialize) a page of SUBTRANS to zeroes. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroSUBTRANSPage(int64 pageno) -{ - return SimpleLruZeroPage(SubTransCtl, pageno); + /* Zero the initial page and flush it to disk */ + SimpleLruZeroAndWritePage(SubTransCtl, 0); } /* @@ -335,7 +309,7 @@ StartupSUBTRANS(TransactionId oldestActiveXID) prevlock = lock; } - (void) ZeroSUBTRANSPage(startPage); + (void) SimpleLruZeroPage(SubTransCtl, startPage); if (startPage == endPage) break; @@ -395,7 +369,7 @@ ExtendSUBTRANS(TransactionId newestXact) LWLockAcquire(lock, LW_EXCLUSIVE); /* Zero the page */ - ZeroSUBTRANSPage(pageno); + SimpleLruZeroPage(SubTransCtl, pageno); LWLockRelease(lock); } diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index a27f27cc037..186eb91f609 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -154,7 +154,7 @@ readTimeLineHistory(TimeLineID targetTLI) if (*ptr == '\0' || *ptr == '#') continue; - nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo); if (nfields < 1) { @@ -399,7 +399,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, * parent file failed to end with one. */ snprintf(buffer, sizeof(buffer), - "%s%u\t%X/%X\t%s\n", + "%s%u\t%X/%08X\t%s\n", (srcfd < 0) ? 
"" : "\n", parentTLI, LSN_FORMAT_ARGS(switchpoint), diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 73a80559194..7918176fc58 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -159,7 +159,7 @@ typedef struct GlobalTransactionData */ XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */ XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */ - TransactionId xid; /* The GXACT id */ + FullTransactionId fxid; /* The GXACT full xid */ Oid owner; /* ID of user that executed the xact */ ProcNumber locking_backend; /* backend currently working on the xact */ @@ -197,6 +197,7 @@ static GlobalTransaction MyLockedGxact = NULL; static bool twophaseExitRegistered = false; +static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning); static void RecordTransactionCommitPrepared(TransactionId xid, int nchildren, TransactionId *children, @@ -216,19 +217,19 @@ static void RecordTransactionAbortPrepared(TransactionId xid, int nstats, xl_xact_stats_item *stats, const char *gid); -static void ProcessRecords(char *bufptr, TransactionId xid, +static void ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[]); static void RemoveGXact(GlobalTransaction gxact); static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); -static char *ProcessTwoPhaseBuffer(TransactionId xid, +static char *ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid); -static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, +static void MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid); -static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); -static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); +static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning); +static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len); /* * Initialization of shared memory @@ -356,7 +357,7 @@ PostPrepare_Twophase(void) * Reserve the GID for the given transaction. */ GlobalTransaction -MarkAsPreparing(TransactionId xid, const char *gid, +MarkAsPreparing(FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { GlobalTransaction gxact; @@ -407,7 +408,7 @@ MarkAsPreparing(TransactionId xid, const char *gid, gxact = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = gxact->next; - MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid); + MarkAsPreparingGuts(gxact, fxid, gid, prepared_at, owner, databaseid); gxact->ondisk = false; @@ -430,11 +431,13 @@ MarkAsPreparing(TransactionId xid, const char *gid, * Note: This function should be called with appropriate locks held. 
*/ static void -MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, - TimestampTz prepared_at, Oid owner, Oid databaseid) +MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, + const char *gid, TimestampTz prepared_at, Oid owner, + Oid databaseid) { PGPROC *proc; int i; + TransactionId xid = XidFromFullTransactionId(fxid); Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); @@ -479,7 +482,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->subxidStatus.count = 0; gxact->prepared_at = prepared_at; - gxact->xid = xid; + gxact->fxid = fxid; gxact->owner = owner; gxact->locking_backend = MyProcNumber; gxact->valid = false; @@ -797,12 +800,12 @@ pg_prepared_xact(PG_FUNCTION_ARGS) * caller had better hold it. */ static GlobalTransaction -TwoPhaseGetGXact(TransactionId xid, bool lock_held) +TwoPhaseGetGXact(FullTransactionId fxid, bool lock_held) { GlobalTransaction result = NULL; int i; - static TransactionId cached_xid = InvalidTransactionId; + static FullTransactionId cached_fxid = {InvalidTransactionId}; static GlobalTransaction cached_gxact = NULL; Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock)); @@ -811,7 +814,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called * repeatedly for the same XID. We can save work with a simple cache. */ - if (xid == cached_xid) + if (FullTransactionIdEquals(fxid, cached_fxid)) return cached_gxact; if (!lock_held) @@ -821,7 +824,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) { GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) + if (FullTransactionIdEquals(gxact->fxid, fxid)) { result = gxact; break; @@ -832,9 +835,10 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) LWLockRelease(TwoPhaseStateLock); if (result == NULL) /* should not happen */ - elog(ERROR, "failed to find GlobalTransaction for xid %u", xid); + elog(ERROR, "failed to find GlobalTransaction for xid %u", + XidFromFullTransactionId(fxid)); - cached_xid = xid; + cached_fxid = fxid; cached_gxact = result; return result; @@ -881,7 +885,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, *have_more = true; break; } - result = gxact->xid; + result = XidFromFullTransactionId(gxact->fxid); } } @@ -892,7 +896,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, /* * TwoPhaseGetDummyProcNumber - * Get the dummy proc number for prepared transaction specified by XID + * Get the dummy proc number for prepared transaction * * Dummy proc numbers are similar to proc numbers of real backends. They * start at MaxBackends, and are unique across all currently active real @@ -900,24 +904,24 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, * TwoPhaseStateLock will not be taken, so the caller had better hold it. */ ProcNumber -TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held) +TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held) { - GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held); return gxact->pgprocno; } /* * TwoPhaseGetDummyProc - * Get the PGPROC that represents a prepared transaction specified by XID + * Get the PGPROC that represents a prepared transaction * * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the * caller had better hold it. 
*/ PGPROC * -TwoPhaseGetDummyProc(TransactionId xid, bool lock_held) +TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held) { - GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held); + GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held); return GetPGProcByNumber(gxact->pgprocno); } @@ -942,10 +946,8 @@ AdjustToFullTransactionId(TransactionId xid) } static inline int -TwoPhaseFilePath(char *path, TransactionId xid) +TwoPhaseFilePath(char *path, FullTransactionId fxid) { - FullTransactionId fxid = AdjustToFullTransactionId(xid); - return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X", EpochFromFullTransactionId(fxid), XidFromFullTransactionId(fxid)); @@ -1049,7 +1051,7 @@ void StartPrepare(GlobalTransaction gxact) { PGPROC *proc = GetPGProcByNumber(gxact->pgprocno); - TransactionId xid = gxact->xid; + TransactionId xid = XidFromFullTransactionId(gxact->fxid); TwoPhaseFileHeader hdr; TransactionId *children; RelFileLocator *commitrels; @@ -1181,7 +1183,11 @@ EndPrepare(GlobalTransaction gxact) * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * RecordTransactionCommit uses DELAY_CHKPT_IN_COMMIT for; see notes + * there.) Note that DELAY_CHKPT_IN_COMMIT is used to find transactions in + * the critical commit section. We need to know about such transactions + * for conflict detection in logical replication. See + * GetOldestActiveTransactionId(true, false) and its use. * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. @@ -1281,10 +1287,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info, * If it looks OK (has a valid magic number and CRC), return the palloc'd * contents of the file, issuing an error when finding corrupted data. If * missing_ok is true, which indicates that missing files can be safely - * ignored, then return NULL. This state can be reached when doing recovery. + * ignored, then return NULL. This state can be reached when doing recovery + * after discarding two-phase files from frozen epochs. 
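With TwoPhaseFilePath() now taking the FullTransactionId directly, the state-file name is just the epoch and xid rendered as two 8-digit uppercase hex fields, which is exactly the 16-character name that restoreTwoPhaseData() scans for further down. A self-contained sketch of the naming scheme (two_phase_file_path() is an illustrative stand-in, not the server function):

#include <stdio.h>
#include <stdint.h>

#define TWOPHASE_DIR "pg_twophase"  /* matches the server's directory */

/* Illustrative stand-in for the server's TwoPhaseFilePath(). */
static void
two_phase_file_path(char *path, size_t pathlen, uint64_t fxid)
{
    uint32_t epoch = (uint32_t) (fxid >> 32);
    uint32_t xid = (uint32_t) fxid;

    snprintf(path, pathlen, TWOPHASE_DIR "/%08X%08X", epoch, xid);
}

int
main(void)
{
    char path[64];

    two_phase_file_path(path, sizeof(path), (2ULL << 32) | 0x9A3F);
    printf("%s\n", path);       /* prints pg_twophase/0000000200009A3F */
    return 0;
}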
*/ static char * -ReadTwoPhaseFile(TransactionId xid, bool missing_ok) +ReadTwoPhaseFile(FullTransactionId fxid, bool missing_ok) { char path[MAXPGPATH]; char *buf; @@ -1296,7 +1303,7 @@ ReadTwoPhaseFile(TransactionId xid, bool missing_ok) file_crc; int r; - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) @@ -1426,12 +1433,12 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from WAL at %X/%X: %s", + errmsg("could not read two-phase state from WAL at %X/%08X: %s", LSN_FORMAT_ARGS(lsn), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read two-phase state from WAL at %X/%X", + errmsg("could not read two-phase state from WAL at %X/%08X", LSN_FORMAT_ARGS(lsn)))); } @@ -1439,7 +1446,7 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) (XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE) ereport(ERROR, (errcode_for_file_access(), - errmsg("expected two-phase state data is not present in WAL at %X/%X", + errmsg("expected two-phase state data is not present in WAL at %X/%08X", LSN_FORMAT_ARGS(lsn)))); if (len != NULL) @@ -1461,6 +1468,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid) char *buf; TwoPhaseFileHeader *hdr; bool result; + FullTransactionId fxid; Assert(TransactionIdIsValid(xid)); @@ -1468,7 +1476,8 @@ StandbyTransactionIdIsPrepared(TransactionId xid) return false; /* nothing to do */ /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, true); + fxid = AdjustToFullTransactionId(xid); + buf = ReadTwoPhaseFile(fxid, true); if (buf == NULL) return false; @@ -1488,6 +1497,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { GlobalTransaction gxact; PGPROC *proc; + FullTransactionId fxid; TransactionId xid; bool ondisk; char *buf; @@ -1509,7 +1519,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) */ gxact = LockGXact(gid, GetUserId()); proc = GetPGProcByNumber(gxact->pgprocno); - xid = gxact->xid; + fxid = gxact->fxid; + xid = XidFromFullTransactionId(fxid); /* * Read and validate 2PC state data. State data will typically be stored @@ -1517,7 +1528,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * to disk if for some reason they have lived for a long time. */ if (gxact->ondisk) - buf = ReadTwoPhaseFile(xid, false); + buf = ReadTwoPhaseFile(fxid, false); else XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); @@ -1636,11 +1647,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit) /* And now do the callbacks */ if (isCommit) - ProcessRecords(bufptr, xid, twophase_postcommit_callbacks); + ProcessRecords(bufptr, fxid, twophase_postcommit_callbacks); else - ProcessRecords(bufptr, xid, twophase_postabort_callbacks); + ProcessRecords(bufptr, fxid, twophase_postabort_callbacks); - PredicateLockTwoPhaseFinish(xid, isCommit); + PredicateLockTwoPhaseFinish(fxid, isCommit); /* * Read this value while holding the two-phase lock, as the on-disk 2PC @@ -1664,7 +1675,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * And now we can clean up any files we may have left. */ if (ondisk) - RemoveTwoPhaseFile(xid, true); + RemoveTwoPhaseFile(fxid, true); MyLockedGxact = NULL; @@ -1677,7 +1688,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record. 
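On the recurring %X/%X to %X/%08X change in these messages: LSN_FORMAT_ARGS() splits a 64-bit LSN into two 32-bit halves, and without zero-padding the low half, LSNs of different magnitude print with different widths, which makes logs hard to scan and string comparison misleading. A standalone illustration (the macro is redefined locally so the snippet builds on its own; the real one lives in access/xlogdefs.h):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

/* Local stand-in for the macro from access/xlogdefs.h. */
#define LSN_FORMAT_ARGS(lsn) ((uint32_t) ((lsn) >> 32)), ((uint32_t) (lsn))

int
main(void)
{
    XLogRecPtr a = (1ULL << 32) | 0xABC;        /* low half 0x00000ABC */
    XLogRecPtr b = (1ULL << 32) | 0xABC0000;    /* low half 0x0ABC0000 */

    /* Old style: widths vary from 1 to 8 digits. */
    printf("%X/%X  %X/%X\n", LSN_FORMAT_ARGS(a), LSN_FORMAT_ARGS(b));
    /* -> 1/ABC  1/ABC0000 */

    /* New style: the low half is always 8 digits. */
    printf("%X/%08X  %X/%08X\n", LSN_FORMAT_ARGS(a), LSN_FORMAT_ARGS(b));
    /* -> 1/00000ABC  1/0ABC0000 */
    return 0;
}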
*/ static void -ProcessRecords(char *bufptr, TransactionId xid, +ProcessRecords(char *bufptr, FullTransactionId fxid, const TwoPhaseCallback callbacks[]) { for (;;) @@ -1691,24 +1702,28 @@ ProcessRecords(char *bufptr, TransactionId xid, bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk)); if (callbacks[record->rmid] != NULL) - callbacks[record->rmid] (xid, record->info, bufptr, record->len); + callbacks[record->rmid] (fxid, record->info, bufptr, record->len); bufptr += MAXALIGN(record->len); } } /* - * Remove the 2PC file for the specified XID. + * Remove the 2PC file. * * If giveWarning is false, do not complain about file-not-present; * this is an expected case during WAL replay. + * + * This routine is used at early stages of recovery, where future and + * past orphaned files are checked, hence the FullTransactionId needed + * to build the complete file name for the removal. */ static void -RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) +RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning) { char path[MAXPGPATH]; - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); if (unlink(path)) if (errno != ENOENT || giveWarning) ereport(WARNING, @@ -1723,7 +1738,7 @@ RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) * Note: content and len don't include CRC. */ static void -RecreateTwoPhaseFile(TransactionId xid, void *content, int len) +RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len) { char path[MAXPGPATH]; pg_crc32c statefile_crc; @@ -1734,7 +1749,7 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len) COMP_CRC32C(statefile_crc, content, len); FIN_CRC32C(statefile_crc); - TwoPhaseFilePath(path, xid); + TwoPhaseFilePath(path, fxid); fd = OpenTransientFile(path, O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); @@ -1846,7 +1861,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) int len; XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); - RecreateTwoPhaseFile(gxact->xid, buf, len); + RecreateTwoPhaseFile(gxact->fxid, buf, len); gxact->ondisk = true; gxact->prepare_start_lsn = InvalidXLogRecPtr; gxact->prepare_end_lsn = InvalidXLogRecPtr; @@ -1897,19 +1912,17 @@ restoreTwoPhaseData(void) if (strlen(clde->d_name) == 16 && strspn(clde->d_name, "0123456789ABCDEF") == 16) { - TransactionId xid; FullTransactionId fxid; char *buf; fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16)); - xid = XidFromFullTransactionId(fxid); - buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr, + buf = ProcessTwoPhaseBuffer(fxid, InvalidXLogRecPtr, true, false, false); if (buf == NULL) continue; - PrepareRedoAdd(buf, InvalidXLogRecPtr, + PrepareRedoAdd(fxid, buf, InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidRepOriginId); } } @@ -1968,9 +1981,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) Assert(gxact->inredo); - xid = gxact->xid; - - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, false, true); @@ -1981,6 +1992,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) * OK, we think this file is valid. Incorporate xid into the * running-minimum result.
*/ + xid = XidFromFullTransactionId(gxact->fxid); if (TransactionIdPrecedes(xid, result)) result = xid; @@ -2036,15 +2048,12 @@ StandbyRecoverPreparedTransactions(void) LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - TransactionId xid; char *buf; GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; Assert(gxact->inredo); - xid = gxact->xid; - - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, true, false); if (buf != NULL) @@ -2077,16 +2086,14 @@ RecoverPreparedTransactions(void) LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { - TransactionId xid; char *buf; GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; + FullTransactionId fxid = gxact->fxid; char *bufptr; TwoPhaseFileHeader *hdr; TransactionId *subxids; const char *gid; - xid = gxact->xid; - /* * Reconstruct subtrans state for the transaction --- needed because * pg_subtrans is not preserved over a restart. Note that we are @@ -2096,17 +2103,20 @@ RecoverPreparedTransactions(void) * SubTransSetParent has been set before, if the prepared transaction * generated xid assignment records. */ - buf = ProcessTwoPhaseBuffer(xid, + buf = ProcessTwoPhaseBuffer(gxact->fxid, gxact->prepare_start_lsn, gxact->ondisk, true, false); if (buf == NULL) continue; ereport(LOG, - (errmsg("recovering prepared transaction %u from shared memory", xid))); + (errmsg("recovering prepared transaction %u of epoch %u from shared memory", + XidFromFullTransactionId(gxact->fxid), + EpochFromFullTransactionId(gxact->fxid)))); hdr = (TwoPhaseFileHeader *) buf; - Assert(TransactionIdEquals(hdr->xid, xid)); + Assert(TransactionIdEquals(hdr->xid, + XidFromFullTransactionId(gxact->fxid))); bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); gid = (const char *) bufptr; bufptr += MAXALIGN(hdr->gidlen); @@ -2122,7 +2132,7 @@ RecoverPreparedTransactions(void) * Recreate its GXACT and dummy PGPROC. But, check whether it was * added in redo and already has a shmem entry for it. */ - MarkAsPreparingGuts(gxact, xid, gid, + MarkAsPreparingGuts(gxact, gxact->fxid, gid, hdr->prepared_at, hdr->owner, hdr->database); @@ -2137,7 +2147,7 @@ RecoverPreparedTransactions(void) /* * Recover other state (notably locks) using resource managers. */ - ProcessRecords(bufptr, xid, twophase_recover_callbacks); + ProcessRecords(bufptr, fxid, twophase_recover_callbacks); /* * Release locks held by the standby process after we process each @@ -2145,7 +2155,7 @@ RecoverPreparedTransactions(void) * additional locks at any one time. */ if (InHotStandby) - StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids); + StandbyReleaseLockTree(hdr->xid, hdr->nsubxacts, subxids); /* * We're done with recovering this transaction. Clear MyLockedGxact, @@ -2164,7 +2174,7 @@ RecoverPreparedTransactions(void) /* * ProcessTwoPhaseBuffer * - * Given a transaction id, read it either from disk or read it directly + * Given a FullTransactionId, read it either from disk or read it directly * via shmem xlog record pointer using the provided "prepare_start_lsn". * * If setParent is true, set up subtransaction parent linkages. @@ -2173,13 +2183,12 @@ RecoverPreparedTransactions(void) * value scanned. 
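One subtlety in this conversion: the on-disk state files carry the full 64-bit identity in their name, but WAL records and TwoPhaseFileHeader still store a 32-bit xid, so PrepareRedoAdd() and PrepareRedoRemove() further down recover the epoch against TransamVariables->nextXid via FullTransactionIdFromAllowableAt(). A rough standalone model of that widening, assuming the xid lies no more than one epoch behind nextXid (the real inline in access/transam.h additionally asserts that the result is "allowable"):

#include <stdint.h>

/* Illustrative stand-in for FullTransactionIdFromAllowableAt(). */
static uint64_t
full_xid_from_allowable_at(uint64_t next_fxid, uint32_t xid)
{
    uint32_t next_xid = (uint32_t) next_fxid;
    uint32_t epoch = (uint32_t) (next_fxid >> 32);

    /*
     * An xid numerically greater than nextXid's low half cannot belong to
     * the current epoch; it must have been assigned in the previous one.
     */
    if (xid > next_xid)
        epoch--;

    return ((uint64_t) epoch << 32) | xid;
}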
*/ static char * -ProcessTwoPhaseBuffer(TransactionId xid, +ProcessTwoPhaseBuffer(FullTransactionId fxid, XLogRecPtr prepare_start_lsn, bool fromdisk, bool setParent, bool setNextXid) { FullTransactionId nextXid = TransamVariables->nextXid; - TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId *subxids; char *buf; TwoPhaseFileHeader *hdr; @@ -2191,41 +2200,46 @@ ProcessTwoPhaseBuffer(TransactionId xid, Assert(prepare_start_lsn != InvalidXLogRecPtr); /* Already processed? */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + if (TransactionIdDidCommit(XidFromFullTransactionId(fxid)) || + TransactionIdDidAbort(XidFromFullTransactionId(fxid))) { if (fromdisk) { ereport(WARNING, - (errmsg("removing stale two-phase state file for transaction %u", - xid))); - RemoveTwoPhaseFile(xid, true); + (errmsg("removing stale two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + RemoveTwoPhaseFile(fxid, true); } else { ereport(WARNING, - (errmsg("removing stale two-phase state from memory for transaction %u", - xid))); - PrepareRedoRemove(xid, true); + (errmsg("removing stale two-phase state from memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + PrepareRedoRemoveFull(fxid, true); } return NULL; } /* Reject XID if too new */ - if (TransactionIdFollowsOrEquals(xid, origNextXid)) + if (FullTransactionIdFollowsOrEquals(fxid, nextXid)) { if (fromdisk) { ereport(WARNING, - (errmsg("removing future two-phase state file for transaction %u", - xid))); - RemoveTwoPhaseFile(xid, true); + (errmsg("removing future two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + RemoveTwoPhaseFile(fxid, true); } else { ereport(WARNING, - (errmsg("removing future two-phase state from memory for transaction %u", - xid))); - PrepareRedoRemove(xid, true); + (errmsg("removing future two-phase state from memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); + PrepareRedoRemoveFull(fxid, true); } return NULL; } @@ -2233,7 +2247,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, if (fromdisk) { /* Read and validate file */ - buf = ReadTwoPhaseFile(xid, false); + buf = ReadTwoPhaseFile(fxid, false); } else { @@ -2243,18 +2257,20 @@ ProcessTwoPhaseBuffer(TransactionId xid, /* Deconstruct header */ hdr = (TwoPhaseFileHeader *) buf; - if (!TransactionIdEquals(hdr->xid, xid)) + if (!TransactionIdEquals(hdr->xid, XidFromFullTransactionId(fxid))) { if (fromdisk) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted two-phase state file for transaction %u", - xid))); + errmsg("corrupted two-phase state file for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("corrupted two-phase state in memory for transaction %u", - xid))); + errmsg("corrupted two-phase state in memory for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)))); } /* @@ -2268,14 +2284,14 @@ ProcessTwoPhaseBuffer(TransactionId xid, { TransactionId subxid = subxids[i]; - Assert(TransactionIdFollows(subxid, xid)); + Assert(TransactionIdFollows(subxid, XidFromFullTransactionId(fxid))); /* update nextXid if needed */ if (setNextXid) AdvanceNextFullTransactionIdPastXid(subxid); if (setParent) - 
SubTransSetParent(subxid, xid); + SubTransSetParent(subxid, XidFromFullTransactionId(fxid)); } return buf; @@ -2286,7 +2302,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit (q.v. if you change - * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * this function): in particular, we must set DELAY_CHKPT_IN_COMMIT to avoid a * race condition. * * We know the transaction made at least one XLOG entry (its PREPARE), @@ -2306,7 +2322,7 @@ RecordTransactionCommitPrepared(TransactionId xid, const char *gid) { XLogRecPtr recptr; - TimestampTz committs = GetCurrentTimestamp(); + TimestampTz committs; bool replorigin; /* @@ -2319,8 +2335,24 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before + * commit time is written. + */ + pg_write_barrier(); + + /* + * Note it is important to set committs value after marking ourselves as + * in the commit critical section (DELAY_CHKPT_IN_COMMIT). This is because + * we want to ensure all transactions that have acquired commit timestamp + * are finished before we allow the logical replication client to advance + * its xid which is used to hold back dead rows for conflict detection. + * See comments atop worker.c. + */ + committs = GetCurrentTimestamp(); /* * Emit the XLOG commit record. Note that we mark 2PC commits as @@ -2369,7 +2401,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); @@ -2466,8 +2498,9 @@ RecordTransactionAbortPrepared(TransactionId xid, * data, the entry is marked as located on disk. */ void -PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, - XLogRecPtr end_lsn, RepOriginId origin_id) +PrepareRedoAdd(FullTransactionId fxid, char *buf, + XLogRecPtr start_lsn, XLogRecPtr end_lsn, + RepOriginId origin_id) { TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf; char *bufptr; @@ -2477,6 +2510,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); Assert(RecoveryInProgress()); + if (!FullTransactionIdIsValid(fxid)) + { + Assert(InRecovery); + fxid = FullTransactionIdFromAllowableAt(TransamVariables->nextXid, + hdr->xid); + } + bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); gid = (const char *) bufptr; @@ -2505,14 +2545,15 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, { char path[MAXPGPATH]; - TwoPhaseFilePath(path, hdr->xid); + Assert(InRecovery); + TwoPhaseFilePath(path, fxid); if (access(path, F_OK) == 0) { ereport(reachedConsistency ? 
ERROR : WARNING, (errmsg("could not recover two-phase state file for transaction %u", hdr->xid), - errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.", + errdetail("Two-phase state file has been found in WAL record %X/%08X, but this transaction has already been restored from disk.", LSN_FORMAT_ARGS(start_lsn)))); return; } @@ -2536,7 +2577,7 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, gxact->prepared_at = hdr->prepared_at; gxact->prepare_start_lsn = start_lsn; gxact->prepare_end_lsn = end_lsn; - gxact->xid = hdr->xid; + gxact->fxid = fxid; gxact->owner = hdr->owner; gxact->locking_backend = INVALID_PROC_NUMBER; gxact->valid = false; @@ -2555,11 +2596,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, false /* backward */ , false /* WAL */ ); } - elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid); + elog(DEBUG2, "added 2PC data in shared memory for transaction %u of epoch %u", + XidFromFullTransactionId(gxact->fxid), + EpochFromFullTransactionId(gxact->fxid)); } /* - * PrepareRedoRemove + * PrepareRedoRemoveFull * * Remove the corresponding gxact entry from TwoPhaseState. Also remove * the 2PC file if a prepared transaction was saved via an earlier checkpoint. @@ -2567,8 +2610,8 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState * is updated. */ -void -PrepareRedoRemove(TransactionId xid, bool giveWarning) +static void +PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning) { GlobalTransaction gxact = NULL; int i; @@ -2581,7 +2624,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) { gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) + if (FullTransactionIdEquals(gxact->fxid, fxid)) { Assert(gxact->inredo); found = true; @@ -2598,13 +2641,29 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) /* * And now we can clean up any files we may have left. */ - elog(DEBUG2, "removing 2PC data for transaction %u", xid); + elog(DEBUG2, "removing 2PC data for transaction %u of epoch %u", + XidFromFullTransactionId(fxid), + EpochFromFullTransactionId(fxid)); + if (gxact->ondisk) - RemoveTwoPhaseFile(xid, giveWarning); + RemoveTwoPhaseFile(fxid, giveWarning); + RemoveGXact(gxact); } /* + * Wrapper of PrepareRedoRemoveFull(), for TransactionIds. + */ +void +PrepareRedoRemove(TransactionId xid, bool giveWarning) +{ + FullTransactionId fxid = + FullTransactionIdFromAllowableAt(TransamVariables->nextXid, xid); + + PrepareRedoRemoveFull(fxid, giveWarning); +} + +/* * LookupGXact * Check if the prepared transaction with the given GID, lsn and timestamp * exists. @@ -2648,7 +2707,7 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, * between publisher and subscriber. */ if (gxact->ondisk) - buf = ReadTwoPhaseFile(gxact->xid, false); + buf = ReadTwoPhaseFile(gxact->fxid, false); else { Assert(gxact->prepare_start_lsn); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b885513f765..b46e7e9c2a6 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1431,10 +1431,22 @@ RecordTransactionCommit(void) * without holding the ProcArrayLock, since we're the only one * modifying it. This makes checkpoint's determination of which xacts * are delaying the checkpoint a bit fuzzy, but it doesn't matter.
+ * + * Note, it is important to get the commit timestamp after marking the + * transaction in the commit critical section. See + * RecordTransactionCommitPrepared. */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + Assert(xactStopTimestamp == 0); + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible + * before commit time is written. + */ + pg_write_barrier(); /* * Insert the commit XLOG record. @@ -1537,7 +1549,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); } @@ -2515,7 +2527,7 @@ static void PrepareTransaction(void) { TransactionState s = CurrentTransactionState; - TransactionId xid = GetCurrentTransactionId(); + FullTransactionId fxid = GetCurrentFullTransactionId(); GlobalTransaction gxact; TimestampTz prepared_at; @@ -2644,7 +2656,7 @@ PrepareTransaction(void) * Reserve the GID for this transaction. This could fail if the requested * GID is invalid or already in use. */ - gxact = MarkAsPreparing(xid, prepareGID, prepared_at, + gxact = MarkAsPreparing(fxid, prepareGID, prepared_at, GetUserId(), MyDatabaseId); prepareGID = NULL; @@ -2694,7 +2706,7 @@ PrepareTransaction(void) * ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would * conclude "xact already committed or aborted" for our locks. */ - PostPrepare_Locks(xid); + PostPrepare_Locks(fxid); /* * Let others know about no transaction in progress by me. This has to be @@ -2738,9 +2750,9 @@ PrepareTransaction(void) PostPrepare_smgr(); - PostPrepare_MultiXact(xid); + PostPrepare_MultiXact(fxid); - PostPrepare_PredicateLocks(xid); + PostPrepare_PredicateLocks(fxid); ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, @@ -6420,7 +6432,8 @@ xact_redo(XLogReaderState *record) * gxact entry. 
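The ordering introduced above (set DELAY_CHKPT_IN_COMMIT, pg_write_barrier(), and only then read the commit timestamp) is what allows an observer that sees a commit timestamp to trust that the in-commit flag was already visible. A standalone sketch of the same pattern using C11 atomics in place of PostgreSQL's barrier primitives; the flag bit and the names are illustrative, not the server's definitions:

#include <stdatomic.h>
#include <time.h>

#define DELAY_CHKPT_IN_COMMIT 0x4           /* illustrative bit value */

static _Atomic int delay_chkpt_flags;       /* stands in for MyProc->delayChkptFlags */

static long long
enter_commit_critical_section(void)
{
    /* Advertise that this backend is inside the commit critical section. */
    atomic_fetch_or(&delay_chkpt_flags, DELAY_CHKPT_IN_COMMIT);

    /*
     * Release fence: the flag store must become globally visible before
     * the timestamp below can be observed, mirroring pg_write_barrier().
     */
    atomic_thread_fence(memory_order_release);

    /*
     * Only now take the commit timestamp.  A reader that pairs this with
     * an acquire fence and sees the timestamp also sees the flag, so
     * logical replication can hold back dead rows until every in-commit
     * transaction has finished.
     */
    return (long long) time(NULL);
}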
*/ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); - PrepareRedoAdd(XLogRecGetData(record), + PrepareRedoAdd(InvalidFullTransactionId, + XLogRecGetData(record), record->ReadRecPtr, record->EndRecPtr, XLogRecGetOrigin(record)); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1914859b2ee..9a4de1616bc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -96,6 +96,7 @@ #include "utils/guc_hooks.h" #include "utils/guc_tables.h" #include "utils/injection_point.h" +#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -449,7 +450,6 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ - FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ @@ -703,7 +703,7 @@ static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version); static void WriteControlFile(void); static void ReadControlFile(void); static void UpdateControlFile(void); -static char *str_time(pg_time_t tnow); +static char *str_time(pg_time_t tnow, char *buf, size_t bufsize); static int get_sync_bit(int method); @@ -1028,7 +1028,7 @@ XLogInsertRecord(XLogRecData *rdata, oldCxt = MemoryContextSwitchTo(walDebugCxt); initStringInfo(&buf); - appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos)); /* * We have to piece together the WAL record data from the XLogRecData @@ -1092,6 +1092,9 @@ XLogInsertRecord(XLogRecData *rdata, pgWalUsage.wal_bytes += rechdr->xl_tot_len; pgWalUsage.wal_records++; pgWalUsage.wal_fpi += num_fpi; + + /* Required for the flush of pending stats WAL data */ + pgstat_report_fixed = true; } return EndPos; @@ -1549,8 +1552,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) if (upto > reservedUpto) { ereport(LOG, - (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", - LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))); upto = reservedUpto; } @@ -1716,7 +1719,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]); if (expectedEndPtr != endptr) - elog(PANIC, "could not find WAL buffer for %X/%X", + elog(PANIC, "could not find WAL buffer for %X/%08X", LSN_FORMAT_ARGS(ptr)); } else @@ -1776,7 +1779,7 @@ WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count, inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult); if (startptr + count > inserted) ereport(ERROR, - errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X", + errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X", LSN_FORMAT_ARGS(startptr + count), LSN_FORMAT_ARGS(inserted))); @@ -2109,6 +2112,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + + /* + * Required for the flush of pending stats WAL data, per + * update of pgWalUsage. 
+ */ + pgstat_report_fixed = true; } } } @@ -2281,7 +2290,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) #ifdef WAL_DEBUG if (XLOG_DEBUG && npages > 0) { - elog(DEBUG1, "initialized %d pages, up to %X/%X", + elog(DEBUG1, "initialized %d pages, up to %X/%08X", npages, LSN_FORMAT_ARGS(NewPageEndPtr)); } #endif @@ -2347,25 +2356,6 @@ check_wal_segment_size(int *newval, void **extra, GucSource source) } /* - * GUC check_hook for max_slot_wal_keep_size - * - * We don't allow the value of max_slot_wal_keep_size other than -1 during the - * binary upgrade. See start_postmaster() in pg_upgrade for more details. - */ -bool -check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source) -{ - if (IsBinaryUpgrade && *newval != -1) - { - GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.", - "max_slot_wal_keep_size"); - return false; - } - - return true; -} - -/* * At a checkpoint, how many WAL segments to recycle as preallocated future * XLOG segments? Returns the highest segment that should be preallocated. */ @@ -2492,7 +2482,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) XLogRecPtr EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]); if (LogwrtResult.Write >= EndPtr) - elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", + elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X", LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(EndPtr)); @@ -2892,7 +2882,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI); if (!force && newMinRecoveryPoint < lsn) elog(WARNING, - "xlog min recovery request %X/%X is past current point %X/%X", + "xlog min recovery request %X/%08X is past current point %X/%08X", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); /* update control file */ @@ -2905,9 +2895,9 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) LocalMinRecoveryPointTLI = newMinRecoveryPointTLI; ereport(DEBUG2, - (errmsg_internal("updated min recovery point to %X/%X on timeline %u", - LSN_FORMAT_ARGS(newMinRecoveryPoint), - newMinRecoveryPointTLI))); + errmsg_internal("updated min recovery point to %X/%08X on timeline %u", + LSN_FORMAT_ARGS(newMinRecoveryPoint), + newMinRecoveryPointTLI)); } } LWLockRelease(ControlFileLock); @@ -2945,7 +2935,7 @@ XLogFlush(XLogRecPtr record) #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", + elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X", LSN_FORMAT_ARGS(record), LSN_FORMAT_ARGS(LogwrtResult.Write), LSN_FORMAT_ARGS(LogwrtResult.Flush)); @@ -3078,7 +3068,7 @@ XLogFlush(XLogRecPtr record) */ if (LogwrtResult.Flush < record) elog(ERROR, - "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", + "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X", LSN_FORMAT_ARGS(record), LSN_FORMAT_ARGS(LogwrtResult.Flush)); } @@ -3205,7 +3195,7 @@ XLogBackgroundFlush(void) #ifdef WAL_DEBUG if (XLOG_DEBUG) - elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", + elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X", LSN_FORMAT_ARGS(WriteRqst.Write), LSN_FORMAT_ARGS(WriteRqst.Flush), LSN_FORMAT_ARGS(LogwrtResult.Write), @@ -5381,11 +5371,9 @@ BootStrapXLOG(uint32 data_checksum_version) } static char * -str_time(pg_time_t tnow) +str_time(pg_time_t tnow, char *buf, size_t bufsize) { - char 
*buf = palloc(128); - - pg_strftime(buf, 128, + pg_strftime(buf, bufsize, "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&tnow, log_timezone)); @@ -5628,6 +5616,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + char timebuf[128]; /* * We should have an aux process resource owner to use, and we should not @@ -5656,25 +5645,29 @@ StartupXLOG(void) */ ereport(IsPostmasterEnvironment ? LOG : NOTICE, (errmsg("database system was shut down at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNED_IN_RECOVERY: ereport(LOG, (errmsg("database system was shut down in recovery at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNING: ereport(LOG, (errmsg("database system shutdown was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_IN_CRASH_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at %s", - str_time(ControlFile->time)), + str_time(ControlFile->time, + timebuf, sizeof(timebuf))), errhint("This probably means that some data is corrupted and" " you will have to use the last backup for recovery."))); break; @@ -5682,7 +5675,8 @@ StartupXLOG(void) case DB_IN_ARCHIVE_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s", - str_time(ControlFile->checkPointCopy.time)), + str_time(ControlFile->checkPointCopy.time, + timebuf, sizeof(timebuf))), errhint("If this has occurred more than once some data might be corrupted" " and you might need to choose an earlier recovery target."))); break; @@ -5690,7 +5684,8 @@ StartupXLOG(void) case DB_IN_PRODUCTION: ereport(LOG, (errmsg("database system was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; default: @@ -5763,7 +5758,6 @@ StartupXLOG(void) SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); - XLogCtl->ckptFullXid = checkPoint.nextXid; /* * Clear out any old relcache cache files. This is *necessary* if we do @@ -6336,6 +6330,12 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */ + if (endOfRecoveryInfo->lastPage) + pfree(endOfRecoveryInfo->lastPage); + pfree(endOfRecoveryInfo->recoveryStopReason); + pfree(endOfRecoveryInfo); + /* * All done with end-of-recovery actions. * @@ -6505,7 +6505,7 @@ PerformRecoveryXLogAction(void) else { RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_IMMEDIATE | + CHECKPOINT_FAST | CHECKPOINT_WAIT); } @@ -6814,7 +6814,7 @@ ShutdownXLOG(int code, Datum arg) WalSndWaitStopping(); if (RecoveryInProgress()) - CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST); else { /* @@ -6826,7 +6826,7 @@ ShutdownXLOG(int code, Datum arg) if (XLogArchivingActive()) RequestXLogSwitch(false); - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FAST); } } @@ -6842,24 +6842,24 @@ LogCheckpointStart(int flags, bool restartpoint) (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? 
" shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); else ereport(LOG, /* translator: the placeholders show checkpoint options */ (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", - (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", + (flags & CHECKPOINT_FAST) ? " fast" : "", (flags & CHECKPOINT_FORCE) ? " force" : "", (flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "", (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "", - (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); + (flags & CHECKPOINT_FLUSH_UNLOGGED) ? " flush-unlogged" : ""))); } /* @@ -6921,7 +6921,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -6945,7 +6945,7 @@ LogCheckpointEnd(bool restartpoint) "%d removed, %d recycled; write=%ld.%03d s, " "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, " "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, " - "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X", + "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_slru_written, @@ -7042,12 +7042,12 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset) * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, - * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FAST: finish the checkpoint ASAP, ignoring + * checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). - * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables. + * CHECKPOINT_FLUSH_UNLOGGED: also flush buffers of unlogged tables. * * Note: flags contains other bits, of interest here only for logging purposes. * In particular note that this routine is synchronous and does not pay @@ -7142,7 +7142,7 @@ CreateCheckPoint(int flags) * starting snapshot of locks and transactions. 
*/ if (!shutdown && XLogStandbyInfoActive()) - checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true); else checkPoint.oldestActiveXid = InvalidTransactionId; @@ -7456,11 +7456,6 @@ CreateCheckPoint(int flags) UpdateControlFile(); LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* * We are now done with critical updates; no need for system panic if we * have trouble while fooling with old log segments. @@ -7498,6 +7493,10 @@ CreateCheckPoint(int flags) if (PriorRedoPtr != InvalidXLogRecPtr) UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); +#ifdef USE_INJECTION_POINTS + INJECTION_POINT("checkpoint-before-old-wal-removal", NULL); +#endif + /* * Delete old log files, those no longer needed for last checkpoint to * prevent the disk holding the xlog from growing full. @@ -7637,7 +7636,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, if (!RecoveryInProgress()) elog(ERROR, "can only be used at end of recovery"); if (pagePtr % XLOG_BLCKSZ != 0) - elog(ERROR, "invalid position for missing continuation record %X/%X", + elog(ERROR, "invalid position for missing continuation record %X/%08X", LSN_FORMAT_ARGS(pagePtr)); /* The current WAL insert position should be right after the page header */ @@ -7648,7 +7647,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, startPos += SizeOfXLogShortPHD; recptr = GetXLogInsertRecPtr(); if (recptr != startPos) - elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD", + elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD", LSN_FORMAT_ARGS(recptr)); START_CRIT_SECTION(); @@ -7678,7 +7677,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, /* check that the record was inserted to the right place */ if (ProcLastRecPtr != startPos) - elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X", + elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X", LSN_FORMAT_ARGS(ProcLastRecPtr)); XLogFlush(recptr); @@ -7747,8 +7746,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record) if (XLogHaveInvalidPages()) { elog(DEBUG2, - "could not record restart point at %X/%X because there " - "are unresolved references to invalid pages", + "could not record restart point at %X/%08X because there are unresolved references to invalid pages", LSN_FORMAT_ARGS(checkPoint->redo)); return; } @@ -7828,8 +7826,8 @@ CreateRestartPoint(int flags) lastCheckPoint.redo <= ControlFile->checkPointCopy.redo) { ereport(DEBUG2, - (errmsg_internal("skipping restartpoint, already performed at %X/%X", - LSN_FORMAT_ARGS(lastCheckPoint.redo)))); + errmsg_internal("skipping restartpoint, already performed at %X/%08X", + LSN_FORMAT_ARGS(lastCheckPoint.redo))); UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); if (flags & CHECKPOINT_IS_SHUTDOWN) @@ -8013,10 +8011,10 @@ CreateRestartPoint(int flags) xtime = GetLatestXTime(); ereport((log_checkpoints ? LOG : DEBUG2), - (errmsg("recovery restart point at %X/%X", - LSN_FORMAT_ARGS(lastCheckPoint.redo)), - xtime ? errdetail("Last completed transaction was at log time %s.", - timestamptz_to_str(xtime)) : 0)); + errmsg("recovery restart point at %X/%08X", + LSN_FORMAT_ARGS(lastCheckPoint.redo)), + xtime ? 
errdetail("Last completed transaction was at log time %s.", + timestamptz_to_str(xtime)) : 0); /* * Finally, execute archive_cleanup_command, if any. @@ -8147,17 +8145,19 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) XLByteToSeg(recptr, currSegNo, wal_segment_size); segno = currSegNo; - /* - * Calculate how many segments are kept by slots first, adjusting for - * max_slot_wal_keep_size. - */ + /* Calculate how many segments are kept by slots. */ keep = XLogGetReplicationSlotMinimumLSN(); if (keep != InvalidXLogRecPtr && keep < recptr) { XLByteToSeg(keep, segno, wal_segment_size); - /* Cap by max_slot_wal_keep_size ... */ - if (max_slot_wal_keep_size_mb >= 0) + /* + * Account for max_slot_wal_keep_size to avoid keeping more than + * configured. However, don't do that during a binary upgrade: if + * slots were to be invalidated because of this, it would not be + * possible to preserve logical ones during the upgrade. + */ + if (max_slot_wal_keep_size_mb >= 0 && !IsBinaryUpgrade) { uint64 slot_keep_segs; @@ -8277,8 +8277,8 @@ XLogRestorePoint(const char *rpName) RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); ereport(LOG, - (errmsg("restore point \"%s\" created at %X/%X", - rpName, LSN_FORMAT_ARGS(RecPtr)))); + errmsg("restore point \"%s\" created at %X/%08X", + rpName, LSN_FORMAT_ARGS(RecPtr))); return RecPtr; } @@ -8530,11 +8530,6 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* * We should've already switched to the new TLI before replaying this * record. @@ -8591,11 +8586,6 @@ xlog_redo(XLogReaderState *record) ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); - /* Update shared-memory copy of checkpoint XID/epoch */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextXid; - SpinLockRelease(&XLogCtl->info_lck); - /* TLI should not change in an on-line checkpoint */ (void) GetCurrentReplayRecPtr(&replayTLI); if (checkPoint.ThisTimeLineID != replayTLI) @@ -8943,9 +8933,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) * backup state and tablespace map. * * Input parameters are "state" (the backup state), "fast" (if true, we do - * the checkpoint in immediate mode to make it faster), and "tablespaces" - * (if non-NULL, indicates a list of tablespaceinfo structs describing the - * cluster's tablespaces.). + * the checkpoint in fast mode), and "tablespaces" (if non-NULL, indicates a + * list of tablespaceinfo structs describing the cluster's tablespaces.). * * The tablespace map contents are appended to passed-in parameter * tablespace_map and the caller is responsible for including it in the backup @@ -9022,7 +9011,7 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, * work correctly, it is critical that sessionBackupState is only updated * after this block is over. */ - PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true)); + PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true)); { bool gotUniqueStartpoint = false; DIR *tblspcdir; @@ -9073,11 +9062,11 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, * during recovery means that checkpointer is running, we can use * RequestCheckpoint() to establish a restartpoint. 
* - * We use CHECKPOINT_IMMEDIATE only if requested by user (via - * passing fast = true). Otherwise this can take awhile. + * We use CHECKPOINT_FAST only if requested by user (via passing + * fast = true). Otherwise this can take awhile. */ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | - (fast ? CHECKPOINT_IMMEDIATE : 0)); + (fast ? CHECKPOINT_FAST : 0)); /* * Now we need to fetch the checkpoint record location, and also @@ -9261,7 +9250,7 @@ do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces, state->starttime = (pg_time_t) time(NULL); } - PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, DatumGetBool(true)); + PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true)); state->started_in_recovery = backup_started_in_recovery; @@ -9601,7 +9590,7 @@ register_persistent_abort_backup_handler(void) if (already_done) return; - before_shmem_exit(do_pg_abort_backup, DatumGetBool(false)); + before_shmem_exit(do_pg_abort_backup, BoolGetDatum(false)); already_done = true; } diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c index 342590e0a46..cda4b38b7d6 100644 --- a/src/backend/access/transam/xlogbackup.c +++ b/src/backend/access/transam/xlogbackup.c @@ -42,7 +42,7 @@ build_backup_content(BackupState *state, bool ishistoryfile) XLByteToSeg(state->startpoint, startsegno, wal_segment_size); XLogFileName(startxlogfile, state->starttli, startsegno, wal_segment_size); - appendStringInfo(result, "START WAL LOCATION: %X/%X (file %s)\n", + appendStringInfo(result, "START WAL LOCATION: %X/%08X (file %s)\n", LSN_FORMAT_ARGS(state->startpoint), startxlogfile); if (ishistoryfile) @@ -52,11 +52,11 @@ build_backup_content(BackupState *state, bool ishistoryfile) XLByteToSeg(state->stoppoint, stopsegno, wal_segment_size); XLogFileName(stopxlogfile, state->stoptli, stopsegno, wal_segment_size); - appendStringInfo(result, "STOP WAL LOCATION: %X/%X (file %s)\n", + appendStringInfo(result, "STOP WAL LOCATION: %X/%08X (file %s)\n", LSN_FORMAT_ARGS(state->stoppoint), stopxlogfile); } - appendStringInfo(result, "CHECKPOINT LOCATION: %X/%X\n", + appendStringInfo(result, "CHECKPOINT LOCATION: %X/%08X\n", LSN_FORMAT_ARGS(state->checkpointloc)); appendStringInfoString(result, "BACKUP METHOD: streamed\n"); appendStringInfo(result, "BACKUP FROM: %s\n", @@ -81,7 +81,7 @@ build_backup_content(BackupState *state, bool ishistoryfile) Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0)); if (!XLogRecPtrIsInvalid(state->istartpoint)) { - appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n", + appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%08X\n", LSN_FORMAT_ARGS(state->istartpoint)); appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n", state->istarttli); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 5ee9d0b028e..c7571429e8e 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -530,6 +530,18 @@ XLogInsert(RmgrId rmid, uint8 info) } /* + * Simple wrapper to XLogInsert to insert a WAL record with elementary + * contents (only an int64 is supported as value currently). + */ +XLogRecPtr +XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value) +{ + XLogBeginInsert(); + XLogRegisterData(&value, sizeof(value)); + return XLogInsert(rmid, info); +} + +/* * Assemble a WAL record from the registered data and buffers into an * XLogRecData chain, ready for insertion with XLogInsertRecord(). 
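The new XLogSimpleInsertInt64() above wraps the usual XLogBeginInsert() / XLogRegisterData() / XLogInsert() sequence for records whose whole payload is one int64. A hypothetical caller might look like the sketch below; RM_FOO_ID and XLOG_FOO_SET_VALUE are invented identifiers for illustration, and the matching redo routine would be expected to memcpy the int64 back out of XLogRecGetData():

    /* Hypothetical user of the new wrapper; not part of the patch. */
    static XLogRecPtr
    log_foo_value(int64 value)
    {
        /*
         * Equivalent to:
         *    XLogBeginInsert();
         *    XLogRegisterData(&value, sizeof(value));
         *    return XLogInsert(RM_FOO_ID, XLOG_FOO_SET_VALUE);
         */
        return XLogSimpleInsertInt64(RM_FOO_ID, XLOG_FOO_SET_VALUE, value);
    }
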
* diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 7735562db01..ed3aacabc98 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -546,7 +546,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all readahead until %X/%X is replayed due to possible TLI change", + "suppressing all readahead until %X/%08X is replayed due to possible TLI change", LSN_FORMAT_ARGS(record->lsn)); #endif @@ -579,7 +579,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy", + "suppressing prefetch in database %u until %X/%08X is replayed due to raw file copy", rlocator.dbOid, LSN_FORMAT_ARGS(record->lsn)); #endif @@ -607,7 +607,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + "suppressing prefetch in relation %u/%u/%u until %X/%08X is replayed, which creates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -630,7 +630,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, which truncates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -729,7 +729,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + "suppressing all prefetch in relation %u/%u/%u until %X/%08X is replayed, because the relation does not exist on disk", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -750,7 +750,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, because the relation is too small", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -928,7 +928,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (blocks >= %u filtered)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed), filter->filter_from_block); @@ -944,7 +944,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL 
elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (whole database)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed)); #endif diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 2790ade1f91..dcc8d4f9c1b 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -617,7 +617,7 @@ restart: } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", + report_invalid_record(state, "invalid record offset at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), pageHeaderSize, targetRecOff); goto err; @@ -626,7 +626,7 @@ restart: if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + report_invalid_record(state, "contrecord is requested by %X/%08X", LSN_FORMAT_ARGS(RecPtr)); goto err; } @@ -667,7 +667,7 @@ restart: if (total_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", + "invalid record length at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, total_len); goto err; @@ -723,11 +723,12 @@ restart: /* Calculate pointer to beginning of next page */ targetPagePtr += XLOG_BLCKSZ; - /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, - XLOG_BLCKSZ)); - + /* + * Read the page header before processing the record data, so we + * can handle the case where the previous record ended as a + * partial one. 
+ */ + readOff = ReadPageInternal(state, targetPagePtr, SizeOfXLogShortPHD); if (readOff == XLREAD_WOULDBLOCK) return XLREAD_WOULDBLOCK; else if (readOff < 0) @@ -756,7 +757,7 @@ restart: if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { report_invalid_record(state, - "there is no contrecord flag at %X/%X", + "there is no contrecord flag at %X/%08X", LSN_FORMAT_ARGS(RecPtr)); goto err; } @@ -769,13 +770,22 @@ restart: total_len != (pageHeader->xlp_rem_len + gotlen)) { report_invalid_record(state, - "invalid contrecord length %u (expected %lld) at %X/%X", + "invalid contrecord length %u (expected %lld) at %X/%08X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); goto err; } + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + /* Append the continuation from this page to the buffer */ pageHeaderSize = XLogPageHeaderSize(pageHeader); @@ -1132,7 +1142,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_tot_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", + "invalid record length at %X/%08X: expected at least %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, record->xl_tot_len); return false; @@ -1140,7 +1150,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (!RmgrIdIsValid(record->xl_rmid)) { report_invalid_record(state, - "invalid resource manager ID %u at %X/%X", + "invalid resource manager ID %u at %X/%08X", record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); return false; } @@ -1153,7 +1163,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (!(record->xl_prev < RecPtr)) { report_invalid_record(state, - "record with incorrect prev-link %X/%X at %X/%X", + "record with incorrect prev-link %X/%08X at %X/%08X", LSN_FORMAT_ARGS(record->xl_prev), LSN_FORMAT_ARGS(RecPtr)); return false; @@ -1169,7 +1179,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_prev != PrevRecPtr) { report_invalid_record(state, - "record with incorrect prev-link %X/%X at %X/%X", + "record with incorrect prev-link %X/%08X at %X/%08X", LSN_FORMAT_ARGS(record->xl_prev), LSN_FORMAT_ARGS(RecPtr)); return false; @@ -1207,7 +1217,7 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) if (!EQ_CRC32C(record->xl_crc, crc)) { report_invalid_record(state, - "incorrect resource manager data checksum in record at %X/%X", + "incorrect resource manager data checksum in record at %X/%08X", LSN_FORMAT_ARGS(recptr)); return false; } @@ -1241,7 +1251,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "invalid magic number %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid magic number %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_magic, fname, LSN_FORMAT_ARGS(recptr), @@ -1256,7 +1266,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_info, fname, 
LSN_FORMAT_ARGS(recptr), @@ -1298,7 +1308,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, /* hmm, first page of file doesn't have a long header? */ report_invalid_record(state, - "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u", + "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_info, fname, LSN_FORMAT_ARGS(recptr), @@ -1318,7 +1328,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "unexpected pageaddr %X/%X in WAL segment %s, LSN %X/%X, offset %u", + "unexpected pageaddr %X/%08X in WAL segment %s, LSN %X/%08X, offset %u", LSN_FORMAT_ARGS(hdr->xlp_pageaddr), fname, LSN_FORMAT_ARGS(recptr), @@ -1344,7 +1354,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); report_invalid_record(state, - "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%X, offset %u", + "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%08X, offset %u", hdr->xlp_tli, state->latestPageTLI, fname, @@ -1756,7 +1766,7 @@ DecodeXLogRecord(XLogReaderState *state, if (block_id <= decoded->max_block_id) { report_invalid_record(state, - "out-of-order block_id %u at %X/%X", + "out-of-order block_id %u at %X/%08X", block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1780,14 +1790,14 @@ DecodeXLogRecord(XLogReaderState *state, if (blk->has_data && blk->data_len == 0) { report_invalid_record(state, - "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + "BKPBLOCK_HAS_DATA set, but no data included at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } if (!blk->has_data && blk->data_len != 0) { report_invalid_record(state, - "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", + "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%08X", (unsigned int) blk->data_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1823,7 +1833,7 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len == BLCKSZ)) { report_invalid_record(state, - "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%08X", (unsigned int) blk->hole_offset, (unsigned int) blk->hole_length, (unsigned int) blk->bimg_len, @@ -1839,7 +1849,7 @@ DecodeXLogRecord(XLogReaderState *state, (blk->hole_offset != 0 || blk->hole_length != 0)) { report_invalid_record(state, - "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%08X", (unsigned int) blk->hole_offset, (unsigned int) blk->hole_length, LSN_FORMAT_ARGS(state->ReadRecPtr)); @@ -1853,7 +1863,7 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len == BLCKSZ) { report_invalid_record(state, - "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X", + "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%08X", (unsigned int) blk->bimg_len, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1868,7 +1878,7 @@ DecodeXLogRecord(XLogReaderState *state, blk->bimg_len != BLCKSZ) { report_invalid_record(state, - "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X", + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%08X", (unsigned int) blk->data_len, 
LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; @@ -1884,7 +1894,7 @@ DecodeXLogRecord(XLogReaderState *state, if (rlocator == NULL) { report_invalid_record(state, - "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + "BKPBLOCK_SAME_REL set but no previous rel at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1896,7 +1906,7 @@ DecodeXLogRecord(XLogReaderState *state, else { report_invalid_record(state, - "invalid block_id %u at %X/%X", + "invalid block_id %u at %X/%08X", block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); goto err; } @@ -1963,7 +1973,7 @@ DecodeXLogRecord(XLogReaderState *state, shortdata_err: report_invalid_record(state, - "record with invalid length at %X/%X", + "record with invalid length at %X/%08X", LSN_FORMAT_ARGS(state->ReadRecPtr)); err: *errormsg = state->errormsg_buf; @@ -2073,14 +2083,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) !record->record->blocks[block_id].in_use) { report_invalid_record(record, - "could not restore image at %X/%X with invalid block %d specified", + "could not restore image at %X/%08X with invalid block %d specified", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; } if (!record->record->blocks[block_id].has_image) { - report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d", + report_invalid_record(record, "could not restore image at %X/%08X with invalid state, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; @@ -2107,7 +2117,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0) decomp_success = false; #else - report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), "LZ4", block_id); @@ -2124,7 +2134,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if (ZSTD_isError(decomp_result)) decomp_success = false; #else - report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), "zstd", block_id); @@ -2133,7 +2143,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) } else { - report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d", + report_invalid_record(record, "could not restore image at %X/%08X compressed with unknown method, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; @@ -2141,7 +2151,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if (!decomp_success) { - report_invalid_record(record, "could not decompress image at %X/%X, block %d", + report_invalid_record(record, "could not decompress image at %X/%08X, block %d", LSN_FORMAT_ARGS(record->ReadRecPtr), block_id); return false; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 6ce979f2d8b..f23ec8969c2 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -620,10 +620,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * than ControlFile->checkPoint is used. 
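A note on the pervasive %X/%X -> %X/%08X changes in these hunks: the sweep is mechanical. LSN_FORMAT_ARGS() splits a 64-bit XLogRecPtr into its two 32-bit halves, and zero-padding the low half to eight hex digits makes printed LSNs fixed-width and safe to align or sort as strings. A sketch of the effect, assuming the standard definition of LSN_FORMAT_ARGS() in access/xlogdefs.h, which expands (after a type assertion) to (uint32) (lsn >> 32), (uint32) lsn; the log text is invented for illustration:

    static void
    show_lsn_padding(void)
    {
        XLogRecPtr  lsn = UINT64CONST(0x109A3F028);

        elog(LOG, "with %%X/%%X:   %X/%X", LSN_FORMAT_ARGS(lsn));   /* 1/9A3F028: low half loses its leading zero */
        elog(LOG, "with %%X/%%08X: %X/%08X", LSN_FORMAT_ARGS(lsn)); /* 1/09A3F028: fixed eight-digit low half */
    }
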
*/ ereport(LOG, - (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u", - LSN_FORMAT_ARGS(RedoStartLSN), - LSN_FORMAT_ARGS(CheckPointLoc), - CheckPointTLI))); + errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u", + LSN_FORMAT_ARGS(RedoStartLSN), + LSN_FORMAT_ARGS(CheckPointLoc), + CheckPointTLI)); /* * When a backup_label file is present, we want to roll forward from @@ -636,8 +636,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg_internal("checkpoint record is at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); InRecovery = true; /* force recovery even if SHUTDOWNED */ /* @@ -652,23 +652,23 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID)) ereport(FATAL, - (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X", - LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir, DataDir)); } } else { ereport(FATAL, - (errmsg("could not locate required checkpoint record at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)), - errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" - "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" - "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", - DataDir, DataDir, DataDir, DataDir))); + errmsg("could not locate required checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc)), + errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" + "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" + "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", + DataDir, DataDir, DataDir, DataDir)); wasShutdown = false; /* keep compiler quiet */ } @@ -773,8 +773,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, */ if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) ereport(LOG, - (errmsg("restarting backup recovery with redo LSN %X/%X", - LSN_FORMAT_ARGS(ControlFile->backupStartPoint)))); + errmsg("restarting backup recovery with redo 
LSN %X/%08X", + LSN_FORMAT_ARGS(ControlFile->backupStartPoint))); /* Get the last valid checkpoint record. */ CheckPointLoc = ControlFile->checkPoint; @@ -786,8 +786,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, if (record != NULL) { ereport(DEBUG1, - (errmsg_internal("checkpoint record is at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg_internal("checkpoint record is at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); } else { @@ -798,8 +798,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * simplify processing around checkpoints. */ ereport(PANIC, - (errmsg("could not locate a valid checkpoint record at %X/%X", - LSN_FORMAT_ARGS(CheckPointLoc)))); + errmsg("could not locate a valid checkpoint record at %X/%08X", + LSN_FORMAT_ARGS(CheckPointLoc))); } memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); @@ -824,8 +824,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, recoveryTargetName))); else if (recoveryTarget == RECOVERY_TARGET_LSN) ereport(LOG, - (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryTargetLSN)))); + errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryTargetLSN))); else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); @@ -855,7 +855,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("requested timeline %u is not a child of this server's history", recoveryTargetTLI), /* translator: %s is a backup_label file or a pg_control file */ - errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.", + errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.", haveBackupLabel ? "backup_label" : "pg_control", LSN_FORMAT_ARGS(CheckPointLoc), CheckPointTLI, @@ -870,15 +870,15 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != ControlFile->minRecoveryPointTLI) ereport(FATAL, - (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", - recoveryTargetTLI, - LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), - ControlFile->minRecoveryPointTLI))); + errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u", + recoveryTargetTLI, + LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), + ControlFile->minRecoveryPointTLI)); ereport(DEBUG1, - (errmsg_internal("redo record is at %X/%X; shutdown %s", - LSN_FORMAT_ARGS(checkPoint.redo), - wasShutdown ? "true" : "false"))); + errmsg_internal("redo record is at %X/%08X; shutdown %s", + LSN_FORMAT_ARGS(checkPoint.redo), + wasShutdown ? "true" : "false")); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", U64FromFullTransactionId(checkPoint.nextXid), @@ -1253,14 +1253,14 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, * is pretty crude, but we are not expecting any variability in the file * format). 
*/ - if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", + if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c", &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); RedoStartLSN = ((uint64) hi) << 32 | lo; RedoStartTLI = tli_from_walseg; - if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", + if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c", &hi, &lo, &ch) != 3 || ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -1332,7 +1332,7 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, tli_from_file, BACKUP_LABEL_FILE))); } - if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0) + if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("this is an incremental backup, not a data directory"), @@ -1626,6 +1626,7 @@ ShutdownWalRecovery(void) close(readFile); readFile = -1; } + pfree(xlogreader->private_data); XLogReaderFree(xlogreader); XLogPrefetcherFree(xlogprefetcher); @@ -1722,8 +1723,8 @@ PerformWalRecovery(void) if (record->xl_rmid != RM_XLOG_ID || (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO) ereport(FATAL, - (errmsg("unexpected record type found at redo point %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + errmsg("unexpected record type found at redo point %X/%08X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); } else { @@ -1745,8 +1746,8 @@ PerformWalRecovery(void) RmgrStartup(); ereport(LOG, - (errmsg("redo starts at %X/%X", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)))); + errmsg("redo starts at %X/%08X", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); /* Prepare to report progress of the redo phase. 
*/ if (!StandbyMode) @@ -1758,7 +1759,7 @@ PerformWalRecovery(void) do { if (!StandbyMode) - ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X", + ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); #ifdef WAL_DEBUG @@ -1767,7 +1768,7 @@ PerformWalRecovery(void) StringInfoData buf; initStringInfo(&buf); - appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", + appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ", LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); xlog_outrec(&buf, xlogreader); @@ -1880,9 +1881,9 @@ PerformWalRecovery(void) RmgrCleanup(); ereport(LOG, - (errmsg("redo done at %X/%X system usage: %s", - LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), - pg_rusage_show(&ru0)))); + errmsg("redo done at %X/%08X system usage: %s", + LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + pg_rusage_show(&ru0))); xtime = GetLatestXTime(); if (xtime) ereport(LOG, @@ -2092,7 +2093,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); if (xlrec.overwritten_lsn != record->overwrittenRecPtr) - elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X", + elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X", LSN_FORMAT_ARGS(xlrec.overwritten_lsn), LSN_FORMAT_ARGS(record->overwrittenRecPtr)); @@ -2101,9 +2102,9 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) missingContrecPtr = InvalidXLogRecPtr; ereport(LOG, - (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s", - LSN_FORMAT_ARGS(xlrec.overwritten_lsn), - timestamptz_to_str(xlrec.overwrite_time)))); + errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s", + LSN_FORMAT_ARGS(xlrec.overwritten_lsn), + timestamptz_to_str(xlrec.overwrite_time))); /* Verifying the record should only happen once */ record->overwrittenRecPtr = InvalidXLogRecPtr; @@ -2129,7 +2130,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) backupEndPoint = lsn; } else - elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X", + elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X", LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); } } @@ -2224,9 +2225,9 @@ CheckRecoveryConsistency(void) backupEndRequired = false; ereport(LOG, - (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X", - LSN_FORMAT_ARGS(saveBackupStartPoint), - LSN_FORMAT_ARGS(saveBackupEndPoint)))); + errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X", + LSN_FORMAT_ARGS(saveBackupStartPoint), + LSN_FORMAT_ARGS(saveBackupEndPoint))); } /* @@ -2255,8 +2256,8 @@ CheckRecoveryConsistency(void) reachedConsistency = true; SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); ereport(LOG, - (errmsg("consistent recovery state reached at %X/%X", - LSN_FORMAT_ARGS(lastReplayedEndRecPtr)))); + errmsg("consistent recovery state reached at %X/%08X", + LSN_FORMAT_ARGS(lastReplayedEndRecPtr))); } /* @@ -2293,7 +2294,7 @@ rm_redo_error_callback(void *arg) xlog_block_info(&buf, record); /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", + errcontext("WAL redo at %X/%08X for %s", LSN_FORMAT_ARGS(record->ReadRecPtr), buf.data); @@ -2328,7 +2329,7 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record) static void xlog_outrec(StringInfo buf, 
XLogReaderState *record) { - appendStringInfo(buf, "prev %X/%X; xid %u", + appendStringInfo(buf, "prev %X/%08X; xid %u", LSN_FORMAT_ARGS(XLogRecGetPrev(record)), XLogRecGetXid(record)); @@ -2416,10 +2417,10 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, lsn < minRecoveryPoint && newTLI > minRecoveryPointTLI) ereport(PANIC, - (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u", - newTLI, - LSN_FORMAT_ARGS(minRecoveryPoint), - minRecoveryPointTLI))); + errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u", + newTLI, + LSN_FORMAT_ARGS(minRecoveryPoint), + minRecoveryPointTLI)); /* Looks good */ } @@ -2621,8 +2622,8 @@ recoveryStopsBefore(XLogReaderState *record) recoveryStopTime = 0; recoveryStopName[0] = '\0'; ereport(LOG, - (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); + errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryStopLSN))); return true; } @@ -2789,8 +2790,8 @@ recoveryStopsAfter(XLogReaderState *record) recoveryStopTime = 0; recoveryStopName[0] = '\0'; ereport(LOG, - (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"", - LSN_FORMAT_ARGS(recoveryStopLSN)))); + errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"", + LSN_FORMAT_ARGS(recoveryStopLSN))); return true; } @@ -2910,7 +2911,7 @@ getRecoveryStopReason(void) timestamptz_to_str(recoveryStopTime)); else if (recoveryTarget == RECOVERY_TARGET_LSN) snprintf(reason, sizeof(reason), - "%s LSN %X/%X\n", + "%s LSN %X/%08X\n", recoveryStopAfter ? "after" : "before", LSN_FORMAT_ARGS(recoveryStopLSN)); else if (recoveryTarget == RECOVERY_TARGET_NAME) @@ -3213,11 +3214,11 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, XLogFileName(fname, xlogreader->seg.ws_tli, segno, wal_segment_size); ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), - (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u", - xlogreader->latestPageTLI, - fname, - LSN_FORMAT_ARGS(xlogreader->latestPagePtr), - offset))); + errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u", + xlogreader->latestPageTLI, + fname, + LSN_FORMAT_ARGS(xlogreader->latestPagePtr), + offset)); record = NULL; } @@ -3429,14 +3430,14 @@ retry: errno = save_errno; ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode_for_file_access(), - errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m", + errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m", fname, LSN_FORMAT_ARGS(targetPagePtr), readOff))); } else ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu", + errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu", fname, LSN_FORMAT_ARGS(targetPagePtr), readOff, r, (Size) XLOG_BLCKSZ))); goto next_record_is_invalid; @@ -3718,7 +3719,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, wait_time = wal_retrieve_retry_interval - TimestampDifferenceMilliseconds(last_fail_time, now); - elog(LOG, "waiting for WAL to become available at %X/%X", + elog(LOG, "waiting for WAL to become available at %X/%08X", LSN_FORMAT_ARGS(RecPtr)); /* Do background tasks that might benefit us later. 
*/ @@ -3864,7 +3865,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); if (curFileTLI > 0 && tli < curFileTLI) - elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u", LSN_FORMAT_ARGS(tliRecPtr), tli, curFileTLI); } @@ -4177,10 +4178,10 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) if (currentTle->end < replayLSN) { ereport(LOG, - (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", - newtarget, - replayTLI, - LSN_FORMAT_ARGS(replayLSN)))); + errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X", + newtarget, + replayTLI, + LSN_FORMAT_ARGS(replayLSN))); return false; } @@ -4760,7 +4761,7 @@ bool check_primary_slot_name(char **newval, void **extra, GucSource source) { if (*newval && strcmp(*newval, "") != 0 && - !ReplicationSlotValidateName(*newval, WARNING)) + !ReplicationSlotValidateName(*newval, false, WARNING)) return false; return true; @@ -4994,13 +4995,25 @@ check_recovery_target_timeline(char **newval, void **extra, GucSource source) rttg = RECOVERY_TARGET_TIMELINE_LATEST; else { + char *endp; + uint64 timeline; + rttg = RECOVERY_TARGET_TIMELINE_NUMERIC; errno = 0; - strtoul(*newval, NULL, 0); - if (errno == EINVAL || errno == ERANGE) + timeline = strtou64(*newval, &endp, 0); + + if (*endp != '\0' || errno == EINVAL || errno == ERANGE) + { + GUC_check_errdetail("\"%s\" is not a valid number.", + "recovery_target_timeline"); + return false; + } + + if (timeline < 1 || timeline > PG_UINT32_MAX) { - GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number."); + GUC_check_errdetail("\"%s\" must be between %u and %u.", + "recovery_target_timeline", 1, UINT_MAX); return false; } } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index c389b27f77d..27ea52fdfee 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -795,7 +795,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, list_free_deep(timelineHistory); - elog(DEBUG3, "switched to timeline %u valid until %X/%X", + elog(DEBUG3, "switched to timeline %u valid until %X/%08X", state->currTLI, LSN_FORMAT_ARGS(state->currTLIValidUntil)); } diff --git a/src/backend/backup/backup_manifest.c b/src/backend/backup/backup_manifest.c index 22e2be37c95..d05252f383c 100644 --- a/src/backend/backup/backup_manifest.c +++ b/src/backend/backup/backup_manifest.c @@ -281,7 +281,7 @@ AddWALInfoToBackupManifest(backup_manifest_info *manifest, XLogRecPtr startptr, } AppendToManifest(manifest, - "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%08X\", \"End-LSN\": \"%X/%08X\" }", first_wal_range ? "" : ",\n", entry->tli, LSN_FORMAT_ARGS(tl_beginptr), diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index f0f88838dc2..bb7d90aa5d9 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1048,7 +1048,7 @@ SendBaseBackup(BaseBackupCmd *cmd, IncrementalBackupInfo *ib) sink = bbsink_zstd_new(sink, &opt.compression_specification); /* Set up progress reporting. 
*/ - sink = bbsink_progress_new(sink, opt.progress); + sink = bbsink_progress_new(sink, opt.progress, opt.incremental); /* * Perform the base backup, but make sure we clean up the bbsink even if diff --git a/src/backend/backup/basebackup_copy.c b/src/backend/backup/basebackup_copy.c index a284ce318ff..eb45d3bcb66 100644 --- a/src/backend/backup/basebackup_copy.c +++ b/src/backend/backup/basebackup_copy.c @@ -143,7 +143,7 @@ bbsink_copystream_begin_backup(bbsink *sink) buf = palloc(mysink->base.bbs_buffer_length + MAXIMUM_ALIGNOF); mysink->msgbuffer = buf + (MAXIMUM_ALIGNOF - 1); mysink->base.bbs_buffer = buf + MAXIMUM_ALIGNOF; - mysink->msgbuffer[0] = 'd'; /* archive or manifest data */ + mysink->msgbuffer[0] = PqMsg_CopyData; /* archive or manifest data */ /* Tell client the backup start location. */ SendXlogRecPtrResult(state->startptr, state->starttli); @@ -170,7 +170,7 @@ bbsink_copystream_begin_archive(bbsink *sink, const char *archive_name) ti = list_nth(state->tablespaces, state->tablespace_num); pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'n'); /* New archive */ + pq_sendbyte(&buf, PqBackupMsg_NewArchive); pq_sendstring(&buf, archive_name); pq_sendstring(&buf, ti->path == NULL ? "" : ti->path); pq_endmessage(&buf); @@ -191,7 +191,7 @@ bbsink_copystream_archive_contents(bbsink *sink, size_t len) if (mysink->send_to_client) { /* Add one because we're also sending a leading type byte. */ - pq_putmessage('d', mysink->msgbuffer, len + 1); + pq_putmessage(PqMsg_CopyData, mysink->msgbuffer, len + 1); } /* Consider whether to send a progress report to the client. */ @@ -221,7 +221,7 @@ bbsink_copystream_archive_contents(bbsink *sink, size_t len) mysink->last_progress_report_time = now; pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'p'); /* Progress report */ + pq_sendbyte(&buf, PqBackupMsg_ProgressReport); pq_sendint64(&buf, state->bytes_done); pq_endmessage(&buf); pq_flush_if_writable(); @@ -247,7 +247,7 @@ bbsink_copystream_end_archive(bbsink *sink) mysink->bytes_done_at_last_time_check = state->bytes_done; mysink->last_progress_report_time = GetCurrentTimestamp(); pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'p'); /* Progress report */ + pq_sendbyte(&buf, PqBackupMsg_ProgressReport); pq_sendint64(&buf, state->bytes_done); pq_endmessage(&buf); pq_flush_if_writable(); @@ -262,7 +262,7 @@ bbsink_copystream_begin_manifest(bbsink *sink) StringInfoData buf; pq_beginmessage(&buf, PqMsg_CopyData); - pq_sendbyte(&buf, 'm'); /* Manifest */ + pq_sendbyte(&buf, PqBackupMsg_Manifest); pq_endmessage(&buf); } @@ -277,7 +277,7 @@ bbsink_copystream_manifest_contents(bbsink *sink, size_t len) if (mysink->send_to_client) { /* Add one because we're also sending a leading type byte. 
*/ - pq_putmessage('d', mysink->msgbuffer, len + 1); + pq_putmessage(PqMsg_CopyData, mysink->msgbuffer, len + 1); } } @@ -361,7 +361,7 @@ SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli) tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); /* Data row */ - values[0] = CStringGetTextDatum(psprintf("%X/%X", LSN_FORMAT_ARGS(ptr))); + values[0] = CStringGetTextDatum(psprintf("%X/%08X", LSN_FORMAT_ARGS(ptr))); values[1] = Int64GetDatum(tli); do_tup_output(tstate, values, nulls); diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c index 28491b1e0ab..a0d48ff0fef 100644 --- a/src/backend/backup/basebackup_incremental.c +++ b/src/backend/backup/basebackup_incremental.c @@ -409,7 +409,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->start_lsn < tlep[i]->begin) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X", + errmsg("manifest requires WAL from initial timeline %u starting at %X/%08X, but that timeline begins at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->start_lsn), LSN_FORMAT_ARGS(tlep[i]->begin)))); @@ -419,7 +419,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->start_lsn != tlep[i]->begin) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X", + errmsg("manifest requires WAL from continuation timeline %u starting at %X/%08X, but that timeline begins at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->start_lsn), LSN_FORMAT_ARGS(tlep[i]->begin)))); @@ -430,7 +430,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->end_lsn > backup_state->startpoint) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X", + errmsg("manifest requires WAL from final timeline %u ending at %X/%08X, but this backup starts at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->end_lsn), LSN_FORMAT_ARGS(backup_state->startpoint)), @@ -441,7 +441,7 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (range->end_lsn != tlep[i]->end) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X", + errmsg("manifest requires WAL from non-final timeline %u ending at %X/%08X, but this server switched timelines at %X/%08X", range->tli, LSN_FORMAT_ARGS(range->end_lsn), LSN_FORMAT_ARGS(tlep[i]->end)))); @@ -522,18 +522,18 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, if (XLogRecPtrIsInvalid(tli_missing_lsn)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist", + errmsg("WAL summaries are required on timeline %u from %X/%08X to %X/%08X, but no summaries for that timeline and LSN range exist", tle->tli, LSN_FORMAT_ARGS(tli_start_lsn), LSN_FORMAT_ARGS(tli_end_lsn)))); else ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete", + errmsg("WAL summaries are required on timeline %u from %X/%08X to %X/%08X, but the 
summaries for that timeline and LSN range are incomplete", tle->tli, LSN_FORMAT_ARGS(tli_start_lsn), LSN_FORMAT_ARGS(tli_end_lsn)), - errdetail("The first unsummarized LSN in this range is %X/%X.", + errdetail("The first unsummarized LSN in this range is %X/%08X.", LSN_FORMAT_ARGS(tli_missing_lsn)))); } diff --git a/src/backend/backup/basebackup_progress.c b/src/backend/backup/basebackup_progress.c index 1d22b541f89..dac20593622 100644 --- a/src/backend/backup/basebackup_progress.c +++ b/src/backend/backup/basebackup_progress.c @@ -56,7 +56,7 @@ static const bbsink_ops bbsink_progress_ops = { * forwards data to a successor sink. */ bbsink * -bbsink_progress_new(bbsink *next, bool estimate_backup_size) +bbsink_progress_new(bbsink *next, bool estimate_backup_size, bool incremental) { bbsink *sink; @@ -69,10 +69,15 @@ bbsink_progress_new(bbsink *next, bool estimate_backup_size) /* * Report that a base backup is in progress, and set the total size of the * backup to -1, which will get translated to NULL. If we're estimating - * the backup size, we'll insert the real estimate when we have it. + * the backup size, we'll insert the real estimate when we have it. Also, + * the backup type is set. */ pgstat_progress_start_command(PROGRESS_COMMAND_BASEBACKUP, InvalidOid); pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TOTAL, -1); + pgstat_progress_update_param(PROGRESS_BASEBACKUP_BACKUP_TYPE, + incremental + ? PROGRESS_BASEBACKUP_BACKUP_TYPE_INCREMENTAL + : PROGRESS_BASEBACKUP_BACKUP_TYPE_FULL); return sink; } diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6db864892d0..fc8638c1b61 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -109,6 +109,8 @@ static const struct typinfo TypInfo[] = { F_REGROLEIN, F_REGROLEOUT}, {"regnamespace", REGNAMESPACEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, F_REGNAMESPACEIN, F_REGNAMESPACEOUT}, + {"regdatabase", REGDATABASEOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, + F_REGDATABASEIN, F_REGDATABASEOUT}, {"text", TEXTOID, 0, -1, false, TYPALIGN_INT, TYPSTORAGE_EXTENDED, DEFAULT_COLLATION_OID, F_TEXTIN, F_TEXTOUT}, {"oid", OIDOID, 0, 4, true, TYPALIGN_INT, TYPSTORAGE_PLAIN, InvalidOid, diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 18316a3968b..7dded634eb8 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -1850,6 +1850,17 @@ find_expr_references_walker(Node *node, errmsg("constant of the type %s cannot be used here", "regrole"))); break; + + /* + * Dependencies for regdatabase should be shared among all + * databases, so explicitly inhibit them from having dependencies. + */ + case REGDATABASEOID: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constant of the type %s cannot be used here", + "regdatabase"))); + break; } } return false; } diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index fbaed5359ad..fd6537567ea 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -665,6 +665,15 @@ CheckAttributeType(const char *attname, } /* + * For consistency with check_virtual_generated_security(). 
+ */ + if ((flags & CHKATYPE_IS_VIRTUAL) && atttypid >= FirstUnpinnedObjectId) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("virtual generated column \"%s\" cannot have a user-defined type", attname), + errdetail("Virtual generated columns that make use of user-defined types are not yet supported.")); + + /* * This might not be strictly invalid per SQL standard, but it is pretty * useless, and it cannot be dumped, so we must disallow it. */ @@ -1100,6 +1109,7 @@ AddNewRelationType(const char *typeName, * if false, relacl is always set NULL * allow_system_table_mods: true to allow creation in system namespaces * is_internal: is this a system-generated catalog? + * relrewrite: link to original relation during a table rewrite * * Output parameters: * typaddress: if not null, gets the object address of the new pg_type entry @@ -2996,7 +3006,7 @@ AddRelationNotNullConstraints(Relation rel, List *constraints, if (constr->is_no_inherit) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("cannot define not-null constraint on column \"%s\" with NO INHERIT", + errmsg("cannot define not-null constraint with NO INHERIT on column \"%s\"", strVal(linitial(constr->keys))), errdetail("The column has an inherited not-null constraint."))); @@ -3215,6 +3225,86 @@ check_nested_generated(ParseState *pstate, Node *node) } /* + * Check security of virtual generated column expression. + * + * Just like selecting from a view is exploitable (CVE-2024-7348), selecting + * from a table with virtual generated columns is exploitable. Users who are + * concerned about this can avoid selecting from views, but telling them to + * avoid selecting from tables is less practical. + * + * To address this, this restricts generation expressions for virtual + * generated columns are restricted to using built-in functions and types. We + * assume that built-in functions and types cannot be exploited for this + * purpose. Note the overall security also requires that all functions in use + * a immutable. (For example, there are some built-in non-immutable functions + * that can run arbitrary SQL.) The immutability is checked elsewhere, since + * that is a property that needs to hold independent of security + * considerations. + * + * In the future, this could be expanded by some new mechanism to declare + * other functions and types as safe or trusted for this purpose, but that is + * to be designed. + */ + +/* + * Callback for check_functions_in_node() that determines whether a function + * is user-defined. + */ +static bool +contains_user_functions_checker(Oid func_id, void *context) +{ + return (func_id >= FirstUnpinnedObjectId); +} + +/* + * Checks for all the things we don't want in the generation expressions of + * virtual generated columns for security reasons. Errors out if it finds + * one. + */ +static bool +check_virtual_generated_security_walker(Node *node, void *context) +{ + ParseState *pstate = context; + + if (node == NULL) + return false; + + if (!IsA(node, List)) + { + if (check_functions_in_node(node, contains_user_functions_checker, NULL)) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("generation expression uses user-defined function"), + errdetail("Virtual generated columns that make use of user-defined functions are not yet supported."), + parser_errposition(pstate, exprLocation(node))); + + /* + * check_functions_in_node() doesn't check some node types (see + * comment there). We handle CoerceToDomain and MinMaxExpr by + * checking for built-in types. 
The other listed node types cannot + * call user-definable SQL-visible functions. + * + * We furthermore need this type check to handle built-in, immutable + * polymorphic functions such as array_eq(). + */ + if (exprType(node) >= FirstUnpinnedObjectId) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("generation expression uses user-defined type"), + errdetail("Virtual generated columns that make use of user-defined types are not yet supported."), + parser_errposition(pstate, exprLocation(node))); + } + + return expression_tree_walker(node, check_virtual_generated_security_walker, context); +} + +static void +check_virtual_generated_security(ParseState *pstate, Node *node) +{ + check_virtual_generated_security_walker(node, pstate); +} + +/* * Take a raw default and convert it to a cooked format ready for * storage. * @@ -3253,6 +3343,10 @@ cookDefault(ParseState *pstate, ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("generation expression is not immutable"))); + + /* Check security of expressions for virtual generated column */ + if (attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + check_virtual_generated_security(pstate, expr); } else { diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 739a92bdcc1..c4029a4f3d3 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -800,11 +800,11 @@ index_create(Relation heapRelation, errmsg("user-defined indexes on system catalog tables are not supported"))); /* - * Btree text_pattern_ops uses text_eq as the equality operator, which is - * fine as long as the collation is deterministic; text_eq then reduces to + * Btree text_pattern_ops uses texteq as the equality operator, which is + * fine as long as the collation is deterministic; texteq then reduces to * bitwise equality and so it is semantically compatible with the other * operators and functions in that opclass. But with a nondeterministic - * collation, text_eq could yield results that are incompatible with the + * collation, texteq could yield results that are incompatible with the * actual behavior of the index (which is determined by the opclass's * comparison function). We prevent such problems by refusing creation of * an index with that opclass and a nondeterministic collation. @@ -814,7 +814,7 @@ index_create(Relation heapRelation, * opclasses as incompatible with nondeterminism; but for now, this small * hack suffices. * - * Another solution is to use a special operator, not text_eq, as the + * Another solution is to use a special operator, not texteq, as the * equality opclass member; but that is undesirable because it would * prevent index usage in many queries that work fine today. */ @@ -3020,7 +3020,7 @@ index_build(Relation heapRelation, /* * Determine worker process details for parallel CREATE INDEX. Currently, - * only btree and BRIN have support for parallel builds. + * only btree, GIN, and BRIN have support for parallel builds. * * Note that planner considers parallel safety for us. 
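The net effect of the heap.c checks above is easiest to see at the SQL level. An illustrative sketch (table and function names are invented; the error texts are the ones added in this patch):

    -- A user-defined function is rejected, even if immutable:
    CREATE FUNCTION f(int) RETURNS int IMMUTABLE LANGUAGE SQL RETURN $1 + 1;
    CREATE TABLE t1 (a int, b int GENERATED ALWAYS AS (f(a)) VIRTUAL);
    -- ERROR:  generation expression uses user-defined function
    -- Built-in functions and types remain allowed:
    CREATE TABLE t2 (a int, b int GENERATED ALWAYS AS (abs(a)) VIRTUAL);
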
*/ diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 1395032413e..244acf52f36 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -103,6 +103,7 @@ GetSubscription(Oid subid, bool missing_ok) sub->passwordrequired = subform->subpasswordrequired; sub->runasowner = subform->subrunasowner; sub->failover = subform->subfailover; + sub->retaindeadtuples = subform->subretaindeadtuples; /* Get conninfo */ datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, @@ -319,7 +320,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state, */ void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, - XLogRecPtr sublsn) + XLogRecPtr sublsn, bool already_locked) { Relation rel; HeapTuple tup; @@ -327,9 +328,24 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state, Datum values[Natts_pg_subscription_rel]; bool replaces[Natts_pg_subscription_rel]; - LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + if (already_locked) + { +#ifdef USE_ASSERT_CHECKING + LOCKTAG tag; - rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId, + RowExclusiveLock, true)); + SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0); + Assert(LockHeldByMe(&tag, AccessShareLock, true)); +#endif + + rel = table_open(SubscriptionRelRelationId, NoLock); + } + else + { + LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + } /* Try finding existing mapping. */ tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 15efb02badb..1b3c5a55882 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -666,6 +666,14 @@ GRANT SELECT ON pg_shmem_allocations_numa TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations_numa() TO pg_read_all_stats; +CREATE VIEW pg_dsm_registry_allocations AS + SELECT * FROM pg_get_dsm_registry_allocations(); + +REVOKE ALL ON pg_dsm_registry_allocations FROM PUBLIC; +GRANT SELECT ON pg_dsm_registry_allocations TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_dsm_registry_allocations() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_dsm_registry_allocations() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); @@ -674,11 +682,6 @@ GRANT SELECT ON pg_backend_memory_contexts TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_backend_memory_contexts() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_backend_memory_contexts() TO pg_read_all_stats; -REVOKE EXECUTE ON FUNCTION - pg_get_process_memory_contexts(integer, boolean, float) FROM PUBLIC; -GRANT EXECUTE ON FUNCTION - pg_get_process_memory_contexts(integer, boolean, float) TO pg_read_all_stats; - -- Statistics views CREATE VIEW pg_stat_all_tables AS @@ -900,7 +903,7 @@ CREATE VIEW pg_stat_activity AS S.wait_event, S.state, S.backend_xid, - s.backend_xmin, + S.backend_xmin, S.query_id, S.query, S.backend_type @@ -1324,7 +1327,10 @@ CREATE VIEW pg_stat_progress_basebackup AS CASE S.param2 WHEN -1 THEN NULL ELSE S.param2 END AS backup_total, S.param3 AS backup_streamed, S.param4 AS tablespaces_total, - S.param5 AS tablespaces_streamed + S.param5 AS tablespaces_streamed, + CASE S.param6 WHEN 1 THEN 'full' 
+ WHEN 2 THEN 'incremental' + END AS backup_type FROM pg_stat_get_progress_info('BASEBACKUP') AS S; @@ -1383,7 +1389,8 @@ REVOKE ALL ON pg_subscription FROM public; GRANT SELECT (oid, subdbid, subskiplsn, subname, subowner, subenabled, subbinary, substream, subtwophasestate, subdisableonerr, subpasswordrequired, subrunasowner, subfailover, - subslotname, subsynccommit, subpublications, suborigin) + subretaindeadtuples, subslotname, subsynccommit, + subpublications, suborigin) ON pg_subscription TO public; CREATE VIEW pg_stat_subscription_stats AS @@ -1395,6 +1402,7 @@ CREATE VIEW pg_stat_subscription_stats AS ss.confl_insert_exists, ss.confl_update_origin_differs, ss.confl_update_exists, + ss.confl_update_deleted, ss.confl_update_missing, ss.confl_delete_origin_differs, ss.confl_delete_missing, diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 4fffb76e557..40d66537ad7 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -76,7 +76,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, - VacuumParams *params, List *va_cols, + const VacuumParams params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel); static void compute_index_stats(Relation onerel, double totalrows, @@ -107,7 +107,7 @@ static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); */ void analyze_rel(Oid relid, RangeVar *relation, - VacuumParams *params, List *va_cols, bool in_outer_xact, + const VacuumParams params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy) { Relation onerel; @@ -116,7 +116,7 @@ analyze_rel(Oid relid, RangeVar *relation, BlockNumber relpages = 0; /* Select logging level */ - if (params->options & VACOPT_VERBOSE) + if (params.options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; @@ -138,8 +138,8 @@ analyze_rel(Oid relid, RangeVar *relation, * * Make sure to generate only logs for ANALYZE in this case. */ - onerel = vacuum_open_relation(relid, relation, params->options & ~(VACOPT_VACUUM), - params->log_min_duration >= 0, + onerel = vacuum_open_relation(relid, relation, params.options & ~(VACOPT_VACUUM), + params.log_min_duration >= 0, ShareUpdateExclusiveLock); /* leave if relation could not be opened or locked */ @@ -155,7 +155,7 @@ analyze_rel(Oid relid, RangeVar *relation, */ if (!vacuum_is_permitted_for_relation(RelationGetRelid(onerel), onerel->rd_rel, - params->options & ~VACOPT_VACUUM)) + params.options & ~VACOPT_VACUUM)) { relation_close(onerel, ShareUpdateExclusiveLock); return; @@ -227,7 +227,7 @@ analyze_rel(Oid relid, RangeVar *relation, else { /* No need for a WARNING if we already complained during VACUUM */ - if (!(params->options & VACOPT_VACUUM)) + if (!(params.options & VACOPT_VACUUM)) ereport(WARNING, (errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables", RelationGetRelationName(onerel)))); @@ -275,7 +275,7 @@ analyze_rel(Oid relid, RangeVar *relation, * appropriate acquirefunc for each child table. 
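Both monitoring additions above are queried with plain SQL, assuming the pg_read_all_stats grants shown in the system_views.sql hunks:

    -- backup_type is derived from progress parameter 6: 'full' or 'incremental'
    SELECT pid, phase, backup_type FROM pg_stat_progress_basebackup;
    -- thin wrapper view over pg_get_dsm_registry_allocations()
    SELECT * FROM pg_dsm_registry_allocations;
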
*/ static void -do_analyze_rel(Relation onerel, VacuumParams *params, +do_analyze_rel(Relation onerel, const VacuumParams params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel) @@ -309,9 +309,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, PgStat_Counter startreadtime = 0; PgStat_Counter startwritetime = 0; - verbose = (params->options & VACOPT_VERBOSE) != 0; + verbose = (params.options & VACOPT_VERBOSE) != 0; instrument = (verbose || (AmAutoVacuumWorkerProcess() && - params->log_min_duration >= 0)); + params.log_min_duration >= 0)); if (inh) ereport(elevel, (errmsg("analyzing \"%s.%s\" inheritance tree", @@ -690,8 +690,8 @@ do_analyze_rel(Relation onerel, VacuumParams *params, * only do it for inherited stats. (We're never called for not-inherited * stats on partitioned tables anyway.) * - * Reset the changes_since_analyze counter only if we analyzed all - * columns; otherwise, there is still work for auto-analyze to do. + * Reset the mod_since_analyze counter only if we analyzed all columns; + * otherwise, there is still work for auto-analyze to do. */ if (!inh) pgstat_report_analyze(onerel, totalrows, totaldeadrows, @@ -706,7 +706,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params, * amvacuumcleanup() when called in ANALYZE-only mode. The only exception * among core index AMs is GIN/ginvacuumcleanup(). */ - if (!(params->options & VACOPT_VACUUM)) + if (!(params.options & VACOPT_VACUUM)) { for (ind = 0; ind < nindexes; ind++) { @@ -736,9 +736,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, { TimestampTz endtime = GetCurrentTimestamp(); - if (verbose || params->log_min_duration == 0 || + if (verbose || params.log_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, - params->log_min_duration)) + params.log_min_duration)) { long delay_in_ms; WalUsage walusage; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 54a08e4102e..b55221d44cd 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -917,7 +917,7 @@ copy_table_data(Relation NewHeap, Relation OldHeap, Relation OldIndex, bool verb * not to be aggressive about this. */ memset(¶ms, 0, sizeof(VacuumParams)); - vacuum_get_cutoffs(OldHeap, ¶ms, &cutoffs); + vacuum_get_cutoffs(OldHeap, params, &cutoffs); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 74ae42b19a7..fae9c41db65 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -322,11 +322,13 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, } /* - * Extract a CopyHeaderChoice value from a DefElem. This is like - * defGetBoolean() but also accepts the special value "match". + * Extract the CopyFormatOptions.header_line value from a DefElem. + * + * Parses the HEADER option for COPY, which can be a boolean, a non-negative + * integer (number of lines to skip), or the special value "match". */ -static CopyHeaderChoice -defGetCopyHeaderChoice(DefElem *def, bool is_from) +static int +defGetCopyHeaderOption(DefElem *def, bool is_from) { /* * If no parameter value given, assume "true" is meant. @@ -335,20 +337,27 @@ defGetCopyHeaderChoice(DefElem *def, bool is_from) return COPY_HEADER_TRUE; /* - * Allow 0, 1, "true", "false", "on", "off", or "match". + * Allow 0, 1, "true", "false", "on", "off", a non-negative integer, or + * "match". 
*/ switch (nodeTag(def->arg)) { case T_Integer: - switch (intVal(def->arg)) { - case 0: - return COPY_HEADER_FALSE; - case 1: - return COPY_HEADER_TRUE; - default: - /* otherwise, error out below */ - break; + int ival = intVal(def->arg); + + if (ival < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("a negative integer value cannot be " + "specified for %s", def->defname))); + + if (!is_from && ival > 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use multi-line header in COPY TO"))); + + return ival; } break; default: @@ -381,7 +390,8 @@ defGetCopyHeaderChoice(DefElem *def, bool is_from) } ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s requires a Boolean value or \"match\"", + errmsg("%s requires a Boolean value, a non-negative integer, " + "or the string \"match\"", def->defname))); return COPY_HEADER_FALSE; /* keep compiler quiet */ } @@ -566,7 +576,7 @@ ProcessCopyOptions(ParseState *pstate, if (header_specified) errorConflictingDefElem(defel, pstate); header_specified = true; - opts_out->header_line = defGetCopyHeaderChoice(defel, is_from); + opts_out->header_line = defGetCopyHeaderOption(defel, is_from); } else if (strcmp(defel->defname, "quote") == 0) { @@ -769,7 +779,7 @@ ProcessCopyOptions(ParseState *pstate, errmsg("COPY delimiter cannot be \"%s\"", opts_out->delim))); /* Check header */ - if (opts_out->binary && opts_out->header_line) + if (opts_out->binary && opts_out->header_line != COPY_HEADER_FALSE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), /*- translator: %s is the name of a COPY option, e.g. ON_ERROR */ diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index f5fc346e201..b1ae97b833d 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -771,21 +771,30 @@ static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv) { int fldct; - bool done; + bool done = false; /* only available for text or csv input */ Assert(!cstate->opts.binary); /* on input check that the header line is correct if needed */ - if (cstate->cur_lineno == 0 && cstate->opts.header_line) + if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE) { ListCell *cur; TupleDesc tupDesc; + int lines_to_skip = cstate->opts.header_line; + + /* If set to "match", one header line is skipped */ + if (cstate->opts.header_line == COPY_HEADER_MATCH) + lines_to_skip = 1; tupDesc = RelationGetDescr(cstate->rel); - cstate->cur_lineno++; - done = CopyReadLine(cstate, is_csv); + for (int i = 0; i < lines_to_skip; i++) + { + cstate->cur_lineno++; + if ((done = CopyReadLine(cstate, is_csv))) + break; + } if (cstate->opts.header_line == COPY_HEADER_MATCH) { @@ -1538,7 +1547,7 @@ GetDecimalFromHex(char hex) if (isdigit((unsigned char) hex)) return hex - '0'; else - return tolower((unsigned char) hex) - 'a' + 10; + return pg_ascii_tolower((unsigned char) hex) - 'a' + 10; } /* diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c index f87e405351d..67b94b91cae 100644 --- a/src/backend/commands/copyto.c +++ b/src/backend/commands/copyto.c @@ -199,7 +199,7 @@ CopyToTextLikeStart(CopyToState cstate, TupleDesc tupDesc) cstate->file_encoding); /* if a header has been requested send the line */ - if (cstate->opts.header_line) + if (cstate->opts.header_line == COPY_HEADER_TRUE) { ListCell *cur; bool hdr_delim = false; @@ -835,7 +835,7 @@ BeginCopyTo(ParseState 
*pstate, ((DR_copy *) dest)->cstate = cstate; /* Create a QueryDesc requesting no output */ - cstate->queryDesc = CreateQueryDesc(plan, NULL, pstate->p_sourcetext, + cstate->queryDesc = CreateQueryDesc(plan, pstate->p_sourcetext, GetActiveSnapshot(), InvalidSnapshot, dest, NULL, NULL, 0); @@ -845,8 +845,7 @@ BeginCopyTo(ParseState *pstate, * * ExecutorStart computes a result tupdesc for us */ - if (!ExecutorStart(cstate->queryDesc, 0)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); + ExecutorStart(cstate->queryDesc, 0); tupDesc = cstate->queryDesc->tupDesc; } diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index 0a4155773eb..dfd2ab8e862 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -334,13 +334,12 @@ ExecCreateTableAs(ParseState *pstate, CreateTableAsStmt *stmt, UpdateActiveSnapshotCommandId(); /* Create a QueryDesc, redirecting output to our tuple receiver */ - queryDesc = CreateQueryDesc(plan, NULL, pstate->p_sourcetext, + queryDesc = CreateQueryDesc(plan, pstate->p_sourcetext, GetActiveSnapshot(), InvalidSnapshot, dest, params, queryEnv, 0); /* call ExecutorStart to prepare the plan for execution */ - if (!ExecutorStart(queryDesc, GetIntoRelEFlags(into))) - elog(ERROR, "ExecutorStart() failed unexpectedly"); + ExecutorStart(queryDesc, GetIntoRelEFlags(into)); /* run the plan to completion */ ExecutorRun(queryDesc, ForwardScanDirection, 0); diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 5fbbcdaabb1..92a396b8406 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -570,8 +570,8 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * any CREATE DATABASE commands. */ if (!IsBinaryUpgrade) - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | - CHECKPOINT_WAIT | CHECKPOINT_FLUSH_ALL); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | + CHECKPOINT_WAIT | CHECKPOINT_FLUSH_UNLOGGED); /* * Iterate through all tablespaces of the template database, and copy each @@ -673,7 +673,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, * strategy that avoids these problems. 
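With defGetCopyHeaderOption() above, HEADER accepts a non-negative line count on input, while COPY TO keeps only the boolean form. An illustrative session (paths invented; error texts taken from the hunks above):

    COPY tab FROM '/tmp/in.csv' WITH (FORMAT csv, HEADER 2);   -- skip two lines
    COPY tab FROM '/tmp/in.csv' WITH (FORMAT csv, HEADER match);
    COPY tab TO '/tmp/out.csv' WITH (FORMAT csv, HEADER 2);
    -- ERROR:  cannot use multi-line header in COPY TO
    COPY tab FROM '/tmp/in.csv' WITH (HEADER -1);
    -- ERROR:  a negative integer value cannot be specified for header
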
*/ if (!IsBinaryUpgrade) - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); } @@ -1052,7 +1052,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) dbctype = src_ctype; if (dblocprovider == '\0') dblocprovider = src_locprovider; - if (dblocale == NULL) + if (dblocale == NULL && dblocprovider == src_locprovider) dblocale = src_locale; if (dbicurules == NULL) dbicurules = src_icurules; @@ -1065,16 +1065,41 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) /* Check that the chosen locales are valid, and get canonical spellings */ if (!check_locale(LC_COLLATE, dbcollate, &canonname)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), - errhint("If the locale name is specific to ICU, use ICU_LOCALE."))); + { + if (dblocprovider == COLLPROVIDER_BUILTIN) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), + errhint("If the locale name is specific to the builtin provider, use BUILTIN_LOCALE."))); + else if (dblocprovider == COLLPROVIDER_ICU) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate), + errhint("If the locale name is specific to the ICU provider, use ICU_LOCALE."))); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_COLLATE locale name: \"%s\"", dbcollate))); + } dbcollate = canonname; if (!check_locale(LC_CTYPE, dbctype, &canonname)) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), - errhint("If the locale name is specific to ICU, use ICU_LOCALE."))); + { + if (dblocprovider == COLLPROVIDER_BUILTIN) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), + errhint("If the locale name is specific to the builtin provider, use BUILTIN_LOCALE."))); + else if (dblocprovider == COLLPROVIDER_ICU) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype), + errhint("If the locale name is specific to the ICU provider, use ICU_LOCALE."))); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid LC_CTYPE locale name: \"%s\"", dbctype))); + } + dbctype = canonname; check_encoding_locale_matches(encoding, dbcollate, dbctype); @@ -1845,7 +1870,7 @@ dropdb(const char *dbname, bool missing_ok, bool force) * Force a checkpoint to make sure the checkpointer has received the * message sent by ForgetDatabaseSyncRequests. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* Close all smgr fds in all backends. */ WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); @@ -2095,8 +2120,8 @@ movedb(const char *dbname, const char *tblspcname) * On Windows, this also ensures that background procs don't hold any open * files, which would cause rmdir() to fail. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT - | CHECKPOINT_FLUSH_ALL); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_UNLOGGED); /* Close all smgr fds in all backends. 
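The reworked createdb() validation selects a hint matching the database's locale provider; for instance (locale name deliberately invalid, and assuming the libc locale check fires first):

    CREATE DATABASE d TEMPLATE template0
        LOCALE_PROVIDER icu ICU_LOCALE 'en' LC_COLLATE 'bogus' LC_CTYPE 'C';
    -- ERROR:  invalid LC_COLLATE locale name: "bogus"
    -- HINT:  If the locale name is specific to the ICU provider, use ICU_LOCALE.
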
*/ WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE)); @@ -2227,7 +2252,7 @@ movedb(const char *dbname, const char *tblspcname) * any unlogged operations done in the new DB tablespace before the * next checkpoint. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* * Force synchronous commit, thus minimizing the window between diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 786ee865f14..8345bc0264b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -369,8 +369,7 @@ standard_ExplainOneQuery(Query *query, int cursorOptions, } /* run it (if needed) and produce output */ - ExplainOnePlan(plan, NULL, NULL, -1, into, es, queryString, params, - queryEnv, + ExplainOnePlan(plan, into, es, queryString, params, queryEnv, &planduration, (es->buffers ? &bufusage : NULL), es->memory ? &mem_counters : NULL); } @@ -492,9 +491,7 @@ ExplainOneUtility(Node *utilityStmt, IntoClause *into, ExplainState *es, * to call it. */ void -ExplainOnePlan(PlannedStmt *plannedstmt, CachedPlan *cplan, - CachedPlanSource *plansource, int query_index, - IntoClause *into, ExplainState *es, +ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, const char *queryString, ParamListInfo params, QueryEnvironment *queryEnv, const instr_time *planduration, const BufferUsage *bufusage, @@ -550,7 +547,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, CachedPlan *cplan, dest = None_Receiver; /* Create a QueryDesc for the query */ - queryDesc = CreateQueryDesc(plannedstmt, cplan, queryString, + queryDesc = CreateQueryDesc(plannedstmt, queryString, GetActiveSnapshot(), InvalidSnapshot, dest, params, queryEnv, instrument_option); @@ -564,17 +561,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, CachedPlan *cplan, if (into) eflags |= GetIntoRelEFlags(into); - /* Prepare the plan for execution. */ - if (queryDesc->cplan) - { - ExecutorStartCachedPlan(queryDesc, eflags, plansource, query_index); - Assert(queryDesc->planstate); - } - else - { - if (!ExecutorStart(queryDesc, eflags)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); - } + /* call ExecutorStart to prepare the plan for execution */ + ExecutorStart(queryDesc, eflags); /* Execute the plan for statistics if asked for */ if (es->analyze) @@ -823,14 +811,10 @@ ExplainPrintPlan(ExplainState *es, QueryDesc *queryDesc) * the queryid in any of the EXPLAIN plans to keep stable the results * generated by regression test suites. */ - if (es->verbose && queryDesc->plannedstmt->queryId != UINT64CONST(0) && + if (es->verbose && queryDesc->plannedstmt->queryId != INT64CONST(0) && compute_query_id != COMPUTE_QUERY_ID_REGRESS) { - /* - * Output the queryid as an int64 rather than a uint64 so we match - * what would be seen in the BIGINT pg_stat_statements.queryid column. 
- */ - ExplainPropertyInteger("Query Identifier", NULL, (int64) + ExplainPropertyInteger("Query Identifier", NULL, queryDesc->plannedstmt->queryId, es); } } @@ -1232,6 +1216,10 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) if (((ModifyTable *) plan)->exclRelRTI) *rels_used = bms_add_member(*rels_used, ((ModifyTable *) plan)->exclRelRTI); + /* Ensure Vars used in RETURNING will have refnames */ + if (plan->targetlist) + *rels_used = bms_add_member(*rels_used, + linitial_int(((ModifyTable *) plan)->resultRelations)); break; case T_Append: *rels_used = bms_add_members(*rels_used, @@ -3594,6 +3582,7 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) { Plan *plan = ((PlanState *) mstate)->plan; + Memoize *mplan = (Memoize *) plan; ListCell *lc; List *context; StringInfoData keystr; @@ -3614,7 +3603,7 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) plan, ancestors); - foreach(lc, ((Memoize *) plan)->param_exprs) + foreach(lc, mplan->param_exprs) { Node *expr = (Node *) lfirst(lc); @@ -3630,6 +3619,24 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) pfree(keystr.data); + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Estimates: capacity=%u distinct keys=%.0f lookups=%.0f hit percent=%.2f%%\n", + mplan->est_entries, mplan->est_unique_keys, + mplan->est_calls, mplan->est_hit_ratio * 100.0); + } + else + { + ExplainPropertyUInteger("Estimated Capacity", NULL, mplan->est_entries, es); + ExplainPropertyFloat("Estimated Distinct Lookup Keys", NULL, mplan->est_unique_keys, 0, es); + ExplainPropertyFloat("Estimated Lookups", NULL, mplan->est_calls, 0, es); + ExplainPropertyFloat("Estimated Hit Percent", NULL, mplan->est_hit_ratio * 100.0, 2, es); + } + } + if (!es->analyze) return; diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 73c52e970f6..e6f9ab6dfd6 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -993,13 +993,11 @@ execute_sql_string(const char *sql, const char *filename) QueryDesc *qdesc; qdesc = CreateQueryDesc(stmt, - NULL, sql, GetActiveSnapshot(), NULL, dest, NULL, NULL, 0); - if (!ExecutorStart(qdesc, 0)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); + ExecutorStart(qdesc, 0); ExecutorRun(qdesc, ForwardScanDirection, 0); ExecutorFinish(qdesc); ExecutorEnd(qdesc); diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index c14e038d54f..77f8461f42e 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -71,15 +71,26 @@ optionListToArray(List *options) foreach(cell, options) { DefElem *def = lfirst(cell); + const char *name; const char *value; Size len; text *t; + name = def->defname; value = defGetString(def); - len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value); + + /* Insist that name not contain "=", else "a=b=c" is ambiguous */ + if (strchr(name, '=') != NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid option name \"%s\": must not contain \"=\"", + name))); + + len = VARHDRSZ + strlen(name) + 1 + strlen(value); + /* +1 leaves room for sprintf's trailing null */ t = palloc(len + 1); SET_VARSIZE(t, len); - sprintf(VARDATA(t), "%s=%s", def->defname, value); + sprintf(VARDATA(t), "%s=%s", name, value); astate = accumArrayResult(astate, PointerGetDatum(t), false, TEXTOID, @@ -1577,6 +1588,7 @@ 
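Both EXPLAIN changes surface in ordinary text output: the query ID is now printed directly from the int64 queryId, and Memoize nodes gain an Estimates line whenever costs are shown. A sketch with invented table names and plan numbers:

    SET compute_query_id = on;
    EXPLAIN (VERBOSE, COSTS) SELECT * FROM t1 JOIN t2 ON t1.id = t2.t1_id;
    --   ->  Memoize
    --         Cache Key: t1.id
    --         Estimates: capacity=1024 distinct keys=300 lookups=10000 hit percent=97.00%
    -- Query Identifier: -1147616880456321454
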
ImportForeignSchema(ImportForeignSchemaStmt *stmt) pstmt->utilityStmt = (Node *) cstmt; pstmt->stmt_location = rs->stmt_location; pstmt->stmt_len = rs->stmt_len; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* Execute statement */ ProcessUtility(pstmt, cmd, false, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index d962fe392cd..6f753ab6d7a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -2469,8 +2469,8 @@ GetOperatorFromCompareType(Oid opclass, Oid rhstype, CompareType cmptype, cmptype == COMPARE_EQ ? errmsg("could not identify an equality operator for type %s", format_type_be(opcintype)) : cmptype == COMPARE_OVERLAP ? errmsg("could not identify an overlaps operator for type %s", format_type_be(opcintype)) : cmptype == COMPARE_CONTAINED_BY ? errmsg("could not identify a contained-by operator for type %s", format_type_be(opcintype)) : 0, - errdetail("Could not translate compare type %d for operator family \"%s\", input type %s, access method \"%s\".", - cmptype, get_opfamily_name(opfamily, false), format_type_be(opcintype), get_am_name(amid))); + errdetail("Could not translate compare type %d for operator family \"%s\" of access method \"%s\".", + cmptype, get_opfamily_name(opfamily, false), get_am_name(amid))); /* * We parameterize rhstype so foreign keys can ask for a <@ operator @@ -2592,7 +2592,9 @@ makeObjectName(const char *name1, const char *name2, const char *label) * constraint names.) * * Note: it is theoretically possible to get a collision anyway, if someone - * else chooses the same name concurrently. This is fairly unlikely to be + * else chooses the same name concurrently. We shorten the race condition + * window by checking for conflicting relations using SnapshotDirty, but + * that doesn't close the window entirely. This is fairly unlikely to be * a problem in practice, especially if one is holding an exclusive lock on * the relation identified by name1. However, if choosing multiple names * within a single command, you'd better create the new object and do @@ -2608,15 +2610,45 @@ ChooseRelationName(const char *name1, const char *name2, int pass = 0; char *relname = NULL; char modlabel[NAMEDATALEN]; + SnapshotData SnapshotDirty; + Relation pgclassrel; + + /* prepare to search pg_class with a dirty snapshot */ + InitDirtySnapshot(SnapshotDirty); + pgclassrel = table_open(RelationRelationId, AccessShareLock); /* try the unmodified label first */ strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { + ScanKeyData key[2]; + SysScanDesc scan; + bool collides; + relname = makeObjectName(name1, name2, modlabel); - if (!OidIsValid(get_relname_relid(relname, namespaceid))) + /* is there any conflicting relation name? 
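The optionListToArray() hardening above closes a real ambiguity: options are flattened into "name=value" text items, so a name containing "=" could not be split apart again. Illustrative (wrapper and server names invented):

    CREATE FOREIGN DATA WRAPPER dummy;
    CREATE SERVER s FOREIGN DATA WRAPPER dummy OPTIONS ("a=b" 'c');
    -- ERROR:  invalid option name "a=b": must not contain "="
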
*/ + ScanKeyInit(&key[0], + Anum_pg_class_relname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(relname)); + ScanKeyInit(&key[1], + Anum_pg_class_relnamespace, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(namespaceid)); + + scan = systable_beginscan(pgclassrel, ClassNameNspIndexId, + true /* indexOK */ , + &SnapshotDirty, + 2, key); + + collides = HeapTupleIsValid(systable_getnext(scan)); + + systable_endscan(scan); + + /* break out of loop if no conflict */ + if (!collides) { if (!isconstraint || !ConstraintNameExists(relname, namespaceid)) @@ -2628,6 +2660,8 @@ ChooseRelationName(const char *name1, const char *name2, snprintf(modlabel, sizeof(modlabel), "%s%d", label, ++pass); } + table_close(pgclassrel, AccessShareLock); + return relname; } @@ -4226,7 +4260,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein false); /* - * Updating pg_index might involve TOAST table access, so ensure we + * Swapping the indexes might involve TOAST table access, so ensure we * have a valid snapshot. */ PushActiveSnapshot(GetTransactionSnapshot()); diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index e7854add178..188e26f0e6e 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -438,13 +438,12 @@ refresh_matview_datafill(DestReceiver *dest, Query *query, UpdateActiveSnapshotCommandId(); /* Create a QueryDesc, redirecting output to our tuple receiver */ - queryDesc = CreateQueryDesc(plan, NULL, queryString, + queryDesc = CreateQueryDesc(plan, queryString, GetActiveSnapshot(), InvalidSnapshot, dest, NULL, NULL, 0); /* call ExecutorStart to prepare the plan for execution */ - if (!ExecutorStart(queryDesc, 0)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); + ExecutorStart(queryDesc, 0); /* run the plan */ ExecutorRun(queryDesc, ForwardScanDirection, 0); @@ -836,7 +835,8 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, if (!foundUniqueIndex) ereport(ERROR, errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not find suitable unique index on materialized view")); + errmsg("could not find suitable unique index on materialized view \"%s\"", + RelationGetRelationName(matviewRel))); appendStringInfoString(&querybuf, " AND newdata.* OPERATOR(pg_catalog.*=) mv.*) " diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index 4c2ac045224..e7c8171c102 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -117,7 +117,6 @@ PerformCursorOpen(ParseState *pstate, DeclareCursorStmt *cstmt, ParamListInfo pa queryString, CMDTAG_SELECT, /* cursor's query is always a SELECT */ list_make1(plan), - NULL, NULL); /*---------- diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index bf7d2b2309f..34b6410d6a2 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -205,8 +205,7 @@ ExecuteQuery(ParseState *pstate, query_string, entry->plansource->commandTag, plan_list, - cplan, - entry->plansource); + cplan); /* * For CREATE TABLE ... 
AS EXECUTE, we must verify that the prepared @@ -586,7 +585,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, MemoryContextCounters mem_counters; MemoryContext planner_ctx = NULL; MemoryContext saved_ctx = NULL; - int query_index = 0; if (es->memory) { @@ -659,8 +657,7 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, PlannedStmt *pstmt = lfirst_node(PlannedStmt, p); if (pstmt->commandType != CMD_UTILITY) - ExplainOnePlan(pstmt, cplan, entry->plansource, query_index, - into, es, query_string, paramLI, pstate->p_queryEnv, + ExplainOnePlan(pstmt, into, es, query_string, paramLI, pstate->p_queryEnv, &planduration, (es->buffers ? &bufusage : NULL), es->memory ? &mem_counters : NULL); else @@ -671,8 +668,6 @@ ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, ExplainState *es, /* Separate plans with an appropriate separator */ if (lnext(plan_list, p) != NULL) ExplainSeparatePlans(es); - - query_index++; } if (estate) diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index 0b23d94c38e..803c26ab216 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -2113,25 +2113,25 @@ AlterPublicationOwner_oid(Oid pubid, Oid newOwnerId) static char defGetGeneratedColsOption(DefElem *def) { - char *sval; + char *sval = ""; /* - * If no parameter value given, assume "stored" is meant. + * A parameter value is required. */ - if (!def->arg) - return PUBLISH_GENCOLS_STORED; - - sval = defGetString(def); + if (def->arg) + { + sval = defGetString(def); - if (pg_strcasecmp(sval, "none") == 0) - return PUBLISH_GENCOLS_NONE; - if (pg_strcasecmp(sval, "stored") == 0) - return PUBLISH_GENCOLS_STORED; + if (pg_strcasecmp(sval, "none") == 0) + return PUBLISH_GENCOLS_NONE; + if (pg_strcasecmp(sval, "stored") == 0) + return PUBLISH_GENCOLS_STORED; + } ereport(ERROR, errcode(ERRCODE_SYNTAX_ERROR), - errmsg("%s requires a \"none\" or \"stored\" value", - def->defname)); + errmsg("invalid value for publication parameter \"%s\": \"%s\"", def->defname, sval), + errdetail("Valid values are \"%s\" and \"%s\".", "none", "stored")); return PUBLISH_GENCOLS_NONE; /* keep compiler quiet */ } diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 546160f0941..0f03d9743d2 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -215,6 +215,7 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, wrapper->utilityStmt = stmt; wrapper->stmt_location = stmt_location; wrapper->stmt_len = stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; /* do this step */ ProcessUtility(wrapper, diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index 4aec73bcc6b..cd6c3684482 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/htup_details.h" #include "access/table.h" #include "access/twophase.h" @@ -71,8 +72,9 @@ #define SUBOPT_PASSWORD_REQUIRED 0x00000800 #define SUBOPT_RUN_AS_OWNER 0x00001000 #define SUBOPT_FAILOVER 0x00002000 -#define SUBOPT_LSN 0x00004000 -#define SUBOPT_ORIGIN 0x00008000 +#define SUBOPT_RETAIN_DEAD_TUPLES 0x00004000 +#define SUBOPT_LSN 0x00008000 +#define SUBOPT_ORIGIN 0x00010000 /* check if the 'val' has 'bits' set */ #define IsSet(val, bits) (((val) & (bits)) == (bits)) @@ -98,6 +100,7 @@ typedef struct SubOpts 
bool passwordrequired; bool runasowner; bool failover; + bool retaindeadtuples; char *origin; XLogRecPtr lsn; } SubOpts; @@ -105,8 +108,10 @@ typedef struct SubOpts static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); static void check_publications_origin(WalReceiverConn *wrconn, List *publications, bool copydata, - char *origin, Oid *subrel_local_oids, - int subrel_count, char *subname); + bool retain_dead_tuples, char *origin, + Oid *subrel_local_oids, int subrel_count, + char *subname); +static void check_pub_dead_tuple_retention(WalReceiverConn *wrconn); static void check_duplicates_in_publist(List *publist, Datum *datums); static List *merge_publications(List *oldpublist, List *newpublist, bool addpub, const char *subname); static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname, char *err); @@ -162,6 +167,8 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->runasowner = false; if (IsSet(supported_opts, SUBOPT_FAILOVER)) opts->failover = false; + if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + opts->retaindeadtuples = false; if (IsSet(supported_opts, SUBOPT_ORIGIN)) opts->origin = pstrdup(LOGICALREP_ORIGIN_ANY); @@ -210,7 +217,7 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, if (strcmp(opts->slot_name, "none") == 0) opts->slot_name = NULL; else - ReplicationSlotValidateName(opts->slot_name, ERROR); + ReplicationSlotValidateName(opts->slot_name, false, ERROR); } else if (IsSet(supported_opts, SUBOPT_COPY_DATA) && strcmp(defel->defname, "copy_data") == 0) @@ -307,6 +314,15 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->specified_opts |= SUBOPT_FAILOVER; opts->failover = defGetBoolean(defel); } + else if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES) && + strcmp(defel->defname, "retain_dead_tuples") == 0) + { + if (IsSet(opts->specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_RETAIN_DEAD_TUPLES; + opts->retaindeadtuples = defGetBoolean(defel); + } else if (IsSet(supported_opts, SUBOPT_ORIGIN) && strcmp(defel->defname, "origin") == 0) { @@ -563,7 +579,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, SUBOPT_SYNCHRONOUS_COMMIT | SUBOPT_BINARY | SUBOPT_STREAMING | SUBOPT_TWOPHASE_COMMIT | SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | - SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_ORIGIN); + SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); /* @@ -630,6 +647,10 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, stmt->subname))); } + /* Ensure that we can enable retain_dead_tuples */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !opts.enabled, WARNING); + if (!IsSet(opts.specified_opts, SUBOPT_SLOT_NAME) && opts.slot_name == NULL) opts.slot_name = stmt->subname; @@ -670,6 +691,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, values[Anum_pg_subscription_subpasswordrequired - 1] = BoolGetDatum(opts.passwordrequired); values[Anum_pg_subscription_subrunasowner - 1] = BoolGetDatum(opts.runasowner); values[Anum_pg_subscription_subfailover - 1] = BoolGetDatum(opts.failover); + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); values[Anum_pg_subscription_subconninfo - 1] = CStringGetTextDatum(conninfo); if (opts.slot_name) @@ -722,7 +745,11 @@ 
CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, { check_publications(wrconn, publications); check_publications_origin(wrconn, publications, opts.copy_data, - opts.origin, NULL, 0, stmt->subname); + opts.retaindeadtuples, opts.origin, + NULL, 0, stmt->subname); + + if (opts.retaindeadtuples) + check_pub_dead_tuple_retention(wrconn); /* * Set sync state based on if we were asked to do data copy or @@ -881,8 +908,8 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, sizeof(Oid), oid_cmp); check_publications_origin(wrconn, sub->publications, copy_data, - sub->origin, subrel_local_oids, - subrel_count, sub->name); + sub->retaindeadtuples, sub->origin, + subrel_local_oids, subrel_count, sub->name); /* * Rels that we want to remove from subscription and drop any slots @@ -1040,18 +1067,22 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, } /* - * Common checks for altering failover and two_phase options. + * Common checks for altering failover, two_phase, and retain_dead_tuples + * options. */ static void CheckAlterSubOption(Subscription *sub, const char *option, bool slot_needs_update, bool isTopLevel) { + Assert(strcmp(option, "failover") == 0 || + strcmp(option, "two_phase") == 0 || + strcmp(option, "retain_dead_tuples") == 0); + /* - * The checks in this function are required only for failover and - * two_phase options. + * Altering the retain_dead_tuples option does not update the slot on the + * publisher. */ - Assert(strcmp(option, "failover") == 0 || - strcmp(option, "two_phase") == 0); + Assert(!slot_needs_update || strcmp(option, "retain_dead_tuples") != 0); /* * Do not allow changing the option if the subscription is enabled. This @@ -1063,6 +1094,39 @@ CheckAlterSubOption(Subscription *sub, const char *option, * the publisher by the existing walsender, so we could have allowed that * even when the subscription is enabled. But we kept this restriction for * the sake of consistency and simplicity. + * + * Additionally, do not allow changing the retain_dead_tuples option when + * the subscription is enabled to prevent race conditions arising from the + * new option value being acknowledged asynchronously by the launcher and + * apply workers. + * + * Without the restriction, a race condition may arise when a user + * disables and immediately re-enables the retain_dead_tuples option. In + * this case, the launcher might drop the slot upon noticing the disabled + * action, while the apply worker may keep maintaining + * oldest_nonremovable_xid without noticing the option change. During this + * period, a transaction ID wraparound could falsely make this ID appear + * as if it originates from the future w.r.t. the transaction ID stored in + * the slot maintained by the launcher. + * + * Similarly, if the user enables retain_dead_tuples concurrently with the + * launcher starting the worker, the apply worker may start calculating + * oldest_nonremovable_xid before the launcher notices the enable action. + * Consequently, the launcher may update slot.xmin to a newer value than + * that maintained by the worker. In subsequent cycles, upon integrating + * the worker's oldest_nonremovable_xid, the launcher might detect a + * retreat in the calculated xmin, necessitating additional handling. + * + * XXX To address the above race conditions, we can define + * oldest_nonremovable_xid as FullTransactionID and add a check to + * disallow retreating the conflict slot's xmin. 
For now, we keep the + * implementation simple by disallowing changes to retain_dead_tuples, + * but in the future we can change this after some more analysis. + * + * Note that we could restrict only the enabling of retain_dead_tuples to + * avoid the race conditions described above, but we maintain the + * restriction for both enable and disable operations for the sake of + * consistency. */ if (sub->enabled) ereport(ERROR, @@ -1110,6 +1174,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, bool update_tuple = false; bool update_failover = false; bool update_two_phase = false; + bool check_pub_rdt = false; + bool retain_dead_tuples; + char *origin; Subscription *sub; Form_pg_subscription form; bits32 supported_opts; @@ -1137,6 +1204,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, sub = GetSubscription(subid, false); + retain_dead_tuples = sub->retaindeadtuples; + origin = sub->origin; + /* * Don't allow non-superuser modification of a subscription with * password_required=false. @@ -1165,7 +1235,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | - SUBOPT_ORIGIN); + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); @@ -1267,7 +1337,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, IsSet(opts.specified_opts, SUBOPT_SLOT_NAME)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("slot_name and two_phase cannot be altered at the same time"))); + errmsg("\"slot_name\" and \"two_phase\" cannot be altered at the same time"))); /* * Note that workers may still survive even if the @@ -1283,7 +1353,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, if (logicalrep_workers_find(subid, true, true)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot alter two_phase when logical replication worker is still running"), + errmsg("cannot alter \"two_phase\" when logical replication worker is still running"), errhint("Try again after some time."))); /* @@ -1297,7 +1367,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, LookupGXactBySubid(subid)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot disable two_phase when prepared transactions are present"), + errmsg("cannot disable \"two_phase\" when prepared transactions exist"), errhint("Resolve these transactions and try again."))); /* Change system catalog accordingly */ @@ -1325,11 +1395,62 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, replaces[Anum_pg_subscription_subfailover - 1] = true; } + if (IsSet(opts.specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + { + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); + replaces[Anum_pg_subscription_subretaindeadtuples - 1] = true; + + CheckAlterSubOption(sub, "retain_dead_tuples", false, isTopLevel); + + /* + * Workers may continue running even after the + * subscription has been disabled. + * + * To prevent race conditions (as described in + * CheckAlterSubOption()), ensure that all worker + * processes have already exited before proceeding. 
+ */ + if (logicalrep_workers_find(subid, true, true)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot alter \"retain_dead_tuples\" when logical replication worker is still running"), + errhint("Try again after some time."))); + + /* + * Remind the user that enabling the subscription will prevent + * the accumulation of dead tuples. + */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !sub->enabled, NOTICE); + + /* + * Notify the launcher to manage the replication slot for + * conflict detection. This ensures that the replication slot + * is efficiently handled (created, updated, or dropped) + * in response to any configuration changes. + */ + ApplyLauncherWakeupAtCommit(); + + check_pub_rdt = opts.retaindeadtuples; + retain_dead_tuples = opts.retaindeadtuples; + } + if (IsSet(opts.specified_opts, SUBOPT_ORIGIN)) { values[Anum_pg_subscription_suborigin - 1] = CStringGetTextDatum(opts.origin); replaces[Anum_pg_subscription_suborigin - 1] = true; + + /* + * Check if changes from different origins may be received + * from the publisher when the origin is changed to ANY + * and retain_dead_tuples is enabled. + */ + check_pub_rdt = retain_dead_tuples && + pg_strcasecmp(opts.origin, LOGICALREP_ORIGIN_ANY) == 0; + + origin = opts.origin; } update_tuple = true; @@ -1347,6 +1468,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot enable subscription that does not have a slot name"))); + /* + * Check track_commit_timestamp only when enabling the + * subscription in case it was disabled after creation. See + * comments atop CheckSubDeadTupleRetention() for details. + */ + if (sub->retaindeadtuples) + CheckSubDeadTupleRetention(opts.enabled, !opts.enabled, + WARNING); + values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(opts.enabled); replaces[Anum_pg_subscription_subenabled - 1] = true; @@ -1355,6 +1485,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, ApplyLauncherWakeupAtCommit(); update_tuple = true; + + /* + * The subscription might be initially created with + * connect=false and retain_dead_tuples=true, meaning the + * remote server's status may not be checked. Ensure this + * check is conducted now. + */ + check_pub_rdt = sub->retaindeadtuples && opts.enabled; break; } @@ -1369,6 +1507,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, CStringGetTextDatum(stmt->conninfo); replaces[Anum_pg_subscription_subconninfo - 1] = true; update_tuple = true; + + /* + * Since the remote server configuration might have changed, + * perform a check to ensure it permits enabling + * retain_dead_tuples. + */ + check_pub_rdt = sub->retaindeadtuples; break; case ALTER_SUBSCRIPTION_SET_PUBLICATION: @@ -1539,7 +1684,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, if (!XLogRecPtrIsInvalid(remote_lsn) && opts.lsn < remote_lsn) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("skip WAL location (LSN %X/%X) must be greater than origin LSN %X/%X", + errmsg("skip WAL location (LSN %X/%08X) must be greater than origin LSN %X/%08X", LSN_FORMAT_ARGS(opts.lsn), LSN_FORMAT_ARGS(remote_lsn)))); } @@ -1568,14 +1713,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, } /* - * Try to acquire the connection necessary for altering the slot, if - * needed. 
+ * Try to acquire the connection necessary either for modifying the slot + * or for checking if the remote server permits enabling + * retain_dead_tuples. * * This has to be at the end because otherwise if there is an error while * doing the database operations we won't be able to rollback altered * slot. */ - if (update_failover || update_two_phase) + if (update_failover || update_two_phase || check_pub_rdt) { bool must_use_password; char *err; @@ -1584,10 +1730,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, /* Load the library providing us libpq calls. */ load_file("libpqwalreceiver", false); - /* Try to connect to the publisher. */ + /* + * Try to connect to the publisher, using the new connection string if + * available. + */ must_use_password = sub->passwordrequired && !sub->ownersuperuser; - wrconn = walrcv_connect(sub->conninfo, true, true, must_use_password, - sub->name, &err); + wrconn = walrcv_connect(stmt->conninfo ? stmt->conninfo : sub->conninfo, + true, true, must_use_password, sub->name, + &err); if (!wrconn) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -1596,9 +1746,17 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, PG_TRY(); { - walrcv_alter_slot(wrconn, sub->slotname, - update_failover ? &opts.failover : NULL, - update_two_phase ? &opts.twophase : NULL); + if (retain_dead_tuples) + check_pub_dead_tuple_retention(wrconn); + + check_publications_origin(wrconn, sub->publications, false, + retain_dead_tuples, origin, NULL, 0, + sub->name); + + if (update_failover || update_two_phase) + walrcv_alter_slot(wrconn, sub->slotname, + update_failover ? &opts.failover : NULL, + update_two_phase ? &opts.twophase : NULL); } PG_FINALLY(); { @@ -2086,20 +2244,29 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) * Check and log a warning if the publisher has subscribed to the same table, * its partition ancestors (if it's a partition), or its partition children (if * it's a partitioned table), from some other publishers. This check is - * required only if "copy_data = true" and "origin = none" for CREATE - * SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements to notify the - * user that data having origin might have been copied. + * required in the following scenarios: * - * This check need not be performed on the tables that are already added - * because incremental sync for those tables will happen through WAL and the - * origin of the data can be identified from the WAL records. + * 1) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "copy_data = true" and "origin = none": + * - Warn the user that data with an origin might have been copied. + * - This check is skipped for tables already added, as incremental sync via + * WAL allows origin tracking. The list of such tables is in + * subrel_local_oids. * - * subrel_local_oids contains the list of relation oids that are already - * present on the subscriber. + * 2) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "retain_dead_tuples = true" and "origin = any", and for ALTER + * SUBSCRIPTION statements that modify retain_dead_tuples or origin, or + * when the publisher's status changes (e.g., due to a connection string + * update): + * - Warn the user that only conflict detection info for local changes on + * the publisher is retained. Data from other origins may lack sufficient + * details for reliable conflict detection. + * - See comments atop worker.c for more details. 
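Per CheckAlterSubOption() above, retain_dead_tuples can only be changed while the subscription is disabled and its workers have exited. A sketch of the intended workflow (subscription name invented):

    ALTER SUBSCRIPTION sub DISABLE;
    ALTER SUBSCRIPTION sub SET (retain_dead_tuples = true);
    -- NOTICE from CheckSubDeadTupleRetention(): dead tuples accumulate
    -- while the subscription stays disabled
    ALTER SUBSCRIPTION sub ENABLE;
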
*/ static void check_publications_origin(WalReceiverConn *wrconn, List *publications, - bool copydata, char *origin, Oid *subrel_local_oids, + bool copydata, bool retain_dead_tuples, + char *origin, Oid *subrel_local_oids, int subrel_count, char *subname) { WalRcvExecResult *res; @@ -2108,9 +2275,29 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, Oid tableRow[1] = {TEXTOID}; List *publist = NIL; int i; + bool check_rdt; + bool check_table_sync; + bool origin_none = origin && + pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0; + + /* + * Enable retain_dead_tuples checks only when origin is set to 'any', + * since with origin='none' only local changes are replicated to the + * subscriber. + */ + check_rdt = retain_dead_tuples && !origin_none; + + /* + * Enable table synchronization checks only when origin is 'none', to + * ensure that data from other origins is not inadvertently copied. + */ + check_table_sync = copydata && origin_none; - if (!copydata || !origin || - (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) != 0)) + /* retain_dead_tuples and table sync checks occur separately */ + Assert(!(check_rdt && check_table_sync)); + + /* Return if no checks are required */ + if (!check_rdt && !check_table_sync) return; initStringInfo(&cmd); @@ -2129,16 +2316,23 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, /* * In case of ALTER SUBSCRIPTION ... REFRESH, subrel_local_oids contains * the list of relation oids that are already present on the subscriber. - * This check should be skipped for these tables. + * This check should be skipped for these tables if checking for the + * table sync scenario. However, when handling the retain_dead_tuples scenario, + * ensure all tables are checked, as some existing tables may now include + * changes from other origins due to newly created subscriptions on the + * publisher. */ - for (i = 0; i < subrel_count; i++) + if (check_table_sync) { - Oid relid = subrel_local_oids[i]; - char *schemaname = get_namespace_name(get_rel_namespace(relid)); - char *tablename = get_rel_name(relid); + for (i = 0; i < subrel_count; i++) + { + Oid relid = subrel_local_oids[i]; + char *schemaname = get_namespace_name(get_rel_namespace(relid)); + char *tablename = get_rel_name(relid); - appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", - schemaname, tablename); + appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", + schemaname, tablename); + } } res = walrcv_exec(wrconn, cmd.data, 1, tableRow); @@ -2173,22 +2367,37 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, * XXX: For simplicity, we don't check whether the table has any data or * not. If the table doesn't have any data then we don't need to * distinguish between data having origin and data not having origin so we - * can avoid logging a warning in that case. + * can avoid logging a warning for the table sync scenario. */ if (publist) { StringInfo pubnames = makeStringInfo(); + StringInfo err_msg = makeStringInfo(); + StringInfo err_hint = makeStringInfo(); /* Prepare the list of publication(s) for warning message.
*/ GetPublicationsStr(publist, pubnames, false); + + if (check_table_sync) + { + appendStringInfo(err_msg, _("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin"), + subname); + appendStringInfoString(err_hint, _("Verify that initial data copied from the publisher tables did not come from other origins.")); + } + else + { + appendStringInfo(err_msg, _("subscription \"%s\" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins"), + subname); + appendStringInfoString(err_hint, _("Consider using origin = NONE or disabling retain_dead_tuples.")); + } + ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin", - subname), - errdetail_plural("The subscription being created subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", - "The subscription being created subscribes to publications (%s) that contain tables that are written to by other subscriptions.", + errmsg_internal("%s", err_msg->data), + errdetail_plural("The subscription subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", + "The subscription subscribes to publications (%s) that contain tables that are written to by other subscriptions.", list_length(publist), pubnames->data), - errhint("Verify that initial data copied from the publisher tables did not come from other origins.")); + errhint_internal("%s", err_hint->data)); } ExecDropSingleTupleTableSlot(slot); @@ -2197,6 +2406,101 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, } /* + * Determine whether retain_dead_tuples can be enabled based on the + * publisher's status. + * + * This option is disallowed if the publisher is running a version earlier + * than PostgreSQL 19, or if the publisher is in recovery (i.e., it is a standby + * server). + * + * See comments atop worker.c for a detailed explanation. + */ +static void +check_pub_dead_tuple_retention(WalReceiverConn *wrconn) +{ + WalRcvExecResult *res; + Oid RecoveryRow[1] = {BOOLOID}; + TupleTableSlot *slot; + bool isnull; + bool remote_in_recovery; + + if (walrcv_server_version(wrconn) < 190000) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot enable retain_dead_tuples if the publisher is running a version earlier than PostgreSQL 19")); + + res = walrcv_exec(wrconn, "SELECT pg_is_in_recovery()", 1, RecoveryRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not obtain recovery progress from the publisher: %s", + res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + elog(ERROR, "failed to fetch tuple for the recovery progress"); + + remote_in_recovery = DatumGetBool(slot_getattr(slot, 1, &isnull)); + + if (remote_in_recovery) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery")); + + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); +} + +/* + * Check if the subscriber's configuration is adequate to enable the + * retain_dead_tuples option. + * + * Issue an ERROR if the wal_level does not support the use of replication + * slots when check_guc is set to true.
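check_pub_dead_tuple_retention() above encodes two publisher-side preconditions: a server version of 19 or later, and not being in recovery. The same probe can be expressed client-side with libpq, as in the following sketch (the helper name is illustrative; PQserverVersion() reports the version as major*10000, the same convention walrcv_server_version() uses):

    #include <stdbool.h>
    #include <string.h>
    #include <libpq-fe.h>

    static bool
    publisher_supports_retain_dead_tuples(PGconn *conn)
    {
        PGresult   *res;
        bool        ok = false;

        if (PQserverVersion(conn) < 190000)
            return false;           /* pre-v19 publisher */

        res = PQexec(conn, "SELECT pg_is_in_recovery()");
        if (PQresultStatus(res) == PGRES_TUPLES_OK)
            ok = (strcmp(PQgetvalue(res, 0, 0), "f") == 0); /* not a standby */
        PQclear(res);
        return ok;
    }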
+ * + * Issue a WARNING if track_commit_timestamp is not enabled when check_guc is + * set to true. This is only to highlight the importance of enabling + * track_commit_timestamp instead of catching all the misconfigurations, as + * this setting can be adjusted after subscription creation. Without it, the + * apply worker will simply skip conflict detection. + * + * Issue a WARNING or NOTICE if the subscription is disabled. Do not raise an + * ERROR since users can only modify retain_dead_tuples for disabled + * subscriptions. And as long as the subscription is enabled promptly, it will + * not pose issues. + */ +void +CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled) +{ + Assert(elevel_for_sub_disabled == NOTICE || + elevel_for_sub_disabled == WARNING); + + if (check_guc && wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("\"wal_level\" is insufficient to create the replication slot required by retain_dead_tuples"), + errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")); + + if (check_guc && !track_commit_timestamp) + ereport(WARNING, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("commit timestamp and origin data required for detecting conflicts won't be retained"), + errhint("Consider setting \"%s\" to true.", + "track_commit_timestamp")); + + if (sub_disabled) + ereport(elevel_for_sub_disabled, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("deleted rows to detect conflicts would not be removed until the subscription is enabled"), + (elevel_for_sub_disabled > NOTICE) + ? errhint("Consider setting %s to false.", + "retain_dead_tuples") : 0); +} + +/* * Get the list of tables which belong to specified publications on the * publisher connection. * diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 54ad38247aa..cb811520c29 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -430,8 +430,8 @@ static void AlterConstrUpdateConstraintEntry(ATAlterConstraint *cmdcon, Relation static ObjectAddress ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, bool recurse, bool recursing, LOCKMODE lockmode); -static void QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, - HeapTuple contuple, LOCKMODE lockmode); +static void QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation fkrel, + Oid pkrelid, HeapTuple contuple, LOCKMODE lockmode); static void QueueCheckConstraintValidation(List **wqueue, Relation conrel, Relation rel, char *constrName, HeapTuple contuple, bool recurse, bool recursing, LOCKMODE lockmode); @@ -2711,8 +2711,7 @@ MergeAttributes(List *columns, const List *supers, char relpersistence, RelationGetRelationName(relation)))); /* If existing rel is temp, it must belong to this session */ - if (relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !relation->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(relation)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg(!is_partition @@ -7374,7 +7373,7 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, /* make sure datatype is legal for a column */ CheckAttributeType(NameStr(attribute->attname), attribute->atttypid, attribute->attcollation, list_make1_oid(rel->rd_rel->reltype), - 0); + (attribute->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL ? 
CHKATYPE_IS_VIRTUAL : 0)); InsertPgAttributeTuples(attrdesc, tupdesc, myrelid, NULL, NULL); @@ -8609,7 +8608,7 @@ ATExecSetExpression(AlteredTableInfo *tab, Relation rel, const char *colName, rel->rd_att->constr && rel->rd_att->constr->num_check > 0) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables with check constraints"), + errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables with check constraints"), errdetail("Column \"%s\" of relation \"%s\" is a virtual generated column.", colName, RelationGetRelationName(rel)))); @@ -8627,7 +8626,7 @@ ATExecSetExpression(AlteredTableInfo *tab, Relation rel, const char *colName, GetRelationPublications(RelationGetRelid(rel)) != NIL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables that are part of a publication"), + errmsg("ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables that are part of a publication"), errdetail("Column \"%s\" of relation \"%s\" is a virtual generated column.", colName, RelationGetRelationName(rel)))); @@ -10189,7 +10188,7 @@ ATAddForeignKeyConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, if (pk_has_without_overlaps && !with_period) ereport(ERROR, errcode(ERRCODE_INVALID_FOREIGN_KEY), - errmsg("foreign key must use PERIOD when referencing a primary using WITHOUT OVERLAPS")); + errmsg("foreign key must use PERIOD when referencing a primary key using WITHOUT OVERLAPS")); /* * Now we can check permissions. @@ -10330,8 +10329,8 @@ ATAddForeignKeyConstraint(List **wqueue, AlteredTableInfo *tab, Relation rel, for_overlaps ? errmsg("could not identify an overlaps operator for foreign key") : errmsg("could not identify an equality operator for foreign key"), - errdetail("Could not translate compare type %d for operator family \"%s\", input type %s, access method \"%s\".", - cmptype, get_opfamily_name(opfamily, false), format_type_be(opcintype), get_am_name(amid))); + errdetail("Could not translate compare type %d for operator family \"%s\" of access method \"%s\".", + cmptype, get_opfamily_name(opfamily, false), get_am_name(amid))); /* * There had better be a primary equality operator for the index. @@ -11858,6 +11857,7 @@ AttachPartitionForeignKey(List **wqueue, if (queueValidation) { Relation conrel; + Oid confrelid; conrel = table_open(ConstraintRelationId, RowExclusiveLock); @@ -11865,9 +11865,11 @@ AttachPartitionForeignKey(List **wqueue, if (!HeapTupleIsValid(partcontup)) elog(ERROR, "cache lookup failed for constraint %u", partConstrOid); + confrelid = ((Form_pg_constraint) GETSTRUCT(partcontup))->confrelid; + /* Use the same lock as for AT_ValidateConstraint */ - QueueFKConstraintValidation(wqueue, conrel, partition, partcontup, - ShareUpdateExclusiveLock); + QueueFKConstraintValidation(wqueue, conrel, partition, confrelid, + partcontup, ShareUpdateExclusiveLock); ReleaseSysCache(partcontup); table_close(conrel, RowExclusiveLock); } @@ -12463,9 +12465,12 @@ ATExecAlterConstrEnforceability(List **wqueue, ATAlterConstraint *cmdcon, /* * Tell Phase 3 to check that the constraint is satisfied by existing - * rows. + * rows. Only applies to leaf partitions, and (for constraints that + * reference a partitioned table) only if this is not one of the + * pg_constraint rows that exist solely to support action triggers. 
*/ - if (rel->rd_rel->relkind == RELKIND_RELATION) + if (rel->rd_rel->relkind == RELKIND_RELATION && + currcon->confrelid == pkrelid) { AlteredTableInfo *tab; NewConstraint *newcon; @@ -12907,8 +12912,9 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, con->contype != CONSTRAINT_NOTNULL) ereport(ERROR, errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("constraint \"%s\" of relation \"%s\" is not a foreign key, check, or not-null constraint", - constrName, RelationGetRelationName(rel))); + errmsg("cannot validate constraint \"%s\" of relation \"%s\"", + constrName, RelationGetRelationName(rel)), + errdetail("This operation is not supported for this type of constraint.")); if (!con->conenforced) ereport(ERROR, @@ -12919,7 +12925,8 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, { if (con->contype == CONSTRAINT_FOREIGN) { - QueueFKConstraintValidation(wqueue, conrel, rel, tuple, lockmode); + QueueFKConstraintValidation(wqueue, conrel, rel, con->confrelid, + tuple, lockmode); } else if (con->contype == CONSTRAINT_CHECK) { @@ -12952,8 +12959,8 @@ ATExecValidateConstraint(List **wqueue, Relation rel, char *constrName, * for the specified relation and all its children. */ static void -QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, - HeapTuple contuple, LOCKMODE lockmode) +QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation fkrel, + Oid pkrelid, HeapTuple contuple, LOCKMODE lockmode) { Form_pg_constraint con; AlteredTableInfo *tab; @@ -12964,7 +12971,17 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, Assert(con->contype == CONSTRAINT_FOREIGN); Assert(!con->convalidated); - if (rel->rd_rel->relkind == RELKIND_RELATION) + /* + * Add the validation to phase 3's queue; not needed for partitioned + * tables themselves, only for their partitions. + * + * When the referenced table (pkrelid) is partitioned, the referencing + * table (fkrel) has one pg_constraint row pointing to each partition + * thereof. These rows are there only to support action triggers and no + * table scan is needed, therefore skip this for them as well. + */ + if (fkrel->rd_rel->relkind == RELKIND_RELATION && + con->confrelid == pkrelid) { NewConstraint *newcon; Constraint *fkconstraint; @@ -12983,15 +13000,16 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, newcon->qual = (Node *) fkconstraint; /* Find or create work queue entry for this table */ - tab = ATGetQueueEntry(wqueue, rel); + tab = ATGetQueueEntry(wqueue, fkrel); tab->constraints = lappend(tab->constraints, newcon); } /* * If the table at either end of the constraint is partitioned, we need to - * recurse and handle every constraint that is a child of this constraint. + * recurse and handle every unvalidated constraint that is a child of this + * constraint. */ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + if (fkrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || get_rel_relkind(con->confrelid) == RELKIND_PARTITIONED_TABLE) { ScanKeyData pkey; @@ -13023,8 +13041,12 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, childrel = table_open(childcon->conrelid, lockmode); - QueueFKConstraintValidation(wqueue, conrel, childrel, childtup, - lockmode); + /* + * Note that pkrelid should be passed as-is during recursion, + * as it is required to identify the root referenced table.
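To restate the rule this hunk implements: of the several pg_constraint rows a foreign key grows when either side is partitioned, only the rows on leaf referencing relations that point at the root referenced table get a phase-3 validation scan queued. A sketch of the predicate, assuming the usual backend headers (the helper name is illustrative, not part of the patch):

    #include "postgres.h"
    #include "catalog/pg_class.h"   /* RELKIND_RELATION */

    /*
     * Illustrative only: true when a pg_constraint row needs a validation
     * scan.  Rows on partitioned tables, and the per-partition clones that
     * exist solely to support action triggers, are skipped.
     */
    static inline bool
    fk_needs_validation_scan(char fkrelkind, Oid confrelid, Oid pkrelid)
    {
        return fkrelkind == RELKIND_RELATION && confrelid == pkrelid;
    }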
+ */ + QueueFKConstraintValidation(wqueue, conrel, childrel, pkrelid, + childtup, lockmode); table_close(childrel, NoLock); } @@ -13032,7 +13054,11 @@ QueueFKConstraintValidation(List **wqueue, Relation conrel, Relation rel, } /* - * Now update the catalog, while we have the door open. + * Now mark the pg_constraint row as validated (even if we didn't check, + * notably the ones for partitions on the referenced side). + * + * We rely on transaction abort to roll back this change if phase 3 + * ultimately finds violating rows. This is a bit ugly. */ copyTuple = heap_copytuple(contuple); copy_con = (Form_pg_constraint) GETSTRUCT(copyTuple); @@ -14400,7 +14426,7 @@ ATPrepAlterColumnType(List **wqueue, /* make sure datatype is legal for a column */ CheckAttributeType(colName, targettype, targetcollid, list_make1_oid(rel->rd_rel->reltype), - 0); + (attTup->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL ? CHKATYPE_IS_VIRTUAL : 0)); if (attTup->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) { @@ -14458,6 +14484,9 @@ ATPrepAlterColumnType(List **wqueue, /* Fix collations after all else */ assign_expr_collations(pstate, transform); + /* Expand virtual generated columns in the expr. */ + transform = expand_generated_columns_in_expr(transform, rel, 1); + /* Plan the expr now so we can accurately assess the need to rewrite. */ transform = (Node *) expression_planner((Expr *) transform); @@ -15385,9 +15414,12 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) /* * Re-parse the index and constraint definitions, and attach them to the * appropriate work queue entries. We do this before dropping because in - * the case of a FOREIGN KEY constraint, we might not yet have exclusive - * lock on the table the constraint is attached to, and we need to get - * that before reparsing/dropping. + * the case of a constraint on another table, we might not yet have + * exclusive lock on the table the constraint is attached to, and we need + * to get that before reparsing/dropping. (That's possible at least for + * FOREIGN KEY, CHECK, and EXCLUSION constraints; in non-FK cases it + * requires a dependency on the target table's composite type in the other + * table's constraint expressions.) * * We can't rely on the output of deparsing to tell us which relation to * operate on, because concurrent activity might have made the name @@ -15403,7 +15435,6 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Form_pg_constraint con; Oid relid; Oid confrelid; - char contype; bool conislocal; tup = SearchSysCache1(CONSTROID, ObjectIdGetDatum(oldId)); @@ -15420,7 +15451,6 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) elog(ERROR, "could not identify relation associated with constraint %u", oldId); } confrelid = con->confrelid; - contype = con->contype; conislocal = con->conislocal; ReleaseSysCache(tup); @@ -15438,12 +15468,12 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) continue; /* - * When rebuilding an FK constraint that references the table we're - * modifying, we might not yet have any lock on the FK's table, so get - * one now. We'll need AccessExclusiveLock for the DROP CONSTRAINT - * step, so there's no value in asking for anything weaker. + * When rebuilding another table's constraint that references the + * table we're modifying, we might not yet have any lock on the other + * table, so get one now. 
We'll need AccessExclusiveLock for the DROP + * CONSTRAINT step, so there's no value in asking for anything weaker. */ - if (relid != tab->relid && contype == CONSTRAINT_FOREIGN) + if (relid != tab->relid) LockRelationOid(relid, AccessExclusiveLock); ATPostAlterTypeParse(oldId, relid, confrelid, @@ -15457,6 +15487,14 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = IndexGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the index's table if it's not + * the same table. + */ + if (relid != tab->relid) + LockRelationOid(relid, AccessExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15473,6 +15511,20 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = StatisticsGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the statistics object's table + * if it's not the same table. However, we take + * ShareUpdateExclusiveLock here, aligning with the lock level used in + * CreateStatistics and RemoveStatisticsById. + * + * CAUTION: this should be done after all cases that grab + * AccessExclusiveLock, else we risk causing deadlock due to needing + * to promote our table lock. + */ + if (relid != tab->relid) + LockRelationOid(relid, ShareUpdateExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15696,7 +15748,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, { AlterDomainStmt *stmt = (AlterDomainStmt *) stm; - if (stmt->subtype == 'C') /* ADD CONSTRAINT */ + if (stmt->subtype == AD_AddConstraint) { Constraint *con = castNode(Constraint, stmt->def); AlterTableCmd *cmd = makeNode(AlterTableCmd); @@ -17199,15 +17251,13 @@ ATExecAddInherit(Relation child_rel, RangeVar *parent, LOCKMODE lockmode) RelationGetRelationName(parent_rel)))); /* If parent rel is temp, it must belong to this session */ - if (parent_rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !parent_rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(parent_rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from temporary relation of another session"))); /* Ditto for the child */ - if (child_rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !child_rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(child_rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit to temporary relation of another session"))); @@ -20278,15 +20328,13 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd, RelationGetRelationName(rel)))); /* If the parent is temp, it must belong to this session */ - if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !rel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot attach as partition of temporary relation of another session"))); /* Ditto for the partition */ - if (attachrel->rd_rel->relpersistence == RELPERSISTENCE_TEMP && - !attachrel->rd_islocaltemp) + if (RELATION_IS_OTHER_TEMP(attachrel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot attach temporary relation of another session as partition"))); @@ -20964,9 +21012,17 @@ ATExecDetachPartition(List **wqueue, AlteredTableInfo *tab, Relation rel, tab->rel = rel; } + /* + * Detaching the partition might involve TOAST table access, so ensure we + * have a valid snapshot. 
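Several hunks above replace the same open-coded two-part persistence test with RELATION_IS_OTHER_TEMP(). For reference, the macro lives in utils/rel.h and is exactly the test being removed (quoted from memory of the tree; verify against your checkout):

    /* utils/rel.h */
    #define RELATION_IS_OTHER_TEMP(relation) \
        ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \
         !(relation)->rd_islocaltemp)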
+ */ + PushActiveSnapshot(GetTransactionSnapshot()); + /* Do the final part of detaching */ DetachPartitionFinalize(rel, partRel, concurrent, defaultPartOid); + PopActiveSnapshot(); + ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partRel)); /* keep our lock until commit */ diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index a9005cc7212..df31eace47a 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -500,7 +500,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) * mustn't delete. So instead, we force a checkpoint which will clean * out any lingering files, and try again. */ - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT); + RequestCheckpoint(CHECKPOINT_FAST | CHECKPOINT_FORCE | CHECKPOINT_WAIT); /* * On Windows, an unlinked file persists in the directory listing diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index c9f61130c69..7dc121f73f1 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -80,6 +80,7 @@ static bool GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp); @@ -2693,7 +2694,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,9 +2710,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; + /* + * Get a copy of the on-disk tuple we are planning to delete. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE DELETE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - LockTupleExclusive, slot, &epqslot_candidate, - tmresult, tmfd)) + LockTupleExclusive, slot, !is_merge_delete, + &epqslot_candidate, tmresult, tmfd)) return false; /* @@ -2800,6 +2810,7 @@ ExecARDeleteTriggers(EState *estate, tupleid, LockTupleExclusive, slot, + false, NULL, NULL, NULL); @@ -2944,7 +2955,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2965,10 +2977,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; - /* get a copy of the on-disk tuple we are planning to update */ + /* + * Get a copy of the on-disk tuple we are planning to update. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE UPDATE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). 
+ */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - lockmode, oldslot, &epqslot_candidate, - tmresult, tmfd)) + lockmode, oldslot, !is_merge_update, + &epqslot_candidate, tmresult, tmfd)) return false; /* cancel the update action */ /* @@ -3142,6 +3161,7 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, tupleid, LockTupleExclusive, oldslot, + false, NULL, NULL, NULL); @@ -3298,6 +3318,7 @@ GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp) @@ -3357,29 +3378,30 @@ GetTupleForTrigger(EState *estate, if (tmfd.traversed) { /* - * Recheck the tuple using EPQ. For MERGE, we leave this - * to the caller (it must do additional rechecking, and - * might end up executing a different action entirely). + * Recheck the tuple using EPQ, if requested. Otherwise, + * just return that it was concurrently updated. */ - if (estate->es_plannedstmt->commandType == CMD_MERGE) + if (do_epq_recheck) { - if (tmresultp) - *tmresultp = TM_Updated; - return false; + *epqslot = EvalPlanQual(epqstate, + relation, + relinfo->ri_RangeTableIndex, + oldslot); + + /* + * If PlanQual failed for updated tuple - we must not + * process this tuple! + */ + if (TupIsNull(*epqslot)) + { + *epqslot = NULL; + return false; + } } - - *epqslot = EvalPlanQual(epqstate, - relation, - relinfo->ri_RangeTableIndex, - oldslot); - - /* - * If PlanQual failed for updated tuple - we must not - * process this tuple! - */ - if (TupIsNull(*epqslot)) + else { - *epqslot = NULL; + if (tmresultp) + *tmresultp = TM_Updated; return false; } } @@ -5058,21 +5080,6 @@ AfterTriggerBeginQuery(void) /* ---------- - * AfterTriggerAbortQuery() - * - * Called by standard_ExecutorEnd() if the query execution was aborted due to - * the plan becoming invalid during initialization. - * ---------- - */ -void -AfterTriggerAbortQuery(void) -{ - /* Revert the actions of AfterTriggerBeginQuery(). */ - afterTriggers.query_depth--; -} - - -/* ---------- * AfterTriggerEndQuery() * * Called after one query has been completely processed. 
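The net effect of threading do_epq_recheck through GetTupleForTrigger() is a cleaner caller contract: for MERGE actions the BR trigger helpers report a concurrent update instead of running EPQ themselves. A hypothetical call site for the DELETE case follows (parameter order matches the signature in the hunks above, with the elided relinfo/tupleid arguments assumed; surrounding executor state is likewise assumed):

    TM_Result   tmresult;
    TM_FailureData tmfd;

    if (!ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, tupleid,
                              NULL,     /* fdw_trigtuple */
                              NULL,     /* epqslot: EPQ is skipped for MERGE */
                              &tmresult, &tmfd,
                              true))    /* is_merge_delete */
    {
        if (tmresult == TM_Updated)
        {
            /*
             * The tuple was concurrently updated; the MERGE machinery
             * re-evaluates its WHEN clauses rather than retrying blindly.
             */
        }
        /* otherwise a BEFORE trigger cancelled the action */
    }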
At this time diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 45ae7472ab5..26d985193ae 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -939,11 +939,19 @@ DefineDomain(ParseState *pstate, CreateDomainStmt *stmt) break; case CONSTR_NOTNULL: - if (nullDefined && !typNotNull) + if (nullDefined) + { + if (!typNotNull) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting NULL/NOT NULL constraints"), + parser_errposition(pstate, constr->location)); + ereport(ERROR, - errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting NULL/NOT NULL constraints"), + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("redundant NOT NULL constraint definition"), parser_errposition(pstate, constr->location)); + } if (constr->is_no_inherit) ereport(ERROR, errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 33a33bf6b1c..733ef40ae7c 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -56,6 +56,7 @@ #include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/guc_hooks.h" +#include "utils/injection_point.h" #include "utils/memutils.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -123,7 +124,7 @@ static void vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti, TransactionId lastSaneFrozenXid, MultiXactId lastSaneMinMulti); -static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, +static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, BufferAccessStrategy bstrategy); static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); @@ -464,7 +465,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) } /* Now go through the common routine */ - vacuum(vacstmt->rels, ¶ms, bstrategy, vac_context, isTopLevel); + vacuum(vacstmt->rels, params, bstrategy, vac_context, isTopLevel); /* Finally, clean up the vacuum memory context */ MemoryContextDelete(vac_context); @@ -493,7 +494,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) * memory context that will not disappear at transaction commit. */ void -vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, +vacuum(List *relations, const VacuumParams params, BufferAccessStrategy bstrategy, MemoryContext vac_context, bool isTopLevel) { static bool in_vacuum = false; @@ -502,9 +503,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, volatile bool in_outer_xact, use_own_xacts; - Assert(params != NULL); - - stmttype = (params->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; + stmttype = (params.options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; /* * We cannot run VACUUM inside a user transaction block; if we were inside @@ -514,7 +513,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * * ANALYZE (without VACUUM) can run either way. */ - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) { PreventInTransactionBlock(isTopLevel, stmttype); in_outer_xact = false; @@ -537,7 +536,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * Build list of relation(s) to process, putting any new data in * vac_context for safekeeping. 
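The rewritten CONSTR_NOTNULL branch in DefineDomain() now distinguishes a genuine conflict from mere repetition. A quick libpq probe hits both paths (fragment; "conn" is assumed to be an open connection, and the domain names are arbitrary):

    PGresult   *r;

    r = PQexec(conn, "CREATE DOMAIN d1 AS int NULL NOT NULL");
    /* ERROR:  conflicting NULL/NOT NULL constraints */
    PQclear(r);

    r = PQexec(conn, "CREATE DOMAIN d2 AS int NOT NULL NOT NULL");
    /* ERROR:  redundant NOT NULL constraint definition */
    PQclear(r);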
*/ - if (params->options & VACOPT_ONLY_DATABASE_STATS) + if (params.options & VACOPT_ONLY_DATABASE_STATS) { /* We don't process any tables in this case */ Assert(relations == NIL); @@ -553,7 +552,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, List *sublist; MemoryContext old_context; - sublist = expand_vacuum_rel(vrel, vac_context, params->options); + sublist = expand_vacuum_rel(vrel, vac_context, params.options); old_context = MemoryContextSwitchTo(vac_context); newrels = list_concat(newrels, sublist); MemoryContextSwitchTo(old_context); @@ -561,7 +560,7 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, relations = newrels; } else - relations = get_all_vacuum_rels(vac_context, params->options); + relations = get_all_vacuum_rels(vac_context, params.options); /* * Decide whether we need to start/commit our own transactions. @@ -577,11 +576,11 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, * transaction block, and also in an autovacuum worker, use own * transactions so we can release locks sooner. */ - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) use_own_xacts = true; else { - Assert(params->options & VACOPT_ANALYZE); + Assert(params.options & VACOPT_ANALYZE); if (AmAutoVacuumWorkerProcess()) use_own_xacts = true; else if (in_outer_xact) @@ -632,13 +631,13 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, { VacuumRelation *vrel = lfirst_node(VacuumRelation, cur); - if (params->options & VACOPT_VACUUM) + if (params.options & VACOPT_VACUUM) { if (!vacuum_rel(vrel->oid, vrel->relation, params, bstrategy)) continue; } - if (params->options & VACOPT_ANALYZE) + if (params.options & VACOPT_ANALYZE) { /* * If using separate xacts, start one for analyze. Otherwise, @@ -702,8 +701,8 @@ vacuum(List *relations, VacuumParams *params, BufferAccessStrategy bstrategy, StartTransactionCommand(); } - if ((params->options & VACOPT_VACUUM) && - !(params->options & VACOPT_SKIP_DATABASE_STATS)) + if ((params.options & VACOPT_VACUUM) && + !(params.options & VACOPT_SKIP_DATABASE_STATS)) { /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. @@ -1101,7 +1100,7 @@ get_all_vacuum_rels(MemoryContext vac_context, int options) * minimum). */ bool -vacuum_get_cutoffs(Relation rel, const VacuumParams *params, +vacuum_get_cutoffs(Relation rel, const VacuumParams params, struct VacuumCutoffs *cutoffs) { int freeze_min_age, @@ -1117,10 +1116,10 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, aggressiveMXIDCutoff; /* Use mutable copies of freeze age parameters */ - freeze_min_age = params->freeze_min_age; - multixact_freeze_min_age = params->multixact_freeze_min_age; - freeze_table_age = params->freeze_table_age; - multixact_freeze_table_age = params->multixact_freeze_table_age; + freeze_min_age = params.freeze_min_age; + multixact_freeze_min_age = params.multixact_freeze_min_age; + freeze_table_age = params.freeze_table_age; + multixact_freeze_table_age = params.multixact_freeze_table_age; /* Set pg_class fields in cutoffs */ cutoffs->relfrozenxid = rel->rd_rel->relfrozenxid; @@ -1997,7 +1996,7 @@ vac_truncate_clog(TransactionId frozenXID, * At entry and exit, we are not inside a transaction. 
*/ static bool -vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, +vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, BufferAccessStrategy bstrategy) { LOCKMODE lmode; @@ -2008,13 +2007,18 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, Oid save_userid; int save_sec_context; int save_nestlevel; + VacuumParams toast_vacuum_params; - Assert(params != NULL); + /* + * This function scribbles on the parameters, so make a copy early to + * avoid affecting the TOAST table (if we do end up recursing to it). + */ + memcpy(&toast_vacuum_params, ¶ms, sizeof(VacuumParams)); /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); - if (!(params->options & VACOPT_FULL)) + if (!(params.options & VACOPT_FULL)) { /* * In lazy vacuum, we can set the PROC_IN_VACUUM flag, which lets @@ -2040,7 +2044,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyProc->statusFlags |= PROC_IN_VACUUM; - if (params->is_wraparound) + if (params.is_wraparound) MyProc->statusFlags |= PROC_VACUUM_FOR_WRAPAROUND; ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; LWLockRelease(ProcArrayLock); @@ -2064,12 +2068,12 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either * way, we can be sure that no other backend is vacuuming the same table. */ - lmode = (params->options & VACOPT_FULL) ? + lmode = (params.options & VACOPT_FULL) ? AccessExclusiveLock : ShareUpdateExclusiveLock; /* open the relation and get the appropriate lock on it */ - rel = vacuum_open_relation(relid, relation, params->options, - params->log_min_duration >= 0, lmode); + rel = vacuum_open_relation(relid, relation, params.options, + params.log_min_duration >= 0, lmode); /* leave if relation could not be opened or locked */ if (!rel) @@ -2084,8 +2088,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * This is only safe to do because we hold a session lock on the main * relation that prevents concurrent deletion. 
*/ - if (OidIsValid(params->toast_parent)) - priv_relid = params->toast_parent; + if (OidIsValid(params.toast_parent)) + priv_relid = params.toast_parent; else priv_relid = RelationGetRelid(rel); @@ -2098,7 +2102,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ if (!vacuum_is_permitted_for_relation(priv_relid, rel->rd_rel, - params->options & ~VACOPT_ANALYZE)) + params.options & ~VACOPT_ANALYZE)) { relation_close(rel, lmode); PopActiveSnapshot(); @@ -2169,7 +2173,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * Set index_cleanup option based on index_cleanup reloption if it wasn't * specified in VACUUM command, or when running in an autovacuum worker */ - if (params->index_cleanup == VACOPTVALUE_UNSPECIFIED) + if (params.index_cleanup == VACOPTVALUE_UNSPECIFIED) { StdRdOptIndexCleanup vacuum_index_cleanup; @@ -2180,56 +2184,74 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, ((StdRdOptions *) rel->rd_options)->vacuum_index_cleanup; if (vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_AUTO) - params->index_cleanup = VACOPTVALUE_AUTO; + params.index_cleanup = VACOPTVALUE_AUTO; else if (vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON) - params->index_cleanup = VACOPTVALUE_ENABLED; + params.index_cleanup = VACOPTVALUE_ENABLED; else { Assert(vacuum_index_cleanup == STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF); - params->index_cleanup = VACOPTVALUE_DISABLED; + params.index_cleanup = VACOPTVALUE_DISABLED; } } +#ifdef USE_INJECTION_POINTS + if (params.index_cleanup == VACOPTVALUE_AUTO) + INJECTION_POINT("vacuum-index-cleanup-auto", NULL); + else if (params.index_cleanup == VACOPTVALUE_DISABLED) + INJECTION_POINT("vacuum-index-cleanup-disabled", NULL); + else if (params.index_cleanup == VACOPTVALUE_ENABLED) + INJECTION_POINT("vacuum-index-cleanup-enabled", NULL); +#endif + /* * Check if the vacuum_max_eager_freeze_failure_rate table storage * parameter was specified. This overrides the GUC value. */ if (rel->rd_options != NULL && ((StdRdOptions *) rel->rd_options)->vacuum_max_eager_freeze_failure_rate >= 0) - params->max_eager_freeze_failure_rate = + params.max_eager_freeze_failure_rate = ((StdRdOptions *) rel->rd_options)->vacuum_max_eager_freeze_failure_rate; /* * Set truncate option based on truncate reloption or GUC if it wasn't * specified in VACUUM command, or when running in an autovacuum worker */ - if (params->truncate == VACOPTVALUE_UNSPECIFIED) + if (params.truncate == VACOPTVALUE_UNSPECIFIED) { StdRdOptions *opts = (StdRdOptions *) rel->rd_options; if (opts && opts->vacuum_truncate_set) { if (opts->vacuum_truncate) - params->truncate = VACOPTVALUE_ENABLED; + params.truncate = VACOPTVALUE_ENABLED; else - params->truncate = VACOPTVALUE_DISABLED; + params.truncate = VACOPTVALUE_DISABLED; } else if (vacuum_truncate) - params->truncate = VACOPTVALUE_ENABLED; + params.truncate = VACOPTVALUE_ENABLED; else - params->truncate = VACOPTVALUE_DISABLED; + params.truncate = VACOPTVALUE_DISABLED; } +#ifdef USE_INJECTION_POINTS + if (params.truncate == VACOPTVALUE_AUTO) + INJECTION_POINT("vacuum-truncate-auto", NULL); + else if (params.truncate == VACOPTVALUE_DISABLED) + INJECTION_POINT("vacuum-truncate-disabled", NULL); + else if (params.truncate == VACOPTVALUE_ENABLED) + INJECTION_POINT("vacuum-truncate-enabled", NULL); +#endif + /* * Remember the relation's TOAST relation for later, if the caller asked * us to process it. 
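The new injection points make the index_cleanup and truncate resolution observable from tests. Assuming a build with injection points enabled and the src/test/modules/injection_points extension (the attach function and 'notice' action are as that module provides; treat the exact calls as an assumption), a test session could do something like:

    PGresult   *r;

    r = PQexec(conn, "CREATE EXTENSION injection_points");
    PQclear(r);
    r = PQexec(conn,
               "SELECT injection_points_attach('vacuum-index-cleanup-enabled', 'notice')");
    PQclear(r);
    r = PQexec(conn, "VACUUM (INDEX_CLEANUP ON) some_table");
    /* a NOTICE from the injection point confirms the ENABLED branch */
    PQclear(r);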
In VACUUM FULL, though, the toast table is * automatically rebuilt by cluster_rel so we shouldn't recurse to it, * unless PROCESS_MAIN is disabled. */ - if ((params->options & VACOPT_PROCESS_TOAST) != 0 && - ((params->options & VACOPT_FULL) == 0 || - (params->options & VACOPT_PROCESS_MAIN) == 0)) + if ((params.options & VACOPT_PROCESS_TOAST) != 0 && + ((params.options & VACOPT_FULL) == 0 || + (params.options & VACOPT_PROCESS_MAIN) == 0)) toast_relid = rel->rd_rel->reltoastrelid; else toast_relid = InvalidOid; @@ -2252,16 +2274,16 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, * table is required (e.g., PROCESS_TOAST is set), we force PROCESS_MAIN * to be set when we recurse to the TOAST table. */ - if (params->options & VACOPT_PROCESS_MAIN) + if (params.options & VACOPT_PROCESS_MAIN) { /* * Do the actual work --- either FULL or "lazy" vacuum */ - if (params->options & VACOPT_FULL) + if (params.options & VACOPT_FULL) { ClusterParams cluster_params = {0}; - if ((params->options & VACOPT_VERBOSE) != 0) + if ((params.options & VACOPT_VERBOSE) != 0) cluster_params.options |= CLUOPT_VERBOSE; /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */ @@ -2299,19 +2321,16 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, */ if (toast_relid != InvalidOid) { - VacuumParams toast_vacuum_params; - /* * Force VACOPT_PROCESS_MAIN so vacuum_rel() processes it. Likewise, * set toast_parent so that the privilege checks are done on the main * relation. NB: This is only safe to do because we hold a session * lock on the main relation that prevents concurrent deletion. */ - memcpy(&toast_vacuum_params, params, sizeof(VacuumParams)); toast_vacuum_params.options |= VACOPT_PROCESS_MAIN; toast_vacuum_params.toast_parent = relid; - vacuum_rel(toast_relid, NULL, &toast_vacuum_params, bstrategy); + vacuum_rel(toast_relid, NULL, toast_vacuum_params, bstrategy); } /* diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index 2b9d548cdeb..0feea1d30ec 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -63,7 +63,7 @@ typedef struct PVShared */ Oid relid; int elevel; - uint64 queryid; + int64 queryid; /* * Fields for both index vacuum and cleanup. diff --git a/src/backend/executor/README b/src/backend/executor/README index 02745c23ed9..54f4782f31b 100644 --- a/src/backend/executor/README +++ b/src/backend/executor/README @@ -285,28 +285,6 @@ are typically reset to empty once per tuple. Per-tuple contexts are usually associated with ExprContexts, and commonly each PlanState node has its own ExprContext to evaluate its qual and targetlist expressions in. -Relation Locking ----------------- - -When the executor initializes a plan tree for execution, it doesn't lock -non-index relations if the plan tree is freshly generated and not derived -from a CachedPlan. This is because such locks have already been established -during the query's parsing, rewriting, and planning phases. However, with a -cached plan tree, some relations may remain unlocked. The function -AcquireExecutorLocks() only locks unprunable relations in the plan, deferring -the locking of prunable ones to executor initialization. This avoids -unnecessary locking of relations that will be pruned during "initial" runtime -pruning in ExecDoInitialPruning(). - -This approach creates a window where a cached plan tree with child tables -could become outdated if another backend modifies these tables before -ExecDoInitialPruning() locks them. 
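Passing VacuumParams by value is what lets vacuum_rel() resolve VACOPTVALUE_UNSPECIFIED fields in place without corrupting the caller's copy; note that the TOAST copy is now taken at function entry, before any of that scribbling. A sketch of the calling pattern (flags and fields as in the hunks above; relations, bstrategy, vac_context, and isTopLevel are assumed to be set up elsewhere):

    VacuumParams params = {0};

    params.options = VACOPT_VACUUM | VACOPT_PROCESS_TOAST;
    params.index_cleanup = VACOPTVALUE_UNSPECIFIED;   /* resolved per relation */
    params.truncate = VACOPTVALUE_UNSPECIFIED;

    vacuum(relations, params, bstrategy, vac_context, isTopLevel);

    /*
     * params is unchanged here: each vacuum_rel() invocation resolved the
     * unspecified fields from reloptions/GUCs in its own private copy.
     */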
As a result, the executor has the added duty -to verify the plan tree's validity whenever it locks a child table after -doing initial pruning. This validation is done by checking the CachedPlan.is_valid -flag. If the plan tree is outdated (is_valid = false), the executor stops -further initialization, cleans up anything in EState that would have been -allocated up to that point, and retries execution after recreating the -invalid plan in the CachedPlan. See ExecutorStartCachedPlan(). Query Processing Control Flow ----------------------------- @@ -315,13 +293,11 @@ This is a sketch of control flow for full query processing: CreateQueryDesc - ExecutorStart or ExecutorStartCachedPlan + ExecutorStart CreateExecutorState creates per-query context - switch to per-query context to run ExecDoInitialPruning and ExecInitNode + switch to per-query context to run ExecInitNode AfterTriggerBeginQuery - ExecDoInitialPruning - does initial pruning and locks surviving partitions if needed ExecInitNode --- recursively scans plan tree ExecInitNode recurse into subsidiary nodes @@ -345,12 +321,7 @@ This is a sketch of control flow for full query processing: FreeQueryDesc -As mentioned in the "Relation Locking" section, if the plan tree is found to -be stale after locking partitions in ExecDoInitialPruning(), the control is -immediately returned to ExecutorStartCachedPlan(), which will create a new plan -tree and perform the steps starting from CreateExecutorState() again. - -Per above comments, it's not really critical for ExecEndPlan to free any +Per above comments, it's not really critical for ExecEndNode to free any memory; it'll all go away in FreeExecutorState anyway. However, we do need to be careful to close relations, drop buffer pins, etc, so we do need to scan the plan state tree to find these sorts of resources. diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 8a72b5e70a4..1a37737d4a2 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op) * JsonBehavior expression. */ jsestate->escontext.error_occurred = false; - jsestate->escontext.error_occurred = false; jsestate->escontext.details_wanted = true; } } diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 255bd795361..b5400749353 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -144,7 +144,7 @@ execTuplesHashPrepare(int numCols, * hashfunctions: FmgrInfos of datatype-specific hashing functions to use * collations: collations to use in comparisons * nbuckets: initial estimate of hashtable size - * additionalsize: size of data stored in ->additional + * additionalsize: size of data that may be stored along with the hash entry * metacxt: memory context for long-lived allocation, but not per-entry data * tablecxt: memory context in which to store table entries * tempcxt: short-lived context for evaluation hash and comparison functions @@ -288,7 +288,7 @@ ResetTupleHashTable(TupleHashTable hashtable) * * If isnew isn't NULL, then a new entry is created if no existing entry * matches. On return, *isnew is true if the entry is newly created, - * false if it existed already. ->additional_data in the new entry has + * false if it existed already. The additional data in the new entry has * been zeroed. 
*/ TupleHashEntry diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b2406..ca33a854278 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -279,7 +279,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * executor is performing an UPDATE that could not use an * optimization like heapam's HOT (in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_All). Receiving this hint makes + * 'update_indexes' to TU_All). Receiving this hint makes * us consider if we should pass down the 'indexUnchanged' * hint in turn. That's something that we figure out for * each index_insert() call iff 'update' is true. @@ -290,7 +290,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * HOT has been applied and any updated columns are indexed * only by summarizing indexes (or in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_Summarizing). We can (and must) + * 'update_indexes' to TU_Summarizing). We can (and must) * therefore only update the indexes that have * 'amsummarizing' = true. * diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 7230f968101..0391798dd2c 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -55,13 +55,11 @@ #include "parser/parse_relation.h" #include "pgstat.h" #include "rewrite/rewriteHandler.h" -#include "storage/lmgr.h" #include "tcop/utility.h" #include "utils/acl.h" #include "utils/backend_status.h" #include "utils/lsyscache.h" #include "utils/partcache.h" -#include "utils/plancache.h" #include "utils/rls.h" #include "utils/snapmgr.h" @@ -119,16 +117,11 @@ static void ReportNotNullViolationError(ResultRelInfo *resultRelInfo, * get control when ExecutorStart is called. Such a plugin would * normally call standard_ExecutorStart(). * - * Return value indicates if the plan has been initialized successfully so - * that queryDesc->planstate contains a valid PlanState tree. It may not - * if the plan got invalidated during InitPlan(). * ---------------------------------------------------------------- */ -bool +void ExecutorStart(QueryDesc *queryDesc, int eflags) { - bool plan_valid; - /* * In some cases (e.g. an EXECUTE statement or an execute message with the * extended query protocol) the query_id won't be reported, so do it now. 
@@ -140,14 +133,12 @@ ExecutorStart(QueryDesc *queryDesc, int eflags) pgstat_report_query_id(queryDesc->plannedstmt->queryId, false); if (ExecutorStart_hook) - plan_valid = (*ExecutorStart_hook) (queryDesc, eflags); + (*ExecutorStart_hook) (queryDesc, eflags); else - plan_valid = standard_ExecutorStart(queryDesc, eflags); - - return plan_valid; + standard_ExecutorStart(queryDesc, eflags); } -bool +void standard_ExecutorStart(QueryDesc *queryDesc, int eflags) { EState *estate; @@ -271,64 +262,6 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags) InitPlan(queryDesc, eflags); MemoryContextSwitchTo(oldcontext); - - return ExecPlanStillValid(queryDesc->estate); -} - -/* - * ExecutorStartCachedPlan - * Start execution for a given query in the CachedPlanSource, replanning - * if the plan is invalidated due to deferred locks taken during the - * plan's initialization - * - * This function handles cases where the CachedPlan given in queryDesc->cplan - * might become invalid during the initialization of the plan given in - * queryDesc->plannedstmt, particularly when prunable relations in it are - * locked after performing initial pruning. If the locks invalidate the plan, - * the function calls UpdateCachedPlan() to replan all queries in the - * CachedPlan, and then retries initialization. - * - * The function repeats the process until ExecutorStart() successfully - * initializes the plan, that is without the CachedPlan becoming invalid. - */ -void -ExecutorStartCachedPlan(QueryDesc *queryDesc, int eflags, - CachedPlanSource *plansource, - int query_index) -{ - if (unlikely(queryDesc->cplan == NULL)) - elog(ERROR, "ExecutorStartCachedPlan(): missing CachedPlan"); - if (unlikely(plansource == NULL)) - elog(ERROR, "ExecutorStartCachedPlan(): missing CachedPlanSource"); - - /* - * Loop and retry with an updated plan until no further invalidation - * occurs. - */ - while (1) - { - if (!ExecutorStart(queryDesc, eflags)) - { - /* - * Clean up the current execution state before creating the new - * plan to retry ExecutorStart(). Mark execution as aborted to - * ensure that AFTER trigger state is properly reset. - */ - queryDesc->estate->es_aborted = true; - ExecutorEnd(queryDesc); - - /* Retry ExecutorStart() with an updated plan tree. */ - queryDesc->plannedstmt = UpdateCachedPlan(plansource, query_index, - queryDesc->queryEnv); - } - else - - /* - * Exit the loop if the plan is initialized successfully and no - * sinval messages were received that invalidated the CachedPlan. - */ - break; - } } /* ---------------------------------------------------------------- @@ -387,7 +320,6 @@ standard_ExecutorRun(QueryDesc *queryDesc, estate = queryDesc->estate; Assert(estate != NULL); - Assert(!estate->es_aborted); Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); /* caller must ensure the query's snapshot is active */ @@ -494,11 +426,8 @@ standard_ExecutorFinish(QueryDesc *queryDesc) Assert(estate != NULL); Assert(!(estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); - /* - * This should be run once and only once per Executor instance and never - * if the execution was aborted. 
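With the bool return gone, ExecutorStart_hook implementations revert to the familiar void shape. A typical hook under the new signature might look like this sketch (prev_ExecutorStart is the usual saved-hook pointer in an extension; the function name is illustrative):

    static ExecutorStart_hook_type prev_ExecutorStart = NULL;

    static void
    my_ExecutorStart(QueryDesc *queryDesc, int eflags)
    {
        /* ... extension-specific setup ... */

        if (prev_ExecutorStart)
            prev_ExecutorStart(queryDesc, eflags);
        else
            standard_ExecutorStart(queryDesc, eflags);
    }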
- */ - Assert(!estate->es_finished && !estate->es_aborted); + /* This should be run once and only once per Executor instance */ + Assert(!estate->es_finished); /* Switch into per-query memory context */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); @@ -561,10 +490,11 @@ standard_ExecutorEnd(QueryDesc *queryDesc) (PgStat_Counter) estate->es_parallel_workers_launched); /* - * Check that ExecutorFinish was called, unless in EXPLAIN-only mode or if - * execution was aborted. + * Check that ExecutorFinish was called, unless in EXPLAIN-only mode. This + * Assert is needed because ExecutorFinish is new as of 9.1, and callers + * might forget to call it. */ - Assert(estate->es_finished || estate->es_aborted || + Assert(estate->es_finished || (estate->es_top_eflags & EXEC_FLAG_EXPLAIN_ONLY)); /* @@ -579,14 +509,6 @@ standard_ExecutorEnd(QueryDesc *queryDesc) UnregisterSnapshot(estate->es_crosscheck_snapshot); /* - * Reset AFTER trigger module if the query execution was aborted. - */ - if (estate->es_aborted && - !(estate->es_top_eflags & - (EXEC_FLAG_SKIP_TRIGGERS | EXEC_FLAG_EXPLAIN_ONLY))) - AfterTriggerAbortQuery(); - - /* * Must switch out of context before destroying it */ MemoryContextSwitchTo(oldcontext); @@ -684,21 +606,6 @@ ExecCheckPermissions(List *rangeTable, List *rteperminfos, (rte->rtekind == RTE_SUBQUERY && rte->relkind == RELKIND_VIEW)); - /* - * Ensure that we have at least an AccessShareLock on relations - * whose permissions need to be checked. - * - * Skip this check in a parallel worker because locks won't be - * taken until ExecInitNode() performs plan initialization. - * - * XXX: ExecCheckPermissions() in a parallel worker may be - * redundant with the checks done in the leader process, so this - * should be reviewed to ensure it’s necessary. - */ - Assert(IsParallelWorker() || - CheckRelationOidLockedByMe(rte->relid, AccessShareLock, - true)); - (void) getRTEPermissionInfo(rteperminfos, rte); /* Many-to-one mapping not allowed */ Assert(!bms_is_member(rte->perminfoindex, indexset)); @@ -924,12 +831,6 @@ ExecCheckXactReadOnly(PlannedStmt *plannedstmt) * * Initializes the query plan: open files, allocate storage * and start up the rule manager - * - * If the plan originates from a CachedPlan (given in queryDesc->cplan), - * it can become invalid during runtime "initial" pruning when the - * remaining set of locks is taken. The function returns early in that - * case without initializing the plan, and the caller is expected to - * retry with a new valid plan. * ---------------------------------------------------------------- */ static void @@ -937,7 +838,6 @@ InitPlan(QueryDesc *queryDesc, int eflags) { CmdType operation = queryDesc->operation; PlannedStmt *plannedstmt = queryDesc->plannedstmt; - CachedPlan *cachedplan = queryDesc->cplan; Plan *plan = plannedstmt->planTree; List *rangeTable = plannedstmt->rtable; EState *estate = queryDesc->estate; @@ -958,7 +858,6 @@ InitPlan(QueryDesc *queryDesc, int eflags) bms_copy(plannedstmt->unprunableRelids)); estate->es_plannedstmt = plannedstmt; - estate->es_cachedplan = cachedplan; estate->es_part_prune_infos = plannedstmt->partPruneInfos; /* @@ -972,9 +871,6 @@ InitPlan(QueryDesc *queryDesc, int eflags) */ ExecDoInitialPruning(estate); - if (!ExecPlanStillValid(estate)) - return; - /* * Next, build the ExecRowMark array from the PlanRowMark(s), if any. */ @@ -3092,9 +2988,6 @@ EvalPlanQualStart(EPQState *epqstate, Plan *planTree) * the snapshot, rangetable, and external Param info. 
They need their own * copies of local state, including a tuple table, es_param_exec_vals, * result-rel info, etc. - * - * es_cachedplan is not copied because EPQ plan execution does not acquire - * any new locks that could invalidate the CachedPlan. */ rcestate->es_direction = ForwardScanDirection; rcestate->es_snapshot = parentestate->es_snapshot; diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 39c990ae638..f098a5557cf 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -189,6 +189,7 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->permInfos = estate->es_rteperminfos; pstmt->resultRelations = NIL; pstmt->appendRelations = NIL; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list @@ -1278,15 +1279,8 @@ ExecParallelGetQueryDesc(shm_toc *toc, DestReceiver *receiver, paramspace = shm_toc_lookup(toc, PARALLEL_KEY_PARAMLISTINFO, false); paramLI = RestoreParamList(¶mspace); - /* - * Create a QueryDesc for the query. We pass NULL for cachedplan, because - * we don't have a pointer to the CachedPlan in the leader's process. It's - * fine because the only reason the executor needs to see it is to decide - * if it should take locks on certain relations, but parallel workers - * always take locks anyway. - */ + /* Create a QueryDesc for the query. */ return CreateQueryDesc(pstmt, - NULL, queryString, GetActiveSnapshot(), InvalidSnapshot, receiver, paramLI, NULL, instrument_options); @@ -1471,8 +1465,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) /* Start up the executor */ queryDesc->plannedstmt->jitFlags = fpes->jit_flags; - if (!ExecutorStart(queryDesc, fpes->eflags)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); + ExecutorStart(queryDesc, fpes->eflags); /* Special executor initialization steps for parallel workers */ queryDesc->planstate->state->es_query_dsa = area; diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 3f8a4cb5244..514eae1037d 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -26,7 +26,6 @@ #include "partitioning/partdesc.h" #include "partitioning/partprune.h" #include "rewrite/rewriteManip.h" -#include "storage/lmgr.h" #include "utils/acl.h" #include "utils/lsyscache.h" #include "utils/partcache.h" @@ -1771,8 +1770,7 @@ adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap) * ExecDoInitialPruning: * Perform runtime "initial" pruning, if necessary, to determine the set * of child subnodes that need to be initialized during ExecInitNode() for - * all plan nodes that contain a PartitionPruneInfo. This also locks the - * leaf partitions whose subnodes will be initialized if needed. + * all plan nodes that contain a PartitionPruneInfo. * * ExecInitPartitionExecPruning: * Updates the PartitionPruneState found at given part_prune_index in @@ -1798,8 +1796,7 @@ adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap) * ExecDoInitialPruning * Perform runtime "initial" pruning, if necessary, to determine the set * of child subnodes that need to be initialized during ExecInitNode() for - * plan nodes that support partition pruning. This also locks the leaf - * partitions whose subnodes will be initialized if needed. + * plan nodes that support partition pruning. * * This function iterates over each PartitionPruneInfo entry in * estate->es_part_prune_infos. 
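 * (In the plan tree, a PartitionPruneInfo is attached to nodes such as
 * Append and MergeAppend scanning a partitioned table, so this loop visits
 * every node that is capable of runtime pruning.)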
For each entry, it creates a PartitionPruneState @@ -1821,9 +1818,7 @@ adjust_partition_colnos_using_map(List *colnos, AttrMap *attrMap) void ExecDoInitialPruning(EState *estate) { - PlannedStmt *stmt = estate->es_plannedstmt; ListCell *lc; - List *locked_relids = NIL; foreach(lc, estate->es_part_prune_infos) { @@ -1849,68 +1844,11 @@ ExecDoInitialPruning(EState *estate) else validsubplan_rtis = all_leafpart_rtis; - if (ExecShouldLockRelations(estate)) - { - int rtindex = -1; - - while ((rtindex = bms_next_member(validsubplan_rtis, - rtindex)) >= 0) - { - RangeTblEntry *rte = exec_rt_fetch(rtindex, estate); - - Assert(rte->rtekind == RTE_RELATION && - rte->rellockmode != NoLock); - LockRelationOid(rte->relid, rte->rellockmode); - locked_relids = lappend_int(locked_relids, rtindex); - } - } estate->es_unpruned_relids = bms_add_members(estate->es_unpruned_relids, validsubplan_rtis); estate->es_part_prune_results = lappend(estate->es_part_prune_results, validsubplans); } - - /* - * Lock the first result relation of each ModifyTable node, even if it was - * pruned. This is required for ExecInitModifyTable(), which keeps its - * first result relation if all other result relations have been pruned, - * because some executor paths (e.g., in nodeModifyTable.c and - * execPartition.c) rely on there being at least one result relation. - * - * There's room for improvement here --- we actually only need to do this - * if all other result relations of the ModifyTable node were pruned, but - * we don't have an easy way to tell that here. - */ - if (stmt->resultRelations && ExecShouldLockRelations(estate)) - { - foreach(lc, stmt->firstResultRels) - { - Index firstResultRel = lfirst_int(lc); - - if (!bms_is_member(firstResultRel, estate->es_unpruned_relids)) - { - RangeTblEntry *rte = exec_rt_fetch(firstResultRel, estate); - - Assert(rte->rtekind == RTE_RELATION && rte->rellockmode != NoLock); - LockRelationOid(rte->relid, rte->rellockmode); - locked_relids = lappend_int(locked_relids, firstResultRel); - } - } - } - - /* - * Release the useless locks if the plan won't be executed. This is the - * same as what CheckCachedPlan() in plancache.c does. - */ - if (!ExecPlanStillValid(estate)) - { - foreach(lc, locked_relids) - { - RangeTblEntry *rte = exec_rt_fetch(lfirst_int(lc), estate); - - UnlockRelationOid(rte->relid, rte->rellockmode); - } - } } /* diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42d..68184f5d671 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -14,12 +14,14 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/genam.h" #include "access/gist.h" #include "access/relscan.h" #include "access/tableam.h" #include "access/transam.h" #include "access/xact.h" +#include "access/heapam.h" #include "catalog/pg_am_d.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -36,7 +38,7 @@ static bool tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, - TypeCacheEntry **eq); + TypeCacheEntry **eq, Bitmapset *columns); /* * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that @@ -221,7 +223,7 @@ retry: if (eq == NULL) eq = palloc0(sizeof(*eq) * outslot->tts_tupleDescriptor->natts); - if (!tuples_equal(outslot, searchslot, eq)) + if (!tuples_equal(outslot, searchslot, eq, NULL)) continue; } @@ -277,10 +279,13 @@ retry: /* * Compare the tuples in the slots by checking if they have equal values. 
+ * + * If 'columns' is not null, only the columns specified within it will be + * considered for the equality check, ignoring all other columns. */ static bool tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, - TypeCacheEntry **eq) + TypeCacheEntry **eq, Bitmapset *columns) { int attrnum; @@ -306,6 +311,14 @@ tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2, continue; /* + * Ignore columns that are not listed for checking. + */ + if (columns && + !bms_is_member(att->attnum - FirstLowInvalidHeapAttributeNumber, + columns)) + continue; + + /* * If one value is NULL and other is not, then they are certainly not * equal */ @@ -380,7 +393,7 @@ retry: /* Try to find the tuple */ while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot)) { - if (!tuples_equal(scanslot, searchslot, eq)) + if (!tuples_equal(scanslot, searchslot, eq, NULL)) continue; found = true; @@ -456,6 +469,236 @@ BuildConflictIndexInfo(ResultRelInfo *resultRelInfo, Oid conflictindex) } /* + * If the tuple is recently dead and was deleted by a transaction with a newer + * commit timestamp than previously recorded, update the associated transaction + * ID, commit time, and origin. This helps ensure that conflict detection uses + * the most recent and relevant deletion metadata. + */ +static void +update_most_recent_deletion_info(TupleTableSlot *scanslot, + TransactionId oldestxmin, + TransactionId *delete_xid, + TimestampTz *delete_time, + RepOriginId *delete_origin) +{ + BufferHeapTupleTableSlot *hslot; + HeapTuple tuple; + Buffer buf; + bool recently_dead = false; + TransactionId xmax; + TimestampTz localts; + RepOriginId localorigin; + + hslot = (BufferHeapTupleTableSlot *) scanslot; + + tuple = ExecFetchSlotHeapTuple(scanslot, false, NULL); + buf = hslot->buffer; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + /* + * We do not consider HEAPTUPLE_DEAD status because it indicates either + * tuples whose inserting transaction was aborted (meaning there is no + * commit timestamp or origin), or tuples deleted by a transaction older + * than oldestxmin, making it safe to ignore them during conflict + * detection (See comments atop worker.c for details). + */ + if (HeapTupleSatisfiesVacuum(tuple, oldestxmin, buf) == HEAPTUPLE_RECENTLY_DEAD) + recently_dead = true; + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (!recently_dead) + return; + + xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); + if (!TransactionIdIsValid(xmax)) + return; + + /* Select the dead tuple with the most recent commit timestamp */ + if (TransactionIdGetCommitTsData(xmax, &localts, &localorigin) && + TimestampDifferenceExceeds(*delete_time, localts, 0)) + { + *delete_xid = xmax; + *delete_time = localts; + *delete_origin = localorigin; + } +} + +/* + * Searches the relation 'rel' for the most recently deleted tuple that matches + * the values in 'searchslot' and is not yet removable by VACUUM. The function + * returns the transaction ID, origin, and commit timestamp of the transaction + * that deleted this tuple. + * + * 'oldestxmin' acts as a cutoff transaction ID. Tuples deleted by transactions + * with IDs >= 'oldestxmin' are considered recently dead and are eligible for + * conflict detection. + * + * Instead of stopping at the first match, we scan all matching dead tuples to + * identify the most recent deletion. This is crucial because only the latest + * deletion is relevant for resolving conflicts.
+ * + * For example, consider a scenario on the subscriber where a row is deleted, + * re-inserted, and then deleted again only on the subscriber: + * + * - (pk, 1) - deleted at 9:00, + * - (pk, 1) - deleted at 9:02, + * + * Now, a remote update arrives: (pk, 1) -> (pk, 2), timestamped at 9:01. + * + * If we mistakenly return the older deletion (9:00), the system may wrongly + * apply the remote update using a last-update-wins strategy. Instead, we must + * recognize the more recent deletion at 9:02 and skip the update. See + * comments atop worker.c for details. Note, as of now, conflict resolution + * is not implemented. Consequently, the system may incorrectly report the + * older tuple as the conflicted one, leading to misleading results. + * + * The commit timestamp of the deleting transaction is used to determine which + * tuple was deleted most recently. + */ +bool +RelationFindDeletedTupleInfoSeq(Relation rel, TupleTableSlot *searchslot, + TransactionId oldestxmin, + TransactionId *delete_xid, + RepOriginId *delete_origin, + TimestampTz *delete_time) +{ + TupleTableSlot *scanslot; + TableScanDesc scan; + TypeCacheEntry **eq; + Bitmapset *indexbitmap; + TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel); + + Assert(equalTupleDescs(desc, searchslot->tts_tupleDescriptor)); + + *delete_xid = InvalidTransactionId; + *delete_origin = InvalidRepOriginId; + *delete_time = 0; + + /* + * If the relation has a replica identity key or a primary key that is + * unusable for locating deleted tuples (see + * IsIndexUsableForFindingDeletedTuple), a full table scan becomes + * necessary. In such cases, comparing the entire tuple is not required, + * since the remote tuple might not include all column values. Instead, + * the indexed columns alone are sufficient to identify the target tuple + * (see logicalrep_rel_mark_updatable). + */ + indexbitmap = RelationGetIndexAttrBitmap(rel, + INDEX_ATTR_BITMAP_IDENTITY_KEY); + + /* fallback to PK if no replica identity */ + if (!indexbitmap) + indexbitmap = RelationGetIndexAttrBitmap(rel, + INDEX_ATTR_BITMAP_PRIMARY_KEY); + + eq = palloc0(sizeof(*eq) * searchslot->tts_tupleDescriptor->natts); + + /* + * Start a heap scan using SnapshotAny to identify dead tuples that are + * not visible under a standard MVCC snapshot. Tuples from transactions + * not yet committed or those just committed prior to the scan are + * excluded in update_most_recent_deletion_info(). + */ + scan = table_beginscan(rel, SnapshotAny, 0, NULL); + scanslot = table_slot_create(rel, NULL); + + table_rescan(scan, NULL); + + /* Try to find the tuple */ + while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot)) + { + if (!tuples_equal(scanslot, searchslot, eq, indexbitmap)) + continue; + + update_most_recent_deletion_info(scanslot, oldestxmin, delete_xid, + delete_time, delete_origin); + } + + table_endscan(scan); + ExecDropSingleTupleTableSlot(scanslot); + + return *delete_time != 0; +} + +/* + * Similar to RelationFindDeletedTupleInfoSeq() but using index scan to locate + * the deleted tuple.
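+ *
+ * As with the sequential-scan variant, the return value is true only when a
+ * matching recently-dead tuple with a known commit timestamp was found; the
+ * deleting transaction's xid, origin, and commit time are then returned in
+ * the output parameters.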
+ */ +bool +RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, + TupleTableSlot *searchslot, + TransactionId oldestxmin, + TransactionId *delete_xid, + RepOriginId *delete_origin, + TimestampTz *delete_time) +{ + Relation idxrel; + ScanKeyData skey[INDEX_MAX_KEYS]; + int skey_attoff; + IndexScanDesc scan; + TupleTableSlot *scanslot; + TypeCacheEntry **eq = NULL; + bool isIdxSafeToSkipDuplicates; + TupleDesc desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel); + + Assert(equalTupleDescs(desc, searchslot->tts_tupleDescriptor)); + Assert(OidIsValid(idxoid)); + + *delete_xid = InvalidTransactionId; + *delete_time = 0; + *delete_origin = InvalidRepOriginId; + + isIdxSafeToSkipDuplicates = (GetRelationIdentityOrPK(rel) == idxoid); + + scanslot = table_slot_create(rel, NULL); + + idxrel = index_open(idxoid, RowExclusiveLock); + + /* Build scan key. */ + skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot); + + /* + * Start an index scan using SnapshotAny to identify dead tuples that are + * not visible under a standard MVCC snapshot. Tuples from transactions + * not yet committed or those just committed prior to the scan are + * excluded in update_most_recent_deletion_info(). + */ + scan = index_beginscan(rel, idxrel, SnapshotAny, NULL, skey_attoff, 0); + + index_rescan(scan, skey, skey_attoff, NULL, 0); + + /* Try to find the tuple */ + while (index_getnext_slot(scan, ForwardScanDirection, scanslot)) + { + /* + * Avoid expensive equality check if the index is primary key or + * replica identity index. + */ + if (!isIdxSafeToSkipDuplicates) + { + if (eq == NULL) + eq = palloc0(sizeof(*eq) * scanslot->tts_tupleDescriptor->natts); + + if (!tuples_equal(scanslot, searchslot, eq, NULL)) + continue; + } + + update_most_recent_deletion_info(scanslot, oldestxmin, delete_xid, + delete_time, delete_origin); + } + + index_endscan(scan); + + index_close(idxrel, NoLock); + + ExecDropSingleTupleTableSlot(scanslot); + + return *delete_time != 0; +} + +/* * Find the tuple that violates the passed unique index (conflictindex). * * If the conflicting tuple is found return true, otherwise false. 
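The two lookup paths above are designed to be driven by a caller that prefers the index path whenever one is usable. A minimal caller sketch follows; the wrapper name find_deleted_tuple_info and the usable_idxoid parameter are illustrative assumptions only (InvalidOid standing for "no usable index"), not part of the patch:

/*
 * Sketch of a caller choosing between the two lookups defined above.
 */
static bool
find_deleted_tuple_info(Relation rel, Oid usable_idxoid,
                        TupleTableSlot *searchslot,
                        TransactionId oldestxmin,
                        TransactionId *delete_xid,
                        RepOriginId *delete_origin,
                        TimestampTz *delete_time)
{
    if (OidIsValid(usable_idxoid))
        return RelationFindDeletedTupleInfoByIndex(rel, usable_idxoid,
                                                   searchslot, oldestxmin,
                                                   delete_xid, delete_origin,
                                                   delete_time);

    /* No usable index: fall back to the full-table scan variant */
    return RelationFindDeletedTupleInfoSeq(rel, searchslot, oldestxmin,
                                           delete_xid, delete_origin,
                                           delete_time);
}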
@@ -670,7 +913,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + tid, NULL, slot, NULL, NULL, false)) skip_tuple = true; /* "do nothing" */ } @@ -746,7 +989,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + tid, NULL, NULL, NULL, NULL, false); } if (!skip_tuple) diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 772c86e70e9..fdc65c2b42b 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -147,7 +147,6 @@ CreateExecutorState(void) estate->es_top_eflags = 0; estate->es_instrument = 0; estate->es_finished = false; - estate->es_aborted = false; estate->es_exprcontexts = NIL; diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index 8d4d062d579..359aafea681 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -34,6 +34,7 @@ #include "utils/funccache.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/plancache.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -1338,7 +1339,6 @@ postquel_start(execution_state *es, SQLFunctionCachePtr fcache) dest = None_Receiver; es->qd = CreateQueryDesc(es->stmt, - NULL, fcache->func->src, GetActiveSnapshot(), InvalidSnapshot, @@ -1363,8 +1363,7 @@ postquel_start(execution_state *es, SQLFunctionCachePtr fcache) eflags = EXEC_FLAG_SKIP_TRIGGERS; else eflags = 0; /* default run-to-completion flags */ - if (!ExecutorStart(es->qd, eflags)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); + ExecutorStart(es->qd, eflags); } es->status = F_EXEC_RUN; diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 46d533b7288..7c6c2c1f6e4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -64,6 +64,7 @@ #include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" #include "rewrite/rewriteHandler.h" +#include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/builtins.h" #include "utils/datum.h" @@ -1473,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRDeleteTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, - epqreturnslot, result, &context->tmfd); + epqreturnslot, result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -2116,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRUpdateTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, slot, - result, &context->tmfd); + result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -3735,6 +3738,7 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) switch (action->commandType) { case CMD_INSERT: + /* INSERT actions always use rootRelInfo */ ExecCheckPlanOutput(rootRelInfo->ri_RelationDesc, action->targetList); @@ -3774,9 +3778,23 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) } else { - /* not partitioned? 
use the stock relation and slot */ - tgtslot = resultRelInfo->ri_newTupleSlot; - tgtdesc = RelationGetDescr(resultRelInfo->ri_RelationDesc); + /* + * If the MERGE targets an inherited table, we insert + * into the root table, so we must initialize its + * "new" tuple slot, if not already done, and use its + * relation descriptor for the projection. + * + * For non-inherited tables, rootRelInfo and + * resultRelInfo are the same, and the "new" tuple + * slot will already have been initialized. + */ + if (rootRelInfo->ri_newTupleSlot == NULL) + rootRelInfo->ri_newTupleSlot = + table_slot_create(rootRelInfo->ri_RelationDesc, + &estate->es_tupleTable); + + tgtslot = rootRelInfo->ri_newTupleSlot; + tgtdesc = RelationGetDescr(rootRelInfo->ri_RelationDesc); } action_state->mas_proj = @@ -3809,6 +3827,114 @@ ExecInitMerge(ModifyTableState *mtstate, EState *estate) } } } + + /* + * If the MERGE targets an inherited table, any INSERT actions will use + * rootRelInfo, and rootRelInfo will not be in the resultRelInfo array. + * Therefore we must initialize its WITH CHECK OPTION constraints and + * RETURNING projection, as ExecInitModifyTable did for the resultRelInfo + * entries. + * + * Note that the planner does not build a withCheckOptionList or + * returningList for the root relation, but as in ExecInitPartitionInfo, + * we can use the first resultRelInfo entry as a reference to calculate + * the attno's for the root table. + */ + if (rootRelInfo != mtstate->resultRelInfo && + rootRelInfo->ri_RelationDesc->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && + (mtstate->mt_merge_subcommands & MERGE_INSERT) != 0) + { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + Relation rootRelation = rootRelInfo->ri_RelationDesc; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + AttrMap *part_attmap = NULL; + bool found_whole_row; + + if (node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + + /* There should be as many WCO lists as result rels */ + Assert(list_length(node->withCheckOptionLists) == + list_length(node->resultRelations)); + + /* + * Use the first WCO list as a reference. In the most common case, + * this will be for the same relation as rootRelInfo, and so there + * will be no need to adjust its attno's. + */ + wcoList = linitial(node->withCheckOptionLists); + if (rootRelation != firstResultRel) + { + /* Convert any Vars in it to contain the root's attno's */ + part_attmap = + build_attrmap_by_name(RelationGetDescr(rootRelation), + RelationGetDescr(firstResultRel), + false); + + wcoList = (List *) + map_variable_attnos((Node *) wcoList, + firstVarno, 0, + part_attmap, + RelationGetForm(rootRelation)->reltype, + &found_whole_row); + } + + foreach(lc, wcoList) + { + WithCheckOption *wco = lfirst_node(WithCheckOption, lc); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + &mtstate->ps); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + rootRelInfo->ri_WithCheckOptions = wcoList; + rootRelInfo->ri_WithCheckOptionExprs = wcoExprs; + } + + if (node->returningLists != NIL) + { + List *returningList; + + /* There should be as many returning lists as result rels */ + Assert(list_length(node->returningLists) == + list_length(node->resultRelations)); + + /* + * Use the first returning list as a reference. In the most common + * case, this will be for the same relation as rootRelInfo, and so + * there will be no need to adjust its attno's. 
+ */ + returningList = linitial(node->returningLists); + if (rootRelation != firstResultRel) + { + /* Convert any Vars in it to contain the root's attno's */ + if (part_attmap == NULL) + part_attmap = + build_attrmap_by_name(RelationGetDescr(rootRelation), + RelationGetDescr(firstResultRel), + false); + + returningList = (List *) + map_variable_attnos((Node *) returningList, + firstVarno, 0, + part_attmap, + RelationGetForm(rootRelation)->reltype, + &found_whole_row); + } + rootRelInfo->ri_returningList = returningList; + + /* Initialize the RETURNING projection */ + rootRelInfo->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, + mtstate->ps.ps_ResultTupleSlot, + &mtstate->ps, + RelationGetDescr(rootRelation)); + } + } } /* @@ -4830,12 +4956,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ExprContext *econtext; /* - * Initialize result tuple slot and assign its rowtype using the first - * RETURNING list. We assume the rest will look the same. + * Initialize result tuple slot and assign its rowtype using the plan + * node's declared targetlist, which the planner set up to be the same + * as the first (before runtime pruning) RETURNING list. We assume + * all the result rels will produce compatible output. */ - mtstate->ps.plan->targetlist = (List *) linitial(returningLists); - - /* Set up a slot for the output of the RETURNING projection(s) */ ExecInitResultTupleSlotTL(&mtstate->ps, &TTSOpsVirtual); slot = mtstate->ps.ps_ResultTupleSlot; @@ -4865,7 +4990,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * We still must construct a dummy result tuple type, because InitPlan * expects one (maybe should change that?). */ - mtstate->ps.plan->targetlist = NIL; ExecInitResultTypeTL(&mtstate->ps); mtstate->ps.ps_ExprContext = NULL; diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index ab2eab9596e..26f7420b64b 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -128,9 +128,11 @@ TidExprListCreate(TidRangeScanState *tidrangestate) * TidRangeEval * * Compute and set node's block and offset range to scan by evaluating - * the trss_tidexprs. Returns false if we detect the range cannot + * node->trss_tidexprs. Returns false if we detect the range cannot * contain any tuples. Returns true if it's possible for the range to - * contain tuples. + * contain tuples. We don't bother validating that trss_mintid is less + * than or equal to trss_maxtid, as the scan_set_tidrange() table AM + * function will handle that. * ---------------------------------------------------------------- */ static bool diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 3288396def3..ecb2e4ccaa1 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -70,8 +70,7 @@ static int _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, static ParamListInfo _SPI_convert_params(int nargs, Oid *argtypes, Datum *Values, const char *Nulls); -static int _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount, - CachedPlanSource *plansource, int query_index); +static int _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount); static void _SPI_error_callback(void *arg); @@ -1686,8 +1685,7 @@ SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, query_string, plansource->commandTag, stmt_list, - cplan, - plansource); + cplan); /* * Set up options for portal. 
Default SCROLL type is chosen the same way @@ -2502,7 +2500,6 @@ _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, CachedPlanSource *plansource = (CachedPlanSource *) lfirst(lc1); List *stmt_list; ListCell *lc2; - int query_index = 0; spicallbackarg.query = plansource->query_string; @@ -2693,16 +2690,14 @@ _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, snap = InvalidSnapshot; qdesc = CreateQueryDesc(stmt, - cplan, plansource->query_string, snap, crosscheck_snapshot, dest, options->params, _SPI_current->queryEnv, 0); - - res = _SPI_pquery(qdesc, fire_triggers, canSetTag ? options->tcount : 0, - plansource, query_index); + res = _SPI_pquery(qdesc, fire_triggers, + canSetTag ? options->tcount : 0); FreeQueryDesc(qdesc); } else @@ -2799,8 +2794,6 @@ _SPI_execute_plan(SPIPlanPtr plan, const SPIExecuteOptions *options, my_res = res; goto fail; } - - query_index++; } /* Done with this plan, so release refcount */ @@ -2878,8 +2871,7 @@ _SPI_convert_params(int nargs, Oid *argtypes, } static int -_SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount, - CachedPlanSource *plansource, int query_index) +_SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount) { int operation = queryDesc->operation; int eflags; @@ -2935,16 +2927,7 @@ _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount, else eflags = EXEC_FLAG_SKIP_TRIGGERS; - if (queryDesc->cplan) - { - ExecutorStartCachedPlan(queryDesc, eflags, plansource, query_index); - Assert(queryDesc->planstate); - } - else - { - if (!ExecutorStart(queryDesc, eflags)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); - } + ExecutorStart(queryDesc, eflags); ExecutorRun(queryDesc, ForwardScanDirection, tcount); diff --git a/src/backend/jit/README b/src/backend/jit/README index 5427bdf2153..a40950dfb03 100644 --- a/src/backend/jit/README +++ b/src/backend/jit/README @@ -205,7 +205,7 @@ The ability to do so allows us to get the LLVM IR for all operators bitcode files get installed into the server's $pkglibdir/bitcode/postgres/ Using existing LLVM functionality (for parallel LTO compilation), -additionally an index is over these is stored to +additionally an index over these is stored to $pkglibdir/bitcode/postgres.index.bc Similarly extensions can install code into diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile index e8c12060b93..68677ba42e1 100644 --- a/src/backend/jit/llvm/Makefile +++ b/src/backend/jit/llvm/Makefile @@ -31,7 +31,7 @@ endif # All files in this directory use LLVM. 
CFLAGS += $(LLVM_CFLAGS) CXXFLAGS += $(LLVM_CXXFLAGS) -override CPPFLAGS := $(LLVM_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(LLVM_CPPFLAGS) SHLIB_LINK += $(LLVM_LIBS) # Because this module includes C++ files, we need to use a C++ diff --git a/src/backend/jit/llvm/meson.build b/src/backend/jit/llvm/meson.build index c8e06dfbe35..805fbd69006 100644 --- a/src/backend/jit/llvm/meson.build +++ b/src/backend/jit/llvm/meson.build @@ -53,7 +53,7 @@ llvm_irgen_args = [ if ccache.found() llvm_irgen_command = ccache - llvm_irgen_args = [clang.path()] + llvm_irgen_args + llvm_irgen_args = [clang.full_path()] + llvm_irgen_args else llvm_irgen_command = clang endif diff --git a/src/backend/lib/README b/src/backend/lib/README index f2fb591237d..c28cbe356f0 100644 --- a/src/backend/lib/README +++ b/src/backend/lib/README @@ -1,8 +1,6 @@ This directory contains general-purpose data structures, for use anywhere in the backend: -binaryheap.c - a binary heap bipartite_match.c - Hopcroft-Karp maximum cardinality algorithm for bipartite graphs bloomfilter.c - probabilistic, space-efficient set membership testing @@ -21,8 +19,6 @@ pairingheap.c - a pairing heap rbtree.c - a red-black tree -stringinfo.c - an extensible string type Aside from the inherent characteristics of the data structures, there are a few practical differences between the binary heap and the pairing heap. The diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 9f4d05ffbd4..4da46666439 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -94,8 +94,16 @@ static int auth_peer(hbaPort *port); #define PGSQL_PAM_SERVICE "postgresql" /* Service name passed to PAM */ +/* Work around original Solaris' lack of "const" in the conv_proc signature */ +#ifdef _PAM_LEGACY_NONCONST +#define PG_PAM_CONST +#else +#define PG_PAM_CONST const +#endif + static int CheckPAMAuth(Port *port, const char *user, const char *password); -static int pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +static int pam_passwd_conv_proc(int num_msg, + PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr); static struct pam_conv pam_passw_conv = { @@ -1917,7 +1925,7 @@ auth_peer(hbaPort *port) */ static int -pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) { const char *passwd; diff --git a/src/backend/libpq/be-secure-gssapi.c b/src/backend/libpq/be-secure-gssapi.c index 717ba9824f9..5d98c58ffa8 100644 --- a/src/backend/libpq/be-secure-gssapi.c +++ b/src/backend/libpq/be-secure-gssapi.c @@ -46,11 +46,18 @@ * don't want the other side to send arbitrarily huge packets as we * would have to allocate memory for them to then pass them to GSSAPI. * - * Therefore, these two #define's are effectively part of the protocol + * Therefore, this #define is effectively part of the protocol * spec and can't ever be changed. */ -#define PQ_GSS_SEND_BUFFER_SIZE 16384 -#define PQ_GSS_RECV_BUFFER_SIZE 16384 +#define PQ_GSS_MAX_PACKET_SIZE 16384 /* includes uint32 header word */ + +/* + * However, during the authentication exchange we must cope with whatever + * message size the GSSAPI library wants to send (because our protocol + * doesn't support splitting those messages). Depending on configuration + * those messages might be as much as 64kB.
+ */ +#define PQ_GSS_AUTH_BUFFER_SIZE 65536 /* includes uint32 header word */ /* * Since we manage at most one GSS-encrypted connection per backend, @@ -114,9 +121,9 @@ be_gssapi_write(Port *port, const void *ptr, size_t len) * again, so if it offers a len less than that, something is wrong. * * Note: it may seem attractive to report partial write completion once - * we've successfully sent any encrypted packets. However, that can cause - * problems for callers; notably, pqPutMsgEnd's heuristic to send only - * full 8K blocks interacts badly with such a hack. We won't save much, + * we've successfully sent any encrypted packets. However, doing that + * expands the state space of this processing and has been responsible for + * bugs in the past (cf. commit d053a879b). We won't save much, * typically, by letting callers discard data early, so don't risk it. */ if (len < PqGSSSendConsumed) @@ -210,12 +217,12 @@ be_gssapi_write(Port *port, const void *ptr, size_t len) errno = ECONNRESET; return -1; } - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("server tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)))); errno = ECONNRESET; return -1; } @@ -346,12 +353,12 @@ be_gssapi_read(Port *port, void *ptr, size_t len) /* Decode the packet length and check for overlength packet */ input.length = pg_ntoh32(*(uint32 *) PqGSSRecvBuffer); - if (input.length > PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)) + if (input.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("oversize GSSAPI packet sent by the client (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)))); errno = ECONNRESET; return -1; } @@ -517,10 +524,13 @@ secure_open_gssapi(Port *port) * that will never use them, and we ensure that the buffers are * sufficiently aligned for the length-word accesses that we do in some * places in this file. + * + * We'll use PQ_GSS_AUTH_BUFFER_SIZE-sized buffers until transport + * negotiation is complete, then switch to PQ_GSS_MAX_PACKET_SIZE. */ - PqGSSSendBuffer = malloc(PQ_GSS_SEND_BUFFER_SIZE); - PqGSSRecvBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); - PqGSSResultBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); + PqGSSSendBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), @@ -568,16 +578,16 @@ secure_open_gssapi(Port *port) /* * During initialization, packets are always fully consumed and - * shouldn't ever be over PQ_GSS_RECV_BUFFER_SIZE in length. + * shouldn't ever be over PQ_GSS_AUTH_BUFFER_SIZE in total length. * * Verify on our side that the client doesn't do something funny. 
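 *
 * (Each packet on the wire is a 4-byte network-order length word followed
 * by that many bytes of GSSAPI token, which is why every size check in this
 * file compares against a buffer size less sizeof(uint32).)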
*/ - if (input.length > PQ_GSS_RECV_BUFFER_SIZE) + if (input.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { ereport(COMMERROR, - (errmsg("oversize GSSAPI packet sent by the client (%zu > %d)", + (errmsg("oversize GSSAPI packet sent by the client (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE))); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)))); return -1; } @@ -631,12 +641,12 @@ secure_open_gssapi(Port *port) { uint32 netlen = pg_hton32(output.length); - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { ereport(COMMERROR, (errmsg("server tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)))); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)))); gss_release_buffer(&minor, &output); return -1; } @@ -692,11 +702,28 @@ secure_open_gssapi(Port *port) } /* + * Release the large authentication buffers and allocate the ones we want + * for normal operation. + */ + free(PqGSSSendBuffer); + free(PqGSSRecvBuffer); + free(PqGSSResultBuffer); + PqGSSSendBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + PqGSSSendLength = PqGSSSendNext = PqGSSSendConsumed = 0; + PqGSSRecvLength = PqGSSResultLength = PqGSSResultNext = 0; + + /* * Determine the max packet size which will fit in our buffer, after * accounting for the length. be_gssapi_write will need this. */ major = gss_wrap_size_limit(&minor, port->gss->ctx, 1, GSS_C_QOP_DEFAULT, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32), + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32), &PqGSSMaxPktSize); if (GSS_ERROR(major)) diff --git a/src/backend/libpq/be-secure-openssl.c b/src/backend/libpq/be-secure-openssl.c index 64ff3ce3d6a..c8b63ef8249 100644 --- a/src/backend/libpq/be-secure-openssl.c +++ b/src/backend/libpq/be-secure-openssl.c @@ -1436,10 +1436,10 @@ initialize_ecdh(SSL_CTX *context, bool isServerStart) */ ereport(isServerStart ? 
FATAL : LOG, errcode(ERRCODE_CONFIG_FILE_ERROR), - errmsg("failed to set group names specified in ssl_groups: %s", + errmsg("could not set group names specified in ssl_groups: %s", SSLerrmessageExt(ERR_get_error(), _("No valid groups found"))), - errhint("Ensure that each group name is spelled correctly and supported by the installed version of OpenSSL")); + errhint("Ensure that each group name is spelled correctly and supported by the installed version of OpenSSL.")); return false; } #endif diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 332fad27835..fecee8224d0 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -2873,8 +2873,11 @@ check_ident_usermap(IdentLine *identLine, const char *usermap_name, !token_has_regexp(identLine->pg_user) && (ofs = strstr(identLine->pg_user->string, "\\1")) != NULL) { + const char *repl_str; + size_t repl_len; + char *old_pg_user; char *expanded_pg_user; - int offset; + size_t offset; /* substitution of the first argument requested */ if (matches[1].rm_so < 0) @@ -2886,18 +2889,33 @@ check_ident_usermap(IdentLine *identLine, const char *usermap_name, *error_p = true; return; } + repl_str = system_user + matches[1].rm_so; + repl_len = matches[1].rm_eo - matches[1].rm_so; /* - * length: original length minus length of \1 plus length of match - * plus null terminator + * It's allowed to have more than one \1 in the string, and we'll + * replace them all. But that's pretty unusual so we optimize on + * the assumption of only one occurrence, which motivates doing + * repeated replacements instead of making two passes over the + * string to determine the final length right away. */ - expanded_pg_user = palloc0(strlen(identLine->pg_user->string) - 2 + (matches[1].rm_eo - matches[1].rm_so) + 1); - offset = ofs - identLine->pg_user->string; - memcpy(expanded_pg_user, identLine->pg_user->string, offset); - memcpy(expanded_pg_user + offset, - system_user + matches[1].rm_so, - matches[1].rm_eo - matches[1].rm_so); - strcat(expanded_pg_user, ofs + 2); + old_pg_user = identLine->pg_user->string; + do + { + /* + * length: current length minus length of \1 plus length of + * replacement plus null terminator + */ + expanded_pg_user = palloc(strlen(old_pg_user) - 2 + repl_len + 1); + /* ofs points into the old_pg_user string at this point */ + offset = ofs - old_pg_user; + memcpy(expanded_pg_user, old_pg_user, offset); + memcpy(expanded_pg_user + offset, repl_str, repl_len); + strcpy(expanded_pg_user + offset + repl_len, ofs + 2); + if (old_pg_user != identLine->pg_user->string) + pfree(old_pg_user); + old_pg_user = expanded_pg_user; + } while ((ofs = strstr(old_pg_user + offset + repl_len, "\\1")) != NULL); /* * Mark the token as quoted, so it will only be compared literally diff --git a/src/backend/libpq/pg_ident.conf.sample b/src/backend/libpq/pg_ident.conf.sample index f5225f26cdf..8ee6c0ba315 100644 --- a/src/backend/libpq/pg_ident.conf.sample +++ b/src/backend/libpq/pg_ident.conf.sample @@ -13,25 +13,25 @@ # user names to their corresponding PostgreSQL user names. Records # are of the form: # -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME # # (The uppercase quantities must be replaced by actual values.) # # MAPNAME is the (otherwise freely chosen) map name that was used in # pg_hba.conf. SYSTEM-USERNAME is the detected user name of the -# client. PG-USERNAME is the requested PostgreSQL user name. The -# existence of a record specifies that SYSTEM-USERNAME may connect as -# PG-USERNAME. 
+# client. DATABASE-USERNAME is the requested PostgreSQL user name. +# The existence of a record specifies that SYSTEM-USERNAME may connect +# as DATABASE-USERNAME. # -# If SYSTEM-USERNAME starts with a slash (/), it will be treated as a -# regular expression. Optionally this can contain a capture (a -# parenthesized subexpression). The substring matching the capture -# will be substituted for \1 (backslash-one) if present in -# PG-USERNAME. +# If SYSTEM-USERNAME starts with a slash (/), the rest of it will be +# treated as a regular expression. Optionally this can contain a capture +# (a parenthesized subexpression). The substring matching the capture +# will be substituted for \1 (backslash-one) if that appears in +# DATABASE-USERNAME. # -# PG-USERNAME can be "all", a user name, a group name prefixed with "+", or -# a regular expression (if it starts with a slash (/)). If it is a regular -# expression, the substring matching with \1 has no effect. +# DATABASE-USERNAME can be "all", a user name, a group name prefixed with "+", +# or a regular expression (if it starts with a slash (/)). If it is a regular +# expression, no substitution for \1 will occur. # # Multiple maps may be specified in this file and used by pg_hba.conf. # @@ -69,4 +69,4 @@ # Put your actual configuration here # ---------------------------------- -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index e5171467de1..25f739a6a17 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -858,7 +858,6 @@ RemoveSocketFiles(void) (void) unlink(sock_path); } /* Since we're about to exit, no need to reclaim storage */ - sock_paths = NIL; } diff --git a/src/backend/libpq/pqmq.c b/src/backend/libpq/pqmq.c index f1a08bc32ca..5f39949a367 100644 --- a/src/backend/libpq/pqmq.c +++ b/src/backend/libpq/pqmq.c @@ -23,7 +23,7 @@ #include "tcop/tcopprot.h" #include "utils/builtins.h" -static shm_mq_handle *pq_mq_handle; +static shm_mq_handle *pq_mq_handle = NULL; static bool pq_mq_busy = false; static pid_t pq_mq_parallel_leader_pid = 0; static ProcNumber pq_mq_parallel_leader_proc_number = INVALID_PROC_NUMBER; @@ -66,7 +66,11 @@ pq_redirect_to_shm_mq(dsm_segment *seg, shm_mq_handle *mqh) static void pq_cleanup_redirect_to_shm_mq(dsm_segment *seg, Datum arg) { - pq_mq_handle = NULL; + if (pq_mq_handle != NULL) + { + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } whereToSendOutput = DestNone; } @@ -131,8 +135,11 @@ mq_putmessage(char msgtype, const char *s, size_t len) if (pq_mq_busy) { if (pq_mq_handle != NULL) + { shm_mq_detach(pq_mq_handle); - pq_mq_handle = NULL; + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } return EOF; } @@ -152,8 +159,6 @@ mq_putmessage(char msgtype, const char *s, size_t len) iov[1].data = s; iov[1].len = len; - Assert(pq_mq_handle != NULL); - for (;;) { /* @@ -161,6 +166,7 @@ mq_putmessage(char msgtype, const char *s, size_t len) * that the shared memory value is updated before we send the parallel * message signal right after this. 
*/ + Assert(pq_mq_handle != NULL); result = shm_mq_sendv(pq_mq_handle, iov, 2, true, true); if (pq_mq_parallel_leader_pid != 0) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7d63cf94a6b..bdcb5e4f261 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -125,13 +125,17 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("postgres")); /* - * In the postmaster, absorb the environment values for LC_COLLATE and - * LC_CTYPE. Individual backends will change these later to settings - * taken from pg_database, but the postmaster cannot do that. If we leave - * these set to "C" then message localization might not work well in the - * postmaster. + * Collation is handled by pg_locale.c, and the behavior is dependent on + * the provider. strcoll(), etc., should not be called directly. + */ + init_locale("LC_COLLATE", LC_COLLATE, "C"); + + /* + * In the postmaster, absorb the environment value for LC_CTYPE. + * Individual backends will change it later to pg_database.datctype, but + * the postmaster cannot do that. If we leave it set to "C" then message + * localization might not work well in the postmaster. */ - init_locale("LC_COLLATE", LC_COLLATE, ""); init_locale("LC_CTYPE", LC_CTYPE, ""); /* diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index 77659b0f760..9ecddb14231 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -1039,6 +1039,11 @@ _read${n}(void) print $off "\tWRITE_UINT_FIELD($f);\n"; print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read; } + elsif ($t eq 'int64') + { + print $off "\tWRITE_INT64_FIELD($f);\n"; + print $rff "\tREAD_INT64_FIELD($f);\n" unless $no_read; + } elsif ($t eq 'uint64' || $t eq 'AclMode') { @@ -1324,7 +1329,7 @@ _jumble${n}(JumbleState *jstate, Node *node) # Node type. Squash constants if requested. if ($query_jumble_squash) { - print $jff "\tJUMBLE_ELEMENTS($f);\n" + print $jff "\tJUMBLE_ELEMENTS($f, node);\n" unless $query_jumble_ignore; } else diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index ceac3fd8620..eaf391fc2ab 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -51,6 +51,12 @@ static void outDouble(StringInfo str, double d); #define WRITE_UINT_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +/* Write a signed integer field (anything written with INT64_FORMAT) */ +#define WRITE_INT64_FIELD(fldname) \ + appendStringInfo(str, \ + " :" CppAsString(fldname) " " INT64_FORMAT, \ + node->fldname) + /* Write an unsigned integer field (anything written with UINT64_FORMAT) */ #define WRITE_UINT64_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " " UINT64_FORMAT, \ @@ -647,6 +653,8 @@ _outA_Expr(StringInfo str, const A_Expr *node) WRITE_NODE_FIELD(lexpr); WRITE_NODE_FIELD(rexpr); + WRITE_LOCATION_FIELD(rexpr_list_start); + WRITE_LOCATION_FIELD(rexpr_list_end); WRITE_LOCATION_FIELD(location); } diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index d1e82a63f09..31f97151977 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -21,6 +21,11 @@ * tree(s) generated from the query. The executor can then use this value * to blame query costs on the proper queryId. * + * Arrays of two or more constants and PARAM_EXTERN parameters are "squashed" + * and contribute only once to the jumble. 
This has the effect that queries + * that differ only on the length of such lists have the same queryId. + * + * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -56,16 +61,18 @@ int compute_query_id = COMPUTE_QUERY_ID_AUTO; bool query_id_enabled = false; static JumbleState *InitJumble(void); -static uint64 DoJumble(JumbleState *jstate, Node *node); +static int64 DoJumble(JumbleState *jstate, Node *node); static void AppendJumble(JumbleState *jstate, const unsigned char *value, Size size); static void FlushPendingNulls(JumbleState *jstate); static void RecordConstLocation(JumbleState *jstate, - int location, bool squashed); + bool extern_param, + int location, int len); static void _jumbleNode(JumbleState *jstate, Node *node); -static void _jumbleElements(JumbleState *jstate, List *elements); -static void _jumbleA_Const(JumbleState *jstate, Node *node); static void _jumbleList(JumbleState *jstate, Node *node); +static void _jumbleElements(JumbleState *jstate, List *elements, Node *node); +static void _jumbleParam(JumbleState *jstate, Node *node); +static void _jumbleA_Const(JumbleState *jstate, Node *node); static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node); static void _jumbleRangeTblEntry_eref(JumbleState *jstate, RangeTblEntry *rte, @@ -141,12 +148,12 @@ JumbleQuery(Query *query) * If we are unlucky enough to get a hash of zero, use 1 instead for * normal statements and 2 for utility queries. */ - if (query->queryId == UINT64CONST(0)) + if (query->queryId == INT64CONST(0)) { if (query->utilityStmt) - query->queryId = UINT64CONST(2); + query->queryId = INT64CONST(2); else - query->queryId = UINT64CONST(1); + query->queryId = INT64CONST(1); } return jstate; @@ -185,6 +192,7 @@ InitJumble(void) jstate->clocations_count = 0; jstate->highest_extern_param_id = 0; jstate->pending_nulls = 0; + jstate->has_squashed_lists = false; #ifdef USE_ASSERT_CHECKING jstate->total_jumble_len = 0; #endif @@ -197,7 +205,7 @@ InitJumble(void) * Jumble the given Node using the given JumbleState and return the resulting * jumble hash. */ -static uint64 +static int64 DoJumble(JumbleState *jstate, Node *node) { /* Jumble the given node */ @@ -207,10 +215,14 @@ DoJumble(JumbleState *jstate, Node *node) if (jstate->pending_nulls > 0) FlushPendingNulls(jstate); + /* Squashed list found, reset highest_extern_param_id */ + if (jstate->has_squashed_lists) + jstate->highest_extern_param_id = 0; + /* Process the jumble buffer and produce the hash value */ - return DatumGetUInt64(hash_any_extended(jstate->jumble, - jstate->jumble_len, - 0)); + return DatumGetInt64(hash_any_extended(jstate->jumble, + jstate->jumble_len, + 0)); } /* @@ -256,10 +268,10 @@ AppendJumbleInternal(JumbleState *jstate, const unsigned char *item, if (unlikely(jumble_len >= JUMBLE_SIZE)) { - uint64 start_hash; + int64 start_hash; - start_hash = DatumGetUInt64(hash_any_extended(jumble, - JUMBLE_SIZE, 0)); + start_hash = DatumGetInt64(hash_any_extended(jumble, + JUMBLE_SIZE, 0)); memcpy(jumble, &start_hash, sizeof(start_hash)); jumble_len = sizeof(start_hash); } @@ -373,15 +385,17 @@ FlushPendingNulls(JumbleState *jstate) /* - * Record location of constant within query string of query tree that is - * currently being walked. + * Record the location of some kind of constant within a query string. 
+ * These are not only bare constants but also expressions that ultimately + * constitute a constant, such as those inside casts and simple function + * calls; if extern_param, then it corresponds to a PARAM_EXTERN Param. * - * 'squashed' signals that the constant represents the first or the last - * element in a series of merged constants, and everything but the first/last - * element contributes nothing to the jumble hash. + * If length is -1, it indicates a single such constant element. If + * it's a positive integer, it indicates the length of a squashable + * list of them. */ static void -RecordConstLocation(JumbleState *jstate, int location, bool squashed) +RecordConstLocation(JumbleState *jstate, bool extern_param, int location, int len) { /* -1 indicates unknown or undefined location */ if (location >= 0) @@ -396,9 +410,15 @@ RecordConstLocation(JumbleState *jstate, int location, bool squashed) sizeof(LocationLen)); } jstate->clocations[jstate->clocations_count].location = location; - /* initialize lengths to -1 to simplify third-party module usage */ - jstate->clocations[jstate->clocations_count].squashed = squashed; - jstate->clocations[jstate->clocations_count].length = -1; + + /* + * Lengths are either positive integers (indicating a squashable + * list), or -1. + */ + Assert(len > -1 || len == -1); + jstate->clocations[jstate->clocations_count].length = len; + jstate->clocations[jstate->clocations_count].squashed = (len > -1); + jstate->clocations[jstate->clocations_count].extern_param = extern_param; jstate->clocations_count++; } } @@ -407,47 +427,74 @@ RecordConstLocation(JumbleState *jstate, int location, bool squashed) * Subroutine for _jumbleElements: Verify a few simple cases where we can * deduce that the expression is a constant: * - * - Ignore a possible wrapping RelabelType and CoerceViaIO. - * - If it's a FuncExpr, check that the function is an implicit + * - See through any wrapping RelabelType and CoerceViaIO layers. + * - If it's a FuncExpr, check that the function is a builtin * cast and its arguments are Const. - * - Otherwise test if the expression is a simple Const. + * - Otherwise test if the expression is a simple Const or a + * PARAM_EXTERN param. 
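+ *
+ * For example, a bare literal, a literal under a binary-compatible
+ * (RelabelType) or I/O (CoerceViaIO) cast, a builtin cast FuncExpr over
+ * such values, and an external parameter such as $1 all qualify.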
*/ static bool -IsSquashableConst(Node *element) +IsSquashableConstant(Node *element) { - if (IsA(element, RelabelType)) - element = (Node *) ((RelabelType *) element)->arg; - - if (IsA(element, CoerceViaIO)) - element = (Node *) ((CoerceViaIO *) element)->arg; - - if (IsA(element, FuncExpr)) +restart: + switch (nodeTag(element)) { - FuncExpr *func = (FuncExpr *) element; - ListCell *temp; + case T_RelabelType: + /* Unwrap RelabelType */ + element = (Node *) ((RelabelType *) element)->arg; + goto restart; - if (func->funcformat != COERCE_IMPLICIT_CAST && - func->funcformat != COERCE_EXPLICIT_CAST) - return false; + case T_CoerceViaIO: + /* Unwrap CoerceViaIO */ + element = (Node *) ((CoerceViaIO *) element)->arg; + goto restart; - if (func->funcid > FirstGenbkiObjectId) - return false; + case T_Const: + return true; - foreach(temp, func->args) - { - Node *arg = lfirst(temp); + case T_Param: + return castNode(Param, element)->paramkind == PARAM_EXTERN; - if (!IsA(arg, Const)) /* XXX we could recurse here instead */ - return false; - } + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) element; + ListCell *temp; - return true; - } + if (func->funcformat != COERCE_IMPLICIT_CAST && + func->funcformat != COERCE_EXPLICIT_CAST) + return false; - if (!IsA(element, Const)) - return false; + if (func->funcid > FirstGenbkiObjectId) + return false; - return true; + /* + * We can check function arguments recursively, being careful + * about recursing too deep. At each recursion level it's + * enough to test the stack on the first element. (Note that + * I wasn't able to hit this without bloating the stack + * artificially in this function: the parser errors out before + * stack size becomes a problem here.) + */ + foreach(temp, func->args) + { + Node *arg = lfirst(temp); + + if (!IsA(arg, Const)) + { + if (foreach_current_index(temp) == 0 && + stack_is_too_deep()) + return false; + else if (!IsSquashableConstant(arg)) + return false; + } + } + + return true; + } + + default: + return false; + } } /* @@ -457,39 +504,33 @@ IsSquashableConst(Node *element) * Return value indicates if squashing is possible. * * Note that this function searches only for explicit Const nodes with - * possibly very simple decorations on top, and does not try to simplify - * expressions. + * possibly very simple decorations on top and PARAM_EXTERN parameters, + * and does not try to simplify expressions. */ static bool -IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr) +IsSquashableConstantList(List *elements) { ListCell *temp; - /* - * If squashing is disabled, or the list is too short, we don't try to - * squash it. - */ + /* If the list is too short, we don't try to squash it. 
*/ if (list_length(elements) < 2) return false; foreach(temp, elements) { - if (!IsSquashableConst(lfirst(temp))) + if (!IsSquashableConstant(lfirst(temp))) return false; } - *firstExpr = linitial(elements); - *lastExpr = llast(elements); - return true; } #define JUMBLE_NODE(item) \ _jumbleNode(jstate, (Node *) expr->item) -#define JUMBLE_ELEMENTS(list) \ - _jumbleElements(jstate, (List *) expr->list) +#define JUMBLE_ELEMENTS(list, node) \ + _jumbleElements(jstate, (List *) expr->list, node) #define JUMBLE_LOCATION(location) \ - RecordConstLocation(jstate, expr->location, false) + RecordConstLocation(jstate, false, expr->location, -1) #define JUMBLE_FIELD(item) \ do { \ if (sizeof(expr->item) == 8) \ @@ -516,42 +557,6 @@ do { \ #include "queryjumblefuncs.funcs.c" -/* - * We jumble lists of constant elements as one individual item regardless - * of how many elements are in the list. This means different queries - * jumble to the same query_id, if the only difference is the number of - * elements in the list. - */ -static void -_jumbleElements(JumbleState *jstate, List *elements) -{ - Node *first, - *last; - - if (IsSquashableConstList(elements, &first, &last)) - { - /* - * If this list of elements is squashable, keep track of the location - * of its first and last elements. When reading back the locations - * array, we'll see two consecutive locations with ->squashed set to - * true, indicating the location of initial and final elements of this - * list. - * - * For the limited set of cases we support now (implicit coerce via - * FuncExpr, Const) it's fine to use exprLocation of the 'last' - * expression, but if more complex composite expressions are to be - * supported (e.g., OpExpr or FuncExpr as an explicit call), more - * sophisticated tracking will be needed. - */ - RecordConstLocation(jstate, exprLocation(first), true); - RecordConstLocation(jstate, exprLocation(last), true); - } - else - { - _jumbleNode(jstate, (Node *) elements); - } -} - static void _jumbleNode(JumbleState *jstate, Node *node) { @@ -593,26 +598,6 @@ _jumbleNode(JumbleState *jstate, Node *node) break; } - /* Special cases to handle outside the automated code */ - switch (nodeTag(expr)) - { - case T_Param: - { - Param *p = (Param *) node; - - /* - * Update the highest Param id seen, in order to start - * normalization correctly. - */ - if (p->paramkind == PARAM_EXTERN && - p->paramid > jstate->highest_extern_param_id) - jstate->highest_extern_param_id = p->paramid; - } - break; - default: - break; - } - /* Ensure we added something to the jumble buffer */ Assert(jstate->total_jumble_len > prev_jumble_len); } @@ -648,6 +633,79 @@ _jumbleList(JumbleState *jstate, Node *node) } } +/* + * We try to jumble lists of expressions as one individual item regardless + * of how many elements are in the list. This is known as squashing, which + * results in different queries jumbling to the same query_id, if the only + * difference is the number of elements in the list. + * + * We allow constants and PARAM_EXTERN parameters to be squashed. To normalize + * such queries, we use the start and end locations of the list of elements in + * a list.
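+ *
+ * For example, "WHERE x IN (1, 2, 3)" and "WHERE x IN (1, 2, 3, 4)" jumble
+ * to the same query_id: the region between the list's start and end
+ * locations is recorded as one squashed location for normalization.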
+ */
+static void
+_jumbleElements(JumbleState *jstate, List *elements, Node *node)
+{
+	bool		normalize_list = false;
+
+	if (IsSquashableConstantList(elements))
+	{
+		if (IsA(node, ArrayExpr))
+		{
+			ArrayExpr  *aexpr = (ArrayExpr *) node;
+
+			if (aexpr->list_start > 0 && aexpr->list_end > 0)
+			{
+				RecordConstLocation(jstate,
+									false,
+									aexpr->list_start + 1,
+									(aexpr->list_end - aexpr->list_start) - 1);
+				normalize_list = true;
+				jstate->has_squashed_lists = true;
+			}
+		}
+	}
+
+	if (!normalize_list)
+	{
+		_jumbleNode(jstate, (Node *) elements);
+	}
+}
+
+/*
+ * We store the highest param ID of extern params. This can later be used
+ * to start the numbering of the placeholder for squashed lists.
+ */
+static void
+_jumbleParam(JumbleState *jstate, Node *node)
+{
+	Param	   *expr = (Param *) node;
+
+	JUMBLE_FIELD(paramkind);
+	JUMBLE_FIELD(paramid);
+	JUMBLE_FIELD(paramtype);
+	/* paramtypmod and paramcollid are ignored */
+
+	if (expr->paramkind == PARAM_EXTERN)
+	{
+		/*
+		 * At this point, only external parameter locations outside of
+		 * squashable lists will be recorded.
+		 */
+		RecordConstLocation(jstate, true, expr->location, -1);
+
+		/*
+		 * Update the highest Param id seen, in order to start normalization
+		 * correctly.
+		 *
+		 * Note: This value is reset at the end of jumbling if there exists a
+		 * squashable list. See the comment in the definition of JumbleState.
+		 */
+		if (expr->paramid > jstate->highest_extern_param_id)
+			jstate->highest_extern_param_id = expr->paramid;
+	}
+}
+
static void
_jumbleA_Const(JumbleState *jstate, Node *node)
{
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 64d3a09f765..48b5d13b9b6 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -68,6 +68,12 @@
	token = pg_strtok(&length); /* get field value */ \
	local_node->fldname = atoui(token)

+/* Read a signed integer field (anything written using INT64_FORMAT) */
+#define READ_INT64_FIELD(fldname) \
+	token = pg_strtok(&length); /* skip :fldname */ \
+	token = pg_strtok(&length); /* get field value */ \
+	local_node->fldname = strtoi64(token, NULL, 10)
+
/* Read an unsigned integer field (anything written using UINT64_FORMAT) */
#define READ_UINT64_FIELD(fldname) \
	token = pg_strtok(&length); /* skip :fldname */ \
	token = pg_strtok(&length); /* get field value */ \
@@ -520,6 +526,8 @@ _readA_Expr(void)
	READ_NODE_FIELD(lexpr);
	READ_NODE_FIELD(rexpr);
+	READ_LOCATION_FIELD(rexpr_list_start);
+	READ_LOCATION_FIELD(rexpr_list_end);
	READ_LOCATION_FIELD(location);

	READ_DONE();
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 3d44815ed5a..344a3188317 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2247,7 +2247,7 @@ append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
 * Determines and returns the cost of an Append node.
 */
void
-cost_append(AppendPath *apath)
+cost_append(AppendPath *apath, PlannerInfo *root)
{
	ListCell   *l;
@@ -2309,26 +2309,52 @@ cost_append(AppendPath *apath)
	foreach(l, apath->subpaths)
	{
		Path	   *subpath = (Path *) lfirst(l);
-		Path		sort_path;	/* dummy for result of cost_sort */
+		int			presorted_keys;
+		Path		sort_path;	/* dummy for result of
+								 * cost_sort/cost_incremental_sort */

-		if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
+		if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys,
+										 &presorted_keys))
		{
			/*
			 * We'll need to insert a Sort node, so include costs for
-			 * that.
We choose to use incremental sort if it is + * enabled and there are presorted keys; otherwise we use + * full sort. + * + * We can use the parent's LIMIT if any, since we * certainly won't pull more than that many tuples from * any child. */ - cost_sort(&sort_path, - NULL, /* doesn't currently need root */ - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - apath->limit_tuples); + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + apath->limit_tuples); + } + subpath = &sort_path; } @@ -2546,13 +2572,13 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, Cost input_startup_cost = mpath->subpath->startup_cost; Cost input_total_cost = mpath->subpath->total_cost; double tuples = mpath->subpath->rows; - double calls = mpath->calls; + Cardinality est_calls = mpath->est_calls; int width = mpath->subpath->pathtarget->width; double hash_mem_bytes; double est_entry_bytes; - double est_cache_entries; - double ndistinct; + Cardinality est_cache_entries; + Cardinality ndistinct; double evict_ratio; double hit_ratio; Cost startup_cost; @@ -2578,7 +2604,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, est_cache_entries = floor(hash_mem_bytes / est_entry_bytes); /* estimate on the distinct number of parameter values */ - ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL, + ndistinct = estimate_num_groups(root, mpath->param_exprs, est_calls, NULL, &estinfo); /* @@ -2590,7 +2616,10 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * certainly mean a MemoizePath will never survive add_path(). */ if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0) - ndistinct = calls; + ndistinct = est_calls; + + /* Remember the ndistinct estimate for EXPLAIN */ + mpath->est_unique_keys = ndistinct; /* * Since we've already estimated the maximum number of entries we can @@ -2618,9 +2647,12 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * must look at how many scans are estimated in total for this node and * how many of those scans we expect to get a cache hit. */ - hit_ratio = ((calls - ndistinct) / calls) * + hit_ratio = ((est_calls - ndistinct) / est_calls) * (est_cache_entries / Max(ndistinct, est_cache_entries)); + /* Remember the hit ratio estimate for EXPLAIN */ + mpath->est_hit_ratio = hit_ratio; + Assert(hit_ratio >= 0 && hit_ratio <= 1.0); /* diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 26f0336f1e4..ebedc5574ca 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -154,13 +154,17 @@ add_paths_to_joinrel(PlannerInfo *root, /* * See if the inner relation is provably unique for this outer rel. * - * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't - * matter since the executor can make the equivalent optimization anyway; - * we need not expend planner cycles on proofs. 
For JOIN_UNIQUE_INNER, we - * must be considering a semijoin whose inner side is not provably unique - * (else reduce_unique_semijoins would've simplified it), so there's no - * point in calling innerrel_is_unique. However, if the LHS covers all of - * the semijoin's min_lefthand, then it's appropriate to set inner_unique + * We have some special cases: for JOIN_SEMI, it doesn't matter since the + * executor can make the equivalent optimization anyway. It also doesn't + * help enable use of Memoize, since a semijoin with a provably unique + * inner side should have been reduced to an inner join in that case. + * Therefore, we need not expend planner cycles on proofs. (For + * JOIN_ANTI, although it doesn't help the executor for the same reason, + * it can benefit Memoize paths.) For JOIN_UNIQUE_INNER, we must be + * considering a semijoin whose inner side is not provably unique (else + * reduce_unique_semijoins would've simplified it), so there's no point in + * calling innerrel_is_unique. However, if the LHS covers all of the + * semijoin's min_lefthand, then it's appropriate to set inner_unique * because the path produced by create_unique_path will be unique relative * to the LHS. (If we have an LHS that's only part of the min_lefthand, * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid @@ -169,12 +173,6 @@ add_paths_to_joinrel(PlannerInfo *root, switch (jointype) { case JOIN_SEMI: - case JOIN_ANTI: - - /* - * XXX it may be worth proving this to allow a Memoize to be - * considered for Nested Loop Semi/Anti Joins. - */ extra.inner_unique = false; /* well, unproven */ break; case JOIN_UNIQUE_INNER: @@ -715,16 +713,21 @@ get_memoize_path(PlannerInfo *root, RelOptInfo *innerrel, return NULL; /* - * Currently we don't do this for SEMI and ANTI joins unless they're - * marked as inner_unique. This is because nested loop SEMI/ANTI joins - * don't scan the inner node to completion, which will mean memoize cannot - * mark the cache entry as complete. - * - * XXX Currently we don't attempt to mark SEMI/ANTI joins as inner_unique - * = true. Should we? See add_paths_to_joinrel() + * Currently we don't do this for SEMI and ANTI joins, because nested loop + * SEMI/ANTI joins don't scan the inner node to completion, which means + * memoize cannot mark the cache entry as complete. Nor can we mark the + * cache entry as complete after fetching the first inner tuple, because + * if that tuple and the current outer tuple don't satisfy the join + * clauses, a second inner tuple that satisfies the parameters would find + * the cache entry already marked as complete. The only exception is when + * the inner relation is provably unique, as in that case, there won't be + * a second matching tuple and we can safely mark the cache entry as + * complete after fetching the first inner tuple. Note that in such + * cases, the SEMI join should have been reduced to an inner join by + * reduce_unique_semijoins. */ - if (!extra->inner_unique && (jointype == JOIN_SEMI || - jointype == JOIN_ANTI)) + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && + !extra->inner_unique) return NULL; /* @@ -876,16 +879,13 @@ try_nestloop_path(PlannerInfo *root, /* * Check to see if proposed path is still parameterized, and reject if the * parameterization wouldn't be sensible --- unless allow_star_schema_join - * says to allow it anyway. Also, we must reject if have_dangerous_phv - * doesn't like the look of it, which could only happen if the nestloop is - * still parameterized. + * says to allow it anyway. 
+ * says to allow it anyway.
*/ required_outer = calc_nestloop_required_outer(outerrelids, outer_paramrels, innerrelids, inner_paramrels); if (required_outer && - ((!bms_overlap(required_outer, extra->param_source_rels) && - !allow_star_schema_join(root, outerrelids, inner_paramrels)) || - have_dangerous_phv(root, outerrelids, inner_paramrels))) + !bms_overlap(required_outer, extra->param_source_rels) && + !allow_star_schema_join(root, outerrelids, inner_paramrels)) { /* Waste no memory when we reject a path here */ bms_free(required_outer); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 60d65762b5d..aad41b94009 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -565,9 +565,6 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * Also, if the lateral reference is only indirect, we should reject * the join; whatever rel(s) the reference chain goes through must be * joined to first. - * - * Another case that might keep us from building a valid plan is the - * implementation restriction described by have_dangerous_phv(). */ lateral_fwd = bms_overlap(rel1->relids, rel2->lateral_relids); lateral_rev = bms_overlap(rel2->relids, rel1->lateral_relids); @@ -584,9 +581,6 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, /* check there is a direct reference from rel2 to rel1 */ if (!bms_overlap(rel1->relids, rel2->direct_lateral_relids)) return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel1->relids, rel2->lateral_relids)) - return false; /* might be unable to handle required PHV */ } else if (lateral_rev) { @@ -599,9 +593,6 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, /* check there is a direct reference from rel1 to rel2 */ if (!bms_overlap(rel2->relids, rel1->direct_lateral_relids)) return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel2->relids, rel1->lateral_relids)) - return false; /* might be unable to handle required PHV */ } /* @@ -1279,57 +1270,6 @@ has_legal_joinclause(PlannerInfo *root, RelOptInfo *rel) /* - * There's a pitfall for creating parameterized nestloops: suppose the inner - * rel (call it A) has a parameter that is a PlaceHolderVar, and that PHV's - * minimum eval_at set includes the outer rel (B) and some third rel (C). - * We might think we could create a B/A nestloop join that's parameterized by - * C. But we would end up with a plan in which the PHV's expression has to be - * evaluated as a nestloop parameter at the B/A join; and the executor is only - * set up to handle simple Vars as NestLoopParams. Rather than add complexity - * and overhead to the executor for such corner cases, it seems better to - * forbid the join. (Note that we can still make use of A's parameterized - * path with pre-joined B+C as the outer rel. have_join_order_restriction() - * ensures that we will consider making such a join even if there are not - * other reasons to do so.) - * - * So we check whether any PHVs used in the query could pose such a hazard. - * We don't have any simple way of checking whether a risky PHV would actually - * be used in the inner plan, and the case is so unusual that it doesn't seem - * worth working very hard on it. - * - * This needs to be checked in two places. 
If the inner rel's minimum - * parameterization would trigger the restriction, then join_is_legal() should - * reject the join altogether, because there will be no workable paths for it. - * But joinpath.c has to check again for every proposed nestloop path, because - * the inner path might have more than the minimum parameterization, causing - * some PHV to be dangerous for it that otherwise wouldn't be. - */ -bool -have_dangerous_phv(PlannerInfo *root, - Relids outer_relids, Relids inner_params) -{ - ListCell *lc; - - foreach(lc, root->placeholder_list) - { - PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(lc); - - if (!bms_is_subset(phinfo->ph_eval_at, inner_params)) - continue; /* ignore, could not be a nestloop param */ - if (!bms_overlap(phinfo->ph_eval_at, outer_relids)) - continue; /* ignore, not relevant to this join */ - if (bms_is_subset(phinfo->ph_eval_at, outer_relids)) - continue; /* safe, it can be eval'd within outerrel */ - /* Otherwise, it's potentially unsafe, so reject the join */ - return true; - } - - /* OK to perform the join */ - return false; -} - - -/* * is_dummy_rel --- has relation been proven empty? */ bool diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 4ad30b7627e..bfefc7dbea1 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -284,7 +284,10 @@ static Material *make_material(Plan *lefttree); static Memoize *make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids); + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, + Cardinality est_unique_keys, + double est_hit_ratio); static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, @@ -1318,6 +1321,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* * Compute sort column info, and adjust subplan's tlist as needed. @@ -1353,14 +1357,38 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path, int flags) numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and + * there are presorted keys; otherwise we use full sort. 
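+			 * (For example, assuming query pathkeys (a, b): a child plan
+			 * already sorted by (a) has presorted_keys = 1 and receives an
+			 * Incremental Sort node, while a child with no usefully sorted
+			 * prefix receives a full Sort node.)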
+ */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } } @@ -1491,6 +1519,7 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, Oid *sortOperators; Oid *collations; bool *nullsFirst; + int presorted_keys; /* Build the child plan */ /* Must insist that all children return the same tlist */ @@ -1525,14 +1554,38 @@ create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_path, numsortkeys * sizeof(bool)) == 0); /* Now, insert a Sort node if subplan isn't sufficiently ordered */ - if (!pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - Sort *sort = make_sort(subplan, numsortkeys, + Plan *sort_plan; + + /* + * We choose to use incremental sort if it is enabled and there + * are presorted keys; otherwise we use full sort. + */ + if (enable_incremental_sort && presorted_keys > 0) + { + sort_plan = (Plan *) + make_incrementalsort(subplan, numsortkeys, presorted_keys, sortColIdx, sortOperators, collations, nullsFirst); - label_sort_with_costsize(root, sort, best_path->limit_tuples); - subplan = (Plan *) sort; + label_incrementalsort_with_costsize(root, + (IncrementalSort *) sort_plan, + pathkeys, + best_path->limit_tuples); + } + else + { + sort_plan = (Plan *) make_sort(subplan, numsortkeys, + sortColIdx, sortOperators, + collations, nullsFirst); + + label_sort_with_costsize(root, (Sort *) sort_plan, + best_path->limit_tuples); + } + + subplan = sort_plan; } subplans = lappend(subplans, subplan); @@ -1703,7 +1756,8 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags) plan = make_memoize(subplan, operators, collations, param_exprs, best_path->singlerow, best_path->binary_mode, - best_path->est_entries, keyparamids); + best_path->est_entries, keyparamids, best_path->est_calls, + best_path->est_unique_keys, best_path->est_hit_ratio); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -4344,13 +4398,16 @@ create_nestloop_plan(PlannerInfo *root, NestLoop *join_plan; Plan *outer_plan; Plan *inner_plan; + Relids outerrelids; List *tlist = build_path_tlist(root, &best_path->jpath.path); List *joinrestrictclauses = best_path->jpath.joinrestrictinfo; List *joinclauses; List *otherclauses; - Relids outerrelids; List *nestParams; + List *outer_tlist; + bool outer_parallel_safe; Relids saveOuterRels = root->curOuterRels; + ListCell *lc; /* * If the inner path is parameterized by the topmost parent of the outer @@ -4372,8 +4429,8 @@ create_nestloop_plan(PlannerInfo *root, outer_plan = create_plan_recurse(root, best_path->jpath.outerjoinpath, 0); /* For a nestloop, include outer relids in curOuterRels for inner side */ - root->curOuterRels = bms_union(root->curOuterRels, - best_path->jpath.outerjoinpath->parent->relids); + outerrelids = best_path->jpath.outerjoinpath->parent->relids; + root->curOuterRels = bms_union(root->curOuterRels, outerrelids); inner_plan = 
create_plan_recurse(root, best_path->jpath.innerjoinpath, 0); @@ -4412,9 +4469,66 @@ create_nestloop_plan(PlannerInfo *root, * Identify any nestloop parameters that should be supplied by this join * node, and remove them from root->curOuterParams. */ - outerrelids = best_path->jpath.outerjoinpath->parent->relids; - nestParams = identify_current_nestloop_params(root, outerrelids); + nestParams = identify_current_nestloop_params(root, + outerrelids, + PATH_REQ_OUTER((Path *) best_path)); + + /* + * While nestloop parameters that are Vars had better be available from + * the outer_plan already, there are edge cases where nestloop parameters + * that are PHVs won't be. In such cases we must add them to the + * outer_plan's tlist, since the executor's NestLoopParam machinery + * requires the params to be simple outer-Var references to that tlist. + * (This is cheating a little bit, because the outer path's required-outer + * relids might not be enough to allow evaluating such a PHV. But in + * practice, if we could have evaluated the PHV at the nestloop node, we + * can do so in the outer plan too.) + */ + outer_tlist = outer_plan->targetlist; + outer_parallel_safe = outer_plan->parallel_safe; + foreach(lc, nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + PlaceHolderVar *phv; + TargetEntry *tle; + + if (IsA(nlp->paramval, Var)) + continue; /* nothing to do for simple Vars */ + /* Otherwise it must be a PHV */ + phv = castNode(PlaceHolderVar, nlp->paramval); + + if (tlist_member((Expr *) phv, outer_tlist)) + continue; /* already available */ + + /* + * It's possible that nestloop parameter PHVs selected to evaluate + * here contain references to surviving root->curOuterParams items + * (that is, they reference values that will be supplied by some + * higher-level nestloop). Those need to be converted to Params now. + * Note: it's safe to do this after the tlist_member() check, because + * equal() won't pay attention to phv->phexpr. + */ + phv->phexpr = (Expr *) replace_nestloop_params(root, + (Node *) phv->phexpr); + + /* Make a shallow copy of outer_tlist, if we didn't already */ + if (outer_tlist == outer_plan->targetlist) + outer_tlist = list_copy(outer_tlist); + /* ... and add the needed expression */ + tle = makeTargetEntry((Expr *) copyObject(phv), + list_length(outer_tlist) + 1, + NULL, + true); + outer_tlist = lappend(outer_tlist, tle); + /* ... 
and track whether tlist is (still) parallel-safe */ + if (outer_parallel_safe) + outer_parallel_safe = is_parallel_safe(root, (Node *) phv); + } + if (outer_tlist != outer_plan->targetlist) + outer_plan = change_plan_targetlist(outer_plan, outer_tlist, + outer_parallel_safe); + /* And finally, we can build the join plan node */ join_plan = make_nestloop(tlist, joinclauses, otherclauses, @@ -6639,7 +6753,9 @@ materialize_finished_plan(Plan *subplan) static Memoize * make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids) + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, Cardinality est_unique_keys, + double est_hit_ratio) { Memoize *node = makeNode(Memoize); Plan *plan = &node->plan; @@ -6657,6 +6773,9 @@ make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, node->binary_mode = binary_mode; node->est_entries = est_entries; node->keyparamids = keyparamids; + node->est_calls = est_calls; + node->est_unique_keys = est_unique_keys; + node->est_hit_ratio = est_hit_ratio; return node; } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 01804b085b3..3e3fec89252 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -3048,36 +3048,16 @@ add_base_clause_to_rel(PlannerInfo *root, Index relid, * expr_is_nonnullable * Check to see if the Expr cannot be NULL * - * If the Expr is a simple Var that is defined NOT NULL and meanwhile is not - * nulled by any outer joins, then we can know that it cannot be NULL. + * Currently we only support simple Vars. */ static bool expr_is_nonnullable(PlannerInfo *root, Expr *expr) { - RelOptInfo *rel; - Var *var; - /* For now only check simple Vars */ if (!IsA(expr, Var)) return false; - var = (Var *) expr; - - /* could the Var be nulled by any outer joins? */ - if (!bms_is_empty(var->varnullingrels)) - return false; - - /* system columns cannot be NULL */ - if (var->varattno < 0) - return true; - - /* is the column defined NOT NULL? */ - rel = find_base_rel(root, var->varno); - if (var->varattno > 0 && - bms_is_member(var->varattno, rel->notnullattnums)) - return true; - - return false; + return var_is_nonnullable(root, (Var *) expr, true); } /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 49ad6e83578..d59d6e4c6a0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -331,7 +331,6 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->finalrteperminfos = NIL; glob->finalrowmarks = NIL; glob->resultRelations = NIL; - glob->firstResultRels = NIL; glob->appendRelations = NIL; glob->partPruneInfos = NIL; glob->relationOids = NIL; @@ -343,6 +342,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->transientPlan = false; glob->dependsOnRole = false; glob->partition_directory = NULL; + glob->rel_notnullatts_hash = NULL; /* * Assess whether it's feasible to use parallel mode for this query. 
We @@ -558,6 +558,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->commandType = parse->commandType; result->queryId = parse->queryId; + result->planOrigin = PLAN_STMT_STANDARD; result->hasReturning = (parse->returningList != NIL); result->hasModifyingCTE = parse->hasModifyingCTE; result->canSetTag = parse->canSetTag; @@ -571,7 +572,6 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->prunableRelids); result->permInfos = glob->finalrteperminfos; result->resultRelations = glob->resultRelations; - result->firstResultRels = glob->firstResultRels; result->appendRelations = glob->appendRelations; result->subplans = glob->subplans; result->rewindPlanIDs = glob->rewindPlanIDs; @@ -723,6 +723,18 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, transform_MERGE_to_join(parse); /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. Note that this step does not descend into sublinks and + * subqueries; if we pull up any sublinks or subqueries below, their + * relation RTEs are processed just before pulling them up. + */ + parse = root->parse = preprocess_relation_rtes(root); + + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. */ @@ -746,14 +758,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, preprocess_function_rtes(root); /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. Recursion issues here are handled in the - * same way as for SubLinks. - */ - parse = root->parse = expand_virtual_generated_columns(root); - - /* * Check to see if any subqueries in the jointree can be merged into this * query. */ @@ -789,23 +793,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, switch (rte->rtekind) { - case RTE_RELATION: - if (rte->inh) - { - /* - * Check to see if the relation actually has any children; - * if not, clear the inh flag so we can treat it as a - * plain base relation. - * - * Note: this could give a false-positive result, if the - * rel once had children but no longer does. We used to - * be able to clear rte->inh later on when we discovered - * that, but no more; we have to handle such cases as - * full-fledged inheritance. - */ - rte->inh = has_subclass(rte->relid); - } - break; case RTE_JOIN: root->hasJoinRTEs = true; if (IS_OUTER_JOIN(rte->jointype)) @@ -6881,7 +6868,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) * * tableOid is the table on which the index is to be built. indexOid is the * OID of an index to be created or reindexed (which must be an index with - * support for parallel builds - currently btree or BRIN). + * support for parallel builds - currently btree, GIN, or BRIN). * * Return value is the number of parallel worker processes to request. It * may be unsafe to proceed if this is 0. 
Note that this does not include the diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 150e9f060ee..846e44186c3 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1097,9 +1097,10 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) /* * Set up the visible plan targetlist as being the same as - * the first RETURNING list. This is for the use of - * EXPLAIN; the executor won't pay any attention to the - * targetlist. We postpone this step until here so that + * the first RETURNING list. This is mostly for the use + * of EXPLAIN; the executor won't execute that targetlist, + * although it does use it to prepare the node's result + * tuple slot. We postpone this step until here so that * we don't have to do set_returning_clause_references() * twice on identical targetlists. */ @@ -1248,9 +1249,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) lappend_int(root->glob->resultRelations, splan->rootRelation); } - root->glob->firstResultRels = - lappend_int(root->glob->firstResultRels, - linitial_int(splan->resultRelations)); } break; case T_Append: diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index e7cb3fede66..d71ed958e31 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1454,6 +1454,7 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, Query *parse = root->parse; Query *subselect = (Query *) sublink->subselect; Node *whereClause; + PlannerInfo subroot; int rtoffset; int varno; Relids clause_varnos; @@ -1516,6 +1517,35 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, return NULL; /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + * + * Note: we construct up an entirely dummy PlannerInfo for use here. This + * is fine because only the "glob" and "parse" links will be used in this + * case. + * + * Note: we temporarily assign back the WHERE clause so that any virtual + * generated column references within it can be expanded. It should be + * separated out again afterward. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + subselect->jointree->quals = whereClause; + subselect = preprocess_relation_rtes(&subroot); + + /* + * Now separate out the WHERE clause again. + */ + whereClause = subselect->jointree->quals; + subselect->jointree->quals = NULL; + + /* * The subquery must have a nonempty jointree, but we can make it so. */ replace_empty_jointree(subselect); @@ -1732,6 +1762,7 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds) { Node *whereClause; + PlannerInfo subroot; List *leftargs, *rightargs, *opids, @@ -1791,12 +1822,15 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, * parent aliases were flattened already, and we're not going to pull any * child Vars (of any description) into the parent. 
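 * (Illustrative summary, not part of the original comment: as in the
 * hunks above and below, the dummy-PlannerInfo pattern zeroes a
 * stack-local subroot and fills in only subroot.glob and subroot.parse,
 * the only fields the callees consult.)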
* - * Note: passing the parent's root to eval_const_expressions is - * technically wrong, but we can get away with it since only the - * boundParams (if any) are used, and those would be the same in a - * subroot. - */ - whereClause = eval_const_expressions(root, whereClause); + * Note: we construct up an entirely dummy PlannerInfo to pass to + * eval_const_expressions. This is fine because only the "glob" and + * "parse" links are used by eval_const_expressions. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + whereClause = eval_const_expressions(&subroot, whereClause); whereClause = (Node *) canonicalize_qual((Expr *) whereClause, false); whereClause = (Node *) make_ands_implicit((Expr *) whereClause); diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 87dc6f56b57..35e8d3c183b 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -4,10 +4,10 @@ * Planner preprocessing for subqueries and join tree manipulation. * * NOTE: the intended sequence for invoking these operations is + * preprocess_relation_rtes * replace_empty_jointree * pull_up_sublinks * preprocess_function_rtes - * expand_virtual_generated_columns * pull_up_subqueries * flatten_simple_union_all * do expression preprocessing (including flattening JOIN alias vars) @@ -36,6 +36,7 @@ #include "optimizer/clauses.h" #include "optimizer/optimizer.h" #include "optimizer/placeholder.h" +#include "optimizer/plancat.h" #include "optimizer/prep.h" #include "optimizer/subselect.h" #include "optimizer/tlist.h" @@ -102,6 +103,9 @@ typedef struct reduce_outer_joins_partial_state Relids unreduced_side; /* relids in its still-nullable side */ } reduce_outer_joins_partial_state; +static Query *expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation); static Node *pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, Relids *relids); static Node *pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, @@ -393,6 +397,181 @@ transform_MERGE_to_join(Query *parse) } /* + * preprocess_relation_rtes + * Do the preprocessing work for any relation RTEs in the FROM clause. + * + * This scans the rangetable for relation RTEs and retrieves the necessary + * catalog information for each relation. Using this information, it clears + * the inh flag for any relation that has no children, collects not-null + * attribute numbers for any relation that has column not-null constraints, and + * expands virtual generated columns for any relation that contains them. + * + * Note that expanding virtual generated columns may cause the query tree to + * have new copies of rangetable entries. Therefore, we have to use list_nth + * instead of foreach when iterating over the query's rangetable. + * + * Returns a modified copy of the query tree, if any relations with virtual + * generated columns are present. + */ +Query * +preprocess_relation_rtes(PlannerInfo *root) +{ + Query *parse = root->parse; + int rtable_size; + int rt_index; + + rtable_size = list_length(parse->rtable); + + for (rt_index = 0; rt_index < rtable_size; rt_index++) + { + RangeTblEntry *rte = rt_fetch(rt_index + 1, parse->rtable); + Relation relation; + + /* We only care about relation RTEs. */ + if (rte->rtekind != RTE_RELATION) + continue; + + /* + * We need not lock the relation since it was already locked by the + * rewriter. 
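+	 * (Hence table_open() with NoLock just below: NoLock never means "no
+	 * lock needed", only that a suitable lock is already held elsewhere.)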
+ */ + relation = table_open(rte->relid, NoLock); + + /* + * Check to see if the relation actually has any children; if not, + * clear the inh flag so we can treat it as a plain base relation. + * + * Note: this could give a false-positive result, if the rel once had + * children but no longer does. We used to be able to clear rte->inh + * later on when we discovered that, but no more; we have to handle + * such cases as full-fledged inheritance. + */ + if (rte->inh) + rte->inh = relation->rd_rel->relhassubclass; + + /* + * Check to see if the relation has any column not-null constraints; + * if so, retrieve the constraint information and store it in a + * relation OID based hash table. + */ + get_relation_notnullatts(root, relation); + + /* + * Check to see if the relation has any virtual generated columns; if + * so, replace all Var nodes in the query that reference these columns + * with the generation expressions. + */ + parse = expand_virtual_generated_columns(root, parse, + rte, rt_index + 1, + relation); + + table_close(relation, NoLock); + } + + return parse; +} + +/* + * expand_virtual_generated_columns + * Expand virtual generated columns for the given relation. + * + * This checks whether the given relation has any virtual generated columns, + * and if so, replaces all Var nodes in the query that reference those columns + * with their generation expressions. + * + * Returns a modified copy of the query tree if the relation contains virtual + * generated columns. + */ +static Query * +expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation) +{ + TupleDesc tupdesc; + + /* Only normal relations can have virtual generated columns */ + Assert(rte->rtekind == RTE_RELATION); + + tupdesc = RelationGetDescr(relation); + if (tupdesc->constr && tupdesc->constr->has_generated_virtual) + { + List *tlist = NIL; + pullup_replace_vars_context rvcontext; + + for (int i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + TargetEntry *tle; + + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + Node *defexpr; + + defexpr = build_generation_expression(relation, i + 1); + ChangeVarNodes(defexpr, 1, rt_index, 0); + + tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + else + { + Var *var; + + var = makeVar(rt_index, + i + 1, + attr->atttypid, + attr->atttypmod, + attr->attcollation, + 0); + + tle = makeTargetEntry((Expr *) var, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + } + + Assert(list_length(tlist) > 0); + Assert(!rte->lateral); + + /* + * The relation's targetlist items are now in the appropriate form to + * insert into the query, except that we may need to wrap them in + * PlaceHolderVars. Set up required context data for + * pullup_replace_vars. + */ + rvcontext.root = root; + rvcontext.targetlist = tlist; + rvcontext.target_rte = rte; + rvcontext.result_relation = parse->resultRelation; + /* won't need these values */ + rvcontext.relids = NULL; + rvcontext.nullinfo = NULL; + /* pass NULL for outer_hasSubLinks */ + rvcontext.outer_hasSubLinks = NULL; + rvcontext.varno = rt_index; + /* this flag will be set below, if needed */ + rvcontext.wrap_option = REPLACE_WRAP_NONE; + /* initialize cache array with indexes 0 .. 
length(tlist) */ + rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * + sizeof(Node *)); + + /* + * If the query uses grouping sets, we need a PlaceHolderVar for each + * expression of the relation's targetlist items. (See comments in + * pull_up_simple_subquery().) + */ + if (parse->groupingSets) + rvcontext.wrap_option = REPLACE_WRAP_ALL; + + /* + * Apply pullup variable replacement throughout the query tree. + */ + parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); + } + + return parse; +} + +/* * replace_empty_jointree * If the Query's jointree is empty, replace it with a dummy RTE_RESULT * relation. @@ -950,128 +1129,6 @@ preprocess_function_rtes(PlannerInfo *root) } /* - * expand_virtual_generated_columns - * Expand all virtual generated column references in a query. - * - * This scans the rangetable for relations with virtual generated columns, and - * replaces all Var nodes in the query that reference these columns with the - * generation expressions. Note that we do not descend into subqueries; that - * is taken care of when the subqueries are planned. - * - * This has to be done after we have pulled up any SubLinks within the query's - * quals; otherwise any virtual generated column references within the SubLinks - * that should be transformed into joins wouldn't get expanded. - * - * Returns a modified copy of the query tree, if any relations with virtual - * generated columns are present. - */ -Query * -expand_virtual_generated_columns(PlannerInfo *root) -{ - Query *parse = root->parse; - int rt_index; - ListCell *lc; - - rt_index = 0; - foreach(lc, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - Relation rel; - TupleDesc tupdesc; - - ++rt_index; - - /* - * Only normal relations can have virtual generated columns. - */ - if (rte->rtekind != RTE_RELATION) - continue; - - rel = table_open(rte->relid, NoLock); - - tupdesc = RelationGetDescr(rel); - if (tupdesc->constr && tupdesc->constr->has_generated_virtual) - { - List *tlist = NIL; - pullup_replace_vars_context rvcontext; - - for (int i = 0; i < tupdesc->natts; i++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, i); - TargetEntry *tle; - - if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - { - Node *defexpr; - - defexpr = build_generation_expression(rel, i + 1); - ChangeVarNodes(defexpr, 1, rt_index, 0); - - tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - else - { - Var *var; - - var = makeVar(rt_index, - i + 1, - attr->atttypid, - attr->atttypmod, - attr->attcollation, - 0); - - tle = makeTargetEntry((Expr *) var, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - } - - Assert(list_length(tlist) > 0); - Assert(!rte->lateral); - - /* - * The relation's targetlist items are now in the appropriate form - * to insert into the query, except that we may need to wrap them - * in PlaceHolderVars. Set up required context data for - * pullup_replace_vars. - */ - rvcontext.root = root; - rvcontext.targetlist = tlist; - rvcontext.target_rte = rte; - rvcontext.result_relation = parse->resultRelation; - /* won't need these values */ - rvcontext.relids = NULL; - rvcontext.nullinfo = NULL; - /* pass NULL for outer_hasSubLinks */ - rvcontext.outer_hasSubLinks = NULL; - rvcontext.varno = rt_index; - /* this flag will be set below, if needed */ - rvcontext.wrap_option = REPLACE_WRAP_NONE; - /* initialize cache array with indexes 0 .. 
length(tlist) */ - rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * - sizeof(Node *)); - - /* - * If the query uses grouping sets, we need a PlaceHolderVar for - * each expression of the relation's targetlist items. (See - * comments in pull_up_simple_subquery().) - */ - if (parse->groupingSets) - rvcontext.wrap_option = REPLACE_WRAP_ALL; - - /* - * Apply pullup variable replacement throughout the query tree. - */ - parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); - } - - table_close(rel, NoLock); - } - - return parse; -} - -/* * pull_up_subqueries * Look for subqueries in the rangetable that can be pulled up into * the parent query. If the subquery has no special features like @@ -1334,6 +1391,16 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, Assert(subquery->cteList == NIL); /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + */ + subquery = subroot->parse = preprocess_relation_rtes(subroot); + + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. */ @@ -1353,13 +1420,6 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, preprocess_function_rtes(subroot); /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. - */ - subquery = subroot->parse = expand_virtual_generated_columns(subroot); - - /* * Recursively pull up the subquery's subqueries, so that * pull_up_subqueries' processing is complete for its jointree and * rangetable. diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 26a3e050086..6f0b338d2cd 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/pg_class.h" #include "catalog/pg_language.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -36,6 +37,7 @@ #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "parser/analyze.h" @@ -43,6 +45,7 @@ #include "parser/parse_collate.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "parser/parsetree.h" #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "tcop/tcopprot.h" @@ -2242,7 +2245,8 @@ rowtype_field_matches(Oid rowtypeid, int fieldnum, * only operators and functions that are reasonable to try to execute. * * NOTE: "root" can be passed as NULL if the caller never wants to do any - * Param substitutions nor receive info about inlined functions. + * Param substitutions nor receive info about inlined functions nor reduce + * NullTest for Vars to constant true or constant false. * * NOTE: the planner assumes that this will always flatten nested AND and * OR clauses into N-argument form. See comments in prepqual.c. 
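 * For example (illustrative, matching the NullTest change below): when
 * "root" is supplied and column x is declared NOT NULL and cannot be
 * nulled by any outer join, "x IS NULL" reduces to constant false and
 * "x IS NOT NULL" reduces to constant true.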
@@ -3333,6 +3337,13 @@ eval_const_expressions_mutator(Node *node, -1, coalesceexpr->coalescecollid); + /* + * If there's exactly one surviving argument, we no longer + * need COALESCE at all: the result is that argument + */ + if (list_length(newargs) == 1) + return (Node *) linitial(newargs); + newcoalesce = makeNode(CoalesceExpr); newcoalesce->coalescetype = coalesceexpr->coalescetype; newcoalesce->coalescecollid = coalesceexpr->coalescecollid; @@ -3537,6 +3548,31 @@ eval_const_expressions_mutator(Node *node, return makeBoolConst(result, false); } + if (!ntest->argisrow && arg && IsA(arg, Var) && context->root) + { + Var *varg = (Var *) arg; + bool result; + + if (var_is_nonnullable(context->root, varg, false)) + { + switch (ntest->nulltesttype) + { + case IS_NULL: + result = false; + break; + case IS_NOT_NULL: + result = true; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + result = false; /* keep compiler quiet */ + break; + } + + return makeBoolConst(result, false); + } + } newntest = makeNode(NullTest); newntest->arg = (Expr *) arg; @@ -4156,6 +4192,67 @@ simplify_function(Oid funcid, Oid result_type, int32 result_typmod, } /* + * var_is_nonnullable: check to see if the Var cannot be NULL + * + * If the Var is defined NOT NULL and meanwhile is not nulled by any outer + * joins or grouping sets, then we can know that it cannot be NULL. + * + * use_rel_info indicates whether the corresponding RelOptInfo is available for + * use. + */ +bool +var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info) +{ + Relids notnullattnums = NULL; + + Assert(IsA(var, Var)); + + /* skip upper-level Vars */ + if (var->varlevelsup != 0) + return false; + + /* could the Var be nulled by any outer joins or grouping sets? */ + if (!bms_is_empty(var->varnullingrels)) + return false; + + /* system columns cannot be NULL */ + if (var->varattno < 0) + return true; + + /* + * Check if the Var is defined as NOT NULL. We retrieve the column NOT + * NULL constraint information from the corresponding RelOptInfo if it is + * available; otherwise, we search the hash table for this information. + */ + if (use_rel_info) + { + RelOptInfo *rel = find_base_rel(root, var->varno); + + notnullattnums = rel->notnullattnums; + } + else + { + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + + /* + * We must skip inheritance parent tables, as some child tables may + * have a NOT NULL constraint for a column while others may not. This + * cannot happen with partitioned tables, though. 
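+	 * (That is, the data fetched by find_relation_notnullatts() below
+	 * describes only the parent relation, which proves nothing about every
+	 * child scanned through a traditional inheritance tree.)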
+ */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + notnullattnums = find_relation_notnullatts(root, rte->relid); + } + + if (var->varattno > 0 && + bms_is_member(var->varattno, notnullattnums)) + return true; + + return false; +} + +/* * expand_function_arguments: convert named-notation args to positional args * and/or insert default args, as needed * diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index 17e51cd75d7..30d158069e3 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -466,8 +466,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index *childRTindex_p) { Query *parse = root->parse; - Oid parentOID PG_USED_FOR_ASSERTS_ONLY = - RelationGetRelid(parentrel); + Oid parentOID = RelationGetRelid(parentrel); Oid childOID = RelationGetRelid(childrel); RangeTblEntry *childrte; Index childRTindex; @@ -514,6 +513,13 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, *childRTindex_p = childRTindex; /* + * Retrieve column not-null constraint information for the child relation + * if its relation OID is different from the parent's. + */ + if (childOID != parentOID) + get_relation_notnullatts(root, childrel); + + /* * Build an AppendRelInfo struct for each parent/child pair. */ appinfo = make_append_rel_info(parentrel, childrel, diff --git a/src/backend/optimizer/util/paramassign.c b/src/backend/optimizer/util/paramassign.c index 3bd3ce37c8f..4c13c5931b4 100644 --- a/src/backend/optimizer/util/paramassign.c +++ b/src/backend/optimizer/util/paramassign.c @@ -599,38 +599,46 @@ process_subquery_nestloop_params(PlannerInfo *root, List *subplan_params) } /* - * Identify any NestLoopParams that should be supplied by a NestLoop plan - * node with the specified lefthand rels. Remove them from the active - * root->curOuterParams list and return them as the result list. + * Identify any NestLoopParams that should be supplied by a NestLoop + * plan node with the specified lefthand rels and required-outer rels. + * Remove them from the active root->curOuterParams list and return + * them as the result list. * - * XXX Here we also hack up the returned Vars and PHVs so that they do not - * contain nullingrel sets exceeding what is available from the outer side. - * This is needed if we have applied outer join identity 3, - * (A leftjoin B on (Pab)) leftjoin C on (Pb*c) - * = A leftjoin (B leftjoin C on (Pbc)) on (Pab) - * and C contains lateral references to B. It's still safe to apply the - * identity, but the parser will have created those references in the form - * "b*" (i.e., with varnullingrels listing the A/B join), while what we will - * have available from the nestloop's outer side is just "b". We deal with - * that here by stripping the nullingrels down to what is available from the - * outer side according to leftrelids. - * - * That fixes matters for the case of forward application of identity 3. - * If the identity was applied in the reverse direction, we will have - * parameter Vars containing too few nullingrel bits rather than too many. - * Currently, that causes no problems because setrefs.c applies only a - * subset check to nullingrels in NestLoopParams, but we'd have to work - * harder if we ever want to tighten that check. 
This is all pretty annoying - * because it greatly weakens setrefs.c's cross-check, but the alternative + * Vars and PHVs appearing in the result list must have nullingrel sets + * that could validly appear in the lefthand rel's output. Ordinarily that + * would be true already, but if we have applied outer join identity 3, + * there could be more or fewer nullingrel bits in the nodes appearing in + * curOuterParams than are in the nominal leftrelids. We deal with that by + * forcing their nullingrel sets to include exactly the outer-join relids + * that appear in leftrelids and can null the respective Var or PHV. + * This fix is a bit ad-hoc and intellectually unsatisfactory, because it's + * essentially jumping to the conclusion that we've placed evaluation of + * the nestloop parameters correctly, and thus it defeats the intent of the + * subsequent nullingrel cross-checks in setrefs.c. But the alternative * seems to be to generate multiple versions of each laterally-parameterized * subquery, which'd be unduly expensive. */ List * -identify_current_nestloop_params(PlannerInfo *root, Relids leftrelids) +identify_current_nestloop_params(PlannerInfo *root, + Relids leftrelids, + Relids outerrelids) { List *result; + Relids allleftrelids; ListCell *cell; + /* + * We'll be able to evaluate a PHV in the lefthand path if it uses the + * lefthand rels plus any available required-outer rels. But don't do so + * if it uses *only* required-outer rels; in that case it should be + * evaluated higher in the tree. For Vars, no such hair-splitting is + * necessary since they depend on only one relid. + */ + if (outerrelids) + allleftrelids = bms_union(leftrelids, outerrelids); + else + allleftrelids = leftrelids; + result = NIL; foreach(cell, root->curOuterParams) { @@ -646,25 +654,60 @@ identify_current_nestloop_params(PlannerInfo *root, Relids leftrelids) bms_is_member(nlp->paramval->varno, leftrelids)) { Var *var = (Var *) nlp->paramval; + RelOptInfo *rel = root->simple_rel_array[var->varno]; root->curOuterParams = foreach_delete_current(root->curOuterParams, cell); - var->varnullingrels = bms_intersect(var->varnullingrels, + var->varnullingrels = bms_intersect(rel->nulling_relids, leftrelids); result = lappend(result, nlp); } - else if (IsA(nlp->paramval, PlaceHolderVar) && - bms_is_subset(find_placeholder_info(root, - (PlaceHolderVar *) nlp->paramval)->ph_eval_at, - leftrelids)) + else if (IsA(nlp->paramval, PlaceHolderVar)) { PlaceHolderVar *phv = (PlaceHolderVar *) nlp->paramval; + PlaceHolderInfo *phinfo = find_placeholder_info(root, phv); + Relids eval_at = phinfo->ph_eval_at; - root->curOuterParams = foreach_delete_current(root->curOuterParams, - cell); - phv->phnullingrels = bms_intersect(phv->phnullingrels, - leftrelids); - result = lappend(result, nlp); + if (bms_is_subset(eval_at, allleftrelids) && + bms_overlap(eval_at, leftrelids)) + { + root->curOuterParams = foreach_delete_current(root->curOuterParams, + cell); + + /* + * Deal with an edge case: if the PHV was pulled up out of a + * subquery and it contains a subquery that was originally + * pushed down from this query level, then that will still be + * represented as a SubLink, because SS_process_sublinks won't + * recurse into outer PHVs, so it didn't get transformed + * during expression preprocessing in the subquery. We need a + * version of the PHV that has a SubPlan, which we can get + * from the current query level's placeholder_list. 
This is + * quite grotty of course, but dealing with it earlier in the + * handling of subplan params would be just as grotty, and it + * might end up being a waste of cycles if we don't decide to + * treat the PHV as a NestLoopParam. (Perhaps that whole + * mechanism should be redesigned someday, but today is not + * that day.) + */ + if (root->parse->hasSubLinks) + { + phv = copyObject(phinfo->ph_var); + + /* + * The ph_var will have empty nullingrels, but that + * doesn't matter since we're about to overwrite + * phv->phnullingrels. Other fields should be OK already. + */ + nlp->paramval = (Var *) phv; + } + + phv->phnullingrels = + bms_intersect(get_placeholder_nulling_relids(root, phinfo), + leftrelids); + + result = lappend(result, nlp); + } } } return result; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e0192d4a491..a4c5867cdcb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,12 +1404,12 @@ create_append_path(PlannerInfo *root, pathnode->path.total_cost = child->total_cost; } else - cost_append(pathnode); + cost_append(pathnode, root); /* Must do this last, else cost_append complains */ pathnode->path.pathkeys = child->pathkeys; } else - cost_append(pathnode); + cost_append(pathnode, root); /* If the caller provided a row estimate, override the computed value. */ if (rows >= 0) @@ -1515,6 +1515,9 @@ create_merge_append_path(PlannerInfo *root, foreach(l, subpaths) { Path *subpath = (Path *) lfirst(l); + int presorted_keys; + Path sort_path; /* dummy for result of + * cost_sort/cost_incremental_sort */ /* All child paths should be unparameterized */ Assert(bms_is_empty(PATH_REQ_OUTER(subpath))); @@ -1523,32 +1526,52 @@ create_merge_append_path(PlannerInfo *root, pathnode->path.parallel_safe = pathnode->path.parallel_safe && subpath->parallel_safe; - if (pathkeys_contained_in(pathkeys, subpath->pathkeys)) + if (!pathkeys_count_contained_in(pathkeys, subpath->pathkeys, + &presorted_keys)) { - /* Subpath is adequately ordered, we won't need to sort it */ - input_disabled_nodes += subpath->disabled_nodes; - input_startup_cost += subpath->startup_cost; - input_total_cost += subpath->total_cost; - } - else - { - /* We'll need to insert a Sort node, so include cost for that */ - Path sort_path; /* dummy for result of cost_sort */ + /* + * We'll need to insert a Sort node, so include costs for that. We + * choose to use incremental sort if it is enabled and there are + * presorted keys; otherwise we use full sort. + * + * We can use the parent's LIMIT if any, since we certainly won't + * pull more than that many tuples from any child. 
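+		 * (A note on mechanics: sort_path is a throwaway stack variable;
+		 * only its disabled_nodes, startup_cost and total_cost are read
+		 * back into the input_* running totals below, and it never becomes
+		 * part of any plan.)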
+ */ + if (enable_incremental_sort && presorted_keys > 0) + { + cost_incremental_sort(&sort_path, + root, + pathkeys, + presorted_keys, + subpath->disabled_nodes, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } + else + { + cost_sort(&sort_path, + root, + pathkeys, + subpath->disabled_nodes, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, + work_mem, + pathnode->limit_tuples); + } - cost_sort(&sort_path, - root, - pathkeys, - subpath->disabled_nodes, - subpath->total_cost, - subpath->rows, - subpath->pathtarget->width, - 0.0, - work_mem, - pathnode->limit_tuples); - input_disabled_nodes += sort_path.disabled_nodes; - input_startup_cost += sort_path.startup_cost; - input_total_cost += sort_path.total_cost; + subpath = &sort_path; } + + input_disabled_nodes += subpath->disabled_nodes; + input_startup_cost += subpath->startup_cost; + input_total_cost += subpath->total_cost; } /* @@ -1666,7 +1689,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) MemoizePath * create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, List *param_exprs, List *hash_operators, - bool singlerow, bool binary_mode, double calls) + bool singlerow, bool binary_mode, Cardinality est_calls) { MemoizePath *pathnode = makeNode(MemoizePath); @@ -1687,7 +1710,6 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->param_exprs = param_exprs; pathnode->singlerow = singlerow; pathnode->binary_mode = binary_mode; - pathnode->calls = clamp_row_est(calls); /* * For now we set est_entries to 0. cost_memoize_rescan() does all the @@ -1697,6 +1719,12 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ pathnode->est_entries = 0; + pathnode->est_calls = clamp_row_est(est_calls); + + /* These will also be set later in cost_memoize_rescan() */ + pathnode->est_unique_keys = 0.0; + pathnode->est_hit_ratio = 0.0; + /* we should not generate this path type when enable_memoize=false */ Assert(enable_memoize); pathnode->path.disabled_nodes = subpath->disabled_nodes; @@ -4236,7 +4264,7 @@ reparameterize_path(PlannerInfo *root, Path *path, mpath->hash_operators, mpath->singlerow, mpath->binary_mode, - mpath->calls); + mpath->est_calls); } default: break; diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 41a4c81e94a..e1cd00a72fb 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -545,3 +545,43 @@ contain_placeholder_references_walker(Node *node, return expression_tree_walker(node, contain_placeholder_references_walker, context); } + +/* + * Compute the set of outer-join relids that can null a placeholder. + * + * This is analogous to RelOptInfo.nulling_relids for Vars, but we compute it + * on-the-fly rather than saving it somewhere. Currently the value is needed + * at most once per query, so there's little value in doing otherwise. If it + * ever gains more widespread use, perhaps we should cache the result in + * PlaceHolderInfo. + */ +Relids +get_placeholder_nulling_relids(PlannerInfo *root, PlaceHolderInfo *phinfo) +{ + Relids result = NULL; + int relid = -1; + + /* + * Form the union of all potential nulling OJs for each baserel included + * in ph_eval_at. 
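The union computed in the loop below uses the standard Bitmapset iteration idiom. A minimal, hedged sketch of the idiom on its own (bms_next_member and bms_add_members are the real API; bms_next_member returns -2 once the set is exhausted, so testing >= 0 terminates the loop; the per-relid array is invented):

#include "postgres.h"
#include "nodes/bitmapset.h"

/* Union per-relid sets for every member of 'ids' (hypothetical helper) */
static Bitmapset *
union_member_sets(Bitmapset *ids, Bitmapset **sets_by_relid)
{
    Bitmapset  *result = NULL;
    int         relid = -1;

    while ((relid = bms_next_member(ids, relid)) >= 0)
        result = bms_add_members(result, sets_by_relid[relid]);

    return result;
}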
+ */ + while ((relid = bms_next_member(phinfo->ph_eval_at, relid)) > 0) + { + RelOptInfo *rel = root->simple_rel_array[relid]; + + /* ignore the RTE_GROUP RTE */ + if (relid == root->group_rtindex) + continue; + + if (rel == NULL) /* must be an outer join */ + { + Assert(bms_is_member(relid, root->outer_join_rels)); + continue; + } + result = bms_add_members(result, rel->nulling_relids); + } + + /* Now remove any OJs already included in ph_eval_at, and we're done. */ + result = bms_del_members(result, phinfo->ph_eval_at); + return result; +} diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 59233b64730..c6a58afc5e5 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -59,6 +59,12 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +typedef struct NotnullHashEntry +{ + Oid relid; /* OID of the relation */ + Relids notnullattnums; /* attnums of NOT NULL columns */ +} NotnullHashEntry; + static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, Relation relation, bool inhparent); @@ -172,27 +178,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * RangeTblEntry does get populated. */ if (!inhparent || relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - for (int i = 0; i < relation->rd_att->natts; i++) - { - CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); - - Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); - - if (attr->attnullability == ATTNULLABLE_VALID) - { - rel->notnullattnums = bms_add_member(rel->notnullattnums, - i + 1); - - /* - * Per RemoveAttributeById(), dropped columns will have their - * attnotnull unset, so we needn't check for dropped columns - * in the above condition. - */ - Assert(!attr->attisdropped); - } - } - } + rel->notnullattnums = find_relation_notnullatts(root, relationObjectId); /* * Estimate relation size --- unless it's an inheritance parent, in which @@ -684,6 +670,105 @@ get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, } /* + * get_relation_notnullatts - + * Retrieves column not-null constraint information for a given relation. + * + * We do this while we have the relcache entry open, and store the column + * not-null constraint information in a hash table based on the relation OID. + */ +void +get_relation_notnullatts(PlannerInfo *root, Relation relation) +{ + Oid relid = RelationGetRelid(relation); + NotnullHashEntry *hentry; + bool found; + Relids notnullattnums = NULL; + + /* bail out if the relation has no not-null constraints */ + if (relation->rd_att->constr == NULL || + !relation->rd_att->constr->has_not_null) + return; + + /* create the hash table if it hasn't been created yet */ + if (root->glob->rel_notnullatts_hash == NULL) + { + HTAB *hashtab; + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(NotnullHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + hashtab = hash_create("Relation NOT NULL attnums", + 64L, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + root->glob->rel_notnullatts_hash = hashtab; + } + + /* + * Create a hash entry for this relation OID, if we don't have one + * already. 
+ */ + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_ENTER, + &found); + + /* bail out if a hash entry already exists for this relation OID */ + if (found) + return; + + /* collect the column not-null constraint information for this relation */ + for (int i = 0; i < relation->rd_att->natts; i++) + { + CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); + + Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); + + if (attr->attnullability == ATTNULLABLE_VALID) + { + notnullattnums = bms_add_member(notnullattnums, i + 1); + + /* + * Per RemoveAttributeById(), dropped columns will have their + * attnotnull unset, so we needn't check for dropped columns in + * the above condition. + */ + Assert(!attr->attisdropped); + } + } + + /* ... and initialize the new hash entry */ + hentry->notnullattnums = notnullattnums; +} + +/* + * find_relation_notnullatts - + * Searches the hash table and returns the column not-null constraint + * information for a given relation. + */ +Relids +find_relation_notnullatts(PlannerInfo *root, Oid relid) +{ + NotnullHashEntry *hentry; + bool found; + + if (root->glob->rel_notnullatts_hash == NULL) + return NULL; + + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_FIND, + &found); + if (!found) + return NULL; + + return hentry->notnullattnums; +} + +/* * infer_arbiter_indexes - * Determine the unique indexes used to arbitrate speculative insertion. * diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 1f4d6adda52..34f7c17f576 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -239,107 +239,23 @@ parse_sub_analyze(Node *parseTree, ParseState *parentParseState, } /* - * setQueryLocationAndLength - * Set query's location and length from statement and ParseState - * - * Some statements, like PreparableStmt, can be located within parentheses. - * For example "(SELECT 1)" or "COPY (UPDATE ...) to x;". For those, we - * cannot use the whole string from the statement's location or the SQL - * string would yield incorrectly. The parser will set stmt_len, reflecting - * the size of the statement within the parentheses. Thus, when stmt_len is - * available, we need to use it for the Query's stmt_len. - * - * For other cases, the parser can't provide the length of individual - * statements. However, we have the statement's location plus the length - * (p_stmt_len) and location (p_stmt_location) of the top level RawStmt, - * stored in pstate. Thus, the statement's length is the RawStmt's length - * minus how much we've advanced in the RawStmt's string. - */ -static void -setQueryLocationAndLength(ParseState *pstate, Query *qry, Node *parseTree) -{ - ParseLoc stmt_len = 0; - - /* - * If there is no information about the top RawStmt's length, leave it at - * 0 to use the whole string. 
- */ - if (pstate->p_stmt_len == 0) - return; - - switch (nodeTag(parseTree)) - { - case T_InsertStmt: - qry->stmt_location = ((InsertStmt *) parseTree)->stmt_location; - stmt_len = ((InsertStmt *) parseTree)->stmt_len; - break; - - case T_DeleteStmt: - qry->stmt_location = ((DeleteStmt *) parseTree)->stmt_location; - stmt_len = ((DeleteStmt *) parseTree)->stmt_len; - break; - - case T_UpdateStmt: - qry->stmt_location = ((UpdateStmt *) parseTree)->stmt_location; - stmt_len = ((UpdateStmt *) parseTree)->stmt_len; - break; - - case T_MergeStmt: - qry->stmt_location = ((MergeStmt *) parseTree)->stmt_location; - stmt_len = ((MergeStmt *) parseTree)->stmt_len; - break; - - case T_SelectStmt: - qry->stmt_location = ((SelectStmt *) parseTree)->stmt_location; - stmt_len = ((SelectStmt *) parseTree)->stmt_len; - break; - - case T_PLAssignStmt: - qry->stmt_location = ((PLAssignStmt *) parseTree)->location; - break; - - default: - qry->stmt_location = pstate->p_stmt_location; - break; - } - - if (stmt_len > 0) - { - /* Statement's length is known, use it */ - qry->stmt_len = stmt_len; - } - else - { - /* - * Compute the statement's length from the statement's location and - * the RawStmt's length and location. - */ - qry->stmt_len = pstate->p_stmt_len - (qry->stmt_location - pstate->p_stmt_location); - } - - /* The calculated statement length should be calculated as positive. */ - Assert(qry->stmt_len >= 0); -} - -/* * transformTopLevelStmt - * transform a Parse tree into a Query tree. * - * This function is just responsible for storing location data - * from the RawStmt into the ParseState. + * This function is just responsible for transferring statement location data + * from the RawStmt into the finished Query. */ Query * transformTopLevelStmt(ParseState *pstate, RawStmt *parseTree) { Query *result; - /* Store RawStmt's length and location in pstate */ - pstate->p_stmt_len = parseTree->stmt_len; - pstate->p_stmt_location = parseTree->stmt_location; - /* We're at top level, so allow SELECT INTO */ result = transformOptionalSelectInto(pstate, parseTree->stmt); + result->stmt_location = parseTree->stmt_location; + result->stmt_len = parseTree->stmt_len; + return result; } @@ -508,7 +424,6 @@ transformStmt(ParseState *pstate, Node *parseTree) /* Mark as original query until we learn differently */ result->querySource = QSRC_ORIGINAL; result->canSetTag = true; - setQueryLocationAndLength(pstate, result, parseTree); return result; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 0b5652071d1..db43034b9db 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -154,7 +154,6 @@ static void base_yyerror(YYLTYPE *yylloc, core_yyscan_t yyscanner, const char *msg); static RawStmt *makeRawStmt(Node *stmt, int stmt_location); static void updateRawStmtEnd(RawStmt *rs, int end_location); -static void updatePreparableStmtEnd(Node *n, int end_location); static Node *makeColumnRef(char *colname, List *indirection, int location, core_yyscan_t yyscanner); static Node *makeTypeCast(Node *arg, TypeName *typename, int location); @@ -178,13 +177,13 @@ static void insertSelectOptions(SelectStmt *stmt, SelectLimit *limitClause, WithClause *withClause, core_yyscan_t yyscanner); -static Node *makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location); +static Node *makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg); static Node *doNegate(Node *n, int location); static void doNegateFloat(Float *v); static Node *makeAndExpr(Node *lexpr, Node *rexpr, int 
location); static Node *makeOrExpr(Node *lexpr, Node *rexpr, int location); static Node *makeNotExpr(Node *expr, int location); -static Node *makeAArrayExpr(List *elements, int location); +static Node *makeAArrayExpr(List *elements, int location, int end_location); static Node *makeSQLValueFunction(SQLValueFunctionOp op, int32 typmod, int location); static Node *makeXmlExpr(XmlExprOp op, char *name, List *named_args, @@ -319,6 +318,11 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <list> opt_qualified_name %type <boolean> opt_concurrently %type <dbehavior> opt_drop_behavior +%type <list> opt_utility_option_list +%type <list> utility_option_list +%type <defelt> utility_option_elem +%type <str> utility_option_name +%type <node> utility_option_arg %type <node> alter_column_default opclass_item opclass_drop alter_using %type <ival> add_drop opt_asc_desc opt_nulls_order @@ -339,10 +343,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); create_extension_opt_item alter_extension_opt_item %type <ival> opt_lock lock_type cast_context -%type <str> utility_option_name -%type <defelt> utility_option_elem -%type <list> utility_option_list -%type <node> utility_option_arg %type <defelt> drop_option %type <boolean> opt_or_replace opt_no opt_grant_grant_option @@ -523,7 +523,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <defelt> def_elem reloption_elem old_aggr_elem operator_def_elem %type <node> def_arg columnElem where_clause where_or_current_clause a_expr b_expr c_expr AexprConst indirection_el opt_slice_bound - columnref in_expr having_clause func_table xmltable array_expr + columnref having_clause func_table xmltable array_expr OptWhereClause operator_def_arg %type <list> opt_column_and_period_list %type <list> rowsfrom_item rowsfrom_list opt_col_def_list @@ -557,7 +557,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <list> generic_option_list alter_generic_option_list %type <ival> reindex_target_relation reindex_target_all -%type <list> opt_reindex_option_list %type <node> copy_generic_opt_arg copy_generic_opt_arg_list_item %type <defelt> copy_generic_opt_elem @@ -1142,6 +1141,41 @@ opt_drop_behavior: | /* EMPTY */ { $$ = DROP_RESTRICT; /* default */ } ; +opt_utility_option_list: + '(' utility_option_list ')' { $$ = $2; } + | /* EMPTY */ { $$ = NULL; } + ; + +utility_option_list: + utility_option_elem + { + $$ = list_make1($1); + } + | utility_option_list ',' utility_option_elem + { + $$ = lappend($1, $3); + } + ; + +utility_option_elem: + utility_option_name utility_option_arg + { + $$ = makeDefElem($1, $2, @1); + } + ; + +utility_option_name: + NonReservedWord { $$ = $1; } + | analyze_keyword { $$ = "analyze"; } + | FORMAT_LA { $$ = "format"; } + ; + +utility_option_arg: + opt_boolean_or_string { $$ = (Node *) makeString($1); } + | NumericOnly { $$ = (Node *) $1; } + | /* EMPTY */ { $$ = NULL; } + ; + /***************************************************************************** * * CALL statement @@ -2029,11 +2063,12 @@ constraints_set_mode: * Checkpoint statement */ CheckPointStmt: - CHECKPOINT + CHECKPOINT opt_utility_option_list { CheckPointStmt *n = makeNode(CheckPointStmt); $$ = (Node *) n; + n->options = $2; } ; @@ -2669,6 +2704,12 @@ alter_table_cmd: c->alterDeferrability = true; if ($4 & CAS_NO_INHERIT) c->alterInheritability = true; + /* handle unsupported case with specific error message */ + if ($4 & CAS_NOT_VALID) + 
ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraints cannot be altered to be NOT VALID"), + parser_errposition(@4)); processCASbits($4, @4, "FOREIGN KEY", &c->deferrable, &c->initdeferred, @@ -3417,7 +3458,6 @@ CopyStmt: COPY opt_binary qualified_name opt_column_list { CopyStmt *n = makeNode(CopyStmt); - updatePreparableStmtEnd($3, @4); n->relation = NULL; n->query = $3; n->attlist = NIL; @@ -6037,6 +6077,26 @@ CreateTrigStmt: EXECUTE FUNCTION_or_PROCEDURE func_name '(' TriggerFuncArgs ')' { CreateTrigStmt *n = makeNode(CreateTrigStmt); + bool dummy; + + if (($11 & CAS_NOT_VALID) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NOT VALID"), + parser_errposition(@11)); + if (($11 & CAS_NO_INHERIT) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NO INHERIT"), + parser_errposition(@11)); + if (($11 & CAS_NOT_ENFORCED) != 0) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraint triggers cannot be marked %s", + "NOT ENFORCED"), + parser_errposition(@11)); n->replace = $2; if (n->replace) /* not supported, see CreateTrigger */ @@ -6056,7 +6116,7 @@ CreateTrigStmt: n->whenClause = $15; n->transitionRels = NIL; processCASbits($11, @11, "TRIGGER", - &n->deferrable, &n->initdeferred, NULL, + &n->deferrable, &n->initdeferred, &dummy, NULL, NULL, yyscanner); n->constrrel = $10; $$ = (Node *) n; @@ -7479,6 +7539,8 @@ fetch_args: cursor_name n->portalname = $1; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NONE; $$ = (Node *) n; } | from_in cursor_name @@ -7488,6 +7550,19 @@ fetch_args: cursor_name n->portalname = $2; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NONE; + $$ = (Node *) n; + } + | SignedIconst opt_from_in cursor_name + { + FetchStmt *n = makeNode(FetchStmt); + + n->portalname = $3; + n->direction = FETCH_FORWARD; + n->howMany = $1; + n->location = @1; + n->direction_keyword = FETCH_KEYWORD_NONE; $$ = (Node *) n; } | NEXT opt_from_in cursor_name @@ -7497,6 +7572,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_NEXT; $$ = (Node *) n; } | PRIOR opt_from_in cursor_name @@ -7506,6 +7583,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_BACKWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_PRIOR; $$ = (Node *) n; } | FIRST_P opt_from_in cursor_name @@ -7515,6 +7594,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_ABSOLUTE; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FIRST; $$ = (Node *) n; } | LAST_P opt_from_in cursor_name @@ -7524,6 +7605,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_ABSOLUTE; n->howMany = -1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_LAST; $$ = (Node *) n; } | ABSOLUTE_P SignedIconst opt_from_in cursor_name @@ -7533,6 +7616,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_ABSOLUTE; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_ABSOLUTE; $$ = (Node *) n; } | RELATIVE_P SignedIconst opt_from_in cursor_name @@ -7542,15 +7627,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_RELATIVE; n->howMany = $2; - $$ = (Node *) n; - } - | SignedIconst opt_from_in 
cursor_name - { - FetchStmt *n = makeNode(FetchStmt); - - n->portalname = $3; - n->direction = FETCH_FORWARD; - n->howMany = $1; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_RELATIVE; $$ = (Node *) n; } | ALL opt_from_in cursor_name @@ -7560,6 +7638,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_ALL; $$ = (Node *) n; } | FORWARD opt_from_in cursor_name @@ -7569,6 +7649,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_FORWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FORWARD; $$ = (Node *) n; } | FORWARD SignedIconst opt_from_in cursor_name @@ -7578,6 +7660,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_FORWARD; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_FORWARD; $$ = (Node *) n; } | FORWARD ALL opt_from_in cursor_name @@ -7587,6 +7671,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_FORWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_FORWARD_ALL; $$ = (Node *) n; } | BACKWARD opt_from_in cursor_name @@ -7596,6 +7682,8 @@ fetch_args: cursor_name n->portalname = $3; n->direction = FETCH_BACKWARD; n->howMany = 1; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_BACKWARD; $$ = (Node *) n; } | BACKWARD SignedIconst opt_from_in cursor_name @@ -7605,6 +7693,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_BACKWARD; n->howMany = $2; + n->location = @2; + n->direction_keyword = FETCH_KEYWORD_BACKWARD; $$ = (Node *) n; } | BACKWARD ALL opt_from_in cursor_name @@ -7614,6 +7704,8 @@ fetch_args: cursor_name n->portalname = $4; n->direction = FETCH_BACKWARD; n->howMany = FETCH_ALL; + n->location = -1; + n->direction_keyword = FETCH_KEYWORD_BACKWARD_ALL; $$ = (Node *) n; } ; @@ -9291,7 +9383,7 @@ DropTransformStmt: DROP TRANSFORM opt_if_exists FOR Typename LANGUAGE name opt_d *****************************************************************************/ ReindexStmt: - REINDEX opt_reindex_option_list reindex_target_relation opt_concurrently qualified_name + REINDEX opt_utility_option_list reindex_target_relation opt_concurrently qualified_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9304,7 +9396,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list SCHEMA opt_concurrently name + | REINDEX opt_utility_option_list SCHEMA opt_concurrently name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9317,7 +9409,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list reindex_target_all opt_concurrently opt_single_name + | REINDEX opt_utility_option_list reindex_target_all opt_concurrently opt_single_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9339,10 +9431,6 @@ reindex_target_all: SYSTEM_P { $$ = REINDEX_OBJECT_SYSTEM; } | DATABASE { $$ = REINDEX_OBJECT_DATABASE; } ; -opt_reindex_option_list: - '(' utility_option_list ')' { $$ = $2; } - | /* EMPTY */ { $$ = NULL; } - ; /***************************************************************************** * @@ -11629,7 +11717,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'T'; + n->subtype = AD_AlterDefault; n->typeName = $3; n->def = $4; $$ = (Node *) n; @@ -11639,7 +11727,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'N'; + n->subtype = 
AD_DropNotNull; n->typeName = $3; $$ = (Node *) n; } @@ -11648,7 +11736,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'O'; + n->subtype = AD_SetNotNull; n->typeName = $3; $$ = (Node *) n; } @@ -11657,7 +11745,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'C'; + n->subtype = AD_AddConstraint; n->typeName = $3; n->def = $5; $$ = (Node *) n; @@ -11667,7 +11755,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'X'; + n->subtype = AD_DropConstraint; n->typeName = $3; n->name = $6; n->behavior = $7; @@ -11679,7 +11767,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'X'; + n->subtype = AD_DropConstraint; n->typeName = $3; n->name = $8; n->behavior = $9; @@ -11691,7 +11779,7 @@ AlterDomainStmt: { AlterDomainStmt *n = makeNode(AlterDomainStmt); - n->subtype = 'V'; + n->subtype = AD_ValidateConstraint; n->typeName = $3; n->name = $6; $$ = (Node *) n; @@ -11840,13 +11928,13 @@ ClusterStmt: n->params = $3; $$ = (Node *) n; } - | CLUSTER '(' utility_option_list ')' + | CLUSTER opt_utility_option_list { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = $3; + n->params = $2; $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-14 compatibility */ @@ -11856,21 +11944,18 @@ ClusterStmt: n->relation = $3; n->indexname = $4; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-17 compatibility */ - | CLUSTER opt_verbose + | CLUSTER VERBOSE { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = NIL; - if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* kept for pre-8.3 compatibility */ @@ -11880,9 +11965,8 @@ ClusterStmt: n->relation = $5; n->indexname = $3; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } ; @@ -11933,64 +12017,31 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose opt_analyze opt_vacuum_relati } ; -AnalyzeStmt: analyze_keyword opt_verbose opt_vacuum_relation_list +AnalyzeStmt: analyze_keyword opt_utility_option_list opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = NIL; - if ($2) - n->options = lappend(n->options, - makeDefElem("verbose", NULL, @2)); + n->options = $2; n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } - | analyze_keyword '(' utility_option_list ')' opt_vacuum_relation_list + | analyze_keyword VERBOSE opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = $3; - n->rels = $5; + n->options = list_make1(makeDefElem("verbose", NULL, @2)); + n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } ; -utility_option_list: - utility_option_elem - { - $$ = list_make1($1); - } - | utility_option_list ',' utility_option_elem - { - $$ = lappend($1, $3); - } - ; - analyze_keyword: ANALYZE | ANALYSE /* British */ ; -utility_option_elem: - utility_option_name utility_option_arg - { - $$ = makeDefElem($1, $2, @1); - } - ; - -utility_option_name: - NonReservedWord { $$ = $1; } - | analyze_keyword { $$ = "analyze"; } - | FORMAT_LA { $$ = "format"; } - ; - -utility_option_arg: - 
opt_boolean_or_string { $$ = (Node *) makeString($1); } - | NumericOnly { $$ = (Node *) $1; } - | /* EMPTY */ { $$ = NULL; } - ; - opt_analyze: analyze_keyword { $$ = true; } | /*EMPTY*/ { $$ = false; } @@ -12240,7 +12291,6 @@ InsertStmt: $5->onConflictClause = $6; $5->returningClause = $7; $5->withClause = $1; - $5->stmt_location = @$; $$ = (Node *) $5; } ; @@ -12431,7 +12481,6 @@ DeleteStmt: opt_with_clause DELETE_P FROM relation_expr_opt_alias n->whereClause = $6; n->returningClause = $7; n->withClause = $1; - n->stmt_location = @$; $$ = (Node *) n; } ; @@ -12506,7 +12555,6 @@ UpdateStmt: opt_with_clause UPDATE relation_expr_opt_alias n->whereClause = $7; n->returningClause = $8; n->withClause = $1; - n->stmt_location = @$; $$ = (Node *) n; } ; @@ -12584,7 +12632,6 @@ MergeStmt: m->joinCondition = $8; m->mergeWhenClauses = $9; m->returningClause = $10; - m->stmt_location = @$; $$ = (Node *) m; } @@ -12825,20 +12872,7 @@ SelectStmt: select_no_parens %prec UMINUS ; select_with_parens: - '(' select_no_parens ')' - { - SelectStmt *n = (SelectStmt *) $2; - - /* - * As SelectStmt's location starts at the SELECT keyword, - * we need to track the length of the SelectStmt within - * parentheses to be able to extract the relevant part - * of the query. Without this, the RawStmt's length would - * be used and would include the closing parenthesis. - */ - n->stmt_len = @3 - @2; - $$ = $2; - } + '(' select_no_parens ')' { $$ = $2; } | '(' select_with_parens ')' { $$ = $2; } ; @@ -12960,7 +12994,6 @@ simple_select: n->groupDistinct = ($7)->distinct; n->havingClause = $8; n->windowClause = $9; - n->stmt_location = @1; $$ = (Node *) n; } | SELECT distinct_clause target_list @@ -12978,7 +13011,6 @@ simple_select: n->groupDistinct = ($7)->distinct; n->havingClause = $8; n->windowClause = $9; - n->stmt_location = @1; $$ = (Node *) n; } | values_clause { $$ = $1; } @@ -12999,20 +13031,19 @@ simple_select: n->targetList = list_make1(rt); n->fromClause = list_make1($2); - n->stmt_location = @1; $$ = (Node *) n; } | select_clause UNION set_quantifier select_clause { - $$ = makeSetOp(SETOP_UNION, $3 == SET_QUANTIFIER_ALL, $1, $4, @1); + $$ = makeSetOp(SETOP_UNION, $3 == SET_QUANTIFIER_ALL, $1, $4); } | select_clause INTERSECT set_quantifier select_clause { - $$ = makeSetOp(SETOP_INTERSECT, $3 == SET_QUANTIFIER_ALL, $1, $4, @1); + $$ = makeSetOp(SETOP_INTERSECT, $3 == SET_QUANTIFIER_ALL, $1, $4); } | select_clause EXCEPT set_quantifier select_clause { - $$ = makeSetOp(SETOP_EXCEPT, $3 == SET_QUANTIFIER_ALL, $1, $4, @1); + $$ = makeSetOp(SETOP_EXCEPT, $3 == SET_QUANTIFIER_ALL, $1, $4); } ; @@ -13590,7 +13621,6 @@ values_clause: { SelectStmt *n = makeNode(SelectStmt); - n->stmt_location = @1; n->valuesLists = list_make1($3); $$ = (Node *) n; } @@ -15287,49 +15317,50 @@ a_expr: c_expr { $$ = $1; } (Node *) list_make2($5, $7), @2); } - | a_expr IN_P in_expr + | a_expr IN_P select_with_parens { - /* in_expr returns a SubLink or a list of a_exprs */ - if (IsA($3, SubLink)) - { - /* generate foo = ANY (subquery) */ - SubLink *n = (SubLink *) $3; + /* generate foo = ANY (subquery) */ + SubLink *n = makeNode(SubLink); - n->subLinkType = ANY_SUBLINK; - n->subLinkId = 0; - n->testexpr = $1; - n->operName = NIL; /* show it's IN not = ANY */ - n->location = @2; - $$ = (Node *) n; - } - else - { - /* generate scalar IN expression */ - $$ = (Node *) makeSimpleA_Expr(AEXPR_IN, "=", $1, $3, @2); - } + n->subselect = $3; + n->subLinkType = ANY_SUBLINK; + n->subLinkId = 0; + n->testexpr = $1; + n->operName = NIL; /* show it's IN 
not = ANY */ + n->location = @2; + $$ = (Node *) n; } - | a_expr NOT_LA IN_P in_expr %prec NOT_LA + | a_expr IN_P '(' expr_list ')' { - /* in_expr returns a SubLink or a list of a_exprs */ - if (IsA($4, SubLink)) - { - /* generate NOT (foo = ANY (subquery)) */ - /* Make an = ANY node */ - SubLink *n = (SubLink *) $4; - - n->subLinkType = ANY_SUBLINK; - n->subLinkId = 0; - n->testexpr = $1; - n->operName = NIL; /* show it's IN not = ANY */ - n->location = @2; - /* Stick a NOT on top; must have same parse location */ - $$ = makeNotExpr((Node *) n, @2); - } - else - { - /* generate scalar NOT IN expression */ - $$ = (Node *) makeSimpleA_Expr(AEXPR_IN, "<>", $1, $4, @2); - } + /* generate scalar IN expression */ + A_Expr *n = makeSimpleA_Expr(AEXPR_IN, "=", $1, (Node *) $4, @2); + + n->rexpr_list_start = @3; + n->rexpr_list_end = @5; + $$ = (Node *) n; + } + | a_expr NOT_LA IN_P select_with_parens %prec NOT_LA + { + /* generate NOT (foo = ANY (subquery)) */ + SubLink *n = makeNode(SubLink); + + n->subselect = $4; + n->subLinkType = ANY_SUBLINK; + n->subLinkId = 0; + n->testexpr = $1; + n->operName = NIL; /* show it's IN not = ANY */ + n->location = @2; + /* Stick a NOT on top; must have same parse location */ + $$ = makeNotExpr((Node *) n, @2); + } + | a_expr NOT_LA IN_P '(' expr_list ')' + { + /* generate scalar NOT IN expression */ + A_Expr *n = makeSimpleA_Expr(AEXPR_IN, "<>", $1, (Node *) $5, @2); + + n->rexpr_list_start = @4; + n->rexpr_list_end = @6; + $$ = (Node *) n; } | a_expr subquery_Op sub_type select_with_parens %prec Op { @@ -16764,15 +16795,15 @@ type_list: Typename { $$ = list_make1($1); } array_expr: '[' expr_list ']' { - $$ = makeAArrayExpr($2, @1); + $$ = makeAArrayExpr($2, @1, @3); } | '[' array_expr_list ']' { - $$ = makeAArrayExpr($2, @1); + $$ = makeAArrayExpr($2, @1, @3); } | '[' ']' { - $$ = makeAArrayExpr(NIL, @1); + $$ = makeAArrayExpr(NIL, @1, @2); } ; @@ -16894,17 +16925,6 @@ trim_list: a_expr FROM expr_list { $$ = lappend($3, $1); } | expr_list { $$ = $1; } ; -in_expr: select_with_parens - { - SubLink *n = makeNode(SubLink); - - n->subselect = $1; - /* other fields will be filled later */ - $$ = (Node *) n; - } - | '(' expr_list ')' { $$ = (Node *) $2; } - ; - /* * Define SQL-style CASE clause. * - Full specification @@ -18748,47 +18768,6 @@ updateRawStmtEnd(RawStmt *rs, int end_location) rs->stmt_len = end_location - rs->stmt_location; } -/* - * Adjust a PreparableStmt to reflect that it doesn't run to the end of the - * string. 
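The grammar rewrite above replaces after-the-fact length fixups with endpoint capture: bracketed constructs now record both their start and end token locations as they are parsed. A hedged consolidation of that constructor style (A_ArrayExpr and its new list_start/list_end fields are the ones this patch adds; the function mirrors makeAArrayExpr):

#include "postgres.h"
#include "nodes/parsenodes.h"

/* Endpoint-capturing constructor, in the style of makeAArrayExpr */
static Node *
make_array_expr(List *elements, int start_location, int end_location)
{
    A_ArrayExpr *n = makeNode(A_ArrayExpr);

    n->elements = elements;
    n->location = start_location;   /* the '[' token */
    n->list_start = start_location;
    n->list_end = end_location;     /* the ']' token */

    return (Node *) n;
}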
- */ -static void -updatePreparableStmtEnd(Node *n, int end_location) -{ - if (IsA(n, SelectStmt)) - { - SelectStmt *stmt = (SelectStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, InsertStmt)) - { - InsertStmt *stmt = (InsertStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, UpdateStmt)) - { - UpdateStmt *stmt = (UpdateStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, DeleteStmt)) - { - DeleteStmt *stmt = (DeleteStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, MergeStmt)) - { - MergeStmt *stmt = (MergeStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else - elog(ERROR, "unexpected node type %d", (int) n->type); -} - static Node * makeColumnRef(char *colname, List *indirection, int location, core_yyscan_t yyscanner) @@ -19167,14 +19146,11 @@ insertSelectOptions(SelectStmt *stmt, errmsg("multiple WITH clauses not allowed"), parser_errposition(exprLocation((Node *) withClause)))); stmt->withClause = withClause; - - /* Update SelectStmt's location to the start of the WITH clause */ - stmt->stmt_location = withClause->location; } } static Node * -makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location) +makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg) { SelectStmt *n = makeNode(SelectStmt); @@ -19182,7 +19158,6 @@ makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location) n->all = all; n->larg = (SelectStmt *) larg; n->rarg = (SelectStmt *) rarg; - n->stmt_location = location; return (Node *) n; } @@ -19300,12 +19275,14 @@ makeNotExpr(Node *expr, int location) } static Node * -makeAArrayExpr(List *elements, int location) +makeAArrayExpr(List *elements, int location, int location_end) { A_ArrayExpr *n = makeNode(A_ArrayExpr); n->elements = elements; n->location = location; + n->list_start = location; + n->list_end = location_end; return (Node *) n; } diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 1f8e2d54673..d66276801c6 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -1223,6 +1223,8 @@ transformAExprIn(ParseState *pstate, A_Expr *a) newa->element_typeid = scalar_type; newa->elements = aexprs; newa->multidims = false; + newa->list_start = a->rexpr_list_start; + newa->list_end = a->rexpr_list_end; newa->location = -1; result = (Node *) make_scalar_array_op(pstate, @@ -2165,6 +2167,8 @@ transformArrayExpr(ParseState *pstate, A_ArrayExpr *a, /* array_collid will be set by parse_collate.c */ newa->element_typeid = element_type; newa->elements = newcoercedelems; + newa->list_start = a->list_start; + newa->list_end = a->list_end; newa->location = a->location; return (Node *) newa; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 62015431fdf..afcf54169c3 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1279,6 +1279,28 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla lst = RelationGetNotNullConstraints(RelationGetRelid(relation), false, true); cxt->nnconstraints = list_concat(cxt->nnconstraints, lst); + + /* Copy comments on not-null constraints */ + if (table_like_clause->options & CREATE_TABLE_LIKE_COMMENTS) + { + foreach_node(Constraint, nnconstr, lst) + { + if ((comment = GetComment(get_relation_constraint_oid(RelationGetRelid(relation), + nnconstr->conname, false), + 
ConstraintRelationId, + 0)) != NULL) + { + CommentStmt *stmt = makeNode(CommentStmt); + + stmt->objtype = OBJECT_TABCONSTRAINT; + stmt->object = (Node *) list_make3(makeString(cxt->relation->schemaname), + makeString(cxt->relation->relname), + makeString(nnconstr->conname)); + stmt->comment = comment; + cxt->alist = lappend(cxt->alist, stmt); + } + } + } } /* diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 4bdc2941efb..822cf4ec451 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -1007,9 +1007,6 @@ partition_bounds_copy(PartitionBoundInfo src, int ndatums; int nindexes; int partnatts; - bool hash_part; - int natts; - Datum *boundDatums; dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); @@ -1023,7 +1020,7 @@ partition_bounds_copy(PartitionBoundInfo src, dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); - if (src->kind != NULL) + if (src->kind != NULL && ndatums > 0) { PartitionRangeDatumKind *boundKinds; @@ -1058,36 +1055,40 @@ partition_bounds_copy(PartitionBoundInfo src, * For hash partitioning, datums array will have two elements - modulus * and remainder. */ - hash_part = (key->strategy == PARTITION_STRATEGY_HASH); - natts = hash_part ? 2 : partnatts; - boundDatums = palloc(ndatums * natts * sizeof(Datum)); - - for (i = 0; i < ndatums; i++) + if (ndatums > 0) { - int j; - - dest->datums[i] = &boundDatums[i * natts]; + bool hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + int natts = hash_part ? 2 : partnatts; + Datum *boundDatums = palloc(ndatums * natts * sizeof(Datum)); - for (j = 0; j < natts; j++) + for (i = 0; i < ndatums; i++) { - bool byval; - int typlen; + int j; - if (hash_part) - { - typlen = sizeof(int32); /* Always int4 */ - byval = true; /* int4 is pass-by-value */ - } - else + dest->datums[i] = &boundDatums[i * natts]; + + for (j = 0; j < natts; j++) { - byval = key->parttypbyval[j]; - typlen = key->parttyplen[j]; - } + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + { + bool byval; + int typlen; - if (dest->kind == NULL || - dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) - dest->datums[i][j] = datumCopy(src->datums[i][j], - byval, typlen); + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + } + } } } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 4d4a1a3197e..ff96b36d710 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -310,6 +310,16 @@ static AutoVacuumShmemStruct *AutoVacuumShmem; static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); static MemoryContext DatabaseListCxt = NULL; +/* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * avl_dbase structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern avl_dbase *avl_dbase_array; +avl_dbase *avl_dbase_array; +#endif + /* Pointer to my own WorkerInfo, valid on each worker */ static WorkerInfo MyWorkerInfo = NULL; @@ -562,10 +572,10 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len) /* * Create the initial database list. The invariant we want this list to - * keep is that it's ordered by decreasing next_time. 
As soon as an entry - * is updated to a higher time, it will be moved to the front (which is - * correct because the only operation is to add autovacuum_naptime to the - * entry, and time always increases). + * keep is that it's ordered by decreasing next_worker. As soon as an + * entry is updated to a higher time, it will be moved to the front (which + * is correct because the only operation is to add autovacuum_naptime to + * the entry, and time always increases). */ rebuild_database_list(InvalidOid); @@ -781,10 +791,6 @@ ProcessAutoVacLauncherInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - /* Publish memory contexts of this process */ - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); - /* Process sinval catchup interrupts that happened while sleeping */ ProcessCatchupInterrupt(); } @@ -1024,6 +1030,10 @@ rebuild_database_list(Oid newdb) /* put all the hash elements into an array */ dbary = palloc(nelems * sizeof(avl_dbase)); + /* keep Valgrind quiet */ +#ifdef USE_VALGRIND + avl_dbase_array = dbary; +#endif i = 0; hash_seq_init(&seq, dbhash); @@ -2077,6 +2087,12 @@ do_autovacuum(void) } } } + + /* Release stuff to avoid per-relation leakage */ + if (relopts) + pfree(relopts); + if (tabentry) + pfree(tabentry); } table_endscan(relScan); @@ -2093,7 +2109,8 @@ do_autovacuum(void) Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); PgStat_StatTabEntry *tabentry; Oid relid; - AutoVacOpts *relopts = NULL; + AutoVacOpts *relopts; + bool free_relopts = false; bool dovacuum; bool doanalyze; bool wraparound; @@ -2111,7 +2128,9 @@ do_autovacuum(void) * main rel */ relopts = extract_autovac_opts(tuple, pg_class_desc); - if (relopts == NULL) + if (relopts) + free_relopts = true; + else { av_relation *hentry; bool found; @@ -2132,6 +2151,12 @@ do_autovacuum(void) /* ignore analyze for toast tables */ if (dovacuum) table_oids = lappend_oid(table_oids, relid); + + /* Release stuff to avoid leakage */ + if (free_relopts) + pfree(relopts); + if (tabentry) + pfree(tabentry); } table_endscan(relScan); @@ -2223,6 +2248,12 @@ do_autovacuum(void) get_namespace_name(classForm->relnamespace), NameStr(classForm->relname)))); + /* + * Deletion might involve TOAST table access, so ensure we have a + * valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + object.classId = RelationRelationId; object.objectId = relid; object.objectSubId = 0; @@ -2235,6 +2266,7 @@ do_autovacuum(void) * To commit the deletion, end current transaction and start a new * one. Note this also releases the locks we took. */ + PopActiveSnapshot(); CommitTransactionCommand(); StartTransactionCommand(); @@ -2503,6 +2535,8 @@ deleted: pg_atomic_test_set_flag(&MyWorkerInfo->wi_dobalance); } + list_free(table_oids); + /* * Perform additional work items, as requested by backends. */ @@ -2545,8 +2579,18 @@ deleted: /* * We leak table_toast_map here (among other things), but since we're - * going away soon, it's not a problem. + * going away soon, it's not a problem normally. But when using Valgrind, + * release some stuff to reduce complaints about leaked storage. */ +#ifdef USE_VALGRIND + hash_destroy(table_toast_map); + FreeTupleDesc(pg_class_desc); + if (bstrategy) + pfree(bstrategy); +#endif + + /* Run the rest in xact context, mainly to avoid Valgrind leak warnings */ + MemoryContextSwitchTo(TopTransactionContext); /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. 
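The deletion path above brackets the work with an explicit snapshot because dropping an orphan temp table may touch TOAST data. A minimal sketch of that bracketing pattern (PushActiveSnapshot, GetTransactionSnapshot, and the transaction-control calls are real; the callback is hypothetical):

#include "postgres.h"
#include "access/xact.h"
#include "utils/snapmgr.h"

/* Run 'work' with a valid snapshot, then commit and start a fresh xact */
static void
run_with_snapshot_then_commit(void (*work) (void))
{
    PushActiveSnapshot(GetTransactionSnapshot());
    work();
    PopActiveSnapshot();

    /* committing also releases locks taken by 'work' */
    CommitTransactionCommand();
    StartTransactionCommand();
}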
We @@ -2684,8 +2728,8 @@ deleted2: /* * extract_autovac_opts * - * Given a relation's pg_class tuple, return the AutoVacOpts portion of - * reloptions, if set; otherwise, return NULL. + * Given a relation's pg_class tuple, return a palloc'd copy of the + * AutoVacOpts portion of reloptions, if set; otherwise, return NULL. * * Note: callers do not have a relation lock on the table at this point, * so the table could have been dropped, and its catalog rows gone, after @@ -2734,6 +2778,7 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, autovac_table *tab = NULL; bool wraparound; AutoVacOpts *avopts; + bool free_avopts = false; /* fetch the relation's relcache entry */ classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); @@ -2746,8 +2791,10 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, * main table reloptions if the toast table itself doesn't have. */ avopts = extract_autovac_opts(classTup, pg_class_desc); - if (classForm->relkind == RELKIND_TOASTVALUE && - avopts == NULL && table_toast_map != NULL) + if (avopts) + free_avopts = true; + else if (classForm->relkind == RELKIND_TOASTVALUE && + table_toast_map != NULL) { av_relation *hentry; bool found; @@ -2856,6 +2903,8 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, avopts->vacuum_cost_delay >= 0)); } + if (free_avopts) + pfree(avopts); heap_freetuple(classTup); return tab; } @@ -2887,6 +2936,10 @@ recheck_relation_needs_vacanalyze(Oid relid, effective_multixact_freeze_max_age, dovacuum, doanalyze, wraparound); + /* Release tabentry to avoid leakage */ + if (tabentry) + pfree(tabentry); + /* ignore ANALYZE for toast tables */ if (classForm->relkind == RELKIND_TOASTVALUE) *doanalyze = false; @@ -3144,20 +3197,24 @@ autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) VacuumRelation *rel; List *rel_list; MemoryContext vac_context; + MemoryContext old_context; /* Let pgstat know what we're doing */ autovac_report_activity(tab); + /* Create a context that vacuum() can use as cross-transaction storage */ + vac_context = AllocSetContextCreate(CurrentMemoryContext, + "Vacuum", + ALLOCSET_DEFAULT_SIZES); + /* Set up one VacuumRelation target, identified by OID, for vacuum() */ + old_context = MemoryContextSwitchTo(vac_context); rangevar = makeRangeVar(tab->at_nspname, tab->at_relname, -1); rel = makeVacuumRelation(rangevar, tab->at_relid, NIL); rel_list = list_make1(rel); + MemoryContextSwitchTo(old_context); - vac_context = AllocSetContextCreate(CurrentMemoryContext, - "Vacuum", - ALLOCSET_DEFAULT_SIZES); - - vacuum(rel_list, &tab->at_params, bstrategy, vac_context, true); + vacuum(rel_list, tab->at_params, bstrategy, vac_context, true); MemoryContextDelete(vac_context); } diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 116ddf7b835..1ad65c237c3 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -613,6 +613,7 @@ ResetBackgroundWorkerCrashTimes(void) * resetting. */ rw->rw_crashed_at = 0; + rw->rw_pid = 0; /* * If there was anyone waiting for it, they're history. 
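The autovacuum_do_vac_analyze() change above is the standard cross-transaction storage pattern: allocate the objects that vacuum() must keep across its internal commits inside a dedicated context, then delete the whole context afterwards. A hedged sketch of the pattern on its own (the memory-context API is real; do_multi_xact_work stands in for vacuum()):

#include "postgres.h"
#include "utils/memutils.h"

static void
cross_xact_storage_demo(void (*do_multi_xact_work) (void *arg), void *arg)
{
    MemoryContext work_cxt;
    MemoryContext old_cxt;

    /* storage that must survive transaction boundaries */
    work_cxt = AllocSetContextCreate(CurrentMemoryContext,
                                     "work context",
                                     ALLOCSET_DEFAULT_SIZES);

    /* build the long-lived objects inside work_cxt */
    old_cxt = MemoryContextSwitchTo(work_cxt);
    /* ... makeRangeVar(), list_make1(), etc. would go here ... */
    MemoryContextSwitchTo(old_cxt);

    do_multi_xact_work(arg);    /* may commit/start transactions */

    MemoryContextDelete(work_cxt);
}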
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index d3cb3f1891c..e84e8663e96 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -42,6 +42,8 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogrecovery.h" +#include "catalog/pg_authid.h" +#include "commands/defrem.h" #include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" @@ -61,6 +63,7 @@ #include "storage/shmem.h" #include "storage/smgr.h" #include "storage/spin.h" +#include "utils/acl.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" @@ -127,6 +130,13 @@ typedef struct int num_requests; /* current # of requests */ int max_requests; /* allocated array size */ + + int head; /* Index of the first request in the ring + * buffer */ + int tail; /* Index of the last request in the ring + * buffer */ + + /* The ring buffer of pending checkpointer requests */ CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; } CheckpointerShmemStruct; @@ -135,6 +145,12 @@ static CheckpointerShmemStruct *CheckpointerShmem; /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ #define WRITES_PER_ABSORB 1000 +/* Maximum number of checkpointer requests to process in one batch */ +#define CKPT_REQ_BATCH_SIZE 10000 + +/* Max number of requests the checkpointer request queue can hold */ +#define MAX_CHECKPOINT_REQUESTS 10000000 + /* * GUC parameters */ @@ -161,7 +177,7 @@ static pg_time_t last_xlog_switch_time; static void ProcessCheckpointerInterrupts(void); static void CheckArchiveTimeout(void); static bool IsCheckpointOnSchedule(double progress); -static bool ImmediateCheckpointRequested(void); +static bool FastCheckpointRequested(void); static bool CompactCheckpointerRequestQueue(void); static void UpdateSharedMemoryConfig(void); @@ -663,10 +679,6 @@ ProcessCheckpointerInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - - /* Publish memory contexts of this process */ - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); } /* @@ -738,12 +750,12 @@ CheckArchiveTimeout(void) } /* - * Returns true if an immediate checkpoint request is pending. (Note that - * this does not check the *current* checkpoint's IMMEDIATE flag, but whether - * there is one pending behind it.) + * Returns true if a fast checkpoint request is pending. (Note that this does + * not check the *current* checkpoint's FAST flag, but whether there is one + * pending behind it.) */ static bool -ImmediateCheckpointRequested(void) +FastCheckpointRequested(void) { volatile CheckpointerShmemStruct *cps = CheckpointerShmem; @@ -751,7 +763,7 @@ ImmediateCheckpointRequested(void) * We don't need to acquire the ckpt_lck in this case because we're only * looking at a single flag bit. */ - if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE) + if (cps->ckpt_flags & CHECKPOINT_FAST) return true; return false; } @@ -764,7 +776,7 @@ ImmediateCheckpointRequested(void) * checkpoint_completion_target. * * The checkpoint request flags should be passed in; currently the only one - * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. + * examined is CHECKPOINT_FAST, which disables delays between writes. * * 'progress' is an estimate of how much of the work has been done, as a * fraction between 0.0 meaning none, and 1.0 meaning all done. 
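The new cap above bounds the request queue when shared memory is sized (the sizing code follows below). A hedged sketch of the overflow-safe arithmetic convention (add_size/mul_size are the real storage/shmem.h helpers, which error out on overflow; the names and cap here are illustrative):

#include "postgres.h"
#include "storage/shmem.h"

#define DEMO_MAX_SLOTS 10000000     /* mirrors MAX_CHECKPOINT_REQUESTS */

/* Size a header plus a capped array of fixed-size slots */
static Size
demo_queue_shmem_size(int nslots_wanted, Size header_size, Size slot_size)
{
    Size        size = header_size;

    size = add_size(size, mul_size(Min(nslots_wanted, DEMO_MAX_SLOTS),
                                   slot_size));
    return size;
}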
@@ -782,10 +794,10 @@ CheckpointWriteDelay(int flags, double progress) * Perform the usual duties and take a nap, unless we're behind schedule, * in which case we just try to catch up as quickly as possible. */ - if (!(flags & CHECKPOINT_IMMEDIATE) && + if (!(flags & CHECKPOINT_FAST) && !ShutdownXLOGPending && !ShutdownRequestPending && - !ImmediateCheckpointRequested() && + !FastCheckpointRequested() && IsCheckpointOnSchedule(progress)) { if (ConfigReloadPending) @@ -941,11 +953,14 @@ CheckpointerShmemSize(void) Size size; /* - * Currently, the size of the requests[] array is arbitrarily set equal to - * NBuffers. This may prove too large or small ... + * The size of the requests[] array is arbitrarily set equal to NBuffers. + * But there is a cap of MAX_CHECKPOINT_REQUESTS to prevent accumulating + * too many checkpoint requests in the ring buffer. */ size = offsetof(CheckpointerShmemStruct, requests); - size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest))); + size = add_size(size, mul_size(Min(NBuffers, + MAX_CHECKPOINT_REQUESTS), + sizeof(CheckpointerRequest))); return size; } @@ -974,24 +989,75 @@ CheckpointerShmemInit(void) */ MemSet(CheckpointerShmem, 0, size); SpinLockInit(&CheckpointerShmem->ckpt_lck); - CheckpointerShmem->max_requests = NBuffers; + CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS); + CheckpointerShmem->head = CheckpointerShmem->tail = 0; ConditionVariableInit(&CheckpointerShmem->start_cv); ConditionVariableInit(&CheckpointerShmem->done_cv); } } /* + * ExecCheckpoint + * Primary entry point for manual CHECKPOINT commands + * + * This is mainly a wrapper for RequestCheckpoint(). + */ +void +ExecCheckpoint(ParseState *pstate, CheckPointStmt *stmt) +{ + bool fast = true; + bool unlogged = false; + + foreach_ptr(DefElem, opt, stmt->options) + { + if (strcmp(opt->defname, "mode") == 0) + { + char *mode = defGetString(opt); + + if (strcmp(mode, "spread") == 0) + fast = false; + else if (strcmp(mode, "fast") != 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized MODE option \"%s\"", mode), + parser_errposition(pstate, opt->location))); + } + else if (strcmp(opt->defname, "flush_unlogged") == 0) + unlogged = defGetBoolean(opt); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized CHECKPOINT option \"%s\"", opt->defname), + parser_errposition(pstate, opt->location))); + } + + if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + /* translator: %s is name of an SQL command (e.g., CHECKPOINT) */ + errmsg("permission denied to execute %s command", + "CHECKPOINT"), + errdetail("Only roles with privileges of the \"%s\" role may execute this command.", + "pg_checkpoint"))); + + RequestCheckpoint(CHECKPOINT_WAIT | + (fast ? CHECKPOINT_FAST : 0) | + (unlogged ? CHECKPOINT_FLUSH_UNLOGGED : 0) | + (RecoveryInProgress() ? 0 : CHECKPOINT_FORCE)); +} + +/* * RequestCheckpoint * Called in backend processes to request a checkpoint * * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. - * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * CHECKPOINT_FAST: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or - * CHECKPOINT_END_OF_RECOVERY). 
+ * CHECKPOINT_END_OF_RECOVERY, and the CHECKPOINT command). * CHECKPOINT_WAIT: wait for completion before returning (otherwise, * just signal checkpointer to do it, and return). * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. @@ -1013,7 +1079,7 @@ RequestCheckpoint(int flags) * There's no point in doing slow checkpoints in a standalone backend, * because there's no other backends the checkpoint could disrupt. */ - CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); + CreateCheckPoint(flags | CHECKPOINT_FAST); /* Free all smgr objects, as CheckpointerMain() normally would. */ smgrdestroyall(); @@ -1152,6 +1218,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) { CheckpointerRequest *request; bool too_full; + int insert_pos; if (!IsUnderPostmaster) return false; /* probably shouldn't even get here */ @@ -1175,10 +1242,14 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) } /* OK, insert request */ - request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + insert_pos = CheckpointerShmem->tail; + request = &CheckpointerShmem->requests[insert_pos]; request->ftag = *ftag; request->type = type; + CheckpointerShmem->tail = (CheckpointerShmem->tail + 1) % CheckpointerShmem->max_requests; + CheckpointerShmem->num_requests++; + /* If queue is more than half full, nudge the checkpointer to empty it */ too_full = (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests / 2); @@ -1220,12 +1291,16 @@ CompactCheckpointerRequestQueue(void) struct CheckpointerSlotMapping { CheckpointerRequest request; - int slot; + int ring_idx; }; - int n, - preserve_count; + int n; int num_skipped = 0; + int head; + int max_requests; + int num_requests; + int read_idx, + write_idx; HASHCTL ctl; HTAB *htab; bool *skip_slot; @@ -1237,8 +1312,13 @@ CompactCheckpointerRequestQueue(void) if (CritSectionCount > 0) return false; + max_requests = CheckpointerShmem->max_requests; + num_requests = CheckpointerShmem->num_requests; + /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + skip_slot = palloc0(sizeof(bool) * max_requests); + + head = CheckpointerShmem->head; /* Initialize temporary hash table */ ctl.keysize = sizeof(CheckpointerRequest); @@ -1262,7 +1342,8 @@ CompactCheckpointerRequestQueue(void) * away preceding entries that would end up being canceled anyhow), but * it's not clear that the extra complexity would buy us anything. */ - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = head; + for (n = 0; n < num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; @@ -1275,16 +1356,19 @@ CompactCheckpointerRequestQueue(void) * CheckpointerShmemInit. Note also that RelFileLocator had better * contain no pad bytes. */ - request = &CheckpointerShmem->requests[n]; + request = &CheckpointerShmem->requests[read_idx]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { /* Duplicate, so mark the previous occurrence as skippable */ - skip_slot[slotmap->slot] = true; + skip_slot[slotmap->ring_idx] = true; num_skipped++; } /* Remember slot containing latest occurrence of this request value */ - slotmap->slot = n; + slotmap->ring_idx = read_idx; + + /* Move to the next request in the ring buffer */ + read_idx = (read_idx + 1) % max_requests; } /* Done with the hash table. */ @@ -1298,17 +1382,34 @@ CompactCheckpointerRequestQueue(void) } /* We found some duplicates; remove them. 
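ForwardSyncRequest() above now appends at the tail of a ring buffer instead of a dense array. A standalone, hedged sketch of just that insert step (plain C; no PostgreSQL types assumed):

#include <stdbool.h>

/* Push one value at the ring's tail; fails if the queue is full */
static bool
ring_push(int *ring, int *tail, int *count, int capacity, int value)
{
    if (*count >= capacity)
        return false;           /* caller must compact or process first */

    ring[*tail] = value;
    *tail = (*tail + 1) % capacity;
    (*count)++;

    return true;
}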
*/ - preserve_count = 0; - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = write_idx = head; + for (n = 0; n < num_requests; n++) { - if (skip_slot[n]) - continue; - CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; + /* If this slot is NOT skipped, keep it */ + if (!skip_slot[read_idx]) + { + /* If the read and write positions are different, copy the request */ + if (write_idx != read_idx) + CheckpointerShmem->requests[write_idx] = + CheckpointerShmem->requests[read_idx]; + + /* Advance the write position */ + write_idx = (write_idx + 1) % max_requests; + } + + read_idx = (read_idx + 1) % max_requests; } + + /* + * Update ring buffer state: head remains the same, tail moves, count + * decreases + */ + CheckpointerShmem->tail = write_idx; + CheckpointerShmem->num_requests -= num_skipped; + ereport(DEBUG1, (errmsg_internal("compacted fsync request queue from %d entries to %d entries", - CheckpointerShmem->num_requests, preserve_count))); - CheckpointerShmem->num_requests = preserve_count; + num_requests, CheckpointerShmem->num_requests))); /* Cleanup. */ pfree(skip_slot); @@ -1329,40 +1430,64 @@ AbsorbSyncRequests(void) { CheckpointerRequest *requests = NULL; CheckpointerRequest *request; - int n; + int n, + i; + bool loop; if (!AmCheckpointerProcess()) return; - LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array, and processing the requests after releasing the lock. - * - * Once we have cleared the requests from shared memory, we have to PANIC - * if we then fail to absorb them (eg, because our hashtable runs out of - * memory). This is because the system cannot run safely if we are unable - * to fsync what we have been told to fsync. Fortunately, the hashtable - * is so small that the problem is quite unlikely to arise in practice. - */ - n = CheckpointerShmem->num_requests; - if (n > 0) + do { - requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - } + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /*--- + * We try to avoid holding the lock for a long time by: + * 1. Copying the request array and processing the requests after + * releasing the lock; + * 2. Processing not the whole queue, but only batches of + * CKPT_REQ_BATCH_SIZE at once. + * + * Once we have cleared the requests from shared memory, we must + * PANIC if we then fail to absorb them (e.g., because our hashtable + * runs out of memory). This is because the system cannot run safely + * if we are unable to fsync what we have been told to fsync. + * Fortunately, the hashtable is so small that the problem is quite + * unlikely to arise in practice. + * + * Note: The maximum possible size of a ring buffer is + * MAX_CHECKPOINT_REQUESTS entries, which fit into a maximum palloc + * allocation size of 1Gb. Our maximum batch size, + * CKPT_REQ_BATCH_SIZE, is even smaller. 
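The AbsorbSyncRequests rewrite that follows bounds both the lock hold time and the size of any single allocation by draining the queue in batches. Stripped to control flow (shmem stands in for CheckpointerShmem; the requests buffer is allocated once and reused):

do
{
    LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);

    n = Min(shmem->num_requests, CKPT_REQ_BATCH_SIZE);
    /* copy n entries out, advancing shmem->head modulo max_requests */
    shmem->num_requests -= n;

    loop = (shmem->num_requests != 0);  /* another batch pending? */
    LWLockRelease(CheckpointerCommLock);

    /*
     * Replay the copied batch outside the lock but inside a critical
     * section: once the entries are gone from shared memory, failing to
     * remember an fsync request must PANIC, as the comment explains.
     */
} while (loop);

One subtlety the code preserves: the buffer is sized for the first batch, and the loop only continues when that batch hit the full CKPT_REQ_BATCH_SIZE, so later batches can never exceed the initial allocation and no repalloc is needed.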
+ */ + n = Min(CheckpointerShmem->num_requests, CKPT_REQ_BATCH_SIZE); + if (n > 0) + { + if (!requests) + requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - START_CRIT_SECTION(); + for (i = 0; i < n; i++) + { + requests[i] = CheckpointerShmem->requests[CheckpointerShmem->head]; + CheckpointerShmem->head = (CheckpointerShmem->head + 1) % CheckpointerShmem->max_requests; + } - CheckpointerShmem->num_requests = 0; + CheckpointerShmem->num_requests -= n; - LWLockRelease(CheckpointerCommLock); + } + + START_CRIT_SECTION(); + + /* Are there any requests in the queue? If so, keep going. */ + loop = CheckpointerShmem->num_requests != 0; + + LWLockRelease(CheckpointerCommLock); - for (request = requests; n > 0; request++, n--) - RememberSyncRequest(&request->ftag, request->type); + for (request = requests; n > 0; request++, n--) + RememberSyncRequest(&request->ftag, request->type); - END_CRIT_SECTION(); + END_CRIT_SECTION(); + } while (loop); if (requests) pfree(requests); diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c index f24f574e748..0ae9bf906ec 100644 --- a/src/backend/postmaster/interrupt.c +++ b/src/backend/postmaster/interrupt.c @@ -48,10 +48,6 @@ ProcessMainLoopInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - - /* Publish memory contexts of this process */ - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); } /* diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index cb7408acf4c..78e39e5f866 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -718,15 +718,15 @@ pgarch_readyXlog(char *xlog) /* * Store the file in our max-heap if it has a high enough priority. */ - if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) < NUM_FILES_PER_DIRECTORY_SCAN) { /* If the heap isn't full yet, quickly add it. */ - arch_file = arch_files->arch_filenames[arch_files->arch_heap->bh_size]; + arch_file = arch_files->arch_filenames[binaryheap_size(arch_files->arch_heap)]; strcpy(arch_file, basename); binaryheap_add_unordered(arch_files->arch_heap, CStringGetDatum(arch_file)); /* If we just filled the heap, make it a valid one. */ - if (arch_files->arch_heap->bh_size == NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) == NUM_FILES_PER_DIRECTORY_SCAN) binaryheap_build(arch_files->arch_heap); } else if (ready_file_comparator(binaryheap_first(arch_files->arch_heap), @@ -744,21 +744,21 @@ pgarch_readyXlog(char *xlog) FreeDir(rldir); /* If no files were found, simply return. */ - if (arch_files->arch_heap->bh_size == 0) + if (binaryheap_empty(arch_files->arch_heap)) return false; /* * If we didn't fill the heap, we didn't make it a valid one. Do that * now. */ - if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + if (binaryheap_size(arch_files->arch_heap) < NUM_FILES_PER_DIRECTORY_SCAN) binaryheap_build(arch_files->arch_heap); /* * Fill arch_files array with the files to archive in ascending order of * priority. 
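The pgarch hunks below swap direct pokes at the bh_size field for the binaryheap_size()/binaryheap_empty() accessors; behavior is unchanged. The surrounding pattern is worth noting on its own: unordered bulk-load followed by a single heapify is O(n), versus O(n log n) for n ordered insertions. In outline, using the binaryheap API as it appears in the hunk (capacity, cmp, more_items, and next_item are placeholders):

binaryheap *h = binaryheap_allocate(capacity, cmp, NULL);

/* O(1) appends while filling; heap invariant deliberately not maintained */
while (binaryheap_size(h) < capacity && more_items())
    binaryheap_add_unordered(h, next_item());

/* one O(n) heapify makes it a valid heap */
binaryheap_build(h);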
*/ - arch_files->arch_files_size = arch_files->arch_heap->bh_size; + arch_files->arch_files_size = binaryheap_size(arch_files->arch_heap); for (int i = 0; i < arch_files->arch_files_size; i++) arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); @@ -867,10 +867,6 @@ ProcessPgArchInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - /* Publish memory contexts of this process */ - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); - if (ConfigReloadPending) { char *archiveLib = pstrdup(XLogArchiveLibrary); diff --git a/src/backend/postmaster/pmchild.c b/src/backend/postmaster/pmchild.c index cde1d23a4ca..584bb58c8ab 100644 --- a/src/backend/postmaster/pmchild.c +++ b/src/backend/postmaster/pmchild.c @@ -60,6 +60,17 @@ NON_EXEC_STATIC int num_pmchild_slots = 0; dlist_head ActiveChildList; /* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * PMChild structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern PMChild *pmchild_array; +PMChild *pmchild_array; +#endif + + +/* * MaxLivePostmasterChildren * * This reports the number of postmaster child processes that can be active. @@ -125,8 +136,13 @@ InitPostmasterChildSlots(void) for (int i = 0; i < BACKEND_NUM_TYPES; i++) num_pmchild_slots += pmchild_pools[i].size; - /* Initialize them */ + /* Allocate enough slots, and make sure Valgrind doesn't complain */ slots = palloc(num_pmchild_slots * sizeof(PMChild)); +#ifdef USE_VALGRIND + pmchild_array = slots; +#endif + + /* Initialize them */ slotno = 0; for (int btype = 0; btype < BACKEND_NUM_TYPES; btype++) { diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 490f7ce3664..e01d9f0cfe8 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2630,6 +2630,13 @@ CleanupBackend(PMChild *bp, } bp = NULL; + /* + * In a crash case, exit immediately without resetting background worker + * state. However, if restart_after_crash is enabled, the background + * worker state (e.g., rw_pid) still needs be reset so the worker can + * restart after crash recovery. This reset is handled in + * ResetBackgroundWorkerCrashTimes(), not here. + */ if (crashed) { HandleChildCrash(bp_pid, exitstatus, procname); @@ -4337,15 +4344,15 @@ maybe_start_bgworkers(void) static bool maybe_reap_io_worker(int pid) { - for (int id = 0; id < MAX_IO_WORKERS; ++id) + for (int i = 0; i < MAX_IO_WORKERS; ++i) { - if (io_worker_children[id] && - io_worker_children[id]->pid == pid) + if (io_worker_children[i] && + io_worker_children[i]->pid == pid) { - ReleasePostmasterChildSlot(io_worker_children[id]); + ReleasePostmasterChildSlot(io_worker_children[i]); --io_worker_count; - io_worker_children[id] = NULL; + io_worker_children[i] = NULL; return true; } } @@ -4389,22 +4396,22 @@ maybe_adjust_io_workers(void) while (io_worker_count < io_workers) { PMChild *child; - int id; + int i; /* find unused entry in io_worker_children array */ - for (id = 0; id < MAX_IO_WORKERS; ++id) + for (i = 0; i < MAX_IO_WORKERS; ++i) { - if (io_worker_children[id] == NULL) + if (io_worker_children[i] == NULL) break; } - if (id == MAX_IO_WORKERS) - elog(ERROR, "could not find a free IO worker ID"); + if (i == MAX_IO_WORKERS) + elog(ERROR, "could not find a free IO worker slot"); /* Try to launch one. 
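The pmchild.c change above is a standard Valgrind idiom: leak detection classifies a block as "lost" only if no pointer to it remains reachable at exit, so parking the address in a global downgrades the intentionally永久 slots array from a leak report to "still reachable". In isolation (keepalive and init_slots are hypothetical names; USE_VALGRIND is the macro from the hunk):

#include <stdlib.h>

#ifdef USE_VALGRIND
/* extern declaration first, matching the hunk's pattern; a non-static
 * global cannot be optimized away, so the block stays reachable */
extern void *keepalive;
void       *keepalive;
#endif

static void
init_slots(size_t nbytes)
{
    void       *slots = malloc(nbytes); /* intentionally lives until exit */

#ifdef USE_VALGRIND
    keepalive = slots;          /* keeps the block "reachable" for Valgrind */
#endif
    /* ... hand out slots ... */
}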
*/ child = StartChildProcess(B_IO_WORKER); if (child != NULL) { - io_worker_children[id] = child; + io_worker_children[i] = child; ++io_worker_count; } else @@ -4415,11 +4422,11 @@ maybe_adjust_io_workers(void) if (io_worker_count > io_workers) { /* ask the IO worker in the highest slot to exit */ - for (int id = MAX_IO_WORKERS - 1; id >= 0; --id) + for (int i = MAX_IO_WORKERS - 1; i >= 0; --i) { - if (io_worker_children[id] != NULL) + if (io_worker_children[i] != NULL) { - kill(io_worker_children[id]->pid, SIGUSR2); + kill(io_worker_children[i]->pid, SIGUSR2); break; } } diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 7149a67fcbc..27e86cf393f 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -192,10 +192,6 @@ ProcessStartupProcInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - - /* Publish memory contexts of this process */ - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index c7a76711cc5..777c9a8d555 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -385,7 +385,7 @@ WalSummarizerMain(const void *startup_data, size_t startup_data_len) switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli); ereport(DEBUG1, - errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X", + errmsg_internal("switch point from TLI %u to TLI %u is at %X/%08X", current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn))); } @@ -741,7 +741,7 @@ WaitForWalSummarization(XLogRecPtr lsn) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("WAL summarization is not progressing"), - errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", + errdetail("Summarization is needed through %X/%08X, but is stuck at %X/%08X on disk and %X/%08X in memory.", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(summarized_lsn), LSN_FORMAT_ARGS(pending_lsn)))); @@ -755,12 +755,12 @@ WaitForWalSummarization(XLogRecPtr lsn) current_time) / 1000; ereport(WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second", - "still waiting for WAL summarization through %X/%X after %ld seconds", + errmsg_plural("still waiting for WAL summarization through %X/%08X after %ld second", + "still waiting for WAL summarization through %X/%08X after %ld seconds", elapsed_seconds, LSN_FORMAT_ARGS(lsn), elapsed_seconds), - errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", + errdetail("Summarization has reached %X/%08X on disk and %X/%08X in memory.", LSN_FORMAT_ARGS(summarized_lsn), LSN_FORMAT_ARGS(pending_lsn)))); } @@ -879,10 +879,6 @@ ProcessWalSummarizerInterrupts(void) /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - - /* Publish memory contexts of this process */ - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); } /* @@ -985,7 +981,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, if (private_data->end_of_wal) { ereport(DEBUG1, - errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", + errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X", tli, LSN_FORMAT_ARGS(start_lsn), 
LSN_FORMAT_ARGS(private_data->read_upto))); @@ -1004,8 +1000,8 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, } else ereport(ERROR, - (errmsg("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(start_lsn)))); + errmsg("could not find a valid record after %X/%08X", + LSN_FORMAT_ARGS(start_lsn))); } /* We shouldn't go backward. */ @@ -1038,7 +1034,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, * able to read a complete record. */ ereport(DEBUG1, - errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", + errmsg_internal("could not read WAL from timeline %u at %X/%08X: end of WAL at %X/%08X", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), LSN_FORMAT_ARGS(private_data->read_upto))); @@ -1049,13 +1045,13 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, if (errormsg) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL from timeline %u at %X/%X: %s", + errmsg("could not read WAL from timeline %u at %X/%08X: %s", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), errormsg))); else ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read WAL from timeline %u at %X/%X", + errmsg("could not read WAL from timeline %u at %X/%08X", tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); } @@ -1226,7 +1222,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* Tell the user what we did. */ ereport(DEBUG1, - errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X", + errmsg_internal("summarized WAL on TLI %u from %X/%08X to %X/%08X", tli, LSN_FORMAT_ARGS(summary_start_lsn), LSN_FORMAT_ARGS(summary_end_lsn))); @@ -1238,7 +1234,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, /* If we skipped a non-zero amount of WAL, log a debug message. */ if (summary_end_lsn > summary_start_lsn && fast_forward) ereport(DEBUG1, - errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X", + errmsg_internal("skipped summarizing WAL on TLI %u from %X/%08X to %X/%08X", tli, LSN_FORMAT_ARGS(summary_start_lsn), LSN_FORMAT_ARGS(summary_end_lsn))); @@ -1584,7 +1580,7 @@ summarizer_read_local_xlog_page(XLogReaderState *state, /* Debugging output. */ ereport(DEBUG1, - errmsg_internal("timeline %u became historic, can read up to %X/%X", + errmsg_internal("timeline %u became historic, can read up to %X/%08X", private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto))); } diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 78193cfb964..d9eab5357bc 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -20,58 +20,13 @@ #include "common/unicode_category.h" #include "utils/pg_locale.h" -/* - * For the libc provider, to provide as much functionality as possible on a - * variety of platforms without going so far as to implement everything from - * scratch, we use several implementation strategies depending on the - * situation: - * - * 1. In C/POSIX collations, we use hard-wired code. We can't depend on - * the <ctype.h> functions since those will obey LC_CTYPE. Note that these - * collations don't give a fig about multibyte characters. - * - * 2. When working in UTF8 encoding, we use the <wctype.h> functions. - * This assumes that every platform uses Unicode codepoints directly - * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption - * even for non-UTF8 encodings, which may be a problem.) 
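The walsummarizer hunks, like many below, change every %X/%X to %X/%08X. LSN_FORMAT_ARGS splits a 64-bit XLogRecPtr into high and low 32-bit halves, and without zero-padding the low half, nearby LSNs render with varying widths and do not sort textually. The effect, assuming the standard macro from xlogdefs.h:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;
#define LSN_FORMAT_ARGS(lsn) ((uint32_t) ((lsn) >> 32)), ((uint32_t) (lsn))

int
main(void)
{
    XLogRecPtr  lsn = 0x300000028ULL;

    printf("%X/%X\n", LSN_FORMAT_ARGS(lsn));    /* old style: 3/28 */
    printf("%X/%08X\n", LSN_FORMAT_ARGS(lsn));  /* new style: 3/00000028 */
    return 0;
}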
On some platforms - * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. - * - * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar - * values up to 255, and punt for values above that. This is 100% correct - * only in single-byte encodings such as LATINn. However, non-Unicode - * multibyte encodings are mostly Far Eastern character sets for which the - * properties being tested here aren't very relevant for higher code values - * anyway. The difficulty with using the <wctype.h> functions with - * non-Unicode multibyte encodings is that we can have no certainty that - * the platform's wchar_t representation matches what we do in pg_wchar - * conversions. - * - * As a special case, in the "default" collation, (2) and (3) force ASCII - * letters to follow ASCII upcase/downcase rules, while in a non-default - * collation we just let the library functions do what they will. The case - * where this matters is treatment of I/i in Turkish, and the behavior is - * meant to match the upper()/lower() SQL functions. - * - * We store the active collation setting in static variables. In principle - * it could be passed down to here via the regex library's "struct vars" data - * structure; but that would require somewhat invasive changes in the regex - * library, and right now there's no real benefit to be gained from that. - * - * NB: the coding here assumes pg_wchar is an unsigned type. - */ - -typedef enum -{ - PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ - PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ - PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */ - PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */ - PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ -} PG_Locale_Strategy; - -static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; +static struct pg_locale_struct dummy_c_locale = { + .collate_is_c = true, + .ctype_is_c = true, +}; + /* * Hard-wired character properties for C locale */ @@ -228,7 +183,6 @@ void pg_set_regex_collation(Oid collation) { pg_locale_t locale = 0; - PG_Locale_Strategy strategy; if (!OidIsValid(collation)) { @@ -249,8 +203,7 @@ pg_set_regex_collation(Oid collation) * catalog access is available, so we can't call * pg_newlocale_from_collation(). 
*/ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; + locale = &dummy_c_locale; } else { @@ -267,113 +220,41 @@ pg_set_regex_collation(Oid collation) * C/POSIX collations use this path regardless of database * encoding */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; - } - else if (locale->provider == COLLPROVIDER_BUILTIN) - { - Assert(GetDatabaseEncoding() == PG_UTF8); - strategy = PG_REGEX_STRATEGY_BUILTIN; - } -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - { - strategy = PG_REGEX_STRATEGY_ICU; - } -#endif - else - { - Assert(locale->provider == COLLPROVIDER_LIBC); - if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; - else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; + locale = &dummy_c_locale; } } - pg_regex_strategy = strategy; pg_regex_locale = locale; } static int pg_wc_isdigit(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISDIGIT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isdigit(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + else + return pg_regex_locale->ctype->wc_isdigit(c, pg_regex_locale); } static int pg_wc_isalpha(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALPHA)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalpha(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalpha(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + else + return pg_regex_locale->ctype->wc_isalpha(c, pg_regex_locale); } static int pg_wc_isalnum(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALNUM)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalnum(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + else + return pg_regex_locale->ctype->wc_isalnum(c, pg_regex_locale); } static int @@ 
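The regc_pg_locale.c rewrite makes two moves: a static dummy_c_locale sentinel so pg_regex_locale is never NULL even in the C-locale case, and a per-provider method table reached through pg_regex_locale->ctype, replacing the five-way PG_Locale_Strategy switch that was duplicated in every pg_wc_* classifier. The shape of the dispatch table, sketched (only the wc_* callback names and the max_chr field appear in the hunks; the type name here is a guess, and the ICU callbacks are hypothetical):

typedef struct pg_ctype_methods
{
    int         (*wc_isdigit) (pg_wchar c, pg_locale_t loc);
    int         (*wc_isalpha) (pg_wchar c, pg_locale_t loc);
    /* ... one slot per classifier, plus wc_toupper/wc_tolower ... */
    pg_wchar    max_chr;        /* 0 = no hard limit, per the cache hunk below */
} pg_ctype_methods;

/* a provider plugs in by supplying one table... */
static const pg_ctype_methods icu_ctype_methods = {
    .wc_isdigit = icu_wc_isdigit,   /* hypothetical callback names */
    .wc_isalpha = icu_wc_isalpha,
    /* ... */
};

...so adding or changing a provider touches one table instead of a case arm in each of a dozen switch statements, as the remaining hunks in this file show.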
-388,231 +269,87 @@ pg_wc_isword(pg_wchar c) static int pg_wc_isupper(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISUPPER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isupper(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isupper_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isupper(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + else + return pg_regex_locale->ctype->wc_isupper(c, pg_regex_locale); } static int pg_wc_islower(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISLOWER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_islower(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_islower(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + else + return pg_regex_locale->ctype->wc_islower(c, pg_regex_locale); } static int pg_wc_isgraph(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISGRAPH)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isgraph(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isgraph(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + else + return pg_regex_locale->ctype->wc_isgraph(c, pg_regex_locale); } static int pg_wc_isprint(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPRINT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isprint(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isprint(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + else + return pg_regex_locale->ctype->wc_isprint(c, pg_regex_locale); } 
static int pg_wc_ispunct(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPUNCT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_ispunct(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + else + return pg_regex_locale->ctype->wc_ispunct(c, pg_regex_locale); } static int pg_wc_isspace(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISSPACE)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isspace(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isspace(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + else + return pg_regex_locale->ctype->wc_isspace(c, pg_regex_locale); } static pg_wchar pg_wc_toupper(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_toupper(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale); } static pg_wchar pg_wc_tolower(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - 
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - /* force C behavior for ASCII characters, per comments above */ - if (pg_regex_locale->is_default && c <= (pg_wchar) 127) - return pg_ascii_tolower((unsigned char) c); - if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_tolower(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale); } @@ -738,37 +475,25 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) * would always be true for production values of MAX_SIMPLE_CHR, but it's * useful to allow it to be small for testing purposes.) */ - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: #if MAX_SIMPLE_CHR >= 127 - max_chr = (pg_wchar) 127; - pcc->cv.cclasscode = -1; + max_chr = (pg_wchar) 127; + pcc->cv.cclasscode = -1; #else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_BUILTIN: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_WIDE: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_1BYTE: -#if MAX_SIMPLE_CHR >= UCHAR_MAX - max_chr = (pg_wchar) UCHAR_MAX; + } + else + { + if (pg_regex_locale->ctype->max_chr != 0 && + pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR) + { + max_chr = pg_regex_locale->ctype->max_chr; pcc->cv.cclasscode = -1; -#else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; -#endif - break; - case PG_REGEX_STRATEGY_ICU: + } + else max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - default: - Assert(false); - max_chr = 0; /* can't get here, but keep compiler quiet */ - break; } /* diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 7b4ddf7a8f5..239641bfbb6 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -232,6 +232,9 @@ libpqrcv_connect(const char *conninfo, bool replication, bool logical, errhint("Target server's authentication method must be changed, or set password_required=false in the subscription parameters."))); } + PQsetNoticeReceiver(conn->streamConn, libpqsrv_notice_receiver, + "received message via replication"); + /* * Set always-secure search path for the cases where the connection is * used to run SQL queries, so malicious users can't get control. @@ -418,31 +421,22 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) "IDENTIFY_SYSTEM", WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive database system identifier and timeline ID from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } /* * IDENTIFY_SYSTEM returns 3 columns in 9.3 and earlier, and 4 columns in * 9.4 and onwards. 
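Two independent cleanups meet in the libpqwalreceiver hunks below: the explicit PQclear calls ahead of ereport(ERROR) are dropped as unnecessary, and libpqrcv_connect installs a notice receiver so messages sent by the remote server surface through the walreceiver's own logging rather than leaking to stderr. A receiver matching libpq's callback contract looks roughly like this; the actual libpqsrv_notice_receiver body may well differ:

#include <stdio.h>
#include <libpq-fe.h>

/* signature required by PQsetNoticeReceiver:
 *     void receiver(void *arg, const PGresult *res)
 */
static void
my_notice_receiver(void *arg, const PGresult *res)
{
    /* arg is the context string supplied at registration time */
    fprintf(stderr, "%s: %s",
            (const char *) arg, PQresultErrorMessage(res));
}

/* registration, mirroring the hunk:
 * PQsetNoticeReceiver(conn, my_notice_receiver,
 *                     "received message via replication");
 */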
*/ if (PQnfields(res) < 3 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", - ntuples, nfields, 1, 3))); - } + PQntuples(res), PQnfields(res), 1, 3))); primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); PQclear(res); @@ -534,7 +528,7 @@ libpqrcv_startstreaming(WalReceiverConn *conn, if (options->logical) appendStringInfoString(&cmd, " LOGICAL"); - appendStringInfo(&cmd, " %X/%X", LSN_FORMAT_ARGS(options->startpoint)); + appendStringInfo(&cmd, " %X/%08X", LSN_FORMAT_ARGS(options->startpoint)); /* * Additional options are different depending on if we are doing logical @@ -604,13 +598,10 @@ libpqrcv_startstreaming(WalReceiverConn *conn, return false; } else if (PQresultStatus(res) != PGRES_COPY_BOTH) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not start WAL streaming: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } PQclear(res); return true; } @@ -718,26 +709,17 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, cmd, WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive timeline history file from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } if (PQnfields(res) != 2 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", - ntuples, nfields))); - } + PQntuples(res), PQnfields(res)))); *filename = pstrdup(PQgetvalue(res, 0, 0)); *len = PQgetlength(res, 0, 1); @@ -841,13 +823,10 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, return -1; } else - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive data from WAL stream: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } } if (rawlen < -1) ereport(ERROR, @@ -971,13 +950,10 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, pfree(cmd.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not create replication slot \"%s\": %s", slotname, pchomp(PQerrorMessage(conn->streamConn))))); - } if (lsn) *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index d25085d3515..cd0e19176fd 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -441,7 +441,8 @@ pa_launch_parallel_worker(void) MySubscription->name, MyLogicalRepWorker->userid, InvalidOid, - dsm_segment_handle(winfo->dsm_seg)); + dsm_segment_handle(winfo->dsm_seg), + false); if (launched) { @@ -777,10 +778,10 @@ LogicalParallelApplyLoop(shm_mq_handle *mqh) /* * The first byte of messages sent from leader apply worker to - * parallel apply workers can only be 'w'. + * parallel apply workers can only be PqReplMsg_WALData. 
*/ c = pq_getmsgbyte(&s); - if (c != 'w') + if (c != PqReplMsg_WALData) elog(ERROR, "unexpected message \"%c\"", c); /* diff --git a/src/backend/replication/logical/conflict.c b/src/backend/replication/logical/conflict.c index 97c4e26b586..2fd3e8bbda5 100644 --- a/src/backend/replication/logical/conflict.c +++ b/src/backend/replication/logical/conflict.c @@ -29,6 +29,7 @@ static const char *const ConflictTypeNames[] = { [CT_UPDATE_EXISTS] = "update_exists", [CT_UPDATE_MISSING] = "update_missing", [CT_DELETE_ORIGIN_DIFFERS] = "delete_origin_differs", + [CT_UPDATE_DELETED] = "update_deleted", [CT_DELETE_MISSING] = "delete_missing", [CT_MULTIPLE_UNIQUE_CONFLICTS] = "multiple_unique_conflicts" }; @@ -176,6 +177,7 @@ errcode_apply_conflict(ConflictType type) case CT_UPDATE_ORIGIN_DIFFERS: case CT_UPDATE_MISSING: case CT_DELETE_ORIGIN_DIFFERS: + case CT_UPDATE_DELETED: case CT_DELETE_MISSING: return errcode(ERRCODE_T_R_SERIALIZATION_FAILURE); } @@ -261,6 +263,26 @@ errdetail_apply_conflict(EState *estate, ResultRelInfo *relinfo, break; + case CT_UPDATE_DELETED: + if (localts) + { + if (localorigin == InvalidRepOriginId) + appendStringInfo(&err_detail, _("The row to be updated was deleted locally in transaction %u at %s."), + localxmin, timestamptz_to_str(localts)); + else if (replorigin_by_oid(localorigin, true, &origin_name)) + appendStringInfo(&err_detail, _("The row to be updated was deleted by a different origin \"%s\" in transaction %u at %s."), + origin_name, localxmin, timestamptz_to_str(localts)); + + /* The origin that modified this row has been removed. */ + else + appendStringInfo(&err_detail, _("The row to be updated was deleted by a non-existent origin in transaction %u at %s."), + localxmin, timestamptz_to_str(localts)); + } + else + appendStringInfo(&err_detail, _("The row to be updated was deleted.")); + + break; + case CT_UPDATE_MISSING: appendStringInfoString(&err_detail, _("Could not find the row to be updated.")); break; diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 10677da56b2..37377f7eb63 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -32,6 +32,7 @@ #include "postmaster/interrupt.h" #include "replication/logicallauncher.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "storage/ipc.h" @@ -91,7 +92,6 @@ static dshash_table *last_start_times = NULL; static bool on_commit_launcher_wakeup = false; -static void ApplyLauncherWakeup(void); static void logicalrep_launcher_onexit(int code, Datum arg); static void logicalrep_worker_onexit(int code, Datum arg); static void logicalrep_worker_detach(void); @@ -100,6 +100,9 @@ static int logicalrep_pa_worker_count(Oid subid); static void logicalrep_launcher_attach_dshmem(void); static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); +static void compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin); +static bool acquire_conflict_slot_if_exists(void); +static void advance_conflict_slot_xmin(TransactionId new_xmin); /* @@ -148,6 +151,7 @@ get_subscription_list(void) sub->owner = subform->subowner; sub->enabled = subform->subenabled; sub->name = pstrdup(NameStr(subform->subname)); + sub->retaindeadtuples = subform->subretaindeadtuples; /* We don't fill fields we are not interested in. 
*/ res = lappend(res, sub); @@ -175,12 +179,14 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, uint16 generation, BackgroundWorkerHandle *handle) { - BgwHandleStatus status; - int rc; + bool result = false; + bool dropped_latch = false; for (;;) { + BgwHandleStatus status; pid_t pid; + int rc; CHECK_FOR_INTERRUPTS(); @@ -189,8 +195,9 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, /* Worker either died or has started. Return false if died. */ if (!worker->in_use || worker->proc) { + result = worker->in_use; LWLockRelease(LogicalRepWorkerLock); - return worker->in_use; + break; } LWLockRelease(LogicalRepWorkerLock); @@ -205,7 +212,7 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, if (generation == worker->generation) logicalrep_worker_cleanup(worker); LWLockRelease(LogicalRepWorkerLock); - return false; + break; /* result is already false */ } /* @@ -220,8 +227,18 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker, { ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); + dropped_latch = true; } } + + /* + * If we had to clear a latch event in order to wait, be sure to restore + * it before exiting. Otherwise caller may miss events. + */ + if (dropped_latch) + SetLatch(MyLatch); + + return result; } /* @@ -296,7 +313,8 @@ logicalrep_workers_find(Oid subid, bool only_running, bool acquire_lock) bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, - Oid relid, dsm_handle subworker_dsm) + Oid relid, dsm_handle subworker_dsm, + bool retain_dead_tuples) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -315,10 +333,13 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, * - must be valid worker type * - tablesync workers are only ones to have relid * - parallel apply worker is the only kind of subworker + * - The replication slot used in conflict detection is created when + * retain_dead_tuples is enabled */ Assert(wtype != WORKERTYPE_UNKNOWN); Assert(is_tablesync_worker == OidIsValid(relid)); Assert(is_parallel_apply_worker == (subworker_dsm != DSM_HANDLE_INVALID)); + Assert(!retain_dead_tuples || MyReplicationSlot); ereport(DEBUG1, (errmsg_internal("starting logical replication worker for subscription \"%s\"", @@ -328,7 +349,7 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, if (max_active_replication_origins == 0) ereport(ERROR, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), - errmsg("cannot start logical replication workers when \"max_active_replication_origins\"=0"))); + errmsg("cannot start logical replication workers when \"max_active_replication_origins\" is 0"))); /* * We need to do the modification of the shared memory under lock so that @@ -441,6 +462,9 @@ retry: worker->stream_fileset = NULL; worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; worker->parallel_apply = is_parallel_apply_worker; + worker->oldest_nonremovable_xid = retain_dead_tuples + ? MyReplicationSlot->data.xmin + : InvalidTransactionId; worker->last_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->last_send_time); TIMESTAMP_NOBEGIN(worker->last_recv_time); @@ -766,6 +790,8 @@ logicalrep_worker_detach(void) } LWLockRelease(LogicalRepWorkerLock); + + list_free(workers); } /* Block concurrent access. 
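The dropped_latch bookkeeping added to WaitForReplicationWorkerAttach below fixes a classic lost-wakeup hazard: the function waits on the shared process latch and resets it, which can silently consume a latch event that some other code set for its caller. The repaired shape, reduced to essentials (worker_attached_or_died stands in for the real checks):

bool        dropped_latch = false;
int         rc;

for (;;)
{
    if (worker_attached_or_died())      /* hypothetical condition */
        break;

    rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                   10L, WAIT_EVENT_BGWORKER_STARTUP);

    if (rc & WL_LATCH_SET)
    {
        ResetLatch(MyLatch);    /* may be eating an event meant for the caller */
        CHECK_FOR_INTERRUPTS();
        dropped_latch = true;
    }
}

/* re-arm the latch so the caller's own wait cannot miss its event */
if (dropped_latch)
    SetLatch(MyLatch);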
*/ @@ -1016,7 +1042,7 @@ logicalrep_launcher_attach_dshmem(void) last_start_times_dsa = dsa_attach(LogicalRepCtx->last_start_dsa); dsa_pin_mapping(last_start_times_dsa); last_start_times = dshash_attach(last_start_times_dsa, &dsh_params, - LogicalRepCtx->last_start_dsh, 0); + LogicalRepCtx->last_start_dsh, NULL); } MemoryContextSwitchTo(oldcontext); @@ -1105,7 +1131,10 @@ ApplyLauncherWakeupAtCommit(void) on_commit_launcher_wakeup = true; } -static void +/* + * Wakeup the launcher immediately. + */ +void ApplyLauncherWakeup(void) { if (LogicalRepCtx->launcher_pid != 0) @@ -1137,6 +1166,12 @@ ApplyLauncherMain(Datum main_arg) */ BackgroundWorkerInitializeConnection(NULL, NULL, 0); + /* + * Acquire the conflict detection slot at startup to ensure it can be + * dropped if no longer needed after a restart. + */ + acquire_conflict_slot_if_exists(); + /* Enter main loop */ for (;;) { @@ -1146,6 +1181,9 @@ ApplyLauncherMain(Datum main_arg) MemoryContext subctx; MemoryContext oldctx; long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + bool can_advance_xmin = true; + bool retain_dead_tuples = false; + TransactionId xmin = InvalidTransactionId; CHECK_FOR_INTERRUPTS(); @@ -1155,7 +1193,14 @@ ApplyLauncherMain(Datum main_arg) ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(subctx); - /* Start any missing workers for enabled subscriptions. */ + /* + * Start any missing workers for enabled subscriptions. + * + * Also, during the iteration through all subscriptions, we compute + * the minimum XID required to protect deleted tuples for conflict + * detection if one of the subscription enables retain_dead_tuples + * option. + */ sublist = get_subscription_list(); foreach(lc, sublist) { @@ -1165,6 +1210,38 @@ ApplyLauncherMain(Datum main_arg) TimestampTz now; long elapsed; + if (sub->retaindeadtuples) + { + retain_dead_tuples = true; + + /* + * Can't advance xmin of the slot unless all the subscriptions + * with retain_dead_tuples are enabled. This is required to + * ensure that we don't advance the xmin of + * CONFLICT_DETECTION_SLOT if one of the subscriptions is not + * enabled. Otherwise, we won't be able to detect conflicts + * reliably for such a subscription even though it has set the + * retain_dead_tuples option. + */ + can_advance_xmin &= sub->enabled; + + /* + * Create a replication slot to retain information necessary + * for conflict detection such as dead tuples, commit + * timestamps, and origins. + * + * The slot is created before starting the apply worker to + * prevent it from unnecessarily maintaining its + * oldest_nonremovable_xid. + * + * The slot is created even for a disabled subscription to + * ensure that conflict-related information is available when + * applying remote changes that occurred before the + * subscription was enabled. + */ + CreateConflictDetectionSlot(); + } + if (!sub->enabled) continue; @@ -1173,7 +1250,27 @@ ApplyLauncherMain(Datum main_arg) LWLockRelease(LogicalRepWorkerLock); if (w != NULL) - continue; /* worker is running already */ + { + /* + * Compute the minimum xmin required to protect dead tuples + * required for conflict detection among all running apply + * workers that enables retain_dead_tuples. + */ + if (sub->retaindeadtuples && can_advance_xmin) + compute_min_nonremovable_xid(w, &xmin); + + /* worker is running already */ + continue; + } + + /* + * Can't advance xmin of the slot unless all the workers + * corresponding to subscriptions with retain_dead_tuples are + * running, disabling the further computation of the minimum + * nonremovable xid. 
+ */ + if (sub->retaindeadtuples) + can_advance_xmin = false; /* * If the worker is eligible to start now, launch it. Otherwise, @@ -1194,10 +1291,22 @@ ApplyLauncherMain(Datum main_arg) (elapsed = TimestampDifferenceMilliseconds(last_start, now)) >= wal_retrieve_retry_interval) { ApplyLauncherSetWorkerStartTime(sub->oid, now); - logicalrep_worker_launch(WORKERTYPE_APPLY, - sub->dbid, sub->oid, sub->name, - sub->owner, InvalidOid, - DSM_HANDLE_INVALID); + if (!logicalrep_worker_launch(WORKERTYPE_APPLY, + sub->dbid, sub->oid, sub->name, + sub->owner, InvalidOid, + DSM_HANDLE_INVALID, + sub->retaindeadtuples)) + { + /* + * We get here either if we failed to launch a worker + * (perhaps for resource-exhaustion reasons) or if we + * launched one but it immediately quit. Either way, it + * seems appropriate to try again after + * wal_retrieve_retry_interval. + */ + wait_time = Min(wait_time, + wal_retrieve_retry_interval); + } } else { @@ -1206,6 +1315,20 @@ ApplyLauncherMain(Datum main_arg) } } + /* + * Drop the CONFLICT_DETECTION_SLOT slot if there is no subscription + * that requires us to retain dead tuples. Otherwise, if required, + * advance the slot's xmin to protect dead tuples required for the + * conflict detection. + */ + if (MyReplicationSlot) + { + if (!retain_dead_tuples) + ReplicationSlotDropAcquired(); + else if (can_advance_xmin) + advance_conflict_slot_xmin(xmin); + } + /* Switch back to original memory context. */ MemoryContextSwitchTo(oldctx); /* Clean the temporary memory. */ @@ -1234,6 +1357,125 @@ ApplyLauncherMain(Datum main_arg) } /* + * Determine the minimum non-removable transaction ID across all apply workers + * for subscriptions that have retain_dead_tuples enabled. Store the result + * in *xmin. + */ +static void +compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin) +{ + TransactionId nonremovable_xid; + + Assert(worker != NULL); + + /* + * The replication slot for conflict detection must be created before the + * worker starts. + */ + Assert(MyReplicationSlot); + + SpinLockAcquire(&worker->relmutex); + nonremovable_xid = worker->oldest_nonremovable_xid; + SpinLockRelease(&worker->relmutex); + + Assert(TransactionIdIsValid(nonremovable_xid)); + + if (!TransactionIdIsValid(*xmin) || + TransactionIdPrecedes(nonremovable_xid, *xmin)) + *xmin = nonremovable_xid; +} + +/* + * Acquire the replication slot used to retain information for conflict + * detection, if it exists. + * + * Return true if successfully acquired, otherwise return false. + */ +static bool +acquire_conflict_slot_if_exists(void) +{ + if (!SearchNamedReplicationSlot(CONFLICT_DETECTION_SLOT, true)) + return false; + + ReplicationSlotAcquire(CONFLICT_DETECTION_SLOT, true, false); + return true; +} + +/* + * Advance the xmin the replication slot used to retain information required + * for conflict detection. 
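compute_min_nonremovable_xid (below) folds each worker's oldest_nonremovable_xid into a running minimum, and the comparison must be TransactionIdPrecedes rather than a plain <, because 32-bit XIDs compare modulo 2^31 once the counter wraps. The fold in isolation (fold_min_xid is an illustrative name):

static void
fold_min_xid(TransactionId candidate, TransactionId *xmin)
{
    Assert(TransactionIdIsValid(candidate));

    /* wraparound-aware: TransactionIdPrecedes compares modulo 2^31 */
    if (!TransactionIdIsValid(*xmin) ||
        TransactionIdPrecedes(candidate, *xmin))
        *xmin = candidate;
}

advance_conflict_slot_xmin then asserts the result never precedes the slot's current xmin, so the conflict-detection slot's horizon is monotonic within a launcher's lifetime; a regression after restart is tolerated, as its comment below explains.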
+ */ +static void +advance_conflict_slot_xmin(TransactionId new_xmin) +{ + Assert(MyReplicationSlot); + Assert(TransactionIdIsValid(new_xmin)); + Assert(TransactionIdPrecedesOrEquals(MyReplicationSlot->data.xmin, new_xmin)); + + /* Return if the xmin value of the slot cannot be advanced */ + if (TransactionIdEquals(MyReplicationSlot->data.xmin, new_xmin)) + return; + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = new_xmin; + MyReplicationSlot->data.xmin = new_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + elog(DEBUG1, "updated xmin: %u", MyReplicationSlot->data.xmin); + + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + + /* + * Like PhysicalConfirmReceivedLocation(), do not save slot information + * each time. This is acceptable because all concurrent transactions on + * the publisher that require the data preceding the slot's xmin should + * have already been applied and flushed on the subscriber before the xmin + * is advanced. So, even if the slot's xmin regresses after a restart, it + * will be advanced again in the next cycle. Therefore, no data required + * for conflict detection will be prematurely removed. + */ + return; +} + +/* + * Create and acquire the replication slot used to retain information for + * conflict detection, if not yet. + */ +void +CreateConflictDetectionSlot(void) +{ + TransactionId xmin_horizon; + + /* Exit early, if the replication slot is already created and acquired */ + if (MyReplicationSlot) + return; + + ereport(LOG, + errmsg("creating replication conflict detection slot")); + + ReplicationSlotCreate(CONFLICT_DETECTION_SLOT, false, RS_PERSISTENT, false, + false, false); + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(false); + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = xmin_horizon; + MyReplicationSlot->data.xmin = xmin_horizon; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + +/* * Is current process the logical replication launcher? */ bool diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index a8d2e024d34..7e363a7c05b 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -29,6 +29,7 @@ #include "postgres.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "access/xlogutils.h" #include "fmgr.h" #include "miscadmin.h" @@ -41,6 +42,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/inval.h" #include "utils/memutils.h" @@ -565,7 +567,7 @@ CreateDecodingContext(XLogRecPtr start_lsn, * kinds of client errors; so the client may wish to check that * confirmed_flush_lsn matches its expectations. 
*/ - elog(LOG, "%X/%X has been already streamed, forwarding to %X/%X", + elog(LOG, "%X/%08X has been already streamed, forwarding to %X/%08X", LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(slot->data.confirmed_flush)); @@ -608,7 +610,7 @@ CreateDecodingContext(XLogRecPtr start_lsn, ereport(LOG, (errmsg("starting logical decoding for slot \"%s\"", NameStr(slot->data.name)), - errdetail("Streaming transactions committing after %X/%X, reading WAL from %X/%X.", + errdetail("Streaming transactions committing after %X/%08X, reading WAL from %X/%08X.", LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(slot->data.restart_lsn)))); @@ -635,7 +637,7 @@ DecodingContextFindStartpoint(LogicalDecodingContext *ctx) /* Initialize from where to start reading WAL. */ XLogBeginRead(ctx->reader, slot->data.restart_lsn); - elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%X", + elog(DEBUG1, "searching for logical decoding starting point, starting at %X/%08X", LSN_FORMAT_ARGS(slot->data.restart_lsn)); /* Wait for a consistent starting point */ @@ -756,7 +758,7 @@ output_plugin_error_callback(void *arg) /* not all callbacks have an associated LSN */ if (state->report_location != InvalidXLogRecPtr) - errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%X", + errcontext("slot \"%s\", output plugin \"%s\", in the %s callback, associated LSN %X/%08X", NameStr(state->ctx->slot->data.name), NameStr(state->ctx->slot->data.plugin), state->callback_name, @@ -1723,7 +1725,7 @@ LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) SpinLockRelease(&slot->mutex); if (got_new_xmin) - elog(DEBUG1, "got new catalog xmin %u at %X/%X", xmin, + elog(DEBUG1, "got new catalog xmin %u at %X/%08X", xmin, LSN_FORMAT_ARGS(current_lsn)); /* candidate already valid with the current flush position, apply */ @@ -1783,7 +1785,7 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart slot->candidate_restart_lsn = restart_lsn; SpinLockRelease(&slot->mutex); - elog(DEBUG1, "got new restart lsn %X/%X at %X/%X", + elog(DEBUG1, "got new restart lsn %X/%08X at %X/%08X", LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(current_lsn)); } @@ -1798,7 +1800,7 @@ LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, XLogRecPtr restart confirmed_flush = slot->data.confirmed_flush; SpinLockRelease(&slot->mutex); - elog(DEBUG1, "failed to increase restart lsn: proposed %X/%X, after %X/%X, current candidate %X/%X, current after %X/%X, flushed up to %X/%X", + elog(DEBUG1, "failed to increase restart lsn: proposed %X/%08X, after %X/%08X, current candidate %X/%08X, current after %X/%08X, flushed up to %X/%08X", LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(current_lsn), LSN_FORMAT_ARGS(candidate_restart_lsn), @@ -1825,10 +1827,26 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) { bool updated_xmin = false; bool updated_restart = false; + XLogRecPtr restart_lsn pg_attribute_unused(); SpinLockAcquire(&MyReplicationSlot->mutex); - MyReplicationSlot->data.confirmed_flush = lsn; + /* remember the old restart lsn */ + restart_lsn = MyReplicationSlot->data.restart_lsn; + + /* + * Prevent moving the confirmed_flush backwards, as this could lead to + * data duplication issues caused by replicating already replicated + * changes. + * + * This can happen when a client acknowledges an LSN it doesn't have + * to do anything for, and thus didn't store persistently. 
After a + * restart, the client can send the prior LSN that it stored + * persistently as an acknowledgement, but we need to ignore such an + * LSN. See similar case handling in CreateDecodingContext. + */ + if (lsn > MyReplicationSlot->data.confirmed_flush) + MyReplicationSlot->data.confirmed_flush = lsn; /* if we're past the location required for bumping xmin, do so */ if (MyReplicationSlot->candidate_xmin_lsn != InvalidXLogRecPtr && @@ -1869,6 +1887,18 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) /* first write new xmin to disk, so we know what's up after a crash */ if (updated_xmin || updated_restart) { +#ifdef USE_INJECTION_POINTS + XLogSegNo seg1, + seg2; + + XLByteToSeg(restart_lsn, seg1, wal_segment_size); + XLByteToSeg(MyReplicationSlot->data.restart_lsn, seg2, wal_segment_size); + + /* trigger injection point, but only if segment changes */ + if (seg1 != seg2) + INJECTION_POINT("logical-replication-slot-advance-segment", NULL); +#endif + ReplicationSlotMarkDirty(); ReplicationSlotSave(); elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart); @@ -1893,7 +1923,14 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) else { SpinLockAcquire(&MyReplicationSlot->mutex); - MyReplicationSlot->data.confirmed_flush = lsn; + + /* + * Prevent moving the confirmed_flush backwards. See comments above + * for the details. + */ + if (lsn > MyReplicationSlot->data.confirmed_flush) + MyReplicationSlot->data.confirmed_flush = lsn; + SpinLockRelease(&MyReplicationSlot->mutex); } } diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index a17bacf88e7..87f10e50dcc 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -826,9 +826,9 @@ StartupReplicationOrigin(void) last_state++; ereport(LOG, - (errmsg("recovered replication state of node %d to %X/%X", - disk_state.roident, - LSN_FORMAT_ARGS(disk_state.remote_lsn)))); + errmsg("recovered replication state of node %d to %X/%08X", + disk_state.roident, + LSN_FORMAT_ARGS(disk_state.remote_lsn))); } /* now check checksum */ diff --git a/src/backend/replication/logical/proto.c b/src/backend/replication/logical/proto.c index 1a352b542dc..1b3d9eb49dd 100644 --- a/src/backend/replication/logical/proto.c +++ b/src/backend/replication/logical/proto.c @@ -809,7 +809,7 @@ logicalrep_write_tuple(StringInfo out, Relation rel, TupleTableSlot *slot, continue; } - if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(values[i])) + if (att->attlen == -1 && VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(values[i]))) { /* * Unchanged toasted datum. (Note that we don't promise to detect diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 67655111875..34cf05668ae 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -109,10 +109,22 @@ #include "storage/procarray.h" #include "storage/sinval.h" #include "utils/builtins.h" +#include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/relfilenumbermap.h" +/* + * Each transaction has an 8MB limit for invalidation messages distributed from + * other transactions. This limit is set considering scenarios with many + * concurrent logical decoding operations. 
When the distributed invalidation + * messages reach this threshold, the transaction is marked as + * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost + * some inval messages and hence don't know what needs to be invalidated. + */ +#define MAX_DISTR_INVAL_MSG_PER_TXN \ + ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage)) + /* entry for a hash table we use to map from xid to our transaction state */ typedef struct ReorderBufferTXNByIdEnt { @@ -472,6 +484,12 @@ ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) txn->invalidations = NULL; } + if (txn->invalidations_distributed) + { + pfree(txn->invalidations_distributed); + txn->invalidations_distributed = NULL; + } + /* Reset the toast hash */ ReorderBufferToastReset(rb, txn); @@ -1397,7 +1415,7 @@ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) int32 off; /* nothing there anymore */ - if (state->heap->bh_size == 0) + if (binaryheap_empty(state->heap)) return NULL; off = DatumGetInt32(binaryheap_first(state->heap)); @@ -2581,7 +2599,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, if (++changes_count >= CHANGES_THRESHOLD) { - rb->update_progress_txn(rb, txn, change->lsn); + rb->update_progress_txn(rb, txn, prev_lsn); changes_count = 0; } } @@ -2661,7 +2679,17 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + if (rbtxn_distr_inval_overflowed(txn)) + { + Assert(txn->ninvalidations_distributed == 0); + InvalidateSystemCaches(); + } + else + { + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed, + txn->invalidations_distributed); + } if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); @@ -2710,8 +2738,17 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(txn->ninvalidations, - txn->invalidations); + if (rbtxn_distr_inval_overflowed(txn)) + { + Assert(txn->ninvalidations_distributed == 0); + InvalidateSystemCaches(); + } + else + { + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed, + txn->invalidations_distributed); + } if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); @@ -3060,7 +3097,8 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, * We might have decoded changes for this transaction that could load * the cache as per the current transaction's view (consider DDL's * happened in this transaction). We don't want the decoding of future - * transactions to use those cache entries so execute invalidations. + * transactions to use those cache entries so execute only the inval + * messages in this transaction. */ if (txn->ninvalidations > 0) ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, @@ -3147,9 +3185,10 @@ ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) txn->final_lsn = lsn; /* - * Process cache invalidation messages if there are any. Even if we're not - * interested in the transaction's contents, it could have manipulated the - * catalog and we need to update the caches according to that. + * Process only cache invalidation messages in this transaction if there + * are any. 
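As a rough illustration of the overflow cap defined above: once a transaction's accumulated distributed messages would reach MAX_DISTR_INVAL_MSG_PER_TXN, the array is discarded and only an overflowed flag is kept, and replay then falls back to invalidating all caches, as the rbtxn_distr_inval_overflowed branches in the hunks above show. A self-contained sketch under those assumptions (the 16-byte Msg stands in for SharedInvalidationMessage, and the flag bit is illustrative):

#include <stdlib.h>
#include <string.h>

typedef struct { char payload[16]; } Msg;   /* stand-in for SharedInvalidationMessage */

#define MAX_DISTR_INVAL_MSG_PER_TXN ((8 * 1024 * 1024) / sizeof(Msg))
#define RBTXN_DISTR_INVAL_OVERFLOWED 0x1    /* illustrative flag bit */

typedef struct
{
    unsigned flags;
    size_t   nmsgs;
    Msg     *msgs;
} Txn;

static void
add_distributed_inval(Txn *txn, const Msg *new_msgs, size_t n)
{
    if (txn->flags & RBTXN_DISTR_INVAL_OVERFLOWED)
        return;                     /* already overflowed; nothing is kept */

    if (txn->nmsgs + n >= MAX_DISTR_INVAL_MSG_PER_TXN)
    {
        /* Cap exceeded: drop the array, remember only the overflow. */
        free(txn->msgs);
        txn->msgs = NULL;
        txn->nmsgs = 0;
        txn->flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
        return;
    }

    txn->msgs = realloc(txn->msgs, (txn->nmsgs + n) * sizeof(Msg));
    memcpy(txn->msgs + txn->nmsgs, new_msgs, n * sizeof(Msg));
    txn->nmsgs += n;
}

int main(void)
{
    Txn txn = {0};
    Msg one = {{0}};

    add_distributed_inval(&txn, &one, 1);
    add_distributed_inval(&txn, &one, MAX_DISTR_INVAL_MSG_PER_TXN); /* trips the cap */
    return (txn.flags & RBTXN_DISTR_INVAL_OVERFLOWED) ? 0 : 1;
}

At replay time an overflowed transaction takes the InvalidateSystemCaches() path rather than executing individual messages, trading cache efficiency for correctness.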
Even if we're not interested in the transaction's contents, it + * could have manipulated the catalog and we need to update the caches + * according to that. */ if (txn->base_snapshot != NULL && txn->ninvalidations > 0) ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, @@ -3422,6 +3461,57 @@ ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, } /* + * Add new invalidation messages to the reorder buffer queue. + */ +static void +ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferChange *change; + + change = ReorderBufferAllocChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * A helper function for ReorderBufferAddInvalidations() and + * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation + * messages to the **invals_out. + */ +static void +ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out, + uint32 *ninvals_out, + SharedInvalidationMessage *msgs_new, + Size nmsgs_new) +{ + if (*ninvals_out == 0) + { + *ninvals_out = nmsgs_new; + *invals_out = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs_new); + memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new); + } + else + { + /* Enlarge the array of inval messages */ + *invals_out = (SharedInvalidationMessage *) + repalloc(*invals_out, sizeof(SharedInvalidationMessage) * + (*ninvals_out + nmsgs_new)); + memcpy(*invals_out + *ninvals_out, msgs_new, + nmsgs_new * sizeof(SharedInvalidationMessage)); + *ninvals_out += nmsgs_new; + } +} + +/* * Accumulate the invalidations for executing them later. * * This needs to be called for each XLOG_XACT_INVALIDATIONS message and @@ -3441,7 +3531,6 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, { ReorderBufferTXN *txn; MemoryContext oldcontext; - ReorderBufferChange *change; txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); @@ -3456,35 +3545,76 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, Assert(nmsgs > 0); - /* Accumulate invalidations. */ - if (txn->ninvalidations == 0) - { - txn->ninvalidations = nmsgs; - txn->invalidations = (SharedInvalidationMessage *) - palloc(sizeof(SharedInvalidationMessage) * nmsgs); - memcpy(txn->invalidations, msgs, - sizeof(SharedInvalidationMessage) * nmsgs); - } - else + ReorderBufferAccumulateInvalidations(&txn->invalidations, + &txn->ninvalidations, + msgs, nmsgs); + + ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accumulate the invalidations distributed by other committed transactions + * for executing them later. + * + * This function is similar to ReorderBufferAddInvalidations() but stores + * the given inval messages to the txn->invalidations_distributed with the + * overflow check. + * + * This needs to be called by committed transactions to distribute their + * inval messages to in-progress transactions. 
+ */ +void +ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + MemoryContext oldcontext; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* + * Collect all the invalidations under the top transaction, if available, + * so that we can execute them all together. See comments + * ReorderBufferAddInvalidations. + */ + txn = rbtxn_get_toptxn(txn); + + Assert(nmsgs > 0); + + if (!rbtxn_distr_inval_overflowed(txn)) { - txn->invalidations = (SharedInvalidationMessage *) - repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) * - (txn->ninvalidations + nmsgs)); + /* + * Check the transaction has enough space for storing distributed + * invalidation messages. + */ + if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN) + { + /* + * Mark the invalidation message as overflowed and free up the + * messages accumulated so far. + */ + txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED; - memcpy(txn->invalidations + txn->ninvalidations, msgs, - nmsgs * sizeof(SharedInvalidationMessage)); - txn->ninvalidations += nmsgs; + if (txn->invalidations_distributed) + { + pfree(txn->invalidations_distributed); + txn->invalidations_distributed = NULL; + txn->ninvalidations_distributed = 0; + } + } + else + ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed, + &txn->ninvalidations_distributed, + msgs, nmsgs); } - change = ReorderBufferAllocChange(rb); - change->action = REORDER_BUFFER_CHANGE_INVALIDATION; - change->data.inval.ninvalidations = nmsgs; - change->data.inval.invalidations = (SharedInvalidationMessage *) - palloc(sizeof(SharedInvalidationMessage) * nmsgs); - memcpy(change->data.inval.invalidations, msgs, - sizeof(SharedInvalidationMessage) * nmsgs); - - ReorderBufferQueueChange(rb, xid, lsn, change, false); + /* Queue the invalidation messages into the transaction */ + ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs); MemoryContextSwitchTo(oldcontext); } @@ -4787,7 +4917,7 @@ StartupReorderBuffer(void) continue; /* if it cannot be a slot, skip the directory */ - if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2)) continue; /* diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 656e66e0ae0..37738440113 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -211,9 +211,9 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, * impact the users, so we used DEBUG1 level to log the message. */ ereport(slot->data.persistency == RS_TEMPORARY ? 
LOG : DEBUG1, - errmsg("could not synchronize replication slot \"%s\" because remote slot precedes local slot", + errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("The remote slot has LSN %X/%X and catalog xmin %u, but the local slot has LSN %X/%X and catalog xmin %u.", + errdetail("Synchronization could lead to data loss, because the remote slot needs WAL at LSN %X/%08X and catalog xmin %u, but the standby has LSN %X/%08X and catalog xmin %u.", LSN_FORMAT_ARGS(remote_slot->restart_lsn), remote_slot->catalog_xmin, LSN_FORMAT_ARGS(slot->data.restart_lsn), @@ -275,7 +275,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, ereport(ERROR, errmsg_internal("synchronized confirmed_flush for slot \"%s\" differs from remote slot", remote_slot->name), - errdetail_internal("Remote slot has LSN %X/%X but local slot has LSN %X/%X.", + errdetail_internal("Remote slot has LSN %X/%08X but local slot has LSN %X/%08X.", LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), LSN_FORMAT_ARGS(slot->data.confirmed_flush))); } @@ -593,7 +593,7 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid) { ereport(LOG, errmsg("could not synchronize replication slot \"%s\"", remote_slot->name), - errdetail("Logical decoding could not find consistent point from local slot's LSN %X/%X.", + errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%08X.", LSN_FORMAT_ARGS(slot->data.restart_lsn))); return false; @@ -642,7 +642,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("skipping slot synchronization because the received slot sync" - " LSN %X/%X for slot \"%s\" is ahead of the standby position %X/%X", + " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X", LSN_FORMAT_ARGS(remote_slot->confirmed_lsn), remote_slot->name, LSN_FORMAT_ARGS(latestFlushPtr))); @@ -733,7 +733,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid) ereport(ERROR, errmsg_internal("cannot synchronize local slot \"%s\"", remote_slot->name), - errdetail_internal("Local slot's start streaming location LSN(%X/%X) is ahead of remote slot's LSN(%X/%X).", + errdetail_internal("Local slot's start streaming location LSN(%X/%08X) is ahead of remote slot's LSN(%X/%08X).", LSN_FORMAT_ARGS(slot->data.confirmed_flush), LSN_FORMAT_ARGS(remote_slot->confirmed_lsn))); @@ -1059,14 +1059,14 @@ ValidateSlotSyncParams(int elevel) { /* * Logical slot sync/creation requires wal_level >= logical. - * - * Since altering the wal_level requires a server restart, so error out in - * this case regardless of elevel provided by caller. 
*/ if (wal_level < WAL_LEVEL_LOGICAL) - ereport(ERROR, + { + ereport(elevel, errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("replication slot synchronization requires \"wal_level\" >= \"logical\"")); + return false; + } /* * A physical replication slot(primary_slot_name) is required on the diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 0d7bddbe4ed..8532bfd27e5 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -774,7 +774,7 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact if (rbtxn_is_prepared(txn)) continue; - elog(DEBUG2, "adding a new snapshot and invalidations to %u at %X/%X", + elog(DEBUG2, "adding a new snapshot and invalidations to %u at %X/%08X", txn->xid, LSN_FORMAT_ARGS(lsn)); /* @@ -794,6 +794,13 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact * contents built by the current transaction even after its decoding, * which should have been invalidated due to concurrent catalog * changing transaction. + * + * Distribute only the invalidation messages generated by the current + * committed transaction. Invalidation messages received from other + * transactions would have already been propagated to the relevant + * in-progress transactions. This transaction would have processed + * those invalidations, ensuring that subsequent transactions observe + * a consistent cache state. */ if (txn->xid != xid) { @@ -807,8 +814,9 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact { Assert(msgs != NULL); - ReorderBufferAddInvalidations(builder->reorder, txn->xid, lsn, - ninvalidations, msgs); + ReorderBufferAddDistributedInvalidations(builder->reorder, + txn->xid, lsn, + ninvalidations, msgs); } } } @@ -1263,10 +1271,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->initial_xmin_horizon)) { ereport(DEBUG1, - (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", - LSN_FORMAT_ARGS(lsn)), - errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", - builder->initial_xmin_horizon, running->oldestRunningXid))); + errmsg_internal("skipping snapshot at %X/%08X while building logical decoding snapshot, xmin horizon too low", + LSN_FORMAT_ARGS(lsn)), + errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", + builder->initial_xmin_horizon, running->oldestRunningXid)); SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); @@ -1302,9 +1310,9 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = InvalidTransactionId; ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("There are no running transactions."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no running transactions.")); return false; } @@ -1351,10 +1359,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn Assert(TransactionIdIsNormal(builder->xmax)); ereport(LOG, - (errmsg("logical decoding found initial starting point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); + errmsg("logical decoding found initial starting point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + 
errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid)); SnapBuildWaitSnapshot(running, running->nextXid); } @@ -1375,10 +1383,10 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = running->nextXid; ereport(LOG, - (errmsg("logical decoding found initial consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); + errmsg("logical decoding found initial consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Waiting for transactions (approximately %d) older than %u to end.", + running->xcnt, running->nextXid)); SnapBuildWaitSnapshot(running, running->nextXid); } @@ -1399,9 +1407,9 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn builder->next_phase_at = InvalidTransactionId; ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("There are no old transactions anymore."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("There are no old transactions anymore.")); } /* @@ -1905,9 +1913,9 @@ SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) Assert(builder->state == SNAPBUILD_CONSISTENT); ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - LSN_FORMAT_ARGS(lsn)), - errdetail("Logical decoding will begin using saved snapshot."))); + errmsg("logical decoding found consistent point at %X/%08X", + LSN_FORMAT_ARGS(lsn)), + errdetail("Logical decoding will begin using saved snapshot.")); return true; snapshot_not_interesting: @@ -2053,7 +2061,7 @@ SnapBuildSnapshotExists(XLogRecPtr lsn) int ret; struct stat stat_buf; - sprintf(path, "%s/%X-%X.snap", + sprintf(path, "%s/%08X-%08X.snap", PG_LOGICAL_SNAPSHOTS_DIR, LSN_FORMAT_ARGS(lsn)); diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index 8e1e8762f62..d3356bc84ee 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -316,7 +316,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); /* * End streaming so that LogRepWorkerWalRcvConn can be used to drop @@ -425,6 +426,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) ListCell *lc; bool started_tx = false; bool should_exit = false; + Relation rel = NULL; Assert(!IsTransactionState()); @@ -492,7 +494,17 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * worker to remove the origin tracking as if there is any * error while dropping we won't restart it to drop the * origin. So passing missing_ok = true. + * + * Lock the subscription and origin in the same order as we + * are doing during DDL commands to avoid deadlocks. See + * AlterSubscription_refresh. 
*/ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, + 0, AccessShareLock); + + if (!rel) + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, rstate->relid, originname, @@ -504,7 +516,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) */ UpdateSubscriptionRelState(MyLogicalRepWorker->subid, rstate->relid, rstate->state, - rstate->lsn); + rstate->lsn, true); } } else @@ -555,7 +567,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * This is required to avoid any undetected deadlocks * due to any existing lock as deadlock detector won't * be able to detect the waits on the latch. + * + * Also close any tables prior to the commit. */ + if (rel) + { + table_close(rel, NoLock); + rel = NULL; + } CommitTransactionCommand(); pgstat_report_stat(false); } @@ -603,20 +622,31 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) TimestampDifferenceExceeds(hentry->last_start_time, now, wal_retrieve_retry_interval)) { - logicalrep_worker_launch(WORKERTYPE_TABLESYNC, - MyLogicalRepWorker->dbid, - MySubscription->oid, - MySubscription->name, - MyLogicalRepWorker->userid, - rstate->relid, - DSM_HANDLE_INVALID); + /* + * Set the last_start_time even if we fail to start + * the worker, so that we won't retry until + * wal_retrieve_retry_interval has elapsed. + */ hentry->last_start_time = now; + (void) logicalrep_worker_launch(WORKERTYPE_TABLESYNC, + MyLogicalRepWorker->dbid, + MySubscription->oid, + MySubscription->name, + MyLogicalRepWorker->userid, + rstate->relid, + DSM_HANDLE_INVALID, + false); } } } } } + /* Close table if opened */ + if (rel) + table_close(rel, NoLock); + + if (started_tx) { /* @@ -1408,7 +1438,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); pgstat_report_stat(true); @@ -1541,14 +1572,15 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, SUBREL_STATE_FINISHEDCOPY, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); copy_table_done: elog(DEBUG1, - "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", + "LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%08X", originname, LSN_FORMAT_ARGS(*origin_startpos)); /* diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 4151a4b2a96..0fdc5de57ba 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -109,13 +109,6 @@ * If ever a user needs to be aware of the tri-state value, they can fetch it * from the pg_subscription catalog (see column subtwophasestate). * - * We don't allow to toggle two_phase option of a subscription because it can - * lead to an inconsistent replica. Consider, initially, it was on and we have - * received some prepare then we turn it off, now at commit time the server - * will send the entire transaction data along with the commit. With some more - * analysis, we can allow changing this option from off to on but not sure if - * that alone would be useful. 
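The reordering in the tablesync hunk above is a retry throttle: the attempt time is recorded before the launch is tried, so even a failed launch is not retried until wal_retrieve_retry_interval has elapsed. A sketch of the pattern (names, the interval constant, and the stub launcher are illustrative, not the patch's code):

#include <stdbool.h>
#include <time.h>

#define RETRY_INTERVAL_SECS 5       /* plays the role of wal_retrieve_retry_interval */

static time_t last_start_time = 0;

static bool
try_launch_worker(void)
{
    return false;                   /* stub: pretend the launch failed */
}

static void
maybe_launch_worker(void)
{
    time_t now = time(NULL);

    if (now - last_start_time < RETRY_INTERVAL_SECS)
        return;                     /* throttled: too soon to retry */

    /*
     * Record the attempt time first: a failed launch must also wait out
     * the interval, otherwise failures would cause a tight retry loop.
     */
    last_start_time = now;
    (void) try_launch_worker();
}

int main(void)
{
    maybe_launch_worker();          /* attempts (and fails) */
    maybe_launch_worker();          /* throttled: within the interval */
    return 0;
}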
- * * Finally, to avoid problems mentioned in previous paragraphs from any * subsequent (not READY) tablesyncs (need to toggle two_phase option from 'on' * to 'off' and then again back to 'on') there is a restriction for @@ -139,6 +132,96 @@ * failover = true when creating the subscription. Enabling failover allows us * to smoothly transition to the promoted standby, ensuring that we can * subscribe to the new primary without losing any data. + * + * RETAIN DEAD TUPLES + * ---------------------- + * Each apply worker that enabled retain_dead_tuples option maintains a + * non-removable transaction ID (oldest_nonremovable_xid) in shared memory to + * prevent dead rows from being removed prematurely when the apply worker still + * needs them to detect update_deleted conflicts. Additionally, this helps to + * retain the required commit_ts module information, which further helps to + * detect update_origin_differs and delete_origin_differs conflicts reliably, as + * otherwise, vacuum freeze could remove the required information. + * + * The logical replication launcher manages an internal replication slot named + * "pg_conflict_detection". It asynchronously aggregates the non-removable + * transaction ID from all apply workers to determine the appropriate xmin for + * the slot, thereby retaining necessary tuples. + * + * The non-removable transaction ID in the apply worker is advanced to the + * oldest running transaction ID once all concurrent transactions on the + * publisher have been applied and flushed locally. The process involves: + * + * - RDT_GET_CANDIDATE_XID: + * Call GetOldestActiveTransactionId() to take oldestRunningXid as the + * candidate xid. + * + * - RDT_REQUEST_PUBLISHER_STATUS: + * Send a message to the walsender requesting the publisher status, which + * includes the latest WAL write position and information about transactions + * that are in the commit phase. + * + * - RDT_WAIT_FOR_PUBLISHER_STATUS: + * Wait for the status from the walsender. After receiving the first status, + * do not proceed if there are concurrent remote transactions that are still + * in the commit phase. These transactions might have been assigned an + * earlier commit timestamp but have not yet written the commit WAL record. + * Continue to request the publisher status (RDT_REQUEST_PUBLISHER_STATUS) + * until all these transactions have completed. + * + * - RDT_WAIT_FOR_LOCAL_FLUSH: + * Advance the non-removable transaction ID if the current flush location has + * reached or surpassed the last received WAL position. + * + * The overall state progression is: GET_CANDIDATE_XID -> + * REQUEST_PUBLISHER_STATUS -> WAIT_FOR_PUBLISHER_STATUS -> (loop to + * REQUEST_PUBLISHER_STATUS till concurrent remote transactions end) -> + * WAIT_FOR_LOCAL_FLUSH -> loop back to GET_CANDIDATE_XID. + * + * Retaining the dead tuples for this period is sufficient for ensuring + * eventual consistency using last-update-wins strategy, as dead tuples are + * useful for detecting conflicts only during the application of concurrent + * transactions from remote nodes. After applying and flushing all remote + * transactions that occurred concurrently with the tuple DELETE, any + * subsequent UPDATE from a remote node should have a later timestamp. In such + * cases, it is acceptable to detect an update_missing scenario and convert the + * UPDATE to an INSERT when applying it. 
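Both halves of this rule, including the earlier-timestamp case the comment turns to next, condense to a timestamp comparison: an UPDATE that finds no live tuple is update_deleted (and should be ignored under last-update-wins) only when a matching dead tuple exists whose delete timestamp is newer; otherwise it is update_missing and may be applied as an INSERT. A compressed sketch of that decision, with stand-in types rather than the patch's detection code:

#include <stdbool.h>
#include <stdint.h>

typedef int64_t TimestampTz;        /* stand-in for PostgreSQL's type */

typedef enum { CT_UPDATE_MISSING, CT_UPDATE_DELETED } ConflictType;

/*
 * Classify a remote UPDATE whose target row is not visible, given whether
 * a matching dead tuple was found and when it was deleted.
 */
static ConflictType
classify_update(bool dead_tuple_found, TimestampTz delete_ts,
                TimestampTz update_ts)
{
    /*
     * A dead tuple with a newer delete timestamp means the UPDATE lost a
     * last-update-wins race against the DELETE: report update_deleted so
     * the UPDATE can be ignored. Anything else is update_missing, where
     * applying the UPDATE as an INSERT is acceptable.
     */
    if (dead_tuple_found && update_ts < delete_ts)
        return CT_UPDATE_DELETED;
    return CT_UPDATE_MISSING;
}

int main(void)
{
    /* UPDATE (ts=90) older than DELETE (ts=100): ignore it. */
    return classify_update(true, 100, 90) == CT_UPDATE_DELETED ? 0 : 1;
}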
But, for concurrent remote
+ transactions with earlier timestamps than the DELETE, detecting
+ update_deleted is necessary, as the UPDATEs in remote transactions should be
+ ignored if their timestamp is earlier than that of the dead tuples.
+ *
+ * Note that advancing the non-removable transaction ID is not supported if the
+ * publisher is also a physical standby. This is because the logical walsender
+ * on the standby can only get the WAL replay position but there may be more
+ * WALs that are being replicated from the primary and those WALs could have
+ * earlier commit timestamps.
+ *
+ * Similarly, when the publisher has subscribed to another publisher,
+ * information necessary for conflict detection cannot be retained for
+ * changes from origins other than the publisher. This is because the publisher
+ * lacks the information on concurrent transactions of other publishers to
+ * which it subscribes. As the information on concurrent transactions is
+ * unavailable beyond the subscriber's immediate publishers, the non-removable
+ * transaction ID might be advanced prematurely before changes from other
+ * origins have been fully applied.
+ *
+ * XXX Retaining information for changes from other origins might be possible
+ * by requesting the subscription on that origin to enable retain_dead_tuples
+ * and fetching the conflict detection slot.xmin along with the publisher's
+ * status. In the RDT_WAIT_FOR_PUBLISHER_STATUS phase, the apply worker could
+ * wait for the remote slot's xmin to reach the oldest active transaction ID,
+ * ensuring that all transactions from other origins have been applied on the
+ * publisher, thereby getting the latest WAL position that includes all
+ * concurrent changes. However, this approach may impact performance, so it
+ * might not be worth the effort.
+ *
+ * XXX It seems feasible to get the latest commit's WAL location from the
+ * publisher and wait till that is applied. However, we can't do that
+ * because commit timestamps can regress as a commit with a later LSN is not
+ * guaranteed to have a later timestamp than those with earlier LSNs. Having
+ * said that, even if that is possible, it won't improve performance much as
+ * the apply side always lags and moves slowly compared with the transactions
+ * on the publisher.
 *-------------------------------------------------------------------------
 */
@@ -147,6 +230,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
+#include "access/commit_ts.h"
 #include "access/table.h"
 #include "access/tableam.h"
 #include "access/twophase.h"
@@ -155,6 +239,7 @@
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_subscription.h"
 #include "catalog/pg_subscription_rel.h"
+#include "commands/subscriptioncmds.h"
 #include "commands/tablecmds.h"
 #include "commands/trigger.h"
 #include "executor/executor.h"
@@ -173,12 +258,14 @@
 #include "replication/logicalrelation.h"
 #include "replication/logicalworker.h"
 #include "replication/origin.h"
+#include "replication/slot.h"
 #include "replication/walreceiver.h"
 #include "replication/worker_internal.h"
 #include "rewrite/rewriteHandler.h"
 #include "storage/buffile.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/procarray.h"
 #include "tcop/tcopprot.h"
 #include "utils/acl.h"
 #include "utils/dynahash.h"
@@ -275,6 +362,78 @@ typedef enum
 TRANS_PARALLEL_APPLY,
 } TransApplyAction;
+/*
+ * The phases involved in advancing the non-removable transaction ID.
+ *
+ * See comments atop worker.c for details of the transition between these
+ * phases. 
+ */ +typedef enum +{ + RDT_GET_CANDIDATE_XID, + RDT_REQUEST_PUBLISHER_STATUS, + RDT_WAIT_FOR_PUBLISHER_STATUS, + RDT_WAIT_FOR_LOCAL_FLUSH +} RetainDeadTuplesPhase; + +/* + * Critical information for managing phase transitions within the + * RetainDeadTuplesPhase. + */ +typedef struct RetainDeadTuplesData +{ + RetainDeadTuplesPhase phase; /* current phase */ + XLogRecPtr remote_lsn; /* WAL write position on the publisher */ + + /* + * Oldest transaction ID that was in the commit phase on the publisher. + * Use FullTransactionId to prevent issues with transaction ID wraparound, + * where a new remote_oldestxid could falsely appear to originate from the + * past and block advancement. + */ + FullTransactionId remote_oldestxid; + + /* + * Next transaction ID to be assigned on the publisher. Use + * FullTransactionId for consistency and to allow straightforward + * comparisons with remote_oldestxid. + */ + FullTransactionId remote_nextxid; + + TimestampTz reply_time; /* when the publisher responds with status */ + + /* + * Publisher transaction ID that must be awaited to complete before + * entering the final phase (RDT_WAIT_FOR_LOCAL_FLUSH). Use + * FullTransactionId for the same reason as remote_nextxid. + */ + FullTransactionId remote_wait_for; + + TransactionId candidate_xid; /* candidate for the non-removable + * transaction ID */ + TimestampTz flushpos_update_time; /* when the remote flush position was + * updated in final phase + * (RDT_WAIT_FOR_LOCAL_FLUSH) */ + + /* + * The following fields are used to determine the timing for the next + * round of transaction ID advancement. + */ + TimestampTz last_recv_time; /* when the last message was received */ + TimestampTz candidate_xid_time; /* when the candidate_xid is decided */ + int xid_advance_interval; /* how much time (ms) to wait before + * attempting to advance the + * non-removable transaction ID */ +} RetainDeadTuplesData; + +/* + * The minimum (100ms) and maximum (3 minutes) intervals for advancing + * non-removable transaction IDs. The maximum interval is a bit arbitrary but + * is sufficient to not cause any undue network traffic. + */ +#define MIN_XID_ADVANCE_INTERVAL 100 +#define MAX_XID_ADVANCE_INTERVAL 180000 + /* errcontext tracker */ static ApplyErrorCallbackArg apply_error_callback_arg = { @@ -339,6 +498,13 @@ static XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; /* BufFile handle of the current streaming file */ static BufFile *stream_fd = NULL; +/* + * The remote WAL position that has been applied and flushed locally. We record + * and use this information both while sending feedback to the server and + * advancing oldest_nonremovable_xid. 
+ */
+static XLogRecPtr last_flushpos = InvalidXLogRecPtr;
+
 typedef struct SubXactInfo
 {
 TransactionId xid; /* XID of the subxact */
@@ -379,6 +545,19 @@ static void stream_close_file(void);
 static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply);
+static void maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data,
+ bool status_received);
+static bool can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data);
+static void process_rdt_phase_transition(RetainDeadTuplesData *rdt_data,
+ bool status_received);
+static void get_candidate_xid(RetainDeadTuplesData *rdt_data);
+static void request_publisher_status(RetainDeadTuplesData *rdt_data);
+static void wait_for_publisher_status(RetainDeadTuplesData *rdt_data,
+ bool status_received);
+static void wait_for_local_flush(RetainDeadTuplesData *rdt_data);
+static void adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data,
+ bool new_xid_found);
+
 static void apply_handle_commit_internal(LogicalRepCommitData *commit_data);
 static void apply_handle_insert_internal(ApplyExecutionData *edata,
 ResultRelInfo *relinfo,
@@ -397,6 +576,12 @@ static bool FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel
 Oid localidxoid,
 TupleTableSlot *remoteslot,
 TupleTableSlot **localslot);
+static bool FindDeletedTupleInLocalRel(Relation localrel,
+ Oid localidxoid,
+ TupleTableSlot *remoteslot,
+ TransactionId *delete_xid,
+ RepOriginId *delete_origin,
+ TimestampTz *delete_time);
 static void apply_handle_tuple_routing(ApplyExecutionData *edata,
 TupleTableSlot *remoteslot,
 LogicalRepTupleData *newtup,
@@ -1023,7 +1208,7 @@ apply_handle_commit(StringInfo s)
 if (commit_data.commit_lsn != remote_final_lsn)
 ereport(ERROR,
 (errcode(ERRCODE_PROTOCOL_VIOLATION),
- errmsg_internal("incorrect commit LSN %X/%X in commit message (expected %X/%X)",
+ errmsg_internal("incorrect commit LSN %X/%08X in commit message (expected %X/%08X)",
 LSN_FORMAT_ARGS(commit_data.commit_lsn),
 LSN_FORMAT_ARGS(remote_final_lsn))));
@@ -1115,7 +1300,7 @@ apply_handle_prepare(StringInfo s)
 if (prepare_data.prepare_lsn != remote_final_lsn)
 ereport(ERROR,
 (errcode(ERRCODE_PROTOCOL_VIOLATION),
- errmsg_internal("incorrect prepare LSN %X/%X in prepare message (expected %X/%X)",
+ errmsg_internal("incorrect prepare LSN %X/%08X in prepare message (expected %X/%08X)",
 LSN_FORMAT_ARGS(prepare_data.prepare_lsn),
 LSN_FORMAT_ARGS(remote_final_lsn))));
@@ -2733,17 +2918,31 @@ apply_handle_update_internal(ApplyExecutionData *edata,
 }
 else
 {
+ ConflictType type;
 TupleTableSlot *newslot = localslot;
+ /*
+ * Detecting whether the tuple was recently deleted or never existed
+ * is crucial to avoid misleading the user during conflict handling.
+ */
+ if (FindDeletedTupleInLocalRel(localrel, localindexoid, remoteslot,
+ &conflicttuple.xmin,
+ &conflicttuple.origin,
+ &conflicttuple.ts) &&
+ conflicttuple.origin != replorigin_session_origin)
+ type = CT_UPDATE_DELETED;
+ else
+ type = CT_UPDATE_MISSING;
+
 /* Store the new tuple for conflict reporting */
 slot_store_data(newslot, relmapentry, newtup);
 /*
- * The tuple to be updated could not be found. Do nothing except for
- * emitting a log message.
+ * The tuple to be updated could not be found or was deleted. Do
+ * nothing except for emitting a log message.
 */
- ReportApplyConflict(estate, relinfo, LOG, CT_UPDATE_MISSING,
- remoteslot, newslot, list_make1(&conflicttuple));
+ ReportApplyConflict(estate, relinfo, LOG, type, remoteslot, newslot,
+ list_make1(&conflicttuple));
 }
 /* Cleanup. 
*/ @@ -2964,6 +3163,112 @@ FindReplTupleInLocalRel(ApplyExecutionData *edata, Relation localrel, } /* + * Determine whether the index can reliably locate the deleted tuple in the + * local relation. + * + * An index may exclude deleted tuples if it was re-indexed or re-created during + * change application. Therefore, an index is considered usable only if the + * conflict detection slot.xmin (conflict_detection_xmin) is greater than the + * index tuple's xmin. This ensures that any tuples deleted prior to the index + * creation or re-indexing are not relevant for conflict detection in the + * current apply worker. + * + * Note that indexes may also be excluded if they were modified by other DDL + * operations, such as ALTER INDEX. However, this is acceptable, as the + * likelihood of such DDL changes coinciding with the need to scan dead + * tuples for the update_deleted is low. + */ +static bool +IsIndexUsableForFindingDeletedTuple(Oid localindexoid, + TransactionId conflict_detection_xmin) +{ + HeapTuple index_tuple; + TransactionId index_xmin; + + index_tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(localindexoid)); + + if (!HeapTupleIsValid(index_tuple)) /* should not happen */ + elog(ERROR, "cache lookup failed for index %u", localindexoid); + + /* + * No need to check for a frozen transaction ID, as + * TransactionIdPrecedes() manages it internally, treating it as falling + * behind the conflict_detection_xmin. + */ + index_xmin = HeapTupleHeaderGetXmin(index_tuple->t_data); + + ReleaseSysCache(index_tuple); + + return TransactionIdPrecedes(index_xmin, conflict_detection_xmin); +} + +/* + * Attempts to locate a deleted tuple in the local relation that matches the + * values of the tuple received from the publication side (in 'remoteslot'). + * The search is performed using either the replica identity index, primary + * key, other available index, or a sequential scan if necessary. + * + * Returns true if the deleted tuple is found. If found, the transaction ID, + * origin, and commit timestamp of the deletion are stored in '*delete_xid', + * '*delete_origin', and '*delete_time' respectively. + */ +static bool +FindDeletedTupleInLocalRel(Relation localrel, Oid localidxoid, + TupleTableSlot *remoteslot, + TransactionId *delete_xid, RepOriginId *delete_origin, + TimestampTz *delete_time) +{ + TransactionId oldestxmin; + ReplicationSlot *slot; + + /* + * Return false if either dead tuples are not retained or commit timestamp + * data is not available. + */ + if (!MySubscription->retaindeadtuples || !track_commit_timestamp) + return false; + + /* + * For conflict detection, we use the conflict slot's xmin value instead + * of invoking GetOldestNonRemovableTransactionId(). The slot.xmin acts as + * a threshold to identify tuples that were recently deleted. These tuples + * are not visible to concurrent transactions, but we log an + * update_deleted conflict if such a tuple matches the remote update being + * applied. + * + * Although GetOldestNonRemovableTransactionId() can return a value older + * than the slot's xmin, for our current purpose it is acceptable to treat + * tuples deleted by transactions prior to slot.xmin as update_missing + * conflicts. + * + * Ideally, we would use oldest_nonremovable_xid, which is directly + * maintained by the leader apply worker. However, this value is not + * available to table synchronization or parallel apply workers, making + * slot.xmin a practical alternative in those contexts. 
+ */
+ slot = SearchNamedReplicationSlot(CONFLICT_DETECTION_SLOT, true);
+
+ Assert(slot);
+
+ SpinLockAcquire(&slot->mutex);
+ oldestxmin = slot->data.xmin;
+ SpinLockRelease(&slot->mutex);
+
+ Assert(TransactionIdIsValid(oldestxmin));
+
+ if (OidIsValid(localidxoid) &&
+ IsIndexUsableForFindingDeletedTuple(localidxoid, oldestxmin))
+ return RelationFindDeletedTupleInfoByIndex(localrel, localidxoid,
+ remoteslot, oldestxmin,
+ delete_xid, delete_origin,
+ delete_time);
+ else
+ return RelationFindDeletedTupleInfoSeq(localrel, remoteslot,
+ oldestxmin, delete_xid,
+ delete_origin, delete_time);
+}
+
+/*
 * This handles insert, update, delete on a partitioned table.
 */
 static void
@@ -3081,18 +3386,35 @@ apply_handle_tuple_routing(ApplyExecutionData *edata,
 remoteslot_part, &localslot);
 if (!found)
 {
+ ConflictType type;
 TupleTableSlot *newslot = localslot;
+ /*
+ * Detecting whether the tuple was recently deleted or
+ * never existed is crucial to avoid misleading the user
+ * during conflict handling.
+ */
+ if (FindDeletedTupleInLocalRel(partrel,
+ part_entry->localindexoid,
+ remoteslot_part,
+ &conflicttuple.xmin,
+ &conflicttuple.origin,
+ &conflicttuple.ts) &&
+ conflicttuple.origin != replorigin_session_origin)
+ type = CT_UPDATE_DELETED;
+ else
+ type = CT_UPDATE_MISSING;
+
 /* Store the new tuple for conflict reporting */
 slot_store_data(newslot, part_entry, newtup);
 /*
- * The tuple to be updated could not be found. Do nothing
- * except for emitting a log message.
+ * The tuple to be updated could not be found or was
+ * deleted. Do nothing except for emitting a log message.
 */
 ReportApplyConflict(estate, partrelinfo, LOG,
- CT_UPDATE_MISSING, remoteslot_part,
- newslot, list_make1(&conflicttuple));
+ type, remoteslot_part, newslot,
+ list_make1(&conflicttuple));
 return;
 }
@@ -3584,6 +3906,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 bool ping_sent = false;
 TimeLineID tli;
 ErrorContextCallback errcallback;
+ RetainDeadTuplesData rdt_data = {0};
 /*
 * Init the ApplyMessageContext which we clean up after each replication
@@ -3662,6 +3985,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 last_recv_timestamp = GetCurrentTimestamp();
 ping_sent = false;
+ rdt_data.last_recv_time = last_recv_timestamp;
+
 /* Ensure we are reading the data into our memory context. */
 MemoryContextSwitchTo(ApplyMessageContext);
@@ -3669,7 +3994,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 c = pq_getmsgbyte(&s);
- if (c == 'w')
+ if (c == PqReplMsg_WALData)
 {
 XLogRecPtr start_lsn;
 XLogRecPtr end_lsn;
@@ -3688,8 +4013,10 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 UpdateWorkerStats(last_received, send_time, false);
 apply_dispatch(&s);
+
+ maybe_advance_nonremovable_xid(&rdt_data, false);
 }
- else if (c == 'k')
+ else if (c == PqReplMsg_Keepalive)
 {
 XLogRecPtr end_lsn;
 TimestampTz timestamp;
@@ -3703,8 +4030,31 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 last_received = end_lsn;
 send_feedback(last_received, reply_requested, false);
+
+ maybe_advance_nonremovable_xid(&rdt_data, false);
+
 UpdateWorkerStats(last_received, timestamp, true);
 }
+ else if (c == PqReplMsg_PrimaryStatusUpdate)
+ {
+ rdt_data.remote_lsn = pq_getmsgint64(&s);
+ rdt_data.remote_oldestxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s));
+ rdt_data.remote_nextxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s));
+ rdt_data.reply_time = pq_getmsgint64(&s);
+
+ /*
+ * This should never happen, see
+ * ProcessStandbyPSRequestMessage. 
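The wire format consumed above is simply four 64-bit big-endian fields following the message type byte: the publisher's WAL write position, the oldest transaction ID in the commit phase, the next transaction ID, and the reply timestamp. A freestanding sketch of such a reader (hand-rolled for illustration; the patch itself uses the pq_getmsgint64 calls shown in the hunk):

#include <stdint.h>

/* Read a 64-bit big-endian value, as the pq_getmsgint64 calls above do. */
static uint64_t
read_be64(const unsigned char *buf)
{
    uint64_t v = 0;

    for (int i = 0; i < 8; i++)
        v = (v << 8) | buf[i];
    return v;
}

typedef struct
{
    uint64_t remote_lsn;        /* latest WAL write position */
    uint64_t remote_oldestxid;  /* oldest xid in the commit phase (full xid) */
    uint64_t remote_nextxid;    /* next xid to be assigned (full xid) */
    int64_t  reply_time;        /* publisher's send timestamp */
} PrimaryStatus;

/* Parse the body of a primary status update message (after the type byte). */
static PrimaryStatus
parse_primary_status(const unsigned char *body)
{
    PrimaryStatus s;

    s.remote_lsn = read_be64(body);
    s.remote_oldestxid = read_be64(body + 8);
    s.remote_nextxid = read_be64(body + 16);
    s.reply_time = (int64_t) read_be64(body + 24);
    return s;
}

int main(void)
{
    unsigned char body[32] = {0};

    body[7] = 1;                /* remote_lsn = 1 */
    PrimaryStatus s = parse_primary_status(body);
    return s.remote_lsn == 1 ? 0 : 1;
}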
But if it happens + * due to a bug, we don't want to proceed as it can + * incorrectly advance oldest_nonremovable_xid. + */ + if (XLogRecPtrIsInvalid(rdt_data.remote_lsn)) + elog(ERROR, "cannot get the latest WAL position from the publisher"); + + maybe_advance_nonremovable_xid(&rdt_data, true); + + UpdateWorkerStats(last_received, rdt_data.reply_time, false); + } /* other message types are purposefully ignored */ MemoryContextReset(ApplyMessageContext); @@ -3717,6 +4067,11 @@ LogicalRepApplyLoop(XLogRecPtr last_received) /* confirm all writes so far */ send_feedback(last_received, false, false); + /* Reset the timestamp if no message was received */ + rdt_data.last_recv_time = 0; + + maybe_advance_nonremovable_xid(&rdt_data, false); + if (!in_remote_transaction && !in_streamed_transaction) { /* @@ -3751,6 +4106,14 @@ LogicalRepApplyLoop(XLogRecPtr last_received) else wait_time = NAPTIME_PER_CYCLE; + /* + * Ensure to wake up when it's possible to advance the non-removable + * transaction ID. + */ + if (rdt_data.phase == RDT_GET_CANDIDATE_XID && + rdt_data.xid_advance_interval) + wait_time = Min(wait_time, rdt_data.xid_advance_interval); + rc = WaitLatchOrSocket(MyLatch, WL_SOCKET_READABLE | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, @@ -3814,6 +4177,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) send_feedback(last_received, requestReply, requestReply); + maybe_advance_nonremovable_xid(&rdt_data, false); + /* * Force reporting to ensure long idle periods don't lead to * arbitrarily delayed stats. Stats can only be reported outside @@ -3849,7 +4214,6 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) static XLogRecPtr last_recvpos = InvalidXLogRecPtr; static XLogRecPtr last_writepos = InvalidXLogRecPtr; - static XLogRecPtr last_flushpos = InvalidXLogRecPtr; XLogRecPtr writepos; XLogRecPtr flushpos; @@ -3903,14 +4267,14 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) else resetStringInfo(reply_message); - pq_sendbyte(reply_message, 'r'); + pq_sendbyte(reply_message, PqReplMsg_StandbyStatusUpdate); pq_sendint64(reply_message, recvpos); /* write */ pq_sendint64(reply_message, flushpos); /* flush */ pq_sendint64(reply_message, writepos); /* apply */ pq_sendint64(reply_message, now); /* sendTime */ pq_sendbyte(reply_message, requestReply); /* replyRequested */ - elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X", + elog(DEBUG2, "sending feedback (force %d) to recv %X/%08X, write %X/%08X, flush %X/%08X", force, LSN_FORMAT_ARGS(recvpos), LSN_FORMAT_ARGS(writepos), @@ -3928,6 +4292,368 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) } /* + * Attempt to advance the non-removable transaction ID. + * + * See comments atop worker.c for details. + */ +static void +maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + if (!can_advance_nonremovable_xid(rdt_data)) + return; + + process_rdt_phase_transition(rdt_data, status_received); +} + +/* + * Preliminary check to determine if advancing the non-removable transaction ID + * is allowed. + */ +static bool +can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data) +{ + /* + * It is sufficient to manage non-removable transaction ID for a + * subscription by the main apply worker to detect update_deleted reliably + * even for table sync or parallel apply workers. 
+ */ + if (!am_leader_apply_worker()) + return false; + + /* No need to advance if retaining dead tuples is not required */ + if (!MySubscription->retaindeadtuples) + return false; + + return true; +} + +/* + * Process phase transitions during the non-removable transaction ID + * advancement. See comments atop worker.c for details of the transition. + */ +static void +process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + switch (rdt_data->phase) + { + case RDT_GET_CANDIDATE_XID: + get_candidate_xid(rdt_data); + break; + case RDT_REQUEST_PUBLISHER_STATUS: + request_publisher_status(rdt_data); + break; + case RDT_WAIT_FOR_PUBLISHER_STATUS: + wait_for_publisher_status(rdt_data, status_received); + break; + case RDT_WAIT_FOR_LOCAL_FLUSH: + wait_for_local_flush(rdt_data); + break; + } +} + +/* + * Workhorse for the RDT_GET_CANDIDATE_XID phase. + */ +static void +get_candidate_xid(RetainDeadTuplesData *rdt_data) +{ + TransactionId oldest_running_xid; + TimestampTz now; + + /* + * Use last_recv_time when applying changes in the loop to avoid + * unnecessary system time retrieval. If last_recv_time is not available, + * obtain the current timestamp. + */ + now = rdt_data->last_recv_time ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Compute the candidate_xid and request the publisher status at most once + * per xid_advance_interval. Refer to adjust_xid_advance_interval() for + * details on how this value is dynamically adjusted. This is to avoid + * using CPU and network resources without making much progress. + */ + if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now, + rdt_data->xid_advance_interval)) + return; + + /* + * Immediately update the timer, even if the function returns later + * without setting candidate_xid due to inactivity on the subscriber. This + * avoids frequent calls to GetOldestActiveTransactionId. + */ + rdt_data->candidate_xid_time = now; + + /* + * Consider transactions in the current database, as only dead tuples from + * this database are required for conflict detection. + */ + oldest_running_xid = GetOldestActiveTransactionId(false, false); + + /* + * Oldest active transaction ID (oldest_running_xid) can't be behind any + * of its previously computed value. + */ + Assert(TransactionIdPrecedesOrEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)); + + /* Return if the oldest_nonremovable_xid cannot be advanced */ + if (TransactionIdEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)) + { + adjust_xid_advance_interval(rdt_data, false); + return; + } + + adjust_xid_advance_interval(rdt_data, true); + + rdt_data->candidate_xid = oldest_running_xid; + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_REQUEST_PUBLISHER_STATUS phase. + */ +static void +request_publisher_status(RetainDeadTuplesData *rdt_data) +{ + static StringInfo request_message = NULL; + + if (!request_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + request_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(request_message); + + /* + * Send the current time to update the remote walsender's latest reply + * message received time. 
+ */ + pq_sendbyte(request_message, PqReplMsg_PrimaryStatusRequest); + pq_sendint64(request_message, GetCurrentTimestamp()); + + elog(DEBUG2, "sending publisher status request message"); + + /* Send a request for the publisher status */ + walrcv_send(LogRepWorkerWalRcvConn, + request_message->data, request_message->len); + + rdt_data->phase = RDT_WAIT_FOR_PUBLISHER_STATUS; + + /* + * Skip calling maybe_advance_nonremovable_xid() since further transition + * is possible only once we receive the publisher status message. + */ +} + +/* + * Workhorse for the RDT_WAIT_FOR_PUBLISHER_STATUS phase. + */ +static void +wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + /* + * Return if we have requested but not yet received the publisher status. + */ + if (!status_received) + return; + + if (!FullTransactionIdIsValid(rdt_data->remote_wait_for)) + rdt_data->remote_wait_for = rdt_data->remote_nextxid; + + /* + * Check if all remote concurrent transactions that were active at the + * first status request have now completed. If completed, proceed to the + * next phase; otherwise, continue checking the publisher status until + * these transactions finish. + * + * It's possible that transactions in the commit phase during the last + * cycle have now finished committing, but remote_oldestxid remains older + * than remote_wait_for. This can happen if some old transaction came in + * the commit phase when we requested status in this cycle. We do not + * handle this case explicitly as it's rare and the benefit doesn't + * justify the required complexity. Tracking would require either caching + * all xids at the publisher or sending them to subscribers. The condition + * will resolve naturally once the remaining transactions are finished. + * + * Directly advancing the non-removable transaction ID is possible if + * there are no activities on the publisher since the last advancement + * cycle. However, it requires maintaining two fields, last_remote_nextxid + * and last_remote_lsn, within the structure for comparison with the + * current cycle's values. Considering the minimal cost of continuing in + * RDT_WAIT_FOR_LOCAL_FLUSH without awaiting changes, we opted not to + * advance the transaction ID here. + */ + if (FullTransactionIdPrecedesOrEquals(rdt_data->remote_wait_for, + rdt_data->remote_oldestxid)) + rdt_data->phase = RDT_WAIT_FOR_LOCAL_FLUSH; + else + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_WAIT_FOR_LOCAL_FLUSH phase. + */ +static void +wait_for_local_flush(RetainDeadTuplesData *rdt_data) +{ + Assert(!XLogRecPtrIsInvalid(rdt_data->remote_lsn) && + TransactionIdIsValid(rdt_data->candidate_xid)); + + /* + * We expect the publisher and subscriber clocks to be in sync using time + * sync service like NTP. Otherwise, we will advance this worker's + * oldest_nonremovable_xid prematurely, leading to the removal of rows + * required to detect update_deleted reliably. This check primarily + * addresses scenarios where the publisher's clock falls behind; if the + * publisher's clock is ahead, subsequent transactions will naturally bear + * later commit timestamps, conforming to the design outlined atop + * worker.c. + * + * XXX Consider waiting for the publisher's clock to catch up with the + * subscriber's before proceeding to the next phase. 
+ */ + if (TimestampDifferenceExceeds(rdt_data->reply_time, + rdt_data->candidate_xid_time, 0)) + ereport(ERROR, + errmsg_internal("oldest_nonremovable_xid transaction ID could be advanced prematurely"), + errdetail_internal("The clock on the publisher is behind that of the subscriber.")); + + /* + * Do not attempt to advance the non-removable transaction ID when table + * sync is in progress. During this time, changes from a single + * transaction may be applied by multiple table sync workers corresponding + * to the target tables. So, it's necessary for all table sync workers to + * apply and flush the corresponding changes before advancing the + * transaction ID, otherwise, dead tuples that are still needed for + * conflict detection in table sync workers could be removed prematurely. + * However, confirming the apply and flush progress across all table sync + * workers is complex and not worth the effort, so we simply return if not + * all tables are in the READY state. + * + * It is safe to add new tables with initial states to the subscription + * after this check because any changes applied to these tables should + * have a WAL position greater than the rdt_data->remote_lsn. + */ + if (!AllTablesyncsReady()) + return; + + /* + * Update and check the remote flush position if we are applying changes + * in a loop. This is done at most once per WalWriterDelay to avoid + * performing costly operations in get_flush_position() too frequently + * during change application. + */ + if (last_flushpos < rdt_data->remote_lsn && rdt_data->last_recv_time && + TimestampDifferenceExceeds(rdt_data->flushpos_update_time, + rdt_data->last_recv_time, WalWriterDelay)) + { + XLogRecPtr writepos; + XLogRecPtr flushpos; + bool have_pending_txes; + + /* Fetch the latest remote flush position */ + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + if (flushpos > last_flushpos) + last_flushpos = flushpos; + + rdt_data->flushpos_update_time = rdt_data->last_recv_time; + } + + /* Return to wait for the changes to be applied */ + if (last_flushpos < rdt_data->remote_lsn) + return; + + /* + * Reaching here means the remote WAL position has been received, and all + * transactions up to that position on the publisher have been applied and + * flushed locally. So, we can advance the non-removable transaction ID. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->oldest_nonremovable_xid = rdt_data->candidate_xid; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + elog(DEBUG2, "confirmed flush up to remote lsn %X/%X: new oldest_nonremovable_xid %u", + LSN_FORMAT_ARGS(rdt_data->remote_lsn), + rdt_data->candidate_xid); + + /* Notify launcher to update the xmin of the conflict slot */ + ApplyLauncherWakeup(); + + /* + * Reset all data fields except those used to determine the timing for the + * next round of transaction ID advancement. We can even use + * flushpos_update_time in the next round to decide whether to get the + * latest flush position. + */ + rdt_data->phase = RDT_GET_CANDIDATE_XID; + rdt_data->remote_lsn = InvalidXLogRecPtr; + rdt_data->remote_oldestxid = InvalidFullTransactionId; + rdt_data->remote_nextxid = InvalidFullTransactionId; + rdt_data->reply_time = 0; + rdt_data->remote_wait_for = InvalidFullTransactionId; + rdt_data->candidate_xid = InvalidTransactionId; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Adjust the interval for advancing non-removable transaction IDs. 
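The comment that follows describes a bounded exponential backoff: the probe interval doubles while the node is idle, is capped at a maximum, and snaps back to the 100ms minimum on activity. A freestanding sketch using the MIN/MAX constants defined earlier in the patch (the Min macro is a stand-in for PostgreSQL's):

#define MIN_XID_ADVANCE_INTERVAL 100        /* ms */
#define MAX_XID_ADVANCE_INTERVAL 180000     /* ms */

#define Min(a, b) ((a) < (b) ? (a) : (b))   /* stand-in for PostgreSQL's macro */

/* Double the interval while idle (up to the cap); reset it on activity. */
static int
next_interval(int cur_interval, int max_interval, int activity_seen)
{
    if (activity_seen || cur_interval == 0)
        return MIN_XID_ADVANCE_INTERVAL;
    return Min(cur_interval * 2, max_interval);
}

int main(void)
{
    int interval = 0;

    interval = next_interval(interval, MAX_XID_ADVANCE_INTERVAL, 0); /* 100 */
    interval = next_interval(interval, MAX_XID_ADVANCE_INTERVAL, 0); /* 200 */
    interval = next_interval(interval, MAX_XID_ADVANCE_INTERVAL, 1); /* back to 100 */
    return interval == MIN_XID_ADVANCE_INTERVAL ? 0 : 1;
}

Starting at 100ms, an idle node probes at 200ms, 400ms, and so on up to the cap, so repeated status requests stay cheap when nothing is changing.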
*
+ * We double the interval to try advancing the non-removable transaction IDs
+ * if there is no activity on the node. The maximum value of the interval is
+ * capped by wal_receiver_status_interval if it is not zero, otherwise to
+ * 3 minutes, which should be sufficient to avoid using CPU or network
+ * resources without much benefit.
+ *
+ * The interval is reset to a minimum value of 100ms once there is some
+ * activity on the node.
+ *
+ * XXX The use of wal_receiver_status_interval is a bit arbitrary so we can
+ * consider the other interval or a separate GUC if the need arises.
+ */
+static void
+adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, bool new_xid_found)
+{
+ if (!new_xid_found && rdt_data->xid_advance_interval)
+ {
+ int max_interval = wal_receiver_status_interval
+ ? wal_receiver_status_interval * 1000
+ : MAX_XID_ADVANCE_INTERVAL;
+
+ /*
+ * No new transaction ID has been assigned since the last check, so
+ * double the interval, but not beyond the maximum allowable value.
+ */
+ rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval * 2,
+ max_interval);
+ }
+ else
+ {
+ /*
+ * A new transaction ID was found or the interval is not yet
+ * initialized, so set the interval to the minimum value.
+ */
+ rdt_data->xid_advance_interval = MIN_XID_ADVANCE_INTERVAL;
+ }
+}
+
+/*
 * Exit routine for apply workers due to subscription parameter changes.
 */
 static void
@@ -4626,8 +5352,16 @@ run_apply_worker()
 walrcv_startstreaming(LogRepWorkerWalRcvConn, &options);
 StartTransactionCommand();
+
+ /*
+ * Updating pg_subscription might involve TOAST table access, so
+ * ensure we have a valid snapshot.
+ */
+ PushActiveSnapshot(GetTransactionSnapshot());
+
 UpdateTwoPhaseState(MySubscription->oid, LOGICALREP_TWOPHASE_STATE_ENABLED);
 MySubscription->twophasestate = LOGICALREP_TWOPHASE_STATE_ENABLED;
+ PopActiveSnapshot();
 CommitTransactionCommand();
 }
 else
@@ -4707,6 +5441,30 @@ InitializeLogRepWorker(void)
 apply_worker_exit();
 }
+ /*
+ * Restart the worker if retain_dead_tuples was enabled during startup.
+ *
+ * At this point, the replication slot used for conflict detection might
+ * not exist yet, or could be dropped soon if the launcher perceives
+ * retain_dead_tuples as disabled. To avoid unnecessary tracking of
+ * oldest_nonremovable_xid when the slot is absent or at risk of being
+ * dropped, a restart is initiated.
+ *
+ * The oldest_nonremovable_xid should be initialized only when
+ * retain_dead_tuples is enabled before launching the worker. See
+ * logicalrep_worker_launch.
+ */
+ if (am_leader_apply_worker() &&
+ MySubscription->retaindeadtuples &&
+ !TransactionIdIsValid(MyLogicalRepWorker->oldest_nonremovable_xid))
+ {
+ ereport(LOG,
+ errmsg("logical replication worker for subscription \"%s\" will restart because the option %s was enabled during startup",
+ MySubscription->name, "retain_dead_tuples"));
+
+ apply_worker_exit();
+ }
+
 /* Setup synchronous commit according to the user's wishes */
 SetConfigOption("synchronous_commit", MySubscription->synccommit,
 PGC_BACKEND, PGC_S_OVERRIDE);
@@ -4843,7 +5601,15 @@ DisableSubscriptionAndExit(void)
 /* Disable the subscription */
 StartTransactionCommand();
+
+ /*
+ * Updating pg_subscription might involve TOAST table access, so ensure we
+ * have a valid snapshot. 
+ */ + PushActiveSnapshot(GetTransactionSnapshot()); + DisableSubscription(MySubscription->oid); + PopActiveSnapshot(); CommitTransactionCommand(); /* Ensure we remove no-longer-useful entry for worker's start time */ @@ -4855,6 +5621,14 @@ DisableSubscriptionAndExit(void) errmsg("subscription \"%s\" has been disabled because of an error", MySubscription->name)); + /* + * Skip the track_commit_timestamp check when disabling the worker due to + * an error, as verifying commit timestamps is unnecessary in this + * context. + */ + if (MySubscription->retaindeadtuples) + CheckSubDeadTupleRetention(false, true, WARNING); + proc_exit(0); } @@ -4900,7 +5674,7 @@ maybe_start_skipping_changes(XLogRecPtr finish_lsn) skip_xact_finish_lsn = finish_lsn; ereport(LOG, - errmsg("logical replication starts skipping transaction at LSN %X/%X", + errmsg("logical replication starts skipping transaction at LSN %X/%08X", LSN_FORMAT_ARGS(skip_xact_finish_lsn))); } @@ -4914,8 +5688,8 @@ stop_skipping_changes(void) return; ereport(LOG, - (errmsg("logical replication completed skipping transaction at LSN %X/%X", - LSN_FORMAT_ARGS(skip_xact_finish_lsn)))); + errmsg("logical replication completed skipping transaction at LSN %X/%08X", + LSN_FORMAT_ARGS(skip_xact_finish_lsn))); /* Stop skipping changes */ skip_xact_finish_lsn = InvalidXLogRecPtr; @@ -4948,6 +5722,12 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) } /* + * Updating pg_subscription might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + + /* * Protect subskiplsn of pg_subscription from being concurrently updated * while clearing it. */ @@ -4997,7 +5777,7 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) if (myskiplsn != finish_lsn) ereport(WARNING, errmsg("skip-LSN of subscription \"%s\" cleared", MySubscription->name), - errdetail("Remote transaction's finish WAL location (LSN) %X/%X did not match skip-LSN %X/%X.", + errdetail("Remote transaction's finish WAL location (LSN) %X/%08X did not match skip-LSN %X/%08X.", LSN_FORMAT_ARGS(finish_lsn), LSN_FORMAT_ARGS(myskiplsn))); } @@ -5005,6 +5785,8 @@ clear_subscription_skip_lsn(XLogRecPtr finish_lsn) heap_freetuple(tup); table_close(rel, NoLock); + PopActiveSnapshot(); + if (started_tx) CommitTransactionCommand(); } @@ -5032,7 +5814,7 @@ apply_error_callback(void *arg) logicalrep_message_type(errarg->command), errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->remote_xid, @@ -5050,7 +5832,7 @@ apply_error_callback(void *arg) errarg->rel->remoterel.relname, errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->rel->remoterel.nspname, @@ -5069,7 +5851,7 @@ apply_error_callback(void *arg) errarg->rel->remoterel.attnames[errarg->remote_attnum], errarg->remote_xid); else - errcontext("processing remote data for replication origin \"%s\" 
during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%X", + errcontext("processing remote data for replication origin \"%s\" during message type \"%s\" for replication target relation \"%s.%s\" column \"%s\" in transaction %u, finished at %X/%08X", errarg->origin_name, logicalrep_message_type(errarg->command), errarg->rel->remoterel.nspname, diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 693a766e6d7..80540c017bd 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -297,10 +297,12 @@ parse_output_parameters(List *options, PGOutputData *data) bool two_phase_option_given = false; bool origin_option_given = false; + /* Initialize optional parameters to defaults */ data->binary = false; data->streaming = LOGICALREP_STREAM_OFF; data->messages = false; data->two_phase = false; + data->publish_no_origin = false; foreach(lc, options) { @@ -1372,8 +1374,8 @@ pgoutput_row_filter(Relation relation, TupleTableSlot *old_slot, * VARTAG_INDIRECT. See ReorderBufferToastReplace. */ if (att->attlen == -1 && - VARATT_IS_EXTERNAL_ONDISK(new_slot->tts_values[i]) && - !VARATT_IS_EXTERNAL_ONDISK(old_slot->tts_values[i])) + VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(new_slot->tts_values[i])) && + !VARATT_IS_EXTERNAL_ONDISK(DatumGetPointer(old_slot->tts_values[i]))) { if (!tmp_new_slot) { @@ -1789,7 +1791,7 @@ LoadPublications(List *pubnames) else ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("skipped loading publication: %s", pubname), + errmsg("skipped loading publication \"%s\"", pubname), errdetail("The publication does not exist at this point in the WAL."), errhint("Create the publication if it does not exist.")); } diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index 7440aae5a1a..8a649199ec6 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -279,7 +279,7 @@ alter_replication_slot: ; /* - * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%X [TIMELINE %u] + * START_REPLICATION [SLOT slot] [PHYSICAL] %X/%08X [TIMELINE %u] */ start_replication: K_START_REPLICATION opt_slot opt_physical RECPTR opt_timeline @@ -295,7 +295,7 @@ start_replication: } ; -/* START_REPLICATION SLOT slot LOGICAL %X/%X options */ +/* START_REPLICATION SLOT slot LOGICAL %X/%08X options */ start_logical_replication: K_START_REPLICATION K_SLOT IDENT K_LOGICAL RECPTR plugin_options { diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 014ea8d25c6..b6930e28659 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -155,7 +155,7 @@ UPLOAD_MANIFEST { return K_UPLOAD_MANIFEST; } {hexdigit}+\/{hexdigit}+ { uint32 hi, lo; - if (sscanf(yytext, "%X/%X", &hi, &lo) != 2) + if (sscanf(yytext, "%X/%08X", &hi, &lo) != 2) replication_yyerror(NULL, yyscanner, "invalid streaming start location"); yylval->recptr = ((uint64) hi) << 32 | lo; return RECPTR; diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 600b87fa9cb..8605776ad86 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" #include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender_private.h" @@ 
-154,7 +155,7 @@ int max_replication_slots = 10; /* the maximum number of replication * Invalidate replication slots that have remained idle longer than this * duration; '0' disables it. */ -int idle_replication_slot_timeout_mins = 0; +int idle_replication_slot_timeout_secs = 0; /* * This GUC lists streaming replication standby server slot names that @@ -172,6 +173,7 @@ static SyncStandbySlotsConfigData *synchronized_standby_slots_config; static XLogRecPtr ss_oldest_flush_lsn = InvalidXLogRecPtr; static void ReplicationSlotShmemExit(int code, Datum arg); +static bool IsSlotForConflictCheck(const char *name); static void ReplicationSlotDropPtr(ReplicationSlot *slot); /* internal persistency functions */ @@ -258,13 +260,17 @@ ReplicationSlotShmemExit(int code, Datum arg) /* * Check whether the passed slot name is valid and report errors at elevel. * + * An error will be reported for a reserved replication slot name if + * allow_reserved_name is set to false. + * * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow * the name to be used as a directory name on every supported OS. * * Returns whether the directory name is valid or not if elevel < ERROR. */ bool -ReplicationSlotValidateName(const char *name, int elevel) +ReplicationSlotValidateName(const char *name, bool allow_reserved_name, + int elevel) { const char *cp; @@ -300,10 +306,32 @@ ReplicationSlotValidateName(const char *name, int elevel) return false; } } + + if (!allow_reserved_name && IsSlotForConflictCheck(name)) + { + ereport(elevel, + errcode(ERRCODE_RESERVED_NAME), + errmsg("replication slot name \"%s\" is reserved", + name), + errdetail("The name \"%s\" is reserved for the conflict detection slot.", + CONFLICT_DETECTION_SLOT)); + + return false; + } + return true; } /* + * Return true if the replication slot name is "pg_conflict_detection". + */ +static bool +IsSlotForConflictCheck(const char *name) +{ + return (strcmp(name, CONFLICT_DETECTION_SLOT) == 0); +} + +/* * Create a new replication slot and mark it as used by this backend. * * name: Name of the slot @@ -330,7 +358,12 @@ ReplicationSlotCreate(const char *name, bool db_specific, Assert(MyReplicationSlot == NULL); - ReplicationSlotValidateName(name, ERROR); + /* + * The logical launcher or pg_upgrade may create or migrate an internal + * slot, so using a reserved name is allowed in these cases. + */ + ReplicationSlotValidateName(name, IsBinaryUpgrade || IsLogicalLauncher(), + ERROR); if (failover) { @@ -424,6 +457,7 @@ ReplicationSlotCreate(const char *name, bool db_specific, slot->candidate_restart_valid = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->last_saved_confirmed_flush = InvalidXLogRecPtr; + slot->last_saved_restart_lsn = InvalidXLogRecPtr; slot->inactive_since = 0; /* @@ -581,6 +615,17 @@ retry: } /* + * Do not allow users to acquire the reserved slot. This scenario may + * occur if the launcher that owns the slot has terminated unexpectedly + * due to an error, and a backend process attempts to reuse the slot. + */ + if (!IsLogicalLauncher() && IsSlotForConflictCheck(name)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cannot acquire replication slot \"%s\"", name), + errdetail("The slot is reserved for conflict detection and can only be acquired by logical replication launcher.")); + + /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. 
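The name validation introduced here has two independent parts: the character-set rule quoted in the comment and the new reserved-name check. A standalone sketch of both (the NAMEDATALEN length limit is omitted for brevity):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define CONFLICT_DETECTION_SLOT "pg_conflict_detection"

static bool
slot_name_ok(const char *name, bool allow_reserved_name)
{
	if (*name == '\0')
		return false;

	/* names may consist of [a-z0-9_] only */
	for (const char *cp = name; *cp; cp++)
		if (!(islower((unsigned char) *cp) ||
			  isdigit((unsigned char) *cp) ||
			  *cp == '_'))
			return false;

	/* reserved for the launcher's conflict detection slot */
	if (!allow_reserved_name && strcmp(name, CONFLICT_DETECTION_SLOT) == 0)
		return false;

	return true;
}

int
main(void)
{
	printf("%d\n", slot_name_ok("my_slot", false));				  /* 1 */
	printf("%d\n", slot_name_ok("pg_conflict_detection", false)); /* 0 */
	printf("%d\n", slot_name_ok("pg_conflict_detection", true));  /* 1 */
	return 0;
}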
*/ @@ -1165,20 +1210,41 @@ ReplicationSlotsComputeRequiredLSN(void) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; + XLogRecPtr last_saved_restart_lsn; bool invalidated; + ReplicationSlotPersistency persistency; if (!s->in_use) continue; SpinLockAcquire(&s->mutex); + persistency = s->data.persistency; restart_lsn = s->data.restart_lsn; invalidated = s->data.invalidated != RS_INVAL_NONE; + last_saved_restart_lsn = s->last_saved_restart_lsn; SpinLockRelease(&s->mutex); /* invalidated slots need not apply */ if (invalidated) continue; + /* + * For persistent slot use last_saved_restart_lsn to compute the + * oldest LSN for removal of WAL segments. The segments between + * last_saved_restart_lsn and restart_lsn might be needed by a + * persistent slot in the case of database crash. Non-persistent + * slots can't survive the database crash, so we don't care about + * last_saved_restart_lsn for them. + */ + if (persistency == RS_PERSISTENT) + { + if (last_saved_restart_lsn != InvalidXLogRecPtr && + restart_lsn > last_saved_restart_lsn) + { + restart_lsn = last_saved_restart_lsn; + } + } + if (restart_lsn != InvalidXLogRecPtr && (min_required == InvalidXLogRecPtr || restart_lsn < min_required)) @@ -1216,7 +1282,9 @@ ReplicationSlotsComputeLogicalRestartLSN(void) { ReplicationSlot *s; XLogRecPtr restart_lsn; + XLogRecPtr last_saved_restart_lsn; bool invalidated; + ReplicationSlotPersistency persistency; s = &ReplicationSlotCtl->replication_slots[i]; @@ -1230,14 +1298,33 @@ ReplicationSlotsComputeLogicalRestartLSN(void) /* read once, it's ok if it increases while we're checking */ SpinLockAcquire(&s->mutex); + persistency = s->data.persistency; restart_lsn = s->data.restart_lsn; invalidated = s->data.invalidated != RS_INVAL_NONE; + last_saved_restart_lsn = s->last_saved_restart_lsn; SpinLockRelease(&s->mutex); /* invalidated slots need not apply */ if (invalidated) continue; + /* + * For persistent slot use last_saved_restart_lsn to compute the + * oldest LSN for removal of WAL segments. The segments between + * last_saved_restart_lsn and restart_lsn might be needed by a + * persistent slot in the case of database crash. Non-persistent + * slots can't survive the database crash, so we don't care about + * last_saved_restart_lsn for them. 
+ */ + if (persistency == RS_PERSISTENT) + { + if (last_saved_restart_lsn != InvalidXLogRecPtr && + restart_lsn > last_saved_restart_lsn) + { + restart_lsn = last_saved_restart_lsn; + } + } + if (restart_lsn == InvalidXLogRecPtr) continue; @@ -1455,6 +1542,7 @@ ReplicationSlotReserveWal(void) Assert(slot != NULL); Assert(slot->data.restart_lsn == InvalidXLogRecPtr); + Assert(slot->last_saved_restart_lsn == InvalidXLogRecPtr); /* * The replication slot mechanism is used to prevent removal of required @@ -1547,8 +1635,8 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, uint64 ex = oldestLSN - restart_lsn; appendStringInfo(&err_detail, - ngettext("The slot's restart_lsn %X/%X exceeds the limit by %" PRIu64 " byte.", - "The slot's restart_lsn %X/%X exceeds the limit by %" PRIu64 " bytes.", + ngettext("The slot's restart_lsn %X/%08X exceeds the limit by %" PRIu64 " byte.", + "The slot's restart_lsn %X/%08X exceeds the limit by %" PRIu64 " bytes.", ex), LSN_FORMAT_ARGS(restart_lsn), ex); @@ -1568,13 +1656,10 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, case RS_INVAL_IDLE_TIMEOUT: { - int minutes = slot_idle_seconds / SECS_PER_MINUTE; - int secs = slot_idle_seconds % SECS_PER_MINUTE; - /* translator: %s is a GUC variable name */ - appendStringInfo(&err_detail, _("The slot's idle time of %dmin %02ds exceeds the configured \"%s\" duration of %dmin."), - minutes, secs, "idle_replication_slot_timeout", - idle_replication_slot_timeout_mins); + appendStringInfo(&err_detail, _("The slot's idle time of %lds exceeds the configured \"%s\" duration of %ds."), + slot_idle_seconds, "idle_replication_slot_timeout", + idle_replication_slot_timeout_secs); /* translator: %s is a GUC variable name */ appendStringInfo(&err_hint, _("You might need to increase \"%s\"."), "idle_replication_slot_timeout"); @@ -1612,7 +1697,7 @@ ReportSlotInvalidation(ReplicationSlotInvalidationCause cause, static inline bool CanInvalidateIdleSlot(ReplicationSlot *s) { - return (idle_replication_slot_timeout_mins != 0 && + return (idle_replication_slot_timeout_secs != 0 && !XLogRecPtrIsInvalid(s->data.restart_lsn) && s->inactive_since > 0 && !(RecoveryInProgress() && s->data.synced)); @@ -1673,9 +1758,9 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, if (CanInvalidateIdleSlot(s)) { /* - * We simulate the invalidation due to idle_timeout as the minimum - * time idle time is one minute which makes tests take a long - * time. + * Simulate the invalidation due to idle_timeout to test the + * timeout behavior promptly, without waiting for it to trigger + * naturally. */ #ifdef USE_INJECTION_POINTS if (IS_INJECTION_POINT_ATTACHED("slot-timeout-inval")) @@ -1690,7 +1775,7 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s, * idle_replication_slot_timeout GUC. */ if (TimestampDifferenceExceedsSeconds(s->inactive_since, now, - idle_replication_slot_timeout_mins * SECS_PER_MINUTE)) + idle_replication_slot_timeout_secs)) { *inactive_since = s->inactive_since; return RS_INVAL_IDLE_TIMEOUT; @@ -1835,7 +1920,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * just rely on .invalidated. 
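The rule the two hunks above add is the same in both functions: for a persistent slot, WAL must be kept back to the restart_lsn that was last written to disk, because an advance that happened only in memory is lost in a crash. A self-contained model of the resulting minimum computation, with simplified types:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;
#define InvalidXLogRecPtr 0

typedef struct Slot
{
	int			persistent;		/* RS_PERSISTENT stand-in */
	int			invalidated;
	XLogRecPtr	restart_lsn;
	XLogRecPtr	last_saved_restart_lsn;
} Slot;

static XLogRecPtr
compute_required_lsn(const Slot *slots, int n)
{
	XLogRecPtr	min_required = InvalidXLogRecPtr;

	for (int i = 0; i < n; i++)
	{
		XLogRecPtr	lsn = slots[i].restart_lsn;

		if (slots[i].invalidated)
			continue;

		/*
		 * Persistent slots may need WAL back to the on-disk restart_lsn;
		 * non-persistent slots don't survive a crash anyway.
		 */
		if (slots[i].persistent &&
			slots[i].last_saved_restart_lsn != InvalidXLogRecPtr &&
			lsn > slots[i].last_saved_restart_lsn)
			lsn = slots[i].last_saved_restart_lsn;

		if (lsn != InvalidXLogRecPtr &&
			(min_required == InvalidXLogRecPtr || lsn < min_required))
			min_required = lsn;
	}
	return min_required;
}

int
main(void)
{
	Slot slots[] = {
		{1, 0, 0x5000, 0x4000},	/* persistent: saved 0x4000 wins */
		{0, 0, 0x4800, 0x1000},	/* ephemeral: saved LSN ignored */
	};

	printf("%llx\n", (unsigned long long) compute_required_lsn(slots, 2));
	return 0;
}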
*/ if (invalidation_cause == RS_INVAL_WAL_REMOVED) + { s->data.restart_lsn = InvalidXLogRecPtr; + s->last_saved_restart_lsn = InvalidXLogRecPtr; + } /* Let caller know */ *invalidated = true; @@ -1844,15 +1932,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, SpinLockRelease(&s->mutex); /* - * The logical replication slots shouldn't be invalidated as GUC - * max_slot_wal_keep_size is set to -1 and - * idle_replication_slot_timeout is set to 0 during the binary - * upgrade. See check_old_cluster_for_valid_slots() where we ensure - * that no invalidated before the upgrade. - */ - Assert(!(*invalidated && SlotIsLogical(s) && IsBinaryUpgrade)); - - /* * Calculate the idle time duration of the slot if slot is marked * invalidated with RS_INVAL_IDLE_TIMEOUT. */ @@ -1998,6 +2077,10 @@ restart: if (!s->in_use) continue; + /* Prevent invalidation of logical slots during binary upgrade */ + if (SlotIsLogical(s) && IsBinaryUpgrade) + continue; + if (InvalidatePossiblyObsoleteSlot(possible_causes, s, oldestLSN, dboid, snapshotConflictHorizon, &invalidated)) @@ -2032,6 +2115,7 @@ void CheckPointReplicationSlots(bool is_shutdown) { int i; + bool last_saved_restart_lsn_updated = false; elog(DEBUG1, "performing replication slot checkpoint"); @@ -2076,9 +2160,23 @@ CheckPointReplicationSlots(bool is_shutdown) SpinLockRelease(&s->mutex); } + /* + * Track if we're going to update slot's last_saved_restart_lsn. We + * need this to know if we need to recompute the required LSN. + */ + if (s->last_saved_restart_lsn != s->data.restart_lsn) + last_saved_restart_lsn_updated = true; + SaveSlotToPath(s, path, LOG); } LWLockRelease(ReplicationSlotAllocationLock); + + /* + * Recompute the required LSN if SaveSlotToPath() updated + * last_saved_restart_lsn for any slot. + */ + if (last_saved_restart_lsn_updated) + ReplicationSlotsComputeRequiredLSN(); } /* @@ -2354,6 +2452,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) if (!slot->just_dirtied) slot->dirty = false; slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush; + slot->last_saved_restart_lsn = cp.slotdata.restart_lsn; SpinLockRelease(&slot->mutex); LWLockRelease(&slot->io_in_progress_lock); @@ -2569,6 +2668,7 @@ RestoreSlotFromDisk(const char *name) slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush; + slot->last_saved_restart_lsn = cp.slotdata.restart_lsn; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; @@ -2993,22 +3093,3 @@ WaitForStandbyConfirmation(XLogRecPtr wait_for_lsn) ConditionVariableCancelSleep(); } - -/* - * GUC check_hook for idle_replication_slot_timeout - * - * The value of idle_replication_slot_timeout must be set to 0 during - * a binary upgrade. See start_postmaster() in pg_upgrade for more details. 
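With the timeout GUC now measured in seconds rather than minutes, the invalidation predicate reduces to a plain comparison. A simplified model of that check; the server additionally requires a valid restart_lsn and skips synced slots during recovery, and it uses TimestampDifferenceExceedsSeconds() rather than difftime():

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* GUC stand-in; the patch switches its unit from minutes to seconds */
static int idle_replication_slot_timeout_secs = 3600;	/* 0 disables */

static bool
slot_idle_timed_out(time_t inactive_since, time_t now)
{
	return idle_replication_slot_timeout_secs != 0 &&
		inactive_since > 0 &&
		difftime(now, inactive_since) > idle_replication_slot_timeout_secs;
}

int
main(void)
{
	time_t		now = time(NULL);

	printf("%d\n", slot_idle_timed_out(now - 7200, now));	/* 1 */
	printf("%d\n", slot_idle_timed_out(now - 60, now));	/* 0 */
	return 0;
}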
- */ -bool -check_idle_replication_slot_timeout(int *newval, void **extra, GucSource source) -{ - if (IsBinaryUpgrade && *newval != 0) - { - GUC_check_errdetail("\"%s\" must be set to 0 during binary upgrade mode.", - "idle_replication_slot_timeout"); - return false; - } - - return true; -} diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 36cc2ed4e44..69f4c6157c5 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -566,7 +566,7 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) if (moveto < minlsn) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("cannot advance replication slot to %X/%X, minimum is %X/%X", + errmsg("cannot advance replication slot to %X/%08X, minimum is %X/%08X", LSN_FORMAT_ARGS(moveto), LSN_FORMAT_ARGS(minlsn)))); /* Do the actual slot update, depending on the slot type */ diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c index cc35984ad00..32cf3a48b89 100644 --- a/src/backend/replication/syncrep.c +++ b/src/backend/replication/syncrep.c @@ -258,7 +258,7 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit) { char buffer[32]; - sprintf(buffer, "waiting for %X/%X", LSN_FORMAT_ARGS(lsn)); + sprintf(buffer, "waiting for %X/%08X", LSN_FORMAT_ARGS(lsn)); set_ps_display_suffix(buffer); } @@ -566,7 +566,7 @@ SyncRepReleaseWaiters(void) LWLockRelease(SyncRepLock); - elog(DEBUG3, "released %d procs up to write %X/%X, %d procs up to flush %X/%X, %d procs up to apply %X/%X", + elog(DEBUG3, "released %d procs up to write %X/%08X, %d procs up to flush %X/%08X, %d procs up to apply %X/%08X", numwrite, LSN_FORMAT_ARGS(writePtr), numflush, LSN_FORMAT_ARGS(flushPtr), numapply, LSN_FORMAT_ARGS(applyPtr)); diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 7dec1f869c7..02004d621e7 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -157,17 +157,16 @@ syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse { struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext * macro */ - char *syncrep_parse_error_msg = *syncrep_parse_error_msg_p; /* report only the first error in a parse operation */ - if (syncrep_parse_error_msg) + if (*syncrep_parse_error_msg_p) return; if (yytext[0]) - syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", - message, yytext); + *syncrep_parse_error_msg_p = psprintf("%s at or near \"%s\"", + message, yytext); else - syncrep_parse_error_msg = psprintf("%s at end of input", - message); + *syncrep_parse_error_msg_p = psprintf("%s at end of input", + message); } void diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 8c4d0fd9aed..7361ffc9dcf 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -386,12 +386,12 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) { if (first_stream) ereport(LOG, - (errmsg("started streaming WAL from primary at %X/%X on timeline %u", - LSN_FORMAT_ARGS(startpoint), startpointTLI))); + errmsg("started streaming WAL from primary at %X/%08X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI)); else ereport(LOG, - (errmsg("restarted WAL streaming at %X/%X on timeline %u", - LSN_FORMAT_ARGS(startpoint), startpointTLI))); + errmsg("restarted WAL streaming at %X/%08X on timeline %u", + LSN_FORMAT_ARGS(startpoint), startpointTLI)); 
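The recurring %X/%X to %X/%08X change in these hunks zero-pads the low 32 bits of an LSN, so the printed form has a fixed-width second half and values align and compare sensibly in logs. A small demonstration of the difference, with LSN_HI/LSN_LO performing the same split as LSN_FORMAT_ARGS():

#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;

#define LSN_HI(lsn) ((unsigned) ((lsn) >> 32))
#define LSN_LO(lsn) ((unsigned) (lsn))

int
main(void)
{
	XLogRecPtr	lsn = UINT64_C(0x1000000AB);

	printf("old: %X/%X\n", LSN_HI(lsn), LSN_LO(lsn));	/* 1/AB */
	printf("new: %X/%08X\n", LSN_HI(lsn), LSN_LO(lsn)); /* 1/000000AB */
	return 0;
}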
first_stream = false; /* Initialize LogstreamResult and buffers for processing messages */ @@ -470,7 +470,7 @@ WalReceiverMain(const void *startup_data, size_t startup_data_len) { ereport(LOG, (errmsg("replication terminated by primary server"), - errdetail("End of WAL reached on timeline %u at %X/%X.", + errdetail("End of WAL reached on timeline %u at %X/%08X.", startpointTLI, LSN_FORMAT_ARGS(LogstreamResult.Write)))); endofwal = true; @@ -711,7 +711,7 @@ WalRcvWaitForStartPosition(XLogRecPtr *startpoint, TimeLineID *startpointTLI) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "restarting at %X/%08X", LSN_FORMAT_ARGS(*startpoint)); set_ps_display(activitymsg); } @@ -826,7 +826,7 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli) switch (type) { - case 'w': /* WAL records */ + case PqReplMsg_WALData: { StringInfoData incoming_message; @@ -850,7 +850,7 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len, TimeLineID tli) XLogWalRcvWrite(buf, len, dataStart, tli); break; } - case 'k': /* Keepalive */ + case PqReplMsg_Keepalive: { StringInfoData incoming_message; @@ -1014,7 +1014,7 @@ XLogWalRcvFlush(bool dying, TimeLineID tli) { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X", LSN_FORMAT_ARGS(LogstreamResult.Write)); set_ps_display(activitymsg); } @@ -1130,7 +1130,7 @@ XLogWalRcvSendReply(bool force, bool requestReply) applyPtr = GetXLogReplayRecPtr(NULL); resetStringInfo(&reply_message); - pq_sendbyte(&reply_message, 'r'); + pq_sendbyte(&reply_message, PqReplMsg_StandbyStatusUpdate); pq_sendint64(&reply_message, writePtr); pq_sendint64(&reply_message, flushPtr); pq_sendint64(&reply_message, applyPtr); @@ -1138,7 +1138,7 @@ XLogWalRcvSendReply(bool force, bool requestReply) pq_sendbyte(&reply_message, requestReply ? 1 : 0); /* Send it */ - elog(DEBUG2, "sending write %X/%X flush %X/%X apply %X/%X%s", + elog(DEBUG2, "sending write %X/%08X flush %X/%08X apply %X/%08X%s", LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr), @@ -1234,7 +1234,7 @@ XLogWalRcvSendHSFeedback(bool immed) /* Construct the message and send it. 
*/ resetStringInfo(&reply_message); - pq_sendbyte(&reply_message, 'h'); + pq_sendbyte(&reply_message, PqReplMsg_HotStandbyFeedback); pq_sendint64(&reply_message, GetCurrentTimestamp()); pq_sendint32(&reply_message, xmin); pq_sendint32(&reply_message, xmin_epoch); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 9fa8beb6103..0855bae3535 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -65,6 +65,7 @@ #include "funcapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" +#include "libpq/protocol.h" #include "miscadmin.h" #include "nodes/replnodes.h" #include "pgstat.h" @@ -84,6 +85,7 @@ #include "storage/ipc.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procarray.h" #include "tcop/dest.h" #include "tcop/tcopprot.h" #include "utils/acl.h" @@ -258,6 +260,7 @@ static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); static void ProcessStandbyHSFeedbackMessage(void); +static void ProcessStandbyPSRequestMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr); @@ -408,7 +411,7 @@ IdentifySystem(void) else logptr = GetFlushRecPtr(&currTLI); - snprintf(xloc, sizeof(xloc), "%X/%X", LSN_FORMAT_ARGS(logptr)); + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(logptr)); if (MyDatabaseId != InvalidOid) { @@ -515,7 +518,7 @@ ReadReplicationSlot(ReadReplicationSlotCmd *cmd) { char xloc[64]; - snprintf(xloc, sizeof(xloc), "%X/%X", + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(slot_contents.data.restart_lsn)); values[i] = CStringGetTextDatum(xloc); nulls[i] = false; @@ -733,13 +736,13 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: maxmsglen = PQ_LARGE_MESSAGE_LIMIT; break; - case 'c': /* CopyDone */ - case 'f': /* CopyFail */ - case 'H': /* Flush */ - case 'S': /* Sync */ + case PqMsg_CopyDone: + case PqMsg_CopyFail: + case PqMsg_Flush: + case PqMsg_Sync: maxmsglen = PQ_SMALL_MESSAGE_LIMIT; break; default: @@ -761,19 +764,19 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, /* Process the message */ switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: AppendIncrementalManifestData(ib, buf->data, buf->len); return true; - case 'c': /* CopyDone */ + case PqMsg_CopyDone: return false; - case 'H': /* Sync */ - case 'S': /* Flush */ + case PqMsg_Sync: + case PqMsg_Flush: /* Ignore these while in CopyOut mode as we do elsewhere. 
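These hunks replace bare character literals with named constants from libpq/protocol.h. The constant values below match the literals being replaced in the hunks; the describe() helper itself is only illustrative:

#include <stdio.h>

#define PqReplMsg_WALData				'w'
#define PqReplMsg_Keepalive				'k'
#define PqReplMsg_StandbyStatusUpdate	'r'
#define PqReplMsg_HotStandbyFeedback	'h'

static const char *
describe(char msgtype)
{
	switch (msgtype)
	{
		case PqReplMsg_WALData:				return "WAL data";
		case PqReplMsg_Keepalive:			return "keepalive";
		case PqReplMsg_StandbyStatusUpdate:	return "standby status update";
		case PqReplMsg_HotStandbyFeedback:	return "hot standby feedback";
		default:							return "unknown";
	}
}

int
main(void)
{
	printf("%s\n", describe('k'));	/* keepalive */
	return 0;
}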
*/ return true; - case 'f': + case PqMsg_CopyFail: ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED), errmsg("COPY from stdin failed: %s", @@ -892,12 +895,12 @@ StartReplication(StartReplicationCmd *cmd) switchpoint < cmd->startpoint) { ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); + errmsg("requested starting point %X/%08X on timeline %u is not in this server's history", + LSN_FORMAT_ARGS(cmd->startpoint), + cmd->timeline), + errdetail("This server's history forked from timeline %u at %X/%08X.", + cmd->timeline, + LSN_FORMAT_ARGS(switchpoint))); } sendTimeLineValidUpto = switchpoint; } @@ -939,9 +942,9 @@ StartReplication(StartReplicationCmd *cmd) if (FlushPtr < cmd->startpoint) { ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); + errmsg("requested starting point %X/%08X is ahead of the WAL flush position of this server %X/%08X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr))); } /* Start streaming from the requested point */ @@ -983,7 +986,7 @@ StartReplication(StartReplicationCmd *cmd) Datum values[2]; bool nulls[2] = {0}; - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", + snprintf(startpos_str, sizeof(startpos_str), "%X/%08X", LSN_FORMAT_ARGS(sendTimeLineValidUpto)); dest = CreateDestReceiver(DestRemoteSimple); @@ -1324,7 +1327,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) ReplicationSlotPersist(); } - snprintf(xloc, sizeof(xloc), "%X/%X", + snprintf(xloc, sizeof(xloc), "%X/%08X", LSN_FORMAT_ARGS(MyReplicationSlot->data.confirmed_flush)); dest = CreateDestReceiver(DestRemoteSimple); @@ -1531,7 +1534,7 @@ WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xi resetStringInfo(ctx->out); - pq_sendbyte(ctx->out, 'w'); + pq_sendbyte(ctx->out, PqReplMsg_WALData); pq_sendint64(ctx->out, lsn); /* dataStart */ pq_sendint64(ctx->out, lsn); /* walEnd */ @@ -1567,7 +1570,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, tmpbuf.data, sizeof(int64)); /* output previously gathered data in a CopyData packet */ - pq_putmessage_noblock('d', ctx->out->data, ctx->out->len); + pq_putmessage_noblock(PqMsg_CopyData, ctx->out->data, ctx->out->len); CHECK_FOR_INTERRUPTS(); @@ -2289,7 +2292,8 @@ ProcessRepliesIfAny(void) switch (firstchar) { /* - * 'd' means a standby reply wrapped in a CopyData packet. + * PqMsg_CopyData means a standby reply wrapped in a CopyData + * packet. */ case PqMsg_CopyData: ProcessStandbyMessage(); @@ -2297,13 +2301,14 @@ ProcessRepliesIfAny(void) break; /* - * CopyDone means the standby requested to finish streaming. - * Reply with CopyDone, if we had not sent that already. + * PqMsg_CopyDone means the standby requested to finish + * streaming. Reply with CopyDone, if we had not sent that + * already. */ case PqMsg_CopyDone: if (!streamingDoneSending) { - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; } @@ -2312,7 +2317,8 @@ ProcessRepliesIfAny(void) break; /* - * 'X' means that the standby is closing down the socket. + * PqMsg_Terminate means that the standby is closing down the + * socket. 
*/ case PqMsg_Terminate: proc_exit(0); @@ -2347,14 +2353,18 @@ ProcessStandbyMessage(void) switch (msgtype) { - case 'r': + case PqReplMsg_StandbyStatusUpdate: ProcessStandbyReplyMessage(); break; - case 'h': + case PqReplMsg_HotStandbyFeedback: ProcessStandbyHSFeedbackMessage(); break; + case PqReplMsg_PrimaryStatusRequest: + ProcessStandbyPSRequestMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2429,7 +2439,7 @@ ProcessStandbyReplyMessage(void) /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(replyTime)); - elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s reply_time %s", + elog(DEBUG2, "write %X/%08X flush %X/%08X apply %X/%08X%s reply_time %s", LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr), @@ -2702,6 +2712,60 @@ ProcessStandbyHSFeedbackMessage(void) } /* + * Process the request for a primary status update message. + */ +static void +ProcessStandbyPSRequestMessage(void) +{ + XLogRecPtr lsn = InvalidXLogRecPtr; + TransactionId oldestXidInCommit; + FullTransactionId nextFullXid; + FullTransactionId fullOldestXidInCommit; + WalSnd *walsnd = MyWalSnd; + TimestampTz replyTime; + + /* + * This shouldn't happen because we don't support getting primary status + * message from standby. + */ + if (RecoveryInProgress()) + elog(ERROR, "the primary status is unavailable during recovery"); + + replyTime = pq_getmsgint64(&reply_message); + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + SpinLockAcquire(&walsnd->mutex); + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + + /* + * Consider transactions in the current database, as only these are the + * ones replicated. + */ + oldestXidInCommit = GetOldestActiveTransactionId(true, false); + nextFullXid = ReadNextFullTransactionId(); + fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, + oldestXidInCommit); + lsn = GetXLogWriteRecPtr(); + + elog(DEBUG2, "sending primary status"); + + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, PqReplMsg_PrimaryStatusUpdate); + pq_sendint64(&output_message, lsn); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(fullOldestXidInCommit)); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(nextFullXid)); + pq_sendint64(&output_message, GetCurrentTimestamp()); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); +} + +/* * Compute how long send/receive loops should sleep. * * If wal_sender_timeout is enabled we want to wake up in time to send @@ -3246,12 +3310,12 @@ XLogSendPhysical(void) wal_segment_close(xlogreader); /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; WalSndCaughtUp = true; - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + elog(DEBUG1, "walsender reached end of timeline at %X/%08X (sent up to %X/%08X)", LSN_FORMAT_ARGS(sendTimeLineValidUpto), LSN_FORMAT_ARGS(sentPtr)); return; @@ -3303,7 +3367,7 @@ XLogSendPhysical(void) * OK to read and send the slice. 
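ProcessStandbyPSRequestMessage() answers with a fixed-layout CopyData payload: a type byte followed by four big-endian 64-bit integers (write LSN, oldest full xid in commit, next full xid, timestamp). Below is a minimal stand-in for the pq_sendbyte()/pq_sendint64() sequence; the 's' type byte is a placeholder, since the actual value of PqReplMsg_PrimaryStatusUpdate is defined in libpq/protocol.h and is not visible in this hunk:

#include <stdint.h>
#include <stdio.h>

typedef struct Buf
{
	uint8_t		data[64];
	int			len;
} Buf;

static void
send_byte(Buf *b, uint8_t v)
{
	b->data[b->len++] = v;
}

/* replication messages put integers on the wire in network byte order */
static void
send_int64(Buf *b, uint64_t v)
{
	for (int shift = 56; shift >= 0; shift -= 8)
		b->data[b->len++] = (uint8_t) (v >> shift);
}

int
main(void)
{
	Buf			msg = {0};

	send_byte(&msg, 's');					/* placeholder type byte */
	send_int64(&msg, UINT64_C(0x16B3768000));	/* write LSN */
	send_int64(&msg, 745);					/* oldest full xid in commit */
	send_int64(&msg, 746);					/* next full xid */
	send_int64(&msg, 0);					/* timestamp placeholder */

	printf("message length: %d bytes\n", msg.len);	/* 33 */
	return 0;
}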
*/ resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + pq_sendbyte(&output_message, PqReplMsg_WALData); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ @@ -3374,7 +3438,7 @@ retry: memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], tmpbuf.data, sizeof(int64)); - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); sentPtr = endptr; @@ -3392,7 +3456,7 @@ retry: { char activitymsg[50]; - snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%08X", LSN_FORMAT_ARGS(sentPtr)); set_ps_display(activitymsg); } @@ -3449,8 +3513,16 @@ XLogSendLogical(void) if (flushPtr == InvalidXLogRecPtr || logical_decoding_ctx->reader->EndRecPtr >= flushPtr) { + /* + * For cascading logical WAL senders, we use the replay LSN instead of + * the flush LSN, since logical decoding on a standby only processes + * WAL that has been replayed. This distinction becomes particularly + * important during shutdown, as new WAL is no longer replayed and the + * last replayed LSN marks the furthest point up to which decoding can + * proceed. + */ if (am_cascading_walsender) - flushPtr = GetStandbyFlushRecPtr(NULL); + flushPtr = GetXLogReplayRecPtr(NULL); else flushPtr = GetFlushRecPtr(NULL); } @@ -4066,13 +4138,13 @@ WalSndKeepalive(bool requestReply, XLogRecPtr writePtr) /* construct the message... */ resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'k'); + pq_sendbyte(&output_message, PqReplMsg_Keepalive); pq_sendint64(&output_message, XLogRecPtrIsInvalid(writePtr) ? sentPtr : writePtr); pq_sendint64(&output_message, GetCurrentTimestamp()); pq_sendbyte(&output_message, requestReply ? 1 : 0); /* ... and send it wrapped in CopyData */ - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); /* Set local flag */ if (requestReply) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index f0bce5f9ed9..adc9e7600e1 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -923,8 +923,9 @@ rewriteTargetListIU(List *targetList, apply_default = true; /* - * Can only insert DEFAULT into generated columns, regardless of - * any OVERRIDING clauses. + * Can only insert DEFAULT into generated columns. (The + * OVERRIDING clause does not apply to generated columns, so we + * don't consider it here.) 
*/ if (att_tup->attgenerated && !apply_default) { @@ -4544,7 +4545,7 @@ build_generation_expression(Relation rel, int attrno) List * QueryRewrite(Query *parsetree) { - uint64 input_query_id = parsetree->queryId; + int64 input_query_id = parsetree->queryId; List *querylist; List *results; ListCell *l; diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index d98cda698d9..f59fb821543 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -767,7 +767,7 @@ statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats) values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i])); /* serialized length (uint32 length + data) */ - len = VARSIZE_ANY_EXHDR(values[dim][i]); + len = VARSIZE_ANY_EXHDR(DatumGetPointer(values[dim][i])); info[dim].nbytes += sizeof(uint32); /* length */ info[dim].nbytes += len; /* value (no header) */ diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md index f10b5c7e31e..72ae3b3737d 100644 --- a/src/backend/storage/aio/README.md +++ b/src/backend/storage/aio/README.md @@ -94,7 +94,7 @@ pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); * * In this example we're reading only a single buffer, hence the 1. */ -pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); +pgaio_io_set_handle_data_32(ioh, (uint32 *) &buffer, 1); /* * Pass the AIO handle to lower-level function. When operating on the level of @@ -119,8 +119,9 @@ pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); * e.g. due to reaching a limit on the number of unsubmitted IOs, and even * complete before smgrstartreadv() returns. */ +void *page = BufferGetBlock(buffer); smgrstartreadv(ioh, operation->smgr, forknum, blkno, - BufferGetBlock(buffer), 1); + &page, 1); /* * To benefit from AIO, it is beneficial to perform other work, including diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index ebb5a771bfd..3643f27ad6e 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -184,6 +184,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret) PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) { + PgAioHandle *ioh = NULL; + if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE) { Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE); @@ -193,10 +195,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) if (pgaio_my_backend->handed_out_io) elog(ERROR, "API violation: Only one IO can be handed out"); + /* + * Probably not needed today, as interrupts should not process this IO, + * but... 
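The reworked pgaio_io_acquire_nb() pairs HOLD_INTERRUPTS()/RESUME_INTERRUPTS() around the idle-list manipulation and funnels both outcomes through one exit path, which keeps the pairing obvious. A toy model of that pattern, with a nesting counter standing in for the real interrupt machinery:

#include <assert.h>
#include <stdio.h>

/* Stand-ins for HOLD_INTERRUPTS()/RESUME_INTERRUPTS(): a nesting counter */
static int InterruptHoldoffCount;

#define HOLD_INTERRUPTS()	(InterruptHoldoffCount++)
#define RESUME_INTERRUPTS() \
	(assert(InterruptHoldoffCount > 0), InterruptHoldoffCount--)

static int idle_count = 1;

/* mirrors the restructuring: one exit path, interrupts held throughout */
static int
acquire_handle(void)
{
	int			got = -1;

	HOLD_INTERRUPTS();
	if (idle_count > 0)
	{
		idle_count--;
		got = 42;				/* "handle" taken off the idle list */
	}
	RESUME_INTERRUPTS();

	return got;
}

int
main(void)
{
	printf("%d\n", acquire_handle());	/* 42 */
	printf("%d\n", acquire_handle());	/* -1: none left */
	return 0;
}

Restructuring to a single return is what makes it easy to verify that every path resumes interrupts exactly once.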
+ */ + HOLD_INTERRUPTS(); + if (!dclist_is_empty(&pgaio_my_backend->idle_ios)) { dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios); - PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion); + + ioh = dclist_container(PgAioHandle, node, ion); Assert(ioh->state == PGAIO_HS_IDLE); Assert(ioh->owner_procno == MyProcNumber); @@ -212,11 +221,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) ioh->report_return = ret; ret->result.status = PGAIO_RS_UNKNOWN; } - - return ioh; } - return NULL; + RESUME_INTERRUPTS(); + + return ioh; } /* @@ -233,6 +242,12 @@ pgaio_io_release(PgAioHandle *ioh) Assert(ioh->resowner); pgaio_my_backend->handed_out_io = NULL; + + /* + * Note that no interrupts are processed between the handed_out_io + * check and the call to reclaim - that's important as otherwise an + * interrupt could have already reclaimed the handle. + */ pgaio_io_reclaim(ioh); } else @@ -251,6 +266,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) Assert(ioh->resowner); + /* + * Otherwise an interrupt, in the middle of releasing the IO, could end up + * trying to wait for the IO, leading to state confusion. + */ + HOLD_INTERRUPTS(); + ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); ioh->resowner = NULL; @@ -291,6 +312,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) */ if (ioh->report_return) ioh->report_return = NULL; + + RESUME_INTERRUPTS(); } /* @@ -359,6 +382,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow) static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state) { + /* + * All callers need to have held interrupts in some form, otherwise + * interrupt processing could wait for the IO to complete, while in an + * intermediary state. + */ + Assert(!INTERRUPTS_CAN_BE_PROCESSED()); + pgaio_debug_io(DEBUG5, ioh, "updating state to %s", pgaio_io_state_get_name(new_state)); @@ -396,6 +426,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op) Assert(pgaio_my_backend->handed_out_io == ioh); Assert(pgaio_io_has_target(ioh)); + /* + * Otherwise an interrupt, in the middle of staging and possibly executing + * the IO, could end up trying to wait for the IO, leading to state + * confusion. + */ + HOLD_INTERRUPTS(); + ioh->op = op; ioh->result = 0; @@ -435,6 +472,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op) pgaio_io_prepare_submit(ioh); pgaio_io_perform_synchronously(ioh); } + + RESUME_INTERRUPTS(); } bool @@ -517,6 +556,13 @@ bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state) { *state = ioh->state; + + /* + * Ensure that we don't see an earlier state of the handle than ioh->state + * due to compiler or CPU reordering. This protects both ->generation as + * directly used here, and other fields in the handle accessed in the + * caller if the handle was not reused. 
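The barrier added to pgaio_io_was_recycled() orders the state read before the generation re-check, so a stale state can never be paired with a fresh generation. A C11-atomics sketch of the same idiom, with atomic_thread_fence(memory_order_acquire) standing in for pg_read_barrier():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct Handle
{
	_Atomic int			state;
	_Atomic uint64_t	generation;
} Handle;

static bool
was_recycled(Handle *ioh, uint64_t ref_generation, int *state)
{
	*state = atomic_load_explicit(&ioh->state, memory_order_relaxed);
	/* pg_read_barrier() equivalent: later reads may not move before this */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&ioh->generation, memory_order_relaxed)
		!= ref_generation;
}

int
main(void)
{
	Handle		h;
	int			state;

	atomic_init(&h.state, 3);
	atomic_init(&h.generation, 42);
	printf("recycled: %d (state %d)\n", was_recycled(&h, 42, &state), state);
	printf("recycled: %d (state %d)\n", was_recycled(&h, 41, &state), state);
	return 0;
}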
+ */ pg_read_barrier(); return ioh->generation != ref_generation; @@ -544,8 +590,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) && state != PGAIO_HS_COMPLETED_SHARED && state != PGAIO_HS_COMPLETED_LOCAL) { - elog(PANIC, "waiting for own IO in wrong state: %d", - state); + elog(PANIC, "waiting for own IO %d in wrong state: %s", + pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh)); } } @@ -599,7 +645,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) case PGAIO_HS_COMPLETED_SHARED: case PGAIO_HS_COMPLETED_LOCAL: - /* see above */ + + /* + * Note that no interrupts are processed between + * pgaio_io_was_recycled() and this check - that's important + * as otherwise an interrupt could have already reclaimed the + * handle. + */ if (am_owner) pgaio_io_reclaim(ioh); return; @@ -610,6 +662,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) /* * Make IO handle ready to be reused after IO has completed or after the * handle has been released without being used. + * + * Note that callers need to be careful about only calling this in the right + * state and that no interrupts can be processed between the state check and + * the call to pgaio_io_reclaim(). Otherwise interrupt processing could + * already have reclaimed the handle. */ static void pgaio_io_reclaim(PgAioHandle *ioh) @@ -618,6 +675,9 @@ pgaio_io_reclaim(PgAioHandle *ioh) Assert(ioh->owner_procno == MyProcNumber); Assert(ioh->state != PGAIO_HS_IDLE); + /* see comment in function header */ + HOLD_INTERRUPTS(); + /* * It's a bit ugly, but right now the easiest place to put the execution * of local completion callbacks is this function, as we need to execute @@ -685,6 +745,8 @@ pgaio_io_reclaim(PgAioHandle *ioh) * efficient in cases where only a few IOs are used. */ dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node); + + RESUME_INTERRUPTS(); } /* @@ -697,10 +759,10 @@ pgaio_io_wait_for_free(void) { int reclaimed = 0; - pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs", + pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), - dclist_is_empty(&pgaio_my_backend->idle_ios)); + dclist_count(&pgaio_my_backend->idle_ios)); /* * First check if any of our IOs actually have completed - when using @@ -714,6 +776,16 @@ pgaio_io_wait_for_free(void) if (ioh->state == PGAIO_HS_COMPLETED_SHARED) { + /* + * Note that no interrupts are processed between the state check + * and the call to reclaim - that's important as otherwise an + * interrupt could have already reclaimed the handle. + * + * Need to ensure that there's no reordering, in the more common + * paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). 
+ */ + pg_read_barrier(); pgaio_io_reclaim(ioh); reclaimed++; } @@ -730,13 +802,17 @@ pgaio_io_wait_for_free(void) if (pgaio_my_backend->num_staged_ios > 0) pgaio_submit_staged(); + /* possibly some IOs finished during submission */ + if (!dclist_is_empty(&pgaio_my_backend->idle_ios)) + return; + if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0) ereport(ERROR, errmsg_internal("no free IOs despite no in-flight IOs"), - errdetail_internal("%d pending, %d in-flight, %d idle IOs", + errdetail_internal("%d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), - dclist_is_empty(&pgaio_my_backend->idle_ios))); + dclist_count(&pgaio_my_backend->idle_ios))); /* * Wait for the oldest in-flight IO to complete. @@ -747,6 +823,7 @@ pgaio_io_wait_for_free(void) { PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios); + uint64 generation = ioh->generation; switch (ioh->state) { @@ -763,20 +840,36 @@ pgaio_io_wait_for_free(void) case PGAIO_HS_COMPLETED_IO: case PGAIO_HS_SUBMITTED: pgaio_debug_io(DEBUG2, ioh, - "waiting for free io with %d in flight", + "waiting for free io with %u in flight", dclist_count(&pgaio_my_backend->in_flight_ios)); /* * In a more general case this would be racy, because the * generation could increase after we read ioh->state above. * But we are only looking at IOs by the current backend and - * the IO can only be recycled by this backend. + * the IO can only be recycled by this backend. Even this is + * only OK because we get the handle's generation before + * potentially processing interrupts, e.g. as part of + * pgaio_debug_io(). */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); break; case PGAIO_HS_COMPLETED_SHARED: - /* it's possible that another backend just finished this IO */ + + /* + * It's possible that another backend just finished this IO. + * + * Note that no interrupts are processed between the state + * check and the call to reclaim - that's important as + * otherwise an interrupt could have already reclaimed the + * handle. + * + * Need to ensure that there's no reordering, in the more + * common paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). + */ + pg_read_barrier(); pgaio_io_reclaim(ioh); break; } @@ -926,6 +1019,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow) if (state == PGAIO_HS_COMPLETED_SHARED || state == PGAIO_HS_COMPLETED_LOCAL) { + /* + * Note that no interrupts are processed between + * pgaio_io_was_recycled() and this check - that's important as + * otherwise an interrupt could have already reclaimed the handle. 
+ */ if (am_owner) pgaio_io_reclaim(ioh); return true; @@ -1153,11 +1251,14 @@ pgaio_closing_fd(int fd) { dlist_iter iter; PgAioHandle *ioh = NULL; + uint64 generation; dclist_foreach(iter, &pgaio_my_backend->in_flight_ios) { ioh = dclist_container(PgAioHandle, node, iter.cur); + generation = ioh->generation; + if (pgaio_io_uses_fd(ioh, fd)) break; else @@ -1168,11 +1269,11 @@ pgaio_closing_fd(int fd) break; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO before FD %d gets closed, %d in-flight IOs", + "waiting for IO before FD %d gets closed, %u in-flight IOs", fd, dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); } } } @@ -1201,13 +1302,14 @@ pgaio_shutdown(int code, Datum arg) while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios)) { PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios); + uint64 generation = ioh->generation; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO to complete during shutdown, %d in-flight IOs", + "waiting for IO to complete during shutdown, %u in-flight IOs", dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); } pgaio_my_backend = NULL; diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c index 0ad9795bb7e..03c9bba0802 100644 --- a/src/backend/storage/aio/aio_callback.c +++ b/src/backend/storage/aio/aio_callback.c @@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_shared(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } ioh->distilled_result = result; @@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) /* start with distilled result from shared callback */ result = ioh->distilled_result; + Assert(result.status != PGAIO_RS_UNKNOWN); for (int i = ioh->num_callbacks; i > 0; i--) { @@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_local(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } /* diff --git a/src/backend/storage/aio/aio_io.c b/src/backend/storage/aio/aio_io.c index 00e176135a6..520b5077df2 100644 --- a/src/backend/storage/aio/aio_io.c +++ b/src/backend/storage/aio/aio_io.c @@ -181,9 +181,9 @@ pgaio_io_get_op_name(PgAioHandle *ioh) case PGAIO_OP_INVALID: return "invalid"; case PGAIO_OP_READV: - return "read"; + return "readv"; case PGAIO_OP_WRITEV: - return "write"; + return "writev"; } return NULL; /* silence compiler */ diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index c719ba2727a..0a8c054162f 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -29,6 +29,9 @@ #ifdef IOMETHOD_IO_URING_ENABLED +#include <sys/mman.h> +#include <unistd.h> + #include <liburing.h> #include "miscadmin.h" @@ -94,12 +97,32 @@ PgAioUringContext struct io_uring io_uring_ring; } PgAioUringContext; +/* + * Information about the capabilities that io_uring has. 
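Both loops above now copy ioh->generation into a local before doing anything that might process interrupts (pgaio_debug_io() can), because an interrupt may reclaim and recycle the handle, bumping its generation. A schematic of why the early copy matters; the types and helpers here are illustrative only:

#include <stdint.h>
#include <stdio.h>

typedef struct Handle
{
	uint64_t	generation;
} Handle;

static void
wait_one(Handle *ioh, uint64_t ref_generation)
{
	/* a recycled handle (generation moved on) no longer needs waiting */
	if (ioh->generation != ref_generation)
	{
		printf("handle recycled, nothing to wait for\n");
		return;
	}
	printf("waiting for generation %llu\n",
		   (unsigned long long) ref_generation);
}

int
main(void)
{
	Handle		h = {.generation = 7};

	/* capture first, as the patch does ... */
	uint64_t	generation = h.generation;

	/* ... because interrupt processing here may recycle the handle */
	h.generation = 8;

	wait_one(&h, generation);	/* safely detects the recycle */
	return 0;
}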
+ * + * Depending on the liburing and kernel versions, different features are + * supported. At least for the kernel, a version check does not suffice, + * as various vendors backport features to older kernels :(. + */ +typedef struct PgAioUringCaps +{ + bool checked; + /* -1 if io_uring_queue_init_mem() is unsupported */ + int mem_init_size; +} PgAioUringCaps; + + /* PgAioUringContexts for all backends */ static PgAioUringContext *pgaio_uring_contexts; /* the current backend's context */ static PgAioUringContext *pgaio_my_uring_context; +static PgAioUringCaps pgaio_uring_caps = +{ + .checked = false, + .mem_init_size = -1, +}; static uint32 pgaio_uring_procs(void) @@ -111,30 +134,184 @@ pgaio_uring_procs(void) return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; } -static Size +/* + * Initializes pgaio_uring_caps, unless that's already done. + */ +static void +pgaio_uring_check_capabilities(void) +{ + if (pgaio_uring_caps.checked) + return; + + /* + * By default io_uring creates a shared memory mapping for each io_uring + * instance, leading to a large number of memory mappings. Unfortunately a + * large number of memory mappings slows things down; backend exit is + * particularly affected. To address that, newer kernels (6.5) support + * using user-provided memory for the rings; by putting the relevant + * memory into shared memory we don't need any additional mappings. + * + * To know whether this is supported, we unfortunately need to probe the + * kernel by trying to create a ring with userspace-provided memory. This + * also has a secondary benefit: We can determine precisely how much + * memory we need for each io_uring instance. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + { + struct io_uring test_ring; + size_t ring_size; + void *ring_ptr; + struct io_uring_params p = {0}; + int ret; + + /* + * Liburing does not yet provide an API to query how much memory a + * ring will need. So we over-estimate it here. As the memory is freed + * just below, that's a small temporary waste of memory. + * + * 1MB is more than enough for rings within io_max_concurrency's + * range. + */ + ring_size = 1024 * 1024; + + /* + * Hard to believe a system exists where 1MB would not be a multiple + * of the page size. But it's cheap to ensure... + */ + ring_size -= ring_size % sysconf(_SC_PAGESIZE); + + ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ring_ptr == MAP_FAILED) + elog(ERROR, + "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m", + ring_size); + + ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size); + if (ret > 0) + { + pgaio_uring_caps.mem_init_size = ret; + + elog(DEBUG1, + "can use combined memory mapping for io_uring, each ring needs %d bytes", + ret); + + /* clean up the created ring, it was just for a test */ + io_uring_queue_exit(&test_ring); + } + else + { + /* + * There are different reasons for ring creation to fail, but it's + * ok to treat that just as io_uring_queue_init_mem() not being + * supported. We'll report a more detailed error in + * pgaio_uring_shmem_init().
+ */ + errno = -ret; + elog(DEBUG1, + "cannot use combined memory mapping for io_uring, ring creation failed: %m"); + + } + + if (munmap(ring_ptr, ring_size) != 0) + elog(ERROR, "munmap() failed: %m"); + } +#else + { + elog(DEBUG1, + "can't use combined memory mapping for io_uring, kernel or liburing too old"); + } +#endif + + pgaio_uring_caps.checked = true; +} + +/* + * Memory for all PgAioUringContext instances + */ +static size_t pgaio_uring_context_shmem_size(void) { return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext)); } +/* + * Memory for the combined memory used by io_uring instances. Returns 0 if + * that is not supported by kernel/liburing. + */ +static size_t +pgaio_uring_ring_shmem_size(void) +{ + size_t sz = 0; + + if (pgaio_uring_caps.mem_init_size > 0) + { + /* + * Memory for rings needs to be allocated at a page boundary, so + * reserve space. Luckily it does not need to be aligned to hugepage + * boundaries, even if huge pages are used. + */ + sz = add_size(sz, sysconf(_SC_PAGESIZE)); + sz = add_size(sz, mul_size(pgaio_uring_procs(), + pgaio_uring_caps.mem_init_size)); + } + + return sz; +} + static size_t pgaio_uring_shmem_size(void) { - return pgaio_uring_context_shmem_size(); + size_t sz; + + /* + * Kernel and liburing support for various features influences how much + * shmem we need; perform the necessary checks. + */ + pgaio_uring_check_capabilities(); + + sz = pgaio_uring_context_shmem_size(); + sz = add_size(sz, pgaio_uring_ring_shmem_size()); + + return sz; } static void pgaio_uring_shmem_init(bool first_time) { - int TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; + int TotalProcs = pgaio_uring_procs(); bool found; + char *shmem; + size_t ring_mem_remain = 0; + char *ring_mem_next = 0; - pgaio_uring_contexts = (PgAioUringContext *) - ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found); - + /* + * We allocate memory for all PgAioUringContext instances and, if + * supported, the memory required for each of the io_uring instances, in + * one ShmemInitStruct(). + */ + shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found); if (found) return; + pgaio_uring_contexts = (PgAioUringContext *) shmem; + shmem += pgaio_uring_context_shmem_size(); + + /* if supported, handle memory alignment / sizing for io_uring memory */ + if (pgaio_uring_caps.mem_init_size > 0) + { + ring_mem_remain = pgaio_uring_ring_shmem_size(); + ring_mem_next = (char *) shmem; + + /* align to page boundary, see also pgaio_uring_ring_shmem_size() */ + ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next); + + /* account for alignment */ + ring_mem_remain -= ring_mem_next - shmem; + shmem += ring_mem_next - shmem; + + shmem += ring_mem_remain; + } + for (int contextno = 0; contextno < TotalProcs; contextno++) { PgAioUringContext *context = &pgaio_uring_contexts[contextno]; @@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time) * be worth using that - also need to evaluate if that causes * noticeable additional contention? */ - ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + + /* + * If supported (c.f. pgaio_uring_check_capabilities()), create ring + * with its data in shared memory. Otherwise fall back to io_uring + * creating a memory mapping for each ring.
+ */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + if (pgaio_uring_caps.mem_init_size > 0) + { + struct io_uring_params p = {0}; + + ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain); + + ring_mem_remain -= ret; + ring_mem_next += ret; + } + else +#endif + { + ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + } + if (ret < 0) { char *hint = NULL; @@ -400,9 +598,9 @@ pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation) while (true) { pgaio_debug_io(DEBUG3, ioh, - "wait_one io_gen: %llu, ref_gen: %llu, cycle %d", - (long long unsigned) ioh->generation, - (long long unsigned) ref_generation, + "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d", + ioh->generation, + ref_generation, waited); if (pgaio_io_was_recycled(ioh, ref_generation, &state) || diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 743cccc2acd..bf8f77e6ff6 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -52,26 +52,26 @@ #define IO_WORKER_WAKEUP_FANOUT 2 -typedef struct AioWorkerSubmissionQueue +typedef struct PgAioWorkerSubmissionQueue { uint32 size; uint32 mask; uint32 head; uint32 tail; - uint32 ios[FLEXIBLE_ARRAY_MEMBER]; -} AioWorkerSubmissionQueue; + uint32 sqes[FLEXIBLE_ARRAY_MEMBER]; +} PgAioWorkerSubmissionQueue; -typedef struct AioWorkerSlot +typedef struct PgAioWorkerSlot { Latch *latch; bool in_use; -} AioWorkerSlot; +} PgAioWorkerSlot; -typedef struct AioWorkerControl +typedef struct PgAioWorkerControl { uint64 idle_worker_mask; - AioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]; -} AioWorkerControl; + PgAioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]; +} PgAioWorkerControl; static size_t pgaio_worker_shmem_size(void); @@ -96,8 +96,8 @@ int io_workers = 3; static int io_worker_queue_size = 64; static int MyIoWorkerId; -static AioWorkerSubmissionQueue *io_worker_submission_queue; -static AioWorkerControl *io_worker_control; +static PgAioWorkerSubmissionQueue *io_worker_submission_queue; +static PgAioWorkerControl *io_worker_control; static size_t @@ -106,15 +106,15 @@ pgaio_worker_queue_shmem_size(int *queue_size) /* Round size up to next power of two so we can make a mask. */ *queue_size = pg_nextpower2_32(io_worker_queue_size); - return offsetof(AioWorkerSubmissionQueue, ios) + + return offsetof(PgAioWorkerSubmissionQueue, sqes) + sizeof(uint32) * *queue_size; } static size_t pgaio_worker_control_shmem_size(void) { - return offsetof(AioWorkerControl, workers) + - sizeof(AioWorkerSlot) * MAX_IO_WORKERS; + return offsetof(PgAioWorkerControl, workers) + + sizeof(PgAioWorkerSlot) * MAX_IO_WORKERS; } static size_t @@ -162,7 +162,7 @@ pgaio_worker_shmem_init(bool first_time) } static int -pgaio_choose_idle_worker(void) +pgaio_worker_choose_idle(void) { int worker; @@ -172,6 +172,7 @@ pgaio_choose_idle_worker(void) /* Find the lowest bit position, and clear it. 
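The idle-worker mask manipulated here is a plain 64-bit bitmap: choosing a worker means finding the lowest set bit and clearing it, which is why the new Assert can insist the chosen slot is in use. A standalone sketch of the bit trick, with GCC/Clang's __builtin_ctzll standing in for PostgreSQL's pg_rightmost_one_pos64():

#include <stdint.h>
#include <stdio.h>

/* Pick and clear the lowest set bit; returns -1 if no worker is idle. */
static int
choose_idle(uint64_t *mask)
{
	int			worker;

	if (*mask == 0)
		return -1;
	worker = __builtin_ctzll(*mask);	/* lowest set bit position */
	*mask &= *mask - 1;					/* clear exactly that bit */
	return worker;
}

int
main(void)
{
	uint64_t	idle = (1ULL << 2) | (1ULL << 5);

	printf("%d\n", choose_idle(&idle));	/* prints 2 */
	printf("%d\n", choose_idle(&idle));	/* prints 5 */
	printf("%d\n", choose_idle(&idle));	/* prints -1: none idle */
	return 0;
}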
*/ worker = pg_rightmost_one_pos64(io_worker_control->idle_worker_mask); io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << worker); + Assert(io_worker_control->workers[worker].in_use); return worker; } @@ -179,7 +180,7 @@ pgaio_choose_idle_worker(void) static bool pgaio_worker_submission_queue_insert(PgAioHandle *ioh) { - AioWorkerSubmissionQueue *queue; + PgAioWorkerSubmissionQueue *queue; uint32 new_head; queue = io_worker_submission_queue; @@ -191,7 +192,7 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh) return false; /* full */ } - queue->ios[queue->head] = pgaio_io_get_id(ioh); + queue->sqes[queue->head] = pgaio_io_get_id(ioh); queue->head = new_head; return true; @@ -200,14 +201,14 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh) static uint32 pgaio_worker_submission_queue_consume(void) { - AioWorkerSubmissionQueue *queue; + PgAioWorkerSubmissionQueue *queue; uint32 result; queue = io_worker_submission_queue; if (queue->tail == queue->head) return UINT32_MAX; /* empty */ - result = queue->ios[queue->tail]; + result = queue->sqes[queue->tail]; queue->tail = (queue->tail + 1) & (queue->size - 1); return result; @@ -240,37 +241,37 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh) } static void -pgaio_worker_submit_internal(int nios, PgAioHandle *ios[]) +pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios) { PgAioHandle *synchronous_ios[PGAIO_SUBMIT_BATCH_SIZE]; int nsync = 0; Latch *wakeup = NULL; int worker; - Assert(nios <= PGAIO_SUBMIT_BATCH_SIZE); + Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE); LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); - for (int i = 0; i < nios; ++i) + for (int i = 0; i < num_staged_ios; ++i) { - Assert(!pgaio_worker_needs_synchronous_execution(ios[i])); - if (!pgaio_worker_submission_queue_insert(ios[i])) + Assert(!pgaio_worker_needs_synchronous_execution(staged_ios[i])); + if (!pgaio_worker_submission_queue_insert(staged_ios[i])) { /* * We'll do it synchronously, but only after we've sent as many as * we can to workers, to maximize concurrency. */ - synchronous_ios[nsync++] = ios[i]; + synchronous_ios[nsync++] = staged_ios[i]; continue; } if (wakeup == NULL) { /* Choose an idle worker to wake up if we haven't already. */ - worker = pgaio_choose_idle_worker(); + worker = pgaio_worker_choose_idle(); if (worker >= 0) wakeup = io_worker_control->workers[worker].latch; - pgaio_debug_io(DEBUG4, ios[i], + pgaio_debug_io(DEBUG4, staged_ios[i], "choosing worker %d", worker); } @@ -316,6 +317,7 @@ pgaio_worker_die(int code, Datum arg) Assert(io_worker_control->workers[MyIoWorkerId].in_use); Assert(io_worker_control->workers[MyIoWorkerId].latch == MyLatch); + io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << MyIoWorkerId); io_worker_control->workers[MyIoWorkerId].in_use = false; io_worker_control->workers[MyIoWorkerId].latch = NULL; LWLockRelease(AioWorkerSubmissionQueueLock); @@ -461,7 +463,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) int nwakeups = 0; int worker; - /* Try to get a job to do. */ + /* + * Try to get a job to do. + * + * The lwlock acquisition also provides the necessary memory barrier + * to ensure that we don't see outdated data in the handle. 
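The submission queue renamed here (ios becomes sqes) is a power-of-two ring buffer: pgaio_worker_queue_shmem_size() rounds the requested size up with pg_nextpower2_32(), so head and tail wrap with a cheap bitwise AND rather than a modulo, and UINT32_MAX doubles as the empty sentinel. A minimal standalone sketch of the same insert/consume logic, assuming a fixed power-of-two capacity:

#include <stdint.h>
#include <stdio.h>

#define QUEUE_SIZE 8			/* must be a power of two */

typedef struct Queue
{
	uint32_t	head;
	uint32_t	tail;
	uint32_t	ids[QUEUE_SIZE];
} Queue;

static int
queue_insert(Queue *q, uint32_t id)
{
	uint32_t	new_head = (q->head + 1) & (QUEUE_SIZE - 1);

	if (new_head == q->tail)
		return 0;				/* full (one slot is sacrificed) */
	q->ids[q->head] = id;
	q->head = new_head;
	return 1;
}

static uint32_t
queue_consume(Queue *q)
{
	uint32_t	id;

	if (q->tail == q->head)
		return UINT32_MAX;		/* empty */
	id = q->ids[q->tail];
	q->tail = (q->tail + 1) & (QUEUE_SIZE - 1);
	return id;
}

int
main(void)
{
	Queue		q = {0};

	queue_insert(&q, 42);
	printf("%u\n", queue_consume(&q));	/* prints 42 */
	return 0;
}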
+ */ LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX) { @@ -483,7 +490,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) IO_WORKER_WAKEUP_FANOUT); for (int i = 0; i < nwakeups; ++i) { - if ((worker = pgaio_choose_idle_worker()) < 0) + if ((worker = pgaio_worker_choose_idle()) < 0) break; latches[nlatches++] = io_worker_control->workers[worker].latch; } @@ -568,6 +575,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) } CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } } error_context_stack = errcallback.previous; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f93131a645e..67431208e7f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2743,12 +2743,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * because mdread doesn't complain about reads beyond EOF (when * zero_damaged_pages is ON) and so a previous attempt to read a block * beyond EOF could have left a "valid" zero-filled buffer. - * Unfortunately, we have also seen this case occurring because of - * buggy Linux kernels that sometimes return an lseek(SEEK_END) result - * that doesn't account for a recent write. In that situation, the - * pre-existing buffer would contain valid data that we don't want to - * overwrite. Since the legitimate cases should always have left a - * zero-filled buffer, complain if not PageIsNew. + * + * This has also been observed when the relation was overwritten by an + * external process. Since the legitimate cases should always have + * left a zero-filled buffer, complain if not PageIsNew. */ if (existing_id >= 0) { @@ -2778,8 +2776,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", existing_hdr->tag.blockNum, - relpath(bmr.smgr->smgr_rlocator, fork).str), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); + relpath(bmr.smgr->smgr_rlocator, fork).str))); /* * We *must* do smgr[zero]extend before succeeding, else the page @@ -3339,10 +3336,10 @@ UnpinBufferNoOwner(BufferDesc *buf) * BufferSync -- Write out all dirty buffers in the pool. * * This is called at checkpoint time to write out all dirty shared buffers. - * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE - * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, - * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even - * unlogged buffers, which are otherwise skipped. The remaining flags + * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is + * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, + * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write + * even unlogged buffers, which are otherwise skipped. The remaining flags * currently have no effect here. */ static void @@ -3367,7 +3364,7 @@ BufferSync(int flags) * recovery, we write all dirty buffers. 
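The new ConfigReloadPending block brings I/O workers in line with PostgreSQL's standard reload idiom: the SIGHUP handler merely sets a flag, and the main loop acts on it at a safe point. A generic standalone sketch of that flag-and-poll shape; in the backend, the puts() placeholder is ProcessConfigFile(PGC_SIGHUP):

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t reload_pending = 0;

static void
sighup_handler(int signo)
{
	(void) signo;
	reload_pending = 1;			/* only set a flag in the handler */
}

int
main(void)
{
	signal(SIGHUP, sighup_handler);

	for (int i = 0; i < 3; i++)
	{
		raise(SIGHUP);			/* stand-in for "pg_ctl reload" */

		/* main loop: act on the flag at a safe point */
		if (reload_pending)
		{
			reload_pending = 0;
			puts("reloading configuration");	/* ProcessConfigFile() here */
		}
	}
	return 0;
}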
*/ if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | - CHECKPOINT_FLUSH_ALL)))) + CHECKPOINT_FLUSH_UNLOGGED)))) mask |= BM_PERMANENT; /* @@ -4550,11 +4547,9 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, if (RelFileLocatorBackendIsTemp(rlocator)) { if (rlocator.backend == MyProcNumber) - { - for (j = 0; j < nforks; j++) - DropRelationLocalBuffers(rlocator.locator, forkNum[j], - firstDelBlock[j]); - } + DropRelationLocalBuffers(rlocator.locator, forkNum, nforks, + firstDelBlock); + return; } @@ -7320,7 +7315,7 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, affected_count > 1 ? errdetail("Block %u held first zeroed page.", first + first_off) : 0, - errhint("See server log for details about the other %u invalid block(s).", + errhint("See server log for details about the other %d invalid block(s).", affected_count + checkfail_count - 1)); return; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 63101d56a07..3c0d20f4659 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -629,7 +629,7 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) */ if (check_unreferenced && (LocalRefCount[bufid] != 0 || BUF_STATE_GET_REFCOUNT(buf_state) != 0)) - elog(ERROR, "block %u of %s is still referenced (local %u)", + elog(ERROR, "block %u of %s is still referenced (local %d)", bufHdr->tag.blockNum, relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber, @@ -660,10 +660,11 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) * See DropRelationBuffers in bufmgr.c for more notes. */ void -DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, - BlockNumber firstDelBlock) +DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, + int nforks, BlockNumber *firstDelBlock) { int i; + int j; for (i = 0; i < NLocBuffer; i++) { @@ -672,12 +673,18 @@ DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, buf_state = pg_atomic_read_u32(&bufHdr->state); - if ((buf_state & BM_TAG_VALID) && - BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum && - bufHdr->tag.blockNum >= firstDelBlock) + if (!(buf_state & BM_TAG_VALID) || + !BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator)) + continue; + + for (j = 0; j < nforks; j++) { - InvalidateLocalBuffer(bufHdr, true); + if (BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && + bufHdr->tag.blockNum >= firstDelBlock[j]) + { + InvalidateLocalBuffer(bufHdr, true); + break; + } } } } @@ -925,10 +932,11 @@ GetLocalBufferStorage(void) num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); /* Buffers should be I/O aligned. */ - cur_block = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, - MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); + cur_block = MemoryContextAllocAligned(LocalBufferContext, + num_bufs * BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); + next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0e8299dd556..a4ec7959f31 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -400,25 +400,22 @@ pg_fsync(int fd) * portable, even if it runs ok on the current system. * * We assert here that a descriptor for a file was opened with write - * permissions (either O_RDWR or O_WRONLY) and for a directory without - * write permissions (O_RDONLY). 
+ * permissions (i.e., not O_RDONLY) and for a directory without write + * permissions (O_RDONLY). Notice that the assertion check is made even + * if fsync() is disabled. * - * Ignore any fstat errors and let the follow-up fsync() do its work. - * Doing this sanity check here counts for the case where fsync() is - * disabled. + * If fstat() fails, ignore it and let the follow-up fsync() complain. */ if (fstat(fd, &st) == 0) { int desc_flags = fcntl(fd, F_GETFL); - /* - * O_RDONLY is historically 0, so just make sure that for directories - * no write flags are used. - */ + desc_flags &= O_ACCMODE; + if (S_ISDIR(st.st_mode)) - Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); + Assert(desc_flags == O_RDONLY); else - Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); + Assert(desc_flags != O_RDONLY); } errno = 0; #endif diff --git a/src/backend/storage/file/fileset.c b/src/backend/storage/file/fileset.c index 64141c7cb91..4d5ee353fd7 100644 --- a/src/backend/storage/file/fileset.c +++ b/src/backend/storage/file/fileset.c @@ -185,7 +185,7 @@ FileSetPath(char *path, FileSet *fileset, Oid tablespace) static Oid ChooseTablespace(const FileSet *fileset, const char *name) { - uint32 hash = hash_any((const unsigned char *) name, strlen(name)); + uint32 hash = hash_bytes((const unsigned char *) name, strlen(name)); return fileset->tablespaces[hash % fileset->ntablespaces]; } diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c index 1d4fd31ffed..1682cc6d34c 100644 --- a/src/backend/storage/ipc/dsm_registry.c +++ b/src/backend/storage/ipc/dsm_registry.c @@ -15,6 +15,20 @@ * current backend. This function guarantees that only one backend * initializes the segment and that all other backends just attach it. * + * A DSA can be created in or retrieved from the registry by calling + * GetNamedDSA(). As with GetNamedDSMSegment(), if a DSA with the provided + * name does not yet exist, it is created. Otherwise, GetNamedDSA() + * ensures the DSA is attached to the current backend. This function + * guarantees that only one backend initializes the DSA and that all other + * backends just attach it. + * + * A dshash table can be created in or retrieved from the registry by + * calling GetNamedDSHash(). As with GetNamedDSMSegment(), if a hash + * table with the provided name does not yet exist, it is created. + * Otherwise, GetNamedDSHash() ensures the hash table is attached to the + * current backend. This function guarantees that only one backend + * initializes the table and that all other backends just attach it. 
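From the caller's side, the new registry entry points are single-call attach-or-create operations. A sketch of how an extension might use GetNamedDSA(); the extension name and the caching variable are hypothetical, and the per-backend cache matters because, per the comments above, the function should be called at most once per DSA in each backend:

#include "postgres.h"

#include "storage/dsm_registry.h"
#include "utils/dsa.h"

/* hypothetical per-backend cache of the attached area */
static dsa_area *my_area = NULL;

static dsa_area *
get_my_area(void)
{
	bool		found;

	if (my_area == NULL)
	{
		/* creates the DSA on first use cluster-wide, attaches otherwise */
		my_area = GetNamedDSA("my_extension_area", &found);
		elog(DEBUG1, "my_extension_area %s",
			 found ? "attached" : "created");
	}
	return my_area;
}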
+ * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -26,12 +40,20 @@ #include "postgres.h" +#include "funcapi.h" #include "lib/dshash.h" #include "storage/dsm_registry.h" #include "storage/lwlock.h" #include "storage/shmem.h" +#include "utils/builtins.h" #include "utils/memutils.h" +#define DSMR_NAME_LEN 128 + +#define DSMR_DSA_TRANCHE_SUFFIX " DSA" +#define DSMR_DSA_TRANCHE_SUFFIX_LEN (sizeof(DSMR_DSA_TRANCHE_SUFFIX) - 1) +#define DSMR_DSA_TRANCHE_NAME_LEN (DSMR_NAME_LEN + DSMR_DSA_TRANCHE_SUFFIX_LEN) + typedef struct DSMRegistryCtxStruct { dsa_handle dsah; @@ -40,15 +62,55 @@ typedef struct DSMRegistryCtxStruct static DSMRegistryCtxStruct *DSMRegistryCtx; -typedef struct DSMRegistryEntry +typedef struct NamedDSMState { - char name[64]; dsm_handle handle; size_t size; +} NamedDSMState; + +typedef struct NamedDSAState +{ + dsa_handle handle; + int tranche; + char tranche_name[DSMR_DSA_TRANCHE_NAME_LEN]; +} NamedDSAState; + +typedef struct NamedDSHState +{ + NamedDSAState dsa; + dshash_table_handle handle; + int tranche; + char tranche_name[DSMR_NAME_LEN]; +} NamedDSHState; + +typedef enum DSMREntryType +{ + DSMR_ENTRY_TYPE_DSM, + DSMR_ENTRY_TYPE_DSA, + DSMR_ENTRY_TYPE_DSH, +} DSMREntryType; + +static const char *const DSMREntryTypeNames[] = +{ + [DSMR_ENTRY_TYPE_DSM] = "segment", + [DSMR_ENTRY_TYPE_DSA] = "area", + [DSMR_ENTRY_TYPE_DSH] = "hash", +}; + +typedef struct DSMRegistryEntry +{ + char name[DSMR_NAME_LEN]; + DSMREntryType type; + union + { + NamedDSMState dsm; + NamedDSAState dsa; + NamedDSHState dsh; + } data; } DSMRegistryEntry; static const dshash_parameters dsh_params = { - offsetof(DSMRegistryEntry, handle), + offsetof(DSMRegistryEntry, type), sizeof(DSMRegistryEntry), dshash_strcmp, dshash_strhash, @@ -141,7 +203,7 @@ GetNamedDSMSegment(const char *name, size_t size, ereport(ERROR, (errmsg("DSM segment name cannot be empty"))); - if (strlen(name) >= offsetof(DSMRegistryEntry, handle)) + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) ereport(ERROR, (errmsg("DSM segment name too long"))); @@ -158,32 +220,39 @@ GetNamedDSMSegment(const char *name, size_t size, entry = dshash_find_or_insert(dsm_registry_table, name, found); if (!(*found)) { + NamedDSMState *state = &entry->data.dsm; + dsm_segment *seg; + + entry->type = DSMR_ENTRY_TYPE_DSM; + /* Initialize the segment. */ - dsm_segment *seg = dsm_create(size, 0); + seg = dsm_create(size, 0); dsm_pin_segment(seg); dsm_pin_mapping(seg); - entry->handle = dsm_segment_handle(seg); - entry->size = size; + state->handle = dsm_segment_handle(seg); + state->size = size; ret = dsm_segment_address(seg); if (init_callback) (*init_callback) (ret); } - else if (entry->size != size) - { + else if (entry->type != DSMR_ENTRY_TYPE_DSM) ereport(ERROR, - (errmsg("requested DSM segment size does not match size of " - "existing segment"))); - } + (errmsg("requested DSM segment does not match type of existing entry"))); + else if (entry->data.dsm.size != size) + ereport(ERROR, + (errmsg("requested DSM segment size does not match size of existing segment"))); else { - dsm_segment *seg = dsm_find_mapping(entry->handle); + NamedDSMState *state = &entry->data.dsm; + dsm_segment *seg; /* If the existing segment is not already attached, attach it now. 
*/ + seg = dsm_find_mapping(state->handle); if (seg == NULL) { - seg = dsm_attach(entry->handle); + seg = dsm_attach(state->handle); if (seg == NULL) elog(ERROR, "could not map dynamic shared memory segment"); @@ -198,3 +267,220 @@ GetNamedDSMSegment(const char *name, size_t size, return ret; } + +/* + * Initialize or attach a named DSA. + * + * This routine returns a pointer to the DSA. A new LWLock tranche ID will be + * generated if needed. Note that the lock tranche will be registered with the + * provided name. Also note that this should be called at most once for a + * given DSA in each backend. + */ +dsa_area * +GetNamedDSA(const char *name, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dsa_area *ret; + + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSA name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSA name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. */ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + if (!(*found)) + { + NamedDSAState *state = &entry->data.dsa; + + entry->type = DSMR_ENTRY_TYPE_DSA; + + /* Initialize the LWLock tranche for the DSA. */ + state->tranche = LWLockNewTrancheId(); + strcpy(state->tranche_name, name); + LWLockRegisterTranche(state->tranche, state->tranche_name); + + /* Initialize the DSA. */ + ret = dsa_create(state->tranche); + dsa_pin(ret); + dsa_pin_mapping(ret); + + /* Store handle for other backends to use. */ + state->handle = dsa_get_handle(ret); + } + else if (entry->type != DSMR_ENTRY_TYPE_DSA) + ereport(ERROR, + (errmsg("requested DSA does not match type of existing entry"))); + else + { + NamedDSAState *state = &entry->data.dsa; + + if (dsa_is_attached(state->handle)) + ereport(ERROR, + (errmsg("requested DSA already attached to current process"))); + + /* Initialize existing LWLock tranche for the DSA. */ + LWLockRegisterTranche(state->tranche, state->tranche_name); + + /* Attach to existing DSA. */ + ret = dsa_attach(state->handle); + dsa_pin_mapping(ret); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} + +/* + * Initialize or attach a named dshash table. + * + * This routine returns the address of the table. The tranche_id member of + * params is ignored; new tranche IDs will be generated if needed. Note that + * the DSA lock tranche will be registered with the provided name with " DSA" + * appended. The dshash lock tranche will be registered with the provided + * name. Also note that this should be called at most once for a given table + * in each backend. + */ +dshash_table * +GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_table *ret; + + Assert(params); + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSHash name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSHash name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. 
*/ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + if (!(*found)) + { + NamedDSAState *dsa_state = &entry->data.dsh.dsa; + NamedDSHState *dsh_state = &entry->data.dsh; + dshash_parameters params_copy; + dsa_area *dsa; + + entry->type = DSMR_ENTRY_TYPE_DSH; + + /* Initialize the LWLock tranche for the DSA. */ + dsa_state->tranche = LWLockNewTrancheId(); + sprintf(dsa_state->tranche_name, "%s%s", name, DSMR_DSA_TRANCHE_SUFFIX); + LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name); + + /* Initialize the LWLock tranche for the dshash table. */ + dsh_state->tranche = LWLockNewTrancheId(); + strcpy(dsh_state->tranche_name, name); + LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name); + + /* Initialize the DSA for the hash table. */ + dsa = dsa_create(dsa_state->tranche); + dsa_pin(dsa); + dsa_pin_mapping(dsa); + + /* Initialize the dshash table. */ + memcpy(¶ms_copy, params, sizeof(dshash_parameters)); + params_copy.tranche_id = dsh_state->tranche; + ret = dshash_create(dsa, ¶ms_copy, NULL); + + /* Store handles for other backends to use. */ + dsa_state->handle = dsa_get_handle(dsa); + dsh_state->handle = dshash_get_hash_table_handle(ret); + } + else if (entry->type != DSMR_ENTRY_TYPE_DSH) + ereport(ERROR, + (errmsg("requested DSHash does not match type of existing entry"))); + else + { + NamedDSAState *dsa_state = &entry->data.dsh.dsa; + NamedDSHState *dsh_state = &entry->data.dsh; + dsa_area *dsa; + + /* XXX: Should we verify params matches what table was created with? */ + + if (dsa_is_attached(dsa_state->handle)) + ereport(ERROR, + (errmsg("requested DSHash already attached to current process"))); + + /* Initialize existing LWLock tranches for the DSA and dshash table. */ + LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name); + LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name); + + /* Attach to existing DSA for the hash table. */ + dsa = dsa_attach(dsa_state->handle); + dsa_pin_mapping(dsa); + + /* Attach to existing dshash table. */ + ret = dshash_attach(dsa, params, dsh_state->handle, NULL); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} + +Datum +pg_get_dsm_registry_allocations(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_seq_status status; + + InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + init_dsm_registry(); + MemoryContextSwitchTo(oldcontext); + + dshash_seq_init(&status, dsm_registry_table, false); + while ((entry = dshash_seq_next(&status)) != NULL) + { + Datum vals[3]; + bool nulls[3] = {0}; + + vals[0] = CStringGetTextDatum(entry->name); + vals[1] = CStringGetTextDatum(DSMREntryTypeNames[entry->type]); + + /* + * Since we can't know the size of DSA/dshash entries without first + * attaching to them, return NULL for those. 
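GetNamedDSHash() likewise collapses the usual create-DSA-then-create-dshash dance into one call. A hypothetical extension-side sketch, assuming the generic dshash_memcmp/dshash_memhash/dshash_memcpy helpers from lib/dshash.h; note that tranche_id is ignored, since the registry generates tranche IDs itself:

#include "postgres.h"

#include "lib/dshash.h"
#include "storage/dsm_registry.h"

typedef struct MyEntry
{
	int64		key;			/* hash key must come first */
	int64		counter;
} MyEntry;

static const dshash_parameters my_params = {
	sizeof(int64),				/* key size */
	sizeof(MyEntry),			/* entry size */
	dshash_memcmp,
	dshash_memhash,
	dshash_memcpy,
	0							/* tranche_id: ignored by GetNamedDSHash() */
};

/* cache the table: call GetNamedDSHash() at most once per backend */
static dshash_table *my_table = NULL;

static dshash_table *
get_my_table(void)
{
	bool		found;

	if (my_table == NULL)
		my_table = GetNamedDSHash("my_extension_hash", &my_params, &found);
	return my_table;
}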
+ */ + if (entry->type == DSMR_ENTRY_TYPE_DSM) + vals[2] = Int64GetDatum(entry->data.dsm.size); + else + nulls[2] = true; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, vals, nulls); + } + dshash_seq_term(&status); + + return (Datum) 0; +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 00c76d05356..2fa045e6b0f 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -51,7 +51,6 @@ #include "storage/sinvaladt.h" #include "utils/guc.h" #include "utils/injection_point.h" -#include "utils/memutils.h" /* GUCs */ int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; @@ -151,7 +150,6 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); - size = add_size(size, MemoryContextReportingShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -345,7 +343,6 @@ CreateOrAttachShmemStructs(void) WaitEventCustomShmemInit(); InjectionPointShmemInit(); AioShmemInit(); - MemoryContextReportingShmemInit(); } /* diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index c6aefd2f688..beadeb5e46a 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -187,9 +187,11 @@ WaitLatch(Latch *latch, int wakeEvents, long timeout, if (!(wakeEvents & WL_LATCH_SET)) latch = NULL; ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); - ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, - (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), - NULL); + + if (IsUnderPostmaster) + ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, + (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), + NULL); if (WaitEventSetWait(LatchWaitSet, (wakeEvents & WL_TIMEOUT) ? timeout : -1, diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e5b945a9ee3..bf987aed8d3 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1622,58 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid) return false; } -/* - * TransactionIdIsActive -- is xid the top-level XID of an active backend? - * - * This differs from TransactionIdIsInProgress in that it ignores prepared - * transactions, as well as transactions running on the primary if we're in - * hot standby. Also, we ignore subtransactions since that's not needed - * for current uses. - */ -bool -TransactionIdIsActive(TransactionId xid) -{ - bool result = false; - ProcArrayStruct *arrayP = procArray; - TransactionId *other_xids = ProcGlobal->xids; - int i; - - /* - * Don't bother checking a transaction older than RecentXmin; it could not - * possibly still be running. - */ - if (TransactionIdPrecedes(xid, RecentXmin)) - return false; - - LWLockAcquire(ProcArrayLock, LW_SHARED); - - for (i = 0; i < arrayP->numProcs; i++) - { - int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; - TransactionId pxid; - - /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(other_xids[i]); - - if (!TransactionIdIsValid(pxid)) - continue; - - if (proc->pid == 0) - continue; /* ignore prepared transactions */ - - if (TransactionIdEquals(pxid, xid)) - { - result = true; - break; - } - } - - LWLockRelease(ProcArrayLock); - - return result; -} - /* * Determine XID horizons. 
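Since the latch.c hunk above changes how WaitLatch() maintains its reusable wait-event set, a reminder of the canonical caller-side loop may help. This is a schematic backend-style fragment, with WAIT_EVENT_PG_SLEEP standing in for whatever wait event a real caller would report:

#include "postgres.h"

#include "miscadmin.h"
#include "storage/latch.h"
#include "utils/wait_event.h"

static void
wait_for_work(void)
{
	for (;;)
	{
		int			rc;

		/* sleep until the latch is set, 1s elapses, or postmaster dies */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
					   1000L,
					   WAIT_EVENT_PG_SLEEP);

		if (rc & WL_LATCH_SET)
		{
			ResetLatch(MyLatch);
			CHECK_FOR_INTERRUPTS();
			break;				/* something set our latch: go do work */
		}
	}
}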
@@ -2866,8 +2814,10 @@ GetRunningTransactionData(void) * * Similar to GetSnapshotData but returns just oldestActiveXid. We include * all PGPROCs with an assigned TransactionId, even VACUUM processes. - * We look at all databases, though there is no need to include WALSender - * since this has no effect on hot standby conflicts. + * + * If allDbs is true, we look at all databases, though there is no need to + * include WALSender since this has no effect on hot standby conflicts. If + * allDbs is false, skip processes attached to other databases. * * This is never executed during recovery so there is no need to look at * KnownAssignedXids. @@ -2875,9 +2825,12 @@ GetRunningTransactionData(void) * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * If inCommitOnly is true, only transactions in the commit critical section + * are considered when computing the oldestActiveXid. */ TransactionId -GetOldestActiveTransactionId(void) +GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs) { ProcArrayStruct *arrayP = procArray; TransactionId *other_xids = ProcGlobal->xids; @@ -2904,6 +2857,8 @@ GetOldestActiveTransactionId(void) for (index = 0; index < arrayP->numProcs; index++) { TransactionId xid; + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(other_xids[index]); @@ -2911,6 +2866,13 @@ GetOldestActiveTransactionId(void) if (!TransactionIdIsNormal(xid)) continue; + if (inCommitOnly && + (proc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + if (!allDbs && proc->databaseId != MyDatabaseId) + continue; + if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index ce69e26d720..087821311cc 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -691,9 +691,6 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT)) HandleLogMemoryContextInterrupt(); - if (CheckProcSignal(PROCSIG_GET_MEMORY_CONTEXT)) - HandleGetMemoryContextInterrupt(); - if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) HandleParallelApplyMessageInterrupt(); @@ -731,7 +728,11 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) void SendCancelRequest(int backendPID, const uint8 *cancel_key, int cancel_key_len) { - Assert(backendPID != 0); + if (backendPID == 0) + { + ereport(LOG, (errmsg("invalid cancel request with PID 0"))); + return; + } /* * See if we have a matching backend. 
Reading the pss_pid and diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c9ae3b45b76..ca3656fc76f 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -679,12 +679,10 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) */ for (i = 0; i < shm_ent_page_count; i++) { - volatile uint64 touch pg_attribute_unused(); - page_ptrs[i] = startptr + (i * os_page_size); if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, page_ptrs[i]); + pg_numa_touch_mem_if_required(page_ptrs[i]); CHECK_FOR_INTERRUPTS(); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 7fa8d9247e0..4222bdab078 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -1376,7 +1376,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) if (xlrec.subxid_overflow) elog(DEBUG2, - "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)", CurrRunningXacts->xcnt, LSN_FORMAT_ARGS(recptr), CurrRunningXacts->oldestRunningXid, @@ -1384,7 +1384,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) CurrRunningXacts->nextXid); else elog(DEBUG2, - "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)", CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, LSN_FORMAT_ARGS(recptr), CurrRunningXacts->oldestRunningXid, diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c index 68b76f2cc18..a874000c8ca 100644 --- a/src/backend/storage/large_object/inv_api.c +++ b/src/backend/storage/large_object/inv_api.c @@ -561,7 +561,7 @@ inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) char data[LOBLKSIZE + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; - } workbuf; + } workbuf = {0}; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; @@ -752,7 +752,7 @@ inv_truncate(LargeObjectDesc *obj_desc, int64 len) char data[LOBLKSIZE + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; - } workbuf; + } workbuf = {0}; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl index 4441b7cba0c..cd3e43c448a 100644 --- a/src/backend/storage/lmgr/generate-lwlocknames.pl +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -10,7 +10,6 @@ use Getopt::Long; my $output_path = '.'; my $lastlockidx = -1; -my $continue = "\n"; GetOptions('outdir:s' => \$output_path); @@ -28,18 +27,24 @@ print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n"; # -# First, record the predefined LWLocks listed in wait_event_names.txt. We'll -# cross-check those with the ones in lwlocklist.h. +# First, record the predefined LWLocks and built-in tranches listed in +# wait_event_names.txt. We'll cross-check those with the ones in lwlocklist.h. # +my @wait_event_tranches; my @wait_event_lwlocks; my $record_lwlocks = 0; +my $in_tranches = 0; while (<$wait_event_names>) { chomp; # Check for end marker. 
- last if /^# END OF PREDEFINED LWLOCKS/; + if (/^# END OF PREDEFINED LWLOCKS/) + { + $in_tranches = 1; + next; + } # Skip comments and empty lines. next if /^#/; @@ -55,13 +60,29 @@ while (<$wait_event_names>) # Go to the next line if we are not yet recording LWLocks. next if not $record_lwlocks; + # Stop recording if we reach another section. + last if /^Section:/; + # Record the LWLock. (my $waiteventname, my $waitevendocsentence) = split(/\t/, $_); - push(@wait_event_lwlocks, $waiteventname); + + if ($in_tranches) + { + push(@wait_event_tranches, $waiteventname); + } + else + { + push(@wait_event_lwlocks, $waiteventname); + } } +# +# While gathering the list of predefined LWLocks, cross-check the lists in +# lwlocklist.h with the wait events we just recorded. +# my $in_comment = 0; -my $i = 0; +my $lwlock_count = 0; +my $tranche_count = 0; while (<$lwlocklist>) { chomp; @@ -82,40 +103,72 @@ while (<$lwlocklist>) next; } - die "unable to parse lwlocklist.h line \"$_\"" - unless /^PG_LWLOCK\((\d+),\s+(\w+)\)$/; + # + # Gather list of predefined LWLocks and cross-check with the wait events. + # + if (/^PG_LWLOCK\((\d+),\s+(\w+)\)$/) + { + my ($lockidx, $lockname) = ($1, $2); - (my $lockidx, my $lockname) = ($1, $2); + die "lwlocklist.h not in order" if $lockidx < $lastlockidx; + die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; - die "lwlocklist.h not in order" if $lockidx < $lastlockidx; - die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + die "$lockname defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $lwlock_count >= scalar @wait_event_lwlocks; + die "lists of predefined LWLocks do not match (first mismatch at " + . "$wait_event_lwlocks[$lwlock_count] in wait_event_names.txt and " + . "$lockname in lwlocklist.h)" + if $wait_event_lwlocks[$lwlock_count] ne $lockname; - die "$lockname defined in lwlocklist.h but missing from " - . "wait_event_names.txt" - if $i >= scalar @wait_event_lwlocks; - die "lists of predefined LWLocks do not match (first mismatch at " - . "$wait_event_lwlocks[$i] in wait_event_names.txt and $lockname in " - . "lwlocklist.h)" - if $wait_event_lwlocks[$i] ne $lockname; - $i++; + $lwlock_count++; - while ($lastlockidx < $lockidx - 1) + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + } + $lastlockidx = $lockidx; + + # Add a "Lock" suffix to each lock name, as the C code depends on that. + printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", + $lockname . "Lock"; + + next; + } + + # + # Cross-check the built-in LWLock tranches with the wait events. + # + if (/^PG_LWLOCKTRANCHE\((\w+),\s+(\w+)\)$/) { - ++$lastlockidx; - $continue = ",\n"; + my ($tranche_id, $tranche_name) = ($1, $2); + + die "$tranche_name defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $tranche_count >= scalar @wait_event_tranches; + die + "lists of built-in LWLock tranches do not match (first mismatch at " + . "$wait_event_tranches[$tranche_count] in wait_event_names.txt and " + . "$tranche_name in lwlocklist.h)" + if $wait_event_tranches[$tranche_count] ne $tranche_name; + + $tranche_count++; + + next; } - $lastlockidx = $lockidx; - $continue = ",\n"; - # Add a "Lock" suffix to each lock name, as the C code depends on that - printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", - $lockname . "Lock"; + die "unable to parse lwlocklist.h line \"$_\""; } die - "$wait_event_lwlocks[$i] defined in wait_event_names.txt but missing from " - . 
"lwlocklist.h" - if $i < scalar @wait_event_lwlocks; + "$wait_event_lwlocks[$lwlock_count] defined in wait_event_names.txt but " + . " missing from lwlocklist.h" + if $lwlock_count < scalar @wait_event_lwlocks; + +die + "$wait_event_tranches[$tranche_count] defined in wait_event_names.txt but " + . "missing from lwlocklist.h" + if $tranche_count < scalar @wait_event_tranches; print $h "\n"; printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index f50962983c3..3f6bf70bd3c 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -717,7 +717,10 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, * through, to avoid slowing down the normal case.) */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } @@ -757,7 +760,10 @@ ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure) /* See XactLockTableWait about this case */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 86b06b9223f..62f3471448e 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -51,7 +51,7 @@ /* GUC variables */ int max_locks_per_xact; /* used to set the lock table size */ -bool log_lock_failure = false; +bool log_lock_failures = false; #define NLOCKENTS() \ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) @@ -3539,9 +3539,9 @@ AtPrepare_Locks(void) * but that probably costs more cycles. */ void -PostPrepare_Locks(TransactionId xid) +PostPrepare_Locks(FullTransactionId fxid) { - PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); + PGPROC *newproc = TwoPhaseGetDummyProc(fxid, false); HASH_SEQ_STATUS status; LOCALLOCK *locallock; LOCK *lock; @@ -4324,11 +4324,11 @@ DumpAllLocks(void) * and PANIC anyway. */ void -lock_twophase_recover(TransactionId xid, uint16 info, +lock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, false); + PGPROC *proc = TwoPhaseGetDummyProc(fxid, false); LOCKTAG *locktag; LOCKMODE lockmode; LOCKMETHODID lockmethodid; @@ -4505,7 +4505,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, * starting up into hot standby mode. */ void -lock_twophase_standby_recover(TransactionId xid, uint16 info, +lock_twophase_standby_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; @@ -4524,7 +4524,7 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, if (lockmode == AccessExclusiveLock && locktag->locktag_type == LOCKTAG_RELATION) { - StandbyAcquireAccessExclusiveLock(xid, + StandbyAcquireAccessExclusiveLock(XidFromFullTransactionId(fxid), locktag->locktag_field1 /* dboid */ , locktag->locktag_field2 /* reloid */ ); } @@ -4537,11 +4537,11 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, * Find and release the lock indicated by the 2PC record. 
*/ void -lock_twophase_postcommit(TransactionId xid, uint16 info, +lock_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, true); + PGPROC *proc = TwoPhaseGetDummyProc(fxid, true); LOCKTAG *locktag; LOCKMETHODID lockmethodid; LockMethod lockMethodTable; @@ -4563,10 +4563,10 @@ lock_twophase_postcommit(TransactionId xid, uint16 info, * This is actually just the same as the COMMIT case. */ void -lock_twophase_postabort(TransactionId xid, uint16 info, +lock_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { - lock_twophase_postcommit(xid, info, recdata, len); + lock_twophase_postcommit(fxid, info, recdata, len); } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 5148ef982e3..ec9c345ffdf 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -122,9 +122,8 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, * own tranche. We absorb the names of these tranches from there into * BuiltinTrancheNames here. * - * 2. There are some predefined tranches for built-in groups of locks. - * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names - * appear in BuiltinTrancheNames[] below. + * 2. There are some predefined tranches for built-in groups of locks defined + * in lwlocklist.h. We absorb the names of these tranches, too. * * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche * or LWLockRegisterTranche. The names of these that are known in the current @@ -135,51 +134,10 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, */ static const char *const BuiltinTrancheNames[] = { #define PG_LWLOCK(id, lockname) [id] = CppAsString(lockname), +#define PG_LWLOCKTRANCHE(id, lockname) [LWTRANCHE_##id] = CppAsString(lockname), #include "storage/lwlocklist.h" #undef PG_LWLOCK - [LWTRANCHE_XACT_BUFFER] = "XactBuffer", - [LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer", - [LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer", - [LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer", - [LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer", - [LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer", - [LWTRANCHE_SERIAL_BUFFER] = "SerialBuffer", - [LWTRANCHE_WAL_INSERT] = "WALInsert", - [LWTRANCHE_BUFFER_CONTENT] = "BufferContent", - [LWTRANCHE_REPLICATION_ORIGIN_STATE] = "ReplicationOriginState", - [LWTRANCHE_REPLICATION_SLOT_IO] = "ReplicationSlotIO", - [LWTRANCHE_LOCK_FASTPATH] = "LockFastPath", - [LWTRANCHE_BUFFER_MAPPING] = "BufferMapping", - [LWTRANCHE_LOCK_MANAGER] = "LockManager", - [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager", - [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin", - [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan", - [LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA", - [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA", - [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType", - [LWTRANCHE_PER_SESSION_RECORD_TYPMOD] = "PerSessionRecordTypmod", - [LWTRANCHE_SHARED_TUPLESTORE] = "SharedTupleStore", - [LWTRANCHE_SHARED_TIDBITMAP] = "SharedTidBitmap", - [LWTRANCHE_PARALLEL_APPEND] = "ParallelAppend", - [LWTRANCHE_PER_XACT_PREDICATE_LIST] = "PerXactPredicateList", - [LWTRANCHE_PGSTATS_DSA] = "PgStatsDSA", - [LWTRANCHE_PGSTATS_HASH] = "PgStatsHash", - [LWTRANCHE_PGSTATS_DATA] = "PgStatsData", - [LWTRANCHE_LAUNCHER_DSA] = "LogicalRepLauncherDSA", - [LWTRANCHE_LAUNCHER_HASH] = 
"LogicalRepLauncherHash", - [LWTRANCHE_DSM_REGISTRY_DSA] = "DSMRegistryDSA", - [LWTRANCHE_DSM_REGISTRY_HASH] = "DSMRegistryHash", - [LWTRANCHE_COMMITTS_SLRU] = "CommitTsSLRU", - [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultixactOffsetSLRU", - [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultixactMemberSLRU", - [LWTRANCHE_NOTIFY_SLRU] = "NotifySLRU", - [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", - [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", - [LWTRANCHE_XACT_SLRU] = "XactSLRU", - [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", - [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", - [LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE] = "MemoryContextReportingState", - [LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC] = "MemoryContextReportingPerProcess", +#undef PG_LWLOCKTRANCHE }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d82114ffca1..c07fb588355 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -191,7 +191,7 @@ * AtPrepare_PredicateLocks(void); * PostPrepare_PredicateLocks(TransactionId xid); * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); - * predicatelock_twophase_recover(TransactionId xid, uint16 info, + * predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, * void *recdata, uint32 len); */ @@ -4856,7 +4856,7 @@ AtPrepare_PredicateLocks(void) * anyway. We only need to clean up our local state. */ void -PostPrepare_PredicateLocks(TransactionId xid) +PostPrepare_PredicateLocks(FullTransactionId fxid) { if (MySerializableXact == InvalidSerializableXact) return; @@ -4879,12 +4879,12 @@ PostPrepare_PredicateLocks(TransactionId xid) * commits or aborts. */ void -PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) +PredicateLockTwoPhaseFinish(FullTransactionId fxid, bool isCommit) { SERIALIZABLEXID *sxid; SERIALIZABLEXIDTAG sxidtag; - sxidtag.xid = xid; + sxidtag.xid = XidFromFullTransactionId(fxid); LWLockAcquire(SerializableXactHashLock, LW_SHARED); sxid = (SERIALIZABLEXID *) @@ -4906,10 +4906,11 @@ PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) * Re-acquire a predicate lock belonging to a transaction that was prepared. */ void -predicatelock_twophase_recover(TransactionId xid, uint16 info, +predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePredicateRecord *record; + TransactionId xid = XidFromFullTransactionId(fxid); Assert(len == sizeof(TwoPhasePredicateRecord)); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index f194e6b3dcc..e9ef0fbfe32 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -50,7 +50,6 @@ #include "storage/procsignal.h" #include "storage/spin.h" #include "storage/standby.h" -#include "utils/memutils.h" #include "utils/timeout.h" #include "utils/timestamp.h" diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index a7d1fec981f..14d5fc0b196 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -492,7 +492,7 @@ static int ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { int32 len; - char *buf; + char *buf = NULL; ProtocolVersion proto; MemoryContext oldcontext; @@ -516,7 +516,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) * scanners, which may be less benign, but it's not really our job to * notice those.) 
*/ - return STATUS_ERROR; + goto fail; } if (pq_getbytes(((char *) &len) + 1, 3) == EOF) @@ -526,7 +526,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } len = pg_ntoh32(len); @@ -538,7 +538,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid length of startup packet"))); - return STATUS_ERROR; + goto fail; } /* @@ -554,7 +554,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } pq_endmsgread(); @@ -568,7 +568,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { ProcessCancelRequestPacket(port, buf, len); /* Not really an error, but we don't want to proceed further */ - return STATUS_ERROR; + goto fail; } if (proto == NEGOTIATE_SSL_CODE && !ssl_done) @@ -607,14 +607,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send SSL negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef USE_SSL if (SSLok == 'S' && secure_open_server(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. If we do, * it was received before we performed the SSL handshake, so it wasn't @@ -661,14 +663,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send GSSAPI negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef ENABLE_GSS if (GSSok == 'G' && secure_open_gssapi(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. 
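The rewrite of ProcessStartupPacket()'s early returns into goto fail is the classic single-exit cleanup idiom. One wrinkle worth noting: unlike free(), pfree() must not be passed NULL, hence the if (buf) guard in the patch's fail path. A standalone sketch of the shape, using malloc()/free() as stand-ins for palloc()/pfree():

#include <stdlib.h>

#define STATUS_OK		0
#define STATUS_ERROR	(-1)

static int
read_packet(void)
{
	char	   *buf = NULL;

	buf = malloc(64);
	if (buf == NULL)
		goto fail;

	/* ... several validation steps, each of which may bail out ... */
	if (0 /* e.g. invalid length */ )
		goto fail;

	free(buf);
	return STATUS_OK;

fail:
	if (buf)					/* mirrors the patch's pfree() guard */
		free(buf);
	return STATUS_ERROR;
}

int
main(void)
{
	return read_packet() == STATUS_OK ? 0 : 1;
}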
If we do, * it was received before we performed the GSS handshake, so it wasn't @@ -863,7 +867,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) */ MemoryContextSwitchTo(oldcontext); + pfree(buf); + return STATUS_OK; + +fail: + /* be tidy, just to avoid Valgrind complaints */ + if (buf) + pfree(buf); + + return STATUS_ERROR; } /* @@ -881,7 +894,7 @@ ProcessCancelRequestPacket(Port *port, void *pkt, int pktlen) { ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid length of query cancel packet"))); + errmsg("invalid length of cancel request packet"))); return; } len = pktlen - offsetof(CancelRequestPacket, cancelAuthCode); @@ -889,7 +902,7 @@ ProcessCancelRequestPacket(Port *port, void *pkt, int pktlen) { ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid length of query cancel key"))); + errmsg("invalid length of cancel key in cancel request packet"))); return; } @@ -1077,7 +1090,7 @@ check_log_connections(char **newval, void **extra, GucSource source) if (!SplitIdentifierString(rawstring, ',', &elemlist)) { - GUC_check_errdetail("Invalid list syntax in parameter \"log_connections\"."); + GUC_check_errdetail("Invalid list syntax in parameter \"%s\".", "log_connections"); pfree(rawstring); list_free(elemlist); return false; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 1ae51b1b391..0cecd464902 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -988,6 +988,7 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions, stmt->stmt_location = query->stmt_location; stmt->stmt_len = query->stmt_len; stmt->queryId = query->queryId; + stmt->planOrigin = PLAN_STMT_INTERNAL; } else { @@ -1226,7 +1227,6 @@ exec_simple_query(const char *query_string) query_string, commandTag, plantree_list, - NULL, NULL); /* @@ -1683,7 +1683,7 @@ exec_bind_message(StringInfo input_message) { Query *query = lfirst_node(Query, lc); - if (query->queryId != UINT64CONST(0)) + if (query->queryId != INT64CONST(0)) { pgstat_report_query_id(query->queryId, false); break; @@ -2028,15 +2028,14 @@ exec_bind_message(StringInfo input_message) query_string, psrc->commandTag, cplan->stmt_list, - cplan, - psrc); + cplan); /* Portal is defined, set the plan ID based on its contents. 
*/ foreach(lc, portal->stmts) { PlannedStmt *plan = lfirst_node(PlannedStmt, lc); - if (plan->planId != UINT64CONST(0)) + if (plan->planId != INT64CONST(0)) { pgstat_report_plan_id(plan->planId, false); break; @@ -2176,7 +2175,7 @@ exec_execute_message(const char *portal_name, long max_rows) { PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); - if (stmt->queryId != UINT64CONST(0)) + if (stmt->queryId != INT64CONST(0)) { pgstat_report_query_id(stmt->queryId, false); break; @@ -2187,7 +2186,7 @@ exec_execute_message(const char *portal_name, long max_rows) { PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); - if (stmt->planId != UINT64CONST(0)) + if (stmt->planId != INT64CONST(0)) { pgstat_report_plan_id(stmt->planId, false); break; @@ -3535,9 +3534,6 @@ ProcessInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); - if (PublishMemoryContextPending) - ProcessGetMemoryContextInterrupt(); - if (ParallelApplyMessagePending) ProcessParallelApplyMessages(); } @@ -3695,7 +3691,7 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) if (debug_flag >= 1 && context == PGC_POSTMASTER) { - SetConfigOption("log_connections", "true", context, source); + SetConfigOption("log_connections", "all", context, source); SetConfigOption("log_disconnections", "true", context, source); } if (debug_flag >= 2) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 8164d0fbb4f..08791b8f75e 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -19,7 +19,6 @@ #include "access/xact.h" #include "commands/prepare.h" -#include "executor/execdesc.h" #include "executor/executor.h" #include "executor/tstoreReceiver.h" #include "miscadmin.h" @@ -38,9 +37,6 @@ Portal ActivePortal = NULL; static void ProcessQuery(PlannedStmt *plan, - CachedPlan *cplan, - CachedPlanSource *plansource, - int query_index, const char *sourceText, ParamListInfo params, QueryEnvironment *queryEnv, @@ -70,7 +66,6 @@ static void DoPortalRewind(Portal portal); */ QueryDesc * CreateQueryDesc(PlannedStmt *plannedstmt, - CachedPlan *cplan, const char *sourceText, Snapshot snapshot, Snapshot crosscheck_snapshot, @@ -83,7 +78,6 @@ CreateQueryDesc(PlannedStmt *plannedstmt, qd->operation = plannedstmt->commandType; /* operation */ qd->plannedstmt = plannedstmt; /* plan */ - qd->cplan = cplan; /* CachedPlan supplying the plannedstmt */ qd->sourceText = sourceText; /* query text */ qd->snapshot = RegisterSnapshot(snapshot); /* snapshot */ /* RI check snapshot */ @@ -129,9 +123,6 @@ FreeQueryDesc(QueryDesc *qdesc) * PORTAL_ONE_RETURNING, or PORTAL_ONE_MOD_WITH portal * * plan: the plan tree for the query - * cplan: CachedPlan supplying the plan - * plansource: CachedPlanSource supplying the cplan - * query_index: index of the query in plansource->query_list * sourceText: the source text of the query * params: any parameters needed * dest: where to send results @@ -144,9 +135,6 @@ FreeQueryDesc(QueryDesc *qdesc) */ static void ProcessQuery(PlannedStmt *plan, - CachedPlan *cplan, - CachedPlanSource *plansource, - int query_index, const char *sourceText, ParamListInfo params, QueryEnvironment *queryEnv, @@ -158,23 +146,14 @@ ProcessQuery(PlannedStmt *plan, /* * Create the QueryDesc object */ - queryDesc = CreateQueryDesc(plan, cplan, sourceText, + queryDesc = CreateQueryDesc(plan, sourceText, GetActiveSnapshot(), InvalidSnapshot, dest, params, queryEnv, 0); /* - * Prepare the plan for execution + * Call ExecutorStart to prepare the plan for execution */ - if (queryDesc->cplan) - { - 
ExecutorStartCachedPlan(queryDesc, 0, plansource, query_index); - Assert(queryDesc->planstate); - } - else - { - if (!ExecutorStart(queryDesc, 0)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); - } + ExecutorStart(queryDesc, 0); /* * Run the plan to completion. @@ -515,7 +494,6 @@ PortalStart(Portal portal, ParamListInfo params, * the destination to DestNone. */ queryDesc = CreateQueryDesc(linitial_node(PlannedStmt, portal->stmts), - portal->cplan, portal->sourceText, GetActiveSnapshot(), InvalidSnapshot, @@ -535,19 +513,9 @@ PortalStart(Portal portal, ParamListInfo params, myeflags = eflags; /* - * Prepare the plan for execution. + * Call ExecutorStart to prepare the plan for execution */ - if (portal->cplan) - { - ExecutorStartCachedPlan(queryDesc, myeflags, - portal->plansource, 0); - Assert(queryDesc->planstate); - } - else - { - if (!ExecutorStart(queryDesc, myeflags)) - elog(ERROR, "ExecutorStart() failed unexpectedly"); - } + ExecutorStart(queryDesc, myeflags); /* * This tells PortalCleanup to shut down the executor @@ -1221,7 +1189,6 @@ PortalRunMulti(Portal portal, { bool active_snapshot_set = false; ListCell *stmtlist_item; - int query_index = 0; /* * If the destination is DestRemoteExecute, change to DestNone. The @@ -1303,9 +1270,6 @@ PortalRunMulti(Portal portal, { /* statement can set tag string */ ProcessQuery(pstmt, - portal->cplan, - portal->plansource, - query_index, portal->sourceText, portal->portalParams, portal->queryEnv, @@ -1315,9 +1279,6 @@ PortalRunMulti(Portal portal, { /* stmt added by rewrite cannot set tag */ ProcessQuery(pstmt, - portal->cplan, - portal->plansource, - query_index, portal->sourceText, portal->portalParams, portal->queryEnv, @@ -1382,8 +1343,6 @@ PortalRunMulti(Portal portal, */ if (lnext(portal->stmts, stmtlist_item) != NULL) CommandCounterIncrement(); - - query_index++; } /* Pop the snapshot if we pushed one. */ @@ -1391,24 +1350,15 @@ PortalRunMulti(Portal portal, PopActiveSnapshot(); /* - * If a query completion data was supplied, use it. Otherwise use the - * portal's query completion data. - * - * Exception: Clients expect INSERT/UPDATE/DELETE tags to have counts, so - * fake them with zeros. This can happen with DO INSTEAD rules if there - * is no replacement query of the same type as the original. We print "0 - * 0" here because technically there is no query of the matching tag type, - * and printing a non-zero count for a different query type seems wrong, - * e.g. an INSERT that does an UPDATE instead should not print "0 1" if - * one row was updated. See QueryRewrite(), step 3, for details. + * If a command tag was requested and we did not fill in a run-time- + * determined tag above, copy the parse-time tag from the Portal. (There + * might not be any tag there either, in edge cases such as empty prepared + * statements. That's OK.) */ - if (qc && qc->commandTag == CMDTAG_UNKNOWN) - { - if (portal->qc.commandTag != CMDTAG_UNKNOWN) - CopyQueryCompletion(qc, &portal->qc); - /* If the caller supplied a qc, we should have set it by now. 
*/ - Assert(qc->commandTag != CMDTAG_UNKNOWN); - } + if (qc && + qc->commandTag == CMDTAG_UNKNOWN && + portal->qc.commandTag != CMDTAG_UNKNOWN) + CopyQueryCompletion(qc, &portal->qc); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 25fe3d58016..4f4191b0ea6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -943,17 +943,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_CheckPointStmt: - if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - /* translator: %s is name of a SQL command, eg CHECKPOINT */ - errmsg("permission denied to execute %s command", - "CHECKPOINT"), - errdetail("Only roles with privileges of the \"%s\" role may execute this command.", - "pg_checkpoint"))); - - RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT | - (RecoveryInProgress() ? 0 : CHECKPOINT_FORCE)); + ExecCheckpoint(pstate, (CheckPointStmt *) parsetree); break; /* @@ -1244,6 +1234,7 @@ ProcessUtilitySlow(ParseState *pstate, wrapper->utilityStmt = stmt; wrapper->stmt_location = pstmt->stmt_location; wrapper->stmt_len = pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, queryString, @@ -1343,7 +1334,7 @@ ProcessUtilitySlow(ParseState *pstate, */ switch (stmt->subtype) { - case 'T': /* ALTER DOMAIN DEFAULT */ + case AD_AlterDefault: /* * Recursively alter column default for table and, @@ -1353,30 +1344,30 @@ ProcessUtilitySlow(ParseState *pstate, AlterDomainDefault(stmt->typeName, stmt->def); break; - case 'N': /* ALTER DOMAIN DROP NOT NULL */ + case AD_DropNotNull: address = AlterDomainNotNull(stmt->typeName, false); break; - case 'O': /* ALTER DOMAIN SET NOT NULL */ + case AD_SetNotNull: address = AlterDomainNotNull(stmt->typeName, true); break; - case 'C': /* ADD CONSTRAINT */ + case AD_AddConstraint: address = AlterDomainAddConstraint(stmt->typeName, stmt->def, &secondaryObject); break; - case 'X': /* DROP CONSTRAINT */ + case AD_DropConstraint: address = AlterDomainDropConstraint(stmt->typeName, stmt->name, stmt->behavior, stmt->missing_ok); break; - case 'V': /* VALIDATE CONSTRAINT */ + case AD_ValidateConstraint: address = AlterDomainValidateConstraint(stmt->typeName, stmt->name); @@ -1974,6 +1965,7 @@ ProcessUtilityForAlterTable(Node *stmt, AlterTableUtilityContext *context) wrapper->utilityStmt = stmt; wrapper->stmt_location = context->pstmt->stmt_location; wrapper->stmt_len = context->pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, context->queryString, diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c index 63bd193a78a..debfbf956cc 100644 --- a/src/backend/tsearch/dict_ispell.c +++ b/src/backend/tsearch/dict_ispell.c @@ -47,24 +47,30 @@ dispell_init(PG_FUNCTION_ARGS) if (strcmp(defel->defname, "dictfile") == 0) { + char *filename; + if (dictloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple DictFile parameters"))); - NIImportDictionary(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "dict")); + filename = get_tsearch_config_filename(defGetString(defel), + "dict"); + NIImportDictionary(&(d->obj), filename); + pfree(filename); dictloaded = true; } else if (strcmp(defel->defname, "afffile") == 0) { + char *filename; + if (affloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple AffFile parameters"))); - NIImportAffixes(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - 
"affix")); + filename = get_tsearch_config_filename(defGetString(defel), + "affix"); + NIImportAffixes(&(d->obj), filename); + pfree(filename); affloaded = true; } else if (strcmp(defel->defname, "stopwords") == 0) diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 0da5a9d6868..c2773eb01ad 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -199,6 +199,7 @@ skipline: } tsearch_readline_end(&trst); + pfree(filename); d->len = cur; qsort(d->syn, d->len, sizeof(Syn), compareSyn); diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 1bebe36a691..1e6bbde1ca7 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -167,17 +167,17 @@ addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 p static void thesaurusRead(const char *filename, DictThesaurus *d) { + char *real_filename = get_tsearch_config_filename(filename, "ths"); tsearch_readline_state trst; uint32 idsubst = 0; bool useasis = false; char *line; - filename = get_tsearch_config_filename(filename, "ths"); - if (!tsearch_readline_begin(&trst, filename)) + if (!tsearch_readline_begin(&trst, real_filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open thesaurus file \"%s\": %m", - filename))); + real_filename))); while ((line = tsearch_readline(&trst)) != NULL) { @@ -297,6 +297,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) d->nsubst = idsubst; tsearch_readline_end(&trst); + pfree(real_filename); } static TheLexeme * diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index b77d8c23d36..4801fe90089 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -36,7 +36,7 @@ t_isalpha(const char *ptr) { int clen = pg_mblen(ptr); wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ + locale_t mylocale = 0; /* TODO */ if (clen == 1 || database_ctype_is_c) return isalpha(TOUCHAR(ptr)); @@ -51,7 +51,7 @@ t_isalnum(const char *ptr) { int clen = pg_mblen(ptr); wchar_t character[WC_BUF_LEN]; - pg_locale_t mylocale = 0; /* TODO */ + locale_t mylocale = 0; /* TODO */ if (clen == 1 || database_ctype_is_c) return isalnum(TOUCHAR(ptr)); diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 0c1d2bc1109..453a5e5c2ea 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -233,7 +233,7 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, * The text Datums came from an array, so it cannot be compressed or * stored out-of-line -- it's safe to use VARSIZE_ANY*. 
*/ - Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i])); + Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(mcelem[i])) && !VARATT_IS_EXTERNAL(DatumGetPointer(mcelem[i]))); lookup[i].element = (text *) DatumGetPointer(mcelem[i]); lookup[i].frequency = numbers[i]; } diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 79bcd32a063..e2dd3da3aa3 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -299,7 +299,7 @@ TParserInit(char *str, int len) */ if (prs->charmaxlen > 1) { - pg_locale_t mylocale = 0; /* TODO */ + locale_t mylocale = 0; /* TODO */ prs->usewide = true; if (database_ctype_is_c) diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index e1576e64b6d..a290cc4c975 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -320,8 +320,8 @@ pgstat_bestart_initial(void) lbeentry.st_state = STATE_STARTING; lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID; lbeentry.st_progress_command_target = InvalidOid; - lbeentry.st_query_id = UINT64CONST(0); - lbeentry.st_plan_id = UINT64CONST(0); + lbeentry.st_query_id = INT64CONST(0); + lbeentry.st_plan_id = INT64CONST(0); /* * we don't zero st_progress_param here to save cycles; nobody should @@ -599,8 +599,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str) beentry->st_activity_start_timestamp = 0; /* st_xact_start_timestamp and wait_event_info are also disabled */ beentry->st_xact_start_timestamp = 0; - beentry->st_query_id = UINT64CONST(0); - beentry->st_plan_id = UINT64CONST(0); + beentry->st_query_id = INT64CONST(0); + beentry->st_plan_id = INT64CONST(0); proc->wait_event_info = 0; PGSTAT_END_WRITE_ACTIVITY(beentry); } @@ -662,8 +662,8 @@ pgstat_report_activity(BackendState state, const char *cmd_str) */ if (state == STATE_RUNNING) { - beentry->st_query_id = UINT64CONST(0); - beentry->st_plan_id = UINT64CONST(0); + beentry->st_query_id = INT64CONST(0); + beentry->st_plan_id = INT64CONST(0); } if (cmd_str != NULL) @@ -683,7 +683,7 @@ pgstat_report_activity(BackendState state, const char *cmd_str) * -------- */ void -pgstat_report_query_id(uint64 query_id, bool force) +pgstat_report_query_id(int64 query_id, bool force) { volatile PgBackendStatus *beentry = MyBEEntry; @@ -702,7 +702,7 @@ pgstat_report_query_id(uint64 query_id, bool force) * command, so ignore the one provided unless it's an explicit call to * reset the identifier. */ - if (beentry->st_query_id != 0 && !force) + if (beentry->st_query_id != INT64CONST(0) && !force) return; /* @@ -722,7 +722,7 @@ pgstat_report_query_id(uint64 query_id, bool force) * -------- */ void -pgstat_report_plan_id(uint64 plan_id, bool force) +pgstat_report_plan_id(int64 plan_id, bool force) { volatile PgBackendStatus *beentry = MyBEEntry; @@ -1134,7 +1134,7 @@ pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen) * * Return current backend's query identifier. */ -uint64 +int64 pgstat_get_my_query_id(void) { if (!MyBEEntry) @@ -1154,7 +1154,7 @@ pgstat_get_my_query_id(void) * * Return current backend's plan identifier. 
*/ -uint64 +int64 pgstat_get_my_plan_id(void) { if (!MyBEEntry) diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 8b57845e870..6bc91ce0dad 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -212,6 +212,11 @@ int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; PgStat_LocalState pgStatLocal; +/* + * Track pending reports for fixed-numbered stats, used by + * pgstat_report_stat(). + */ +bool pgstat_report_fixed = false; /* ---------- * Local data @@ -370,7 +375,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_off = offsetof(PgStatShared_Backend, stats), .shared_data_len = sizeof(((PgStatShared_Backend *) 0)->stats), - .have_static_pending_cb = pgstat_backend_have_pending_cb, .flush_static_cb = pgstat_backend_flush_cb, .reset_timestamp_cb = pgstat_backend_reset_timestamp_cb, }, @@ -437,7 +441,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats), .flush_static_cb = pgstat_io_flush_cb, - .have_static_pending_cb = pgstat_io_have_pending_cb, .init_shmem_cb = pgstat_io_init_shmem_cb, .reset_all_cb = pgstat_io_reset_all_cb, .snapshot_cb = pgstat_io_snapshot_cb, @@ -455,7 +458,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_SLRU *) 0)->stats), .flush_static_cb = pgstat_slru_flush_cb, - .have_static_pending_cb = pgstat_slru_have_pending_cb, .init_shmem_cb = pgstat_slru_init_shmem_cb, .reset_all_cb = pgstat_slru_reset_all_cb, .snapshot_cb = pgstat_slru_snapshot_cb, @@ -474,7 +476,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .init_backend_cb = pgstat_wal_init_backend_cb, .flush_static_cb = pgstat_wal_flush_cb, - .have_static_pending_cb = pgstat_wal_have_pending_cb, .init_shmem_cb = pgstat_wal_init_shmem_cb, .reset_all_cb = pgstat_wal_reset_all_cb, .snapshot_cb = pgstat_wal_snapshot_cb, @@ -708,29 +709,10 @@ pgstat_report_stat(bool force) } /* Don't expend a clock check if nothing to do */ - if (dlist_is_empty(&pgStatPending)) + if (dlist_is_empty(&pgStatPending) && + !pgstat_report_fixed) { - bool do_flush = false; - - /* Check for pending stats */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->have_static_pending_cb) - continue; - - if (kind_info->have_static_pending_cb()) - { - do_flush = true; - break; - } - } - - if (!do_flush) - return 0; + return 0; } /* @@ -784,16 +766,19 @@ pgstat_report_stat(bool force) partial_flush |= pgstat_flush_pending_entries(nowait); /* flush of other stats kinds */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + if (pgstat_report_fixed) { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; + if (!kind_info) + continue; + if (!kind_info->flush_static_cb) + continue; - partial_flush |= kind_info->flush_static_cb(nowait); + partial_flush |= kind_info->flush_static_cb(nowait); + } } last_flush = now; @@ -815,6 +800,7 @@ pgstat_report_stat(bool force) } pending_since = 0; + pgstat_report_fixed = false; return 0; } diff 
--git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 51256277e8d..8714a85e2d9 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -66,6 +66,7 @@ pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, io_time); backend_has_iostats = true; + pgstat_report_fixed = true; } void @@ -81,6 +82,7 @@ pgstat_count_backend_io_op(IOObject io_object, IOContext io_context, PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes; backend_has_iostats = true; + pgstat_report_fixed = true; } /* @@ -302,18 +304,6 @@ pgstat_flush_backend(bool nowait, bits32 flags) } /* - * Check if there are any backend stats waiting for flush. - */ -bool -pgstat_backend_have_pending_cb(void) -{ - if (!pgstat_tracks_backend_bktype(MyBackendType)) - return false; - - return (backend_has_iostats || pgstat_backend_wal_have_pending()); -} - -/* * Callback to flush out locally pending backend statistics. * * If some stats could not be flushed due to lock contention, return true. diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index d8d26379a57..13ae57ed649 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -80,6 +80,7 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes); have_iostats = true; + pgstat_report_fixed = true; } /* @@ -168,15 +169,6 @@ pgstat_fetch_stat_io(void) } /* - * Check if there any IO stats waiting for flush. - */ -bool -pgstat_io_have_pending_cb(void) -{ - return have_iostats; -} - -/* * Simpler wrapper of pgstat_io_flush_cb() */ void diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 28587e2916b..69df741cbf6 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -744,7 +744,7 @@ PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) * Load the saved counts into our local pgstats state. */ void -pgstat_twophase_postcommit(TransactionId xid, uint16 info, +pgstat_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; @@ -780,7 +780,7 @@ pgstat_twophase_postcommit(TransactionId xid, uint16 info, * as aborted. 
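[Note: the recurring "pgstat_report_fixed = true" hunks replace per-kind have_static_pending_cb polling with one global dirty flag that pgstat_report_stat() checks. A standalone model of that pattern; all names below are invented for illustration:]

#include <stdbool.h>
#include <stdio.h>

static bool report_fixed = false;   /* set whenever fixed-numbered stats change */
static long pending_io_ops = 0;

static void count_io_op(void)
{
    pending_io_ops++;
    report_fixed = true;            /* cheap: no callback scan needed later */
}

static void report_stat(void)
{
    if (!report_fixed)              /* nothing pending: skip the flush entirely */
        return;
    printf("flushing %ld ops\n", pending_io_ops);
    pending_io_ops = 0;
    report_fixed = false;
}

int main(void)
{
    report_stat();                  /* no-op */
    count_io_op();
    report_stat();                  /* flushes */
    return 0;
}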
*/ void -pgstat_twophase_postabort(TransactionId xid, uint16 info, +pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len) { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index 2e33293b000..53e7d534270 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -183,7 +183,7 @@ StatsShmemInit(void) p += MAXALIGN(pgstat_dsa_init_size()); dsa = dsa_create_in_place(ctl->raw_dsa_area, pgstat_dsa_init_size(), - LWTRANCHE_PGSTATS_DSA, 0); + LWTRANCHE_PGSTATS_DSA, NULL); dsa_pin(dsa); /* @@ -255,7 +255,8 @@ pgstat_attach_shmem(void) dsa_pin_mapping(pgStatLocal.dsa); pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, - pgStatLocal.shmem->hash_handle, 0); + pgStatLocal.shmem->hash_handle, + NULL); MemoryContextSwitchTo(oldcontext); } diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index b9e940dde45..7bd8744accb 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -144,15 +144,6 @@ pgstat_get_slru_index(const char *name) } /* - * Check if there are any SLRU stats entries waiting for flush. - */ -bool -pgstat_slru_have_pending_cb(void) -{ - return have_slrustats; -} - -/* * Flush out locally pending SLRU stats entries * * If nowait is true, this function returns false on lock failure. Otherwise @@ -247,6 +238,7 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); have_slrustats = true; + pgstat_report_fixed = true; return &pending_SLRUStats[slru_idx]; } diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 16a1ecb4d90..0d04480d2f6 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -72,6 +72,15 @@ pgstat_fetch_stat_wal(void) } /* + * To determine whether WAL usage happened. + */ +static inline bool +pgstat_wal_have_pending(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records; +} + +/* * Calculate how much WAL usage counters have increased by subtracting the * previous counters from the current ones. * @@ -92,7 +101,7 @@ pgstat_wal_flush_cb(bool nowait) * This function can be called even if nothing at all has happened. Avoid * taking lock for nothing in that case. */ - if (!pgstat_wal_have_pending_cb()) + if (!pgstat_wal_have_pending()) return false; /* @@ -136,15 +145,6 @@ pgstat_wal_init_backend_cb(void) prevWalUsage = pgWalUsage; } -/* - * To determine whether WAL usage happened. - */ -bool -pgstat_wal_have_pending_cb(void) -{ - return pgWalUsage.wal_records != prevWalUsage.wal_records; -} - void pgstat_wal_init_shmem_cb(void *stats) { diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 930321905f1..0be307d2ca0 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -161,7 +161,6 @@ WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated." XACT_GROUP_UPDATE "Waiting for the group leader to update transaction status at transaction end." -MEM_CXT_PUBLISH "Waiting for a process to publish memory information." 
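[Note: the now-static pgstat_wal_have_pending() above reduces "anything to flush?" to comparing a counter against its snapshot from the last flush. The delta check as a standalone model, with illustrative names:]

#include <stdbool.h>
#include <stdio.h>

typedef struct { long wal_records; } WalUsage;

static WalUsage cur  = {0};
static WalUsage prev = {0};         /* snapshot taken at the last flush */

static inline bool wal_have_pending(void)
{
    return cur.wal_records != prev.wal_records;
}

static void flush_wal_stats(void)
{
    if (!wal_have_pending())
        return;
    printf("flushing %ld new records\n", cur.wal_records - prev.wal_records);
    prev = cur;                     /* re-snapshot for the next round */
}

int main(void)
{
    cur.wal_records += 3;
    flush_wal_stats();
    flush_wal_stats();              /* second call is a no-op */
    return 0;
}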
ABI_compatibility: @@ -357,9 +356,13 @@ AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) # -# Predefined LWLocks (i.e., those declared in lwlocknames.h) must be listed -# in the section above and must be listed in the same order as in -# lwlocknames.h. Other LWLocks must be listed in the section below. +# Predefined LWLocks (i.e., those declared at the top of lwlocknames.h) must be +# listed in the section above and must be listed in the same order as in +# lwlocknames.h. +# +# Likewise, the built-in LWLock tranches (i.e., those declared at the bottom of +# lwlocknames.h) must be listed in the section below and must be listed in the +# same order as in lwlocknames.h. # XactBuffer "Waiting for I/O on a transaction status SLRU buffer." @@ -402,6 +405,7 @@ SerialSLRU "Waiting to access the serializable transaction conflict SLRU cache." SubtransSLRU "Waiting to access the sub-transaction SLRU cache." XactSLRU "Waiting to access the transaction status SLRU cache." ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation." +AioUringCompletion "Waiting for another process to complete IO via io_uring." # No "ABI_compatibility" region here as WaitEventLWLock has its own C code. diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 4a233b63c32..ffeacf2b819 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -23,6 +23,7 @@ OBJS = \ arrayutils.o \ ascii.o \ bool.o \ + bytea.o \ cash.o \ char.o \ cryptohashfuncs.o \ diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index ca3c5ee3df3..1213f9106d5 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -135,6 +135,22 @@ static void RoleMembershipCacheCallback(Datum arg, int cacheid, uint32 hashvalue /* + * Test whether an identifier char can be left unquoted in ACLs. + * + * Formerly, we used isalnum() even on non-ASCII characters, resulting in + * unportable behavior. To ensure dump compatibility with old versions, + * we now treat high-bit-set characters as always requiring quoting during + * putid(), but getid() will always accept them without quotes. + */ +static inline bool +is_safe_acl_char(unsigned char c, bool is_getid) +{ + if (IS_HIGHBIT_SET(c)) + return is_getid; + return isalnum(c) || c == '_'; +} + +/* * getid * Consumes the first alphanumeric string (identifier) found in string * 's', ignoring any leading white space. 
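[Note: is_safe_acl_char() above is deliberately asymmetric: getid() accepts high-bit-set bytes unquoted for compatibility with old dumps, while putid() always quotes them. A standalone sketch of the putid() side; simplified, since the real code also doubles embedded quotes:]

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

static bool is_safe_acl_char(unsigned char c, bool is_getid)
{
    if (c & 0x80)                   /* IS_HIGHBIT_SET */
        return is_getid;            /* lenient on input, strict on output */
    return isalnum(c) || c == '_';
}

static void put_acl_id(const char *s)
{
    bool safe = true;

    for (const char *p = s; *p; p++)
    {
        if (!is_safe_acl_char((unsigned char) *p, false))
        {
            safe = false;
            break;
        }
    }
    if (safe)
        printf("%s\n", s);
    else
        printf("\"%s\"\n", s);      /* embedded quotes would need doubling */
}

int main(void)
{
    put_acl_id("alice_1");          /* unquoted */
    put_acl_id("odd name");         /* quoted */
    return 0;
}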
If it finds a double quote @@ -159,21 +175,22 @@ getid(const char *s, char *n, Node *escontext) while (isspace((unsigned char) *s)) s++; - /* This code had better match what putid() does, below */ for (; *s != '\0' && - (isalnum((unsigned char) *s) || - *s == '_' || - *s == '"' || - in_quotes); + (in_quotes || *s == '"' || is_safe_acl_char(*s, true)); s++) { if (*s == '"') { + if (!in_quotes) + { + in_quotes = true; + continue; + } /* safe to look at next char (could be '\0' though) */ if (*(s + 1) != '"') { - in_quotes = !in_quotes; + in_quotes = false; continue; } /* it's an escaped double quote; skip the escaping char */ @@ -207,10 +224,10 @@ putid(char *p, const char *s) const char *src; bool safe = true; + /* Detect whether we need to use double quotes */ for (src = s; *src; src++) { - /* This test had better match what getid() does, above */ - if (!isalnum((unsigned char) *src) && *src != '_') + if (!is_safe_acl_char(*src, false)) { safe = false; break; diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c new file mode 100644 index 00000000000..6e7b914c563 --- /dev/null +++ b/src/backend/utils/adt/bytea.c @@ -0,0 +1,1114 @@ +/*------------------------------------------------------------------------- + * + * bytea.c + * Functions for the bytea type. + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/adt/bytea.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "catalog/pg_collation_d.h" +#include "catalog/pg_type_d.h" +#include "common/int.h" +#include "fmgr.h" +#include "libpq/pqformat.h" +#include "port/pg_bitutils.h" +#include "utils/builtins.h" +#include "utils/bytea.h" +#include "utils/fmgrprotos.h" +#include "utils/memutils.h" +#include "utils/sortsupport.h" +#include "utils/varlena.h" +#include "varatt.h" + +/* GUC variable */ +int bytea_output = BYTEA_OUTPUT_HEX; + +static bytea *bytea_catenate(bytea *t1, bytea *t2); +static bytea *bytea_substring(Datum str, int S, int L, + bool length_not_specified); +static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); + +/* + * bytea_catenate + * Guts of byteacat(), broken out so it can be used by other functions + * + * Arguments can be in short-header form, but not compressed or out-of-line + */ +static bytea * +bytea_catenate(bytea *t1, bytea *t2) +{ + bytea *result; + int len1, + len2, + len; + char *ptr; + + len1 = VARSIZE_ANY_EXHDR(t1); + len2 = VARSIZE_ANY_EXHDR(t2); + + /* paranoia ... probably should throw error instead? */ + if (len1 < 0) + len1 = 0; + if (len2 < 0) + len2 = 0; + + len = len1 + len2 + VARHDRSZ; + result = (bytea *) palloc(len); + + /* Set size of result string... */ + SET_VARSIZE(result, len); + + /* Fill data field of result string... */ + ptr = VARDATA(result); + if (len1 > 0) + memcpy(ptr, VARDATA_ANY(t1), len1); + if (len2 > 0) + memcpy(ptr + len1, VARDATA_ANY(t2), len2); + + return result; +} + +#define PG_STR_GET_BYTEA(str_) \ + DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) + +static bytea * +bytea_substring(Datum str, + int S, + int L, + bool length_not_specified) +{ + int32 S1; /* adjusted start position */ + int32 L1; /* adjusted substring length */ + int32 E; /* end position */ + + /* + * The logic here should generally match text_substring(). 
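[Note: bytea_substring() below clamps the start position to 1 and guards S + L against int32 overflow before computing the end position. The same arithmetic as a standalone function, with __builtin_add_overflow standing in for pg_add_s32_overflow:]

#include <stdint.h>
#include <stdio.h>

/* Computes zero-based start and length (-1 = run to end of string),
 * mirroring the adjusted S1/L1 logic; negative L returns -1 (an error). */
static int substring_bounds(int32_t S, int32_t L, int32_t *start, int32_t *len)
{
    int32_t S1 = S > 1 ? S : 1;
    int32_t E;

    if (L < 0)
        return -1;                  /* negative length not allowed */
    if (__builtin_add_overflow(S, L, &E))
        *len = -1;                  /* S + L overflowed: run to end of string */
    else if (E < 1)
        *len = 0;                   /* substring ends before the string starts */
    else
        *len = E - S1;
    *start = S1 - 1;
    return 0;
}

int main(void)
{
    int32_t start, len;

    substring_bounds(-2, 5, &start, &len);  /* negative start eats into length */
    printf("start=%d len=%d\n", start, len); /* start=0 len=2 */
    return 0;
}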
+ */ + S1 = Max(S, 1); + + if (length_not_specified) + { + /* + * Not passed a length - DatumGetByteaPSlice() grabs everything to the + * end of the string if we pass it a negative value for length. + */ + L1 = -1; + } + else if (L < 0) + { + /* SQL99 says to throw an error for E < S, i.e., negative length */ + ereport(ERROR, + (errcode(ERRCODE_SUBSTRING_ERROR), + errmsg("negative substring length not allowed"))); + L1 = -1; /* silence stupider compilers */ + } + else if (pg_add_s32_overflow(S, L, &E)) + { + /* + * L could be large enough for S + L to overflow, in which case the + * substring must run to end of string. + */ + L1 = -1; + } + else + { + /* + * A zero or negative value for the end position can happen if the + * start was negative or one. SQL99 says to return a zero-length + * string. + */ + if (E < 1) + return PG_STR_GET_BYTEA(""); + + L1 = E - S1; + } + + /* + * If the start position is past the end of the string, SQL99 says to + * return a zero-length string -- DatumGetByteaPSlice() will do that for + * us. We need only convert S1 to zero-based starting position. + */ + return DatumGetByteaPSlice(str, S1 - 1, L1); +} + +static bytea * +bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) +{ + bytea *result; + bytea *s1; + bytea *s2; + int sp_pl_sl; + + /* + * Check for possible integer-overflow cases. For negative sp, throw a + * "substring length" error because that's what should be expected + * according to the spec's definition of OVERLAY(). + */ + if (sp <= 0) + ereport(ERROR, + (errcode(ERRCODE_SUBSTRING_ERROR), + errmsg("negative substring length not allowed"))); + if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range"))); + + s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); + s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); + result = bytea_catenate(s1, t2); + result = bytea_catenate(result, s2); + + return result; +} + +/***************************************************************************** + * USER I/O ROUTINES * + *****************************************************************************/ + +#define VAL(CH) ((CH) - '0') +#define DIG(VAL) ((VAL) + '0') + +/* + * byteain - converts from printable representation of byte array + * + * Non-printable characters must be passed as '\nnn' (octal) and are + * converted to internal form. '\' must be passed as '\\'. 
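[Note: the escaped-format input rules quoted above are easy to model: a standalone decoder for the traditional bytea text syntax, accepting '\nnn' octal and '\\', with error handling reduced to a -1 return:]

#include <stdio.h>

/* Decode escaped bytea text into buf; returns decoded length, or -1 on bad input. */
static int decode_escaped(const char *tp, unsigned char *buf)
{
    unsigned char *rp = buf;

    while (*tp)
    {
        if (tp[0] != '\\')
            *rp++ = (unsigned char) *tp++;
        else if (tp[1] >= '0' && tp[1] <= '3' &&
                 tp[2] >= '0' && tp[2] <= '7' &&
                 tp[3] >= '0' && tp[3] <= '7')
        {
            *rp++ = (unsigned char) (((tp[1] - '0') << 6) |
                                     ((tp[2] - '0') << 3) |
                                     (tp[3] - '0'));
            tp += 4;
        }
        else if (tp[1] == '\\')
        {
            *rp++ = '\\';
            tp += 2;
        }
        else
            return -1;              /* lone backslash: invalid syntax */
    }
    return (int) (rp - buf);
}

int main(void)
{
    unsigned char buf[16];
    int n = decode_escaped("a\\000\\\\b", buf);

    printf("decoded %d bytes\n", n);    /* 4: 'a', 0x00, '\\', 'b' */
    return 0;
}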
+ */ +Datum +byteain(PG_FUNCTION_ARGS) +{ + char *inputText = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; + size_t len = strlen(inputText); + size_t bc; + char *tp; + char *rp; + bytea *result; + + /* Recognize hex input */ + if (inputText[0] == '\\' && inputText[1] == 'x') + { + bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ + result = palloc(bc); + bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), + escontext); + SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ + + PG_RETURN_BYTEA_P(result); + } + + /* Else, it's the traditional escaped style */ + result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */ + + tp = inputText; + rp = VARDATA(result); + while (*tp != '\0') + { + if (tp[0] != '\\') + *rp++ = *tp++; + else if ((tp[1] >= '0' && tp[1] <= '3') && + (tp[2] >= '0' && tp[2] <= '7') && + (tp[3] >= '0' && tp[3] <= '7')) + { + int v; + + v = VAL(tp[1]); + v <<= 3; + v += VAL(tp[2]); + v <<= 3; + *rp++ = v + VAL(tp[3]); + + tp += 4; + } + else if (tp[1] == '\\') + { + *rp++ = '\\'; + tp += 2; + } + else + { + /* + * one backslash, not followed by another or ### valid octal + */ + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s", "bytea"))); + } + } + + bc = rp - VARDATA(result); /* actual length */ + SET_VARSIZE(result, bc + VARHDRSZ); + + PG_RETURN_BYTEA_P(result); +} + +/* + * byteaout - converts to printable representation of byte array + * + * In the traditional escaped format, non-printable characters are + * printed as '\nnn' (octal) and '\' as '\\'. + */ +Datum +byteaout(PG_FUNCTION_ARGS) +{ + bytea *vlena = PG_GETARG_BYTEA_PP(0); + char *result; + char *rp; + + if (bytea_output == BYTEA_OUTPUT_HEX) + { + /* Print hex format */ + rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); + *rp++ = '\\'; + *rp++ = 'x'; + rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); + } + else if (bytea_output == BYTEA_OUTPUT_ESCAPE) + { + /* Print traditional escaped format */ + char *vp; + uint64 len; + int i; + + len = 1; /* empty string has 1 char */ + vp = VARDATA_ANY(vlena); + for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) + { + if (*vp == '\\') + len += 2; + else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) + len += 4; + else + len++; + } + + /* + * In principle len can't overflow uint32 if the input fit in 1GB, but + * for safety let's check rather than relying on palloc's internal + * check. 
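[Note: byteaout's escape branch below makes two passes: first sum the exact output length (1, 2, or 4 bytes per input byte), then emit. A standalone encoder following the same shape, without the MaxAllocSize guard:]

#include <stdio.h>
#include <stdlib.h>

static char *encode_escaped(const unsigned char *vp, size_t n)
{
    size_t len = 1;                 /* terminating NUL */
    char *result, *rp;

    for (size_t i = 0; i < n; i++)  /* pass 1: measure */
        len += (vp[i] == '\\') ? 2 : (vp[i] < 0x20 || vp[i] > 0x7e) ? 4 : 1;

    rp = result = malloc(len);
    for (size_t i = 0; i < n; i++)  /* pass 2: emit */
    {
        unsigned char c = vp[i];

        if (c == '\\')
        {
            *rp++ = '\\';
            *rp++ = '\\';
        }
        else if (c < 0x20 || c > 0x7e)
        {
            rp[0] = '\\';
            rp[1] = '0' + ((c >> 6) & 03);
            rp[2] = '0' + ((c >> 3) & 07);
            rp[3] = '0' + (c & 07);
            rp += 4;
        }
        else
            *rp++ = (char) c;
    }
    *rp = '\0';
    return result;
}

int main(void)
{
    char *s = encode_escaped((const unsigned char *) "a\x01\\", 3);

    printf("%s\n", s);              /* a\001\\ */
    free(s);
    return 0;
}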
+ */ + if (len > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("result of bytea output conversion is too large"))); + rp = result = (char *) palloc(len); + + vp = VARDATA_ANY(vlena); + for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) + { + if (*vp == '\\') + { + *rp++ = '\\'; + *rp++ = '\\'; + } + else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) + { + int val; /* holds unprintable chars */ + + val = *vp; + rp[0] = '\\'; + rp[3] = DIG(val & 07); + val >>= 3; + rp[2] = DIG(val & 07); + val >>= 3; + rp[1] = DIG(val & 03); + rp += 4; + } + else + *rp++ = *vp; + } + } + else + { + elog(ERROR, "unrecognized \"bytea_output\" setting: %d", + bytea_output); + rp = result = NULL; /* keep compiler quiet */ + } + *rp = '\0'; + PG_RETURN_CSTRING(result); +} + +/* + * bytearecv - converts external binary format to bytea + */ +Datum +bytearecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + bytea *result; + int nbytes; + + nbytes = buf->len - buf->cursor; + result = (bytea *) palloc(nbytes + VARHDRSZ); + SET_VARSIZE(result, nbytes + VARHDRSZ); + pq_copymsgbytes(buf, VARDATA(result), nbytes); + PG_RETURN_BYTEA_P(result); +} + +/* + * byteasend - converts bytea to binary format + * + * This is a special case: just copy the input... + */ +Datum +byteasend(PG_FUNCTION_ARGS) +{ + bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); + + PG_RETURN_BYTEA_P(vlena); +} + +Datum +bytea_string_agg_transfn(PG_FUNCTION_ARGS) +{ + StringInfo state; + + state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); + + /* Append the value unless null, preceding it with the delimiter. */ + if (!PG_ARGISNULL(1)) + { + bytea *value = PG_GETARG_BYTEA_PP(1); + bool isfirst = false; + + /* + * You might think we can just throw away the first delimiter, however + * we must keep it as we may be a parallel worker doing partial + * aggregation building a state to send to the main process. We need + * to keep the delimiter of every aggregation so that the combine + * function can properly join up the strings of two separately + * partially aggregated results. The first delimiter is only stripped + * off in the final function. To know how much to strip off the front + * of the string, we store the length of the first delimiter in the + * StringInfo's cursor field, which we don't otherwise need here. + */ + if (state == NULL) + { + MemoryContext aggcontext; + MemoryContext oldcontext; + + if (!AggCheckCallContext(fcinfo, &aggcontext)) + { + /* cannot be called directly because of internal-type argument */ + elog(ERROR, "bytea_string_agg_transfn called in non-aggregate context"); + } + + /* + * Create state in aggregate context. It'll stay there across + * subsequent calls. + */ + oldcontext = MemoryContextSwitchTo(aggcontext); + state = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + + isfirst = true; + } + + if (!PG_ARGISNULL(2)) + { + bytea *delim = PG_GETARG_BYTEA_PP(2); + + appendBinaryStringInfo(state, VARDATA_ANY(delim), + VARSIZE_ANY_EXHDR(delim)); + if (isfirst) + state->cursor = VARSIZE_ANY_EXHDR(delim); + } + + appendBinaryStringInfo(state, VARDATA_ANY(value), + VARSIZE_ANY_EXHDR(value)); + } + + /* + * The transition type for string_agg() is declared to be "internal", + * which is a pass-by-value type the same size as a pointer. 
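[Note: the transition function above keeps every delimiter, remembering only the first one's length in the buffer's cursor field so the final function can strip it; that is what lets partial aggregates from parallel workers be concatenated safely. A standalone model of the accumulate-then-strip protocol, with invented names and a fixed buffer:]

#include <stdio.h>
#include <string.h>

typedef struct
{
    char buf[256];
    int  len;
    int  cursor;                    /* length of the first (strippable) delimiter */
} AggState;

static void agg_trans(AggState *st, const char *delim, const char *value)
{
    int first = (st->len == 0);

    st->len += sprintf(st->buf + st->len, "%s", delim);
    if (first)
        st->cursor = (int) strlen(delim);
    st->len += sprintf(st->buf + st->len, "%s", value);
}

static const char *agg_final(const AggState *st)
{
    return st->buf + st->cursor;    /* drop data before the cursor */
}

int main(void)
{
    AggState st = {{0}, 0, 0};

    agg_trans(&st, ",", "a");
    agg_trans(&st, ",", "b");
    printf("%s\n", agg_final(&st)); /* a,b */
    return 0;
}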
+ */ + if (state) + PG_RETURN_POINTER(state); + PG_RETURN_NULL(); +} + +Datum +bytea_string_agg_finalfn(PG_FUNCTION_ARGS) +{ + StringInfo state; + + /* cannot be called directly because of internal-type argument */ + Assert(AggCheckCallContext(fcinfo, NULL)); + + state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); + + if (state != NULL) + { + /* As per comment in transfn, strip data before the cursor position */ + bytea *result; + int strippedlen = state->len - state->cursor; + + result = (bytea *) palloc(strippedlen + VARHDRSZ); + SET_VARSIZE(result, strippedlen + VARHDRSZ); + memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); + PG_RETURN_BYTEA_P(result); + } + else + PG_RETURN_NULL(); +} + +/*------------------------------------------------------------- + * byteaoctetlen + * + * get the number of bytes contained in an instance of type 'bytea' + *------------------------------------------------------------- + */ +Datum +byteaoctetlen(PG_FUNCTION_ARGS) +{ + Datum str = PG_GETARG_DATUM(0); + + /* We need not detoast the input at all */ + PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); +} + +/* + * byteacat - + * takes two bytea* and returns a bytea* that is the concatenation of + * the two. + * + * Cloned from textcat and modified as required. + */ +Datum +byteacat(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + + PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); +} + +/* + * byteaoverlay + * Replace specified substring of first string with second + * + * The SQL standard defines OVERLAY() in terms of substring and concatenation. + * This code is a direct implementation of what the standard says. + */ +Datum +byteaoverlay(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int sp = PG_GETARG_INT32(2); /* substring start position */ + int sl = PG_GETARG_INT32(3); /* substring length */ + + PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); +} + +Datum +byteaoverlay_no_len(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int sp = PG_GETARG_INT32(2); /* substring start position */ + int sl; + + sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ + PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); +} + +/* + * bytea_substr() + * Return a substring starting at the specified position. + * Cloned from text_substr and modified as required. + * + * Input: + * - string + * - starting position (is one-based) + * - string length (optional) + * + * If the starting position is zero or less, then return from the start of the string + * adjusting the length to be consistent with the "negative start" per SQL. + * If the length is less than zero, an ERROR is thrown. If no third argument + * (length) is provided, the length to the end of the string is assumed. + */ +Datum +bytea_substr(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), + PG_GETARG_INT32(1), + PG_GETARG_INT32(2), + false)); +} + +/* + * bytea_substr_no_len - + * Wrapper to avoid opr_sanity failure due to + * one function accepting a different number of args. 
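[Note: bytea_bit_count() above delegates to pg_popcount() over the raw bytes. The equivalent with a compiler builtin, as a standalone function; the real implementation processes whole words at a time:]

#include <stdint.h>
#include <stdio.h>

static int64_t bit_count(const unsigned char *p, size_t n)
{
    int64_t total = 0;

    for (size_t i = 0; i < n; i++)
        total += __builtin_popcount(p[i]);
    return total;
}

int main(void)
{
    unsigned char data[] = {0xff, 0x01};

    printf("%lld\n", (long long) bit_count(data, 2));   /* 9 */
    return 0;
}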
+ */ +Datum +bytea_substr_no_len(PG_FUNCTION_ARGS) +{ + PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), + PG_GETARG_INT32(1), + -1, + true)); +} + +/* + * bit_count + */ +Datum +bytea_bit_count(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + + PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); +} + +/* + * byteapos - + * Return the position of the specified substring. + * Implements the SQL POSITION() function. + * Cloned from textpos and modified as required. + */ +Datum +byteapos(PG_FUNCTION_ARGS) +{ + bytea *t1 = PG_GETARG_BYTEA_PP(0); + bytea *t2 = PG_GETARG_BYTEA_PP(1); + int pos; + int px, + p; + int len1, + len2; + char *p1, + *p2; + + len1 = VARSIZE_ANY_EXHDR(t1); + len2 = VARSIZE_ANY_EXHDR(t2); + + if (len2 <= 0) + PG_RETURN_INT32(1); /* result for empty pattern */ + + p1 = VARDATA_ANY(t1); + p2 = VARDATA_ANY(t2); + + pos = 0; + px = (len1 - len2); + for (p = 0; p <= px; p++) + { + if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) + { + pos = p + 1; + break; + }; + p1++; + }; + + PG_RETURN_INT32(pos); +} + +/*------------------------------------------------------------- + * byteaGetByte + * + * this routine treats "bytea" as an array of bytes. + * It returns the Nth byte (a number between 0 and 255). + *------------------------------------------------------------- + */ +Datum +byteaGetByte(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int32 n = PG_GETARG_INT32(1); + int len; + int byte; + + len = VARSIZE_ANY_EXHDR(v); + + if (n < 0 || n >= len) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %d out of valid range, 0..%d", + n, len - 1))); + + byte = ((unsigned char *) VARDATA_ANY(v))[n]; + + PG_RETURN_INT32(byte); +} + +/*------------------------------------------------------------- + * byteaGetBit + * + * This routine treats a "bytea" type like an array of bits. + * It returns the value of the Nth bit (0 or 1). + * + *------------------------------------------------------------- + */ +Datum +byteaGetBit(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int64 n = PG_GETARG_INT64(1); + int byteNo, + bitNo; + int len; + int byte; + + len = VARSIZE_ANY_EXHDR(v); + + if (n < 0 || n >= (int64) len * 8) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, + n, (int64) len * 8 - 1))); + + /* n/8 is now known < len, so safe to cast to int */ + byteNo = (int) (n / 8); + bitNo = (int) (n % 8); + + byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; + + if (byte & (1 << bitNo)) + PG_RETURN_INT32(1); + else + PG_RETURN_INT32(0); +} + +/*------------------------------------------------------------- + * byteaSetByte + * + * Given an instance of type 'bytea' creates a new one with + * the Nth byte set to the given value. + * + *------------------------------------------------------------- + */ +Datum +byteaSetByte(PG_FUNCTION_ARGS) +{ + bytea *res = PG_GETARG_BYTEA_P_COPY(0); + int32 n = PG_GETARG_INT32(1); + int32 newByte = PG_GETARG_INT32(2); + int len; + + len = VARSIZE(res) - VARHDRSZ; + + if (n < 0 || n >= len) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %d out of valid range, 0..%d", + n, len - 1))); + + /* + * Now set the byte. + */ + ((unsigned char *) VARDATA(res))[n] = newByte; + + PG_RETURN_BYTEA_P(res); +} + +/*------------------------------------------------------------- + * byteaSetBit + * + * Given an instance of type 'bytea' creates a new one with + * the Nth bit set to the given value. 
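[Note: byteaGetBit()/byteaSetBit() address bits with byteNo = n / 8 and bitNo = n % 8 after range-checking n against len * 8, as the surrounding hunks show. The core as standalone helpers, range checks omitted:]

#include <stdint.h>
#include <stdio.h>

static int get_bit(const unsigned char *buf, int64_t n)
{
    return (buf[n / 8] >> (n % 8)) & 1;
}

static void set_bit(unsigned char *buf, int64_t n, int newbit)
{
    int byteNo = (int) (n / 8);
    int bitNo = (int) (n % 8);

    if (newbit)
        buf[byteNo] |= (unsigned char) (1 << bitNo);
    else
        buf[byteNo] &= (unsigned char) ~(1 << bitNo);
}

int main(void)
{
    unsigned char buf[2] = {0, 0};

    set_bit(buf, 9, 1);             /* second byte, bit 1 */
    printf("%d %02x\n", get_bit(buf, 9), buf[1]);   /* 1 02 */
    return 0;
}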
+ * + *------------------------------------------------------------- + */ +Datum +byteaSetBit(PG_FUNCTION_ARGS) +{ + bytea *res = PG_GETARG_BYTEA_P_COPY(0); + int64 n = PG_GETARG_INT64(1); + int32 newBit = PG_GETARG_INT32(2); + int len; + int oldByte, + newByte; + int byteNo, + bitNo; + + len = VARSIZE(res) - VARHDRSZ; + + if (n < 0 || n >= (int64) len * 8) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, + n, (int64) len * 8 - 1))); + + /* n/8 is now known < len, so safe to cast to int */ + byteNo = (int) (n / 8); + bitNo = (int) (n % 8); + + /* + * sanity check! + */ + if (newBit != 0 && newBit != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new bit must be 0 or 1"))); + + /* + * Update the byte. + */ + oldByte = ((unsigned char *) VARDATA(res))[byteNo]; + + if (newBit == 0) + newByte = oldByte & (~(1 << bitNo)); + else + newByte = oldByte | (1 << bitNo); + + ((unsigned char *) VARDATA(res))[byteNo] = newByte; + + PG_RETURN_BYTEA_P(res); +} + +/* + * Return reversed bytea + */ +Datum +bytea_reverse(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + const char *p = VARDATA_ANY(v); + int len = VARSIZE_ANY_EXHDR(v); + const char *endp = p + len; + bytea *result = palloc(len + VARHDRSZ); + char *dst = (char *) VARDATA(result) + len; + + SET_VARSIZE(result, len + VARHDRSZ); + + while (p < endp) + *(--dst) = *p++; + + PG_RETURN_BYTEA_P(result); +} + + +/***************************************************************************** + * Comparison Functions used for bytea + * + * Note: btree indexes need these routines not to leak memory; therefore, + * be careful to free working copies of toasted datums. Most places don't + * need to be so careful. + *****************************************************************************/ + +Datum +byteaeq(PG_FUNCTION_ARGS) +{ + Datum arg1 = PG_GETARG_DATUM(0); + Datum arg2 = PG_GETARG_DATUM(1); + bool result; + Size len1, + len2; + + /* + * We can use a fast path for unequal lengths, which might save us from + * having to detoast one or both values. + */ + len1 = toast_raw_datum_size(arg1); + len2 = toast_raw_datum_size(arg2); + if (len1 != len2) + result = false; + else + { + bytea *barg1 = DatumGetByteaPP(arg1); + bytea *barg2 = DatumGetByteaPP(arg2); + + result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), + len1 - VARHDRSZ) == 0); + + PG_FREE_IF_COPY(barg1, 0); + PG_FREE_IF_COPY(barg2, 1); + } + + PG_RETURN_BOOL(result); +} + +Datum +byteane(PG_FUNCTION_ARGS) +{ + Datum arg1 = PG_GETARG_DATUM(0); + Datum arg2 = PG_GETARG_DATUM(1); + bool result; + Size len1, + len2; + + /* + * We can use a fast path for unequal lengths, which might save us from + * having to detoast one or both values. 
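[Note: the equality fast path described above settles unequal lengths without touching the data, and the ordering operators break memcmp() ties by length. A standalone model of both, with toast_raw_datum_size() reduced to a stored length field:]

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct
{
    size_t len;
    const unsigned char *data;
} Bytes;

static bool bytes_eq(const Bytes *a, const Bytes *b)
{
    if (a->len != b->len)           /* fast path: no need to look at the data */
        return false;
    return memcmp(a->data, b->data, a->len) == 0;
}

static int bytes_cmp(const Bytes *a, const Bytes *b)
{
    size_t minlen = a->len < b->len ? a->len : b->len;
    int cmp = memcmp(a->data, b->data, minlen);

    if (cmp == 0 && a->len != b->len)
        cmp = (a->len < b->len) ? -1 : 1;   /* shorter sorts first on a tie */
    return cmp;
}

int main(void)
{
    Bytes a = {3, (const unsigned char *) "abc"};
    Bytes b = {4, (const unsigned char *) "abcd"};

    printf("eq=%d cmp=%d\n", bytes_eq(&a, &b), bytes_cmp(&a, &b)); /* eq=0 cmp=-1 */
    return 0;
}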
+ */ + len1 = toast_raw_datum_size(arg1); + len2 = toast_raw_datum_size(arg2); + if (len1 != len2) + result = true; + else + { + bytea *barg1 = DatumGetByteaPP(arg1); + bytea *barg2 = DatumGetByteaPP(arg2); + + result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), + len1 - VARHDRSZ) != 0); + + PG_FREE_IF_COPY(barg1, 0); + PG_FREE_IF_COPY(barg2, 1); + } + + PG_RETURN_BOOL(result); +} + +Datum +bytealt(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); +} + +Datum +byteale(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); +} + +Datum +byteagt(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); +} + +Datum +byteage(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); +} + +Datum +byteacmp(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + if ((cmp == 0) && (len1 != len2)) + cmp = (len1 < len2) ? -1 : 1; + + PG_FREE_IF_COPY(arg1, 0); + PG_FREE_IF_COPY(arg2, 1); + + PG_RETURN_INT32(cmp); +} + +Datum +bytea_larger(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + bytea *result; + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); + + PG_RETURN_BYTEA_P(result); +} + +Datum +bytea_smaller(PG_FUNCTION_ARGS) +{ + bytea *arg1 = PG_GETARG_BYTEA_PP(0); + bytea *arg2 = PG_GETARG_BYTEA_PP(1); + bytea *result; + int len1, + len2; + int cmp; + + len1 = VARSIZE_ANY_EXHDR(arg1); + len2 = VARSIZE_ANY_EXHDR(arg2); + + cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); + result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? 
arg1 : arg2); + + PG_RETURN_BYTEA_P(result); +} + +Datum +bytea_sortsupport(PG_FUNCTION_ARGS) +{ + SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); + + /* Use generic string SortSupport, forcing "C" collation */ + varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); + + MemoryContextSwitchTo(oldcontext); + + PG_RETURN_VOID(); +} + +/* Cast bytea -> int2 */ +Datum +bytea_int2(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint16 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("smallint out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT16(result); +} + +/* Cast bytea -> int4 */ +Datum +bytea_int4(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint32 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("integer out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT32(result); +} + +/* Cast bytea -> int8 */ +Datum +bytea_int8(PG_FUNCTION_ARGS) +{ + bytea *v = PG_GETARG_BYTEA_PP(0); + int len = VARSIZE_ANY_EXHDR(v); + uint64 result; + + /* Check that the byte array is not too long */ + if (len > sizeof(result)) + ereport(ERROR, + errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("bigint out of range")); + + /* Convert it to an integer; most significant bytes come first */ + result = 0; + for (int i = 0; i < len; i++) + { + result <<= BITS_PER_BYTE; + result |= ((unsigned char *) VARDATA_ANY(v))[i]; + } + + PG_RETURN_INT64(result); +} + +/* Cast int2 -> bytea; can just use int2send() */ +Datum +int2_bytea(PG_FUNCTION_ARGS) +{ + return int2send(fcinfo); +} + +/* Cast int4 -> bytea; can just use int4send() */ +Datum +int4_bytea(PG_FUNCTION_ARGS) +{ + return int4send(fcinfo); +} + +/* Cast int8 -> bytea; can just use int8send() */ +Datum +int8_bytea(PG_FUNCTION_ARGS) +{ + return int8send(fcinfo); +} diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index 4227ab1a72b..344f58b92f7 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -1363,10 +1363,35 @@ timestamp_date(PG_FUNCTION_ARGS) { Timestamp timestamp = PG_GETARG_TIMESTAMP(0); DateADT result; + + result = timestamp2date_opt_overflow(timestamp, NULL); + PG_RETURN_DATEADT(result); +} + +/* + * Convert timestamp to date. + * + * On successful conversion, *overflow is set to zero if it's not NULL. + * + * If the timestamp is finite but out of the valid range for date, then: + * if overflow is NULL, we throw an out-of-range error. + * if overflow is not NULL, we store +1 or -1 there to indicate the sign + * of the overflow, and return the appropriate date infinity. + * + * Note: given the ranges of the types, overflow is only possible at + * the minimum end of the range, but we don't assume that in this code. 
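[Note: timestamp2date_opt_overflow() establishes a convention shared by the *_opt_overflow conversions: with a NULL overflow pointer the function raises the usual out-of-range error, otherwise it reports the overflow's sign and returns the matching date infinity. A loose standalone model of that contract; the conversion itself is invented for illustration:]

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define DATE_NOBEGIN INT_MIN        /* stand-ins for the date infinities */
#define DATE_NOEND   INT_MAX

static int ts2date(long long ts, int *overflow)
{
    if (overflow)
        *overflow = 0;
    if (ts < -1000)                 /* "out of range" for this model */
    {
        if (overflow)
        {
            *overflow = -1;         /* sign of the overflow */
            return DATE_NOBEGIN;
        }
        fprintf(stderr, "timestamp out of range\n");
        exit(1);
    }
    return (int) (ts / 10);         /* arbitrary in-range mapping */
}

int main(void)
{
    int ovf;
    int d = ts2date(-5000, &ovf);

    printf("overflow=%d date=%s\n", ovf,
           d == DATE_NOBEGIN ? "-infinity" : "finite");
    return 0;
}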
+ */ +DateADT +timestamp2date_opt_overflow(Timestamp timestamp, int *overflow) +{ + DateADT result; struct pg_tm tt, *tm = &tt; fsec_t fsec; + if (overflow) + *overflow = 0; + if (TIMESTAMP_IS_NOBEGIN(timestamp)) DATE_NOBEGIN(result); else if (TIMESTAMP_IS_NOEND(timestamp)) @@ -1374,14 +1399,30 @@ timestamp_date(PG_FUNCTION_ARGS) else { if (timestamp2tm(timestamp, NULL, tm, &fsec, NULL, NULL) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + DATE_NOBEGIN(result); + } + else + { + *overflow = 1; /* not actually reachable */ + DATE_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } result = date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) - POSTGRES_EPOCH_JDATE; } - PG_RETURN_DATEADT(result); + return result; } @@ -1408,11 +1449,36 @@ timestamptz_date(PG_FUNCTION_ARGS) { TimestampTz timestamp = PG_GETARG_TIMESTAMP(0); DateADT result; + + result = timestamptz2date_opt_overflow(timestamp, NULL); + PG_RETURN_DATEADT(result); +} + +/* + * Convert timestamptz to date. + * + * On successful conversion, *overflow is set to zero if it's not NULL. + * + * If the timestamptz is finite but out of the valid range for date, then: + * if overflow is NULL, we throw an out-of-range error. + * if overflow is not NULL, we store +1 or -1 there to indicate the sign + * of the overflow, and return the appropriate date infinity. + * + * Note: given the ranges of the types, overflow is only possible at + * the minimum end of the range, but we don't assume that in this code. + */ +DateADT +timestamptz2date_opt_overflow(TimestampTz timestamp, int *overflow) +{ + DateADT result; struct pg_tm tt, *tm = &tt; fsec_t fsec; int tz; + if (overflow) + *overflow = 0; + if (TIMESTAMP_IS_NOBEGIN(timestamp)) DATE_NOBEGIN(result); else if (TIMESTAMP_IS_NOEND(timestamp)) @@ -1420,14 +1486,30 @@ timestamptz_date(PG_FUNCTION_ARGS) else { if (timestamp2tm(timestamp, &tz, tm, &fsec, NULL, NULL) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + DATE_NOBEGIN(result); + } + else + { + *overflow = 1; /* not actually reachable */ + DATE_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } result = date2j(tm->tm_year, tm->tm_mon, tm->tm_mday) - POSTGRES_EPOCH_JDATE; } - PG_RETURN_DATEADT(result); + return result; } diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 793d8a9adcc..680fee2a844 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -702,9 +702,18 @@ ParseFraction(char *cp, double *frac) } else { + /* + * On the other hand, let's reject anything that's not digits after + * the ".". strtod is happy with input like ".123e9", but that'd + * break callers' expectation that the result is in 0..1. (It's quite + * difficult to get here with such input, but not impossible.) + */ + if (strspn(cp + 1, "0123456789") != strlen(cp + 1)) + return DTERR_BAD_FORMAT; + errno = 0; *frac = strtod(cp, &cp); - /* check for parse failure */ + /* check for parse failure (probably redundant given prior check) */ if (*cp != '\0' || errno != 0) return DTERR_BAD_FORMAT; } @@ -2959,30 +2968,27 @@ DecodeNumberField(int len, char *str, int fmask, char *cp; /* + * This function was originally meant to cope only with DTK_NUMBER fields, + * but we now sometimes abuse it to parse (parts of) DTK_DATE fields, + * which can contain letters and other punctuation. 
Reject if it's not a + * valid DTK_NUMBER, that is digits and decimal point(s). (ParseFraction + * will reject if there's more than one decimal point.) + */ + if (strspn(str, "0123456789.") != len) + return DTERR_BAD_FORMAT; + + /* * Have a decimal point? Then this is a date or something with a seconds * field... */ if ((cp = strchr(str, '.')) != NULL) { - /* - * Can we use ParseFractionalSecond here? Not clear whether trailing - * junk should be rejected ... - */ - if (cp[1] == '\0') - { - /* avoid assuming that strtod will accept "." */ - *fsec = 0; - } - else - { - double frac; + int dterr; - errno = 0; - frac = strtod(cp, NULL); - if (errno != 0) - return DTERR_BAD_FORMAT; - *fsec = rint(frac * 1000000); - } + /* Convert the fraction and store at *fsec */ + dterr = ParseFractionalSecond(cp, fsec); + if (dterr) + return dterr; /* Now truncate off the fraction for further processing */ *cp = '\0'; len = strlen(str); diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index 6d20ae07ae7..7b97d2be6ca 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -4065,10 +4065,11 @@ float84ge(PG_FUNCTION_ARGS) * in the histogram. width_bucket() returns an integer indicating the * bucket number that 'operand' belongs to in an equiwidth histogram * with the specified characteristics. An operand smaller than the - * lower bound is assigned to bucket 0. An operand greater than the - * upper bound is assigned to an additional bucket (with number - * count+1). We don't allow "NaN" for any of the float8 inputs, and we - * don't allow either of the histogram bounds to be +/- infinity. + * lower bound is assigned to bucket 0. An operand greater than or equal + * to the upper bound is assigned to an additional bucket (with number + * count+1). We don't allow the histogram bounds to be NaN or +/- infinity, + * but we do allow those values for the operand (taking NaN to be larger + * than any other value, as we do in comparisons). 
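[Note: the width_bucket() change below treats a NaN operand as larger than every other value, so with ascending bounds it lands in bucket count+1 and with descending bounds in bucket 0. A standalone version of just that edge dispatch; the interior bucket arithmetic is elided:]

#include <math.h>
#include <stdio.h>

/* Returns the bucket for out-of-range/NaN operands, or -1 meaning "interior". */
static int width_bucket_edges(double operand, double b1, double b2, int count)
{
    if (b1 < b2)
    {
        if (isnan(operand) || operand >= b2)
            return count + 1;       /* NaN sorts above the upper bound */
        if (operand < b1)
            return 0;
    }
    else                            /* descending bounds */
    {
        if (isnan(operand) || operand > b1)
            return 0;
        if (operand <= b2)
            return count + 1;
    }
    return -1;                      /* caller computes the interior bucket */
}

int main(void)
{
    printf("%d\n", width_bucket_edges(NAN, 0.0, 10.0, 5));  /* 6 */
    printf("%d\n", width_bucket_edges(NAN, 10.0, 0.0, 5));  /* 0 */
    return 0;
}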
*/ Datum width_bucket_float8(PG_FUNCTION_ARGS) @@ -4084,12 +4085,11 @@ width_bucket_float8(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), errmsg("count must be greater than zero"))); - if (isnan(operand) || isnan(bound1) || isnan(bound2)) + if (isnan(bound1) || isnan(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound, and upper bound cannot be NaN"))); + errmsg("lower and upper bounds cannot be NaN"))); - /* Note that we allow "operand" to be infinite */ if (isinf(bound1) || isinf(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), @@ -4097,15 +4097,15 @@ width_bucket_float8(PG_FUNCTION_ARGS) if (bound1 < bound2) { - if (operand < bound1) - result = 0; - else if (operand >= bound2) + if (isnan(operand) || operand >= bound2) { if (pg_add_s32_overflow(count, 1, &result)) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("integer out of range"))); } + else if (operand < bound1) + result = 0; else { if (!isinf(bound2 - bound1)) @@ -4135,7 +4135,7 @@ width_bucket_float8(PG_FUNCTION_ARGS) } else if (bound1 > bound2) { - if (operand > bound1) + if (isnan(operand) || operand > bound1) result = 0; else if (operand <= bound2) { diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5bd1e01f7e4..1d05481181d 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -3590,14 +3590,15 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, if (matched < 2) ereturn(escontext,, (errcode(ERRCODE_INVALID_DATETIME_FORMAT), - errmsg("invalid input string for \"Y,YYY\""))); + errmsg("invalid value \"%s\" for \"%s\"", + s, "Y,YYY"))); /* years += (millennia * 1000); */ if (pg_mul_s32_overflow(millennia, 1000, &millennia) || pg_add_s32_overflow(years, millennia, &years)) ereturn(escontext,, (errcode(ERRCODE_DATETIME_FIELD_OVERFLOW), - errmsg("value for \"Y,YYY\" in source string is out of range"))); + errmsg("value for \"%s\" in source string is out of range", "Y,YYY"))); if (!from_char_set_int(&out->year, years, n, escontext)) return; diff --git a/src/backend/utils/adt/inet_net_pton.c b/src/backend/utils/adt/inet_net_pton.c index ef2236d9f04..3b0db2a3799 100644 --- a/src/backend/utils/adt/inet_net_pton.c +++ b/src/backend/utils/adt/inet_net_pton.c @@ -115,8 +115,7 @@ inet_cidr_pton_ipv4(const char *src, u_char *dst, size_t size) src++; /* skip x or X. 
*/ while ((ch = *src++) != '\0' && isxdigit((unsigned char) ch)) { - if (isupper((unsigned char) ch)) - ch = tolower((unsigned char) ch); + ch = pg_ascii_tolower((unsigned char) ch); n = strchr(xdigits, ch) - xdigits; assert(n >= 0 && n <= 15); if (dirty == 0) diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c index c1950792b5a..9b56248cf0b 100644 --- a/src/backend/utils/adt/jsonb_gin.c +++ b/src/backend/utils/adt/jsonb_gin.c @@ -896,8 +896,8 @@ gin_extract_jsonb_query(PG_FUNCTION_ARGS) continue; /* We rely on the array elements not being toasted */ entries[j++] = make_text_key(JGINFLAG_KEY, - VARDATA_ANY(key_datums[i]), - VARSIZE_ANY_EXHDR(key_datums[i])); + VARDATA_ANY(DatumGetPointer(key_datums[i])), + VARSIZE_ANY_EXHDR(DatumGetPointer(key_datums[i]))); } *nentries = j; diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c index fa5603f26e1..51d38e321fb 100644 --- a/src/backend/utils/adt/jsonb_op.c +++ b/src/backend/utils/adt/jsonb_op.c @@ -63,8 +63,8 @@ jsonb_exists_any(PG_FUNCTION_ARGS) strVal.type = jbvString; /* We rely on the array elements not being toasted */ - strVal.val.string.val = VARDATA_ANY(key_datums[i]); - strVal.val.string.len = VARSIZE_ANY_EXHDR(key_datums[i]); + strVal.val.string.val = VARDATA_ANY(DatumGetPointer(key_datums[i])); + strVal.val.string.len = VARSIZE_ANY_EXHDR(DatumGetPointer(key_datums[i])); if (findJsonbValueFromContainer(&jb->root, JB_FOBJECT | JB_FARRAY, @@ -96,8 +96,8 @@ jsonb_exists_all(PG_FUNCTION_ARGS) strVal.type = jbvString; /* We rely on the array elements not being toasted */ - strVal.val.string.val = VARDATA_ANY(key_datums[i]); - strVal.val.string.len = VARSIZE_ANY_EXHDR(key_datums[i]); + strVal.val.string.val = VARDATA_ANY(DatumGetPointer(key_datums[i])); + strVal.val.string.len = VARSIZE_ANY_EXHDR(DatumGetPointer(key_datums[i])); if (findJsonbValueFromContainer(&jb->root, JB_FOBJECT | JB_FARRAY, diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index c8b6c15e059..82b807d067a 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -277,22 +277,16 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) else { /* - * It's safe to assume that the types differed, and that the va - * and vb values passed were set. - * - * If the two values were of the same container type, then there'd - * have been a chance to observe the variation in the number of - * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're - * either two heterogeneously-typed containers, or a container and - * some scalar type. - * - * We don't have to consider the WJB_END_ARRAY and WJB_END_OBJECT - * cases here, because we would have seen the corresponding - * WJB_BEGIN_ARRAY and WJB_BEGIN_OBJECT tokens first, and - * concluded that they don't match. + * It's not possible for one iterator to report end of array or + * object while the other one reports something else, because we + * would have detected a length mismatch when we processed the + * container-start tokens above. Likewise we can't see WJB_DONE + * from one but not the other. So we have two different-type + * containers, or a container and some scalar type, or two + * different scalar types. Sort on the basis of the type code. 
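In that fallback branch, the ordering reduces to comparing the type codes themselves. A toy sketch of the idea (the enum is illustrative; the real jbvType values and their relative order are defined in jsonb.h):

    /* Illustrative type codes; the real ordering comes from enum jbvType. */
    enum toy_jbv { TJBV_NULL, TJBV_STRING, TJBV_NUMERIC, TJBV_BOOL, TJBV_ARRAY, TJBV_OBJECT };

    /* Cross-type fallback: order purely by type code. */
    static int
    compare_by_type_code(enum toy_jbv a, enum toy_jbv b)
    {
        return (a > b) - (a < b);   /* -1, 0, or +1 */
    }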
*/ - Assert(ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); - Assert(rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); + Assert(ra != WJB_DONE && ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); + Assert(rb != WJB_DONE && rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); Assert(va.type != vb.type); Assert(va.type != jbvBinary); @@ -852,15 +846,20 @@ JsonbIteratorInit(JsonbContainer *container) * It is our job to expand the jbvBinary representation without bothering them * with it. However, clients should not take it upon themselves to touch array * or Object element/pair buffers, since their element/pair pointers are - * garbage. Also, *val will not be set when returning WJB_END_ARRAY or - * WJB_END_OBJECT, on the assumption that it's only useful to access values - * when recursing in. + * garbage. + * + * *val is not meaningful when the result is WJB_DONE, WJB_END_ARRAY or + * WJB_END_OBJECT. However, we set val->type = jbvNull in those cases, + * so that callers may assume that val->type is always well-defined. */ JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) { if (*it == NULL) + { + val->type = jbvNull; return WJB_DONE; + } /* * When stepping into a nested container, we jump back here to start @@ -898,6 +897,7 @@ recurse: * nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_ARRAY; } @@ -951,6 +951,7 @@ recurse: * of nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_OBJECT; } else @@ -995,8 +996,10 @@ recurse: return WJB_VALUE; } - elog(ERROR, "invalid iterator state"); - return -1; + elog(ERROR, "invalid jsonb iterator state"); + /* satisfy compilers that don't know that elog(ERROR) doesn't return */ + val->type = jbvNull; + return WJB_DONE; } /* diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c index bcb1720b6cd..370456408bf 100644 --- a/src/backend/utils/adt/jsonfuncs.c +++ b/src/backend/utils/adt/jsonfuncs.c @@ -4766,8 +4766,8 @@ jsonb_delete_array(PG_FUNCTION_ARGS) continue; /* We rely on the array elements not being toasted */ - keyptr = VARDATA_ANY(keys_elems[i]); - keylen = VARSIZE_ANY_EXHDR(keys_elems[i]); + keyptr = VARDATA_ANY(DatumGetPointer(keys_elems[i])); + keylen = VARSIZE_ANY_EXHDR(DatumGetPointer(keys_elems[i])); if (keylen == v.val.string.len && memcmp(keyptr, v.val.string.val, keylen) == 0) { diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index dbab24737ef..407041b14a1 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -3074,8 +3074,8 @@ JsonItemFromDatum(Datum val, Oid typid, int32 typmod, JsonbValue *res) case TEXTOID: case VARCHAROID: res->type = jbvString; - res->val.string.val = VARDATA_ANY(val); - res->val.string.len = VARSIZE_ANY_EXHDR(val); + res->val.string.val = VARDATA_ANY(DatumGetPointer(val)); + res->val.string.len = VARSIZE_ANY_EXHDR(DatumGetPointer(val)); break; case DATEOID: case TIMEOID: diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 7f4cf614585..4216ac17f43 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -98,7 +98,7 @@ SB_lower_char(unsigned char c, pg_locale_t locale) else if (locale->is_default) return pg_tolower(c); else - return tolower_l(c, locale->info.lt); + return char_tolower(c, locale); } @@ -209,7 +209,17 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) * way. 
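The hunk below takes the cheap per-byte path only when folding one byte at a time is actually sound. A standalone illustration of that per-byte strategy (case-insensitive equality using the C library's current locale for brevity; sb_iequal is a hypothetical helper):

    #include <ctype.h>
    #include <stddef.h>

    /* Per-byte case-insensitive equality.  Sound only when every character is
     * one byte and lowercases to one byte, which is what the
     * char_tolower_enabled() and encoding-length tests below establish. */
    static int
    sb_iequal(const char *a, const char *b, size_t n)
    {
        for (size_t i = 0; i < n; i++)
        {
            if (tolower((unsigned char) a[i]) != tolower((unsigned char) b[i]))
                return 0;
        }
        return 1;
    }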
*/ - if (pg_database_encoding_max_length() > 1 || (locale->provider == COLLPROVIDER_ICU)) + if (locale->ctype_is_c || + (char_tolower_enabled(locale) && + pg_database_encoding_max_length() == 1)) + { + p = VARDATA_ANY(pat); + plen = VARSIZE_ANY_EXHDR(pat); + s = VARDATA_ANY(str); + slen = VARSIZE_ANY_EXHDR(str); + return SB_IMatchText(s, slen, p, plen, locale); + } + else { pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation, PointerGetDatum(pat))); @@ -224,14 +234,6 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation) else return MB_MatchText(s, slen, p, plen, 0); } - else - { - p = VARDATA_ANY(pat); - plen = VARSIZE_ANY_EXHDR(pat); - s = VARDATA_ANY(str); - slen = VARSIZE_ANY_EXHDR(str); - return SB_IMatchText(s, slen, p, plen, locale); - } } /* diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8fdc677371f..999f23f86d5 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -1495,13 +1495,8 @@ pattern_char_isalpha(char c, bool is_multibyte, { if (locale->ctype_is_c) return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); - else if (is_multibyte && IS_HIGHBIT_SET(c)) - return true; - else if (locale->provider != COLLPROVIDER_LIBC) - return IS_HIGHBIT_SET(c) || - (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); else - return isalpha_l((unsigned char) c, locale->info.lt); + return char_is_cased(c, locale); } diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c index 7ec2c225016..fe6dce9cba3 100644 --- a/src/backend/utils/adt/mcxtfuncs.c +++ b/src/backend/utils/adt/mcxtfuncs.c @@ -15,27 +15,30 @@ #include "postgres.h" -#include "access/twophase.h" -#include "catalog/pg_authid_d.h" #include "funcapi.h" #include "mb/pg_wchar.h" -#include "miscadmin.h" #include "storage/proc.h" #include "storage/procarray.h" -#include "utils/acl.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/hsearch.h" -#include "utils/memutils.h" -#include "utils/wait_event_types.h" /* ---------- * The max bytes for showing identifiers of MemoryContext. * ---------- */ #define MEMORY_CONTEXT_IDENT_DISPLAY_SIZE 1024 -struct MemoryStatsBackendState *memCxtState = NULL; -struct MemoryStatsCtl *memCxtArea = NULL; + +/* + * MemoryContextId + * Used for storage of transient identifiers for + * pg_get_backend_memory_contexts. 
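The struct defined just below pairs a context pointer with a transient id and serves as a dynahash entry. A sketch of the table setup matching the hash_create() call later in this hunk (the initial size and flag bits are the conventional dynahash choices, assumed rather than copied from this patch; backend-only, not standalone):

    /* Requires the backend environment (utils/hsearch.h et al.). */
    HASHCTL ctl;
    HTAB   *context_id_lookup;

    ctl.keysize = sizeof(MemoryContext);      /* key: the context pointer itself */
    ctl.entrysize = sizeof(MemoryContextId);  /* entry embeds the key plus an id */
    ctl.hcxt = CurrentMemoryContext;
    context_id_lookup = hash_create("pg_get_backend_memory_contexts",
                                    16,       /* assumed initial size */
                                    &ctl,
                                    HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);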
+ */ +typedef struct MemoryContextId +{ + MemoryContext context; + int context_id; +} MemoryContextId; /* * int_list_to_array @@ -86,7 +89,7 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, */ for (MemoryContext cur = context; cur != NULL; cur = cur->parent) { - MemoryStatsContextId *entry; + MemoryContextId *entry; bool found; entry = hash_search(context_id_lookup, &cur, HASH_FIND, &found); @@ -140,51 +143,36 @@ PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, else nulls[1] = true; - type = ContextTypeToString(context->type); - - values[2] = CStringGetTextDatum(type); - values[3] = Int32GetDatum(list_length(path)); /* level */ - values[4] = int_list_to_array(path); - values[5] = Int64GetDatum(stat.totalspace); - values[6] = Int64GetDatum(stat.nblocks); - values[7] = Int64GetDatum(stat.freespace); - values[8] = Int64GetDatum(stat.freechunks); - values[9] = Int64GetDatum(stat.totalspace - stat.freespace); - - tuplestore_putvalues(tupstore, tupdesc, values, nulls); - list_free(path); -} - -/* - * ContextTypeToString - * Returns a textual representation of a context type - * - * This should cover the same types as MemoryContextIsValid. - */ -const char * -ContextTypeToString(NodeTag type) -{ - const char *context_type; - - switch (type) + switch (context->type) { case T_AllocSetContext: - context_type = "AllocSet"; + type = "AllocSet"; break; case T_GenerationContext: - context_type = "Generation"; + type = "Generation"; break; case T_SlabContext: - context_type = "Slab"; + type = "Slab"; break; case T_BumpContext: - context_type = "Bump"; + type = "Bump"; break; default: - context_type = "???"; + type = "???"; break; } - return context_type; + + values[2] = CStringGetTextDatum(type); + values[3] = Int32GetDatum(list_length(path)); /* level */ + values[4] = int_list_to_array(path); + values[5] = Int64GetDatum(stat.totalspace); + values[6] = Int64GetDatum(stat.nblocks); + values[7] = Int64GetDatum(stat.freespace); + values[8] = Int64GetDatum(stat.freechunks); + values[9] = Int64GetDatum(stat.totalspace - stat.freespace); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + list_free(path); } /* @@ -201,7 +189,7 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) HTAB *context_id_lookup; ctl.keysize = sizeof(MemoryContext); - ctl.entrysize = sizeof(MemoryStatsContextId); + ctl.entrysize = sizeof(MemoryContextId); ctl.hcxt = CurrentMemoryContext; context_id_lookup = hash_create("pg_get_backend_memory_contexts", @@ -228,7 +216,7 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) foreach_ptr(MemoryContextData, cur, contexts) { - MemoryStatsContextId *entry; + MemoryContextId *entry; bool found; /* @@ -236,8 +224,8 @@ pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) * PutMemoryContextsStatsTupleStore needs this to populate the "path" * column with the parent context_ids. */ - entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &cur, - HASH_ENTER, &found); + entry = (MemoryContextId *) hash_search(context_id_lookup, &cur, + HASH_ENTER, &found); entry->context_id = context_id++; Assert(!found); @@ -317,349 +305,3 @@ pg_log_backend_memory_contexts(PG_FUNCTION_ARGS) PG_RETURN_BOOL(true); } - -/* - * pg_get_process_memory_contexts - * Signal a backend or an auxiliary process to send its memory contexts, - * wait for the results and display them. - * - * By default, only superusers or users with ROLE_PG_READ_ALL_STATS are allowed - * to signal a process to return the memory contexts. 
This is because allowing - * any users to issue this request at an unbounded rate would cause lots of - * requests to be sent, which can lead to denial of service. Additional roles - * can be permitted with GRANT. - * - * On receipt of this signal, a backend or an auxiliary process sets the flag - * in the signal handler, which causes the next CHECK_FOR_INTERRUPTS() - * or process-specific interrupt handler to copy the memory context details - * to a dynamic shared memory space. - * - * We have defined a limit on DSA memory that could be allocated per process - - * if the process has more memory contexts than what can fit in the allocated - * size, the excess contexts are summarized and represented as a cumulative - * total at the end of the buffer. - * - * After sending the signal, wait on a condition variable. The publishing - * backend, after copying the data to shared memory, sends a signal on that - * condition variable. There is one condition variable per publishing backend. - * Once the condition variable is signalled, check if the latest memory context - * information is available, and display it. - * - * If the publishing backend does not respond before the condition variable - * times out, which is set to MEMSTATS_WAIT_TIMEOUT, retry as long as there is - * time left within the timeout specified by the user, before giving up and - * returning previously published statistics, if any. If no previous statistics - * exist, return NULL. - */ -#define MEMSTATS_WAIT_TIMEOUT 100 -Datum -pg_get_process_memory_contexts(PG_FUNCTION_ARGS) -{ - int pid = PG_GETARG_INT32(0); - bool summary = PG_GETARG_BOOL(1); - double timeout = PG_GETARG_FLOAT8(2); - PGPROC *proc; - ProcNumber procNumber = INVALID_PROC_NUMBER; - bool proc_is_aux = false; - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - MemoryStatsEntry *memcxt_info; - TimestampTz start_timestamp; - - /* - * See if the process with given pid is a backend or an auxiliary process - * and remember the type for when we requery the process later. - */ - proc = BackendPidGetProc(pid); - if (proc == NULL) - { - proc = AuxiliaryPidGetProc(pid); - proc_is_aux = true; - } - - /* - * BackendPidGetProc() and AuxiliaryPidGetProc() return NULL if the pid - * isn't valid; this is however not a problem, so we just leave with a - * WARNING. See comment in pg_log_backend_memory_contexts for a discussion - * on this. - */ - if (proc == NULL) - { - /* - * This is just a warning so a loop-through-resultset will not abort - * if one backend terminated on its own during the run. - */ - ereport(WARNING, - errmsg("PID %d is not a PostgreSQL server process", pid)); - PG_RETURN_NULL(); - } - - InitMaterializedSRF(fcinfo, 0); - - procNumber = GetNumberFromPGProc(proc); - - LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); - memCxtState[procNumber].summary = summary; - LWLockRelease(&memCxtState[procNumber].lw_lock); - - start_timestamp = GetCurrentTimestamp(); - - /* - * Send a signal to a PostgreSQL process, informing it we want it to - * produce information about its memory contexts. - */ - if (SendProcSignal(pid, PROCSIG_GET_MEMORY_CONTEXT, procNumber) < 0) - { - ereport(WARNING, - errmsg("could not send signal to process %d: %m", pid)); - PG_RETURN_NULL(); - } - - /* - * Even if the proc has published statistics, they may not be due to the - * current request, but be previously published stats.
Check if the stats - * are updated by comparing the timestamps: if the stats are newer than our - * previously recorded timestamp from before sending the procsignal, they - * must by definition be updated. Wait for the timeout specified by the - * user, after which display old statistics if available or return - * NULL. - */ - while (1) - { - long msecs; - - /* - * We expect to come out of sleep when the requested process has - * finished publishing the statistics, verified using the valid DSA - * pointer. - * - * Make sure that the information belongs to the pid we requested - * information for; otherwise loop back and wait for the server - * process to finish publishing statistics. - */ - LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); - - /* - * The note in procnumber.h says that a procNumber can be re-used for - * a different backend immediately after a backend exits. In case an - * old process's data was there and not updated by the current process - * in the slot identified by the procNumber, the pid of the requested - * process and the proc_id might not match. - */ - if (memCxtState[procNumber].proc_id == pid) - { - /* - * Break if the latest stats have been read, indicated by the - * statistics timestamp being newer than the current request - * timestamp. - */ - msecs = TimestampDifferenceMilliseconds(start_timestamp, - memCxtState[procNumber].stats_timestamp); - - if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer) - && msecs > 0) - break; - } - LWLockRelease(&memCxtState[procNumber].lw_lock); - - /* - * Recheck the state of the backend before sleeping on the condition - * variable to ensure the process is still alive. Only check the - * relevant process type based on the earlier PID check. - */ - if (proc_is_aux) - proc = AuxiliaryPidGetProc(pid); - else - proc = BackendPidGetProc(pid); - - /* - * The process ending during memory context processing is not an - * error. - */ - if (proc == NULL) - { - ereport(WARNING, - errmsg("PID %d is no longer a PostgreSQL server process", - pid)); - PG_RETURN_NULL(); - } - - msecs = TimestampDifferenceMilliseconds(start_timestamp, GetCurrentTimestamp()); - - /* - * If we haven't already exceeded the timeout value, sleep for the - * remainder of the timeout on the condition variable. - */ - if (msecs > 0 && msecs < (timeout * 1000)) - { - /* - * Wait for the timeout as defined by the user. If no updated - * statistics are available within the allowed time then display - * previously published statistics if there are any. If no - * previous statistics are available then return NULL. The timer - * is defined in milliseconds since that's what the condition - * variable sleep uses.
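Stripped of the locking and shared-state details, the loop being deleted here has a common deadline-retry shape. A standalone sketch, where updated() and wait_for_update() are hypothetical stand-ins for the DSA-pointer check and ConditionVariableTimedSleep():

    #include <stdbool.h>
    #include <time.h>

    static bool
    wait_with_deadline(time_t deadline,
                       bool (*updated)(void),
                       void (*wait_for_update)(long msecs))
    {
        for (;;)
        {
            long remaining;

            if (updated())
                return true;                 /* fresh statistics published */
            remaining = (long) (deadline - time(NULL)) * 1000;
            if (remaining <= 0)
                return false;                /* timed out; caller may fall back */
            wait_for_update(remaining);      /* may wake early or time out */
        }
    }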
- */ - if (ConditionVariableTimedSleep(&memCxtState[procNumber].memcxt_cv, - ((timeout * 1000) - msecs), WAIT_EVENT_MEM_CXT_PUBLISH)) - { - LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); - /* Display previously published statistics if available */ - if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer)) - break; - else - { - LWLockRelease(&memCxtState[procNumber].lw_lock); - PG_RETURN_NULL(); - } - } - } - else - { - LWLockAcquire(&memCxtState[procNumber].lw_lock, LW_EXCLUSIVE); - /* Display previously published statistics if available */ - if (DsaPointerIsValid(memCxtState[procNumber].memstats_dsa_pointer)) - break; - else - { - LWLockRelease(&memCxtState[procNumber].lw_lock); - PG_RETURN_NULL(); - } - } - } - - /* - * We should only reach here with a valid DSA handle, either containing - * updated statistics or previously published statistics (identified by - * the timestamp). - */ - Assert(memCxtArea->memstats_dsa_handle != DSA_HANDLE_INVALID); - /* Attach to the dsa area if we have not already done so */ - if (MemoryStatsDsaArea == NULL) - { - MemoryContext oldcontext = CurrentMemoryContext; - - MemoryContextSwitchTo(TopMemoryContext); - MemoryStatsDsaArea = dsa_attach(memCxtArea->memstats_dsa_handle); - MemoryContextSwitchTo(oldcontext); - dsa_pin_mapping(MemoryStatsDsaArea); - } - - /* - * The backend has finished publishing the stats, so project them. - */ - memcxt_info = (MemoryStatsEntry *) - dsa_get_address(MemoryStatsDsaArea, memCxtState[procNumber].memstats_dsa_pointer); - -#define PG_GET_PROCESS_MEMORY_CONTEXTS_COLS 12 - for (int i = 0; i < memCxtState[procNumber].total_stats; i++) - { - ArrayType *path_array; - int path_length; - Datum values[PG_GET_PROCESS_MEMORY_CONTEXTS_COLS]; - bool nulls[PG_GET_PROCESS_MEMORY_CONTEXTS_COLS]; - char *name; - char *ident; - Datum *path_datum = NULL; - int *path_int = NULL; - - memset(values, 0, sizeof(values)); - memset(nulls, 0, sizeof(nulls)); - - if (DsaPointerIsValid(memcxt_info[i].name)) - { - name = (char *) dsa_get_address(MemoryStatsDsaArea, memcxt_info[i].name); - values[0] = CStringGetTextDatum(name); - } - else - nulls[0] = true; - - if (DsaPointerIsValid(memcxt_info[i].ident)) - { - ident = (char *) dsa_get_address(MemoryStatsDsaArea, memcxt_info[i].ident); - values[1] = CStringGetTextDatum(ident); - } - else - nulls[1] = true; - - values[2] = CStringGetTextDatum(ContextTypeToString(memcxt_info[i].type)); - - path_length = memcxt_info[i].path_length; - path_datum = (Datum *) palloc(path_length * sizeof(Datum)); - if (DsaPointerIsValid(memcxt_info[i].path)) - { - path_int = (int *) dsa_get_address(MemoryStatsDsaArea, memcxt_info[i].path); - for (int j = 0; j < path_length; j++) - path_datum[j] = Int32GetDatum(path_int[j]); - path_array = construct_array_builtin(path_datum, path_length, INT4OID); - values[3] = PointerGetDatum(path_array); - } - else - nulls[3] = true; - - values[4] = Int32GetDatum(memcxt_info[i].levels); - values[5] = Int64GetDatum(memcxt_info[i].totalspace); - values[6] = Int64GetDatum(memcxt_info[i].nblocks); - values[7] = Int64GetDatum(memcxt_info[i].freespace); - values[8] = Int64GetDatum(memcxt_info[i].freechunks); - values[9] = Int64GetDatum(memcxt_info[i].totalspace - - memcxt_info[i].freespace); - values[10] = Int32GetDatum(memcxt_info[i].num_agg_stats); - values[11] = TimestampTzGetDatum(memCxtState[procNumber].stats_timestamp); - - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, - values, nulls); - } - LWLockRelease(&memCxtState[procNumber].lw_lock); - -
ConditionVariableCancelSleep(); - - PG_RETURN_NULL(); -} - -Size -MemoryContextReportingShmemSize(void) -{ - Size sz = 0; - Size TotalProcs = 0; - - TotalProcs = add_size(TotalProcs, NUM_AUXILIARY_PROCS); - TotalProcs = add_size(TotalProcs, MaxBackends); - sz = add_size(sz, mul_size(TotalProcs, sizeof(MemoryStatsBackendState))); - - sz = add_size(sz, sizeof(MemoryStatsCtl)); - - return sz; -} - -/* - * Initialize shared memory for displaying memory context statistics - */ -void -MemoryContextReportingShmemInit(void) -{ - bool found; - - memCxtArea = (MemoryStatsCtl *) - ShmemInitStruct("MemoryStatsCtl", - sizeof(MemoryStatsCtl), &found); - - if (!found) - { - LWLockInitialize(&memCxtArea->lw_lock, LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE); - memCxtArea->memstats_dsa_handle = DSA_HANDLE_INVALID; - } - - memCxtState = (MemoryStatsBackendState *) - ShmemInitStruct("MemoryStatsBackendState", - ((MaxBackends + NUM_AUXILIARY_PROCS) * sizeof(MemoryStatsBackendState)), - &found); - - if (found) - return; - - for (int i = 0; i < (MaxBackends + NUM_AUXILIARY_PROCS); i++) - { - ConditionVariableInit(&memCxtState[i].memcxt_cv); - LWLockInitialize(&memCxtState[i].lw_lock, LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC); - memCxtState[i].memstats_dsa_pointer = InvalidDsaPointer; - } -} diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 244f48f4fd7..ed9bbd7b926 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -12,6 +12,7 @@ backend_sources += files( 'arrayutils.c', 'ascii.c', 'bool.c', + 'bytea.c', 'cash.c', 'char.c', 'cryptohashfuncs.c', diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c index cd84ced5b48..46f2ec0c29f 100644 --- a/src/backend/utils/adt/multirangetypes.c +++ b/src/backend/utils/adt/multirangetypes.c @@ -394,12 +394,13 @@ multirange_send(PG_FUNCTION_ARGS) for (int i = 0; i < range_count; i++) { Datum range; + bytea *outputbytes; range = RangeTypePGetDatum(ranges[i]); - range = PointerGetDatum(SendFunctionCall(&cache->typioproc, range)); + outputbytes = SendFunctionCall(&cache->typioproc, range); - pq_sendint32(buf, VARSIZE(range) - VARHDRSZ); - pq_sendbytes(buf, VARDATA(range), VARSIZE(range) - VARHDRSZ); + pq_sendint32(buf, VARSIZE(outputbytes) - VARHDRSZ); + pq_sendbytes(buf, VARDATA(outputbytes), VARSIZE(outputbytes) - VARHDRSZ); } PG_RETURN_BYTEA_P(pq_endtypsend(buf)); @@ -2833,7 +2834,7 @@ hash_multirange(PG_FUNCTION_ARGS) upper_hash = 0; /* Merge hashes of flags and bounds */ - range_hash = hash_uint32((uint32) flags); + range_hash = hash_bytes_uint32((uint32) flags); range_hash ^= lower_hash; range_hash = pg_rotate_left32(range_hash, 1); range_hash ^= upper_hash; diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index f03fcc1147b..9fd211b2d45 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -12,8 +12,6 @@ #include <netinet/in.h> #include <arpa/inet.h> -#include "access/stratnum.h" -#include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" #include "common/hashfn.h" #include "common/ip.h" diff --git a/src/backend/utils/adt/network_spgist.c b/src/backend/utils/adt/network_spgist.c index a84747d9275..602276a35c3 100644 --- a/src/backend/utils/adt/network_spgist.c +++ b/src/backend/utils/adt/network_spgist.c @@ -37,7 +37,6 @@ #include "catalog/pg_type.h" #include "utils/fmgrprotos.h" #include "utils/inet.h" -#include "varatt.h" static int inet_spg_node_number(const inet *val, int commonbits); diff --git 
a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index 40dcbc7b671..c9233565d57 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -1958,9 +1958,11 @@ generate_series_numeric_support(PG_FUNCTION_ARGS) * in the histogram. width_bucket() returns an integer indicating the * bucket number that 'operand' belongs to in an equiwidth histogram * with the specified characteristics. An operand smaller than the - * lower bound is assigned to bucket 0. An operand greater than the - * upper bound is assigned to an additional bucket (with number - * count+1). We don't allow "NaN" for any of the numeric arguments. + * lower bound is assigned to bucket 0. An operand greater than or equal + * to the upper bound is assigned to an additional bucket (with number + * count+1). We don't allow the histogram bounds to be NaN or +/- infinity, + * but we do allow those values for the operand (taking NaN to be larger + * than any other value, as we do in comparisons). */ Datum width_bucket_numeric(PG_FUNCTION_ARGS) @@ -1978,17 +1980,13 @@ width_bucket_numeric(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), errmsg("count must be greater than zero"))); - if (NUMERIC_IS_SPECIAL(operand) || - NUMERIC_IS_SPECIAL(bound1) || - NUMERIC_IS_SPECIAL(bound2)) + if (NUMERIC_IS_SPECIAL(bound1) || NUMERIC_IS_SPECIAL(bound2)) { - if (NUMERIC_IS_NAN(operand) || - NUMERIC_IS_NAN(bound1) || - NUMERIC_IS_NAN(bound2)) + if (NUMERIC_IS_NAN(bound1) || NUMERIC_IS_NAN(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), - errmsg("operand, lower bound, and upper bound cannot be NaN"))); - /* We allow "operand" to be infinite; cmp_numerics will cope */ + errmsg("lower and upper bounds cannot be NaN"))); + if (NUMERIC_IS_INF(bound1) || NUMERIC_IS_INF(bound2)) ereport(ERROR, (errcode(ERRCODE_INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index a858f27cadc..97c2ac1faf9 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -41,11 +41,11 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" -#include "utils/formatting.h" #include "utils/guc_hooks.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_locale.h" +#include "utils/relcache.h" #include "utils/syscache.h" #ifdef WIN32 @@ -79,31 +79,6 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); extern char *get_collation_actual_version_libc(const char *collcollate); -extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, 
- ssize_t srclen, pg_locale_t locale); - -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - /* GUC settings */ char *locale_messages; char *locale_monetary; @@ -1092,6 +1067,9 @@ create_pg_locale(Oid collid, MemoryContext context) Assert((result->collate_is_c && result->collate == NULL) || (!result->collate_is_c && result->collate != NULL)); + Assert((result->ctype_is_c && result->ctype == NULL) || + (!result->ctype_is_c && result->ctype != NULL)); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, &isnull); if (!isnull) @@ -1256,77 +1234,31 @@ size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strlower_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strlower_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } size_t pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strtitle_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strtitle_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strtitle_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strtitle(dst, dstsize, src, srclen, locale); } size_t pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strupper_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strupper_icu(dst, dstsize, src, srclen, locale); -#endif - else if (locale->provider == COLLPROVIDER_LIBC) - return strupper_libc(dst, dstsize, src, srclen, locale); - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strupper(dst, dstsize, src, srclen, locale); } size_t pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale) { - if (locale->provider == COLLPROVIDER_BUILTIN) - return strfold_builtin(dst, dstsize, src, srclen, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - return strfold_icu(dst, dstsize, src, srclen, locale); -#endif - /* for libc, just use strlower */ - else if (locale->provider == COLLPROVIDER_LIBC) - return strlower_libc(dst, dstsize, src, srclen, locale); + if (locale->ctype->strfold) + return locale->ctype->strfold(dst, dstsize, src, srclen, locale); else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return 0; /* keep compiler quiet */ + return locale->ctype->strlower(dst, dstsize, src, srclen, locale); } /* @@ -1464,6 
+1396,41 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, } /* + * char_is_cased() + * + * Fuzzy test of whether the given char is case-varying or not. The argument + * is a single byte, so in a multibyte encoding, just assume any non-ASCII + * char is case-varying. + */ +bool +char_is_cased(char ch, pg_locale_t locale) +{ + return locale->ctype->char_is_cased(ch, locale); +} + +/* + * char_tolower_enabled() + * + * Does the provider support char_tolower()? + */ +bool +char_tolower_enabled(pg_locale_t locale) +{ + return (locale->ctype->char_tolower != NULL); +} + +/* + * char_tolower() + * + * Convert char (single-byte encoding) to lowercase. + */ +char +char_tolower(unsigned char ch, pg_locale_t locale) +{ + return locale->ctype->char_tolower(ch, locale); +} + +/* * Return required encoding ID for the given locale, or -1 if any encoding is * valid for the locale. */ diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index f51768830cd..0c9fbdb40f2 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -18,22 +18,12 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" -#include "utils/memutils.h" #include "utils/pg_locale.h" #include "utils/syscache.h" extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); extern char *get_collation_actual_version_builtin(const char *collcollate); -extern size_t strlower_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_builtin(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); - struct WordBoundaryState { @@ -77,7 +67,7 @@ initcap_wbnext(void *state) return wbstate->len; } -size_t +static size_t strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -85,7 +75,7 @@ strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -103,7 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, initcap_wbnext, &wbstate); } -size_t +static size_t strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -111,7 +101,7 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } -size_t +static size_t strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -119,6 +109,98 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } +static bool +wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isdigit(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalpha(wc); +} + +static bool +wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isalnum(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isupper(wc); +} + +static 
bool +wc_islower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_islower(wc); +} + +static bool +wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isgraph(wc); +} + +static bool +wc_isprint_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isprint(wc); +} + +static bool +wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_ispunct(wc, !locale->info.builtin.casemap_full); +} + +static bool +wc_isspace_builtin(pg_wchar wc, pg_locale_t locale) +{ + return pg_u_isspace(wc); +} + +static bool +char_is_cased_builtin(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +wc_toupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_uppercase_simple(wc); +} + +static pg_wchar +wc_tolower_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_lowercase_simple(wc); +} + +static const struct ctype_methods ctype_methods_builtin = { + .strlower = strlower_builtin, + .strtitle = strtitle_builtin, + .strupper = strupper_builtin, + .strfold = strfold_builtin, + .wc_isdigit = wc_isdigit_builtin, + .wc_isalpha = wc_isalpha_builtin, + .wc_isalnum = wc_isalnum_builtin, + .wc_isupper = wc_isupper_builtin, + .wc_islower = wc_islower_builtin, + .wc_isgraph = wc_isgraph_builtin, + .wc_isprint = wc_isprint_builtin, + .wc_ispunct = wc_ispunct_builtin, + .wc_isspace = wc_isspace_builtin, + .char_is_cased = char_is_cased_builtin, + .wc_tolower = wc_tolower_builtin, + .wc_toupper = wc_toupper_builtin, +}; + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { @@ -158,10 +240,11 @@ create_pg_locale_builtin(Oid collid, MemoryContext context) result->info.builtin.locale = MemoryContextStrdup(context, locstr); result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0); - result->provider = COLLPROVIDER_BUILTIN; result->deterministic = true; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); + if (!result->ctype_is_c) + result->ctype = &ctype_methods_builtin; return result; } diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index a32c32a0744..96741e08269 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -48,19 +48,22 @@ #define TEXTBUFLEN 1024 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); -extern size_t strlower_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strfold_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU extern UCollator *pg_ucol_open(const char *loc_str); +static size_t strlower_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strupper_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static size_t strfold_icu(char *dest, size_t destsize, const char *src, + ssize_t srclen, pg_locale_t locale); +static int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); static size_t strnxfrm_icu(char 
*dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -118,6 +121,25 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); +static bool +char_is_cased_icu(char ch, pg_locale_t locale) +{ + return IS_HIGHBIT_SET(ch) || + (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); +} + +static pg_wchar +toupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_toupper(wc); +} + +static pg_wchar +tolower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_tolower(wc); +} + static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, .strnxfrm = strnxfrm_icu, @@ -136,6 +158,78 @@ static const struct collate_methods collate_methods_icu_utf8 = { .strxfrm_is_safe = true, }; +static bool +wc_isdigit_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isdigit(wc); +} + +static bool +wc_isalpha_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalpha(wc); +} + +static bool +wc_isalnum_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isalnum(wc); +} + +static bool +wc_isupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isupper(wc); +} + +static bool +wc_islower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_islower(wc); +} + +static bool +wc_isgraph_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isgraph(wc); +} + +static bool +wc_isprint_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isprint(wc); +} + +static bool +wc_ispunct_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_ispunct(wc); +} + +static bool +wc_isspace_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_isspace(wc); +} + +static const struct ctype_methods ctype_methods_icu = { + .strlower = strlower_icu, + .strtitle = strtitle_icu, + .strupper = strupper_icu, + .strfold = strfold_icu, + .wc_isdigit = wc_isdigit_icu, + .wc_isalpha = wc_isalpha_icu, + .wc_isalnum = wc_isalnum_icu, + .wc_isupper = wc_isupper_icu, + .wc_islower = wc_islower_icu, + .wc_isgraph = wc_isgraph_icu, + .wc_isprint = wc_isprint_icu, + .wc_ispunct = wc_ispunct_icu, + .wc_isspace = wc_isspace_icu, + .char_is_cased = char_is_cased_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; #endif pg_locale_t @@ -198,7 +292,6 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->info.icu.locale = MemoryContextStrdup(context, iculocstr); result->info.icu.ucol = collator; - result->provider = COLLPROVIDER_ICU; result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -206,6 +299,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result->collate = &collate_methods_icu_utf8; else result->collate = &collate_methods_icu; + result->ctype = &ctype_methods_icu; return result; #else @@ -379,7 +473,7 @@ make_icu_collator(const char *iculocstr, const char *icurules) } } -size_t +static size_t strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -399,7 +493,7 @@ strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -419,7 +513,7 @@ strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -439,7 +533,7 @@ strupper_icu(char *dest, size_t 
destsize, const char *src, ssize_t srclen, return result_len; } -size_t +static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { @@ -474,8 +568,6 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 int result; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); status = U_ZERO_ERROR; @@ -503,8 +595,6 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - init_icu_converter(); ulen = uchar_length(icu_converter, src, srclen); @@ -549,8 +639,6 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, uint32_t state[2]; UErrorCode status; - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() == PG_UTF8); uiter_setUTF8(&iter, src, srclen); @@ -749,8 +837,6 @@ strncoll_icu(const char *arg1, ssize_t len1, *uchar2; int result; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strncoll_icu_utf8 */ #ifdef HAVE_UCOL_STRCOLLUTF8 Assert(GetDatabaseEncoding() != PG_UTF8); @@ -799,8 +885,6 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, size_t uchar_bsize; Size result_bsize; - Assert(locale->provider == COLLPROVIDER_ICU); - /* if encoding is UTF8, use more efficient strnxfrm_prefix_icu_utf8 */ Assert(GetDatabaseEncoding() != PG_UTF8); diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 199857e22db..8d88b53c375 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -34,6 +34,46 @@ #endif /* + * For the libc provider, to provide as much functionality as possible on a + * variety of platforms without going so far as to implement everything from + * scratch, we use several implementation strategies depending on the + * situation: + * + * 1. In C/POSIX collations, we use hard-wired code. We can't depend on + * the <ctype.h> functions since those will obey LC_CTYPE. Note that these + * collations don't give a fig about multibyte characters. + * + * 2. When working in UTF8 encoding, we use the <wctype.h> functions. + * This assumes that every platform uses Unicode codepoints directly + * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption + * even for non-UTF8 encodings, which may be a problem.) On some platforms + * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. + * + * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar + * values up to 255, and punt for values above that. This is 100% correct + * only in single-byte encodings such as LATINn. However, non-Unicode + * multibyte encodings are mostly Far Eastern character sets for which the + * properties being tested here aren't very relevant for higher code values + * anyway. The difficulty with using the <wctype.h> functions with + * non-Unicode multibyte encodings is that we can have no certainty that + * the platform's wchar_t representation matches what we do in pg_wchar + * conversions. + * + * As a special case, in the "default" collation, (2) and (3) force ASCII + * letters to follow ASCII upcase/downcase rules, while in a non-default + * collation we just let the library functions do what they will. The case + * where this matters is treatment of I/i in Turkish, and the behavior is + * meant to match the upper()/lower() SQL functions. 
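Strategies (2) and (3) translate into small range guards around the <wctype.h> and <ctype.h> calls. A standalone sketch against the C library's current locale (the real code uses the _l variants with locale->info.lt):

    #include <ctype.h>
    #include <wchar.h>
    #include <wctype.h>

    /* Strategy (3): non-UTF8 encodings, classify only code points up to 255. */
    static int
    isalpha_other_mb(unsigned int wc)
    {
        return (wc <= 255) ? isalpha((unsigned char) wc) : 0;
    }

    /* Strategy (2): UTF8, go through <wctype.h>; punt on code points above
     * 0xFFFF when wchar_t is only 16 bits wide. */
    static int
    isalpha_utf8(unsigned int wc)
    {
        if (sizeof(wchar_t) >= 4 || wc <= 0xFFFF)
            return iswalpha((wint_t) wc);
        return 0;
    }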
+ * + * We store the active collation setting in static variables. In principle + * it could be passed down to here via the regex library's "struct vars" data + * structure; but that would require somewhat invasive changes in the regex + * library, and right now there's no real benefit to be gained from that. + * + * NB: the coding here assumes pg_wchar is an unsigned type. + */ + +/* * Size of stack buffer to use for string transformations, used to avoid heap * allocations in typical cases. This should be large enough that most strings * will fit, but small enough that we feel comfortable putting it on the @@ -43,13 +83,6 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); - static int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -85,6 +118,251 @@ static size_t strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static bool +wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isdigit_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalpha_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isalnum_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isupper_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return islower_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isgraph_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isprint_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return ispunct_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + return isspace_l((unsigned char) wc, locale->info.lt); +} + +static bool +wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswdigit_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalpha_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswalnum_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswupper_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswlower_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswgraph_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswprint_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return iswpunct_l((wint_t) wc, locale->info.lt); +} + +static bool +wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + return 
iswspace_l((wint_t) wc, locale->info.lt); +} + +static char +char_tolower_libc(unsigned char ch, pg_locale_t locale) +{ + Assert(pg_database_encoding_max_length() == 1); + return tolower_l(ch, locale->info.lt); +} + +static bool +char_is_cased_libc(char ch, pg_locale_t locale) +{ + bool is_multibyte = pg_database_encoding_max_length() > 1; + + if (is_multibyte && IS_HIGHBIT_SET(ch)) + return true; + else + return isalpha_l((unsigned char) ch, locale->info.lt); +} + +static pg_wchar +toupper_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + if (wc <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +toupper_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) wc); + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towupper_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_sb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() != PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + if (wc <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc_mb(pg_wchar wc, pg_locale_t locale) +{ + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* force C behavior for ASCII characters, per comments above */ + if (locale->is_default && wc <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) wc); + if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) + return towlower_l((wint_t) wc, locale->info.lt); + else + return wc; +} + +static const struct ctype_methods ctype_methods_libc_sb = { + .strlower = strlower_libc_sb, + .strtitle = strtitle_libc_sb, + .strupper = strupper_libc_sb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +/* + * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but + * single-byte semantics for pattern matching. 
+ */ +static const struct ctype_methods ctype_methods_libc_other_mb = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_sb, + .wc_isalpha = wc_isalpha_libc_sb, + .wc_isalnum = wc_isalnum_libc_sb, + .wc_isupper = wc_isupper_libc_sb, + .wc_islower = wc_islower_libc_sb, + .wc_isgraph = wc_isgraph_libc_sb, + .wc_isprint = wc_isprint_libc_sb, + .wc_ispunct = wc_ispunct_libc_sb, + .wc_isspace = wc_isspace_libc_sb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_sb, + .wc_tolower = tolower_libc_sb, + .max_chr = UCHAR_MAX, +}; + +static const struct ctype_methods ctype_methods_libc_utf8 = { + .strlower = strlower_libc_mb, + .strtitle = strtitle_libc_mb, + .strupper = strupper_libc_mb, + .wc_isdigit = wc_isdigit_libc_mb, + .wc_isalpha = wc_isalpha_libc_mb, + .wc_isalnum = wc_isalnum_libc_mb, + .wc_isupper = wc_isupper_libc_mb, + .wc_islower = wc_islower_libc_mb, + .wc_isgraph = wc_isgraph_libc_mb, + .wc_isprint = wc_isprint_libc_mb, + .wc_ispunct = wc_ispunct_libc_mb, + .wc_isspace = wc_isspace_libc_mb, + .char_is_cased = char_is_cased_libc, + .char_tolower = char_tolower_libc, + .wc_toupper = toupper_libc_mb, + .wc_tolower = tolower_libc_mb, +}; + static const struct collate_methods collate_methods_libc = { .strncoll = strncoll_libc, .strnxfrm = strnxfrm_libc, @@ -119,36 +397,6 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = { }; #endif -size_t -strlower_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strlower_libc_mb(dst, dstsize, src, srclen, locale); - else - return strlower_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strtitle_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strtitle_libc_mb(dst, dstsize, src, srclen, locale); - else - return strtitle_libc_sb(dst, dstsize, src, srclen, locale); -} - -size_t -strupper_libc(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) -{ - if (pg_database_encoding_max_length() > 1) - return strupper_libc_mb(dst, dstsize, src, srclen, locale); - else - return strupper_libc_sb(dst, dstsize, src, srclen, locale); -} - static size_t strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) @@ -209,7 +457,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) workspace[curr_char] = towlower_l(workspace[curr_char], loc); @@ -220,7 +468,7 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -304,7 +552,7 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); 
- char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) { @@ -321,7 +569,7 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -392,7 +640,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, /* Output workspace cannot have more codes than input bytes */ workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t)); - char2wchar(workspace, srclen + 1, src, srclen, locale); + char2wchar(workspace, srclen + 1, src, srclen, loc); for (curr_char = 0; workspace[curr_char] != 0; curr_char++) workspace[curr_char] = towupper_l(workspace[curr_char], loc); @@ -403,7 +651,7 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, max_size = curr_char * pg_database_encoding_max_length(); result = palloc(max_size + 1); - result_size = wchar2char(result, workspace, max_size + 1, locale); + result_size = wchar2char(result, workspace, max_size + 1, loc); if (result_size + 1 > destsize) return result_size; @@ -465,7 +713,6 @@ create_pg_locale_libc(Oid collid, MemoryContext context) loc = make_libc_collator(collate, ctype); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); - result->provider = COLLPROVIDER_LIBC; result->deterministic = true; result->collate_is_c = (strcmp(collate, "C") == 0) || (strcmp(collate, "POSIX") == 0); @@ -481,6 +728,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context) #endif result->collate = &collate_methods_libc; } + if (!result->ctype_is_c) + { + if (GetDatabaseEncoding() == PG_UTF8) + result->ctype = &ctype_methods_libc_utf8; + else if (pg_database_encoding_max_length() > 1) + result->ctype = &ctype_methods_libc_other_mb; + else + result->ctype = &ctype_methods_libc_sb; + } return result; } @@ -576,8 +832,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, const char *arg2n; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); @@ -632,8 +886,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; - Assert(locale->provider == COLLPROVIDER_LIBC); - if (srclen == -1) return strxfrm_l(dest, src, destsize, locale->info.lt); @@ -742,7 +994,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, int r; int result; - Assert(locale->provider == COLLPROVIDER_LIBC); Assert(GetDatabaseEncoding() == PG_UTF8); if (len1 == -1) @@ -879,7 +1130,7 @@ wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc) * zero-terminated. The output will be zero-terminated iff there is room. 
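
With this commit, char2wchar() and wchar2char() take a bare locale_t instead of a pg_locale_t (see the signature change just below), so the case-mapping code passes its locale_t straight through. A sketch of the updated call shape, reusing the lines visible in this hunk:

    /* "loc" is a locale_t; (locale_t) 0 selects the default locale. */
    wchar_t    *workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));

    char2wchar(workspace, srclen + 1, src, srclen, loc);
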
*/ size_t -wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) +wchar2char(char *to, const wchar_t *from, size_t tolen, locale_t loc) { size_t result; @@ -909,7 +1160,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) } else #endif /* WIN32 */ - if (locale == (pg_locale_t) 0) + if (loc == (locale_t) 0) { /* Use wcstombs directly for the default locale */ result = wcstombs(to, from, tolen); @@ -917,7 +1168,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) else { /* Use wcstombs_l for nondefault locales */ - result = wcstombs_l(to, from, tolen, locale->info.lt); + result = wcstombs_l(to, from, tolen, loc); } return result; @@ -934,7 +1185,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale) */ size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, - pg_locale_t locale) + locale_t loc) { size_t result; @@ -969,7 +1220,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, /* mbstowcs requires ending '\0' */ char *str = pnstrdup(from, fromlen); - if (locale == (pg_locale_t) 0) + if (loc == (locale_t) 0) { /* Use mbstowcs directly for the default locale */ result = mbstowcs(to, str, tolen); @@ -977,7 +1228,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, else { /* Use mbstowcs_l for nondefault locales */ - result = mbstowcs_l(to, str, tolen, locale->info.lt); + result = mbstowcs_l(to, str, tolen, loc); } pfree(str); diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index 16311590a14..12de2446f5b 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -83,7 +83,7 @@ pg_lsn_out(PG_FUNCTION_ARGS) char buf[MAXPG_LSNLEN + 1]; char *result; - snprintf(buf, sizeof buf, "%X/%X", LSN_FORMAT_ARGS(lsn)); + snprintf(buf, sizeof buf, "%X/%08X", LSN_FORMAT_ARGS(lsn)); result = pstrdup(buf); PG_RETURN_CSTRING(result); } diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index d44f8c262ba..a4f8b4faa90 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -21,6 +21,7 @@ #include "commands/extension.h" #include "miscadmin.h" #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" @@ -410,3 +411,21 @@ binary_upgrade_replorigin_advance(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * binary_upgrade_create_conflict_detection_slot + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins. 
+ */ +Datum +binary_upgrade_create_conflict_detection_slot(PG_FUNCTION_ARGS) +{ + CHECK_IS_BINARY_UPGRADE; + + CreateConflictDetectionSlot(); + + ReplicationSlotRelease(); + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 97af7c6554f..c756c2bebaa 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -640,10 +640,10 @@ pg_stat_get_activity(PG_FUNCTION_ARGS) values[28] = BoolGetDatum(false); /* GSS credentials not * delegated */ } - if (beentry->st_query_id == 0) + if (beentry->st_query_id == INT64CONST(0)) nulls[30] = true; else - values[30] = UInt64GetDatum(beentry->st_query_id); + values[30] = Int64GetDatum(beentry->st_query_id); } else { @@ -1510,7 +1510,7 @@ pg_stat_io_build_tuples(ReturnSetInfo *rsinfo, bktype_stats->bytes[io_obj][io_context][io_op]; /* Convert to numeric */ - snprintf(buf, sizeof buf, UINT64_FORMAT, byte); + snprintf(buf, sizeof buf, INT64_FORMAT, byte); values[byte_idx] = DirectFunctionCall3(numeric_in, CStringGetDatum(buf), ObjectIdGetDatum(0), @@ -2171,7 +2171,7 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) Datum pg_stat_get_subscription_stats(PG_FUNCTION_ARGS) { -#define PG_STAT_GET_SUBSCRIPTION_STATS_COLS 11 +#define PG_STAT_GET_SUBSCRIPTION_STATS_COLS 12 Oid subid = PG_GETARG_OID(0); TupleDesc tupdesc; Datum values[PG_STAT_GET_SUBSCRIPTION_STATS_COLS] = {0}; @@ -2197,15 +2197,17 @@ pg_stat_get_subscription_stats(PG_FUNCTION_ARGS) INT8OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "confl_update_exists", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 7, "confl_update_missing", + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "confl_update_deleted", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 8, "confl_delete_origin_differs", + TupleDescInitEntry(tupdesc, (AttrNumber) 8, "confl_update_missing", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 9, "confl_delete_missing", + TupleDescInitEntry(tupdesc, (AttrNumber) 9, "confl_delete_origin_differs", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 10, "confl_multiple_unique_conflicts", + TupleDescInitEntry(tupdesc, (AttrNumber) 10, "confl_delete_missing", INT8OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 11, "stats_reset", + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "confl_multiple_unique_conflicts", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "stats_reset", TIMESTAMPTZOID, -1, 0); BlessTupleDesc(tupdesc); diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c index 66cc0acf4a7..c83b239b3bb 100644 --- a/src/backend/utils/adt/rangetypes.c +++ b/src/backend/utils/adt/rangetypes.c @@ -285,8 +285,7 @@ range_send(PG_FUNCTION_ARGS) if (RANGE_HAS_LBOUND(flags)) { - Datum bound = PointerGetDatum(SendFunctionCall(&cache->typioproc, - lower.val)); + bytea *bound = SendFunctionCall(&cache->typioproc, lower.val); uint32 bound_len = VARSIZE(bound) - VARHDRSZ; char *bound_data = VARDATA(bound); @@ -296,8 +295,7 @@ range_send(PG_FUNCTION_ARGS) if (RANGE_HAS_UBOUND(flags)) { - Datum bound = PointerGetDatum(SendFunctionCall(&cache->typioproc, - upper.val)); + bytea *bound = SendFunctionCall(&cache->typioproc, upper.val); uint32 bound_len = VARSIZE(bound) - VARHDRSZ; char *bound_data = VARDATA(bound); @@ -1444,7 +1442,7 @@ hash_range(PG_FUNCTION_ARGS) upper_hash = 0; /* Merge hashes of flags and bounds */ - result = hash_uint32((uint32) flags); + result = hash_bytes_uint32((uint32) flags); result ^= 
lower_hash; result = pg_rotate_left32(result, 1); result ^= upper_hash; diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index edee1f7880b..6e2864cbbda 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -773,8 +773,11 @@ similar_escape_internal(text *pat_text, text *esc_text) int plen, elen; bool afterescape = false; - bool incharclass = false; int nquotes = 0; + int charclass_depth = 0; /* Nesting level of character classes, + * encompassed by square brackets */ + int charclass_start = 0; /* State of the character class start, + * for carets */ p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); @@ -904,7 +907,7 @@ similar_escape_internal(text *pat_text, text *esc_text) /* fast path */ if (afterescape) { - if (pchar == '"' && !incharclass) /* escape-double-quote? */ + if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */ { /* emit appropriate part separator, per notes above */ if (nquotes == 0) @@ -953,18 +956,41 @@ similar_escape_internal(text *pat_text, text *esc_text) /* SQL escape character; do not send to output */ afterescape = true; } - else if (incharclass) + else if (charclass_depth > 0) { if (pchar == '\\') *r++ = '\\'; *r++ = pchar; - if (pchar == ']') - incharclass = false; + + /* + * Ignore a closing bracket at the start of a character class. + * Such a bracket is taken literally rather than closing the + * class. "charclass_start" is 1 right at the beginning of a + * class and 2 after an initial caret. + */ + if (pchar == ']' && charclass_start > 2) + charclass_depth--; + else if (pchar == '[') + charclass_depth++; + + /* + * If there is a caret right after the opening bracket, it negates + * the character class, but a following closing bracket should + * still be treated as a normal character. That holds only for + * the first caret, so only the values 1 and 2 mean that closing + * brackets should be taken literally. + */ + if (pchar == '^') + charclass_start++; + else + charclass_start = 3; /* definitely past the start */ } else if (pchar == '[') { + /* start of a character class */ *r++ = pchar; - incharclass = true; + charclass_depth++; + charclass_start = 1; } else if (pchar == '%') { diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c index 5ee608a2b39..b8bbe95e82e 100644 --- a/src/backend/utils/adt/regproc.c +++ b/src/backend/utils/adt/regproc.c @@ -30,6 +30,7 @@ #include "catalog/pg_ts_config.h" #include "catalog/pg_ts_dict.h" #include "catalog/pg_type.h" +#include "commands/dbcommands.h" #include "lib/stringinfo.h" #include "mb/pg_wchar.h" #include "miscadmin.h" @@ -1764,6 +1765,123 @@ regnamespacesend(PG_FUNCTION_ARGS) } /* + * regdatabasein - converts database name to database OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_database entry. + */ +Datum +regdatabasein(PG_FUNCTION_ARGS) +{ + char *db_name_or_oid = PG_GETARG_CSTRING(0); + Node *escontext = fcinfo->context; + Oid result; + List *names; + + /* Handle "-" or numeric OID */ + if (parseDashOrOid(db_name_or_oid, &result, escontext)) + PG_RETURN_OID(result); + + /* The rest of this wouldn't work in bootstrap mode */ + if (IsBootstrapProcessingMode()) + elog(ERROR, "regdatabase values must be OIDs in bootstrap mode"); + + /* Normal case: see if the name matches any pg_database entry. 
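
regdatabasein() reports failures through escontext (soft errors via ereturn) rather than throwing, which is what lets to_regdatabase() below turn bad input into NULL. A sketch of driving it safely, reusing the exact call shape from to_regdatabase(); the database name here is illustrative:

    ErrorSaveContext escontext = {T_ErrorSaveContext};
    Datum       result;

    /* Returns false, instead of erroring out, on invalid input. */
    if (!DirectInputFunctionCallSafe(regdatabasein, "template1",
                                     InvalidOid, -1,
                                     (Node *) &escontext,
                                     &result))
        PG_RETURN_NULL();
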
*/ + names = stringToQualifiedNameList(db_name_or_oid, escontext); + if (names == NIL) + PG_RETURN_NULL(); + + if (list_length(names) != 1) + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_NAME), + errmsg("invalid name syntax"))); + + result = get_database_oid(strVal(linitial(names)), true); + + if (!OidIsValid(result)) + ereturn(escontext, (Datum) 0, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("database \"%s\" does not exist", + strVal(linitial(names))))); + + PG_RETURN_OID(result); +} + +/* + * to_regdatabase - converts database name to database OID + * + * If the name is not found, we return NULL. + */ +Datum +to_regdatabase(PG_FUNCTION_ARGS) +{ + char *db_name = text_to_cstring(PG_GETARG_TEXT_PP(0)); + Datum result; + ErrorSaveContext escontext = {T_ErrorSaveContext}; + + if (!DirectInputFunctionCallSafe(regdatabasein, db_name, + InvalidOid, -1, + (Node *) &escontext, + &result)) + PG_RETURN_NULL(); + PG_RETURN_DATUM(result); +} + +/* + * regdatabaseout - converts database OID to database name + */ +Datum +regdatabaseout(PG_FUNCTION_ARGS) +{ + Oid dboid = PG_GETARG_OID(0); + char *result; + + if (dboid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + result = get_database_name(dboid); + + if (result) + { + /* pstrdup is not really necessary, but it avoids a compiler warning */ + result = pstrdup(quote_identifier(result)); + } + else + { + /* If OID doesn't match any database, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", dboid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regdatabaserecv - converts external binary format to regdatabase + */ +Datum +regdatabaserecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regdatabasesend - converts regdatabase to binary format + */ +Datum +regdatabasesend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + +/* * text_regclass: convert text to regclass * * This could be replaced by CoerceViaIO, except that we need to treat diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c index 6239900fa28..059fc5ebf60 100644 --- a/src/backend/utils/adt/ri_triggers.c +++ b/src/backend/utils/adt/ri_triggers.c @@ -30,7 +30,6 @@ #include "access/xact.h" #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" -#include "catalog/pg_proc.h" #include "commands/trigger.h" #include "executor/executor.h" #include "executor/spi.h" @@ -46,7 +45,6 @@ #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" -#include "utils/rangetypes.h" #include "utils/rel.h" #include "utils/rls.h" #include "utils/ruleutils.h" diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 467b08198b8..3d6e6bdbfd2 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -5956,9 +5956,19 @@ get_select_query_def(Query *query, deparse_context *context) { if (query->limitOption == LIMIT_OPTION_WITH_TIES) { + /* + * The limitCount arg is a c_expr, so it needs parens. Simple + * literals and function expressions would not need parens, but + * unfortunately it's hard to tell if the expression will be + * printed as a simple literal like 123 or as a typecast + * expression, like '-123'::int4. The grammar accepts the former + * without quoting, but not the latter. 
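
Concretely (illustrative SQL, not taken from the patch): a limit that deparses as a bare literal would be accepted either way, but one that deparses as a typecast is only valid inside the parentheses this hunk adds:

    /*
     * FETCH FIRST 123 ROWS WITH TIES             -- accepted by the grammar
     * FETCH FIRST '-123'::int4 ROWS WITH TIES    -- rejected without parens
     * FETCH FIRST ('-123'::int4) ROWS WITH TIES  -- what ruleutils now emits
     */
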
+ */ appendContextKeyword(context, " FETCH FIRST ", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + appendStringInfoChar(buf, '('); get_rule_expr(query->limitCount, context, false); + appendStringInfoChar(buf, ')'); appendStringInfoString(buf, " ROWS WITH TIES"); } else diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index a96b1b9c0bc..17fbfa9b410 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -103,7 +103,6 @@ #include "access/table.h" #include "access/tableam.h" #include "access/visibilitymap.h" -#include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" #include "catalog/pg_statistic.h" @@ -3799,18 +3798,25 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, List *hashclauses, Selectivity *innerbucketsize) { - List *clauses = list_copy(hashclauses); - List *otherclauses = NIL; - double ndistinct = 1.0; + List *clauses; + List *otherclauses; + double ndistinct; if (list_length(hashclauses) <= 1) - + { /* * Nothing to do for a single clause. Could we employ univariate * extended stat here? */ return hashclauses; + } + /* "clauses" is the list of hashclauses we've not dealt with yet */ + clauses = list_copy(hashclauses); + /* "otherclauses" holds clauses we are going to return to caller */ + otherclauses = NIL; + /* current estimate of ndistinct */ + ndistinct = 1.0; while (clauses != NIL) { ListCell *lc; @@ -3875,12 +3881,13 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, group_rel = root->simple_rel_array[relid]; } else if (group_relid != relid) - + { /* * Being in the group forming state we don't need other * clauses. */ continue; + } /* * We're going to add the new clause to the varinfos list. We @@ -4620,6 +4627,7 @@ convert_to_scalar(Datum value, Oid valuetypid, Oid collid, double *scaledvalue, case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: *scaledvalue = convert_numeric_to_scalar(value, valuetypid, &failure); *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid, @@ -4752,6 +4760,7 @@ convert_numeric_to_scalar(Datum value, Oid typid, bool *failure) case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: /* we can treat OIDs as integers... */ return (double) DatumGetObjectId(value); } diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 1b0df111717..39dab3e42df 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -84,7 +84,7 @@ tidin(PG_FUNCTION_ARGS) /* * Cope with possibility that unsigned long is wider than BlockNumber, in * which case strtoul will not raise an error for some values that are out - * of the range of BlockNumber. (See similar code in oidin().) + * of the range of BlockNumber. (See similar code in uint32in_subr().) 
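
A standalone illustration of that round-trip check (hypothetical demo program; the backend code additionally accepts the value's negative-int32 spelling, which this sketch omits):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        const char *in = "4294967296";  /* 2^32: one past uint32's range */
        unsigned long cvt = strtoul(in, NULL, 10);
        uint32_t    blockNumber = (uint32_t) cvt;

        /* With 64-bit long, strtoul() succeeds even though the value
         * no longer fits after truncation to 32 bits. */
        if (sizeof(long) > 4 && cvt != (unsigned long) blockNumber)
            printf("out of range for BlockNumber\n");
        return 0;
    }
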
*/ #if SIZEOF_LONG > 4 if (cvt != (unsigned long) blockNumber && diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 347089b7626..e640b48205b 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -4954,7 +4954,7 @@ timestamptz_trunc_internal(text *units, TimestampTz timestamp, pg_tz *tzp) case DTK_SECOND: case DTK_MILLISEC: case DTK_MICROSEC: - PG_RETURN_TIMESTAMPTZ(timestamp); + return timestamp; break; default: @@ -5312,10 +5312,10 @@ isoweekdate2date(int isoweek, int wday, int *year, int *mon, int *mday) int date2isoweek(int year, int mon, int mday) { - float8 result; int day0, day4, - dayn; + dayn, + week; /* current day */ dayn = date2j(year, mon, mday); @@ -5338,13 +5338,13 @@ date2isoweek(int year, int mon, int mday) day0 = j2day(day4 - 1); } - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; /* * Sometimes the last few days in a year will fall into the first week of * the next year, so check for this. */ - if (result >= 52) + if (week >= 52) { day4 = date2j(year + 1, 1, 4); @@ -5352,10 +5352,10 @@ date2isoweek(int year, int mon, int mday) day0 = j2day(day4 - 1); if (dayn >= day4 - day0) - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; } - return (int) result; + return week; } @@ -5367,10 +5367,10 @@ date2isoweek(int year, int mon, int mday) int date2isoyear(int year, int mon, int mday) { - float8 result; int day0, day4, - dayn; + dayn, + week; /* current day */ dayn = date2j(year, mon, mday); @@ -5395,13 +5395,13 @@ date2isoyear(int year, int mon, int mday) year--; } - result = (dayn - (day4 - day0)) / 7 + 1; + week = (dayn - (day4 - day0)) / 7 + 1; /* * Sometimes the last few days in a year will fall into the first week of * the next year, so check for this. */ - if (result >= 52) + if (week >= 52) { day4 = date2j(year + 1, 1, 4); @@ -6477,7 +6477,7 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) if (TIMESTAMP_NOT_FINITE(timestamp)) return timestamp; - /* We don't expect this to fail, but check it pro forma */ + /* timestamp2tm should not fail on valid timestamps, but cope */ if (timestamp2tm(timestamp, NULL, tm, &fsec, NULL, NULL) == 0) { tz = DetermineTimeZoneOffset(tm, session_timezone); @@ -6485,23 +6485,22 @@ timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow) result = dt2local(timestamp, -tz); if (IS_VALID_TIMESTAMP(result)) - { return result; + } + + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + TIMESTAMP_NOBEGIN(result); } - else if (overflow) + else { - if (result < MIN_TIMESTAMP) - { - *overflow = -1; - TIMESTAMP_NOBEGIN(result); - } - else - { - *overflow = 1; - TIMESTAMP_NOEND(result); - } - return result; + *overflow = 1; + TIMESTAMP_NOEND(result); } + return result; } ereport(ERROR, @@ -6531,27 +6530,81 @@ timestamptz_timestamp(PG_FUNCTION_ARGS) PG_RETURN_TIMESTAMP(timestamptz2timestamp(timestamp)); } +/* + * Convert timestamptz to timestamp, throwing error for overflow. + */ static Timestamp timestamptz2timestamp(TimestampTz timestamp) { + return timestamptz2timestamp_opt_overflow(timestamp, NULL); +} + +/* + * Convert timestamp with time zone to timestamp. + * + * On successful conversion, *overflow is set to zero if it's not NULL. + * + * If the timestamptz is finite but out of the valid range for timestamp, then: + * if overflow is NULL, we throw an out-of-range error. 
+ * if overflow is not NULL, we store +1 or -1 there to indicate the sign + * of the overflow, and return the appropriate timestamp infinity. + */ +Timestamp +timestamptz2timestamp_opt_overflow(TimestampTz timestamp, int *overflow) +{ Timestamp result; struct pg_tm tt, *tm = &tt; fsec_t fsec; int tz; + if (overflow) + *overflow = 0; + if (TIMESTAMP_NOT_FINITE(timestamp)) result = timestamp; else { if (timestamp2tm(timestamp, &tz, tm, &fsec, NULL, NULL) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + TIMESTAMP_NOBEGIN(result); + } + else + { + *overflow = 1; + TIMESTAMP_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } if (tm2timestamp(tm, fsec, NULL, &result) != 0) + { + if (overflow) + { + if (timestamp < 0) + { + *overflow = -1; + TIMESTAMP_NOBEGIN(result); + } + else + { + *overflow = 1; + TIMESTAMP_NOEND(result); + } + return result; + } ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); + } } return result; } diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 1fa1275ca63..0625da9532f 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -329,8 +329,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) if (nulls[i]) continue; - lex = VARDATA(dlexemes[i]); - lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + lex = VARDATA(DatumGetPointer(dlexemes[i])); + lex_len = VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; lex_pos = tsvector_bsearch(tsout, lex, lex_len); if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0) @@ -443,10 +443,10 @@ compare_text_lexemes(const void *va, const void *vb) { Datum a = *((const Datum *) va); Datum b = *((const Datum *) vb); - char *alex = VARDATA_ANY(a); - int alex_len = VARSIZE_ANY_EXHDR(a); - char *blex = VARDATA_ANY(b); - int blex_len = VARSIZE_ANY_EXHDR(b); + char *alex = VARDATA_ANY(DatumGetPointer(a)); + int alex_len = VARSIZE_ANY_EXHDR(DatumGetPointer(a)); + char *blex = VARDATA_ANY(DatumGetPointer(b)); + int blex_len = VARSIZE_ANY_EXHDR(DatumGetPointer(b)); return tsCompareString(alex, alex_len, blex, blex_len, false); } @@ -605,8 +605,8 @@ tsvector_delete_arr(PG_FUNCTION_ARGS) if (nulls[i]) continue; - lex = VARDATA(dlexemes[i]); - lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + lex = VARDATA(DatumGetPointer(dlexemes[i])); + lex_len = VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; lex_pos = tsvector_bsearch(tsin, lex, lex_len); if (lex_pos >= 0) @@ -770,7 +770,7 @@ array_to_tsvector(PG_FUNCTION_ARGS) (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); - if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0) + if (VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ == 0) ereport(ERROR, (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), errmsg("lexeme array may not contain empty strings"))); @@ -786,7 +786,7 @@ array_to_tsvector(PG_FUNCTION_ARGS) /* Calculate space needed for surviving lexemes. */ for (i = 0; i < nitems; i++) - datalen += VARSIZE(dlexemes[i]) - VARHDRSZ; + datalen += VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; tslen = CALCDATASIZE(nitems, datalen); /* Allocate and fill tsvector. 
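
The recurring change in this file (as in heaptuple.c and printtup.c earlier in this commit) is to unwrap the Datum explicitly with DatumGetPointer() before applying varlena macros, instead of passing the Datum itself. A minimal sketch of the discipline (hypothetical helper, assuming a detoasted varlena Datum):

    #include "postgres.h"
    #include "varatt.h"

    /* Payload size of a detoasted varlena value held in a Datum. */
    static int
    varlena_payload_bytes(Datum d)
    {
        struct varlena *v = (struct varlena *) DatumGetPointer(d);

        return VARSIZE_ANY_EXHDR(v);
    }
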
*/ @@ -798,8 +798,8 @@ array_to_tsvector(PG_FUNCTION_ARGS) cur = STRPTR(tsout); for (i = 0; i < nitems; i++) { - char *lex = VARDATA(dlexemes[i]); - int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + char *lex = VARDATA(DatumGetPointer(dlexemes[i])); + int lex_len = VARSIZE(DatumGetPointer(dlexemes[i])) - VARHDRSZ; memcpy(cur, lex, lex_len); arrout[i].haspos = 0; diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 3e4d5568bde..ffae8c23abf 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -35,7 +35,6 @@ #include "port/pg_bswap.h" #include "regex/regex.h" #include "utils/builtins.h" -#include "utils/bytea.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -43,10 +42,6 @@ #include "utils/sortsupport.h" #include "utils/varlena.h" - -/* GUC variable */ -int bytea_output = BYTEA_OUTPUT_HEX; - typedef struct varlena VarString; /* @@ -148,12 +143,6 @@ static int text_position_get_match_pos(TextPositionState *state); static void text_position_cleanup(TextPositionState *state); static void check_collation_set(Oid collid); static int text_cmp(text *arg1, text *arg2, Oid collid); -static bytea *bytea_catenate(bytea *t1, bytea *t2); -static bytea *bytea_substring(Datum str, - int S, - int L, - bool length_not_specified); -static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl); static void appendStringInfoText(StringInfo str, const text *t); static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate); static void split_text_accum_result(SplitTextOutputData *tstate, @@ -279,307 +268,6 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len) * USER I/O ROUTINES * *****************************************************************************/ - -#define VAL(CH) ((CH) - '0') -#define DIG(VAL) ((VAL) + '0') - -/* - * byteain - converts from printable representation of byte array - * - * Non-printable characters must be passed as '\nnn' (octal) and are - * converted to internal form. '\' must be passed as '\\'. - * ereport(ERROR, ...) if bad form. - * - * BUGS: - * The input is scanned twice. - * The error checking of input is minimal. 
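
The whole bytea suite is being deleted from varlena.c in this commit (the removals continue through the hunks below); judging from this section alone the functions appear to be relocated rather than dropped, though the destination file is not shown here. For reference, a standalone demo of the octal escape form that comment describes:

    #include <stdio.h>

    /* '\nnn' encodes one byte in octal; '\\' encodes a backslash. */
    static int
    decode_octal_escape(const char *tp)
    {
        return ((tp[1] - '0') << 6) | ((tp[2] - '0') << 3) | (tp[3] - '0');
    }

    int
    main(void)
    {
        printf("%d\n", decode_octal_escape("\\101"));   /* 65, i.e. 'A' */
        return 0;
    }
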
- */ -Datum -byteain(PG_FUNCTION_ARGS) -{ - char *inputText = PG_GETARG_CSTRING(0); - Node *escontext = fcinfo->context; - char *tp; - char *rp; - int bc; - bytea *result; - - /* Recognize hex input */ - if (inputText[0] == '\\' && inputText[1] == 'x') - { - size_t len = strlen(inputText); - - bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ - result = palloc(bc); - bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), - escontext); - SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */ - - PG_RETURN_BYTEA_P(result); - } - - /* Else, it's the traditional escaped style */ - for (bc = 0, tp = inputText; *tp != '\0'; bc++) - { - if (tp[0] != '\\') - tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - tp += 4; - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - tp += 2; - else - { - /* - * one backslash, not followed by another or ### valid octal - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - bc += VARHDRSZ; - - result = (bytea *) palloc(bc); - SET_VARSIZE(result, bc); - - tp = inputText; - rp = VARDATA(result); - while (*tp != '\0') - { - if (tp[0] != '\\') - *rp++ = *tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - { - bc = VAL(tp[1]); - bc <<= 3; - bc += VAL(tp[2]); - bc <<= 3; - *rp++ = bc + VAL(tp[3]); - - tp += 4; - } - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - { - *rp++ = '\\'; - tp += 2; - } - else - { - /* - * We should never get here. The first pass should not allow it. - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - PG_RETURN_BYTEA_P(result); -} - -/* - * byteaout - converts to printable representation of byte array - * - * In the traditional escaped format, non-printable characters are - * printed as '\nnn' (octal) and '\' as '\\'. - */ -Datum -byteaout(PG_FUNCTION_ARGS) -{ - bytea *vlena = PG_GETARG_BYTEA_PP(0); - char *result; - char *rp; - - if (bytea_output == BYTEA_OUTPUT_HEX) - { - /* Print hex format */ - rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1); - *rp++ = '\\'; - *rp++ = 'x'; - rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp); - } - else if (bytea_output == BYTEA_OUTPUT_ESCAPE) - { - /* Print traditional escaped format */ - char *vp; - uint64 len; - int i; - - len = 1; /* empty string has 1 char */ - vp = VARDATA_ANY(vlena); - for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) - { - if (*vp == '\\') - len += 2; - else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) - len += 4; - else - len++; - } - - /* - * In principle len can't overflow uint32 if the input fit in 1GB, but - * for safety let's check rather than relying on palloc's internal - * check. 
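
The arithmetic behind that check, worked through (illustrative): each non-printable input byte costs 4 output characters, so a maximal ~1 GB bytea of non-printable bytes yields len of roughly 4 * 2^30. That still fits in the uint64 accumulator (and, barely, in uint32), but it is about four times MaxAllocSize, which is why the code tests explicitly rather than trusting palloc() to fail cleanly.
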
- */ - if (len > MaxAllocSize) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg_internal("result of bytea output conversion is too large"))); - rp = result = (char *) palloc(len); - - vp = VARDATA_ANY(vlena); - for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++) - { - if (*vp == '\\') - { - *rp++ = '\\'; - *rp++ = '\\'; - } - else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e) - { - int val; /* holds unprintable chars */ - - val = *vp; - rp[0] = '\\'; - rp[3] = DIG(val & 07); - val >>= 3; - rp[2] = DIG(val & 07); - val >>= 3; - rp[1] = DIG(val & 03); - rp += 4; - } - else - *rp++ = *vp; - } - } - else - { - elog(ERROR, "unrecognized \"bytea_output\" setting: %d", - bytea_output); - rp = result = NULL; /* keep compiler quiet */ - } - *rp = '\0'; - PG_RETURN_CSTRING(result); -} - -/* - * bytearecv - converts external binary format to bytea - */ -Datum -bytearecv(PG_FUNCTION_ARGS) -{ - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - bytea *result; - int nbytes; - - nbytes = buf->len - buf->cursor; - result = (bytea *) palloc(nbytes + VARHDRSZ); - SET_VARSIZE(result, nbytes + VARHDRSZ); - pq_copymsgbytes(buf, VARDATA(result), nbytes); - PG_RETURN_BYTEA_P(result); -} - -/* - * byteasend - converts bytea to binary format - * - * This is a special case: just copy the input... - */ -Datum -byteasend(PG_FUNCTION_ARGS) -{ - bytea *vlena = PG_GETARG_BYTEA_P_COPY(0); - - PG_RETURN_BYTEA_P(vlena); -} - -Datum -bytea_string_agg_transfn(PG_FUNCTION_ARGS) -{ - StringInfo state; - - state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0); - - /* Append the value unless null, preceding it with the delimiter. */ - if (!PG_ARGISNULL(1)) - { - bytea *value = PG_GETARG_BYTEA_PP(1); - bool isfirst = false; - - /* - * You might think we can just throw away the first delimiter, however - * we must keep it as we may be a parallel worker doing partial - * aggregation building a state to send to the main process. We need - * to keep the delimiter of every aggregation so that the combine - * function can properly join up the strings of two separately - * partially aggregated results. The first delimiter is only stripped - * off in the final function. To know how much to strip off the front - * of the string, we store the length of the first delimiter in the - * StringInfo's cursor field, which we don't otherwise need here. - */ - if (state == NULL) - { - state = makeStringAggState(fcinfo); - isfirst = true; - } - - if (!PG_ARGISNULL(2)) - { - bytea *delim = PG_GETARG_BYTEA_PP(2); - - appendBinaryStringInfo(state, VARDATA_ANY(delim), - VARSIZE_ANY_EXHDR(delim)); - if (isfirst) - state->cursor = VARSIZE_ANY_EXHDR(delim); - } - - appendBinaryStringInfo(state, VARDATA_ANY(value), - VARSIZE_ANY_EXHDR(value)); - } - - /* - * The transition type for string_agg() is declared to be "internal", - * which is a pass-by-value type the same size as a pointer. - */ - if (state) - PG_RETURN_POINTER(state); - PG_RETURN_NULL(); -} - -Datum -bytea_string_agg_finalfn(PG_FUNCTION_ARGS) -{ - StringInfo state; - - /* cannot be called directly because of internal-type argument */ - Assert(AggCheckCallContext(fcinfo, NULL)); - - state = PG_ARGISNULL(0) ? 
NULL : (StringInfo) PG_GETARG_POINTER(0); - - if (state != NULL) - { - /* As per comment in transfn, strip data before the cursor position */ - bytea *result; - int strippedlen = state->len - state->cursor; - - result = (bytea *) palloc(strippedlen + VARHDRSZ); - SET_VARSIZE(result, strippedlen + VARHDRSZ); - memcpy(VARDATA(result), &state->data[state->cursor], strippedlen); - PG_RETURN_BYTEA_P(result); - } - else - PG_RETURN_NULL(); -} - /* * textin - converts cstring to internal representation */ @@ -2959,467 +2647,6 @@ bttext_pattern_sortsupport(PG_FUNCTION_ARGS) } -/*------------------------------------------------------------- - * byteaoctetlen - * - * get the number of bytes contained in an instance of type 'bytea' - *------------------------------------------------------------- - */ -Datum -byteaoctetlen(PG_FUNCTION_ARGS) -{ - Datum str = PG_GETARG_DATUM(0); - - /* We need not detoast the input at all */ - PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ); -} - -/* - * byteacat - - * takes two bytea* and returns a bytea* that is the concatenation of - * the two. - * - * Cloned from textcat and modified as required. - */ -Datum -byteacat(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - - PG_RETURN_BYTEA_P(bytea_catenate(t1, t2)); -} - -/* - * bytea_catenate - * Guts of byteacat(), broken out so it can be used by other functions - * - * Arguments can be in short-header form, but not compressed or out-of-line - */ -static bytea * -bytea_catenate(bytea *t1, bytea *t2) -{ - bytea *result; - int len1, - len2, - len; - char *ptr; - - len1 = VARSIZE_ANY_EXHDR(t1); - len2 = VARSIZE_ANY_EXHDR(t2); - - /* paranoia ... probably should throw error instead? */ - if (len1 < 0) - len1 = 0; - if (len2 < 0) - len2 = 0; - - len = len1 + len2 + VARHDRSZ; - result = (bytea *) palloc(len); - - /* Set size of result string... */ - SET_VARSIZE(result, len); - - /* Fill data field of result string... */ - ptr = VARDATA(result); - if (len1 > 0) - memcpy(ptr, VARDATA_ANY(t1), len1); - if (len2 > 0) - memcpy(ptr + len1, VARDATA_ANY(t2), len2); - - return result; -} - -#define PG_STR_GET_BYTEA(str_) \ - DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) - -/* - * bytea_substr() - * Return a substring starting at the specified position. - * Cloned from text_substr and modified as required. - * - * Input: - * - string - * - starting position (is one-based) - * - string length (optional) - * - * If the starting position is zero or less, then return from the start of the string - * adjusting the length to be consistent with the "negative start" per SQL. - * If the length is less than zero, an ERROR is thrown. If no third argument - * (length) is provided, the length to the end of the string is assumed. - */ -Datum -bytea_substr(PG_FUNCTION_ARGS) -{ - PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), - PG_GETARG_INT32(1), - PG_GETARG_INT32(2), - false)); -} - -/* - * bytea_substr_no_len - - * Wrapper to avoid opr_sanity failure due to - * one function accepting a different number of args. - */ -Datum -bytea_substr_no_len(PG_FUNCTION_ARGS) -{ - PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0), - PG_GETARG_INT32(1), - -1, - true)); -} - -static bytea * -bytea_substring(Datum str, - int S, - int L, - bool length_not_specified) -{ - int32 S1; /* adjusted start position */ - int32 L1; /* adjusted substring length */ - int32 E; /* end position */ - - /* - * The logic here should generally match text_substring(). 
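
A worked example of the adjustment below (standalone; mirrors the Max(S, 1) and E = S + L logic that follows):

    #include <stdio.h>

    int
    main(void)
    {
        int S = -2, L = 5;          /* substring(b FROM -2 FOR 5) */
        int S1 = (S > 1) ? S : 1;   /* Max(S, 1): adjusted start */
        int E = S + L;              /* end position */

        if (E < 1)
            printf("zero-length result\n");
        else                        /* prints: start=1, length=2 */
            printf("start=%d, length=%d\n", S1, E - S1);
        return 0;
    }
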
- */ - S1 = Max(S, 1); - - if (length_not_specified) - { - /* - * Not passed a length - DatumGetByteaPSlice() grabs everything to the - * end of the string if we pass it a negative value for length. - */ - L1 = -1; - } - else if (L < 0) - { - /* SQL99 says to throw an error for E < S, i.e., negative length */ - ereport(ERROR, - (errcode(ERRCODE_SUBSTRING_ERROR), - errmsg("negative substring length not allowed"))); - L1 = -1; /* silence stupider compilers */ - } - else if (pg_add_s32_overflow(S, L, &E)) - { - /* - * L could be large enough for S + L to overflow, in which case the - * substring must run to end of string. - */ - L1 = -1; - } - else - { - /* - * A zero or negative value for the end position can happen if the - * start was negative or one. SQL99 says to return a zero-length - * string. - */ - if (E < 1) - return PG_STR_GET_BYTEA(""); - - L1 = E - S1; - } - - /* - * If the start position is past the end of the string, SQL99 says to - * return a zero-length string -- DatumGetByteaPSlice() will do that for - * us. We need only convert S1 to zero-based starting position. - */ - return DatumGetByteaPSlice(str, S1 - 1, L1); -} - -/* - * byteaoverlay - * Replace specified substring of first string with second - * - * The SQL standard defines OVERLAY() in terms of substring and concatenation. - * This code is a direct implementation of what the standard says. - */ -Datum -byteaoverlay(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int sp = PG_GETARG_INT32(2); /* substring start position */ - int sl = PG_GETARG_INT32(3); /* substring length */ - - PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); -} - -Datum -byteaoverlay_no_len(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int sp = PG_GETARG_INT32(2); /* substring start position */ - int sl; - - sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */ - PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl)); -} - -static bytea * -bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) -{ - bytea *result; - bytea *s1; - bytea *s2; - int sp_pl_sl; - - /* - * Check for possible integer-overflow cases. For negative sp, throw a - * "substring length" error because that's what should be expected - * according to the spec's definition of OVERLAY(). - */ - if (sp <= 0) - ereport(ERROR, - (errcode(ERRCODE_SUBSTRING_ERROR), - errmsg("negative substring length not allowed"))); - if (pg_add_s32_overflow(sp, sl, &sp_pl_sl)) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range"))); - - s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false); - s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true); - result = bytea_catenate(s1, t2); - result = bytea_catenate(result, s2); - - return result; -} - -/* - * bit_count - */ -Datum -bytea_bit_count(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - - PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1))); -} - -/* - * byteapos - - * Return the position of the specified substring. - * Implements the SQL POSITION() function. - * Cloned from textpos and modified as required. 
- */ -Datum -byteapos(PG_FUNCTION_ARGS) -{ - bytea *t1 = PG_GETARG_BYTEA_PP(0); - bytea *t2 = PG_GETARG_BYTEA_PP(1); - int pos; - int px, - p; - int len1, - len2; - char *p1, - *p2; - - len1 = VARSIZE_ANY_EXHDR(t1); - len2 = VARSIZE_ANY_EXHDR(t2); - - if (len2 <= 0) - PG_RETURN_INT32(1); /* result for empty pattern */ - - p1 = VARDATA_ANY(t1); - p2 = VARDATA_ANY(t2); - - pos = 0; - px = (len1 - len2); - for (p = 0; p <= px; p++) - { - if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0)) - { - pos = p + 1; - break; - }; - p1++; - }; - - PG_RETURN_INT32(pos); -} - -/*------------------------------------------------------------- - * byteaGetByte - * - * this routine treats "bytea" as an array of bytes. - * It returns the Nth byte (a number between 0 and 255). - *------------------------------------------------------------- - */ -Datum -byteaGetByte(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int32 n = PG_GETARG_INT32(1); - int len; - int byte; - - len = VARSIZE_ANY_EXHDR(v); - - if (n < 0 || n >= len) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %d out of valid range, 0..%d", - n, len - 1))); - - byte = ((unsigned char *) VARDATA_ANY(v))[n]; - - PG_RETURN_INT32(byte); -} - -/*------------------------------------------------------------- - * byteaGetBit - * - * This routine treats a "bytea" type like an array of bits. - * It returns the value of the Nth bit (0 or 1). - * - *------------------------------------------------------------- - */ -Datum -byteaGetBit(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int64 n = PG_GETARG_INT64(1); - int byteNo, - bitNo; - int len; - int byte; - - len = VARSIZE_ANY_EXHDR(v); - - if (n < 0 || n >= (int64) len * 8) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, - n, (int64) len * 8 - 1))); - - /* n/8 is now known < len, so safe to cast to int */ - byteNo = (int) (n / 8); - bitNo = (int) (n % 8); - - byte = ((unsigned char *) VARDATA_ANY(v))[byteNo]; - - if (byte & (1 << bitNo)) - PG_RETURN_INT32(1); - else - PG_RETURN_INT32(0); -} - -/*------------------------------------------------------------- - * byteaSetByte - * - * Given an instance of type 'bytea' creates a new one with - * the Nth byte set to the given value. - * - *------------------------------------------------------------- - */ -Datum -byteaSetByte(PG_FUNCTION_ARGS) -{ - bytea *res = PG_GETARG_BYTEA_P_COPY(0); - int32 n = PG_GETARG_INT32(1); - int32 newByte = PG_GETARG_INT32(2); - int len; - - len = VARSIZE(res) - VARHDRSZ; - - if (n < 0 || n >= len) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %d out of valid range, 0..%d", - n, len - 1))); - - /* - * Now set the byte. - */ - ((unsigned char *) VARDATA(res))[n] = newByte; - - PG_RETURN_BYTEA_P(res); -} - -/*------------------------------------------------------------- - * byteaSetBit - * - * Given an instance of type 'bytea' creates a new one with - * the Nth bit set to the given value. 
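
byteaGetBit() above and byteaSetBit() below use the same bit addressing; in isolation (illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        int64_t n = 19;
        int     byteNo = (int) (n / 8);     /* bit 19 -> byte 2 */
        int     bitNo = (int) (n % 8);      /* ... at position 3 */

        printf("byte %d, bit %d\n", byteNo, bitNo);
        return 0;
    }
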
- * - *------------------------------------------------------------- - */ -Datum -byteaSetBit(PG_FUNCTION_ARGS) -{ - bytea *res = PG_GETARG_BYTEA_P_COPY(0); - int64 n = PG_GETARG_INT64(1); - int32 newBit = PG_GETARG_INT32(2); - int len; - int oldByte, - newByte; - int byteNo, - bitNo; - - len = VARSIZE(res) - VARHDRSZ; - - if (n < 0 || n >= (int64) len * 8) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("index %" PRId64 " out of valid range, 0..%" PRId64, - n, (int64) len * 8 - 1))); - - /* n/8 is now known < len, so safe to cast to int */ - byteNo = (int) (n / 8); - bitNo = (int) (n % 8); - - /* - * sanity check! - */ - if (newBit != 0 && newBit != 1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("new bit must be 0 or 1"))); - - /* - * Update the byte. - */ - oldByte = ((unsigned char *) VARDATA(res))[byteNo]; - - if (newBit == 0) - newByte = oldByte & (~(1 << bitNo)); - else - newByte = oldByte | (1 << bitNo); - - ((unsigned char *) VARDATA(res))[byteNo] = newByte; - - PG_RETURN_BYTEA_P(res); -} - -/* - * Return reversed bytea - */ -Datum -bytea_reverse(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - const char *p = VARDATA_ANY(v); - int len = VARSIZE_ANY_EXHDR(v); - const char *endp = p + len; - bytea *result = palloc(len + VARHDRSZ); - char *dst = (char *) VARDATA(result) + len; - - SET_VARSIZE(result, len + VARHDRSZ); - - while (p < endp) - *(--dst) = *p++; - - PG_RETURN_BYTEA_P(result); -} - - /* text_name() * Converts a text type to a Name type. */ @@ -3849,331 +3076,6 @@ SplitGUCList(char *rawstring, char separator, return true; } - -/***************************************************************************** - * Comparison Functions used for bytea - * - * Note: btree indexes need these routines not to leak memory; therefore, - * be careful to free working copies of toasted datums. Most places don't - * need to be so careful. - *****************************************************************************/ - -Datum -byteaeq(PG_FUNCTION_ARGS) -{ - Datum arg1 = PG_GETARG_DATUM(0); - Datum arg2 = PG_GETARG_DATUM(1); - bool result; - Size len1, - len2; - - /* - * We can use a fast path for unequal lengths, which might save us from - * having to detoast one or both values. - */ - len1 = toast_raw_datum_size(arg1); - len2 = toast_raw_datum_size(arg2); - if (len1 != len2) - result = false; - else - { - bytea *barg1 = DatumGetByteaPP(arg1); - bytea *barg2 = DatumGetByteaPP(arg2); - - result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), - len1 - VARHDRSZ) == 0); - - PG_FREE_IF_COPY(barg1, 0); - PG_FREE_IF_COPY(barg2, 1); - } - - PG_RETURN_BOOL(result); -} - -Datum -byteane(PG_FUNCTION_ARGS) -{ - Datum arg1 = PG_GETARG_DATUM(0); - Datum arg2 = PG_GETARG_DATUM(1); - bool result; - Size len1, - len2; - - /* - * We can use a fast path for unequal lengths, which might save us from - * having to detoast one or both values. 
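
byteane() reuses byteaeq()'s fast path: toast_raw_datum_size() reads the stored length without detoasting, so a length mismatch settles the comparison immediately. The shape of the idea, extracted from the code in this hunk:

    Size    len1 = toast_raw_datum_size(arg1);
    Size    len2 = toast_raw_datum_size(arg2);

    if (len1 != len2)
        result = true;              /* unequal lengths: no detoast at all */
    else
    {
        /* only now pay for detoasting and a memcmp of the payloads */
    }
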
- */ - len1 = toast_raw_datum_size(arg1); - len2 = toast_raw_datum_size(arg2); - if (len1 != len2) - result = true; - else - { - bytea *barg1 = DatumGetByteaPP(arg1); - bytea *barg2 = DatumGetByteaPP(arg2); - - result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2), - len1 - VARHDRSZ) != 0); - - PG_FREE_IF_COPY(barg1, 0); - PG_FREE_IF_COPY(barg2, 1); - } - - PG_RETURN_BOOL(result); -} - -Datum -bytealt(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2))); -} - -Datum -byteale(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2))); -} - -Datum -byteagt(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2))); -} - -Datum -byteage(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2))); -} - -Datum -byteacmp(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - if ((cmp == 0) && (len1 != len2)) - cmp = (len1 < len2) ? -1 : 1; - - PG_FREE_IF_COPY(arg1, 0); - PG_FREE_IF_COPY(arg2, 1); - - PG_RETURN_INT32(cmp); -} - -Datum -bytea_larger(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - bytea *result; - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2); - - PG_RETURN_BYTEA_P(result); -} - -Datum -bytea_smaller(PG_FUNCTION_ARGS) -{ - bytea *arg1 = PG_GETARG_BYTEA_PP(0); - bytea *arg2 = PG_GETARG_BYTEA_PP(1); - bytea *result; - int len1, - len2; - int cmp; - - len1 = VARSIZE_ANY_EXHDR(arg1); - len2 = VARSIZE_ANY_EXHDR(arg2); - - cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2)); - result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? 
arg1 : arg2); - - PG_RETURN_BYTEA_P(result); -} - -Datum -bytea_sortsupport(PG_FUNCTION_ARGS) -{ - SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0); - MemoryContext oldcontext; - - oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt); - - /* Use generic string SortSupport, forcing "C" collation */ - varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID); - - MemoryContextSwitchTo(oldcontext); - - PG_RETURN_VOID(); -} - -/* Cast bytea -> int2 */ -Datum -bytea_int2(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint16 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("smallint out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT16(result); -} - -/* Cast bytea -> int4 */ -Datum -bytea_int4(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint32 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("integer out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT32(result); -} - -/* Cast bytea -> int8 */ -Datum -bytea_int8(PG_FUNCTION_ARGS) -{ - bytea *v = PG_GETARG_BYTEA_PP(0); - int len = VARSIZE_ANY_EXHDR(v); - uint64 result; - - /* Check that the byte array is not too long */ - if (len > sizeof(result)) - ereport(ERROR, - errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("bigint out of range")); - - /* Convert it to an integer; most significant bytes come first */ - result = 0; - for (int i = 0; i < len; i++) - { - result <<= BITS_PER_BYTE; - result |= ((unsigned char *) VARDATA_ANY(v))[i]; - } - - PG_RETURN_INT64(result); -} - -/* Cast int2 -> bytea; can just use int2send() */ -Datum -int2_bytea(PG_FUNCTION_ARGS) -{ - return int2send(fcinfo); -} - -/* Cast int4 -> bytea; can just use int4send() */ -Datum -int4_bytea(PG_FUNCTION_ARGS) -{ - return int4send(fcinfo); -} - -/* Cast int8 -> bytea; can just use int8send() */ -Datum -int8_bytea(PG_FUNCTION_ARGS) -{ - return int8send(fcinfo); -} - /* * appendStringInfoText * diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index db8d0d6a7e8..182e8f75db7 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -529,14 +529,36 @@ xmltext(PG_FUNCTION_ARGS) #ifdef USE_LIBXML text *arg = PG_GETARG_TEXT_PP(0); text *result; - xmlChar *xmlbuf = NULL; + volatile xmlChar *xmlbuf = NULL; + PgXmlErrorContext *xmlerrcxt; + + /* First we gotta spin up some error handling. 
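 * Note: xmlbuf is declared volatile because PG_TRY/PG_CATCH are built
 * on sigsetjmp/longjmp, and C guarantees the post-longjmp value of an
 * automatic variable only if it is volatile-qualified; without that,
 * the PG_CATCH cleanup below could free a stale pointer.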
*/ + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + + PG_TRY(); + { + xmlbuf = xmlEncodeSpecialChars(NULL, xml_text2xmlChar(arg)); + + if (xmlbuf == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); - xmlbuf = xmlEncodeSpecialChars(NULL, xml_text2xmlChar(arg)); + result = cstring_to_text_with_len((const char *) xmlbuf, + xmlStrlen((const xmlChar *) xmlbuf)); + } + PG_CATCH(); + { + if (xmlbuf) + xmlFree((xmlChar *) xmlbuf); + + pg_xml_done(xmlerrcxt, true); + PG_RE_THROW(); + } + PG_END_TRY(); - Assert(xmlbuf); + xmlFree((xmlChar *) xmlbuf); + pg_xml_done(xmlerrcxt, false); - result = cstring_to_text_with_len((const char *) xmlbuf, xmlStrlen(xmlbuf)); - xmlFree(xmlbuf); PG_RETURN_XML_P(result); #else NO_XML_SUPPORT(); @@ -663,7 +685,7 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) volatile xmlBufferPtr buf = NULL; volatile xmlSaveCtxtPtr ctxt = NULL; ErrorSaveContext escontext = {T_ErrorSaveContext}; - PgXmlErrorContext *xmlerrcxt; + PgXmlErrorContext *volatile xmlerrcxt = NULL; #endif if (xmloption_arg != XMLOPTION_DOCUMENT && !indent) @@ -704,13 +726,18 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) return (text *) data; } - /* Otherwise, we gotta spin up some error handling. */ - xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - + /* + * Otherwise, we gotta spin up some error handling. Unlike most other + * routines in this module, we already have a libxml "doc" structure to + * free, so we need to call pg_xml_init() inside the PG_TRY and be + * prepared for it to fail (typically due to palloc OOM). + */ PG_TRY(); { size_t decl_len = 0; + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + /* The serialized data will go into this buffer. */ buf = xmlBufferCreate(); @@ -754,6 +781,7 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) * content nodes, and then iterate over the nodes. */ xmlNodePtr root; + xmlNodePtr oldroot; xmlNodePtr newline; root = xmlNewNode(NULL, (const xmlChar *) "content-root"); @@ -761,9 +789,18 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate xml node"); - /* This attaches root to doc, so we need not free it separately. */ - xmlDocSetRootElement(doc, root); - xmlAddChildList(root, content_nodes); + /* + * This attaches root to doc, so we need not free it separately... + * but instead, we have to free the old root if there was one. + */ + oldroot = xmlDocSetRootElement(doc, root); + if (oldroot != NULL) + xmlFreeNode(oldroot); + + if (xmlAddChildList(root, content_nodes) == NULL || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not append xml node list"); /* * We use this node to insert newlines in the dump. 
Note: in at @@ -831,10 +868,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) xmlSaveClose(ctxt); if (buf) xmlBufferFree(buf); - if (doc) - xmlFreeDoc(doc); + xmlFreeDoc(doc); - pg_xml_done(xmlerrcxt, true); + if (xmlerrcxt) + pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } @@ -924,7 +961,10 @@ xmlelement(XmlExpr *xexpr, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate xmlTextWriter"); - xmlTextWriterStartElement(writer, (xmlChar *) xexpr->name); + if (xmlTextWriterStartElement(writer, (xmlChar *) xexpr->name) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not start xml element"); forboth(arg, named_arg_strings, narg, xexpr->arg_names) { @@ -932,19 +972,30 @@ xmlelement(XmlExpr *xexpr, char *argname = strVal(lfirst(narg)); if (str) - xmlTextWriterWriteAttribute(writer, - (xmlChar *) argname, - (xmlChar *) str); + { + if (xmlTextWriterWriteAttribute(writer, + (xmlChar *) argname, + (xmlChar *) str) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not write xml attribute"); + } } foreach(arg, arg_strings) { char *str = (char *) lfirst(arg); - xmlTextWriterWriteRaw(writer, (xmlChar *) str); + if (xmlTextWriterWriteRaw(writer, (xmlChar *) str) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not write raw xml text"); } - xmlTextWriterEndElement(writer); + if (xmlTextWriterEndElement(writer) < 0 || + xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INTERNAL_ERROR, + "could not end xml element"); /* we MUST do this now to flush data out to the buffer ... */ xmlFreeTextWriter(writer); @@ -1718,7 +1769,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list - * of parsed nodes from the xmlParseInNodeContext call will be returned + * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * to *parsed_nodes. (It is caller's responsibility to free that.) * * Errors normally result in ereport(ERROR), but if escontext is an @@ -1744,6 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PgXmlErrorContext *xmlerrcxt; volatile xmlParserCtxtPtr ctxt = NULL; volatile xmlDocPtr doc = NULL; + volatile int save_keep_blanks = -1; /* * This step looks annoyingly redundant, but we must do it to have a @@ -1771,7 +1823,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PG_TRY(); { bool parse_as_document = false; - int options; int res_code; size_t count = 0; xmlChar *version = NULL; @@ -1802,18 +1853,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, parse_as_document = true; } - /* - * Select parse options. - * - * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) - * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by - * internal DTD are applied'. As for external DTDs, we try to support - * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really - * happen because xmlPgEntityLoader prevents it. - */ - options = XML_PARSE_NOENT | XML_PARSE_DTDATTR - | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); - /* initialize output parameters */ if (parsed_xmloptiontype != NULL) *parsed_xmloptiontype = parse_as_document ? 
XMLOPTION_DOCUMENT : @@ -1823,11 +1862,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg, if (parse_as_document) { + int options; + + /* set up parser context used by xmlCtxtReadDoc */ ctxt = xmlNewParserCtxt(); if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* + * Select parse options. + * + * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) + * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined + * by internal DTD are applied'. As for external DTDs, we try to + * support them too (see SQL/XML:2008 GR 10.16.7.e), but that + * doesn't really happen because xmlPgEntityLoader prevents it. + */ + options = XML_PARSE_NOENT | XML_PARSE_DTDATTR + | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); + doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, /* no URL */ "UTF-8", @@ -1849,9 +1903,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } else { - xmlNodePtr root; - - /* set up document with empty root node to be the context node */ + /* set up document that xmlParseBalancedChunkMemory will add to */ doc = xmlNewDoc(version); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, @@ -1864,38 +1916,22 @@ xml_parse(text *data, XmlOptionType xmloption_arg, "could not allocate XML document"); doc->standalone = standalone; - root = xmlNewNode(NULL, (const xmlChar *) "content-root"); - if (root == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, - "could not allocate xml node"); - /* This attaches root to doc, so we need not free it separately. */ - xmlDocSetRootElement(doc, root); + /* set parse options --- have to do this the ugly way */ + save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 
1 : 0); /* allow empty content */ if (*(utf8string + count)) { - xmlNodePtr node_list = NULL; - xmlParserErrors res; - - res = xmlParseInNodeContext(root, - (char *) utf8string + count, - strlen((char *) utf8string + count), - options, - &node_list); - - if (res != XML_ERR_OK || xmlerrcxt->err_occurred) + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, + utf8string + count, + parsed_nodes); + if (res_code != 0 || xmlerrcxt->err_occurred) { - xmlFreeNodeList(node_list); xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); goto fail; } - - if (parsed_nodes != NULL) - *parsed_nodes = node_list; - else - xmlFreeNodeList(node_list); } } @@ -1904,6 +1940,8 @@ fail: } PG_CATCH(); { + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); if (doc != NULL) xmlFreeDoc(doc); if (ctxt != NULL) @@ -1915,6 +1953,9 @@ fail: } PG_END_TRY(); + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); + if (ctxt != NULL) xmlFreeParserCtxt(ctxt); @@ -4207,20 +4248,27 @@ xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt) } else { - xmlChar *str; + volatile xmlChar *str = NULL; - str = xmlXPathCastNodeToString(cur); PG_TRY(); { + char *escaped; + + str = xmlXPathCastNodeToString(cur); + if (str == NULL || xmlerrcxt->err_occurred) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate xmlChar"); + /* Here we rely on XML having the same representation as TEXT */ - char *escaped = escape_xml((char *) str); + escaped = escape_xml((char *) str); result = (xmltype *) cstring_to_text(escaped); pfree(escaped); } PG_FINALLY(); { - xmlFree(str); + if (str) + xmlFree((xmlChar *) str); } PG_END_TRY(); } diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index 657648996c2..e2cd3feaf81 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -213,7 +213,7 @@ namehashfast(Datum datum) { char *key = NameStr(*DatumGetName(datum)); - return hash_any((unsigned char *) key, strlen(key)); + return hash_bytes((unsigned char *) key, strlen(key)); } static bool @@ -317,6 +317,7 @@ GetCCHashEqFuncs(Oid keytype, CCHashFN *hashfunc, RegProcedure *eqfunc, CCFastEq case REGDICTIONARYOID: case REGROLEOID: case REGNAMESPACEOID: + case REGDATABASEOID: *hashfunc = int4hashfast; *fasteqfunc = int4eqfast; *eqfunc = F_OIDEQ; diff --git a/src/backend/utils/cache/evtcache.c b/src/backend/utils/cache/evtcache.c index ce596bf5638..b9d5a5998be 100644 --- a/src/backend/utils/cache/evtcache.c +++ b/src/backend/utils/cache/evtcache.c @@ -78,7 +78,6 @@ BuildEventTriggerCache(void) { HASHCTL ctl; HTAB *cache; - MemoryContext oldcontext; Relation rel; Relation irel; SysScanDesc scan; @@ -110,9 +109,6 @@ BuildEventTriggerCache(void) (Datum) 0); } - /* Switch to correct memory context. */ - oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); - /* Prevent the memory context from being nuked while we're rebuilding. */ EventTriggerCacheState = ETCS_REBUILD_STARTED; @@ -145,6 +141,7 @@ BuildEventTriggerCache(void) bool evttags_isnull; EventTriggerCacheEntry *entry; bool found; + MemoryContext oldcontext; /* Get next tuple. */ tup = systable_getnext_ordered(scan, ForwardScanDirection); @@ -171,6 +168,9 @@ BuildEventTriggerCache(void) else continue; + /* Switch to correct memory context. */ + oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); + /* Allocate new cache item. 
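The evtcache.c change above narrows the context switch from wrapping the whole scan to wrapping only the allocations that must survive it, so transient per-tuple data now lands in the current short-lived context. A minimal sketch of the pattern, with hypothetical names:

#include "postgres.h"

#include "nodes/pg_list.h"
#include "utils/memutils.h"

static List *
remember_item(MemoryContext cache_cxt, List *items, const char *name)
{
    MemoryContext oldcontext;

    /* enter the long-lived context only around the durable allocations */
    oldcontext = MemoryContextSwitchTo(cache_cxt);
    items = lappend(items, pstrdup(name));
    MemoryContextSwitchTo(oldcontext);

    return items;
}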
*/ item = palloc0(sizeof(EventTriggerCacheItem)); item->fnoid = form->evtfoid; @@ -188,6 +188,9 @@ BuildEventTriggerCache(void) entry->triggerlist = lappend(entry->triggerlist, item); else entry->triggerlist = list_make1(item); + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); } /* Done with pg_event_trigger scan. */ @@ -195,9 +198,6 @@ BuildEventTriggerCache(void) index_close(irel, AccessShareLock); relation_close(rel, AccessShareLock); - /* Restore previous memory context. */ - MemoryContextSwitchTo(oldcontext); - /* Install new cache. */ EventTriggerCache = cache; @@ -240,6 +240,8 @@ DecodeTextArrayToBitmapset(Datum array) } pfree(elems); + if ((Pointer) arr != DatumGetPointer(array)) + pfree(arr); return bms; } diff --git a/src/backend/utils/cache/funccache.c b/src/backend/utils/cache/funccache.c index 150c502a612..afc048a051e 100644 --- a/src/backend/utils/cache/funccache.c +++ b/src/backend/utils/cache/funccache.c @@ -491,6 +491,7 @@ cached_function_compile(FunctionCallInfo fcinfo, CachedFunctionHashKey hashkey; bool function_valid = false; bool hashkey_valid = false; + bool new_function = false; /* * Lookup the pg_proc tuple by Oid; we'll need it in any case @@ -570,13 +571,15 @@ recheck: /* * Create the new function struct, if not done already. The function - * structs are never thrown away, so keep them in TopMemoryContext. + * cache entry will be kept for the life of the backend, so put it in + * TopMemoryContext. */ Assert(cacheEntrySize >= sizeof(CachedFunction)); if (function == NULL) { function = (CachedFunction *) MemoryContextAllocZero(TopMemoryContext, cacheEntrySize); + new_function = true; } else { @@ -585,17 +588,36 @@ recheck: } /* - * Fill in the CachedFunction part. fn_hashkey and use_count remain - * zeroes for now. + * However, if function compilation fails, we'd like not to leak the + * function struct, so use a PG_TRY block to prevent that. (It's up + * to the compile callback function to avoid its own internal leakage + * in such cases.) Unfortunately, freeing the struct is only safe if + * we just allocated it: otherwise there are probably fn_extra + * pointers to it. */ - function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); - function->fn_tid = procTup->t_self; - function->dcallback = dcallback; + PG_TRY(); + { + /* + * Do the hard, language-specific part. + */ + ccallback(fcinfo, procTup, &hashkey, function, forValidator); + } + PG_CATCH(); + { + if (new_function) + pfree(function); + PG_RE_THROW(); + } + PG_END_TRY(); /* - * Do the hard, language-specific part. + * Fill in the CachedFunction part. (We do this last to prevent the + * function from looking valid before it's fully built.) fn_hashkey + * will be set by cfunc_hashtable_insert; use_count remains zero. */ - ccallback(fcinfo, procTup, &hashkey, function, forValidator); + function->fn_xmin = HeapTupleHeaderGetRawXmin(procTup->t_data); + function->fn_tid = procTup->t_self; + function->dcallback = dcallback; /* * Add the completed struct to the hash table. 
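The DecodeTextArrayToBitmapset fix above illustrates a general detoasting rule: DatumGetArrayTypeP() may hand back either the original datum or a freshly palloc'd detoasted copy, and only the copy should be freed. A minimal sketch, with a hypothetical caller:

#include "postgres.h"

#include "utils/array.h"

static void
consume_array(Datum array)
{
    ArrayType  *arr = DatumGetArrayTypeP(array);

    /* ... examine arr here ... */

    /* free the detoasted copy only if detoasting actually made one */
    if ((Pointer) arr != DatumGetPointer(array))
        pfree(arr);
}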
diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 9bcbc4c3e97..6661d2c6b73 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -92,8 +92,7 @@ static void ReleaseGenericPlan(CachedPlanSource *plansource); static bool StmtPlanRequiresRevalidation(CachedPlanSource *plansource); static bool BuildingPlanRequiresSnapshot(CachedPlanSource *plansource); static List *RevalidateCachedQuery(CachedPlanSource *plansource, - QueryEnvironment *queryEnv, - bool release_generic); + QueryEnvironment *queryEnv); static bool CheckCachedPlan(CachedPlanSource *plansource); static CachedPlan *BuildCachedPlan(CachedPlanSource *plansource, List *qlist, ParamListInfo boundParams, QueryEnvironment *queryEnv); @@ -464,8 +463,7 @@ CompleteCachedPlan(CachedPlanSource *plansource, /* * Save the final parameter types (or other parameter specification data) - * into the source_context, as well as our other parameters. Also save - * the result tuple descriptor. + * into the source_context, as well as our other parameters. */ MemoryContextSwitchTo(source_context); @@ -481,9 +479,25 @@ CompleteCachedPlan(CachedPlanSource *plansource, plansource->parserSetupArg = parserSetupArg; plansource->cursor_options = cursor_options; plansource->fixed_result = fixed_result; - plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + /* + * Also save the result tuple descriptor. PlanCacheComputeResultDesc may + * leak some cruft; normally we just accept that to save a copy step, but + * in USE_VALGRIND mode be tidy by running it in the caller's context. + */ +#ifdef USE_VALGRIND MemoryContextSwitchTo(oldcxt); + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + if (plansource->resultDesc) + { + MemoryContextSwitchTo(source_context); + plansource->resultDesc = CreateTupleDescCopy(plansource->resultDesc); + MemoryContextSwitchTo(oldcxt); + } +#else + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + MemoryContextSwitchTo(oldcxt); +#endif plansource->is_complete = true; plansource->is_valid = true; @@ -663,17 +677,10 @@ BuildingPlanRequiresSnapshot(CachedPlanSource *plansource) * The result value is the transient analyzed-and-rewritten query tree if we * had to do re-analysis, and NIL otherwise. (This is returned just to save * a tree copying step in a subsequent BuildCachedPlan call.) - * - * This also releases and drops the generic plan (plansource->gplan), if any, - * as most callers will typically build a new CachedPlan for the plansource - * right after this. However, when called from UpdateCachedPlan(), the - * function does not release the generic plan, as UpdateCachedPlan() updates - * an existing CachedPlan in place. */ static List * RevalidateCachedQuery(CachedPlanSource *plansource, - QueryEnvironment *queryEnv, - bool release_generic) + QueryEnvironment *queryEnv) { bool snapshot_set; List *tlist; /* transient query-tree list */ @@ -772,9 +779,8 @@ RevalidateCachedQuery(CachedPlanSource *plansource, MemoryContextDelete(qcxt); } - /* Drop the generic plan reference, if any, and if requested */ - if (release_generic) - ReleaseGenericPlan(plansource); + /* Drop the generic plan reference if any */ + ReleaseGenericPlan(plansource); /* * Now re-do parse analysis and rewrite. This not incidentally acquires @@ -937,10 +943,8 @@ RevalidateCachedQuery(CachedPlanSource *plansource, * Caller must have already called RevalidateCachedQuery to verify that the * querytree is up to date. 
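The resultDesc hunk above is a pattern worth isolating: let a cruft-producing computation run in the caller's context and copy only its final result into the long-lived context, but bother with the extra copy only under USE_VALGRIND, where the cruft would otherwise be reported as leaked. A sketch under those assumptions; compute_desc() is a hypothetical stand-in for PlanCacheComputeResultDesc():

#include "postgres.h"

#include "access/tupdesc.h"
#include "nodes/pg_list.h"
#include "utils/memutils.h"

static TupleDesc
save_result_desc(MemoryContext longlived_cxt, List *querytree_list)
{
    TupleDesc   result;
    MemoryContext oldcxt;

#ifdef USE_VALGRIND
    /* run in the caller's context, so any cruft dies with it */
    result = compute_desc(querytree_list);
    if (result)
    {
        oldcxt = MemoryContextSwitchTo(longlived_cxt);
        result = CreateTupleDescCopy(result);
        MemoryContextSwitchTo(oldcxt);
    }
#else
    /* accept the cruft to save a copy step */
    oldcxt = MemoryContextSwitchTo(longlived_cxt);
    result = compute_desc(querytree_list);
    MemoryContextSwitchTo(oldcxt);
#endif

    return result;
}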
* - * On a "true" return, we have acquired locks on the "unprunableRelids" set - * for all plans in plansource->stmt_list. However, the plans are not fully - * race-condition-free until the executor acquires locks on the prunable - * relations that survive initial runtime pruning during InitPlan(). + * On a "true" return, we have acquired the locks needed to run the plan. + * (We must do this for the "true" result to be race-condition-free.) */ static bool CheckCachedPlan(CachedPlanSource *plansource) @@ -1025,8 +1029,6 @@ CheckCachedPlan(CachedPlanSource *plansource) * Planning work is done in the caller's memory context. The finished plan * is in a child memory context, which typically should get reparented * (unless this is a one-shot plan, in which case we don't copy the plan). - * - * Note: When changing this, you should also look at UpdateCachedPlan(). */ static CachedPlan * BuildCachedPlan(CachedPlanSource *plansource, List *qlist, @@ -1037,7 +1039,6 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, bool snapshot_set; bool is_transient; MemoryContext plan_context; - MemoryContext stmt_context = NULL; MemoryContext oldcxt = CurrentMemoryContext; ListCell *lc; @@ -1055,7 +1056,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * let's treat it as real and redo the RevalidateCachedQuery call. */ if (!plansource->is_valid) - qlist = RevalidateCachedQuery(plansource, queryEnv, true); + qlist = RevalidateCachedQuery(plansource, queryEnv); /* * If we don't already have a copy of the querytree list that can be @@ -1093,19 +1094,10 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, PopActiveSnapshot(); /* - * Normally, we create a dedicated memory context for the CachedPlan and - * its subsidiary data. Although it's usually not very large, the context - * is designed to allow growth if necessary. - * - * The PlannedStmts are stored in a separate child context (stmt_context) - * of the CachedPlan's memory context. This separation allows - * UpdateCachedPlan() to free and replace the PlannedStmts without - * affecting the CachedPlan structure or its stmt_list List. - * - * For one-shot plans, we instead use the caller's memory context, as the - * CachedPlan will not persist. stmt_context will be set to NULL in this - * case, because UpdateCachedPlan() should never get called on a one-shot - * plan. + * Normally we make a dedicated memory context for the CachedPlan and its + * subsidiary data. (It's probably not going to be large, but just in + * case, allow it to grow large. It's transient for the moment.) But for + * a one-shot plan, we just leave it in the caller's memory context. */ if (!plansource->is_oneshot) { @@ -1114,17 +1106,12 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, ALLOCSET_START_SMALL_SIZES); MemoryContextCopyAndSetIdentifier(plan_context, plansource->query_string); - stmt_context = AllocSetContextCreate(CurrentMemoryContext, - "CachedPlan PlannedStmts", - ALLOCSET_START_SMALL_SIZES); - MemoryContextCopyAndSetIdentifier(stmt_context, plansource->query_string); - MemoryContextSetParent(stmt_context, plan_context); + /* + * Copy plan into the new context. 
+ */ + MemoryContextSwitchTo(plan_context); - MemoryContextSwitchTo(stmt_context); plist = copyObject(plist); - - MemoryContextSwitchTo(plan_context); - plist = list_copy(plist); } else plan_context = CurrentMemoryContext; @@ -1165,10 +1152,8 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, plan->saved_xmin = InvalidTransactionId; plan->refcount = 0; plan->context = plan_context; - plan->stmt_context = stmt_context; plan->is_oneshot = plansource->is_oneshot; plan->is_saved = false; - plan->is_reused = false; plan->is_valid = true; /* assign generation number to new plan */ @@ -1180,113 +1165,6 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, } /* - * UpdateCachedPlan - * Create fresh plans for all queries in the CachedPlanSource, replacing - * those in the generic plan's stmt_list, and return the plan for the - * query_index'th query. - * - * This function is primarily used by ExecutorStartCachedPlan() to handle - * cases where the original generic CachedPlan becomes invalid. Such - * invalidation may occur when prunable relations in the old plan for the - * query_index'th query are locked in preparation for execution. - * - * Note that invalidations received during the execution of the query_index'th - * query can affect both the queries that have already finished execution - * (e.g., due to concurrent modifications on prunable relations that were not - * locked during their execution) and also the queries that have not yet been - * executed. As a result, this function updates all plans to ensure - * CachedPlan.is_valid is safely set to true. - * - * The old PlannedStmts in plansource->gplan->stmt_list are freed here, so - * the caller and any of its callers must not rely on them remaining accessible - * after this function is called. - */ -PlannedStmt * -UpdateCachedPlan(CachedPlanSource *plansource, int query_index, - QueryEnvironment *queryEnv) -{ - List *query_list = plansource->query_list, - *plan_list; - ListCell *l1, - *l2; - CachedPlan *plan = plansource->gplan; - MemoryContext oldcxt; - - Assert(ActiveSnapshotSet()); - - /* Sanity checks (XXX can be Asserts?) */ - if (plan == NULL) - elog(ERROR, "UpdateCachedPlan() called in the wrong context: plansource->gplan is NULL"); - else if (plan->is_valid) - elog(ERROR, "UpdateCachedPlan() called in the wrong context: plansource->gplan->is_valid is true"); - else if (plan->is_oneshot) - elog(ERROR, "UpdateCachedPlan() called in the wrong context: plansource->gplan->is_oneshot is true"); - - /* - * The plansource might have become invalid since GetCachedPlan() returned - * the CachedPlan. See the comment in BuildCachedPlan() for details on why - * this might happen. Although invalidation is likely a false positive as - * stated there, we make the plan valid to ensure the query list used for - * planning is up to date. - * - * The risk of catching an invalidation is higher here than when - * BuildCachedPlan() is called from GetCachedPlan(), because this function - * is normally called long after GetCachedPlan() returns the CachedPlan, - * so much more processing could have occurred including things that mark - * the CachedPlanSource invalid. - * - * Note: Do not release plansource->gplan, because the upstream callers - * (such as the callers of ExecutorStartCachedPlan()) would still be - * referencing it. 
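The BuildCachedPlan comments nearby describe a lifecycle that may be easier to see in isolation: a non-one-shot plan gets its own small-start context, the plan tree is copied into it, and the context is later reparented under CacheMemoryContext when the plan is saved. A hypothetical sketch of just that flow:

#include "postgres.h"

#include "nodes/pg_list.h"
#include "utils/memutils.h"

static MemoryContext
park_plan_list(List **plist)
{
    MemoryContext plan_context;
    MemoryContext oldcxt;

    /* transient for the moment: a child of the caller's context */
    plan_context = AllocSetContextCreate(CurrentMemoryContext,
                                         "CachedPlan",
                                         ALLOCSET_START_SMALL_SIZES);

    oldcxt = MemoryContextSwitchTo(plan_context);
    *plist = copyObject(*plist);
    MemoryContextSwitchTo(oldcxt);

    /* once the plan is deemed worth keeping, detach it from the caller */
    MemoryContextSetParent(plan_context, CacheMemoryContext);

    return plan_context;
}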
- */ - if (!plansource->is_valid) - query_list = RevalidateCachedQuery(plansource, queryEnv, false); - Assert(query_list != NIL); - - /* - * Build a new generic plan for all the queries after making a copy to be - * scribbled on by the planner. - */ - query_list = copyObject(query_list); - - /* - * Planning work is done in the caller's memory context. The resulting - * PlannedStmt is then copied into plan->stmt_context after throwing away - * the old ones. - */ - plan_list = pg_plan_queries(query_list, plansource->query_string, - plansource->cursor_options, NULL); - Assert(list_length(plan_list) == list_length(plan->stmt_list)); - - MemoryContextReset(plan->stmt_context); - oldcxt = MemoryContextSwitchTo(plan->stmt_context); - forboth(l1, plan_list, l2, plan->stmt_list) - { - PlannedStmt *plannedstmt = lfirst(l1); - - lfirst(l2) = copyObject(plannedstmt); - } - MemoryContextSwitchTo(oldcxt); - - /* - * XXX Should this also (re)set the properties of the CachedPlan that are - * set in BuildCachedPlan() after creating the fresh plans such as - * planRoleId, dependsOnRole, and saved_xmin? - */ - - /* - * We've updated all the plans that might have been invalidated, so mark - * the CachedPlan as valid. - */ - plan->is_valid = true; - - /* Also update generic_cost because we just created a new generic plan. */ - plansource->generic_cost = cached_plan_cost(plan, false); - - return list_nth_node(PlannedStmt, plan->stmt_list, query_index); -} - -/* * choose_custom_plan: choose whether to use custom or generic plan * * This defines the policy followed by GetCachedPlan. @@ -1402,13 +1280,8 @@ cached_plan_cost(CachedPlan *plan, bool include_planner) * plan or a custom plan for the given parameters: the caller does not know * which it will get. * - * On return, the plan is valid, but if it is a reused generic plan, not all - * locks are acquired. In such cases, CheckCachedPlan() does not take locks - * on relations subject to initial runtime pruning; instead, these locks are - * deferred until execution startup, when ExecDoInitialPruning() performs - * initial pruning. The plan's "is_reused" flag is set to indicate that - * CachedPlanRequiresLocking() should return true when called by - * ExecDoInitialPruning(). + * On return, the plan is valid and we have sufficient locks to begin + * execution. * * On return, the refcount of the plan has been incremented; a later * ReleaseCachedPlan() call is expected. If "owner" is not NULL then @@ -1425,6 +1298,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, CachedPlan *plan = NULL; List *qlist; bool customplan; + ListCell *lc; /* Assert caller is doing things in a sane order */ Assert(plansource->magic == CACHEDPLANSOURCE_MAGIC); @@ -1434,7 +1308,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, elog(ERROR, "cannot apply ResourceOwner to non-saved cached plan"); /* Make sure the querytree list is valid and we have parse-time locks */ - qlist = RevalidateCachedQuery(plansource, queryEnv, true); + qlist = RevalidateCachedQuery(plansource, queryEnv); /* Decide whether to use a custom plan */ customplan = choose_custom_plan(plansource, boundParams); @@ -1446,8 +1320,6 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, /* We want a generic plan, and we already have a valid one */ plan = plansource->gplan; Assert(plan->magic == CACHEDPLAN_MAGIC); - /* Reusing the existing plan, so not all locks may be acquired. 
*/ - plan->is_reused = true; } else { @@ -1529,6 +1401,13 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, plan->is_saved = true; } + foreach(lc, plan->stmt_list) + { + PlannedStmt *pstmt = (PlannedStmt *) lfirst(lc); + + pstmt->planOrigin = customplan ? PLAN_STMT_CACHE_CUSTOM : PLAN_STMT_CACHE_GENERIC; + } + return plan; } @@ -1913,7 +1792,7 @@ CachedPlanGetTargetList(CachedPlanSource *plansource, return NIL; /* Make sure the querytree list is valid and we have parse-time locks */ - RevalidateCachedQuery(plansource, queryEnv, true); + RevalidateCachedQuery(plansource, queryEnv); /* Get the primary statement and find out what it returns */ pstmt = QueryListGetPrimaryStmt(plansource->query_list); @@ -2035,7 +1914,7 @@ AcquireExecutorLocks(List *stmt_list, bool acquire) foreach(lc1, stmt_list) { PlannedStmt *plannedstmt = lfirst_node(PlannedStmt, lc1); - int rtindex; + ListCell *lc2; if (plannedstmt->commandType == CMD_UTILITY) { @@ -2053,16 +1932,13 @@ AcquireExecutorLocks(List *stmt_list, bool acquire) continue; } - rtindex = -1; - while ((rtindex = bms_next_member(plannedstmt->unprunableRelids, - rtindex)) >= 0) + foreach(lc2, plannedstmt->rtable) { - RangeTblEntry *rte = list_nth_node(RangeTblEntry, - plannedstmt->rtable, - rtindex - 1); + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc2); - Assert(rte->rtekind == RTE_RELATION || - (rte->rtekind == RTE_SUBQUERY && OidIsValid(rte->relid))); + if (!(rte->rtekind == RTE_RELATION || + (rte->rtekind == RTE_SUBQUERY && OidIsValid(rte->relid)))) + continue; /* * Acquire the appropriate type of lock on each relation OID. Note diff --git a/src/backend/utils/cache/ts_cache.c b/src/backend/utils/cache/ts_cache.c index 18cccd778fd..e8ae53238d0 100644 --- a/src/backend/utils/cache/ts_cache.c +++ b/src/backend/utils/cache/ts_cache.c @@ -321,7 +321,9 @@ lookup_ts_dictionary_cache(Oid dictId) /* * Init method runs in dictionary's private memory context, and we - * make sure the options are stored there too + * make sure the options are stored there too. This typically + * results in a small amount of memory leakage, but it's not worth + * complicating the API for tmplinit functions to avoid it. 
*/ oldcontext = MemoryContextSwitchTo(entry->dictCtx); diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index f9aec38a11f..6a347698edf 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -1171,9 +1171,6 @@ load_domaintype_info(TypeCacheEntry *typentry) elog(ERROR, "domain \"%s\" constraint \"%s\" has NULL conbin", NameStr(typTup->typname), NameStr(c->conname)); - /* Convert conbin to C string in caller context */ - constring = TextDatumGetCString(val); - /* Create the DomainConstraintCache object and context if needed */ if (dcc == NULL) { @@ -1189,9 +1186,8 @@ load_domaintype_info(TypeCacheEntry *typentry) dcc->dccRefCount = 0; } - /* Create node trees in DomainConstraintCache's context */ - oldcxt = MemoryContextSwitchTo(dcc->dccContext); - + /* Convert conbin to a node tree, still in caller's context */ + constring = TextDatumGetCString(val); check_expr = (Expr *) stringToNode(constring); /* @@ -1206,10 +1202,13 @@ load_domaintype_info(TypeCacheEntry *typentry) */ check_expr = expression_planner(check_expr); + /* Create only the minimally needed stuff in dccContext */ + oldcxt = MemoryContextSwitchTo(dcc->dccContext); + r = makeNode(DomainConstraintState); r->constrainttype = DOM_CONSTRAINT_CHECK; r->name = pstrdup(NameStr(c->conname)); - r->check_expr = check_expr; + r->check_expr = copyObject(check_expr); r->check_exprstate = NULL; MemoryContextSwitchTo(oldcxt); diff --git a/src/backend/utils/error/csvlog.c b/src/backend/utils/error/csvlog.c index fdac3c048e3..c3159ed7d97 100644 --- a/src/backend/utils/error/csvlog.c +++ b/src/backend/utils/error/csvlog.c @@ -120,7 +120,7 @@ write_csvlog(ErrorData *edata) appendStringInfoChar(&buf, ','); /* session id */ - appendStringInfo(&buf, INT64_HEX_FORMAT ".%x", MyStartTime, MyProcPid); + appendStringInfo(&buf, "%" PRIx64 ".%x", MyStartTime, MyProcPid); appendStringInfoChar(&buf, ','); /* Line number */ diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 47af743990f..b7b9692f8c8 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -1128,12 +1128,15 @@ set_backtrace(ErrorData *edata, int num_skip) nframes = backtrace(buf, lengthof(buf)); strfrms = backtrace_symbols(buf, nframes); - if (strfrms == NULL) - return; - - for (int i = num_skip; i < nframes; i++) - appendStringInfo(&errtrace, "\n%s", strfrms[i]); - free(strfrms); + if (strfrms != NULL) + { + for (int i = num_skip; i < nframes; i++) + appendStringInfo(&errtrace, "\n%s", strfrms[i]); + free(strfrms); + } + else + appendStringInfoString(&errtrace, + "insufficient memory for backtrace generation"); } #else appendStringInfoString(&errtrace, @@ -2956,12 +2959,12 @@ log_status_format(StringInfo buf, const char *format, ErrorData *edata) { char strfbuf[128]; - snprintf(strfbuf, sizeof(strfbuf) - 1, INT64_HEX_FORMAT ".%x", + snprintf(strfbuf, sizeof(strfbuf) - 1, "%" PRIx64 ".%x", MyStartTime, MyProcPid); appendStringInfo(buf, "%*s", padding, strfbuf); } else - appendStringInfo(buf, INT64_HEX_FORMAT ".%x", MyStartTime, MyProcPid); + appendStringInfo(buf, "%" PRIx64 ".%x", MyStartTime, MyProcPid); break; case 'p': if (padding != 0) diff --git a/src/backend/utils/error/jsonlog.c b/src/backend/utils/error/jsonlog.c index 519eacf17f8..2619f499042 100644 --- a/src/backend/utils/error/jsonlog.c +++ b/src/backend/utils/error/jsonlog.c @@ -168,7 +168,7 @@ write_jsonlog(ErrorData *edata) } /* Session id */ - appendJSONKeyValueFmt(&buf, "session_id", true, 
INT64_HEX_FORMAT ".%x", + appendJSONKeyValueFmt(&buf, "session_id", true, "%" PRIx64 ".%x", MyStartTime, MyProcPid); /* Line number */ diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index 603632581d0..4bb84ff7087 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -99,6 +99,14 @@ load_external_function(const char *filename, const char *funcname, void *lib_handle; void *retval; + /* + * If the value starts with "$libdir/", strip that. This is because many + * extensions have hardcoded '$libdir/foo' as their library name, which + * prevents using the path. + */ + if (strncmp(filename, "$libdir/", 8) == 0) + filename += 8; + /* Expand the possibly-abbreviated filename to an exact path name */ fullname = expand_dynamic_library_name(filename); @@ -456,14 +464,6 @@ expand_dynamic_library_name(const char *name) Assert(name); - /* - * If the value starts with "$libdir/", strip that. This is because many - * extensions have hardcoded '$libdir/foo' as their library name, which - * prevents using the path. - */ - if (strncmp(name, "$libdir/", 8) == 0) - name += 8; - have_slash = (first_dir_separator(name) != NULL); if (!have_slash) diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1ad155d446e..81da03629f0 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -22,10 +22,11 @@ * lookup key's hash value as a partition number --- this will work because * of the way calc_bucket() maps hash values to bucket numbers. * - * For hash tables in shared memory, the memory allocator function should - * match malloc's semantics of returning NULL on failure. For hash tables - * in local memory, we typically use palloc() which will throw error on - * failure. The code in this file has to cope with both cases. + * The memory allocator function should match malloc's semantics of returning + * NULL on failure. (This is essential for hash tables in shared memory. + * For hash tables in local memory, we used to use palloc() which will throw + * error on failure; but we no longer do, so it's untested whether this + * module will still cope with that behavior.) * * dynahash.c provides support for these types of lookup keys: * @@ -98,6 +99,7 @@ #include "access/xact.h" #include "common/hashfn.h" +#include "lib/ilist.h" #include "port/pg_bitutils.h" #include "storage/shmem.h" #include "storage/spin.h" @@ -195,6 +197,7 @@ struct HASHHDR long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ int nelem_alloc; /* number of entries to allocate at once */ + bool isfixed; /* if true, don't enlarge */ #ifdef HASH_STATISTICS @@ -227,7 +230,6 @@ struct HTAB MemoryContext hcxt; /* memory context if default allocator used */ char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ - bool isfixed; /* if true, don't enlarge */ /* freezing a shared table isn't allowed, so we can keep state here */ bool frozen; /* true = no more inserts allowed */ @@ -236,6 +238,16 @@ struct HTAB Size keysize; /* hash key length in bytes */ long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ + + /* + * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of + * all the element blocks they have allocated. This pacifies Valgrind, + * which would otherwise often claim that the element blocks are "possibly + * lost" for lack of any non-interior pointers to their starts. 
+ */ +#ifdef USE_VALGRIND + slist_head element_blocks; +#endif }; /* @@ -618,8 +630,10 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags) } } + /* Set isfixed if requested, but not till after we build initial entries */ if (flags & HASH_FIXED_SIZE) - hashp->isfixed = true; + hctl->isfixed = true; + return hashp; } @@ -644,6 +658,8 @@ hdefault(HTAB *hashp) hctl->ssize = DEF_SEGSIZE; hctl->sshift = DEF_SEGSIZE_SHIFT; + hctl->isfixed = false; /* can be enlarged */ + #ifdef HASH_STATISTICS hctl->accesses = hctl->collisions = 0; #endif @@ -1708,23 +1724,51 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx) { HASHHDR *hctl = hashp->hctl; Size elementSize; + Size requestSize; + char *allocedBlock; HASHELEMENT *firstElement; HASHELEMENT *tmpElement; HASHELEMENT *prevElement; int i; - if (hashp->isfixed) + if (hctl->isfixed) return false; /* Each element has a HASHELEMENT header plus user data. */ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); + requestSize = nelem * elementSize; + + /* Add space for slist_node list link if we need one. */ +#ifdef USE_VALGRIND + if (!hashp->isshared) + requestSize += MAXALIGN(sizeof(slist_node)); +#endif + + /* Allocate the memory. */ CurrentDynaHashCxt = hashp->hcxt; - firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); + allocedBlock = hashp->alloc(requestSize); - if (!firstElement) + if (!allocedBlock) return false; + /* + * If USE_VALGRIND, each allocated block of elements of a non-shared + * hashtable is chained into a list, so that Valgrind won't think it's + * been leaked. + */ +#ifdef USE_VALGRIND + if (hashp->isshared) + firstElement = (HASHELEMENT *) allocedBlock; + else + { + slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock); + firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node))); + } +#else + firstElement = (HASHELEMENT *) allocedBlock; +#endif + /* prepare to link all the new entries into the freelist */ prevElement = NULL; tmpElement = firstElement; diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 92b0446b80c..d31cb45a058 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -39,7 +39,6 @@ volatile sig_atomic_t TransactionTimeoutPending = false; volatile sig_atomic_t IdleSessionTimeoutPending = false; volatile sig_atomic_t ProcSignalBarrierPending = false; volatile sig_atomic_t LogMemoryContextPending = false; -volatile sig_atomic_t PublishMemoryContextPending = false; volatile sig_atomic_t IdleStatsUpdateTimeoutPending = false; volatile uint32 InterruptHoldoffCount = 0; volatile uint32 QueryCancelHoldoffCount = 0; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 43b4dbccc3d..65d8cbfaed5 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1183,7 +1183,6 @@ UnlinkLockFiles(int status, Datum arg) /* Should we complain if the unlink fails? 
*/ } /* Since we're about to exit, no need to reclaim storage */ - lock_files = NIL; /* * Lock file removal should always be the last externally visible action diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 89d72cdd5ff..641e535a73c 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -417,12 +417,11 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); ctype = TextDatumGetCString(datum); - if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) - ereport(FATAL, - (errmsg("database locale is incompatible with operating system"), - errdetail("The database was initialized with LC_COLLATE \"%s\", " - " which is not recognized by setlocale().", collate), - errhint("Recreate the database with another locale or install the missing locale."))); + /* + * Historically, we set LC_COLLATE from datcollate, as well. That's no + * longer necessary because all collation behavior is handled through + * pg_locale_t. + */ if (pg_perm_setlocale(LC_CTYPE, ctype) == NULL) ereport(FATAL, @@ -663,13 +662,6 @@ BaseInit(void) * drop ephemeral slots, which in turn triggers stats reporting. */ ReplicationSlotInitialize(); - - /* - * The before shmem exit callback frees the DSA memory occupied by the - * latest memory context statistics that could be published by this proc - * if requested. - */ - before_shmem_exit(AtProcExit_memstats_cleanup, 0); } diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 308016d7763..886ecbad871 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -39,6 +39,7 @@ #include "mb/pg_wchar.h" #include "utils/fmgrprotos.h" #include "utils/memutils.h" +#include "utils/relcache.h" #include "varatt.h" /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 667df448732..e404c345e6e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -249,6 +249,7 @@ static void reapply_stacked_values(struct config_generic *variable, const char *curvalue, GucContext curscontext, GucSource cursource, Oid cursrole); +static void free_placeholder(struct config_string *pHolder); static bool validate_option_array_item(const char *name, const char *value, bool skipIfNoPermissions); static void write_auto_conf_file(int fd, const char *filename, ConfigVariable *head); @@ -4722,8 +4723,13 @@ AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt) * the config file cannot cause postmaster start to fail, so we * don't have to be too tense about possibly installing a bad * value.) + * + * As an exception, we skip this check if this is a RESET command + * for an unknown custom GUC, else there'd be no way for users to + * remove such settings with reserved prefixes. */ - (void) assignable_custom_variable_name(name, false, ERROR); + if (value || !valid_custom_variable_name(name)) + (void) assignable_custom_variable_name(name, false, ERROR); } /* @@ -5018,16 +5024,8 @@ define_custom_variable(struct config_generic *variable) set_config_sourcefile(name, pHolder->gen.sourcefile, pHolder->gen.sourceline); - /* - * Free up as much as we conveniently can of the placeholder structure. - * (This neglects any stack items, so it's possible for some memory to be - * leaked. Since this can only happen once per session per variable, it - * doesn't seem worth spending much code on. 
- */ - set_string_field(pHolder, pHolder->variable, NULL); - set_string_field(pHolder, &pHolder->reset_val, NULL); - - guc_free(pHolder); + /* Now we can free the no-longer-referenced placeholder variable */ + free_placeholder(pHolder); } /* @@ -5127,6 +5125,25 @@ reapply_stacked_values(struct config_generic *variable, } /* + * Free up a no-longer-referenced placeholder GUC variable. + * + * This neglects any stack items, so it's possible for some memory to be + * leaked. Since this can only happen once per session per variable, it + * doesn't seem worth spending much code on. + */ +static void +free_placeholder(struct config_string *pHolder) +{ + /* Placeholders are always STRING type, so free their values */ + Assert(pHolder->gen.vartype == PGC_STRING); + set_string_field(pHolder, pHolder->variable, NULL); + set_string_field(pHolder, &pHolder->reset_val, NULL); + + guc_free(unconstify(char *, pHolder->gen.name)); + guc_free(pHolder); +} + +/* * Functions for extensions to call to define their custom GUC variables. */ void @@ -5286,9 +5303,7 @@ MarkGUCPrefixReserved(const char *className) /* * Check for existing placeholders. We must actually remove invalid - * placeholders, else future parallel worker startups will fail. (We - * don't bother trying to free associated memory, since this shouldn't - * happen often.) + * placeholders, else future parallel worker startups will fail. */ hash_seq_init(&status, guc_hashtab); while ((hentry = (GUCHashEntry *) hash_seq_search(&status)) != NULL) @@ -5312,6 +5327,8 @@ MarkGUCPrefixReserved(const char *className) NULL); /* Remove it from any lists it's in, too */ RemoveGUCFromLists(var); + /* And free it */ + free_placeholder((struct config_string *) var); } } @@ -6711,6 +6728,7 @@ validate_option_array_item(const char *name, const char *value, { struct config_generic *gconf; + bool reset_custom; /* * There are three cases to consider: @@ -6729,16 +6747,21 @@ validate_option_array_item(const char *name, const char *value, * it's assumed to be fully validated.) * * name is not known and can't be created as a placeholder. Throw error, - * unless skipIfNoPermissions is true, in which case return false. + * unless skipIfNoPermissions or reset_custom is true. If reset_custom is + * true, this is a RESET or RESET ALL operation for an unknown custom GUC + * with a reserved prefix, in which case we want to fall through to the + * placeholder case described in the preceding paragraph (else there'd be + * no way for users to remove them). Otherwise, return false. 
*/ - gconf = find_option(name, true, skipIfNoPermissions, ERROR); - if (!gconf) + reset_custom = (!value && valid_custom_variable_name(name)); + gconf = find_option(name, true, skipIfNoPermissions || reset_custom, ERROR); + if (!gconf && !reset_custom) { /* not known, failed to make a placeholder */ return false; } - if (gconf->flags & GUC_CUSTOM_PLACEHOLDER) + if (!gconf || gconf->flags & GUC_CUSTOM_PLACEHOLDER) { /* * We cannot do any meaningful check on the value, so only permissions diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 2f8cbd86759..d14b1678e7f 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1028,7 +1028,7 @@ struct config_bool ConfigureNamesBool[] = }, { {"enable_distinct_reordering", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables reordering of DISTINCT pathkeys."), + gettext_noop("Enables reordering of DISTINCT keys."), NULL, GUC_EXPLAIN }, @@ -1602,11 +1602,11 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { - {"log_lock_failure", PGC_SUSET, LOGGING_WHAT, + {"log_lock_failures", PGC_SUSET, LOGGING_WHAT, gettext_noop("Logs lock failures."), NULL }, - &log_lock_failure, + &log_lock_failures, false, NULL, NULL, NULL }, @@ -3081,7 +3081,7 @@ struct config_int ConfigureNamesInt[] = }, &max_slot_wal_keep_size_mb, -1, -1, MAX_KILOBYTES, - check_max_slot_wal_keep_size, NULL, NULL + NULL, NULL, NULL }, { @@ -3100,11 +3100,11 @@ struct config_int ConfigureNamesInt[] = gettext_noop("Sets the duration a replication slot can remain idle before " "it is invalidated."), NULL, - GUC_UNIT_MIN + GUC_UNIT_S }, - &idle_replication_slot_timeout_mins, - 0, 0, INT_MAX / SECS_PER_MINUTE, - check_idle_replication_slot_timeout, NULL, NULL + &idle_replication_slot_timeout_secs, + 0, 0, INT_MAX, + NULL, NULL, NULL }, { @@ -4837,7 +4837,7 @@ struct config_string ConfigureNamesString[] = { {"ssl_groups", PGC_SIGHUP, CONN_AUTH_SSL, gettext_noop("Sets the group(s) to use for Diffie-Hellman key exchange."), - gettext_noop("Multiple groups can be specified using colon-separated list."), + gettext_noop("Multiple groups can be specified using a colon-separated list."), GUC_SUPERUSER_ONLY }, &SSLECDHCurve, diff --git a/src/backend/utils/misc/injection_point.c b/src/backend/utils/misc/injection_point.c index f58ebc8ee52..83b887b6978 100644 --- a/src/backend/utils/misc/injection_point.c +++ b/src/backend/utils/misc/injection_point.c @@ -584,3 +584,49 @@ IsInjectionPointAttached(const char *name) return false; /* silence compiler */ #endif } + +/* + * Retrieve a list of all the injection points currently attached. + * + * This list is palloc'd in the current memory context. 
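Since InjectionPointList() below returns a plain List of InjectionPointData entries palloc'd in the current context, consuming it is a straightforward foreach; a hypothetical caller (requires a USE_INJECTION_POINTS build) might look like:

#include "postgres.h"

#include "nodes/pg_list.h"
#include "utils/injection_point.h"

static void
log_attached_points(void)
{
    List       *points = InjectionPointList();
    ListCell   *lc;

    foreach(lc, points)
    {
        InjectionPointData *p = (InjectionPointData *) lfirst(lc);

        elog(LOG, "injection point \"%s\" runs %s from %s",
             p->name, p->function, p->library);
    }
}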
+ */ +List * +InjectionPointList(void) +{ +#ifdef USE_INJECTION_POINTS + List *inj_points = NIL; + uint32 max_inuse; + + LWLockAcquire(InjectionPointLock, LW_SHARED); + + max_inuse = pg_atomic_read_u32(&ActiveInjectionPoints->max_inuse); + + for (uint32 idx = 0; idx < max_inuse; idx++) + { + InjectionPointEntry *entry; + InjectionPointData *inj_point; + uint64 generation; + + entry = &ActiveInjectionPoints->entries[idx]; + generation = pg_atomic_read_u64(&entry->generation); + + /* skip free slots */ + if (generation % 2 == 0) + continue; + + inj_point = (InjectionPointData *) palloc0(sizeof(InjectionPointData)); + inj_point->name = pstrdup(entry->name); + inj_point->library = pstrdup(entry->library); + inj_point->function = pstrdup(entry->function); + inj_points = lappend(inj_points, inj_point); + } + + LWLockRelease(InjectionPointLock); + + return inj_points; + +#else + elog(ERROR, "Injection points are not supported by this build"); + return NIL; /* keep compiler quiet */ +#endif +} diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 34826d01380..a9d8293474a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -97,6 +97,7 @@ #password_encryption = scram-sha-256 # scram-sha-256 or md5 #scram_iterations = 4096 #md5_password_warnings = on +#oauth_validator_libraries = '' # comma-separated list of trusted validator modules # GSSAPI using Kerberos #krb_server_keyfile = 'FILE:${sysconfdir}/krb5.keytab' @@ -121,9 +122,6 @@ #ssl_passphrase_command = '' #ssl_passphrase_command_supports_reload = off -# OAuth -#oauth_validator_libraries = '' # comma-separated list of trusted validator modules - #------------------------------------------------------------------------------ # RESOURCE USAGE (except WAL) @@ -180,13 +178,11 @@ #temp_file_limit = -1 # limits per-process temp file space # in kilobytes, or -1 for no limit +#file_copy_method = copy # copy, clone (if supported by OS) + #max_notify_queue_pages = 1048576 # limits the number of SLRU pages allocated # for NOTIFY / LISTEN queue -#file_copy_method = copy # the default is the first option - # copy - # clone (if system support is available) - # - Kernel Resources - #max_files_per_process = 1000 # min 64 @@ -346,7 +342,7 @@ # (change requires restart) #wal_keep_size = 0 # in megabytes; 0 disables #max_slot_wal_keep_size = -1 # in megabytes; -1 disables -#idle_replication_slot_timeout = 0 # in minutes; 0 disables +#idle_replication_slot_timeout = 0 # in seconds; 0 disables #wal_sender_timeout = 60s # in milliseconds; 0 disables #track_commit_timestamp = off # collect timestamp of transaction commit # (change requires restart) @@ -628,7 +624,7 @@ # %% = '%' # e.g. 
'<%u%%%d> ' #log_lock_waits = off # log lock waits >= deadlock_timeout -#log_lock_failure = off # log lock failures +#log_lock_failures = off # log lock failures #log_recovery_conflict_waits = off # log standby recovery conflict waits # >= deadlock_timeout #log_parameter_max_length = -1 # when logging statements, limit logged diff --git a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index e08b26e8c14..4df25944deb 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -100,6 +100,17 @@ static void flush_ps_display(void); static int save_argc; static char **save_argv; +/* + * Valgrind seems not to consider the global "environ" variable as a valid + * root pointer; so when we allocate a new environment array, it claims that + * data is leaked. To fix that, keep our own statically-allocated copy of the + * pointer. (Oddly, this doesn't seem to be a problem for "argv".) + */ +#if defined(PS_USE_CLOBBER_ARGV) && defined(USE_VALGRIND) +extern char **ps_status_new_environ; +char **ps_status_new_environ; +#endif + /* * Call this early in startup to save the original argc/argv values. @@ -206,6 +217,11 @@ save_ps_display_args(int argc, char **argv) } new_environ[i] = NULL; environ = new_environ; + + /* See notes about Valgrind above. */ +#ifdef USE_VALGRIND + ps_status_new_environ = new_environ; +#endif } /* diff --git a/src/backend/utils/mmgr/alignedalloc.c b/src/backend/utils/mmgr/alignedalloc.c index 85aee389d6b..b1be7426914 100644 --- a/src/backend/utils/mmgr/alignedalloc.c +++ b/src/backend/utils/mmgr/alignedalloc.c @@ -45,6 +45,16 @@ AlignedAllocFree(void *pointer) GetMemoryChunkContext(unaligned)->name, chunk); #endif + /* + * Create a dummy vchunk covering the start of the unaligned chunk, but + * not overlapping the aligned chunk. This will be freed while pfree'ing + * the unaligned chunk, keeping Valgrind happy. Then when we return to + * the outer pfree, that will clean up the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(GetMemoryChunkContext(unaligned), unaligned, + (char *) pointer - (char *) unaligned); + + /* Recursively pfree the unaligned chunk */ pfree(unaligned); } @@ -96,18 +106,41 @@ AlignedAllocRealloc(void *pointer, Size size, int flags) Assert(old_size >= redirchunk->requested_size); #endif + /* + * To keep things simple, we always allocate a new aligned chunk and copy + * data into it. Because of the above inaccuracy, this may end in copying + * more data than was in the original allocation request size, but that + * should be OK. + */ ctx = GetMemoryChunkContext(unaligned); newptr = MemoryContextAllocAligned(ctx, size, alignto, flags); - /* - * We may memcpy beyond the end of the original allocation request size, - * so we must mark the entire allocation as defined. - */ - if (likely(newptr != NULL)) + /* Cope cleanly with OOM */ + if (unlikely(newptr == NULL)) { - VALGRIND_MAKE_MEM_DEFINED(pointer, old_size); - memcpy(newptr, pointer, Min(size, old_size)); + VALGRIND_MAKE_MEM_NOACCESS(redirchunk, sizeof(MemoryChunk)); + return MemoryContextAllocationFailure(ctx, size, flags); } + + /* + * We may memcpy more than the original allocation request size, which + * would result in trying to copy trailing bytes that the original + * MemoryContextAllocAligned call marked NOACCESS. So we must mark the + * entire old_size as defined. That's slightly annoying, but probably not + * worth improving. 
+ */ + VALGRIND_MAKE_MEM_DEFINED(pointer, old_size); + memcpy(newptr, pointer, Min(size, old_size)); + + /* + * Create a dummy vchunk covering the start of the old unaligned chunk, + * but not overlapping the aligned chunk. This will be freed while + * pfree'ing the old unaligned chunk, keeping Valgrind happy. Then when + * we return to repalloc, it will move the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(ctx, unaligned, + (char *) pointer - (char *) unaligned); + pfree(unaligned); return newptr; diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 666ecd8f78d..9ef109ca586 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -103,6 +103,8 @@ #define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData)) #define ALLOC_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(AllocSetContext)) + \ + ALLOC_BLOCKHDRSZ) typedef struct AllocBlockData *AllocBlock; /* forward reference */ @@ -458,6 +460,21 @@ AllocSetContextCreateInternal(MemoryContext parent, * we'd leak the header/initial block if we ereport in this stretch. */ + /* Create a vpool associated with the context */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + + /* + * Create a vchunk covering both the AllocSetContext struct and the keeper + * block's header. (Perhaps it would be more sensible for these to be two + * separate vchunks, but doing that seems to tickle bugs in some versions + * of Valgrind.) We must have these vchunks, and also a vchunk for each + * subsequently-added block header, so that Valgrind considers the + * pointers within them while checking for leaked memory. Note that + * Valgrind doesn't distinguish between these vchunks and those created by + * mcxt.c for the user-accessible-data chunks we allocate. + */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + /* Fill in the initial block's block header */ block = KeeperBlock(set); block->aset = set; @@ -585,6 +602,14 @@ AllocSetReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* + * We need to free the block header's vchunk explicitly, although + * the user-data vchunks within will go away in the TRIM below. + * Otherwise Valgrind complains about leaked allocations. + */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } block = next; @@ -592,6 +617,14 @@ AllocSetReset(MemoryContext context) Assert(context->mem_allocated == keepersize); + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the AllocSetContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; } @@ -648,6 +681,9 @@ AllocSetDelete(MemoryContext context) freelist->first_free = (AllocSetContext *) oldset->header.nextchild; freelist->num_free--; + /* Destroy the context's vpool --- see notes below */ + VALGRIND_DESTROY_MEMPOOL(oldset); + /* All that remains is to free the header/initial block */ free(oldset); } @@ -675,13 +711,24 @@ AllocSetDelete(MemoryContext context) #endif if (!IsKeeperBlock(set, block)) + { + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); free(block); + } block = next; } Assert(context->mem_allocated == keepersize); + /* + * Destroy the vpool. 
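The aset.c hunks above add Valgrind client requests at several separate points; laid end to end, the vpool/vchunk lifecycle for one memory context looks like the sketch below (macros from utils/memdebug.h, no-ops unless USE_VALGRIND; the parameters are hypothetical stand-ins for the context and one of its blocks):

#include "postgres.h"

#include "utils/memdebug.h"

static void
vpool_lifecycle(void *set, void *block, Size hdrsz, Size first_hdrsz)
{
    VALGRIND_CREATE_MEMPOOL(set, 0, false);         /* context created */
    VALGRIND_MEMPOOL_ALLOC(set, set, first_hdrsz);  /* header + keeper block */
    VALGRIND_MEMPOOL_ALLOC(set, block, hdrsz);      /* each new block header */
    VALGRIND_MEMPOOL_FREE(set, block);              /* block explicitly freed */
    VALGRIND_MEMPOOL_TRIM(set, set, first_hdrsz);   /* context reset */
    VALGRIND_DESTROY_MEMPOOL(set);                  /* context deleted */
}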
We don't seem to need to explicitly free the + * initial block's header vchunk, nor any user-data vchunks that Valgrind + * still knows about; they'll all go away automatically. + */ + VALGRIND_DESTROY_MEMPOOL(set); + /* Finally, free the context header, including the keeper block */ free(set); } @@ -716,6 +763,9 @@ AllocSetAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -922,6 +972,9 @@ AllocSetAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -1104,6 +1157,10 @@ AllocSetFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } else @@ -1184,6 +1241,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) * realloc() to make the containing block bigger, or smaller, with * minimum space wastage. */ + AllocBlock newblock; Size chksize; Size blksize; Size oldblksize; @@ -1223,14 +1281,21 @@ AllocSetRealloc(void *pointer, Size size, int flags) blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; oldblksize = block->endptr - ((char *) block); - block = (AllocBlock) realloc(block, blksize); - if (block == NULL) + newblock = (AllocBlock) realloc(block, blksize); + if (newblock == NULL) { /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return MemoryContextAllocationFailure(&set->header, size, flags); } + /* + * Move the block-header vchunk explicitly. (mcxt.c will take care of + * moving the vchunk for the user data.) + */ + VALGRIND_MEMPOOL_CHANGE(set, block, newblock, ALLOC_BLOCKHDRSZ); + block = newblock; + /* updated separately, not to underflow when (oldblksize > blksize) */ set->header.mem_allocated -= oldblksize; set->header.mem_allocated += blksize; @@ -1294,7 +1359,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) /* Ensure any padding bytes are marked NOACCESS. */ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size); - /* Disallow access to the chunk header . */ + /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return pointer; diff --git a/src/backend/utils/mmgr/bump.c b/src/backend/utils/mmgr/bump.c index f7a37d1b3e8..2805d55a2ec 100644 --- a/src/backend/utils/mmgr/bump.c +++ b/src/backend/utils/mmgr/bump.c @@ -45,7 +45,9 @@ #include "utils/memutils_memorychunk.h" #include "utils/memutils_internal.h" -#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(BumpContext)) + \ + Bump_BLOCKHDRSZ) /* No chunk header unless built with MEMORY_CONTEXT_CHECKING */ #ifdef MEMORY_CONTEXT_CHECKING @@ -189,6 +191,12 @@ BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header and initial block if we ereport in this stretch. 
*/ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the BumpContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -262,6 +270,14 @@ BumpReset(MemoryContext context) BumpBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the BumpContext and keeper-block + * header. This gets rid of the vchunks for whatever user data is getting + * discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; @@ -279,6 +295,10 @@ BumpDelete(MemoryContext context) { /* Reset to release all releasable BumpBlocks */ BumpReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -318,6 +338,9 @@ BumpAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* the block is completely full */ @@ -455,6 +478,9 @@ BumpAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -606,6 +632,9 @@ BumpBlockFree(BumpContext *set, BumpBlock *block) wipe_mem(block, ((char *) block->endptr - (char *) block)); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 17d4f7a7a06..be43e9351c3 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -532,6 +532,21 @@ dsa_attach(dsa_handle handle) } /* + * Returns whether the area with the given handle was already attached by the + * current process. The area must have been created with dsa_create (not + * dsa_create_in_place). + */ +bool +dsa_is_attached(dsa_handle handle) +{ + /* + * An area handle is really a DSM segment handle for the first segment, so + * we can just search for that. + */ + return dsm_find_mapping(handle) != NULL; +} + +/* * Attach to an area that was created with dsa_create_in_place. The caller * must somehow know the location in memory that was used when the area was * created, though it may be mapped at a different virtual address in this diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c index 18679ad4f1e..cfafc9bf082 100644 --- a/src/backend/utils/mmgr/generation.c +++ b/src/backend/utils/mmgr/generation.c @@ -45,6 +45,8 @@ #define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock)) #define Generation_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(GenerationContext)) + \ + Generation_BLOCKHDRSZ) #define Generation_CHUNK_FRACTION 8 @@ -221,6 +223,12 @@ GenerationContextCreate(MemoryContext parent, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header if we ereport in this stretch. 
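The new dsa_is_attached() (in the dsa.c hunk above) makes an attach-once guard possible; a hypothetical sketch, assuming the handle came from dsa_create() as the function's contract requires:

#include "postgres.h"

#include "utils/dsa.h"

static dsa_area *
attach_if_needed(dsa_handle handle)
{
    /* attaching the same segment twice in one backend would fail */
    if (dsa_is_attached(handle))
        return NULL;            /* some earlier code already attached it */

    return dsa_attach(handle);
}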
*/ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the GenerationContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -309,6 +317,14 @@ GenerationReset(MemoryContext context) GenerationBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the GenerationContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* set it so new allocations to make use of the keeper block */ set->block = KeeperBlock(set); @@ -329,6 +345,10 @@ GenerationDelete(MemoryContext context) { /* Reset to release all releasable GenerationBlocks */ GenerationReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -365,6 +385,9 @@ GenerationAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* block with a single (used) chunk */ @@ -487,6 +510,9 @@ GenerationAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -677,6 +703,9 @@ GenerationBlockFree(GenerationContext *set, GenerationBlock *block) wipe_mem(block, block->blksize); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 7d28ca706eb..47fd774c7d2 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -8,6 +8,23 @@ * context-type-specific operations via the function pointers in a * context's MemoryContextMethods struct. * + * A note about Valgrind support: when USE_VALGRIND is defined, we provide + * support for memory leak tracking at the allocation-unit level. Valgrind + * does leak detection by tracking allocated "chunks", which can be grouped + * into "pools". The "chunk" terminology is overloaded, since we use that + * word for our allocation units, and it's sometimes important to distinguish + * those from the Valgrind objects that describe them. To reduce confusion, + * let's use the terms "vchunk" and "vpool" for the Valgrind objects. + * + * We use a separate vpool for each memory context. The context-type-specific + * code is responsible for creating and deleting the vpools, and also for + * creating vchunks to cover its management data structures such as block + * headers. (There must be a vchunk that includes every pointer we want + * Valgrind to consider for leak-tracking purposes.) This module creates + * and deletes the vchunks that cover the caller-visible allocated chunks. + * However, the context-type-specific code must handle cleaning up those + * vchunks too during memory context reset operations. 
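Concretely, the calls described in this note pair up across a context's lifecycle roughly as sketched below (schematic only, not actual PostgreSQL code; `cxt` is the context header address and MGMT_SIZE stands for whatever management data the context type chooses to cover):

    /* creation, in the context-type-specific create function */
    VALGRIND_CREATE_MEMPOOL(cxt, 0, false);
    VALGRIND_MEMPOOL_ALLOC(cxt, cxt, MGMT_SIZE);      /* management vchunk */

    /* per-allocation bookkeeping, handled centrally in mcxt.c */
    VALGRIND_MEMPOOL_ALLOC(cxt, ptr, size);           /* palloc */
    VALGRIND_MEMPOOL_CHANGE(cxt, ptr, newptr, size);  /* repalloc */
    VALGRIND_MEMPOOL_FREE(cxt, ptr);                  /* pfree */

    /* reset: discard every vchunk except the management one */
    VALGRIND_MEMPOOL_TRIM(cxt, cxt, MGMT_SIZE);

    /* deletion: any remaining vchunks vanish with the vpool */
    VALGRIND_DESTROY_MEMPOOL(cxt);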
+ * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -23,11 +40,6 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" -#include "nodes/pg_list.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "utils/dsa.h" -#include "utils/hsearch.h" #include "utils/memdebug.h" #include "utils/memutils.h" #include "utils/memutils_internal.h" @@ -140,17 +152,6 @@ static const MemoryContextMethods mcxt_methods[] = { }; #undef BOGUS_MCTX -/* - * This is passed to MemoryContextStatsInternal to determine whether - * to print context statistics or not and where to print them logs or - * stderr. - */ -typedef enum PrintDestination -{ - PRINT_STATS_TO_STDERR = 0, - PRINT_STATS_TO_LOGS, - PRINT_STATS_NONE -} PrintDestination; /* * CurrentMemoryContext @@ -172,31 +173,16 @@ MemoryContext CurTransactionContext = NULL; /* This is a transient link to the active portal's memory context: */ MemoryContext PortalContext = NULL; -dsa_area *MemoryStatsDsaArea = NULL; static void MemoryContextDeleteOnly(MemoryContext context); static void MemoryContextCallResetCallbacks(MemoryContext context); static void MemoryContextStatsInternal(MemoryContext context, int level, int max_level, int max_children, MemoryContextCounters *totals, - PrintDestination print_location, - int *num_contexts); + bool print_to_stderr); static void MemoryContextStatsPrint(MemoryContext context, void *passthru, const char *stats_string, bool print_to_stderr); -static void PublishMemoryContext(MemoryStatsEntry *memcxt_info, - int curr_id, MemoryContext context, - List *path, - MemoryContextCounters stat, - int num_contexts, dsa_area *area, - int max_levels); -static void compute_contexts_count_and_ids(List *contexts, HTAB *context_id_lookup, - int *stats_count, - bool summary); -static List *compute_context_path(MemoryContext c, HTAB *context_id_lookup); -static void free_memorycontextstate_dsa(dsa_area *area, int total_stats, - dsa_pointer prev_dsa_pointer); -static void end_memorycontext_reporting(void); /* * You should not do memory allocations within a critical section, because @@ -449,8 +435,6 @@ MemoryContextResetOnly(MemoryContext context) context->methods->reset(context); context->isReset = true; - VALGRIND_DESTROY_MEMPOOL(context); - VALGRIND_CREATE_MEMPOOL(context, 0, false); } } @@ -557,8 +541,6 @@ MemoryContextDeleteOnly(MemoryContext context) context->ident = NULL; context->methods->delete_context(context); - - VALGRIND_DESTROY_MEMPOOL(context); } /* @@ -591,9 +573,7 @@ MemoryContextDeleteChildren(MemoryContext context) * the specified context, since that means it will automatically be freed * when no longer needed. * - * There is no API for deregistering a callback once registered. If you - * want it to not do anything anymore, adjust the state pointed to by its - * "arg" to indicate that. + * Note that callers can assume this cannot fail. */ void MemoryContextRegisterResetCallback(MemoryContext context, @@ -609,6 +589,41 @@ MemoryContextRegisterResetCallback(MemoryContext context, } /* + * MemoryContextUnregisterResetCallback + * Undo the effects of MemoryContextRegisterResetCallback. + * + * This can be used if a callback's effects are no longer required + * at some point before the context has been reset/deleted. It is the + * caller's responsibility to pfree the callback struct (if needed). + * + * An assertion failure occurs if the callback was not registered. 
+ * We could alternatively define that case as a no-op, but that seems too + * likely to mask programming errors such as passing the wrong context. + */ +void +MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb) +{ + MemoryContextCallback *prev, + *cur; + + Assert(MemoryContextIsValid(context)); + + for (prev = NULL, cur = context->reset_cbs; cur != NULL; + prev = cur, cur = cur->next) + { + if (cur != cb) + continue; + if (prev) + prev->next = cur->next; + else + context->reset_cbs = cur->next; + return; + } + Assert(false); +} + +/* * MemoryContextCallResetCallbacks * Internal function to call all registered callbacks for context. */ @@ -862,19 +877,11 @@ MemoryContextStatsDetail(MemoryContext context, bool print_to_stderr) { MemoryContextCounters grand_totals; - int num_contexts; - PrintDestination print_location; memset(&grand_totals, 0, sizeof(grand_totals)); - if (print_to_stderr) - print_location = PRINT_STATS_TO_STDERR; - else - print_location = PRINT_STATS_TO_LOGS; - - /* num_contexts report number of contexts aggregated in the output */ MemoryContextStatsInternal(context, 1, max_level, max_children, - &grand_totals, print_location, &num_contexts); + &grand_totals, print_to_stderr); if (print_to_stderr) fprintf(stderr, @@ -909,14 +916,13 @@ MemoryContextStatsDetail(MemoryContext context, * One recursion level for MemoryContextStats * * Print stats for this context if possible, but in any case accumulate counts - * into *totals (if not NULL). The callers should make sure that print_location - * is set to PRINT_STATS_TO_STDERR or PRINT_STATS_TO_LOGS or PRINT_STATS_NONE. + * into *totals (if not NULL). */ static void MemoryContextStatsInternal(MemoryContext context, int level, int max_level, int max_children, MemoryContextCounters *totals, - PrintDestination print_location, int *num_contexts) + bool print_to_stderr) { MemoryContext child; int ichild; @@ -924,39 +930,10 @@ MemoryContextStatsInternal(MemoryContext context, int level, Assert(MemoryContextIsValid(context)); /* Examine the context itself */ - switch (print_location) - { - case PRINT_STATS_TO_STDERR: - context->methods->stats(context, - MemoryContextStatsPrint, - &level, - totals, true); - break; - - case PRINT_STATS_TO_LOGS: - context->methods->stats(context, - MemoryContextStatsPrint, - &level, - totals, false); - break; - - case PRINT_STATS_NONE: - - /* - * Do not print the statistics if print_location is - * PRINT_STATS_NONE, only compute totals. This is used in - * reporting of memory context statistics via a sql function. Last - * parameter is not relevant. - */ - context->methods->stats(context, - NULL, - NULL, - totals, false); - break; - } - - /* Increment the context count for each of the recursive call */ - *num_contexts = *num_contexts + 1; + context->methods->stats(context, + MemoryContextStatsPrint, + &level, + totals, print_to_stderr); /* * Examine children. @@ -976,7 +953,7 @@ MemoryContextStatsInternal(MemoryContext context, int level, MemoryContextStatsInternal(child, level + 1, max_level, max_children, totals, - print_location, num_contexts); + print_to_stderr); } } @@ -995,13 +972,7 @@ MemoryContextStatsInternal(MemoryContext context, int level, child = MemoryContextTraverseNext(child, context); } - /* - * Add the count of children contexts which are traversed in the - * non-recursive manner. 
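A usage sketch for the register/unregister pair defined above (the callback body and its trigger condition are invented for illustration; allocating the callback struct in the same context is the usual pattern):

    #include "postgres.h"
    #include "utils/memutils.h"

    static void
    release_resource(void *arg)
    {
        /* clean up whatever external state arg points to */
    }

    static void
    example(MemoryContext cxt)
    {
        MemoryContextCallback *cb;

        cb = MemoryContextAlloc(cxt, sizeof(MemoryContextCallback));
        cb->func = release_resource;
        cb->arg = NULL;
        MemoryContextRegisterResetCallback(cxt, cb);

        /* ... later, if the cleanup turns out to be unnecessary ... */
        MemoryContextUnregisterResetCallback(cxt, cb);
        pfree(cb);          /* caller's responsibility, per the comment above */
    }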
- */ - *num_contexts = *num_contexts + ichild; - - if (print_location == PRINT_STATS_TO_STDERR) + if (print_to_stderr) { for (int i = 0; i < level; i++) fprintf(stderr, " "); @@ -1014,7 +985,7 @@ MemoryContextStatsInternal(MemoryContext context, int level, local_totals.freechunks, local_totals.totalspace - local_totals.freespace); } - else if (print_location == PRINT_STATS_TO_LOGS) + else ereport(LOG_SERVER_ONLY, (errhidestmt(true), errhidecontext(true), @@ -1212,8 +1183,6 @@ MemoryContextCreate(MemoryContext node, node->nextchild = NULL; node->allowInCritSection = false; } - - VALGRIND_CREATE_MEMPOOL(node, 0, false); } /* @@ -1356,22 +1325,6 @@ HandleLogMemoryContextInterrupt(void) } /* - * HandleGetMemoryContextInterrupt - * Handle receipt of an interrupt indicating a request to publish memory - * contexts statistics. - * - * All the actual work is deferred to ProcessGetMemoryContextInterrupt() as - * this cannot be performed in a signal handler. - */ -void -HandleGetMemoryContextInterrupt(void) -{ - InterruptPending = true; - PublishMemoryContextPending = true; - /* latch will be set by procsignal_sigusr1_handler */ -} - -/* * ProcessLogMemoryContextInterrupt * Perform logging of memory contexts of this backend process. * @@ -1408,539 +1361,6 @@ ProcessLogMemoryContextInterrupt(void) MemoryContextStatsDetail(TopMemoryContext, 100, 100, false); } -/* - * ProcessGetMemoryContextInterrupt - * Generate information about memory contexts used by the process. - * - * Performs a breadth first search on the memory context tree, thus parents - * statistics are reported before their children in the monitoring function - * output. - * - * Statistics for all the processes are shared via the same dynamic shared - * area. Statistics written by each process are tracked independently in - * per-process DSA pointers. These pointers are stored in static shared memory. - * - * We calculate maximum number of context's statistics that can be displayed - * using a pre-determined limit for memory available per process for this - * utility maximum size of statistics for each context. The remaining context - * statistics if any are captured as a cumulative total at the end of - * individual context's statistics. - * - * If summary is true, we capture the level 1 and level 2 contexts - * statistics. For that we traverse the memory context tree recursively in - * depth first search manner to cover all the children of a parent context, to - * be able to display a cumulative total of memory consumption by a parent at - * level 2 and all its children. - */ -void -ProcessGetMemoryContextInterrupt(void) -{ - List *contexts; - HASHCTL ctl; - HTAB *context_id_lookup; - int context_id = 0; - MemoryStatsEntry *meminfo; - bool summary = false; - int max_stats; - int idx = MyProcNumber; - int stats_count = 0; - int stats_num = 0; - MemoryContextCounters stat; - int num_individual_stats = 0; - - PublishMemoryContextPending = false; - - /* - * The hash table is used for constructing "path" column of the view, - * similar to its local backend counterpart. - */ - ctl.keysize = sizeof(MemoryContext); - ctl.entrysize = sizeof(MemoryStatsContextId); - ctl.hcxt = CurrentMemoryContext; - - context_id_lookup = hash_create("pg_get_remote_backend_memory_contexts", - 256, - &ctl, - HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); - - /* List of contexts to process in the next round - start at the top. 
*/ - contexts = list_make1(TopMemoryContext); - - /* Compute the number of stats that can fit in the defined limit */ - max_stats = - MEMORY_CONTEXT_REPORT_MAX_PER_BACKEND / MAX_MEMORY_CONTEXT_STATS_SIZE; - LWLockAcquire(&memCxtState[idx].lw_lock, LW_EXCLUSIVE); - summary = memCxtState[idx].summary; - LWLockRelease(&memCxtState[idx].lw_lock); - - /* - * Traverse the memory context tree to find total number of contexts. If - * summary is requested report the total number of contexts at level 1 and - * 2 from the top. Also, populate the hash table of context ids. - */ - compute_contexts_count_and_ids(contexts, context_id_lookup, &stats_count, - summary); - - /* - * Allocate memory in this process's DSA for storing statistics of the - * memory contexts upto max_stats, for contexts that don't fit within a - * limit, a cumulative total is written as the last record in the DSA - * segment. - */ - stats_num = Min(stats_count, max_stats); - - LWLockAcquire(&memCxtArea->lw_lock, LW_EXCLUSIVE); - - /* - * Create a DSA and send handle to the client process after storing the - * context statistics. If number of contexts exceed a predefined limit - * (1MB), a cumulative total is stored for such contexts. - */ - if (memCxtArea->memstats_dsa_handle == DSA_HANDLE_INVALID) - { - MemoryContext oldcontext = CurrentMemoryContext; - dsa_handle handle; - - MemoryContextSwitchTo(TopMemoryContext); - - MemoryStatsDsaArea = dsa_create(memCxtArea->lw_lock.tranche); - - handle = dsa_get_handle(MemoryStatsDsaArea); - MemoryContextSwitchTo(oldcontext); - - dsa_pin_mapping(MemoryStatsDsaArea); - - /* - * Pin the DSA area, this is to make sure the area remains attachable - * even if the backend that created it exits. This is done so that the - * statistics are published even if the process exits while a client - * is waiting. Also, other processes that publish statistics will use - * the same area. - */ - dsa_pin(MemoryStatsDsaArea); - - /* Set the handle in shared memory */ - memCxtArea->memstats_dsa_handle = handle; - } - - /* - * If DSA exists, created by another process publishing statistics, attach - * to it. - */ - else if (MemoryStatsDsaArea == NULL) - { - MemoryContext oldcontext = CurrentMemoryContext; - - MemoryContextSwitchTo(TopMemoryContext); - MemoryStatsDsaArea = dsa_attach(memCxtArea->memstats_dsa_handle); - MemoryContextSwitchTo(oldcontext); - dsa_pin_mapping(MemoryStatsDsaArea); - } - LWLockRelease(&memCxtArea->lw_lock); - - /* - * Hold the process lock to protect writes to process specific memory. Two - * processes publishing statistics do not block each other. - */ - LWLockAcquire(&memCxtState[idx].lw_lock, LW_EXCLUSIVE); - memCxtState[idx].proc_id = MyProcPid; - - if (DsaPointerIsValid(memCxtState[idx].memstats_dsa_pointer)) - { - /* - * Free any previous allocations, free the name, ident and path - * pointers before freeing the pointer that contains them. - */ - free_memorycontextstate_dsa(MemoryStatsDsaArea, memCxtState[idx].total_stats, - memCxtState[idx].memstats_dsa_pointer); - } - - /* - * Assigning total stats before allocating memory so that memory cleanup - * can run if any subsequent dsa_allocate call to allocate name/ident/path - * fails. 
- */ - memCxtState[idx].total_stats = stats_num; - memCxtState[idx].memstats_dsa_pointer = - dsa_allocate0(MemoryStatsDsaArea, stats_num * sizeof(MemoryStatsEntry)); - - meminfo = (MemoryStatsEntry *) - dsa_get_address(MemoryStatsDsaArea, memCxtState[idx].memstats_dsa_pointer); - - if (summary) - { - int cxt_id = 0; - List *path = NIL; - - /* Copy TopMemoryContext statistics to DSA */ - memset(&stat, 0, sizeof(stat)); - (*TopMemoryContext->methods->stats) (TopMemoryContext, NULL, NULL, - &stat, true); - path = lcons_int(1, path); - PublishMemoryContext(meminfo, cxt_id, TopMemoryContext, path, stat, - 1, MemoryStatsDsaArea, 100); - cxt_id = cxt_id + 1; - - /* - * Copy statistics for each of TopMemoryContexts children. This - * includes statistics of at most 100 children per node, with each - * child node limited to a depth of 100 in its subtree. - */ - for (MemoryContext c = TopMemoryContext->firstchild; c != NULL; - c = c->nextchild) - { - MemoryContextCounters grand_totals; - int num_contexts = 0; - - path = NIL; - memset(&grand_totals, 0, sizeof(grand_totals)); - - MemoryContextStatsInternal(c, 1, 100, 100, &grand_totals, - PRINT_STATS_NONE, &num_contexts); - - path = compute_context_path(c, context_id_lookup); - - /* - * Register the stats entry first, that way the cleanup handler - * can reach it in case of allocation failures of one or more - * members. - */ - memCxtState[idx].total_stats = cxt_id++; - PublishMemoryContext(meminfo, cxt_id, c, path, - grand_totals, num_contexts, MemoryStatsDsaArea, 100); - } - memCxtState[idx].total_stats = cxt_id; - - /* Notify waiting backends and return */ - end_memorycontext_reporting(); - - hash_destroy(context_id_lookup); - - return; - } - - foreach_ptr(MemoryContextData, cur, contexts) - { - List *path = NIL; - - /* - * Figure out the transient context_id of this context and each of its - * ancestors, to compute a path for this context. - */ - path = compute_context_path(cur, context_id_lookup); - - /* Examine the context stats */ - memset(&stat, 0, sizeof(stat)); - (*cur->methods->stats) (cur, NULL, NULL, &stat, true); - - /* Account for saving one statistics slot for cumulative reporting */ - if (context_id < (max_stats - 1) || stats_count <= max_stats) - { - /* Copy statistics to DSA memory */ - PublishMemoryContext(meminfo, context_id, cur, path, stat, 1, MemoryStatsDsaArea, 100); - } - else - { - meminfo[max_stats - 1].totalspace += stat.totalspace; - meminfo[max_stats - 1].nblocks += stat.nblocks; - meminfo[max_stats - 1].freespace += stat.freespace; - meminfo[max_stats - 1].freechunks += stat.freechunks; - } - - /* - * DSA max limit per process is reached, write aggregate of the - * remaining statistics. - * - * We can store contexts from 0 to max_stats - 1. When stats_count is - * greater than max_stats, we stop reporting individual statistics - * when context_id equals max_stats - 2. As we use max_stats - 1 array - * slot for reporting cumulative statistics or "Remaining Totals". 
- */ - if (stats_count > max_stats && context_id == (max_stats - 2)) - { - char *nameptr; - int namelen = strlen("Remaining Totals"); - - num_individual_stats = context_id + 1; - meminfo[max_stats - 1].name = dsa_allocate(MemoryStatsDsaArea, namelen + 1); - nameptr = dsa_get_address(MemoryStatsDsaArea, meminfo[max_stats - 1].name); - strlcpy(nameptr, "Remaining Totals", namelen + 1); - meminfo[max_stats - 1].ident = InvalidDsaPointer; - meminfo[max_stats - 1].path = InvalidDsaPointer; - meminfo[max_stats - 1].type = 0; - } - context_id++; - } - - /* - * Statistics are not aggregated, i.e individual statistics reported when - * stats_count <= max_stats. - */ - if (stats_count <= max_stats) - { - memCxtState[idx].total_stats = context_id; - } - /* Report number of aggregated memory contexts */ - else - { - meminfo[max_stats - 1].num_agg_stats = context_id - - num_individual_stats; - - /* - * Total stats equals num_individual_stats + 1 record for cumulative - * statistics. - */ - memCxtState[idx].total_stats = num_individual_stats + 1; - } - - /* Notify waiting backends and return */ - end_memorycontext_reporting(); - - hash_destroy(context_id_lookup); -} - -/* - * Update timestamp and signal all the waiting client backends after copying - * all the statistics. - */ -static void -end_memorycontext_reporting(void) -{ - memCxtState[MyProcNumber].stats_timestamp = GetCurrentTimestamp(); - LWLockRelease(&memCxtState[MyProcNumber].lw_lock); - ConditionVariableBroadcast(&memCxtState[MyProcNumber].memcxt_cv); -} - -/* - * compute_context_path - * - * Append the transient context_id of this context and each of its ancestors - * to a list, in order to compute a path. - */ -static List * -compute_context_path(MemoryContext c, HTAB *context_id_lookup) -{ - bool found; - List *path = NIL; - MemoryContext cur_context; - - for (cur_context = c; cur_context != NULL; cur_context = cur_context->parent) - { - MemoryStatsContextId *cur_entry; - - cur_entry = hash_search(context_id_lookup, &cur_context, HASH_FIND, &found); - - if (!found) - elog(ERROR, "hash table corrupted, can't construct path value"); - - path = lcons_int(cur_entry->context_id, path); - } - - return path; -} - -/* - * Return the number of contexts allocated currently by the backend - * Assign context ids to each of the contexts. - */ -static void -compute_contexts_count_and_ids(List *contexts, HTAB *context_id_lookup, - int *stats_count, bool summary) -{ - foreach_ptr(MemoryContextData, cur, contexts) - { - MemoryStatsContextId *entry; - bool found; - - entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &cur, - HASH_ENTER, &found); - Assert(!found); - - /* - * context id starts with 1 so increment the stats_count before - * assigning. - */ - entry->context_id = ++(*stats_count); - - /* Append the children of the current context to the main list. */ - for (MemoryContext c = cur->firstchild; c != NULL; c = c->nextchild) - { - if (summary) - { - entry = (MemoryStatsContextId *) hash_search(context_id_lookup, &c, - HASH_ENTER, &found); - Assert(!found); - - entry->context_id = ++(*stats_count); - } - - contexts = lappend(contexts, c); - } - - /* - * In summary mode only the first two level (from top) contexts are - * displayed. 
- */ - if (summary) - break; - } -} - -/* - * PublishMemoryContext - * - * Copy the memory context statistics of a single context to a DSA memory - */ -static void -PublishMemoryContext(MemoryStatsEntry *memcxt_info, int curr_id, - MemoryContext context, List *path, - MemoryContextCounters stat, int num_contexts, - dsa_area *area, int max_levels) -{ - const char *ident = context->ident; - const char *name = context->name; - int *path_list; - - /* - * To be consistent with logging output, we label dynahash contexts with - * just the hash table name as with MemoryContextStatsPrint(). - */ - if (context->ident && strncmp(context->name, "dynahash", 8) == 0) - { - name = context->ident; - ident = NULL; - } - - if (name != NULL) - { - int namelen = strlen(name); - char *nameptr; - - if (strlen(name) >= MEMORY_CONTEXT_IDENT_SHMEM_SIZE) - namelen = pg_mbcliplen(name, namelen, - MEMORY_CONTEXT_IDENT_SHMEM_SIZE - 1); - - memcxt_info[curr_id].name = dsa_allocate(area, namelen + 1); - nameptr = (char *) dsa_get_address(area, memcxt_info[curr_id].name); - strlcpy(nameptr, name, namelen + 1); - } - else - memcxt_info[curr_id].name = InvalidDsaPointer; - - /* Trim and copy the identifier if it is not set to NULL */ - if (ident != NULL) - { - int idlen = strlen(context->ident); - char *identptr; - - /* - * Some identifiers such as SQL query string can be very long, - * truncate oversize identifiers. - */ - if (idlen >= MEMORY_CONTEXT_IDENT_SHMEM_SIZE) - idlen = pg_mbcliplen(ident, idlen, - MEMORY_CONTEXT_IDENT_SHMEM_SIZE - 1); - - memcxt_info[curr_id].ident = dsa_allocate(area, idlen + 1); - identptr = (char *) dsa_get_address(area, memcxt_info[curr_id].ident); - strlcpy(identptr, ident, idlen + 1); - } - else - memcxt_info[curr_id].ident = InvalidDsaPointer; - - /* Allocate DSA memory for storing path information */ - if (path == NIL) - memcxt_info[curr_id].path = InvalidDsaPointer; - else - { - int levels = Min(list_length(path), max_levels); - - memcxt_info[curr_id].path_length = levels; - memcxt_info[curr_id].path = dsa_allocate0(area, levels * sizeof(int)); - memcxt_info[curr_id].levels = list_length(path); - path_list = (int *) dsa_get_address(area, memcxt_info[curr_id].path); - - foreach_int(i, path) - { - path_list[foreach_current_index(i)] = i; - if (--levels == 0) - break; - } - } - memcxt_info[curr_id].type = context->type; - memcxt_info[curr_id].totalspace = stat.totalspace; - memcxt_info[curr_id].nblocks = stat.nblocks; - memcxt_info[curr_id].freespace = stat.freespace; - memcxt_info[curr_id].freechunks = stat.freechunks; - memcxt_info[curr_id].num_agg_stats = num_contexts; -} - -/* - * free_memorycontextstate_dsa - * - * Worker for freeing resources from a MemoryStatsEntry. Callers are - * responsible for ensuring that the DSA pointer is valid. - */ -static void -free_memorycontextstate_dsa(dsa_area *area, int total_stats, - dsa_pointer prev_dsa_pointer) -{ - MemoryStatsEntry *meminfo; - - meminfo = (MemoryStatsEntry *) dsa_get_address(area, prev_dsa_pointer); - Assert(meminfo != NULL); - for (int i = 0; i < total_stats; i++) - { - if (DsaPointerIsValid(meminfo[i].name)) - dsa_free(area, meminfo[i].name); - - if (DsaPointerIsValid(meminfo[i].ident)) - dsa_free(area, meminfo[i].ident); - - if (DsaPointerIsValid(meminfo[i].path)) - dsa_free(area, meminfo[i].path); - } - - dsa_free(area, memCxtState[MyProcNumber].memstats_dsa_pointer); - memCxtState[MyProcNumber].memstats_dsa_pointer = InvalidDsaPointer; -} - -/* - * Free the memory context statistics stored by this process - * in DSA area. 
- */ -void -AtProcExit_memstats_cleanup(int code, Datum arg) -{ - int idx = MyProcNumber; - - if (memCxtArea->memstats_dsa_handle == DSA_HANDLE_INVALID) - return; - - LWLockAcquire(&memCxtState[idx].lw_lock, LW_EXCLUSIVE); - - if (!DsaPointerIsValid(memCxtState[idx].memstats_dsa_pointer)) - { - LWLockRelease(&memCxtState[idx].lw_lock); - return; - } - - /* If the dsa mapping could not be found, attach to the area */ - if (MemoryStatsDsaArea == NULL) - MemoryStatsDsaArea = dsa_attach(memCxtArea->memstats_dsa_handle); - - /* - * Free the memory context statistics, free the name, ident and path - * pointers before freeing the pointer that contains these pointers and - * integer statistics. - */ - free_memorycontextstate_dsa(MemoryStatsDsaArea, memCxtState[idx].total_stats, - memCxtState[idx].memstats_dsa_pointer); - - dsa_detach(MemoryStatsDsaArea); - LWLockRelease(&memCxtState[idx].lw_lock); -} - void * palloc(Size size) { @@ -2045,7 +1465,13 @@ MemoryContextAllocAligned(MemoryContext context, void *unaligned; void *aligned; - /* wouldn't make much sense to waste that much space */ + /* + * Restrict alignto to ensure that it can fit into the "value" field of + * the redirection MemoryChunk, and that the distance back to the start of + * the unaligned chunk will fit into the space available for that. This + * isn't a limitation in practice, since it wouldn't make much sense to + * waste that much space. + */ Assert(alignto < (128 * 1024 * 1024)); /* ensure alignto is a power of 2 */ @@ -2082,10 +1508,15 @@ MemoryContextAllocAligned(MemoryContext context, alloc_size += 1; #endif - /* perform the actual allocation */ - unaligned = MemoryContextAllocExtended(context, alloc_size, flags); + /* + * Perform the actual allocation, but do not pass down MCXT_ALLOC_ZERO. + * This ensures that wasted bytes beyond the aligned chunk do not become + * DEFINED. + */ + unaligned = MemoryContextAllocExtended(context, alloc_size, + flags & ~MCXT_ALLOC_ZERO); - /* set the aligned pointer */ + /* compute the aligned pointer */ aligned = (void *) TYPEALIGN(alignto, (char *) unaligned + sizeof(MemoryChunk)); @@ -2113,12 +1544,23 @@ MemoryContextAllocAligned(MemoryContext context, set_sentinel(aligned, size); #endif - /* Mark the bytes before the redirection header as noaccess */ - VALGRIND_MAKE_MEM_NOACCESS(unaligned, - (char *) alignedchunk - (char *) unaligned); + /* + * MemoryContextAllocExtended marked the whole unaligned chunk as a + * vchunk. Undo that, instead making just the aligned chunk be a vchunk. + * This prevents Valgrind from complaining that the vchunk is possibly + * leaked, since only pointers to the aligned chunk will exist. + * + * After these calls, the aligned chunk will be marked UNDEFINED, and all + * the rest of the unaligned chunk (the redirection chunk header, the + * padding bytes before it, and any wasted trailing bytes) will be marked + * NOACCESS, which is what we want. + */ + VALGRIND_MEMPOOL_FREE(context, unaligned); + VALGRIND_MEMPOOL_ALLOC(context, aligned, size); - /* Disallow access to the redirection chunk header. 
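The pointer arithmetic behind the redirection scheme being adjusted here is easy to state in isolation. A standalone sketch (ALIGN_UP mirrors what PostgreSQL's TYPEALIGN macro does; the names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* round addr up to the next multiple of alignto, a power of two */
    #define ALIGN_UP(alignto, addr) \
        (((uintptr_t) (addr) + ((alignto) - 1)) & ~((uintptr_t) ((alignto) - 1)))

    /*
     * Place the aligned chunk past a redirection header inside the larger
     * unaligned allocation, as MemoryContextAllocAligned does.  Zeroing, if
     * requested, is applied afterwards to just the aligned region, so the
     * wasted bytes around it never become DEFINED under Valgrind.
     */
    static inline void *
    aligned_within(void *unaligned, size_t hdrsize, size_t alignto)
    {
        return (void *) ALIGN_UP(alignto, (char *) unaligned + hdrsize);
    }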
*/ - VALGRIND_MAKE_MEM_NOACCESS(alignedchunk, sizeof(MemoryChunk)); + /* Now zero (and make DEFINED) just the aligned chunk, if requested */ + if ((flags & MCXT_ALLOC_ZERO) != 0) + MemSetAligned(aligned, 0, size); return aligned; } @@ -2152,16 +1594,12 @@ void pfree(void *pointer) { #ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); MemoryContext context = GetMemoryChunkContext(pointer); #endif MCXT_METHOD(pointer, free_p) (pointer); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_FREE(context, pointer); -#endif + VALGRIND_MEMPOOL_FREE(context, pointer); } /* @@ -2171,9 +1609,6 @@ pfree(void *pointer) void * repalloc(void *pointer, Size size) { -#ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); -#endif #if defined(USE_ASSERT_CHECKING) || defined(USE_VALGRIND) MemoryContext context = GetMemoryChunkContext(pointer); #endif @@ -2196,10 +1631,7 @@ repalloc(void *pointer, Size size) */ ret = MCXT_METHOD(pointer, realloc) (pointer, size, 0); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); -#endif + VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); return ret; } diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index e3526e78064..0be1c2b0fff 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -284,8 +284,7 @@ PortalDefineQuery(Portal portal, const char *sourceText, CommandTag commandTag, List *stmts, - CachedPlan *cplan, - CachedPlanSource *plansource) + CachedPlan *cplan) { Assert(PortalIsValid(portal)); Assert(portal->status == PORTAL_NEW); @@ -300,7 +299,6 @@ PortalDefineQuery(Portal portal, portal->commandTag = commandTag; portal->stmts = stmts; portal->cplan = cplan; - portal->plansource = plansource; portal->status = PORTAL_DEFINED; } diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c index d32c0d318fb..0e35abcf5a0 100644 --- a/src/backend/utils/mmgr/slab.c +++ b/src/backend/utils/mmgr/slab.c @@ -377,6 +377,11 @@ SlabContextCreate(MemoryContext parent, * we'd leak the header if we ereport in this stretch. */ + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(slab, 0, false); + /* This vchunk covers the SlabContext only */ + VALGRIND_MEMPOOL_ALLOC(slab, slab, sizeof(SlabContext)); + /* Fill in SlabContext-specific header fields */ slab->chunkSize = (uint32) chunkSize; slab->fullChunkSize = (uint32) fullChunkSize; @@ -451,6 +456,10 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } @@ -467,11 +476,23 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the SlabContext. This gets rid of + * the vchunks for whatever user data is getting discarded by the context + * reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(slab, slab, sizeof(SlabContext)); + slab->curBlocklistIndex = 0; Assert(context->mem_allocated == 0); @@ -486,6 +507,10 @@ SlabDelete(MemoryContext context) { /* Reset to release all the SlabBlocks */ SlabReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header */ free(context); } @@ -567,6 +592,9 @@ SlabAllocFromNewBlock(MemoryContext context, Size size, int flags) if (unlikely(block == NULL)) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(slab, block, Slab_BLOCKHDRSZ); + block->slab = slab; context->mem_allocated += slab->blockSize; @@ -795,6 +823,10 @@ SlabFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); slab->header.mem_allocated -= slab->blockSize; } diff --git a/src/bin/initdb/Makefile b/src/bin/initdb/Makefile index 997e0a013e9..c0470efda92 100644 --- a/src/bin/initdb/Makefile +++ b/src/bin/initdb/Makefile @@ -20,7 +20,7 @@ include $(top_builddir)/src/Makefile.global # from libpq, else we have risks of version skew if we run with a libpq # shared library from a different PG version. Define # USE_PRIVATE_ENCODING_FUNCS to ensure that that happens. -override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS) +override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS) $(ICU_CFLAGS) # We need libpq only because fe_utils does. LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS) diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 15dd10ce40a..b7ef7ed8d06 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -76,7 +76,8 @@ command_like( 'checksums are enabled in control file'); command_ok([ 'initdb', '--sync-only', $datadir ], 'sync only'); -command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], '--no-sync-data-files'); +command_ok([ 'initdb', '--sync-only', '--no-sync-data-files', $datadir ], + '--no-sync-data-files'); command_fails([ 'initdb', $datadir ], 'existing data directory'); if ($supports_syncfs) diff --git a/src/bin/pg_amcheck/t/004_verify_heapam.pl b/src/bin/pg_amcheck/t/004_verify_heapam.pl index 2a3af2666f5..72693660fb6 100644 --- a/src/bin/pg_amcheck/t/004_verify_heapam.pl +++ b/src/bin/pg_amcheck/t/004_verify_heapam.pl @@ -529,7 +529,7 @@ for (my $tupidx = 0; $tupidx < $ROWCOUNT; $tupidx++) $tup->{t_infomask2} |= HEAP_NATTS_MASK; push @expected, - qr/${$header}number of attributes 2047 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 2047 exceeds maximum 3 expected for table/; } elsif ($offnum == 10) { @@ -552,7 +552,7 @@ for (my $tupidx = 0; $tupidx < $ROWCOUNT; $tupidx++) $tup->{t_hoff} = 32; push @expected, - qr/${$header}number of attributes 67 exceeds maximum expected for table 3/; + qr/${$header}number of attributes 67 exceeds maximum 3 expected for table/; } elsif ($offnum == 12) { diff --git a/src/bin/pg_basebackup/meson.build b/src/bin/pg_basebackup/meson.build index 8a1c96b4f5c..3a7fc10eab0 100644 --- a/src/bin/pg_basebackup/meson.build +++ b/src/bin/pg_basebackup/meson.build @@ -93,9 +93,9 @@ tests += { 'sd': meson.current_source_dir(), 'bd': 
meson.current_build_dir(), 'tap': { - 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'TAR': tar.found() ? tar.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', + 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'TAR': tar.found() ? tar.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', }, 'tests': [ 't/010_pg_basebackup.pl', diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index eb7354200bc..0a3ca4315de 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -35,6 +35,7 @@ #include "fe_utils/option_utils.h" #include "fe_utils/recovery_gen.h" #include "getopt_long.h" +#include "libpq/protocol.h" #include "receivelog.h" #include "streamutil.h" @@ -487,7 +488,7 @@ reached_end_position(XLogRecPtr segendpos, uint32 timeline, if (r < 0) pg_fatal("could not read from ready pipe: %m"); - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + if (sscanf(xlogend, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", xlogend); xlogendptr = ((uint64) hi) << 32 | lo; @@ -629,7 +630,7 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier, param->wal_compress_level = wal_compress_level; /* Convert the starting position */ - if (sscanf(startpos, "%X/%X", &hi, &lo) != 2) + if (sscanf(startpos, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", startpos); param->startptr = ((uint64) hi) << 32 | lo; @@ -1338,7 +1339,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) /* Each CopyData message begins with a type byte. */ switch (GetCopyDataByte(r, copybuf, &cursor)) { - case 'n': + case PqBackupMsg_NewArchive: { /* New archive. */ char *archive_name; @@ -1410,7 +1411,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) break; } - case 'd': + case PqMsg_CopyData: { /* Archive or manifest data. */ if (state->manifest_buffer != NULL) @@ -1446,7 +1447,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) break; } - case 'p': + case PqBackupMsg_ProgressReport: { /* * Progress report. @@ -1465,7 +1466,7 @@ ReceiveArchiveStreamChunk(size_t r, char *copybuf, void *callback_data) break; } - case 'm': + case PqBackupMsg_Manifest: { /* * Manifest data will be sent next. This message is not @@ -2255,7 +2256,7 @@ BaseBackup(char *compression_algorithm, char *compression_detail, * value directly in the variable, and then set the flag that says * it's there. 
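This hunk, and the many like it below, change the textual LSN format so the low word is always printed and parsed as eight zero-padded hex digits. A round-trip sketch of the convention (an XLogRecPtr is a 64-bit value split into 32-bit halves, as LSN_FORMAT_ARGS does):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t XLogRecPtr;

    static int
    parse_lsn(const char *str, XLogRecPtr *lsn)
    {
        unsigned int hi, lo;

        if (sscanf(str, "%X/%08X", &hi, &lo) != 2)
            return -1;          /* could not parse */
        *lsn = ((uint64_t) hi) << 32 | lo;
        return 0;
    }

    static void
    format_lsn(char *buf, size_t buflen, XLogRecPtr lsn)
    {
        /* e.g. what used to render as 0/1A2B3C4 now renders as 0/01A2B3C4 */
        snprintf(buf, buflen, "%X/%08X",
                 (unsigned int) (lsn >> 32), (unsigned int) lsn);
    }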
*/ - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + if (sscanf(xlogend, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse write-ahead log location \"%s\"", xlogend); xlogendptr = ((uint64) hi) << 32 | lo; diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index f65acc7cb11..3986882f042 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -46,7 +46,7 @@ struct CreateSubscriberOptions SimpleStringList replslot_names; /* list of replication slot names */ int recovery_timeout; /* stop recovery after this time */ bool all_dbs; /* all option */ - SimpleStringList objecttypes_to_remove; /* list of object types to remove */ + SimpleStringList objecttypes_to_clean; /* list of object types to cleanup */ }; /* per-database publication/subscription info */ @@ -71,8 +71,8 @@ struct LogicalRepInfos { struct LogicalRepInfo *dbinfo; bool two_phase; /* enable-two-phase option */ - bits32 objecttypes_to_remove; /* flags indicating which object types - * to remove on subscriber */ + bits32 objecttypes_to_clean; /* flags indicating which object types + * to clean up on subscriber */ }; static void cleanup_objects_atexit(void); @@ -247,19 +247,19 @@ usage(void) printf(_(" %s [OPTION]...\n"), progname); printf(_("\nOptions:\n")); printf(_(" -a, --all create subscriptions for all databases except template\n" - " databases or databases that don't allow connections\n")); + " databases and databases that don't allow connections\n")); printf(_(" -d, --database=DBNAME database in which to create a subscription\n")); printf(_(" -D, --pgdata=DATADIR location for the subscriber data directory\n")); printf(_(" -n, --dry-run dry run, just show what would be done\n")); printf(_(" -p, --subscriber-port=PORT subscriber port number (default %s)\n"), DEFAULT_SUB_PORT); printf(_(" -P, --publisher-server=CONNSTR publisher connection string\n")); - printf(_(" -R, --remove=OBJECTTYPE remove all objects of the specified type from specified\n" - " databases on the subscriber; accepts: publications\n")); printf(_(" -s, --socketdir=DIR socket directory to use (default current dir.)\n")); printf(_(" -t, --recovery-timeout=SECS seconds to wait for recovery to end\n")); printf(_(" -T, --enable-two-phase enable two-phase commit for all subscriptions\n")); printf(_(" -U, --subscriber-username=NAME user name for subscriber connection\n")); printf(_(" -v, --verbose output verbose messages\n")); + printf(_(" --clean=OBJECTTYPE drop all objects of the specified type from specified\n" + " databases on the subscriber; accepts: \"%s\"\n"), "publications"); printf(_(" --config-file=FILENAME use specified main server configuration\n" " file when running target cluster\n")); printf(_(" --publication=NAME publication name\n")); @@ -973,7 +973,7 @@ check_publisher(const struct LogicalRepInfo *dbinfo) pg_log_warning("two_phase option will not be enabled for replication slots"); pg_log_warning_detail("Subscriptions will be created with the two_phase option disabled. 
" "Prepared transactions will be replicated at COMMIT PREPARED."); - pg_log_warning_hint("You can use --enable-two-phase switch to enable two_phase."); + pg_log_warning_hint("You can use the command-line option --enable-two-phase to enable two_phase."); } /* @@ -1250,8 +1250,17 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c appendPQExpBufferStr(recoveryconfcontents, "recovery_target = ''\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_timeline = 'latest'\n"); + + /* + * Set recovery_target_inclusive = false to avoid reapplying the + * transaction committed at 'lsn' after subscription is enabled. This is + * because the provided 'lsn' is also used as the replication start point + * for the subscription. So, the server can send the transaction committed + * at that 'lsn' after replication is started which can lead to applying + * the same transaction twice if we keep recovery_target_inclusive = true. + */ appendPQExpBufferStr(recoveryconfcontents, - "recovery_target_inclusive = true\n"); + "recovery_target_inclusive = false\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_action = promote\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_name = ''\n"); @@ -1262,7 +1271,7 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c { appendPQExpBufferStr(recoveryconfcontents, "# dry run mode"); appendPQExpBuffer(recoveryconfcontents, - "recovery_target_lsn = '%X/%X'\n", + "recovery_target_lsn = '%X/%08X'\n", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); } else @@ -1730,7 +1739,7 @@ static void check_and_drop_publications(PGconn *conn, struct LogicalRepInfo *dbinfo) { PGresult *res; - bool drop_all_pubs = dbinfos.objecttypes_to_remove & OBJECTTYPE_PUBLICATIONS; + bool drop_all_pubs = dbinfos.objecttypes_to_clean & OBJECTTYPE_PUBLICATIONS; Assert(conn != NULL); @@ -1876,7 +1885,7 @@ set_replication_progress(PGconn *conn, const struct LogicalRepInfo *dbinfo, cons if (dry_run) { suboid = InvalidOid; - lsnstr = psprintf("%X/%X", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); + lsnstr = psprintf("%X/%08X", LSN_FORMAT_ARGS((XLogRecPtr) InvalidXLogRecPtr)); } else { @@ -2026,7 +2035,6 @@ main(int argc, char **argv) {"dry-run", no_argument, NULL, 'n'}, {"subscriber-port", required_argument, NULL, 'p'}, {"publisher-server", required_argument, NULL, 'P'}, - {"remove", required_argument, NULL, 'R'}, {"socketdir", required_argument, NULL, 's'}, {"recovery-timeout", required_argument, NULL, 't'}, {"enable-two-phase", no_argument, NULL, 'T'}, @@ -2038,6 +2046,7 @@ main(int argc, char **argv) {"publication", required_argument, NULL, 2}, {"replication-slot", required_argument, NULL, 3}, {"subscription", required_argument, NULL, 4}, + {"clean", required_argument, NULL, 5}, {NULL, 0, NULL, 0} }; @@ -2109,7 +2118,7 @@ main(int argc, char **argv) get_restricted_token(); - while ((c = getopt_long(argc, argv, "ad:D:np:P:R:s:t:TU:v", + while ((c = getopt_long(argc, argv, "ad:D:np:P:s:t:TU:v", long_options, &option_index)) != -1) { switch (c) @@ -2139,12 +2148,6 @@ main(int argc, char **argv) case 'P': opt.pub_conninfo_str = pg_strdup(optarg); break; - case 'R': - if (!simple_string_list_member(&opt.objecttypes_to_remove, optarg)) - simple_string_list_append(&opt.objecttypes_to_remove, optarg); - else - pg_fatal("object type \"%s\" is specified more than once for -R/--remove", optarg); - break; case 's': opt.socket_dir = pg_strdup(optarg); canonicalize_path(opt.socket_dir); @@ -2191,6 +2194,12 @@ 
main(int argc, char **argv) else pg_fatal("subscription \"%s\" specified more than once for --subscription", optarg); break; + case 5: + if (!simple_string_list_member(&opt.objecttypes_to_clean, optarg)) + simple_string_list_append(&opt.objecttypes_to_clean, optarg); + else + pg_fatal("object type \"%s\" specified more than once for --clean", optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); @@ -2214,7 +2223,7 @@ main(int argc, char **argv) if (bad_switch) { - pg_log_error("%s cannot be used with -a/--all", bad_switch); + pg_log_error("options %s and -a/--all cannot be used together", bad_switch); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } @@ -2334,14 +2343,14 @@ main(int argc, char **argv) } /* Verify the object types specified for removal from the subscriber */ - for (SimpleStringListCell *cell = opt.objecttypes_to_remove.head; cell; cell = cell->next) + for (SimpleStringListCell *cell = opt.objecttypes_to_clean.head; cell; cell = cell->next) { if (pg_strcasecmp(cell->val, "publications") == 0) - dbinfos.objecttypes_to_remove |= OBJECTTYPE_PUBLICATIONS; + dbinfos.objecttypes_to_clean |= OBJECTTYPE_PUBLICATIONS; else { - pg_log_error("invalid object type \"%s\" specified for -R/--remove", cell->val); - pg_log_error_hint("The valid option is: \"publications\""); + pg_log_error("invalid object type \"%s\" specified for --clean", cell->val); + pg_log_error_hint("The valid value is: \"%s\"", "publications"); exit(1); } } diff --git a/src/bin/pg_basebackup/pg_receivewal.c b/src/bin/pg_basebackup/pg_receivewal.c index e816cf58101..289ca14dcfe 100644 --- a/src/bin/pg_basebackup/pg_receivewal.c +++ b/src/bin/pg_basebackup/pg_receivewal.c @@ -188,14 +188,14 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) /* we assume that we get called once at the end of each segment */ if (verbose && segment_finished) - pg_log_info("finished segment at %X/%X (timeline %u)", + pg_log_info("finished segment at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(xlogpos), timeline); if (!XLogRecPtrIsInvalid(endpos) && endpos < xlogpos) { if (verbose) - pg_log_info("stopped log streaming at %X/%X (timeline %u)", + pg_log_info("stopped log streaming at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(xlogpos), timeline); time_to_stop = true; @@ -211,7 +211,7 @@ stop_streaming(XLogRecPtr xlogpos, uint32 timeline, bool segment_finished) * timeline, but it's close enough for reporting purposes. 
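The removed -R handling and the new case 5 above show getopt_long's two kinds of options: --clean has no single-letter form, so its val is a small integer that only the switch statement ever sees. A condensed, runnable sketch of that pattern (option set reduced for illustration):

    #include <getopt.h>
    #include <stdio.h>

    int
    main(int argc, char **argv)
    {
        static struct option long_options[] = {
            {"verbose", no_argument, NULL, 'v'},
            {"clean", required_argument, NULL, 5},  /* long-only option */
            {NULL, 0, NULL, 0}
        };
        int c;

        while ((c = getopt_long(argc, argv, "v", long_options, NULL)) != -1)
        {
            switch (c)
            {
                case 'v':
                    puts("verbose");
                    break;
                case 5:
                    printf("clean: %s\n", optarg);
                    break;
                default:
                    return 1;   /* getopt_long already complained */
            }
        }
        return 0;
    }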
*/ if (verbose && prevtimeline != 0 && prevtimeline != timeline) - pg_log_info("switched to timeline %u at %X/%X", + pg_log_info("switched to timeline %u at %X/%08X", timeline, LSN_FORMAT_ARGS(prevpos)); @@ -575,7 +575,7 @@ StreamLog(void) * Start the replication */ if (verbose) - pg_log_info("starting log streaming at %X/%X (timeline %u)", + pg_log_info("starting log streaming at %X/%08X (timeline %u)", LSN_FORMAT_ARGS(stream.startpos), stream.timeline); @@ -689,7 +689,7 @@ main(int argc, char **argv) basedir = pg_strdup(optarg); break; case 'E': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse end position \"%s\"", optarg); endpos = ((uint64) hi) << 32 | lo; break; diff --git a/src/bin/pg_basebackup/pg_recvlogical.c b/src/bin/pg_basebackup/pg_recvlogical.c index e6810efe5f0..7a4d1a2d2ca 100644 --- a/src/bin/pg_basebackup/pg_recvlogical.c +++ b/src/bin/pg_basebackup/pg_recvlogical.c @@ -24,6 +24,7 @@ #include "getopt_long.h" #include "libpq-fe.h" #include "libpq/pqsignal.h" +#include "libpq/protocol.h" #include "pqexpbuffer.h" #include "streamutil.h" @@ -41,8 +42,8 @@ typedef enum /* Global Options */ static char *outfile = NULL; static int verbose = 0; -static bool two_phase = false; -static bool failover = false; +static bool two_phase = false; /* enable-two-phase option */ +static bool failover = false; /* enable-failover option */ static int noloop = 0; static int standby_message_timeout = 10 * 1000; /* 10 sec = default */ static int fsync_interval = 10 * 1000; /* 10 sec = default */ @@ -89,9 +90,9 @@ usage(void) printf(_(" --drop-slot drop the replication slot (for the slot's name see --slot)\n")); printf(_(" --start start streaming in a replication slot (for the slot's name see --slot)\n")); printf(_("\nOptions:\n")); + printf(_(" --enable-failover enable replication slot synchronization to standby servers when\n" + " creating a replication slot\n")); printf(_(" -E, --endpos=LSN exit after receiving the specified LSN\n")); - printf(_(" --failover enable replication slot synchronization to standby servers when\n" - " creating a slot\n")); printf(_(" -f, --file=FILE receive log into this file, - for stdout\n")); printf(_(" -F --fsync-interval=SECS\n" " time between fsyncs to the output file (default: %d)\n"), (fsync_interval / 1000)); @@ -105,7 +106,8 @@ usage(void) printf(_(" -s, --status-interval=SECS\n" " time between status packets sent to server (default: %d)\n"), (standby_message_timeout / 1000)); printf(_(" -S, --slot=SLOTNAME name of the logical replication slot\n")); - printf(_(" -t, --two-phase enable decoding of prepared transactions when creating a slot\n")); + printf(_(" -t, --enable-two-phase enable decoding of prepared transactions when creating a slot\n")); + printf(_(" --two-phase (same as --enable-two-phase, deprecated)\n")); printf(_(" -v, --verbose output verbose messages\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" -?, --help show this help, then exit\n")); @@ -143,12 +145,12 @@ sendFeedback(PGconn *conn, TimestampTz now, bool force, bool replyRequested) return true; if (verbose) - pg_log_info("confirming write up to %X/%X, flush to %X/%X (slot %s)", + pg_log_info("confirming write up to %X/%08X, flush to %X/%08X (slot %s)", LSN_FORMAT_ARGS(output_written_lsn), LSN_FORMAT_ARGS(output_fsync_lsn), replication_slot); - replybuf[len] = 'r'; + replybuf[len] = PqReplMsg_StandbyStatusUpdate; len += 1; fe_sendint64(output_written_lsn, &replybuf[len]); /* write 
*/ len += 8; @@ -237,13 +239,13 @@ StreamLogicalLog(void) * Start the replication */ if (verbose) - pg_log_info("starting log streaming at %X/%X (slot %s)", + pg_log_info("starting log streaming at %X/%08X (slot %s)", LSN_FORMAT_ARGS(startpos), replication_slot); /* Initiate the replication stream at specified location */ query = createPQExpBuffer(); - appendPQExpBuffer(query, "START_REPLICATION SLOT \"%s\" LOGICAL %X/%X", + appendPQExpBuffer(query, "START_REPLICATION SLOT \"%s\" LOGICAL %X/%08X", replication_slot, LSN_FORMAT_ARGS(startpos)); /* print options if there are any */ @@ -453,7 +455,7 @@ StreamLogicalLog(void) } /* Check the message type. */ - if (copybuf[0] == 'k') + if (copybuf[0] == PqReplMsg_Keepalive) { int pos; bool replyRequested; @@ -465,7 +467,7 @@ StreamLogicalLog(void) * We just check if the server requested a reply, and ignore the * rest. */ - pos = 1; /* skip msgtype 'k' */ + pos = 1; /* skip msgtype PqReplMsg_Keepalive */ walEnd = fe_recvint64(©buf[pos]); output_written_lsn = Max(walEnd, output_written_lsn); @@ -508,7 +510,7 @@ StreamLogicalLog(void) continue; } - else if (copybuf[0] != 'w') + else if (copybuf[0] != PqReplMsg_WALData) { pg_log_error("unrecognized streaming header: \"%c\"", copybuf[0]); @@ -516,11 +518,11 @@ StreamLogicalLog(void) } /* - * Read the header of the XLogData message, enclosed in the CopyData + * Read the header of the WALData message, enclosed in the CopyData * message. We only need the WAL location field (dataStart), the rest * of the header is ignored. */ - hdr_len = 1; /* msgtype 'w' */ + hdr_len = 1; /* msgtype PqReplMsg_WALData */ hdr_len += 8; /* dataStart */ hdr_len += 8; /* walEnd */ hdr_len += 8; /* sendTime */ @@ -604,7 +606,7 @@ StreamLogicalLog(void) /* * We're doing a client-initiated clean exit and have sent CopyDone to * the server. Drain any messages, so we don't miss a last-minute - * ErrorResponse. The walsender stops generating XLogData records once + * ErrorResponse. The walsender stops generating WALData records once * it sees CopyDone, so expect this to finish quickly. After CopyDone, * it's too late for sendFeedback(), even if this were to take a long * time. Hence, use synchronous-mode PQgetCopyData(). 
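The feedback message built in sendFeedback above has a fixed layout: one type byte (now spelled PqReplMsg_StandbyStatusUpdate rather than a bare 'r'), three LSNs, a client timestamp, and a reply-requested flag. A packing sketch under the same layout, assuming a big-endian store helper equivalent to the fe_sendint64 used above:

    #include <stdint.h>

    /* store a 64-bit value in network (big-endian) byte order */
    static void
    send_int64(uint64_t v, char *buf)
    {
        for (int i = 0; i < 8; i++)
            buf[i] = (char) (v >> (56 - 8 * i));
    }

    static int
    build_status_update(char *buf, uint64_t written, uint64_t flushed,
                        uint64_t applied, int64_t now, int reply_requested)
    {
        int len = 0;

        buf[len] = 'r';                     /* PqReplMsg_StandbyStatusUpdate */
        len += 1;
        send_int64(written, &buf[len]);     /* write position */
        len += 8;
        send_int64(flushed, &buf[len]);     /* flush position */
        len += 8;
        send_int64(applied, &buf[len]);     /* apply position */
        len += 8;
        send_int64((uint64_t) now, &buf[len]);  /* client timestamp */
        len += 8;
        buf[len] = reply_requested ? 1 : 0;
        len += 1;
        return len;                         /* 34 bytes total */
    }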
@@ -698,9 +700,10 @@ main(int argc, char **argv) {"file", required_argument, NULL, 'f'}, {"fsync-interval", required_argument, NULL, 'F'}, {"no-loop", no_argument, NULL, 'n'}, - {"failover", no_argument, NULL, 5}, + {"enable-failover", no_argument, NULL, 5}, + {"enable-two-phase", no_argument, NULL, 't'}, + {"two-phase", no_argument, NULL, 't'}, /* deprecated */ {"verbose", no_argument, NULL, 'v'}, - {"two-phase", no_argument, NULL, 't'}, {"version", no_argument, NULL, 'V'}, {"help", no_argument, NULL, '?'}, /* connection options */ @@ -798,12 +801,12 @@ main(int argc, char **argv) break; /* replication options */ case 'I': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse start position \"%s\"", optarg); startpos = ((uint64) hi) << 32 | lo; break; case 'E': - if (sscanf(optarg, "%X/%X", &hi, &lo) != 2) + if (sscanf(optarg, "%X/%08X", &hi, &lo) != 2) pg_fatal("could not parse end position \"%s\"", optarg); endpos = ((uint64) hi) << 32 | lo; break; @@ -928,14 +931,14 @@ main(int argc, char **argv) { if (two_phase) { - pg_log_error("--two-phase may only be specified with --create-slot"); + pg_log_error("%s may only be specified with --create-slot", "--enable-two-phase"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } if (failover) { - pg_log_error("--failover may only be specified with --create-slot"); + pg_log_error("%s may only be specified with --create-slot", "--enable-failover"); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } @@ -1073,12 +1076,12 @@ prepareToTerminate(PGconn *conn, XLogRecPtr endpos, StreamStopReason reason, pg_log_info("received interrupt signal, exiting"); break; case STREAM_STOP_KEEPALIVE: - pg_log_info("end position %X/%X reached by keepalive", + pg_log_info("end position %X/%08X reached by keepalive", LSN_FORMAT_ARGS(endpos)); break; case STREAM_STOP_END_OF_WAL: Assert(!XLogRecPtrIsInvalid(lsn)); - pg_log_info("end position %X/%X reached by WAL record at %X/%X", + pg_log_info("end position %X/%08X reached by WAL record at %X/%08X", LSN_FORMAT_ARGS(endpos), LSN_FORMAT_ARGS(lsn)); break; case STREAM_STOP_NONE: diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c index 6b6e32dfbdf..25b13c7f55c 100644 --- a/src/bin/pg_basebackup/receivelog.c +++ b/src/bin/pg_basebackup/receivelog.c @@ -21,6 +21,7 @@ #include "access/xlog_internal.h" #include "common/logging.h" #include "libpq-fe.h" +#include "libpq/protocol.h" #include "receivelog.h" #include "streamutil.h" @@ -38,8 +39,8 @@ static int CopyStreamReceive(PGconn *conn, long timeout, pgsocket stop_socket, char **buffer); static bool ProcessKeepaliveMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, XLogRecPtr blockpos, TimestampTz *last_status); -static bool ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, - XLogRecPtr *blockpos); +static bool ProcessWALDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, + XLogRecPtr *blockpos); static PGresult *HandleEndOfCopyStream(PGconn *conn, StreamCtl *stream, char *copybuf, XLogRecPtr blockpos, XLogRecPtr *stoppos); static bool CheckCopyStreamStop(PGconn *conn, StreamCtl *stream, XLogRecPtr blockpos); @@ -338,7 +339,7 @@ sendFeedback(PGconn *conn, XLogRecPtr blockpos, TimestampTz now, bool replyReque char replybuf[1 + 8 + 8 + 8 + 8 + 1]; int len = 0; - replybuf[len] = 'r'; + replybuf[len] = PqReplMsg_StandbyStatusUpdate; len += 1; fe_sendint64(blockpos, 
&replybuf[len]); /* write */ len += 8; @@ -571,7 +572,7 @@ ReceiveXlogStream(PGconn *conn, StreamCtl *stream) return true; /* Initiate the replication stream at specified location */ - snprintf(query, sizeof(query), "START_REPLICATION %s%X/%X TIMELINE %u", + snprintf(query, sizeof(query), "START_REPLICATION %s%X/%08X TIMELINE %u", slotcmd, LSN_FORMAT_ARGS(stream->startpos), stream->timeline); @@ -628,7 +629,7 @@ ReceiveXlogStream(PGconn *conn, StreamCtl *stream) } if (stream->startpos > stoppos) { - pg_log_error("server stopped streaming timeline %u at %X/%X, but reported next timeline %u to begin at %X/%X", + pg_log_error("server stopped streaming timeline %u at %X/%08X, but reported next timeline %u to begin at %X/%08X", stream->timeline, LSN_FORMAT_ARGS(stoppos), newtimeline, LSN_FORMAT_ARGS(stream->startpos)); goto error; @@ -720,7 +721,7 @@ ReadEndOfStreamingResult(PGresult *res, XLogRecPtr *startpos, uint32 *timeline) } *timeline = atoi(PQgetvalue(res, 0, 0)); - if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &startpos_xlogid, + if (sscanf(PQgetvalue(res, 0, 1), "%X/%08X", &startpos_xlogid, &startpos_xrecoff) != 2) { pg_log_error("could not parse next timeline's starting point \"%s\"", @@ -823,15 +824,15 @@ HandleCopyStream(PGconn *conn, StreamCtl *stream, } /* Check the message type. */ - if (copybuf[0] == 'k') + if (copybuf[0] == PqReplMsg_Keepalive) { if (!ProcessKeepaliveMsg(conn, stream, copybuf, r, blockpos, &last_status)) goto error; } - else if (copybuf[0] == 'w') + else if (copybuf[0] == PqReplMsg_WALData) { - if (!ProcessXLogDataMsg(conn, stream, copybuf, r, &blockpos)) + if (!ProcessWALDataMsg(conn, stream, copybuf, r, &blockpos)) goto error; /* @@ -1001,7 +1002,7 @@ ProcessKeepaliveMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, * Parse the keepalive message, enclosed in the CopyData message. We just * check if the server requested a reply, and ignore the rest. */ - pos = 1; /* skip msgtype 'k' */ + pos = 1; /* skip msgtype PqReplMsg_Keepalive */ pos += 8; /* skip walEnd */ pos += 8; /* skip sendTime */ @@ -1041,11 +1042,11 @@ ProcessKeepaliveMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, } /* - * Process XLogData message. + * Process WALData message. */ static bool -ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, - XLogRecPtr *blockpos) +ProcessWALDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, + XLogRecPtr *blockpos) { int xlogoff; int bytes_left; @@ -1054,17 +1055,17 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, /* * Once we've decided we don't want to receive any more, just ignore any - * subsequent XLogData messages. + * subsequent WALData messages. */ if (!(still_sending)) return true; /* - * Read the header of the XLogData message, enclosed in the CopyData + * Read the header of the WALData message, enclosed in the CopyData * message. We only need the WAL location field (dataStart), the rest of * the header is ignored. 
*/ - hdr_len = 1; /* msgtype 'w' */ + hdr_len = 1; /* msgtype PqReplMsg_WALData */ hdr_len += 8; /* dataStart */ hdr_len += 8; /* walEnd */ hdr_len += 8; /* sendTime */ @@ -1162,7 +1163,7 @@ ProcessXLogDataMsg(PGconn *conn, StreamCtl *stream, char *copybuf, int len, return false; } still_sending = false; - return true; /* ignore the rest of this XLogData packet */ + return true; /* ignore the rest of this WALData packet */ } } } diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index c7b8a4c3a4b..e5a7cb6e5b1 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -445,7 +445,7 @@ RunIdentifySystem(PGconn *conn, char **sysid, TimeLineID *starttli, /* Get LSN start position if necessary */ if (startpos != NULL) { - if (sscanf(PQgetvalue(res, 0, 2), "%X/%X", &hi, &lo) != 2) + if (sscanf(PQgetvalue(res, 0, 2), "%X/%08X", &hi, &lo) != 2) { pg_log_error("could not parse write-ahead log location \"%s\"", PQgetvalue(res, 0, 2)); @@ -551,7 +551,7 @@ GetSlotInformation(PGconn *conn, const char *slot_name, uint32 hi, lo; - if (sscanf(PQgetvalue(res, 0, 1), "%X/%X", &hi, &lo) != 2) + if (sscanf(PQgetvalue(res, 0, 1), "%X/%08X", &hi, &lo) != 2) { pg_log_error("could not parse restart_lsn \"%s\" for replication slot \"%s\"", PQgetvalue(res, 0, 1), slot_name); diff --git a/src/bin/pg_basebackup/t/030_pg_recvlogical.pl b/src/bin/pg_basebackup/t/030_pg_recvlogical.pl index c82e78847b3..1b7a6f6f43f 100644 --- a/src/bin/pg_basebackup/t/030_pg_recvlogical.pl +++ b/src/bin/pg_basebackup/t/030_pg_recvlogical.pl @@ -110,7 +110,7 @@ $node->command_fails( '--dbname' => $node->connstr('postgres'), '--start', '--endpos' => $nextlsn, - '--two-phase', '--no-loop', + '--enable-two-phase', '--no-loop', '--file' => '-', ], 'incorrect usage'); @@ -142,12 +142,13 @@ $node->command_ok( '--slot' => 'test', '--dbname' => $node->connstr('postgres'), '--create-slot', - '--failover', + '--enable-failover', ], 'slot with failover created'); my $result = $node->safe_psql('postgres', - "SELECT failover FROM pg_catalog.pg_replication_slots WHERE slot_name = 'test'"); + "SELECT failover FROM pg_catalog.pg_replication_slots WHERE slot_name = 'test'" +); is($result, 't', "failover is enabled for the new slot"); done_testing(); diff --git a/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl b/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl index 2d532fee567..229fef5b3b5 100644 --- a/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl +++ b/src/bin/pg_basebackup/t/040_pg_createsubscriber.pl @@ -331,7 +331,7 @@ $node_p->safe_psql($db1, $node_p->wait_for_replay_catchup($node_s); # Create user-defined publications, wait for streaming replication to sync them -# to the standby, then verify that '--remove' +# to the standby, then verify that '--clean' # removes them. 
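The %X/%X to %X/%08X changes running through the hunks above and below all serve one goal: LSN_FORMAT_ARGS() splits a 64-bit LSN into two 32-bit halves, and unless the low half is zero-padded to 8 hex digits, printed LSNs have variable width, so they neither compare cleanly as text nor round-trip unambiguously. A small standalone illustration, using simplified stand-ins for the XLogRecPtr typedef and the LSN_FORMAT_ARGS() macro from xlogdefs.h (the real macro also asserts the argument's type):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for PostgreSQL's xlogdefs.h definitions. */
typedef uint64_t XLogRecPtr;
#define LSN_FORMAT_ARGS(lsn) \
	((unsigned int) ((lsn) >> 32)), ((unsigned int) (lsn))

int
main(void)
{
	XLogRecPtr	lsn = ((XLogRecPtr) 0x1 << 32) | 0x500000;
	unsigned int hi,
				lo;
	char		buf[32];

	/* Old style: "1/500000" -- the low half loses its leading zeros. */
	snprintf(buf, sizeof(buf), "%X/%X", LSN_FORMAT_ARGS(lsn));
	printf("old: %s\n", buf);

	/* New style: "1/00500000" -- the low half is always 8 hex digits. */
	snprintf(buf, sizeof(buf), "%X/%08X", LSN_FORMAT_ARGS(lsn));
	printf("new: %s\n", buf);

	/* Parsing reassembles the halves, as in the sscanf() hunks above. */
	if (sscanf(buf, "%X/%08X", &hi, &lo) == 2)
	{
		XLogRecPtr	parsed = ((XLogRecPtr) hi) << 32 | lo;

		printf("round-trip ok: %s\n", parsed == lsn ? "yes" : "no");
	}
	return 0;
}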
$node_p->safe_psql( $db1, qq( @@ -399,7 +399,7 @@ command_fails_like( '--database' => $db1, '--all', ], - qr/--database cannot be used with -a\/--all/, + qr/options --database and -a\/--all cannot be used together/, 'fail if --database is used with --all'); # run pg_createsubscriber with '--publication' and '--all' and verify @@ -416,7 +416,7 @@ command_fails_like( '--all', '--publication' => 'pub1', ], - qr/--publication cannot be used with -a\/--all/, + qr/options --publication and -a\/--all cannot be used together/, 'fail if --publication is used with --all'); # run pg_createsubscriber with '--all' option @@ -446,7 +446,7 @@ is(scalar(() = $stderr =~ /creating subscription/g), # Run pg_createsubscriber on node S. --verbose is used twice # to show more information. # In passing, also test the --enable-two-phase option and -# --remove option +# --clean option command_ok( [ 'pg_createsubscriber', @@ -463,7 +463,7 @@ command_ok( '--database' => $db1, '--database' => $db2, '--enable-two-phase', - '--remove' => 'publications', + '--clean' => 'publications', ], 'run pg_createsubscriber on node S'); diff --git a/src/bin/pg_combinebackup/backup_label.c b/src/bin/pg_combinebackup/backup_label.c index e89d4603f09..e774bc78a62 100644 --- a/src/bin/pg_combinebackup/backup_label.c +++ b/src/bin/pg_combinebackup/backup_label.c @@ -247,7 +247,7 @@ parse_lsn(char *s, char *e, XLogRecPtr *lsn, char **c) unsigned lo; *e = '\0'; - success = (sscanf(s, "%X/%X%n", &hi, &lo, &nchars) == 2); + success = (sscanf(s, "%X/%08X%n", &hi, &lo, &nchars) == 2); *e = save; if (success) diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c index 28e58cd8ef4..f5cef99f627 100644 --- a/src/bin/pg_combinebackup/pg_combinebackup.c +++ b/src/bin/pg_combinebackup/pg_combinebackup.c @@ -569,7 +569,7 @@ check_backup_label_files(int n_backups, char **backup_dirs) pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u", backup_dirs[i], start_tli, check_tli); if (i < n_backups - 1 && start_lsn != check_lsn) - pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X", + pg_fatal("backup at \"%s\" starts at LSN %X/%08X, but expected %X/%08X", backup_dirs[i], LSN_FORMAT_ARGS(start_lsn), LSN_FORMAT_ARGS(check_lsn)); diff --git a/src/bin/pg_combinebackup/t/010_hardlink.pl b/src/bin/pg_combinebackup/t/010_hardlink.pl index a0ee419090c..4f92d6676bd 100644 --- a/src/bin/pg_combinebackup/t/010_hardlink.pl +++ b/src/bin/pg_combinebackup/t/010_hardlink.pl @@ -56,7 +56,7 @@ $primary->command_ok( '--pgdata' => $backup1path, '--no-sync', '--checkpoint' => 'fast', - '--wal-method' => 'none' + '--wal-method' => 'none' ], "full backup"); @@ -74,7 +74,7 @@ $primary->command_ok( '--pgdata' => $backup2path, '--no-sync', '--checkpoint' => 'fast', - '--wal-method' => 'none', + '--wal-method' => 'none', '--incremental' => $backup1path . '/backup_manifest' ], "incremental backup"); @@ -112,45 +112,45 @@ done_testing(); # of the given data file. sub check_data_file { - my ($data_file, $last_segment_nlinks) = @_; - - my @data_file_segments = ($data_file); - - # Start checking for additional segments - my $segment_number = 1; - - while (1) - { - my $next_segment = $data_file . '.' . 
$segment_number; - - # If the file exists and is a regular file, add it to the list - if (-f $next_segment) - { - push @data_file_segments, $next_segment; - $segment_number++; - } - # Stop the loop if the file doesn't exist - else - { - last; - } - } - - # All segments of the given data file should contain 2 hard links, except - # for the last one, which should match the given number of links. - my $last_segment = pop @data_file_segments; - - for my $segment (@data_file_segments) - { - # Get the file's stat information of each segment - my $nlink_count = get_hard_link_count($segment); - ok($nlink_count == 2, "File '$segment' has 2 hard links"); - } - - # Get the file's stat information of the last segment - my $nlink_count = get_hard_link_count($last_segment); - ok($nlink_count == $last_segment_nlinks, - "File '$last_segment' has $last_segment_nlinks hard link(s)"); + my ($data_file, $last_segment_nlinks) = @_; + + my @data_file_segments = ($data_file); + + # Start checking for additional segments + my $segment_number = 1; + + while (1) + { + my $next_segment = $data_file . '.' . $segment_number; + + # If the file exists and is a regular file, add it to the list + if (-f $next_segment) + { + push @data_file_segments, $next_segment; + $segment_number++; + } + # Stop the loop if the file doesn't exist + else + { + last; + } + } + + # All segments of the given data file should contain 2 hard links, except + # for the last one, which should match the given number of links. + my $last_segment = pop @data_file_segments; + + for my $segment (@data_file_segments) + { + # Get the file's stat information of each segment + my $nlink_count = get_hard_link_count($segment); + ok($nlink_count == 2, "File '$segment' has 2 hard links"); + } + + # Get the file's stat information of the last segment + my $nlink_count = get_hard_link_count($last_segment); + ok($nlink_count == $last_segment_nlinks, + "File '$last_segment' has $last_segment_nlinks hard link(s)"); } @@ -159,11 +159,11 @@ sub check_data_file # that file. sub get_hard_link_count { - my ($file) = @_; + my ($file) = @_; - # Get file stats - my @stats = stat($file); - my $nlink = $stats[3]; # Number of hard links + # Get file stats + my @stats = stat($file); + my $nlink = $stats[3]; # Number of hard links - return $nlink; + return $nlink; } diff --git a/src/bin/pg_combinebackup/write_manifest.c b/src/bin/pg_combinebackup/write_manifest.c index 313f8929df5..819a3fd0b7a 100644 --- a/src/bin/pg_combinebackup/write_manifest.c +++ b/src/bin/pg_combinebackup/write_manifest.c @@ -155,7 +155,7 @@ finalize_manifest(manifest_writer *mwriter, for (wal_range = first_wal_range; wal_range != NULL; wal_range = wal_range->next) appendStringInfo(&mwriter->buf, - "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%X\", \"End-LSN\": \"%X/%X\" }", + "%s{ \"Timeline\": %u, \"Start-LSN\": \"%X/%08X\", \"End-LSN\": \"%X/%08X\" }", wal_range == first_wal_range ? 
"" : ",\n", wal_range->tli, LSN_FORMAT_ARGS(wal_range->start_lsn), diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 7bb801bb886..10de058ce91 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -245,9 +245,9 @@ main(int argc, char *argv[]) dbState(ControlFile->state)); printf(_("pg_control last modified: %s\n"), pgctime_str); - printf(_("Latest checkpoint location: %X/%X\n"), + printf(_("Latest checkpoint location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->checkPoint)); - printf(_("Latest checkpoint's REDO location: %X/%X\n"), + printf(_("Latest checkpoint's REDO location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo)); printf(_("Latest checkpoint's REDO WAL file: %s\n"), xlogfilename); @@ -282,15 +282,15 @@ main(int argc, char *argv[]) ControlFile->checkPointCopy.newestCommitTsXid); printf(_("Time of latest checkpoint: %s\n"), ckpttime_str); - printf(_("Fake LSN counter for unlogged rels: %X/%X\n"), + printf(_("Fake LSN counter for unlogged rels: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->unloggedLSN)); - printf(_("Minimum recovery ending location: %X/%X\n"), + printf(_("Minimum recovery ending location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint)); printf(_("Min recovery ending loc's timeline: %u\n"), ControlFile->minRecoveryPointTLI); - printf(_("Backup start location: %X/%X\n"), + printf(_("Backup start location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->backupStartPoint)); - printf(_("Backup end location: %X/%X\n"), + printf(_("Backup end location: %X/%08X\n"), LSN_FORMAT_ARGS(ControlFile->backupEndPoint)); printf(_("End-of-backup record required: %s\n"), ControlFile->backupEndRequired ? _("yes") : _("no")); diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index aa1589e3331..a1976fae607 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -17,6 +17,7 @@ #include <ctype.h> +#include "catalog/pg_am_d.h" #include "catalog/pg_class_d.h" #include "catalog/pg_collation_d.h" #include "catalog/pg_extension_d.h" @@ -945,6 +946,24 @@ findOprByOid(Oid oid) } /* + * findAccessMethodByOid + * finds the DumpableObject for the access method with the given oid + * returns NULL if not found + */ +AccessMethodInfo * +findAccessMethodByOid(Oid oid) +{ + CatalogId catId; + DumpableObject *dobj; + + catId.tableoid = AccessMethodRelationId; + catId.oid = oid; + dobj = findObjectByCatalogId(catId); + Assert(dobj == NULL || dobj->objType == DO_ACCESS_METHOD); + return (AccessMethodInfo *) dobj; +} + +/* * findCollationByOid * finds the DumpableObject for the collation with the given oid * returns NULL if not found diff --git a/src/bin/pg_dump/meson.build b/src/bin/pg_dump/meson.build index d8e9e101254..a2233b0a1b4 100644 --- a/src/bin/pg_dump/meson.build +++ b/src/bin/pg_dump/meson.build @@ -91,9 +91,9 @@ tests += { 'bd': meson.current_build_dir(), 'tap': { 'env': { - 'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', - 'ZSTD': program_zstd.found() ? program_zstd.path() : '', + 'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', + 'ZSTD': program_zstd.found() ? program_zstd.full_path() : '', 'with_icu': icu.found() ? 
'yes' : 'no', }, 'tests': [ @@ -102,7 +102,6 @@ tests += { 't/003_pg_dump_with_server.pl', 't/004_pg_dump_parallel.pl', 't/005_pg_dump_filterfile.pl', - 't/006_pg_dumpall.pl', 't/010_dump_connstr.pl', ], }, diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c index 5974d6706fd..086adcdc502 100644 --- a/src/bin/pg_dump/parallel.c +++ b/src/bin/pg_dump/parallel.c @@ -334,16 +334,6 @@ on_exit_close_archive(Archive *AHX) } /* - * When pg_restore restores multiple databases, then update already added entry - * into array for cleanup. - */ -void -replace_on_exit_close_archive(Archive *AHX) -{ - shutdown_info.AHX = AHX; -} - -/* * on_exit_nicely handler for shutting down database connections and * worker processes cleanly. */ diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h index af0007fb6d2..4ebef1e8644 100644 --- a/src/bin/pg_dump/pg_backup.h +++ b/src/bin/pg_dump/pg_backup.h @@ -308,7 +308,7 @@ extern void SetArchiveOptions(Archive *AH, DumpOptions *dopt, RestoreOptions *ro extern void ProcessArchiveRestoreOptions(Archive *AHX); -extern void RestoreArchive(Archive *AHX, bool append_data); +extern void RestoreArchive(Archive *AHX); /* Open an existing archive */ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt); diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c index afa42337b11..dce88f040ac 100644 --- a/src/bin/pg_dump/pg_backup_archiver.c +++ b/src/bin/pg_dump/pg_backup_archiver.c @@ -31,6 +31,8 @@ #endif #include "catalog/pg_class_d.h" +#include "catalog/pg_largeobject_metadata_d.h" +#include "catalog/pg_shdepend_d.h" #include "common/string.h" #include "compress_io.h" #include "dumputils.h" @@ -85,7 +87,7 @@ static int RestoringToDB(ArchiveHandle *AH); static void dump_lo_buf(ArchiveHandle *AH); static void dumpTimestamp(ArchiveHandle *AH, const char *msg, time_t tim); static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, bool append_data); + const pg_compress_specification compression_spec); static CompressFileHandle *SaveOutput(ArchiveHandle *AH); static void RestoreOutput(ArchiveHandle *AH, CompressFileHandle *savedOutput); @@ -152,7 +154,7 @@ InitDumpOptions(DumpOptions *opts) opts->dumpSections = DUMP_UNSECTIONED; opts->dumpSchema = true; opts->dumpData = true; - opts->dumpStatistics = true; + opts->dumpStatistics = false; } /* @@ -337,14 +339,9 @@ ProcessArchiveRestoreOptions(Archive *AHX) StrictNamesCheck(ropt); } -/* - * RestoreArchive - * - * If append_data is set, then append data into file as we are restoring dump - * of multiple databases which was taken by pg_dumpall. 
- */ +/* Public */ void -RestoreArchive(Archive *AHX, bool append_data) +RestoreArchive(Archive *AHX) { ArchiveHandle *AH = (ArchiveHandle *) AHX; RestoreOptions *ropt = AH->public.ropt; @@ -461,7 +458,7 @@ RestoreArchive(Archive *AHX, bool append_data) */ sav = SaveOutput(AH); if (ropt->filename || ropt->compression_spec.algorithm != PG_COMPRESSION_NONE) - SetOutput(AH, ropt->filename, ropt->compression_spec, append_data); + SetOutput(AH, ropt->filename, ropt->compression_spec); ahprintf(AH, "--\n-- PostgreSQL database dump\n--\n\n"); @@ -1300,7 +1297,7 @@ PrintTOCSummary(Archive *AHX) sav = SaveOutput(AH); if (ropt->filename) - SetOutput(AH, ropt->filename, out_compression_spec, false); + SetOutput(AH, ropt->filename, out_compression_spec); if (strftime(stamp_str, sizeof(stamp_str), PGDUMP_STRFTIME_FMT, localtime(&AH->createDate)) == 0) @@ -1679,8 +1676,7 @@ archprintf(Archive *AH, const char *fmt,...) static void SetOutput(ArchiveHandle *AH, const char *filename, - const pg_compress_specification compression_spec, - bool append_data) + const pg_compress_specification compression_spec) { CompressFileHandle *CFH; const char *mode; @@ -1700,7 +1696,7 @@ SetOutput(ArchiveHandle *AH, const char *filename, else fn = fileno(stdout); - if (append_data || AH->mode == archModeAppend) + if (AH->mode == archModeAppend) mode = PG_BINARY_A; else mode = PG_BINARY_W; @@ -2655,7 +2651,7 @@ WriteToc(ArchiveHandle *AH) pg_fatal("unexpected TOC entry in WriteToc(): %d %s %s", te->dumpId, te->desc, te->tag); - if (fseeko(AH->FH, te->defnLen, SEEK_CUR != 0)) + if (fseeko(AH->FH, te->defnLen, SEEK_CUR) != 0) pg_fatal("error during file seek: %m"); } else if (te->defnDumper) @@ -2974,6 +2970,19 @@ _tocEntryRequired(TocEntry *te, teSection curSection, ArchiveHandle *AH) int res = REQ_SCHEMA | REQ_DATA; RestoreOptions *ropt = AH->public.ropt; + /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (ropt->binary_upgrade && AH->public.remoteVersion >= 120000 && + strcmp(te->desc, "TABLE DATA") == 0 && + (te->catalogId.oid == LargeObjectMetadataRelationId || + te->catalogId.oid == SharedDependRelationId)) + return REQ_DATA; + /* These items are treated specially */ if (strcmp(te->desc, "ENCODING") == 0 || strcmp(te->desc, "STDSTRINGS") == 0 || diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index 365073b3eae..325b53fc9bd 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -394,7 +394,6 @@ struct _tocEntry extern int parallel_restore(ArchiveHandle *AH, TocEntry *te); extern void on_exit_close_archive(Archive *AHX); -extern void replace_on_exit_close_archive(Archive *AHX); extern void warn_or_exit_horribly(ArchiveHandle *AH, const char *fmt,...) pg_attribute_printf(2, 3); diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c index 21b00792a8a..bc2a2fb4797 100644 --- a/src/bin/pg_dump/pg_backup_directory.c +++ b/src/bin/pg_dump/pg_backup_directory.c @@ -412,10 +412,15 @@ _LoadLOs(ArchiveHandle *AH, TocEntry *te) /* * Note: before archive v16, there was always only one BLOBS TOC entry, - * now there can be multiple. 
We don't need to worry what version we are - * reading though, because tctx->filename should be correct either way. + * now there can be multiple. Furthermore, although the actual filename + * was always "blobs.toc" before v16, the value of tctx->filename did not + * match that before commit 548e50976 fixed it. For simplicity we assume + * it must be "blobs.toc" in all archives before v16. */ - setFilePath(AH, tocfname, tctx->filename); + if (AH->version < K_VERS_1_16) + setFilePath(AH, tocfname, "blobs.toc"); + else + setFilePath(AH, tocfname, tctx->filename); CFH = ctx->LOsTocFH = InitDiscoverCompressFileHandle(tocfname, PG_BINARY_R); diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index d94d0de2a5d..b5ba3b46dd9 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -826,7 +826,7 @@ _CloseArchive(ArchiveHandle *AH) savVerbose = AH->public.verbose; AH->public.verbose = 0; - RestoreArchive((Archive *) AH, false); + RestoreArchive((Archive *) AH); SetArchiveOptions((Archive *) AH, savDopt, savRopt); diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index e2e7975b34e..f3a353a61a5 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -47,10 +47,13 @@ #include "catalog/pg_authid_d.h" #include "catalog/pg_cast_d.h" #include "catalog/pg_class_d.h" +#include "catalog/pg_constraint_d.h" #include "catalog/pg_default_acl_d.h" #include "catalog/pg_largeobject_d.h" +#include "catalog/pg_largeobject_metadata_d.h" #include "catalog/pg_proc_d.h" #include "catalog/pg_publication_d.h" +#include "catalog/pg_shdepend_d.h" #include "catalog/pg_subscription_d.h" #include "catalog/pg_type_d.h" #include "common/connect.h" @@ -209,6 +212,12 @@ static int nbinaryUpgradeClassOids = 0; static SequenceItem *sequences = NULL; static int nsequences = 0; +/* + * For binary upgrade, the dump ID of pg_largeobject_metadata is saved for use + * as a dependency for pg_shdepend and any large object comments/seclabels. + */ +static DumpId lo_metadata_dumpId; + /* Maximum number of relations to fetch in a fetchAttributeStats() call. 
*/ #define MAX_ATTR_STATS_RELS 64 @@ -350,7 +359,9 @@ static void buildMatViewRefreshDependencies(Archive *fout); static void getTableDataFKConstraints(void); static void determineNotNullFlags(Archive *fout, PGresult *res, int r, TableInfo *tbinfo, int j, - int i_notnull_name, int i_notnull_invalidoid, + int i_notnull_name, + int i_notnull_comment, + int i_notnull_invalidoid, int i_notnull_noinherit, int i_notnull_islocal, PQExpBuffer *invalidnotnulloids); @@ -438,8 +449,6 @@ main(int argc, char **argv) bool data_only = false; bool schema_only = false; bool statistics_only = false; - bool with_data = false; - bool with_schema = false; bool with_statistics = false; bool no_data = false; bool no_schema = false; @@ -503,6 +512,7 @@ main(int argc, char **argv) {"section", required_argument, NULL, 5}, {"serializable-deferrable", no_argument, &dopt.serializable_deferrable, 1}, {"snapshot", required_argument, NULL, 6}, + {"statistics", no_argument, NULL, 22}, {"statistics-only", no_argument, NULL, 18}, {"strict-names", no_argument, &strict_names, 1}, {"use-set-session-authorization", no_argument, &dopt.use_setsessauth, 1}, @@ -517,9 +527,6 @@ main(int argc, char **argv) {"no-toast-compression", no_argument, &dopt.no_toast_compression, 1}, {"no-unlogged-table-data", no_argument, &dopt.no_unlogged_table_data, 1}, {"no-sync", no_argument, NULL, 7}, - {"with-data", no_argument, NULL, 22}, - {"with-schema", no_argument, NULL, 23}, - {"with-statistics", no_argument, NULL, 24}, {"on-conflict-do-nothing", no_argument, &dopt.do_nothing, 1}, {"rows-per-insert", required_argument, NULL, 10}, {"include-foreign-data", required_argument, NULL, 11}, @@ -787,14 +794,6 @@ main(int argc, char **argv) break; case 22: - with_data = true; - break; - - case 23: - with_schema = true; - break; - - case 24: with_statistics = true; break; @@ -841,13 +840,17 @@ main(int argc, char **argv) if (statistics_only && no_statistics) pg_fatal("options --statistics-only and --no-statistics cannot be used together"); - /* reject conflicting "with-" and "no-" options */ - if (with_data && no_data) - pg_fatal("options --with-data and --no-data cannot be used together"); - if (with_schema && no_schema) - pg_fatal("options --with-schema and --no-schema cannot be used together"); + /* reject conflicting "no-" options */ if (with_statistics && no_statistics) - pg_fatal("options --with-statistics and --no-statistics cannot be used together"); + pg_fatal("options --statistics and --no-statistics cannot be used together"); + + /* reject conflicting "-only" options */ + if (data_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--statistics"); + if (schema_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--statistics"); if (schema_only && foreign_servers_include_patterns.head != NULL) pg_fatal("options -s/--schema-only and --include-foreign-data cannot be used together"); @@ -862,16 +865,14 @@ main(int argc, char **argv) pg_fatal("option --if-exists requires option -c/--clean"); /* - * Set derivative flags. An "-only" option may be overridden by an - * explicit "with-" option; e.g. "--schema-only --with-statistics" will - * include schema and statistics. Other ambiguous or nonsensical - * combinations, e.g. "--schema-only --no-schema", will have already - * caused an error in one of the checks above. + * Set derivative flags. Ambiguous or nonsensical combinations, e.g. 
+ * "--schema-only --no-schema", will have already caused an error in one + * of the checks above. */ dopt.dumpData = ((dopt.dumpData && !schema_only && !statistics_only) || - (data_only || with_data)) && !no_data; + data_only) && !no_data; dopt.dumpSchema = ((dopt.dumpSchema && !data_only && !statistics_only) || - (schema_only || with_schema)) && !no_schema; + schema_only) && !no_schema; dopt.dumpStatistics = ((dopt.dumpStatistics && !schema_only && !data_only) || (statistics_only || with_statistics)) && !no_statistics; @@ -1084,6 +1085,36 @@ main(int argc, char **argv) getTableData(&dopt, tblinfo, numTables, RELKIND_SEQUENCE); /* + * For binary upgrade mode, dump pg_largeobject_metadata and the + * associated pg_shdepend rows. This is faster to restore than the + * equivalent set of large object commands. We can only do this for + * upgrades from v12 and newer; in older versions, pg_largeobject_metadata + * was created WITH OIDS, so the OID column is hidden and won't be dumped. + */ + if (dopt.binary_upgrade && fout->remoteVersion >= 120000) + { + TableInfo *lo_metadata = findTableByOid(LargeObjectMetadataRelationId); + TableInfo *shdepend = findTableByOid(SharedDependRelationId); + + makeTableDataInfo(&dopt, lo_metadata); + makeTableDataInfo(&dopt, shdepend); + + /* + * Save pg_largeobject_metadata's dump ID for use as a dependency for + * pg_shdepend and any large object comments/seclabels. + */ + lo_metadata_dumpId = lo_metadata->dataObj->dobj.dumpId; + addObjectDependency(&shdepend->dataObj->dobj, lo_metadata_dumpId); + + /* + * Only dump large object shdepend rows for this database. + */ + shdepend->dataObj->filtercond = "WHERE classid = 'pg_largeobject'::regclass " + "AND dbid = (SELECT oid FROM pg_database " + " WHERE datname = current_database())"; + } + + /* * In binary-upgrade mode, we do not have to worry about the actual LO * data or the associated metadata that resides in the pg_largeobject and * pg_largeobject_metadata tables, respectively. @@ -1224,7 +1255,7 @@ main(int argc, char **argv) * right now. */ if (plainText) - RestoreArchive(fout, false); + RestoreArchive(fout); CloseArchive(fout); @@ -1235,7 +1266,7 @@ main(int argc, char **argv) static void help(const char *progname) { - printf(_("%s dumps a database as a text file or to other formats.\n\n"), progname); + printf(_("%s exports a PostgreSQL database as an SQL script or to other formats.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]... 
[DBNAME]\n"), progname); @@ -1314,6 +1345,7 @@ help(const char *progname) printf(_(" --sequence-data include sequence data in dump\n")); printf(_(" --serializable-deferrable wait until the dump can run without anomalies\n")); printf(_(" --snapshot=SNAPSHOT use given snapshot for the dump\n")); + printf(_(" --statistics dump the statistics\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); printf(_(" --strict-names require table and/or schema include patterns to\n" " match at least one entity each\n")); @@ -1322,9 +1354,6 @@ help(const char *progname) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data dump the data\n")); - printf(_(" --with-schema dump the schema\n")); - printf(_(" --with-statistics dump the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=DBNAME database to dump\n")); @@ -2166,6 +2195,13 @@ selectDumpableProcLang(ProcLangInfo *plang, Archive *fout) static void selectDumpableAccessMethod(AccessMethodInfo *method, Archive *fout) { + /* see getAccessMethods() comment about v9.6. */ + if (fout->remoteVersion < 90600) + { + method->dobj.dump = DUMP_COMPONENT_NONE; + return; + } + if (checkExtensionMembership(&method->dobj, fout)) return; /* extension membership overrides all else */ @@ -3922,10 +3958,37 @@ getLOs(Archive *fout) * as it will be copied by pg_upgrade, which simply copies the * pg_largeobject table. We *do* however dump out anything but the * data, as pg_upgrade copies just pg_largeobject, but not - * pg_largeobject_metadata, after the dump is restored. + * pg_largeobject_metadata, after the dump is restored. In versions + * before v12, this is done via proper large object commands. In + * newer versions, we dump the content of pg_largeobject_metadata and + * any associated pg_shdepend rows, which is faster to restore. (On + * <v12, pg_largeobject_metadata was created WITH OIDS, so the OID + * column is hidden and won't be dumped.) */ if (dopt->binary_upgrade) - loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA; + { + if (fout->remoteVersion >= 120000) + { + /* + * We should've saved pg_largeobject_metadata's dump ID before + * this point. + */ + Assert(lo_metadata_dumpId); + + loinfo->dobj.dump &= ~(DUMP_COMPONENT_DATA | DUMP_COMPONENT_ACL | DUMP_COMPONENT_DEFINITION); + + /* + * Mark the large object as dependent on + * pg_largeobject_metadata so that any large object + * comments/seclables are dumped after it. + */ + loinfo->dobj.dependencies = (DumpId *) pg_malloc(sizeof(DumpId)); + loinfo->dobj.dependencies[0] = lo_metadata_dumpId; + loinfo->dobj.nDeps = loinfo->dobj.allocDeps = 1; + } + else + loinfo->dobj.dump &= ~DUMP_COMPONENT_DATA; + } /* * Create a "BLOBS" data item for the group, too. 
This is just a @@ -4960,6 +5023,7 @@ getSubscriptions(Archive *fout) int i_suboriginremotelsn; int i_subenabled; int i_subfailover; + int i_subretaindeadtuples; int i, ntups; @@ -5032,10 +5096,17 @@ getSubscriptions(Archive *fout) if (fout->remoteVersion >= 170000) appendPQExpBufferStr(query, - " s.subfailover\n"); + " s.subfailover,\n"); else appendPQExpBufferStr(query, - " false AS subfailover\n"); + " false AS subfailover,\n"); + + if (fout->remoteVersion >= 190000) + appendPQExpBufferStr(query, + " s.subretaindeadtuples\n"); + else + appendPQExpBufferStr(query, + " false AS subretaindeadtuples\n"); appendPQExpBufferStr(query, "FROM pg_subscription s\n"); @@ -5069,6 +5140,7 @@ getSubscriptions(Archive *fout) i_subpasswordrequired = PQfnumber(res, "subpasswordrequired"); i_subrunasowner = PQfnumber(res, "subrunasowner"); i_subfailover = PQfnumber(res, "subfailover"); + i_subretaindeadtuples = PQfnumber(res, "subretaindeadtuples"); i_subconninfo = PQfnumber(res, "subconninfo"); i_subslotname = PQfnumber(res, "subslotname"); i_subsynccommit = PQfnumber(res, "subsynccommit"); @@ -5102,6 +5174,8 @@ getSubscriptions(Archive *fout) (strcmp(PQgetvalue(res, i, i_subrunasowner), "t") == 0); subinfo[i].subfailover = (strcmp(PQgetvalue(res, i, i_subfailover), "t") == 0); + subinfo[i].subretaindeadtuples = + (strcmp(PQgetvalue(res, i, i_subretaindeadtuples), "t") == 0); subinfo[i].subconninfo = pg_strdup(PQgetvalue(res, i, i_subconninfo)); if (PQgetisnull(res, i, i_subslotname)) @@ -5360,6 +5434,9 @@ dumpSubscription(Archive *fout, const SubscriptionInfo *subinfo) if (subinfo->subfailover) appendPQExpBufferStr(query, ", failover = true"); + if (subinfo->subretaindeadtuples) + appendPQExpBufferStr(query, ", retain_dead_tuples = true"); + if (strcmp(subinfo->subsynccommit, "off") != 0) appendPQExpBuffer(query, ", synchronous_commit = %s", fmtId(subinfo->subsynccommit)); @@ -6120,6 +6197,7 @@ getTypes(Archive *fout) */ tyinfo[i].nDomChecks = 0; tyinfo[i].domChecks = NULL; + tyinfo[i].notnull = NULL; if ((tyinfo[i].dobj.dump & DUMP_COMPONENT_DEFINITION) && tyinfo[i].typtype == TYPTYPE_DOMAIN) getDomainConstraints(fout, &(tyinfo[i])); @@ -6179,6 +6257,8 @@ getOperators(Archive *fout) int i_oprnamespace; int i_oprowner; int i_oprkind; + int i_oprleft; + int i_oprright; int i_oprcode; /* @@ -6190,6 +6270,8 @@ getOperators(Archive *fout) "oprnamespace, " "oprowner, " "oprkind, " + "oprleft, " + "oprright, " "oprcode::oid AS oprcode " "FROM pg_operator"); @@ -6205,6 +6287,8 @@ getOperators(Archive *fout) i_oprnamespace = PQfnumber(res, "oprnamespace"); i_oprowner = PQfnumber(res, "oprowner"); i_oprkind = PQfnumber(res, "oprkind"); + i_oprleft = PQfnumber(res, "oprleft"); + i_oprright = PQfnumber(res, "oprright"); i_oprcode = PQfnumber(res, "oprcode"); for (i = 0; i < ntups; i++) @@ -6218,6 +6302,8 @@ getOperators(Archive *fout) findNamespace(atooid(PQgetvalue(res, i, i_oprnamespace))); oprinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_oprowner)); oprinfo[i].oprkind = (PQgetvalue(res, i, i_oprkind))[0]; + oprinfo[i].oprleft = atooid(PQgetvalue(res, i, i_oprleft)); + oprinfo[i].oprright = atooid(PQgetvalue(res, i, i_oprright)); oprinfo[i].oprcode = atooid(PQgetvalue(res, i, i_oprcode)); /* Decide whether we want to dump it */ @@ -6246,6 +6332,7 @@ getCollations(Archive *fout) int i_collname; int i_collnamespace; int i_collowner; + int i_collencoding; query = createPQExpBuffer(); @@ -6256,7 +6343,8 @@ getCollations(Archive *fout) appendPQExpBufferStr(query, "SELECT tableoid, oid, collname, " "collnamespace, 
" - "collowner " + "collowner, " + "collencoding " "FROM pg_collation"); res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); @@ -6270,6 +6358,7 @@ getCollations(Archive *fout) i_collname = PQfnumber(res, "collname"); i_collnamespace = PQfnumber(res, "collnamespace"); i_collowner = PQfnumber(res, "collowner"); + i_collencoding = PQfnumber(res, "collencoding"); for (i = 0; i < ntups; i++) { @@ -6281,6 +6370,7 @@ getCollations(Archive *fout) collinfo[i].dobj.namespace = findNamespace(atooid(PQgetvalue(res, i, i_collnamespace))); collinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_collowner)); + collinfo[i].collencoding = atoi(PQgetvalue(res, i, i_collencoding)); /* Decide whether we want to dump it */ selectDumpableObject(&(collinfo[i].dobj), fout); @@ -6371,16 +6461,28 @@ getAccessMethods(Archive *fout) int i_amhandler; int i_amtype; - /* Before 9.6, there are no user-defined access methods */ - if (fout->remoteVersion < 90600) - return; - query = createPQExpBuffer(); - /* Select all access methods from pg_am table */ - appendPQExpBufferStr(query, "SELECT tableoid, oid, amname, amtype, " - "amhandler::pg_catalog.regproc AS amhandler " - "FROM pg_am"); + /* + * Select all access methods from pg_am table. v9.6 introduced CREATE + * ACCESS METHOD, so earlier versions usually have only built-in access + * methods. v9.6 also changed the access method API, replacing dozens of + * pg_am columns with amhandler. Even if a user created an access method + * by "INSERT INTO pg_am", we have no way to translate pre-v9.6 pg_am + * columns to a v9.6+ CREATE ACCESS METHOD. Hence, before v9.6, read + * pg_am just to facilitate findAccessMethodByOid() providing the + * OID-to-name mapping. + */ + appendPQExpBufferStr(query, "SELECT tableoid, oid, amname, "); + if (fout->remoteVersion >= 90600) + appendPQExpBufferStr(query, + "amtype, " + "amhandler::pg_catalog.regproc AS amhandler "); + else + appendPQExpBufferStr(query, + "'i'::pg_catalog.\"char\" AS amtype, " + "'-'::pg_catalog.regproc AS amhandler "); + appendPQExpBufferStr(query, "FROM pg_am"); res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); @@ -6429,6 +6531,7 @@ getOpclasses(Archive *fout) OpclassInfo *opcinfo; int i_tableoid; int i_oid; + int i_opcmethod; int i_opcname; int i_opcnamespace; int i_opcowner; @@ -6438,7 +6541,7 @@ getOpclasses(Archive *fout) * system-defined opclasses at dump-out time. */ - appendPQExpBufferStr(query, "SELECT tableoid, oid, opcname, " + appendPQExpBufferStr(query, "SELECT tableoid, oid, opcmethod, opcname, " "opcnamespace, " "opcowner " "FROM pg_opclass"); @@ -6451,6 +6554,7 @@ getOpclasses(Archive *fout) i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); + i_opcmethod = PQfnumber(res, "opcmethod"); i_opcname = PQfnumber(res, "opcname"); i_opcnamespace = PQfnumber(res, "opcnamespace"); i_opcowner = PQfnumber(res, "opcowner"); @@ -6464,6 +6568,7 @@ getOpclasses(Archive *fout) opcinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_opcname)); opcinfo[i].dobj.namespace = findNamespace(atooid(PQgetvalue(res, i, i_opcnamespace))); + opcinfo[i].opcmethod = atooid(PQgetvalue(res, i, i_opcmethod)); opcinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_opcowner)); /* Decide whether we want to dump it */ @@ -6489,6 +6594,7 @@ getOpfamilies(Archive *fout) OpfamilyInfo *opfinfo; int i_tableoid; int i_oid; + int i_opfmethod; int i_opfname; int i_opfnamespace; int i_opfowner; @@ -6500,7 +6606,7 @@ getOpfamilies(Archive *fout) * system-defined opfamilies at dump-out time. 
*/ - appendPQExpBufferStr(query, "SELECT tableoid, oid, opfname, " + appendPQExpBufferStr(query, "SELECT tableoid, oid, opfmethod, opfname, " "opfnamespace, " "opfowner " "FROM pg_opfamily"); @@ -6514,6 +6620,7 @@ getOpfamilies(Archive *fout) i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); i_opfname = PQfnumber(res, "opfname"); + i_opfmethod = PQfnumber(res, "opfmethod"); i_opfnamespace = PQfnumber(res, "opfnamespace"); i_opfowner = PQfnumber(res, "opfowner"); @@ -6526,6 +6633,7 @@ getOpfamilies(Archive *fout) opfinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_opfname)); opfinfo[i].dobj.namespace = findNamespace(atooid(PQgetvalue(res, i, i_opfnamespace))); + opfinfo[i].opfmethod = atooid(PQgetvalue(res, i, i_opfmethod)); opfinfo[i].rolname = getRoleName(PQgetvalue(res, i, i_opfowner)); /* Decide whether we want to dump it */ @@ -6890,7 +6998,8 @@ getRelationStatistics(Archive *fout, DumpableObject *rel, int32 relpages, (relkind == RELKIND_PARTITIONED_TABLE) || (relkind == RELKIND_INDEX) || (relkind == RELKIND_PARTITIONED_INDEX) || - (relkind == RELKIND_MATVIEW)) + (relkind == RELKIND_MATVIEW || + relkind == RELKIND_FOREIGN_TABLE)) { RelStatsInfo *info = pg_malloc0(sizeof(RelStatsInfo)); DumpableObject *dobj = &info->dobj; @@ -6929,6 +7038,7 @@ getRelationStatistics(Archive *fout, DumpableObject *rel, int32 relpages, case RELKIND_RELATION: case RELKIND_PARTITIONED_TABLE: case RELKIND_MATVIEW: + case RELKIND_FOREIGN_TABLE: info->section = SECTION_DATA; break; case RELKIND_INDEX: @@ -6936,7 +7046,7 @@ getRelationStatistics(Archive *fout, DumpableObject *rel, int32 relpages, info->section = SECTION_POST_DATA; break; default: - pg_fatal("cannot dump statistics for relation kind '%c'", + pg_fatal("cannot dump statistics for relation kind \"%c\"", info->relkind); } @@ -8243,27 +8353,33 @@ addConstrChildIdxDeps(DumpableObject *dobj, const IndxInfo *refidx) static void getDomainConstraints(Archive *fout, TypeInfo *tyinfo) { - int i; ConstraintInfo *constrinfo; PQExpBuffer query = createPQExpBuffer(); PGresult *res; int i_tableoid, i_oid, i_conname, - i_consrc; + i_consrc, + i_convalidated, + i_contype; int ntups; if (!fout->is_prepared[PREPQUERY_GETDOMAINCONSTRAINTS]) { - /* Set up query for constraint-specific details */ - appendPQExpBufferStr(query, - "PREPARE getDomainConstraints(pg_catalog.oid) AS\n" - "SELECT tableoid, oid, conname, " - "pg_catalog.pg_get_constraintdef(oid) AS consrc, " - "convalidated " - "FROM pg_catalog.pg_constraint " - "WHERE contypid = $1 AND contype = 'c' " - "ORDER BY conname"); + /* + * Set up query for constraint-specific details. For servers 17 and + * up, domains have constraints of type 'n' as well as 'c', otherwise + * just the latter. + */ + appendPQExpBuffer(query, + "PREPARE getDomainConstraints(pg_catalog.oid) AS\n" + "SELECT tableoid, oid, conname, " + "pg_catalog.pg_get_constraintdef(oid) AS consrc, " + "convalidated, contype " + "FROM pg_catalog.pg_constraint " + "WHERE contypid = $1 AND contype IN (%s) " + "ORDER BY conname", + fout->remoteVersion < 170000 ? 
"'c'" : "'c', 'n'"); ExecuteSqlStatement(fout, query->data); @@ -8282,33 +8398,50 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) i_oid = PQfnumber(res, "oid"); i_conname = PQfnumber(res, "conname"); i_consrc = PQfnumber(res, "consrc"); + i_convalidated = PQfnumber(res, "convalidated"); + i_contype = PQfnumber(res, "contype"); constrinfo = (ConstraintInfo *) pg_malloc(ntups * sizeof(ConstraintInfo)); - - tyinfo->nDomChecks = ntups; tyinfo->domChecks = constrinfo; - for (i = 0; i < ntups; i++) + /* 'i' tracks result rows; 'j' counts CHECK constraints */ + for (int i = 0, j = 0; i < ntups; i++) { - bool validated = PQgetvalue(res, i, 4)[0] == 't'; - - constrinfo[i].dobj.objType = DO_CONSTRAINT; - constrinfo[i].dobj.catId.tableoid = atooid(PQgetvalue(res, i, i_tableoid)); - constrinfo[i].dobj.catId.oid = atooid(PQgetvalue(res, i, i_oid)); - AssignDumpId(&constrinfo[i].dobj); - constrinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_conname)); - constrinfo[i].dobj.namespace = tyinfo->dobj.namespace; - constrinfo[i].contable = NULL; - constrinfo[i].condomain = tyinfo; - constrinfo[i].contype = 'c'; - constrinfo[i].condef = pg_strdup(PQgetvalue(res, i, i_consrc)); - constrinfo[i].confrelid = InvalidOid; - constrinfo[i].conindex = 0; - constrinfo[i].condeferrable = false; - constrinfo[i].condeferred = false; - constrinfo[i].conislocal = true; - - constrinfo[i].separate = !validated; + bool validated = PQgetvalue(res, i, i_convalidated)[0] == 't'; + char contype = (PQgetvalue(res, i, i_contype))[0]; + ConstraintInfo *constraint; + + if (contype == CONSTRAINT_CHECK) + { + constraint = &constrinfo[j++]; + tyinfo->nDomChecks++; + } + else + { + Assert(contype == CONSTRAINT_NOTNULL); + Assert(tyinfo->notnull == NULL); + /* use last item in array for the not-null constraint */ + tyinfo->notnull = &(constrinfo[ntups - 1]); + constraint = tyinfo->notnull; + } + + constraint->dobj.objType = DO_CONSTRAINT; + constraint->dobj.catId.tableoid = atooid(PQgetvalue(res, i, i_tableoid)); + constraint->dobj.catId.oid = atooid(PQgetvalue(res, i, i_oid)); + AssignDumpId(&(constraint->dobj)); + constraint->dobj.name = pg_strdup(PQgetvalue(res, i, i_conname)); + constraint->dobj.namespace = tyinfo->dobj.namespace; + constraint->contable = NULL; + constraint->condomain = tyinfo; + constraint->contype = contype; + constraint->condef = pg_strdup(PQgetvalue(res, i, i_consrc)); + constraint->confrelid = InvalidOid; + constraint->conindex = 0; + constraint->condeferrable = false; + constraint->condeferred = false; + constraint->conislocal = true; + + constraint->separate = !validated; /* * Make the domain depend on the constraint, ensuring it won't be @@ -8317,8 +8450,7 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) * anyway, so this doesn't matter. */ if (validated) - addObjectDependency(&tyinfo->dobj, - constrinfo[i].dobj.dumpId); + addObjectDependency(&tyinfo->dobj, constraint->dobj.dumpId); } PQclear(res); @@ -9004,6 +9136,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) int i_attalign; int i_attislocal; int i_notnull_name; + int i_notnull_comment; int i_notnull_noinherit; int i_notnull_islocal; int i_notnull_invalidoid; @@ -9034,8 +9167,20 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) if (tbinfo->relkind == RELKIND_SEQUENCE) continue; - /* Don't bother with uninteresting tables, either */ - if (!tbinfo->interesting) + /* + * Don't bother with uninteresting tables, either. 
For binary + * upgrades, this is bypassed for pg_largeobject_metadata and + * pg_shdepend so that the column names are collected for the + * corresponding COPY commands. Restoring the data for those catalogs + * is faster than restoring the equivalent set of large object + * commands. We can only do this for upgrades from v12 and newer; in + * older versions, pg_largeobject_metadata was created WITH OIDS, so + * the OID column is hidden and won't be dumped. + */ + if (!tbinfo->interesting && + !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 && + (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId || + tbinfo->dobj.catId.oid == SharedDependRelationId))) continue; /* OK, we need info for this table */ @@ -9087,7 +9232,8 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) /* * Find out any NOT NULL markings for each column. In 18 and up we read - * pg_constraint to obtain the constraint name. notnull_noinherit is set + * pg_constraint to obtain the constraint name, and for valid constraints + * also pg_description to obtain its comment. notnull_noinherit is set * according to the NO INHERIT property. For versions prior to 18, we * store an empty string as the name when a constraint is marked as * attnotnull (this cues dumpTableSchema to print the NOT NULL clause @@ -9095,7 +9241,8 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) * * For invalid constraints, we need to store their OIDs for processing * elsewhere, so we bring the pg_constraint.oid value when the constraint - * is invalid, and NULL otherwise. + * is invalid, and NULL otherwise. Their comments are handled not here + * but by collectComments, because they're their own dumpable object. * * We track in notnull_islocal whether the constraint was defined directly * in this table or via an ancestor, for binary upgrade. flagInhAttrs @@ -9105,6 +9252,8 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) if (fout->remoteVersion >= 180000) appendPQExpBufferStr(q, "co.conname AS notnull_name,\n" + "CASE WHEN co.convalidated THEN pt.description" + " ELSE NULL END AS notnull_comment,\n" "CASE WHEN NOT co.convalidated THEN co.oid " "ELSE NULL END AS notnull_invalidoid,\n" "co.connoinherit AS notnull_noinherit,\n" @@ -9112,6 +9261,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) else appendPQExpBufferStr(q, "CASE WHEN a.attnotnull THEN '' ELSE NULL END AS notnull_name,\n" + "NULL AS notnull_comment,\n" "NULL AS notnull_invalidoid,\n" "false AS notnull_noinherit,\n" "a.attislocal AS notnull_islocal,\n"); @@ -9155,15 +9305,16 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) /* * In versions 18 and up, we need pg_constraint for explicit NOT NULL - * entries. Also, we need to know if the NOT NULL for each column is - * backing a primary key. + * entries and pg_description to get their comments.
*/ if (fout->remoteVersion >= 180000) appendPQExpBufferStr(q, " LEFT JOIN pg_catalog.pg_constraint co ON " "(a.attrelid = co.conrelid\n" " AND co.contype = 'n' AND " - "co.conkey = array[a.attnum])\n"); + "co.conkey = array[a.attnum])\n" + " LEFT JOIN pg_catalog.pg_description pt ON " + "(pt.classoid = co.tableoid AND pt.objoid = co.oid)\n"); appendPQExpBufferStr(q, "WHERE a.attnum > 0::pg_catalog.int2\n" @@ -9187,6 +9338,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) i_attalign = PQfnumber(res, "attalign"); i_attislocal = PQfnumber(res, "attislocal"); i_notnull_name = PQfnumber(res, "notnull_name"); + i_notnull_comment = PQfnumber(res, "notnull_comment"); i_notnull_invalidoid = PQfnumber(res, "notnull_invalidoid"); i_notnull_noinherit = PQfnumber(res, "notnull_noinherit"); i_notnull_islocal = PQfnumber(res, "notnull_islocal"); @@ -9232,7 +9384,10 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) pg_fatal("unrecognized table OID %u", attrelid); /* cross-check that we only got requested tables */ if (tbinfo->relkind == RELKIND_SEQUENCE || - !tbinfo->interesting) + (!tbinfo->interesting && + !(fout->dopt->binary_upgrade && fout->remoteVersion >= 120000 && + (tbinfo->dobj.catId.oid == LargeObjectMetadataRelationId || + tbinfo->dobj.catId.oid == SharedDependRelationId)))) pg_fatal("unexpected column data for table \"%s\"", tbinfo->dobj.name); @@ -9255,6 +9410,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) tbinfo->attfdwoptions = (char **) pg_malloc(numatts * sizeof(char *)); tbinfo->attmissingval = (char **) pg_malloc(numatts * sizeof(char *)); tbinfo->notnull_constrs = (char **) pg_malloc(numatts * sizeof(char *)); + tbinfo->notnull_comment = (char **) pg_malloc(numatts * sizeof(char *)); tbinfo->notnull_invalid = (bool *) pg_malloc(numatts * sizeof(bool)); tbinfo->notnull_noinh = (bool *) pg_malloc(numatts * sizeof(bool)); tbinfo->notnull_islocal = (bool *) pg_malloc(numatts * sizeof(bool)); @@ -9286,11 +9442,14 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) determineNotNullFlags(fout, res, r, tbinfo, j, i_notnull_name, + i_notnull_comment, i_notnull_invalidoid, i_notnull_noinherit, i_notnull_islocal, &invalidnotnulloids); + tbinfo->notnull_comment[j] = PQgetisnull(res, r, i_notnull_comment) ? + NULL : pg_strdup(PQgetvalue(res, r, i_notnull_comment)); tbinfo->attoptions[j] = pg_strdup(PQgetvalue(res, r, i_attoptions)); tbinfo->attcollation[j] = atooid(PQgetvalue(res, r, i_attcollation)); tbinfo->attcompression[j] = *(PQgetvalue(res, r, i_attcompression)); @@ -9461,7 +9620,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) int i_consrc; int i_conislocal; - pg_log_info("finding invalid not null constraints"); + pg_log_info("finding invalid not-null constraints"); resetPQExpBuffer(q); appendPQExpBuffer(q, @@ -9702,8 +9861,9 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables) * 4) The column has a constraint with a known name; in that case * notnull_constrs carries that name and dumpTableSchema will print * "CONSTRAINT the_name NOT NULL". However, if the name is the default - * (table_column_not_null), there's no need to print that name in the dump, - * so notnull_constrs is set to the empty string and it behaves as case 2. + * (table_column_not_null) and there's no comment on the constraint, + * there's no need to print that name in the dump, so notnull_constrs + * is set to the empty string and it behaves as case 2. 
* * In a child table that inherits from a parent already containing NOT NULL * constraints and the columns in the child don't have their own NOT NULL @@ -9730,6 +9890,7 @@ static void determineNotNullFlags(Archive *fout, PGresult *res, int r, TableInfo *tbinfo, int j, int i_notnull_name, + int i_notnull_comment, int i_notnull_invalidoid, int i_notnull_noinherit, int i_notnull_islocal, @@ -9803,11 +9964,13 @@ determineNotNullFlags(Archive *fout, PGresult *res, int r, { /* * In binary upgrade of inheritance child tables, must have a - * constraint name that we can UPDATE later. + * constraint name that we can UPDATE later; same if there's a + * comment on the constraint. */ - if (dopt->binary_upgrade && - !tbinfo->ispartition && - !tbinfo->notnull_islocal) + if ((dopt->binary_upgrade && + !tbinfo->ispartition && + !tbinfo->notnull_islocal) || + !PQgetisnull(res, r, i_notnull_comment)) { tbinfo->notnull_constrs[j] = pstrdup(PQgetvalue(res, r, i_notnull_name)); @@ -10765,6 +10928,9 @@ fetchAttributeStats(Archive *fout) restarted = true; } + appendPQExpBufferChar(nspnames, '{'); + appendPQExpBufferChar(relnames, '{'); + /* * Scan the TOC for the next set of relevant stats entries. We assume * that statistics are dumped in the order they are listed in the TOC. @@ -10776,23 +10942,25 @@ fetchAttributeStats(Archive *fout) if ((te->reqs & REQ_STATS) != 0 && strcmp(te->desc, "STATISTICS DATA") == 0) { - appendPQExpBuffer(nspnames, "%s%s", count ? "," : "", - fmtId(te->namespace)); - appendPQExpBuffer(relnames, "%s%s", count ? "," : "", - fmtId(te->tag)); + appendPGArray(nspnames, te->namespace); + appendPGArray(relnames, te->tag); count++; } } + appendPQExpBufferChar(nspnames, '}'); + appendPQExpBufferChar(relnames, '}'); + /* Execute the query for the next batch of relations. */ if (count > 0) { PQExpBuffer query = createPQExpBuffer(); - appendPQExpBuffer(query, "EXECUTE getAttributeStats(" - "'{%s}'::pg_catalog.name[]," - "'{%s}'::pg_catalog.name[])", - nspnames->data, relnames->data); + appendPQExpBufferStr(query, "EXECUTE getAttributeStats("); + appendStringLiteralAH(query, nspnames->data, fout); + appendPQExpBufferStr(query, "::pg_catalog.name[],"); + appendStringLiteralAH(query, relnames->data, fout); + appendPQExpBufferStr(query, "::pg_catalog.name[])"); res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK); destroyPQExpBuffer(query); } @@ -10850,7 +11018,7 @@ dumpRelationStats_dumper(Archive *fout, const void *userArg, const TocEntry *te) expected_te = expected_te->next; if (te != expected_te) - pg_fatal("stats dumped out of order (current: %d %s %s) (expected: %d %s %s)", + pg_fatal("statistics dumped out of order (current: %d %s %s, expected: %d %s %s)", te->dumpId, te->desc, te->tag, expected_te->dumpId, expected_te->desc, expected_te->tag); @@ -10924,7 +11092,20 @@ dumpRelationStats_dumper(Archive *fout, const void *userArg, const TocEntry *te) appendStringLiteralAH(out, rsinfo->dobj.name, fout); appendPQExpBufferStr(out, ",\n"); appendPQExpBuffer(out, "\t'relpages', '%d'::integer,\n", rsinfo->relpages); - appendPQExpBuffer(out, "\t'reltuples', '%s'::real,\n", rsinfo->reltuples); + + /* + * Before v14, a reltuples value of 0 was ambiguous: it could either mean + * the relation is empty, or it could mean that it hadn't yet been + * vacuumed or analyzed. (Newer versions use -1 for the latter case.) + * This ambiguity allegedly can cause the planner to choose inefficient + * plans after restoring to v18 or newer. To deal with this, let's just + * set reltuples to -1 in that case. 
+ */ + if (fout->remoteVersion < 140000 && strcmp("0", rsinfo->reltuples) == 0) + appendPQExpBufferStr(out, "\t'reltuples', '-1'::real,\n"); + else + appendPQExpBuffer(out, "\t'reltuples', '%s'::real,\n", rsinfo->reltuples); + appendPQExpBuffer(out, "\t'relallvisible', '%d'::integer", rsinfo->relallvisible); @@ -10978,7 +11159,7 @@ dumpRelationStats_dumper(Archive *fout, const void *userArg, const TocEntry *te) appendStringLiteralAH(out, rsinfo->dobj.name, fout); if (PQgetisnull(res, rownum, i_attname)) - pg_fatal("attname cannot be NULL"); + pg_fatal("unexpected null attname"); attname = PQgetvalue(res, rownum, i_attname); /* @@ -12479,8 +12660,36 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) appendPQExpBuffer(q, " COLLATE %s", fmtQualifiedDumpable(coll)); } + /* + * Print a not-null constraint if there's one. In servers older than 17 + * these don't have names, so just print it unadorned; in newer ones they + * do, but most of the time it's going to be the standard generated one, + * so omit the name in that case also. + */ if (typnotnull[0] == 't') - appendPQExpBufferStr(q, " NOT NULL"); + { + if (fout->remoteVersion < 170000 || tyinfo->notnull == NULL) + appendPQExpBufferStr(q, " NOT NULL"); + else + { + ConstraintInfo *notnull = tyinfo->notnull; + + if (!notnull->separate) + { + char *default_name; + + /* XXX should match ChooseConstraintName better */ + default_name = psprintf("%s_not_null", tyinfo->dobj.name); + + if (strcmp(default_name, notnull->dobj.name) == 0) + appendPQExpBufferStr(q, " NOT NULL"); + else + appendPQExpBuffer(q, " CONSTRAINT %s %s", + fmtId(notnull->dobj.name), notnull->condef); + free(default_name); + } + } + } if (typdefault != NULL) { @@ -12500,7 +12709,7 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) { ConstraintInfo *domcheck = &(tyinfo->domChecks[i]); - if (!domcheck->separate) + if (!domcheck->separate && domcheck->contype == 'c') appendPQExpBuffer(q, "\n\tCONSTRAINT %s %s", fmtId(domcheck->dobj.name), domcheck->condef); } @@ -12545,8 +12754,13 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) for (i = 0; i < tyinfo->nDomChecks; i++) { ConstraintInfo *domcheck = &(tyinfo->domChecks[i]); - PQExpBuffer conprefix = createPQExpBuffer(); + PQExpBuffer conprefix; + + /* but only if the constraint itself was dumped here */ + if (domcheck->separate) + continue; + conprefix = createPQExpBuffer(); appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", fmtId(domcheck->dobj.name)); @@ -12559,6 +12773,25 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) destroyPQExpBuffer(conprefix); } + /* + * And a comment on the not-null constraint, if there's one -- but only if + * the constraint itself was dumped here + */ + if (tyinfo->notnull != NULL && !tyinfo->notnull->separate) + { + PQExpBuffer conprefix = createPQExpBuffer(); + + appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", + fmtId(tyinfo->notnull->dobj.name)); + + if (tyinfo->notnull->dobj.dump & DUMP_COMPONENT_COMMENT) + dumpComment(fout, conprefix->data, qtypname, + tyinfo->dobj.namespace->dobj.name, + tyinfo->rolname, + tyinfo->notnull->dobj.catId, 0, tyinfo->dobj.dumpId); + destroyPQExpBuffer(conprefix); + } + destroyPQExpBuffer(q); destroyPQExpBuffer(delq); destroyPQExpBuffer(query); @@ -17666,6 +17899,56 @@ dumpTableSchema(Archive *fout, const TableInfo *tbinfo) if (tbinfo->dobj.dump & DUMP_COMPONENT_SECLABEL) dumpTableSecLabel(fout, tbinfo, reltypename); + /* + * Dump comments for not-null constraints that aren't to be dumped + * separately (those are processed by 
collectComments/dumpComment). + */ + if (!fout->dopt->no_comments && dopt->dumpSchema && + fout->remoteVersion >= 180000) + { + PQExpBuffer comment = NULL; + PQExpBuffer tag = NULL; + + for (j = 0; j < tbinfo->numatts; j++) + { + if (tbinfo->notnull_constrs[j] != NULL && + tbinfo->notnull_comment[j] != NULL) + { + if (comment == NULL) + { + comment = createPQExpBuffer(); + tag = createPQExpBuffer(); + } + else + { + resetPQExpBuffer(comment); + resetPQExpBuffer(tag); + } + + appendPQExpBuffer(comment, "COMMENT ON CONSTRAINT %s ON %s IS ", + fmtId(tbinfo->notnull_constrs[j]), qualrelname); + appendStringLiteralAH(comment, tbinfo->notnull_comment[j], fout); + appendPQExpBufferStr(comment, ";\n"); + + appendPQExpBuffer(tag, "CONSTRAINT %s ON %s", + fmtId(tbinfo->notnull_constrs[j]), qrelname); + + ArchiveEntry(fout, nilCatalogId, createDumpId(), + ARCHIVE_OPTS(.tag = tag->data, + .namespace = tbinfo->dobj.namespace->dobj.name, + .owner = tbinfo->rolname, + .description = "COMMENT", + .section = SECTION_NONE, + .createStmt = comment->data, + .deps = &(tbinfo->dobj.dumpId), + .nDeps = 1)); + } + } + + destroyPQExpBuffer(comment); + destroyPQExpBuffer(tag); + } + /* Dump comments on inlined table constraints */ for (j = 0; j < tbinfo->ncheck; j++) { @@ -18370,14 +18653,23 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) .dropStmt = delq->data)); } } - else if (coninfo->contype == 'c' && tbinfo == NULL) + else if (tbinfo == NULL) { - /* CHECK constraint on a domain */ + /* CHECK, NOT NULL constraint on a domain */ TypeInfo *tyinfo = coninfo->condomain; + Assert(coninfo->contype == 'c' || coninfo->contype == 'n'); + /* Ignore if not to be dumped separately */ if (coninfo->separate) { + const char *keyword; + + if (coninfo->contype == 'c') + keyword = "CHECK CONSTRAINT"; + else + keyword = "CONSTRAINT"; + appendPQExpBuffer(q, "ALTER DOMAIN %s\n", fmtQualifiedDumpable(tyinfo)); appendPQExpBuffer(q, " ADD CONSTRAINT %s %s;\n", @@ -18396,10 +18688,26 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) ARCHIVE_OPTS(.tag = tag, .namespace = tyinfo->dobj.namespace->dobj.name, .owner = tyinfo->rolname, - .description = "CHECK CONSTRAINT", + .description = keyword, .section = SECTION_POST_DATA, .createStmt = q->data, .dropStmt = delq->data)); + + if (coninfo->dobj.dump & DUMP_COMPONENT_COMMENT) + { + PQExpBuffer conprefix = createPQExpBuffer(); + char *qtypname = pg_strdup(fmtId(tyinfo->dobj.name)); + + appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", + fmtId(coninfo->dobj.name)); + + dumpComment(fout, conprefix->data, qtypname, + tyinfo->dobj.namespace->dobj.name, + tyinfo->rolname, + coninfo->dobj.catId, 0, tyinfo->dobj.dumpId); + destroyPQExpBuffer(conprefix); + free(qtypname); + } } } else diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 7417eab6aef..dde85ed156c 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -222,7 +222,9 @@ typedef struct _typeInfo bool isDefined; /* true if typisdefined */ /* If needed, we'll create a "shell type" entry for it; link that here: */ struct _shellTypeInfo *shellType; /* shell-type entry, or NULL */ - /* If it's a domain, we store links to its constraints here: */ + /* If it's a domain, its not-null constraint is here: */ + struct _constraintInfo *notnull; + /* If it's a domain, we store links to its CHECK constraints here: */ int nDomChecks; struct _constraintInfo *domChecks; } TypeInfo; @@ -258,6 +260,8 @@ typedef struct _oprInfo DumpableObject dobj; const char *rolname; char oprkind; + 
Oid oprleft; + Oid oprright; Oid oprcode; } OprInfo; @@ -271,12 +275,14 @@ typedef struct _accessMethodInfo typedef struct _opclassInfo { DumpableObject dobj; + Oid opcmethod; const char *rolname; } OpclassInfo; typedef struct _opfamilyInfo { DumpableObject dobj; + Oid opfmethod; const char *rolname; } OpfamilyInfo; @@ -284,6 +290,7 @@ typedef struct _collInfo { DumpableObject dobj; const char *rolname; + int collencoding; } CollInfo; typedef struct _convInfo @@ -365,6 +372,7 @@ typedef struct _tableInfo * there isn't one on this column. If * empty string, unnamed constraint * (pre-v17) */ + char **notnull_comment; /* comment thereof */ bool *notnull_invalid; /* true for NOT NULL NOT VALID */ bool *notnull_noinh; /* NOT NULL is NO INHERIT */ bool *notnull_islocal; /* true if NOT NULL has local definition */ @@ -708,6 +716,7 @@ typedef struct _SubscriptionInfo bool subpasswordrequired; bool subrunasowner; bool subfailover; + bool subretaindeadtuples; char *subconninfo; char *subslotname; char *subsynccommit; @@ -756,6 +765,7 @@ extern TableInfo *findTableByOid(Oid oid); extern TypeInfo *findTypeByOid(Oid oid); extern FuncInfo *findFuncByOid(Oid oid); extern OprInfo *findOprByOid(Oid oid); +extern AccessMethodInfo *findAccessMethodByOid(Oid oid); extern CollInfo *findCollationByOid(Oid oid); extern NamespaceInfo *findNamespaceByOid(Oid oid); extern ExtensionInfo *findExtensionByOid(Oid oid); diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 0b0977788f1..a02da3e9652 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -76,10 +76,10 @@ enum dbObjectTypePriorities PRIO_TABLE_ATTACH, PRIO_DUMMY_TYPE, PRIO_ATTRDEF, - PRIO_LARGE_OBJECT, PRIO_PRE_DATA_BOUNDARY, /* boundary! */ PRIO_TABLE_DATA, PRIO_SEQUENCE_SET, + PRIO_LARGE_OBJECT, PRIO_LARGE_OBJECT_DATA, PRIO_STATISTICS_DATA_DATA, PRIO_POST_DATA_BOUNDARY, /* boundary! */ @@ -162,6 +162,8 @@ static DumpId postDataBoundId; static int DOTypeNameCompare(const void *p1, const void *p2); +static int pgTypeNameCompare(Oid typid1, Oid typid2); +static int accessMethodNameCompare(Oid am1, Oid am2); static bool TopoSort(DumpableObject **objs, int numObjs, DumpableObject **ordering, @@ -228,12 +230,39 @@ DOTypeNameCompare(const void *p1, const void *p2) else if (obj2->namespace) return 1; - /* Sort by name */ + /* + * Sort by name. With a few exceptions, names here are single catalog + * columns. To get a fuller picture, grep pg_dump.c for "dobj.name = ". + * Names here don't match "Name:" in plain format output, which is a + * _tocEntry.tag. For example, DumpableObject.name of a constraint is + * pg_constraint.conname, but _tocEntry.tag of a constraint is relname and + * conname joined with a space. + */ cmpval = strcmp(obj1->name, obj2->name); if (cmpval != 0) return cmpval; - /* To have a stable sort order, break ties for some object types */ + /* + * Sort by type. This helps types that share a type priority without + * sharing a unique name constraint, e.g. opclass and opfamily. + */ + cmpval = obj1->objType - obj2->objType; + if (cmpval != 0) + return cmpval; + + /* + * To have a stable sort order, break ties for some object types. Most + * catalogs have a natural key, e.g. pg_proc_proname_args_nsp_index. Where + * the above "namespace" and "name" comparisons don't cover all natural + * key columns, compare the rest here. + * + * The natural key usually refers to other catalogs by surrogate keys. 
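+ * (For example, pg_opclass.opcmethod stores a pg_am OID rather than the + * access method's amname.)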
+ * Hence, this translates each of those references to the natural key of + * the referenced catalog. That may descend through multiple levels of + * catalog references. For example, to sort by pg_proc.proargtypes, + * descend to each pg_type and then further to its pg_namespace, for an + * overall sort by (nspname, typname). + */ if (obj1->objType == DO_FUNC || obj1->objType == DO_AGG) { FuncInfo *fobj1 = *(FuncInfo *const *) p1; @@ -246,22 +275,10 @@ DOTypeNameCompare(const void *p1, const void *p2) return cmpval; for (i = 0; i < fobj1->nargs; i++) { - TypeInfo *argtype1 = findTypeByOid(fobj1->argtypes[i]); - TypeInfo *argtype2 = findTypeByOid(fobj2->argtypes[i]); - - if (argtype1 && argtype2) - { - if (argtype1->dobj.namespace && argtype2->dobj.namespace) - { - cmpval = strcmp(argtype1->dobj.namespace->dobj.name, - argtype2->dobj.namespace->dobj.name); - if (cmpval != 0) - return cmpval; - } - cmpval = strcmp(argtype1->dobj.name, argtype2->dobj.name); - if (cmpval != 0) - return cmpval; - } + cmpval = pgTypeNameCompare(fobj1->argtypes[i], + fobj2->argtypes[i]); + if (cmpval != 0) + return cmpval; } } else if (obj1->objType == DO_OPERATOR) @@ -273,6 +290,57 @@ DOTypeNameCompare(const void *p1, const void *p2) cmpval = (oobj2->oprkind - oobj1->oprkind); if (cmpval != 0) return cmpval; + /* Within an oprkind, sort by argument type names */ + cmpval = pgTypeNameCompare(oobj1->oprleft, oobj2->oprleft); + if (cmpval != 0) + return cmpval; + cmpval = pgTypeNameCompare(oobj1->oprright, oobj2->oprright); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_OPCLASS) + { + OpclassInfo *opcobj1 = *(OpclassInfo *const *) p1; + OpclassInfo *opcobj2 = *(OpclassInfo *const *) p2; + + /* Sort by access method name, per pg_opclass_am_name_nsp_index */ + cmpval = accessMethodNameCompare(opcobj1->opcmethod, + opcobj2->opcmethod); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_OPFAMILY) + { + OpfamilyInfo *opfobj1 = *(OpfamilyInfo *const *) p1; + OpfamilyInfo *opfobj2 = *(OpfamilyInfo *const *) p2; + + /* Sort by access method name, per pg_opfamily_am_name_nsp_index */ + cmpval = accessMethodNameCompare(opfobj1->opfmethod, + opfobj2->opfmethod); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_COLLATION) + { + CollInfo *cobj1 = *(CollInfo *const *) p1; + CollInfo *cobj2 = *(CollInfo *const *) p2; + + /* + * Sort by encoding, per pg_collation_name_enc_nsp_index. Technically, + * this is not necessary, because wherever this changes dump order, + * restoring the dump fails anyway. CREATE COLLATION can't create a + * tie for this to break, because it imposes restrictions to make + * (nspname, collname) uniquely identify a collation within a given + * DatabaseEncoding. While pg_import_system_collations() can create a + * tie, pg_dump+restore fails after + * pg_import_system_collations('my_schema') does so. However, there's + * little to gain by ignoring one natural key column on the basis of + * those limitations elsewhere, so respect the full natural key like + * we do for other object types. 
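+ * (Illustration: pg_collation's unique index covers (collname, + * collencoding, collnamespace), so same-named collations can coexist in + * one schema with different collencoding values; that is the shape of tie + * pg_import_system_collations() can create.)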
+ */ + cmpval = cobj1->collencoding - cobj2->collencoding; + if (cmpval != 0) + return cmpval; } else if (obj1->objType == DO_ATTRDEF) { @@ -317,11 +385,143 @@ DOTypeNameCompare(const void *p1, const void *p2) if (cmpval != 0) return cmpval; } + else if (obj1->objType == DO_CONSTRAINT) + { + ConstraintInfo *robj1 = *(ConstraintInfo *const *) p1; + ConstraintInfo *robj2 = *(ConstraintInfo *const *) p2; + + /* + * Sort domain constraints before table constraints, for consistency + * with our decision to sort CREATE DOMAIN before CREATE TABLE. + */ + if (robj1->condomain) + { + if (robj2->condomain) + { + /* Sort by domain name (domain namespace was considered) */ + cmpval = strcmp(robj1->condomain->dobj.name, + robj2->condomain->dobj.name); + if (cmpval != 0) + return cmpval; + } + else + return PRIO_TYPE - PRIO_TABLE; + } + else if (robj2->condomain) + return PRIO_TABLE - PRIO_TYPE; + else + { + /* Sort by table name (table namespace was considered already) */ + cmpval = strcmp(robj1->contable->dobj.name, + robj2->contable->dobj.name); + if (cmpval != 0) + return cmpval; + } + } + else if (obj1->objType == DO_PUBLICATION_REL) + { + PublicationRelInfo *probj1 = *(PublicationRelInfo *const *) p1; + PublicationRelInfo *probj2 = *(PublicationRelInfo *const *) p2; + + /* Sort by publication name, since (namespace, name) match the rel */ + cmpval = strcmp(probj1->publication->dobj.name, + probj2->publication->dobj.name); + if (cmpval != 0) + return cmpval; + } + else if (obj1->objType == DO_PUBLICATION_TABLE_IN_SCHEMA) + { + PublicationSchemaInfo *psobj1 = *(PublicationSchemaInfo *const *) p1; + PublicationSchemaInfo *psobj2 = *(PublicationSchemaInfo *const *) p2; + + /* Sort by publication name, since ->name is just nspname */ + cmpval = strcmp(psobj1->publication->dobj.name, + psobj2->publication->dobj.name); + if (cmpval != 0) + return cmpval; + } - /* Usually shouldn't get here, but if we do, sort by OID */ + /* + * Shouldn't get here except after catalog corruption, but if we do, sort + * by OID. This may make logically-identical databases differ in the + * order of objects in dump output. Users will get spurious schema diffs. + * Expect flaky failures of 002_pg_upgrade.pl test 'dump outputs from + * original and restored regression databases match' if the regression + * database contains objects allowing that test to reach here. That's a + * consequence of the test using "pg_restore -j", which doesn't fully + * constrain OID assignment order. + */ + Assert(false); return oidcmp(obj1->catId.oid, obj2->catId.oid); } +/* Compare two OID-identified pg_type values by nspname, then by typname. */ +static int +pgTypeNameCompare(Oid typid1, Oid typid2) +{ + TypeInfo *typobj1; + TypeInfo *typobj2; + int cmpval; + + if (typid1 == typid2) + return 0; + + typobj1 = findTypeByOid(typid1); + typobj2 = findTypeByOid(typid2); + + if (!typobj1 || !typobj2) + { + /* + * getTypes() didn't find some OID. Assume catalog corruption, e.g. + * an oprright value without the corresponding OID in a pg_type row. + * Report as "equal", so the caller uses the next available basis for + * comparison, e.g. the next function argument. + * + * Unary operators have InvalidOid in oprleft (if oprkind='l') or in + * oprright (if oprkind='r'). Caller already sorted by oprkind, + * calling us only for like-kind operators. Hence, "typid1 == typid2" + * took care of InvalidOid. (v14 removed postfix operator support. + * Hence, when dumping from v14+, only oprleft can be InvalidOid.)
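+ * For example, comparing two prefix operators passes InvalidOid for both + * oprleft arguments, so the "typid1 == typid2" test above already + * returned 0; reaching this branch therefore requires a real type OID + * with no matching pg_type row.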
+ */ + Assert(false); + return 0; + } + + if (!typobj1->dobj.namespace || !typobj2->dobj.namespace) + Assert(false); /* catalog corruption */ + else + { + cmpval = strcmp(typobj1->dobj.namespace->dobj.name, + typobj2->dobj.namespace->dobj.name); + if (cmpval != 0) + return cmpval; + } + return strcmp(typobj1->dobj.name, typobj2->dobj.name); +} + +/* Compare two OID-identified pg_am values by amname. */ +static int +accessMethodNameCompare(Oid am1, Oid am2) +{ + AccessMethodInfo *amobj1; + AccessMethodInfo *amobj2; + + if (am1 == am2) + return 0; + + amobj1 = findAccessMethodByOid(am1); + amobj2 = findAccessMethodByOid(am2); + + if (!amobj1 || !amobj2) + { + /* catalog corruption: handle like pgTypeNameCompare() does */ + Assert(false); + return 0; + } + + return strcmp(amobj1->dobj.name, amobj2->dobj.name); +} + /* * Sort the given objects into a safe dump order using dependency @@ -907,7 +1107,7 @@ repairTableAttrDefMultiLoop(DumpableObject *tableobj, } /* - * CHECK constraints on domains work just like those on tables ... + * CHECK, NOT NULL constraints on domains work just like those on tables ... */ static void repairDomainConstraintLoop(DumpableObject *domainobj, @@ -1173,11 +1373,12 @@ repairDependencyLoop(DumpableObject **loop, } } - /* Domain and CHECK constraint */ + /* Domain and CHECK or NOT NULL constraint */ if (nLoop == 2 && loop[0]->objType == DO_TYPE && loop[1]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[1])->contype == 'c' && + (((ConstraintInfo *) loop[1])->contype == 'c' || + ((ConstraintInfo *) loop[1])->contype == 'n') && ((ConstraintInfo *) loop[1])->condomain == (TypeInfo *) loop[0]) { repairDomainConstraintLoop(loop[0], loop[1]); @@ -1186,14 +1387,15 @@ repairDependencyLoop(DumpableObject **loop, if (nLoop == 2 && loop[1]->objType == DO_TYPE && loop[0]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[0])->contype == 'c' && + (((ConstraintInfo *) loop[0])->contype == 'c' || + ((ConstraintInfo *) loop[0])->contype == 'n') && ((ConstraintInfo *) loop[0])->condomain == (TypeInfo *) loop[1]) { repairDomainConstraintLoop(loop[1], loop[0]); return; } - /* Indirect loop involving domain and CHECK constraint */ + /* Indirect loop involving domain and CHECK or NOT NULL constraint */ if (nLoop > 2) { for (i = 0; i < nLoop; i++) @@ -1203,7 +1405,8 @@ repairDependencyLoop(DumpableObject **loop, for (j = 0; j < nLoop; j++) { if (loop[j]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[j])->contype == 'c' && + (((ConstraintInfo *) loop[j])->contype == 'c' || + ((ConstraintInfo *) loop[j])->contype == 'n') && ((ConstraintInfo *) loop[j])->condomain == (TypeInfo *) loop[i]) { repairDomainConstraintMultiLoop(loop[i], loop[j]); diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 7f9c302b719..27aa1b65698 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -65,10 +65,9 @@ static void dropTablespaces(PGconn *conn); static void dumpTablespaces(PGconn *conn); static void dropDBs(PGconn *conn); static void dumpUserConfig(PGconn *conn, const char *username); -static void dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat); +static void dumpDatabases(PGconn *conn); static void dumpTimestamp(const char *msg); -static int runPgDump(const char *dbname, const char *create_opts, - char *dbfile, ArchiveFormat archDumpFormat); +static int runPgDump(const char *dbname, const char *create_opts); static void buildShSecLabels(PGconn *conn, const char *catalog_name, Oid objectId, const char *objtype, const char 
*objname, @@ -77,7 +76,6 @@ static void executeCommand(PGconn *conn, const char *query); static void expand_dbname_patterns(PGconn *conn, SimpleStringList *patterns, SimpleStringList *names); static void read_dumpall_filters(const char *filename, SimpleStringList *pattern); -static ArchiveFormat parseDumpFormat(const char *format); static char pg_dump_bin[MAXPGPATH]; static PQExpBuffer pgdumpopts; @@ -107,8 +105,6 @@ static int no_subscriptions = 0; static int no_toast_compression = 0; static int no_unlogged_table_data = 0; static int no_role_passwords = 0; -static int with_data = 0; -static int with_schema = 0; static int with_statistics = 0; static int server_version; static int load_via_partition_root = 0; @@ -150,7 +146,6 @@ main(int argc, char *argv[]) {"password", no_argument, NULL, 'W'}, {"no-privileges", no_argument, NULL, 'x'}, {"no-acl", no_argument, NULL, 'x'}, - {"format", required_argument, NULL, 'F'}, /* * the following options don't have an equivalent short option letter @@ -183,11 +178,9 @@ main(int argc, char *argv[]) {"no-sync", no_argument, NULL, 4}, {"no-toast-compression", no_argument, &no_toast_compression, 1}, {"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1}, - {"with-data", no_argument, &with_data, 1}, - {"with-schema", no_argument, &with_schema, 1}, - {"with-statistics", no_argument, &with_statistics, 1}, {"on-conflict-do-nothing", no_argument, &on_conflict_do_nothing, 1}, {"rows-per-insert", required_argument, NULL, 7}, + {"statistics", no_argument, &with_statistics, 1}, {"statistics-only", no_argument, &statistics_only, 1}, {"filter", required_argument, NULL, 8}, {"sequence-data", no_argument, &sequence_data, 1}, @@ -201,8 +194,6 @@ main(int argc, char *argv[]) char *pgdb = NULL; char *use_role = NULL; const char *dumpencoding = NULL; - ArchiveFormat archDumpFormat = archNull; - const char *formatName = "p"; trivalue prompt_password = TRI_DEFAULT; bool data_only = false; bool globals_only = false; @@ -252,7 +243,7 @@ main(int argc, char *argv[]) pgdumpopts = createPQExpBuffer(); - while ((c = getopt_long(argc, argv, "acd:E:f:F:gh:l:Op:rsS:tU:vwWx", long_options, &optindex)) != -1) + while ((c = getopt_long(argc, argv, "acd:E:f:gh:l:Op:rsS:tU:vwWx", long_options, &optindex)) != -1) { switch (c) { @@ -280,9 +271,7 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " -f "); appendShellString(pgdumpopts, filename); break; - case 'F': - formatName = pg_strdup(optarg); - break; + case 'g': globals_only = true; break; @@ -431,21 +420,6 @@ main(int argc, char *argv[]) exit_nicely(1); } - /* Get format for dump. */ - archDumpFormat = parseDumpFormat(formatName); - - /* - * If a non-plain format is specified, a file name is also required as the - * path to the main directory. 
- */ - if (archDumpFormat != archNull && - (!filename || strcmp(filename, "") == 0)) - { - pg_log_error("option -F/--format=d|c|t requires option -f/--file"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } - /* * If password values are not required in the dump, switch to using * pg_roles which is equally useful, just more likely to have unrestricted @@ -497,12 +471,8 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " --no-toast-compression"); if (no_unlogged_table_data) appendPQExpBufferStr(pgdumpopts, " --no-unlogged-table-data"); - if (with_data) - appendPQExpBufferStr(pgdumpopts, " --with-data"); - if (with_schema) - appendPQExpBufferStr(pgdumpopts, " --with-schema"); if (with_statistics) - appendPQExpBufferStr(pgdumpopts, " --with-statistics"); + appendPQExpBufferStr(pgdumpopts, " --statistics"); if (on_conflict_do_nothing) appendPQExpBufferStr(pgdumpopts, " --on-conflict-do-nothing"); if (statistics_only) @@ -511,33 +481,6 @@ main(int argc, char *argv[]) appendPQExpBufferStr(pgdumpopts, " --sequence-data"); /* - * Open the output file if required, otherwise use stdout. If required, - * then create new directory and global.dat file. - */ - if (archDumpFormat != archNull) - { - char global_path[MAXPGPATH]; - - /* Create new directory or accept the empty existing directory. */ - create_or_open_dir(filename); - - snprintf(global_path, MAXPGPATH, "%s/global.dat", filename); - - OPF = fopen(global_path, PG_BINARY_W); - if (!OPF) - pg_fatal("could not open \"%s\": %m", global_path); - } - else if (filename) - { - OPF = fopen(filename, PG_BINARY_W); - if (!OPF) - pg_fatal("could not open output file \"%s\": %m", - filename); - } - else - OPF = stdout; - - /* * If there was a database specified on the command line, use that, * otherwise try to connect to database "postgres", and failing that * "template1". @@ -577,6 +520,19 @@ main(int argc, char *argv[]) &database_exclude_names); /* + * Open the output file if required, otherwise use stdout + */ + if (filename) + { + OPF = fopen(filename, PG_BINARY_W); + if (!OPF) + pg_fatal("could not open output file \"%s\": %m", + filename); + } + else + OPF = stdout; + + /* * Set the client encoding if requested. */ if (dumpencoding) @@ -632,7 +588,7 @@ main(int argc, char *argv[]) fprintf(OPF, "SET escape_string_warning = off;\n"); fprintf(OPF, "\n"); - if (!data_only) + if (!data_only && !statistics_only && !no_schema) { /* * If asked to --clean, do that first. 
We can avoid detailed @@ -675,7 +631,7 @@ main(int argc, char *argv[]) } if (!globals_only && !roles_only && !tablespaces_only) - dumpDatabases(conn, archDumpFormat); + dumpDatabases(conn); PQfinish(conn); @@ -688,7 +644,7 @@ main(int argc, char *argv[]) fclose(OPF); /* sync the resulting file, errors are not fatal */ - if (dosync && (archDumpFormat == archNull)) + if (dosync) (void) fsync_fname(filename, false); } @@ -699,14 +655,12 @@ main(int argc, char *argv[]) static void help(void) { - printf(_("%s extracts a PostgreSQL database cluster based on specified dump format.\n\n"), progname); + printf(_("%s exports a PostgreSQL database cluster as an SQL script.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]...\n"), progname); printf(_("\nGeneral options:\n")); printf(_(" -f, --file=FILENAME output file name\n")); - printf(_(" -F, --format=c|d|t|p output file format (custom, directory, tar,\n" - " plain text (default))\n")); printf(_(" -v, --verbose verbose mode\n")); printf(_(" -V, --version output version information, then exit\n")); printf(_(" --lock-wait-timeout=TIMEOUT fail after waiting TIMEOUT for a table lock\n")); @@ -750,13 +704,11 @@ help(void) printf(_(" --quote-all-identifiers quote all identifiers, even if not key words\n")); printf(_(" --rows-per-insert=NROWS number of rows per INSERT; implies --inserts\n")); printf(_(" --sequence-data include sequence data in dump\n")); + printf(_(" --statistics dump the statistics\n")); printf(_(" --statistics-only dump only the statistics, not schema or data\n")); printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data dump the data\n")); - printf(_(" --with-schema dump the schema\n")); - printf(_(" --with-statistics dump the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=CONNSTR connect using connection string\n")); @@ -1013,6 +965,9 @@ dumpRoles(PGconn *conn) * We do it this way because config settings for roles could mention the * names of other roles. */ + if (PQntuples(res) > 0) + fprintf(OPF, "\n--\n-- User Configurations\n--\n"); + for (i = 0; i < PQntuples(res); i++) dumpUserConfig(conn, PQgetvalue(res, i, i_rolname)); @@ -1526,7 +1481,6 @@ dumpUserConfig(PGconn *conn, const char *username) { PQExpBuffer buf = createPQExpBuffer(); PGresult *res; - static bool header_done = false; printfPQExpBuffer(buf, "SELECT unnest(setconfig) FROM pg_db_role_setting " "WHERE setdatabase = 0 AND setrole = " @@ -1538,13 +1492,7 @@ dumpUserConfig(PGconn *conn, const char *username) res = executeQuery(conn, buf->data); if (PQntuples(res) > 0) - { - if (!header_done) - fprintf(OPF, "\n--\n-- User Configurations\n--\n"); - header_done = true; - fprintf(OPF, "\n--\n-- User Config \"%s\"\n--\n\n", username); - } for (int i = 0; i < PQntuples(res); i++) { @@ -1618,13 +1566,10 @@ expand_dbname_patterns(PGconn *conn, * Dump contents of databases. */ static void -dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) +dumpDatabases(PGconn *conn) { PGresult *res; int i; - char db_subdir[MAXPGPATH]; - char dbfilepath[MAXPGPATH]; - FILE *map_file = NULL; /* * Skip databases marked not datallowconn, since we'd be unable to connect @@ -1638,42 +1583,18 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) * doesn't have some failure mode with --clean. 
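* For example, with databases "alpha", "postgres", and "template1", the * ORDER BY below returns template1 first, then alpha and postgres in name * order.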
*/ res = executeQuery(conn, - "SELECT datname, oid " + "SELECT datname " "FROM pg_database d " "WHERE datallowconn AND datconnlimit != -2 " "ORDER BY (datname <> 'template1'), datname"); - if (archDumpFormat == archNull && PQntuples(res) > 0) + if (PQntuples(res) > 0) fprintf(OPF, "--\n-- Databases\n--\n\n"); - /* - * If directory/tar/custom format is specified, create a subdirectory - * under the main directory and each database dump file or subdirectory - * will be created in that subdirectory by pg_dump. - */ - if (archDumpFormat != archNull) - { - char map_file_path[MAXPGPATH]; - - snprintf(db_subdir, MAXPGPATH, "%s/databases", filename); - - /* Create a subdirectory with 'databases' name under main directory. */ - if (mkdir(db_subdir, pg_dir_create_mode) != 0) - pg_fatal("could not create subdirectory \"%s\": %m", db_subdir); - - snprintf(map_file_path, MAXPGPATH, "%s/map.dat", filename); - - /* Create a map file (to store dboid and dbname) */ - map_file = fopen(map_file_path, PG_BINARY_W); - if (!map_file) - pg_fatal("could not open map file: %s", strerror(errno)); - } - for (i = 0; i < PQntuples(res); i++) { char *dbname = PQgetvalue(res, i, 0); - char *oid = PQgetvalue(res, i, 1); - const char *create_opts = ""; + const char *create_opts; int ret; /* Skip template0, even if it's not marked !datallowconn. */ @@ -1687,27 +1608,9 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) continue; } - /* - * If this is not a plain format dump, then append dboid and dbname to - * the map.dat file. - */ - if (archDumpFormat != archNull) - { - if (archDumpFormat == archCustom) - snprintf(dbfilepath, MAXPGPATH, "\"%s\"/\"%s\".dmp", db_subdir, oid); - else if (archDumpFormat == archTar) - snprintf(dbfilepath, MAXPGPATH, "\"%s\"/\"%s\".tar", db_subdir, oid); - else - snprintf(dbfilepath, MAXPGPATH, "\"%s\"/\"%s\"", db_subdir, oid); - - /* Put one line entry for dboid and dbname in map file. 
*/ - fprintf(map_file, "%s %s\n", oid, dbname); - } - pg_log_info("dumping database \"%s\"", dbname); - if (archDumpFormat == archNull) - fprintf(OPF, "--\n-- Database \"%s\" dump\n--\n\n", dbname); + fprintf(OPF, "--\n-- Database \"%s\" dump\n--\n\n", dbname); /* * We assume that "template1" and "postgres" already exist in the @@ -1721,9 +1624,12 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) { if (output_clean) create_opts = "--clean --create"; - /* Since pg_dump won't emit a \connect command, we must */ - else if (archDumpFormat == archNull) + else + { + create_opts = ""; + /* Since pg_dump won't emit a \connect command, we must */ fprintf(OPF, "\\connect %s\n\n", dbname); + } } else create_opts = "--create"; @@ -1731,30 +1637,19 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) if (filename) fclose(OPF); - ret = runPgDump(dbname, create_opts, dbfilepath, archDumpFormat); + ret = runPgDump(dbname, create_opts); if (ret != 0) pg_fatal("pg_dump failed on database \"%s\", exiting", dbname); if (filename) { - char global_path[MAXPGPATH]; - - if (archDumpFormat != archNull) - snprintf(global_path, MAXPGPATH, "%s/global.dat", filename); - else - snprintf(global_path, MAXPGPATH, "%s", filename); - - OPF = fopen(global_path, PG_BINARY_A); + OPF = fopen(filename, PG_BINARY_A); if (!OPF) pg_fatal("could not re-open the output file \"%s\": %m", - global_path); + filename); } } - /* Close map file */ - if (archDumpFormat != archNull) - fclose(map_file); - PQclear(res); } @@ -1764,8 +1659,7 @@ dumpDatabases(PGconn *conn, ArchiveFormat archDumpFormat) * Run pg_dump on dbname, with specified options. */ static int -runPgDump(const char *dbname, const char *create_opts, char *dbfile, - ArchiveFormat archDumpFormat) +runPgDump(const char *dbname, const char *create_opts) { PQExpBufferData connstrbuf; PQExpBufferData cmd; @@ -1774,36 +1668,17 @@ runPgDump(const char *dbname, const char *create_opts, char *dbfile, initPQExpBuffer(&connstrbuf); initPQExpBuffer(&cmd); + printfPQExpBuffer(&cmd, "\"%s\" %s %s", pg_dump_bin, + pgdumpopts->data, create_opts); + /* - * If this is not a plain format dump, then append file name and dump - * format to the pg_dump command to get archive dump. + * If we have a filename, use the undocumented plain-append pg_dump + * format. */ - if (archDumpFormat != archNull) - { - printfPQExpBuffer(&cmd, "\"%s\" -f %s %s", pg_dump_bin, - dbfile, create_opts); - - if (archDumpFormat == archDirectory) - appendPQExpBufferStr(&cmd, " --format=directory "); - else if (archDumpFormat == archCustom) - appendPQExpBufferStr(&cmd, " --format=custom "); - else if (archDumpFormat == archTar) - appendPQExpBufferStr(&cmd, " --format=tar "); - } + if (filename) + appendPQExpBufferStr(&cmd, " -Fa "); else - { - printfPQExpBuffer(&cmd, "\"%s\" %s %s", pg_dump_bin, - pgdumpopts->data, create_opts); - - /* - * If we have a filename, use the undocumented plain-append pg_dump - * format. - */ - if (filename) - appendPQExpBufferStr(&cmd, " -Fa "); - else - appendPQExpBufferStr(&cmd, " -Fp "); - } + appendPQExpBufferStr(&cmd, " -Fp "); /* * Append the database name to the already-constructed stem of connection @@ -1948,36 +1823,3 @@ read_dumpall_filters(const char *filename, SimpleStringList *pattern) filter_free(&fstate); } - -/* - * parseDumpFormat - * - * This will validate dump formats. 
- */ -static ArchiveFormat -parseDumpFormat(const char *format) -{ - ArchiveFormat archDumpFormat; - - if (pg_strcasecmp(format, "c") == 0) - archDumpFormat = archCustom; - else if (pg_strcasecmp(format, "custom") == 0) - archDumpFormat = archCustom; - else if (pg_strcasecmp(format, "d") == 0) - archDumpFormat = archDirectory; - else if (pg_strcasecmp(format, "directory") == 0) - archDumpFormat = archDirectory; - else if (pg_strcasecmp(format, "p") == 0) - archDumpFormat = archNull; - else if (pg_strcasecmp(format, "plain") == 0) - archDumpFormat = archNull; - else if (pg_strcasecmp(format, "t") == 0) - archDumpFormat = archTar; - else if (pg_strcasecmp(format, "tar") == 0) - archDumpFormat = archTar; - else - pg_fatal("unrecognized archive format \"%s\"; please specify \"c\", \"d\", \"p\", or \"t\"", - format); - - return archDumpFormat; -} diff --git a/src/bin/pg_dump/pg_restore.c b/src/bin/pg_dump/pg_restore.c index f2182e91825..6c129278bc5 100644 --- a/src/bin/pg_dump/pg_restore.c +++ b/src/bin/pg_dump/pg_restore.c @@ -2,7 +2,7 @@ * * pg_restore.c * pg_restore is an utility extracting postgres database definitions - * from a backup archive created by pg_dump/pg_dumpall using the archiver + * from a backup archive created by pg_dump using the archiver * interface. * * pg_restore will read the backup archive and @@ -41,15 +41,11 @@ #include "postgres_fe.h" #include <ctype.h> -#include <sys/stat.h> #ifdef HAVE_TERMIOS_H #include <termios.h> #endif -#include "common/string.h" -#include "connectdb.h" #include "fe_utils/option_utils.h" -#include "fe_utils/string_utils.h" #include "filter.h" #include "getopt_long.h" #include "parallel.h" @@ -57,43 +53,18 @@ static void usage(const char *progname); static void read_restore_filters(const char *filename, RestoreOptions *opts); -static bool file_exists_in_directory(const char *dir, const char *filename); -static int restore_one_database(const char *inputFileSpec, RestoreOptions *opts, - int numWorkers, bool append_data, int num); -static int read_one_statement(StringInfo inBuf, FILE *pfile); -static int restore_all_databases(PGconn *conn, const char *dumpdirpath, - SimpleStringList db_exclude_patterns, RestoreOptions *opts, int numWorkers); -static int process_global_sql_commands(PGconn *conn, const char *dumpdirpath, - const char *outfile); -static void copy_or_print_global_file(const char *outfile, FILE *pfile); -static int get_dbnames_list_to_restore(PGconn *conn, - SimplePtrList *dbname_oid_list, - SimpleStringList db_exclude_patterns); -static int get_dbname_oid_list_from_mfile(const char *dumpdirpath, - SimplePtrList *dbname_oid_list); - -/* - * Stores a database OID and the corresponding name. 
- */ -typedef struct DbOidName -{ - Oid oid; - char str[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated string here */ -} DbOidName; - int main(int argc, char **argv) { RestoreOptions *opts; int c; + int exit_code; int numWorkers = 1; + Archive *AH; char *inputFileSpec; bool data_only = false; bool schema_only = false; - int n_errors = 0; - bool globals_only = false; - SimpleStringList db_exclude_patterns = {NULL, NULL}; static int disable_triggers = 0; static int enable_row_security = 0; static int if_exists = 0; @@ -111,15 +82,12 @@ main(int argc, char **argv) static int no_subscriptions = 0; static int strict_names = 0; static int statistics_only = 0; - static int with_data = 0; - static int with_schema = 0; static int with_statistics = 0; struct option cmdopts[] = { {"clean", 0, NULL, 'c'}, {"create", 0, NULL, 'C'}, {"data-only", 0, NULL, 'a'}, - {"globals-only", 0, NULL, 'g'}, {"dbname", 1, NULL, 'd'}, {"exit-on-error", 0, NULL, 'e'}, {"exclude-schema", 1, NULL, 'N'}, @@ -169,12 +137,9 @@ main(int argc, char **argv) {"no-security-labels", no_argument, &no_security_labels, 1}, {"no-subscriptions", no_argument, &no_subscriptions, 1}, {"no-statistics", no_argument, &no_statistics, 1}, - {"with-data", no_argument, &with_data, 1}, - {"with-schema", no_argument, &with_schema, 1}, - {"with-statistics", no_argument, &with_statistics, 1}, + {"statistics", no_argument, &with_statistics, 1}, {"statistics-only", no_argument, &statistics_only, 1}, {"filter", required_argument, NULL, 4}, - {"exclude-database", required_argument, NULL, 6}, {NULL, 0, NULL, 0} }; @@ -203,7 +168,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "acCd:ef:F:gh:I:j:lL:n:N:Op:P:RsS:t:T:U:vwWx1", + while ((c = getopt_long(argc, argv, "acCd:ef:F:h:I:j:lL:n:N:Op:P:RsS:t:T:U:vwWx1", cmdopts, NULL)) != -1) { switch (c) @@ -230,14 +195,11 @@ main(int argc, char **argv) if (strlen(optarg) != 0) opts->formatName = pg_strdup(optarg); break; - case 'g': - /* restore only global.dat file from directory */ - globals_only = true; - break; case 'h': if (strlen(optarg) != 0) opts->cparams.pghost = pg_strdup(optarg); break; + case 'j': /* number of restore jobs */ if (!option_parse_int(optarg, "-j/--jobs", 1, PG_MAX_JOBS, @@ -352,9 +314,6 @@ main(int argc, char **argv) exit(1); opts->exit_on_error = true; break; - case 6: /* database patterns to skip */ - simple_string_list_append(&db_exclude_patterns, optarg); - break; default: /* getopt_long already emitted a complaint */ @@ -382,13 +341,6 @@ main(int argc, char **argv) if (!opts->cparams.dbname && !opts->filename && !opts->tocSummary) pg_fatal("one of -d/--dbname and -f/--file must be specified"); - if (db_exclude_patterns.head != NULL && globals_only) - { - pg_log_error("option --exclude-database cannot be used together with -g/--globals-only"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - exit_nicely(1); - } - /* Should get at most one of -d and -f, else user is confused */ if (opts->cparams.dbname) { @@ -417,13 +369,17 @@ main(int argc, char **argv) if (statistics_only && no_statistics) pg_fatal("options --statistics-only and --no-statistics cannot be used together"); - /* reject conflicting "with-" and "no-" options */ - if (with_data && no_data) - pg_fatal("options --with-data and --no-data cannot be used together"); - if (with_schema && no_schema) - pg_fatal("options --with-schema and --no-schema cannot be used together"); + /* reject conflicting "no-" options */ if (with_statistics && no_statistics) - pg_fatal("options 
--with-statistics and --no-statistics cannot be used together"); + pg_fatal("options --statistics and --no-statistics cannot be used together"); + + /* reject conflicting "only-" options */ + if (data_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-a/--data-only", "--statistics"); + if (schema_only && with_statistics) + pg_fatal("options %s and %s cannot be used together", + "-s/--schema-only", "--statistics"); if (data_only && opts->dropSchema) pg_fatal("options -c/--clean and -a/--data-only cannot be used together"); @@ -443,16 +399,14 @@ main(int argc, char **argv) pg_fatal("cannot specify both --single-transaction and multiple jobs"); /* - * Set derivative flags. An "-only" option may be overridden by an - * explicit "with-" option; e.g. "--schema-only --with-statistics" will - * include schema and statistics. Other ambiguous or nonsensical - * combinations, e.g. "--schema-only --no-schema", will have already - * caused an error in one of the checks above. + * Set derivative flags. Ambiguous or nonsensical combinations, e.g. + * "--schema-only --no-schema", will have already caused an error in one + * of the checks above. */ opts->dumpData = ((opts->dumpData && !schema_only && !statistics_only) || - (data_only || with_data)) && !no_data; + data_only) && !no_data; opts->dumpSchema = ((opts->dumpSchema && !data_only && !statistics_only) || - (schema_only || with_schema)) && !no_schema; + schema_only) && !no_schema; opts->dumpStatistics = ((opts->dumpStatistics && !schema_only && !data_only) || (statistics_only || with_statistics)) && !no_statistics; @@ -496,114 +450,6 @@ main(int argc, char **argv) opts->formatName); } - /* - * If toc.dat file is not present in the current path, then check for - * global.dat. If global.dat file is present, then restore all the - * databases from map.dat (if it exists), but skip restoring those - * matching --exclude-database patterns. - */ - if (inputFileSpec != NULL && !file_exists_in_directory(inputFileSpec, "toc.dat") && - file_exists_in_directory(inputFileSpec, "global.dat")) - { - PGconn *conn = NULL; /* Connection to restore global sql - * commands. */ - - /* - * Can only use --list or --use-list options with a single database - * dump. - */ - if (opts->tocSummary) - pg_fatal("option -l/--list cannot be used when restoring an archive created by pg_dumpall"); - else if (opts->tocFile) - pg_fatal("option -L/--use-list cannot be used when restoring an archive created by pg_dumpall"); - - /* - * To restore from a pg_dumpall archive, -C (create database) option - * must be specified unless we are only restoring globals. - */ - if (!globals_only && opts->createDB != 1) - { - pg_log_error("-C/--create option should be specified when restoring an archive created by pg_dumpall"); - pg_log_error_hint("Try \"%s --help\" for more information.", progname); - pg_log_error_hint("Individual databases can be restored using their specific archives."); - exit_nicely(1); - } - - /* - * Connect to the database to execute global sql commands from - * global.dat file. - */ - if (opts->cparams.dbname) - { - conn = ConnectDatabase(opts->cparams.dbname, NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - - - if (!conn) - pg_fatal("could not connect to database \"%s\"", opts->cparams.dbname); - } - - /* If globals-only, then return from here. */ - if (globals_only) - { - /* - * Open global.dat file and execute/append all the global sql - * commands. 
- */ - n_errors = process_global_sql_commands(conn, inputFileSpec, - opts->filename); - - if (conn) - PQfinish(conn); - - pg_log_info("database restoring skipped as -g/--globals-only option was specified"); - } - else - { - /* Now restore all the databases from map.dat */ - n_errors = restore_all_databases(conn, inputFileSpec, db_exclude_patterns, - opts, numWorkers); - } - - /* Free db pattern list. */ - simple_string_list_destroy(&db_exclude_patterns); - } - else /* process if global.dat file does not exist. */ - { - if (db_exclude_patterns.head != NULL) - pg_fatal("option --exclude-database can be used only when restoring an archive created by pg_dumpall"); - - if (globals_only) - pg_fatal("option -g/--globals-only can be used only when restoring an archive created by pg_dumpall"); - - n_errors = restore_one_database(inputFileSpec, opts, numWorkers, false, 0); - } - - /* Done, print a summary of ignored errors during restore. */ - if (n_errors) - { - pg_log_warning("errors ignored on restore: %d", n_errors); - return 1; - } - - return 0; -} - -/* - * restore_one_database - * - * This will restore one database using toc.dat file. - * - * returns the number of errors while doing restore. - */ -static int -restore_one_database(const char *inputFileSpec, RestoreOptions *opts, - int numWorkers, bool append_data, int num) -{ - Archive *AH; - int n_errors; - AH = OpenArchive(inputFileSpec, opts->format); SetArchiveOptions(AH, NULL, opts); @@ -611,15 +457,9 @@ restore_one_database(const char *inputFileSpec, RestoreOptions *opts, /* * We don't have a connection yet but that doesn't matter. The connection * is initialized to NULL and if we terminate through exit_nicely() while - * it's still NULL, the cleanup function will just be a no-op. If we are - * restoring multiple databases, then only update AX handle for cleanup as - * the previous entry was already in the array and we had closed previous - * connection, so we can use the same array slot. + * it's still NULL, the cleanup function will just be a no-op. */ - if (!append_data || num == 0) - on_exit_close_archive(AH); - else - replace_on_exit_close_archive(AH); + on_exit_close_archive(AH); /* Let the archiver know how noisy to be */ AH->verbose = opts->verbose; @@ -639,21 +479,25 @@ restore_one_database(const char *inputFileSpec, RestoreOptions *opts, else { ProcessArchiveRestoreOptions(AH); - RestoreArchive(AH, append_data); + RestoreArchive(AH); } - n_errors = AH->n_errors; + /* done, print a summary of ignored errors */ + if (AH->n_errors) + pg_log_warning("errors ignored on restore: %d", AH->n_errors); /* AH may be freed in CloseArchive? */ + exit_code = AH->n_errors ? 1 : 0; + CloseArchive(AH); - return n_errors; + return exit_code; } static void usage(const char *progname) { - printf(_("%s restores PostgreSQL databases from archives created by pg_dump or pg_dumpall.\n\n"), progname); + printf(_("%s restores a PostgreSQL database from an archive created by pg_dump.\n\n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTION]... 
[FILE]\n"), progname); @@ -671,7 +515,6 @@ usage(const char *progname) printf(_(" -c, --clean clean (drop) database objects before recreating\n")); printf(_(" -C, --create create the target database\n")); printf(_(" -e, --exit-on-error exit on error, default is to continue\n")); - printf(_(" -g, --globals-only restore only global objects, no databases\n")); printf(_(" -I, --index=NAME restore named index\n")); printf(_(" -j, --jobs=NUM use this many parallel jobs to restore\n")); printf(_(" -L, --use-list=FILENAME use table of contents from this file for\n" @@ -688,7 +531,6 @@ usage(const char *progname) printf(_(" -1, --single-transaction restore as a single transaction\n")); printf(_(" --disable-triggers disable triggers during data-only restore\n")); printf(_(" --enable-row-security enable row security\n")); - printf(_(" --exclude-database=PATTERN do not restore the specified database(s)\n")); printf(_(" --filter=FILENAME restore or skip objects based on expressions\n" " in FILENAME\n")); printf(_(" --if-exists use IF EXISTS when dropping objects\n")); @@ -705,6 +547,7 @@ usage(const char *progname) printf(_(" --no-table-access-method do not restore table access methods\n")); printf(_(" --no-tablespaces do not restore tablespace assignments\n")); printf(_(" --section=SECTION restore named section (pre-data, data, or post-data)\n")); + printf(_(" --statistics restore the statistics\n")); printf(_(" --statistics-only restore only the statistics, not schema or data\n")); printf(_(" --strict-names require table and/or schema include patterns to\n" " match at least one entity each\n")); @@ -712,9 +555,6 @@ usage(const char *progname) printf(_(" --use-set-session-authorization\n" " use SET SESSION AUTHORIZATION commands instead of\n" " ALTER OWNER commands to set ownership\n")); - printf(_(" --with-data dump the data\n")); - printf(_(" --with-schema dump the schema\n")); - printf(_(" --with-statistics dump the statistics\n")); printf(_("\nConnection options:\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); @@ -725,8 +565,8 @@ usage(const char *progname) printf(_(" --role=ROLENAME do SET ROLE before restore\n")); printf(_("\n" - "The options -I, -n, -N, -P, -t, -T, --section, and --exclude-database can be combined\n" - "and specified multiple times to select multiple objects.\n")); + "The options -I, -n, -N, -P, -t, -T, and --section can be combined and specified\n" + "multiple times to select multiple objects.\n")); printf(_("\nIf no input file name is supplied, then standard input is used.\n\n")); printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT); printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); @@ -831,578 +671,3 @@ read_restore_filters(const char *filename, RestoreOptions *opts) filter_free(&fstate); } - -/* - * file_exists_in_directory - * - * Returns true if the file exists in the given directory. - */ -static bool -file_exists_in_directory(const char *dir, const char *filename) -{ - struct stat st; - char buf[MAXPGPATH]; - - if (snprintf(buf, MAXPGPATH, "%s/%s", dir, filename) >= MAXPGPATH) - pg_fatal("directory name too long: \"%s\"", dir); - - return (stat(buf, &st) == 0 && S_ISREG(st.st_mode)); -} - -/* - * read_one_statement - * - * This will start reading from passed file pointer using fgetc and read till - * semicolon(sql statement terminator for global.dat file) - * - * EOF is returned if end-of-file input is seen; time to shut down. 
- */ - -static int -read_one_statement(StringInfo inBuf, FILE *pfile) -{ - int c; /* character read from getc() */ - int m; - - StringInfoData q; - - initStringInfo(&q); - - resetStringInfo(inBuf); - - /* - * Read characters until EOF or the appropriate delimiter is seen. - */ - while ((c = fgetc(pfile)) != EOF) - { - if (c != '\'' && c != '"' && c != '\n' && c != ';') - { - appendStringInfoChar(inBuf, (char) c); - while ((c = fgetc(pfile)) != EOF) - { - if (c != '\'' && c != '"' && c != ';' && c != '\n') - appendStringInfoChar(inBuf, (char) c); - else - break; - } - } - - if (c == '\'' || c == '"') - { - appendStringInfoChar(&q, (char) c); - m = c; - - while ((c = fgetc(pfile)) != EOF) - { - appendStringInfoChar(&q, (char) c); - - if (c == m) - { - appendStringInfoString(inBuf, q.data); - resetStringInfo(&q); - break; - } - } - } - - if (c == ';') - { - appendStringInfoChar(inBuf, (char) ';'); - break; - } - - if (c == '\n') - appendStringInfoChar(inBuf, (char) '\n'); - } - - pg_free(q.data); - - /* No input before EOF signal means time to quit. */ - if (c == EOF && inBuf->len == 0) - return EOF; - - /* return something that's not EOF */ - return 'Q'; -} - -/* - * get_dbnames_list_to_restore - * - * This will mark for skipping any entries from dbname_oid_list that pattern match an - * entry in the db_exclude_patterns list. - * - * Returns the number of database to be restored. - * - */ -static int -get_dbnames_list_to_restore(PGconn *conn, - SimplePtrList *dbname_oid_list, - SimpleStringList db_exclude_patterns) -{ - int count_db = 0; - PQExpBuffer query; - PGresult *res; - - query = createPQExpBuffer(); - - if (!conn) - pg_log_info("considering PATTERN as NAME for --exclude-database option as no db connection while doing pg_restore."); - - /* - * Process one by one all dbnames and if specified to skip restoring, then - * remove dbname from list. 
- */ - for (SimplePtrListCell *db_cell = dbname_oid_list->head; - db_cell; db_cell = db_cell->next) - { - DbOidName *dbidname = (DbOidName *) db_cell->ptr; - bool skip_db_restore = false; - PQExpBuffer db_lit = createPQExpBuffer(); - - appendStringLiteralConn(db_lit, dbidname->str, conn); - - for (SimpleStringListCell *pat_cell = db_exclude_patterns.head; pat_cell; pat_cell = pat_cell->next) - { - /* - * If there is an exact match then we don't need to try a pattern - * match - */ - if (pg_strcasecmp(dbidname->str, pat_cell->val) == 0) - skip_db_restore = true; - /* Otherwise, try a pattern match if there is a connection */ - else if (conn) - { - int dotcnt; - - appendPQExpBufferStr(query, "SELECT 1 "); - processSQLNamePattern(conn, query, pat_cell->val, false, - false, NULL, db_lit->data, - NULL, NULL, NULL, &dotcnt); - - if (dotcnt > 0) - { - pg_log_error("improper qualified name (too many dotted names): %s", - dbidname->str); - PQfinish(conn); - exit_nicely(1); - } - - res = executeQuery(conn, query->data); - - if ((PQresultStatus(res) == PGRES_TUPLES_OK) && PQntuples(res)) - { - skip_db_restore = true; - pg_log_info("database \"%s\" matches exclude pattern: \"%s\"", dbidname->str, pat_cell->val); - } - - PQclear(res); - resetPQExpBuffer(query); - } - - if (skip_db_restore) - break; - } - - destroyPQExpBuffer(db_lit); - - /* - * Mark db to be skipped or increment the counter of dbs to be - * restored - */ - if (skip_db_restore) - { - pg_log_info("excluding database \"%s\"", dbidname->str); - dbidname->oid = InvalidOid; - } - else - { - count_db++; - } - } - - destroyPQExpBuffer(query); - - return count_db; -} - -/* - * get_dbname_oid_list_from_mfile - * - * Open map.dat file and read line by line and then prepare a list of database - * names and corresponding db_oid. - * - * Returns, total number of database names in map.dat file. - */ -static int -get_dbname_oid_list_from_mfile(const char *dumpdirpath, SimplePtrList *dbname_oid_list) -{ - StringInfoData linebuf; - FILE *pfile; - char map_file_path[MAXPGPATH]; - int count = 0; - - - /* - * If there is only global.dat file in dump, then return from here as - * there is no database to restore. - */ - if (!file_exists_in_directory(dumpdirpath, "map.dat")) - { - pg_log_info("database restoring is skipped as \"map.dat\" is not present in \"%s\"", dumpdirpath); - return 0; - } - - snprintf(map_file_path, MAXPGPATH, "%s/map.dat", dumpdirpath); - - /* Open map.dat file. */ - pfile = fopen(map_file_path, PG_BINARY_R); - - if (pfile == NULL) - pg_fatal("could not open \"%s\": %m", map_file_path); - - initStringInfo(&linebuf); - - /* Append all the dbname/db_oid combinations to the list. */ - while (pg_get_line_buf(pfile, &linebuf)) - { - Oid db_oid = InvalidOid; - char *dbname; - DbOidName *dbidname; - int namelen; - char *p = linebuf.data; - - /* Extract dboid. */ - while (isdigit((unsigned char) *p)) - p++; - if (p > linebuf.data && *p == ' ') - { - sscanf(linebuf.data, "%u", &db_oid); - p++; - } - - /* dbname is the rest of the line */ - dbname = p; - namelen = strlen(dbname); - - /* Report error and exit if the file has any corrupted data. 
*/ - if (!OidIsValid(db_oid) || namelen <= 1) - pg_fatal("invalid entry in \"%s\" at line: %d", map_file_path, - count + 1); - - pg_log_info("found database \"%s\" (OID: %u) in \"%s\"", - dbname, db_oid, map_file_path); - - dbidname = pg_malloc(offsetof(DbOidName, str) + namelen + 1); - dbidname->oid = db_oid; - strlcpy(dbidname->str, dbname, namelen); - - simple_ptr_list_append(dbname_oid_list, dbidname); - count++; - } - - /* Close map.dat file. */ - fclose(pfile); - - return count; -} - -/* - * restore_all_databases - * - * This will restore databases those dumps are present in - * directory based on map.dat file mapping. - * - * This will skip restoring for databases that are specified with - * exclude-database option. - * - * returns, number of errors while doing restore. - */ -static int -restore_all_databases(PGconn *conn, const char *dumpdirpath, - SimpleStringList db_exclude_patterns, RestoreOptions *opts, - int numWorkers) -{ - SimplePtrList dbname_oid_list = {NULL, NULL}; - int num_db_restore = 0; - int num_total_db; - int n_errors_total; - int count = 0; - char *connected_db = NULL; - bool dumpData = opts->dumpData; - bool dumpSchema = opts->dumpSchema; - bool dumpStatistics = opts->dumpSchema; - - /* Save db name to reuse it for all the database. */ - if (opts->cparams.dbname) - connected_db = opts->cparams.dbname; - - num_total_db = get_dbname_oid_list_from_mfile(dumpdirpath, &dbname_oid_list); - - /* If map.dat has no entries, return after processing global.dat */ - if (dbname_oid_list.head == NULL) - return process_global_sql_commands(conn, dumpdirpath, opts->filename); - - pg_log_info("found %d database names in \"map.dat\"", num_total_db); - - if (!conn) - { - pg_log_info("trying to connect database \"postgres\""); - - conn = ConnectDatabase("postgres", NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - - /* Try with template1. */ - if (!conn) - { - pg_log_info("trying to connect database \"template1\""); - - conn = ConnectDatabase("template1", NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - } - } - - /* - * filter the db list according to the exclude patterns - */ - num_db_restore = get_dbnames_list_to_restore(conn, &dbname_oid_list, - db_exclude_patterns); - - /* Open global.dat file and execute/append all the global sql commands. */ - n_errors_total = process_global_sql_commands(conn, dumpdirpath, opts->filename); - - /* Close the db connection as we are done with globals and patterns. */ - if (conn) - PQfinish(conn); - - /* Exit if no db needs to be restored. */ - if (dbname_oid_list.head == NULL || num_db_restore == 0) - { - pg_log_info("no database needs to restore out of %d databases", num_total_db); - return n_errors_total; - } - - pg_log_info("need to restore %d databases out of %d databases", num_db_restore, num_total_db); - - /* - * We have a list of databases to restore after processing the - * exclude-database switch(es). Now we can restore them one by one. 
- */ - for (SimplePtrListCell *db_cell = dbname_oid_list.head; - db_cell; db_cell = db_cell->next) - { - DbOidName *dbidname = (DbOidName *) db_cell->ptr; - char subdirpath[MAXPGPATH]; - char subdirdbpath[MAXPGPATH]; - char dbfilename[MAXPGPATH]; - int n_errors; - - /* ignore dbs marked for skipping */ - if (dbidname->oid == InvalidOid) - continue; - - /* - * We need to reset override_dbname so that objects can be restored - * into an already created database. (used with -d/--dbname option) - */ - if (opts->cparams.override_dbname) - { - pfree(opts->cparams.override_dbname); - opts->cparams.override_dbname = NULL; - } - - snprintf(subdirdbpath, MAXPGPATH, "%s/databases", dumpdirpath); - - /* - * Look for the database dump file/dir. If there is an {oid}.tar or - * {oid}.dmp file, use it. Otherwise try to use a directory called - * {oid} - */ - snprintf(dbfilename, MAXPGPATH, "%u.tar", dbidname->oid); - if (file_exists_in_directory(subdirdbpath, dbfilename)) - snprintf(subdirpath, MAXPGPATH, "%s/databases/%u.tar", dumpdirpath, dbidname->oid); - else - { - snprintf(dbfilename, MAXPGPATH, "%u.dmp", dbidname->oid); - - if (file_exists_in_directory(subdirdbpath, dbfilename)) - snprintf(subdirpath, MAXPGPATH, "%s/databases/%u.dmp", dumpdirpath, dbidname->oid); - else - snprintf(subdirpath, MAXPGPATH, "%s/databases/%u", dumpdirpath, dbidname->oid); - } - - pg_log_info("restoring database \"%s\"", dbidname->str); - - /* If database is already created, then don't set createDB flag. */ - if (opts->cparams.dbname) - { - PGconn *test_conn; - - test_conn = ConnectDatabase(dbidname->str, NULL, opts->cparams.pghost, - opts->cparams.pgport, opts->cparams.username, TRI_DEFAULT, - false, progname, NULL, NULL, NULL, NULL); - if (test_conn) - { - PQfinish(test_conn); - - /* Use already created database for connection. */ - opts->createDB = 0; - opts->cparams.dbname = dbidname->str; - } - else - { - /* we'll have to create it */ - opts->createDB = 1; - opts->cparams.dbname = connected_db; - } - } - - /* - * Reset flags - might have been reset in pg_backup_archiver.c by the - * previous restore. - */ - opts->dumpData = dumpData; - opts->dumpSchema = dumpSchema; - opts->dumpStatistics = dumpStatistics; - - /* Restore the single database. */ - n_errors = restore_one_database(subdirpath, opts, numWorkers, true, count); - - /* Print a summary of ignored errors during single database restore. */ - if (n_errors) - { - n_errors_total += n_errors; - pg_log_warning("errors ignored on database \"%s\" restore: %d", dbidname->str, n_errors); - } - - count++; - } - - /* Log number of processed databases. */ - pg_log_info("number of restored databases is %d", num_db_restore); - - /* Free dbname and dboid list. */ - simple_ptr_list_destroy(&dbname_oid_list); - - return n_errors_total; -} - -/* - * process_global_sql_commands - * - * Open global.dat and execute or copy the sql commands one by one. - * - * If outfile is not NULL, copy all sql commands into outfile rather than - * executing them. - * - * Returns the number of errors while processing global.dat - */ -static int -process_global_sql_commands(PGconn *conn, const char *dumpdirpath, const char *outfile) -{ - char global_file_path[MAXPGPATH]; - PGresult *result; - StringInfoData sqlstatement, - user_create; - FILE *pfile; - int n_errors = 0; - - snprintf(global_file_path, MAXPGPATH, "%s/global.dat", dumpdirpath); - - /* Open global.dat file. 
*/ - pfile = fopen(global_file_path, PG_BINARY_R); - - if (pfile == NULL) - pg_fatal("could not open \"%s\": %m", global_file_path); - - /* - * If outfile is given, just copy all of the global.dat data into - * outfile. - */ - if (outfile) - { - copy_or_print_global_file(outfile, pfile); - return 0; - } - - /* Initialize sqlstatement, which accumulates commands. */ - initStringInfo(&sqlstatement); - - /* creation statement for our current role */ - initStringInfo(&user_create); - appendStringInfoString(&user_create, "CREATE ROLE "); - /* should use fmtId here, but we don't know the encoding */ - appendStringInfoString(&user_create, PQuser(conn)); - appendStringInfoChar(&user_create, ';'); - - /* Process the file until EOF, executing SQL statements. */ - while (read_one_statement(&sqlstatement, pfile) != EOF) - { - /* don't try to create the role we are connected as */ - if (strstr(sqlstatement.data, user_create.data)) - continue; - - pg_log_info("executing query: %s", sqlstatement.data); - result = PQexec(conn, sqlstatement.data); - - switch (PQresultStatus(result)) - { - case PGRES_COMMAND_OK: - case PGRES_TUPLES_OK: - case PGRES_EMPTY_QUERY: - break; - default: - n_errors++; - pg_log_error("could not execute query: \"%s\"\nCommand was: \"%s\"", PQerrorMessage(conn), sqlstatement.data); - } - PQclear(result); - } - - /* Print a summary of errors ignored while processing global.dat. */ - if (n_errors) - pg_log_warning("ignored %d errors in \"%s\"", n_errors, global_file_path); - - fclose(pfile); - - return n_errors; -} - -/* - * copy_or_print_global_file - * - * Copy global.dat into the output file. If "-" is given as outfile, - * print the commands to stdout instead. - */ -static void -copy_or_print_global_file(const char *outfile, FILE *pfile) -{ - char out_file_path[MAXPGPATH]; - FILE *OPF; - int c; - - /* "-" is used for stdout. */ - if (strcmp(outfile, "-") == 0) - OPF = stdout; - else - { - snprintf(out_file_path, MAXPGPATH, "%s", outfile); - OPF = fopen(out_file_path, PG_BINARY_W); - - if (OPF == NULL) - { - fclose(pfile); - pg_fatal("could not open file \"%s\"", outfile); - } - } - - /* Copy global.dat into the output file or print it to stdout. */ - while ((c = fgetc(pfile)) != EOF) - fputc(c, OPF); - - fclose(pfile); - - /* Close the output file. 
*/ - if (strcmp(outfile, "-") != 0) - fclose(OPF); -} diff --git a/src/bin/pg_dump/t/001_basic.pl b/src/bin/pg_dump/t/001_basic.pl index 84ca25e17d6..37d893d5e6a 100644 --- a/src/bin/pg_dump/t/001_basic.pl +++ b/src/bin/pg_dump/t/001_basic.pl @@ -237,21 +237,6 @@ command_fails_like( 'pg_restore: options -C\/--create and -1\/--single-transaction cannot be used together' ); -command_fails_like( - [ 'pg_restore', '--exclude-database=foo', '--globals-only', '-d', 'xxx' ], - qr/\Qpg_restore: error: option --exclude-database cannot be used together with -g\/--globals-only\E/, - 'pg_restore: option --exclude-database cannot be used together with -g/--globals-only'); - -command_fails_like( - [ 'pg_restore', '--exclude-database=foo', '-d', 'xxx', 'dumpdir' ], - qr/\Qpg_restore: error: option --exclude-database can be used only when restoring an archive created by pg_dumpall\E/, - 'When option --exclude-database is used in pg_restore with dump of pg_dump'); - -command_fails_like( - [ 'pg_restore', '--globals-only', '-d', 'xxx', 'dumpdir' ], - qr/\Qpg_restore: error: option -g\/--globals-only can be used only when restoring an archive created by pg_dumpall\E/, - 'When option --globals-only is not used in pg_restore with dump of pg_dump'); - # also fails for -r and -t, but it seems pointless to add more tests for those. command_fails_like( [ 'pg_dumpall', '--exclude-database=foo', '--globals-only' ], @@ -259,8 +244,4 @@ command_fails_like( 'pg_dumpall: option --exclude-database cannot be used together with -g/--globals-only' ); -command_fails_like( - [ 'pg_dumpall', '--format', 'x' ], - qr/\Qpg_dumpall: error: unrecognized archive format "x";\E/, - 'pg_dumpall: unrecognized archive format'); done_testing(); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 55d892d9c16..a86b38466de 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -68,6 +68,7 @@ my %pgdump_runs = ( '--no-data', '--sequence-data', '--binary-upgrade', + '--statistics', '--dbname' => 'postgres', # alternative way to specify database ], restore_cmd => [ @@ -75,6 +76,7 @@ my %pgdump_runs = ( '--format' => 'custom', '--verbose', '--file' => "$tempdir/binary_upgrade.sql", + '--statistics', "$tempdir/binary_upgrade.dump", ], }, @@ -88,11 +90,13 @@ my %pgdump_runs = ( '--format' => 'custom', '--compress' => '1', '--file' => "$tempdir/compression_gzip_custom.dump", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/compression_gzip_custom.sql", + '--statistics', "$tempdir/compression_gzip_custom.dump", ], command_like => { @@ -115,6 +119,7 @@ my %pgdump_runs = ( '--format' => 'directory', '--compress' => 'gzip:1', '--file' => "$tempdir/compression_gzip_dir", + '--statistics', 'postgres', ], # Give coverage for manually compressed blobs.toc files during @@ -132,6 +137,7 @@ my %pgdump_runs = ( 'pg_restore', '--jobs' => '2', '--file' => "$tempdir/compression_gzip_dir.sql", + '--statistics', "$tempdir/compression_gzip_dir", ], }, @@ -144,6 +150,7 @@ my %pgdump_runs = ( '--format' => 'plain', '--compress' => '1', '--file' => "$tempdir/compression_gzip_plain.sql.gz", + '--statistics', 'postgres', ], # Decompress the generated file to run through the tests. 
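The removed pg_restore code earlier in this patch reads map.dat entries pairing a database OID with its name and stores each pair in a single allocation via a flexible array member. Below is a minimal standalone C sketch of that pattern; the "<oid> <name>" line format, the values, and the use of plain malloc/memcpy in place of pg_malloc/strlcpy are illustrative assumptions, not the patch's exact code.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>

typedef unsigned int Oid;

typedef struct DbOidName
{
    Oid     oid;
    char    str[];          /* flexible array member; PostgreSQL spells
                             * this FLEXIBLE_ARRAY_MEMBER */
} DbOidName;

int
main(void)
{
    const char *line = "16384 mydb";    /* assumed map.dat entry format */
    Oid         db_oid = 0;
    char        dbname[64];
    DbOidName  *dbidname;
    size_t      namelen;

    if (sscanf(line, "%u %63s", &db_oid, dbname) != 2)
        return 1;

    namelen = strlen(dbname);
    /* one allocation holds both the oid and the inline name text */
    dbidname = malloc(offsetof(DbOidName, str) + namelen + 1);
    if (dbidname == NULL)
        return 1;
    dbidname->oid = db_oid;
    memcpy(dbidname->str, dbname, namelen + 1);

    printf("found database \"%s\" (OID: %u)\n", dbidname->str, dbidname->oid);
    free(dbidname);
    return 0;
}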
@@ -162,11 +169,13 @@ my %pgdump_runs = ( '--format' => 'custom', '--compress' => 'lz4', '--file' => "$tempdir/compression_lz4_custom.dump", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/compression_lz4_custom.sql", + '--statistics', "$tempdir/compression_lz4_custom.dump", ], command_like => { @@ -189,6 +198,7 @@ my %pgdump_runs = ( '--format' => 'directory', '--compress' => 'lz4:1', '--file' => "$tempdir/compression_lz4_dir", + '--statistics', 'postgres', ], # Verify that data files were compressed @@ -200,6 +210,7 @@ my %pgdump_runs = ( 'pg_restore', '--jobs' => '2', '--file' => "$tempdir/compression_lz4_dir.sql", + '--statistics', "$tempdir/compression_lz4_dir", ], }, @@ -212,6 +223,7 @@ my %pgdump_runs = ( '--format' => 'plain', '--compress' => 'lz4', '--file' => "$tempdir/compression_lz4_plain.sql.lz4", + '--statistics', 'postgres', ], # Decompress the generated file to run through the tests. @@ -233,11 +245,13 @@ my %pgdump_runs = ( '--format' => 'custom', '--compress' => 'zstd', '--file' => "$tempdir/compression_zstd_custom.dump", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/compression_zstd_custom.sql", + '--statistics', "$tempdir/compression_zstd_custom.dump", ], command_like => { @@ -259,6 +273,7 @@ my %pgdump_runs = ( '--format' => 'directory', '--compress' => 'zstd:1', '--file' => "$tempdir/compression_zstd_dir", + '--statistics', 'postgres', ], # Give coverage for manually compressed blobs.toc files during @@ -279,6 +294,7 @@ my %pgdump_runs = ( 'pg_restore', '--jobs' => '2', '--file' => "$tempdir/compression_zstd_dir.sql", + '--statistics', "$tempdir/compression_zstd_dir", ], }, @@ -292,6 +308,7 @@ my %pgdump_runs = ( '--format' => 'plain', '--compress' => 'zstd:long', '--file' => "$tempdir/compression_zstd_plain.sql.zst", + '--statistics', 'postgres', ], # Decompress the generated file to run through the tests. 
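The compression runs above dump with gzip, lz4, or zstd and then decompress the output to compare it against the expected SQL; the tests drive the command-line tools for this. Purely as a sketch of the same round-trip idea, here is a minimal C program using libzstd's one-shot API (the payload string, buffer sizes, and compression level 1 are arbitrary assumptions):

#include <stdio.h>
#include <string.h>
#include <zstd.h>

int
main(void)
{
    const char  src[] = "SELECT 1;\n";  /* stand-in for dumped SQL */
    char        comp[256];
    char        decomp[256];
    size_t      csz;
    size_t      dsz;

    /* compress at level 1, in the spirit of '--compress' => 'zstd:1' */
    csz = ZSTD_compress(comp, sizeof(comp), src, sizeof(src), 1);
    if (ZSTD_isError(csz))
        return 1;

    dsz = ZSTD_decompress(decomp, sizeof(decomp), comp, csz);
    if (ZSTD_isError(dsz))
        return 1;

    /* exit status 0 means the round trip preserved the dump byte-for-byte */
    return dsz != sizeof(src) || memcmp(src, decomp, dsz) != 0;
}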
@@ -310,6 +327,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/clean.sql", '--clean', + '--statistics', '--dbname' => 'postgres', # alternative way to specify database ], }, @@ -320,6 +338,7 @@ my %pgdump_runs = ( '--clean', '--if-exists', '--encoding' => 'UTF8', # no-op, just for testing + '--statistics', 'postgres', ], }, @@ -338,6 +357,7 @@ my %pgdump_runs = ( '--create', '--no-reconnect', # no-op, just for testing '--verbose', + '--statistics', 'postgres', ], }, @@ -348,7 +368,7 @@ my %pgdump_runs = ( '--data-only', '--superuser' => 'test_superuser', '--disable-triggers', - '--verbose', # no-op, just make sure it works + '--verbose', # no-op, just make sure it works 'postgres', ], }, @@ -356,6 +376,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/defaults.sql", + '--statistics', 'postgres', ], }, @@ -364,6 +385,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/defaults_no_public.sql", + '--statistics', 'regress_pg_dump_test', ], }, @@ -373,6 +395,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--clean', '--file' => "$tempdir/defaults_no_public_clean.sql", + '--statistics', 'regress_pg_dump_test', ], }, @@ -381,6 +404,7 @@ my %pgdump_runs = ( dump_cmd => [ 'pg_dump', '--no-sync', '--file' => "$tempdir/defaults_public_owner.sql", + '--statistics', 'regress_public_owner', ], }, @@ -395,12 +419,14 @@ my %pgdump_runs = ( 'pg_dump', '--format' => 'custom', '--file' => "$tempdir/defaults_custom_format.dump", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--format' => 'custom', '--file' => "$tempdir/defaults_custom_format.sql", + '--statistics', "$tempdir/defaults_custom_format.dump", ], command_like => { @@ -425,12 +451,14 @@ my %pgdump_runs = ( 'pg_dump', '--format' => 'directory', '--file' => "$tempdir/defaults_dir_format", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--format' => 'directory', '--file' => "$tempdir/defaults_dir_format.sql", + '--statistics', "$tempdir/defaults_dir_format", ], command_like => { @@ -456,11 +484,13 @@ my %pgdump_runs = ( '--format' => 'directory', '--jobs' => 2, '--file' => "$tempdir/defaults_parallel", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/defaults_parallel.sql", + '--statistics', "$tempdir/defaults_parallel", ], }, @@ -472,12 +502,14 @@ my %pgdump_runs = ( 'pg_dump', '--format' => 'tar', '--file' => "$tempdir/defaults_tar_format.tar", + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--format' => 'tar', '--file' => "$tempdir/defaults_tar_format.sql", + '--statistics', "$tempdir/defaults_tar_format.tar", ], }, @@ -486,6 +518,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/exclude_dump_test_schema.sql", '--exclude-schema' => 'dump_test', + '--statistics', 'postgres', ], }, @@ -494,6 +527,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/exclude_test_table.sql", '--exclude-table' => 'dump_test.test_table', + '--statistics', 'postgres', ], }, @@ -502,6 +536,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/exclude_measurement.sql", '--exclude-table-and-children' => 'dump_test.measurement', + '--statistics', 'postgres', ], }, @@ -511,6 +546,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/exclude_measurement_data.sql", '--exclude-table-data-and-children' => 'dump_test.measurement', '--no-unlogged-table-data', + '--statistics', 'postgres', ], }, @@ -520,6 +556,7 @@ my %pgdump_runs = ( '--file' => 
"$tempdir/exclude_test_table_data.sql", '--exclude-table-data' => 'dump_test.test_table', '--no-unlogged-table-data', + '--statistics', 'postgres', ], }, @@ -538,6 +575,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/pg_dumpall_globals.sql", '--globals-only', '--no-sync', + '--statistics', ], }, pg_dumpall_globals_clean => { @@ -547,12 +585,14 @@ my %pgdump_runs = ( '--globals-only', '--clean', '--no-sync', + '--statistics', ], }, pg_dumpall_dbprivs => { dump_cmd => [ 'pg_dumpall', '--no-sync', '--file' => "$tempdir/pg_dumpall_dbprivs.sql", + '--statistics', ], }, pg_dumpall_exclude => { @@ -562,6 +602,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/pg_dumpall_exclude.sql", '--exclude-database' => '*dump_test*', '--no-sync', + '--statistics', ], }, no_toast_compression => { @@ -569,6 +610,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_toast_compression.sql", '--no-toast-compression', + '--statistics', 'postgres', ], }, @@ -577,6 +619,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_large_objects.sql", '--no-large-objects', + '--statistics', 'postgres', ], }, @@ -585,6 +628,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_policies.sql", '--no-policies', + '--statistics', 'postgres', ], }, @@ -593,6 +637,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_privs.sql", '--no-privileges', + '--statistics', 'postgres', ], }, @@ -601,6 +646,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_owner.sql", '--no-owner', + '--statistics', 'postgres', ], }, @@ -609,6 +655,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/no_table_access_method.sql", '--no-table-access-method', + '--statistics', 'postgres', ], }, @@ -617,6 +664,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/only_dump_test_schema.sql", '--schema' => 'dump_test', + '--statistics', 'postgres', ], }, @@ -627,6 +675,7 @@ my %pgdump_runs = ( '--table' => 'dump_test.test_table', '--lock-wait-timeout' => (1000 * $PostgreSQL::Test::Utils::timeout_default), + '--statistics', 'postgres', ], }, @@ -637,6 +686,7 @@ my %pgdump_runs = ( '--table-and-children' => 'dump_test.measurement', '--lock-wait-timeout' => (1000 * $PostgreSQL::Test::Utils::timeout_default), + '--statistics', 'postgres', ], }, @@ -646,6 +696,7 @@ my %pgdump_runs = ( '--file' => "$tempdir/role.sql", '--role' => 'regress_dump_test_role', '--schema' => 'dump_test_second_schema', + '--statistics', 'postgres', ], }, @@ -658,11 +709,13 @@ my %pgdump_runs = ( '--file' => "$tempdir/role_parallel", '--role' => 'regress_dump_test_role', '--schema' => 'dump_test_second_schema', + '--statistics', 'postgres', ], restore_cmd => [ 'pg_restore', '--file' => "$tempdir/role_parallel.sql", + '--statistics', "$tempdir/role_parallel", ], }, @@ -691,6 +744,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/section_pre_data.sql", '--section' => 'pre-data', + '--statistics', 'postgres', ], }, @@ -699,6 +753,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/section_data.sql", '--section' => 'data', + '--statistics', 'postgres', ], }, @@ -707,6 +762,7 @@ my %pgdump_runs = ( 'pg_dump', '--no-sync', '--file' => "$tempdir/section_post_data.sql", '--section' => 'post-data', + '--statistics', 'postgres', ], }, @@ -717,6 +773,7 @@ my %pgdump_runs = ( '--schema' => 'dump_test', '--large-objects', '--no-large-objects', + '--statistics', 'postgres', ], }, @@ -732,6 +789,7 @@ my %pgdump_runs = ( 'pg_dump', 
'--no-sync', "--file=$tempdir/no_data_no_schema.sql", '--no-data', '--no-schema', 'postgres', + '--statistics', ], }, statistics_only => { @@ -741,18 +799,11 @@ my %pgdump_runs = ( 'postgres', ], }, - schema_only_with_statistics => { - dump_cmd => [ - 'pg_dump', '--no-sync', - "--file=$tempdir/schema_only_with_statistics.sql", - '--schema-only', '--with-statistics', 'postgres', - ], - }, no_schema => { dump_cmd => [ 'pg_dump', '--no-sync', "--file=$tempdir/no_schema.sql", '--no-schema', - 'postgres', + '--statistics', 'postgres', ], },); @@ -1029,6 +1080,7 @@ my %tests = ( test_schema_plus_large_objects => 1, }, unlike => { + binary_upgrade => 1, no_large_objects => 1, no_owner => 1, schema_only => 1, @@ -1132,7 +1184,9 @@ my %tests = ( ) INHERITS (dump_test.test_table_nn, dump_test.test_table_nn_2); ALTER TABLE dump_test.test_table_nn ADD CONSTRAINT nn NOT NULL col1 NOT VALID; ALTER TABLE dump_test.test_table_nn_chld1 VALIDATE CONSTRAINT nn; - ALTER TABLE dump_test.test_table_nn_chld2 VALIDATE CONSTRAINT nn;', + ALTER TABLE dump_test.test_table_nn_chld2 VALIDATE CONSTRAINT nn; + COMMENT ON CONSTRAINT nn ON dump_test.test_table_nn IS \'nn comment is valid\'; + COMMENT ON CONSTRAINT nn ON dump_test.test_table_nn_chld2 IS \'nn_chld2 comment is valid\';', regexp => qr/^ \QALTER TABLE dump_test.test_table_nn\E \n^\s+ \QADD CONSTRAINT nn NOT NULL col1 NOT VALID;\E @@ -1146,6 +1200,34 @@ my %tests = ( }, }, + # This constraint is invalid therefore it goes in SECTION_POST_DATA + 'COMMENT ON CONSTRAINT ON test_table_nn' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT nn ON dump_test.test_table_nn IS\E + /xm, + like => { + %full_runs, %dump_test_schema_runs, section_post_data => 1, + }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + + # This constraint is valid therefore it goes in SECTION_PRE_DATA + 'COMMENT ON CONSTRAINT ON test_table_chld2' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT nn ON dump_test.test_table_nn_chld2 IS\E + /xm, + like => { + %full_runs, %dump_test_schema_runs, section_pre_data => 1, + }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + 'CONSTRAINT NOT NULL / NOT VALID (child1)' => { regexp => qr/^ \QCREATE TABLE dump_test.test_table_nn_chld1 (\E\n @@ -1517,6 +1599,7 @@ my %tests = ( test_schema_plus_large_objects => 1, }, unlike => { + binary_upgrade => 1, schema_only => 1, schema_only_with_statistics => 1, no_large_objects => 1, @@ -2289,17 +2372,19 @@ my %tests = ( create_sql => 'CREATE DOMAIN dump_test.us_postal_code AS TEXT COLLATE "C" DEFAULT \'10014\' + CONSTRAINT nn NOT NULL CHECK(VALUE ~ \'^\d{5}$\' OR VALUE ~ \'^\d{5}-\d{4}$\'); + COMMENT ON CONSTRAINT nn + ON DOMAIN dump_test.us_postal_code IS \'not null\'; COMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS \'check it\';', regexp => qr/^ - \QCREATE DOMAIN dump_test.us_postal_code AS text COLLATE pg_catalog."C" DEFAULT '10014'::text\E\n\s+ + \QCREATE DOMAIN dump_test.us_postal_code AS text COLLATE pg_catalog."C" CONSTRAINT nn NOT NULL DEFAULT '10014'::text\E\n\s+ \QCONSTRAINT us_postal_code_check CHECK \E \Q(((VALUE ~ '^\d{5}\E \$\Q'::text) OR (VALUE ~ '^\d{5}-\d{4}\E\$ \Q'::text)));\E(.|\n)* - \QCOMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS 'check it';\E /xm, like => { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, @@ -2309,6 +2394,30 @@ my %tests = ( }, }, + 'COMMENT ON CONSTRAINT ON DOMAIN (1)' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT nn ON 
DOMAIN dump_test.us_postal_code IS 'not null';\E + /xm, + like => + { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + + 'COMMENT ON CONSTRAINT ON DOMAIN (2)' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS 'check it';\E + /xm, + like => + { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + 'CREATE FUNCTION dump_test.pltestlang_call_handler' => { create_order => 17, create_sql => 'CREATE FUNCTION dump_test.pltestlang_call_handler() @@ -4524,9 +4633,9 @@ my %tests = ( no_schema => 1, section_data => 1, test_schema_plus_large_objects => 1, - binary_upgrade => 1, }, unlike => { + binary_upgrade => 1, no_large_objects => 1, no_privs => 1, schema_only => 1, @@ -4834,13 +4943,13 @@ my %tests = ( CREATE TABLE dump_test.has_stats AS SELECT g.g AS x, g.g / 2 AS y FROM generate_series(1,100) AS g(g); CREATE MATERIALIZED VIEW dump_test.has_stats_mv AS SELECT * FROM dump_test.has_stats; - CREATE INDEX dup_test_post_data_ix ON dump_test.has_stats(x, (x - 1)); + CREATE INDEX """dump_test""\'s post-data index" ON dump_test.has_stats(x, (x - 1)); ANALYZE dump_test.has_stats, dump_test.has_stats_mv;', regexp => qr/^ \QSELECT * FROM pg_catalog.pg_restore_relation_stats(\E\s+ 'version',\s'\d+'::integer,\s+ 'schemaname',\s'dump_test',\s+ - 'relname',\s'dup_test_post_data_ix',\s+ + 'relname',\s'"dump_test"''s\ post-data\ index',\s+ 'relpages',\s'\d+'::integer,\s+ 'reltuples',\s'\d+'::real,\s+ 'relallvisible',\s'\d+'::integer,\s+ @@ -4849,7 +4958,7 @@ my %tests = ( \QSELECT * FROM pg_catalog.pg_restore_attribute_stats(\E\s+ 'version',\s'\d+'::integer,\s+ 'schemaname',\s'dump_test',\s+ - 'relname',\s'dup_test_post_data_ix',\s+ + 'relname',\s'"dump_test"''s\ post-data\ index',\s+ 'attnum',\s'2'::smallint,\s+ 'inherited',\s'f'::boolean,\s+ 'null_frac',\s'0'::real,\s+ @@ -5096,6 +5205,17 @@ command_fails_like( 'pg_dump', '--port' => $port, '--strict-names', + '--schema-only', + '--statistics', + ], + qr/\Qpg_dump: error: options -s\/--schema-only and --statistics cannot be used together\E/, + 'cannot use --schema-only and --statistics together'); + +command_fails_like( + [ + 'pg_dump', + '--port' => $port, + '--strict-names', '--table' => 'nonexistent*' ], qr/\Qpg_dump: error: no matching tables were found for pattern\E/, diff --git a/src/bin/pg_dump/t/006_pg_dumpall.pl b/src/bin/pg_dump/t/006_pg_dumpall.pl deleted file mode 100644 index 5acd49f1559..00000000000 --- a/src/bin/pg_dump/t/006_pg_dumpall.pl +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2021-2025, PostgreSQL Global Development Group - -use strict; -use warnings FATAL => 'all'; - -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -my $tempdir = PostgreSQL::Test::Utils::tempdir; -my $run_db = 'postgres'; -my $sep = $windows_os ? "\\" : "/"; - -# Tablespace locations used by "restore_tablespace" test case. -my $tablespace1 = "${tempdir}${sep}tbl1"; -my $tablespace2 = "${tempdir}${sep}tbl2"; -mkdir($tablespace1) || die "mkdir $tablespace1 $!"; -mkdir($tablespace2) || die "mkdir $tablespace2 $!"; - -# Escape tablespace locations on Windows. -$tablespace1 = $windows_os ? ($tablespace1 =~ s/\\/\\\\/gr) : $tablespace1; -$tablespace2 = $windows_os ? ($tablespace2 =~ s/\\/\\\\/gr) : $tablespace2; - -# Where pg_dumpall will be executed. 
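The two s/\\/\\\\/gr substitutions just above double each backslash so that Windows tablespace paths survive interpolation into SQL and regexes. A standalone C sketch of the same transformation, with a hypothetical path as input:

#include <stdio.h>
#include <string.h>

/* double each backslash, as the removed Perl test does with s/\\/\\\\/g */
static void
escape_backslashes(const char *in, char *out, size_t outsz)
{
    size_t  j = 0;

    for (size_t i = 0; in[i] != '\0' && j + 2 < outsz; i++)
    {
        if (in[i] == '\\')
            out[j++] = '\\';    /* emit the doubling backslash */
        out[j++] = in[i];
    }
    out[j] = '\0';
}

int
main(void)
{
    char    buf[256];

    escape_backslashes("C:\\tmp\\tbl1", buf, sizeof(buf));
    printf("%s\n", buf);        /* prints C:\\tmp\\tbl1 */
    return 0;
}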
-my $node = PostgreSQL::Test::Cluster->new('node'); -$node->init; -$node->start; - - -############################################################### -# Definition of the pg_dumpall test cases to run. -# -# Each of these test cases is named, and the names are used for failure -# reporting and for saving the dump and restore information that the test -# asserts on. -# -# The "setup_sql" is a valid psql script containing SQL commands to execute -# before actually running the tests. All setups are executed before any test -# runs. -# -# The "dump_cmd" and "restore_cmd" are the commands that will be executed. The -# "restore_cmd" must have the --file flag to save the restore output so that we -# can assert on it. -# -# The "like" and "unlike" entries are regexps used to match the pg_restore -# output. Each test case must fill in at least one of them, but may have both. -# See the "excluding_databases" test case for an example. -my %pgdumpall_runs = ( - restore_roles => { - setup_sql => ' - CREATE ROLE dumpall WITH ENCRYPTED PASSWORD \'admin\' SUPERUSER; - CREATE ROLE dumpall2 WITH REPLICATION CONNECTION LIMIT 10;', - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/restore_roles", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/restore_roles.sql", - "$tempdir/restore_roles", - ], - like => qr/ - ^\s*\QCREATE ROLE dumpall;\E\s*\n - \s*\QALTER ROLE dumpall WITH SUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS PASSWORD 'SCRAM-SHA-256\E - [^']+';\s*\n - \s*\QCREATE ROLE dumpall2;\E - \s*\QALTER ROLE dumpall2 WITH NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOLOGIN REPLICATION NOBYPASSRLS CONNECTION LIMIT 10;\E - /xm - }, - - restore_tablespace => { - setup_sql => " - CREATE ROLE tap; - CREATE TABLESPACE tbl1 OWNER tap LOCATION '$tablespace1'; - CREATE TABLESPACE tbl2 OWNER tap LOCATION '$tablespace2' WITH (seq_page_cost=1.0);", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/restore_tablespace", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/restore_tablespace.sql", - "$tempdir/restore_tablespace", - ], - # Match "E" as optional since it is added on LOCATION when running on - # Windows. 
- like => qr/^ - \n\QCREATE TABLESPACE tbl1 OWNER tap LOCATION \E(?:E)?\Q'$tablespace1';\E - \n\QCREATE TABLESPACE tbl2 OWNER tap LOCATION \E(?:E)?\Q'$tablespace2';\E - \n\QALTER TABLESPACE tbl2 SET (seq_page_cost=1.0);\E - /xm, - }, - - restore_grants => { - setup_sql => " - CREATE DATABASE tapgrantsdb; - CREATE SCHEMA private; - CREATE SEQUENCE serial START 101; - CREATE FUNCTION fn() RETURNS void AS \$\$ - BEGIN - END; - \$\$ LANGUAGE plpgsql; - CREATE ROLE super; - CREATE ROLE grant1; - CREATE ROLE grant2; - CREATE ROLE grant3; - CREATE ROLE grant4; - CREATE ROLE grant5; - CREATE ROLE grant6; - CREATE ROLE grant7; - CREATE ROLE grant8; - - CREATE TABLE t (id int); - INSERT INTO t VALUES (1), (2), (3), (4); - - GRANT SELECT ON TABLE t TO grant1; - GRANT INSERT ON TABLE t TO grant2; - GRANT ALL PRIVILEGES ON TABLE t to grant3; - GRANT CONNECT, CREATE ON DATABASE tapgrantsdb TO grant4; - GRANT USAGE, CREATE ON SCHEMA private TO grant5; - GRANT USAGE, SELECT, UPDATE ON SEQUENCE serial TO grant6; - GRANT super TO grant7; - GRANT EXECUTE ON FUNCTION fn() TO grant8; - ", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/restore_grants", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/restore_grants.sql", - "$tempdir/restore_grants", - ], - like => qr/^ - \n\QGRANT super TO grant7 WITH INHERIT TRUE GRANTED BY\E - (.*\n)* - \n\QGRANT ALL ON SCHEMA private TO grant5;\E - (.*\n)* - \n\QGRANT ALL ON FUNCTION public.fn() TO grant8;\E - (.*\n)* - \n\QGRANT ALL ON SEQUENCE public.serial TO grant6;\E - (.*\n)* - \n\QGRANT SELECT ON TABLE public.t TO grant1;\E - \n\QGRANT INSERT ON TABLE public.t TO grant2;\E - \n\QGRANT ALL ON TABLE public.t TO grant3;\E - (.*\n)* - \n\QGRANT CREATE,CONNECT ON DATABASE tapgrantsdb TO grant4;\E - /xm, - }, - - excluding_databases => { - setup_sql => 'CREATE DATABASE db1; - \c db1 - CREATE TABLE t1 (id int); - INSERT INTO t1 VALUES (1), (2), (3), (4); - CREATE TABLE t2 (id int); - INSERT INTO t2 VALUES (1), (2), (3), (4); - - CREATE DATABASE db2; - \c db2 - CREATE TABLE t3 (id int); - INSERT INTO t3 VALUES (1), (2), (3), (4); - CREATE TABLE t4 (id int); - INSERT INTO t4 VALUES (1), (2), (3), (4); - - CREATE DATABASE dbex3; - \c dbex3 - CREATE TABLE t5 (id int); - INSERT INTO t5 VALUES (1), (2), (3), (4); - CREATE TABLE t6 (id int); - INSERT INTO t6 VALUES (1), (2), (3), (4); - - CREATE DATABASE dbex4; - \c dbex4 - CREATE TABLE t7 (id int); - INSERT INTO t7 VALUES (1), (2), (3), (4); - CREATE TABLE t8 (id int); - INSERT INTO t8 VALUES (1), (2), (3), (4); - - CREATE DATABASE db5; - \c db5 - CREATE TABLE t9 (id int); - INSERT INTO t9 VALUES (1), (2), (3), (4); - CREATE TABLE t10 (id int); - INSERT INTO t10 VALUES (1), (2), (3), (4); - ', - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/excluding_databases", - '--exclude-database' => 'dbex*', - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/excluding_databases.sql", - '--exclude-database' => 'db5', - "$tempdir/excluding_databases", - ], - like => qr/^ - \n\QCREATE DATABASE db1\E - (.*\n)* - \n\QCREATE TABLE public.t1 (\E - (.*\n)* - \n\QCREATE TABLE public.t2 (\E - (.*\n)* - \n\QCREATE DATABASE db2\E - (.*\n)* - \n\QCREATE TABLE public.t3 (\E - (.*\n)* - \n\QCREATE TABLE public.t4 (/xm, - unlike => qr/^ - \n\QCREATE DATABASE db3\E - (.*\n)* - \n\QCREATE TABLE public.t5 (\E - (.*\n)* - \n\QCREATE TABLE public.t6 (\E - (.*\n)* - \n\QCREATE DATABASE db4\E - 
(.*\n)* - \n\QCREATE TABLE public.t7 (\E - (.*\n)* - \n\QCREATE TABLE public.t8 (\E - \n\QCREATE DATABASE db5\E - (.*\n)* - \n\QCREATE TABLE public.t9 (\E - (.*\n)* - \n\QCREATE TABLE public.t10 (\E - /xm, - }, - - format_directory => { - setup_sql => "CREATE TABLE format_directory(a int, b boolean, c text); - INSERT INTO format_directory VALUES (1, true, 'name1'), (2, false, 'name2');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--file' => "$tempdir/format_directory", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'directory', - '--file' => "$tempdir/format_directory.sql", - "$tempdir/format_directory", - ], - like => qr/^\n\QCOPY public.format_directory (a, b, c) FROM stdin;/xm - }, - - format_tar => { - setup_sql => "CREATE TABLE format_tar(a int, b boolean, c text); - INSERT INTO format_tar VALUES (1, false, 'name3'), (2, true, 'name4');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'tar', - '--file' => "$tempdir/format_tar", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'tar', - '--file' => "$tempdir/format_tar.sql", - "$tempdir/format_tar", - ], - like => qr/^\n\QCOPY public.format_tar (a, b, c) FROM stdin;/xm - }, - - format_custom => { - setup_sql => "CREATE TABLE format_custom(a int, b boolean, c text); - INSERT INTO format_custom VALUES (1, false, 'name5'), (2, true, 'name6');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'custom', - '--file' => "$tempdir/format_custom", - ], - restore_cmd => [ - 'pg_restore', '-C', - '--format' => 'custom', - '--file' => "$tempdir/format_custom.sql", - "$tempdir/format_custom", - ], - like => qr/^ \n\QCOPY public.format_custom (a, b, c) FROM stdin;/xm - }, - - dump_globals_only => { - setup_sql => "CREATE TABLE format_dir(a int, b boolean, c text); - INSERT INTO format_dir VALUES (1, false, 'name5'), (2, true, 'name6');", - dump_cmd => [ - 'pg_dumpall', - '--format' => 'directory', - '--globals-only', - '--file' => "$tempdir/dump_globals_only", - ], - restore_cmd => [ - 'pg_restore', '-C', '--globals-only', - '--format' => 'directory', - '--file' => "$tempdir/dump_globals_only.sql", - "$tempdir/dump_globals_only", - ], - like => qr/ - ^\s*\QCREATE ROLE dumpall;\E\s*\n - /xm - }, ); - -# First execute the setup_sql -foreach my $run (sort keys %pgdumpall_runs) -{ - if ($pgdumpall_runs{$run}->{setup_sql}) - { - $node->safe_psql($run_db, $pgdumpall_runs{$run}->{setup_sql}); - } -} - -# Execute the tests -foreach my $run (sort keys %pgdumpall_runs) -{ - # Create a new target cluster for each test case's pg_restore run so that - # we don't need to clean up the target cluster after each run. - my $target_node = PostgreSQL::Test::Cluster->new("target_$run"); - $target_node->init; - $target_node->start; - - # Run pg_dumpall on the source cluster. - $node->command_ok(\@{ $pgdumpall_runs{$run}->{dump_cmd} }, - "$run: pg_dumpall runs"); - - # Restore the dump to the "target_node" cluster. - my @restore_cmd = ( - @{ $pgdumpall_runs{$run}->{restore_cmd} }, - '--host', $target_node->host, '--port', $target_node->port); - - my ($stdout, $stderr) = run_command(\@restore_cmd); - - # Read the pg_restore --file output file. 
- my $output_file = slurp_file("$tempdir/${run}.sql"); - - if (!($pgdumpall_runs{$run}->{like}) && !($pgdumpall_runs{$run}->{unlike})) - { - die "missing \"like\" or \"unlike\" in test \"$run\""; - } - - if ($pgdumpall_runs{$run}->{like}) - { - like($output_file, $pgdumpall_runs{$run}->{like}, "should dump $run"); - } - - if ($pgdumpall_runs{$run}->{unlike}) - { - unlike( - $output_file, - $pgdumpall_runs{$run}->{unlike}, - "should not dump $run"); - } -} - -# Some negative test cases with a pg_dumpall dump restored using pg_restore. -# test case 1: when -C is not used in pg_restore with a dump of pg_dumpall -$node->command_fails_like( - [ 'pg_restore', - "$tempdir/format_custom", - '--format' => 'custom', - '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: -C\/--create option should be specified when restoring an archive created by pg_dumpall\E/, - 'When -C is not used in pg_restore with dump of pg_dumpall'); - -# test case 2: when the --list option is used with a dump of pg_dumpall -$node->command_fails_like( - [ 'pg_restore', - "$tempdir/format_custom", '-C', - '--format' => 'custom', '--list', - '--file' => "$tempdir/error_test.sql", ], - qr/\Qpg_restore: error: option -l\/--list cannot be used when restoring an archive created by pg_dumpall\E/, - 'When --list is used in pg_restore with dump of pg_dumpall'); - -# test case 3: when a nonexistent database is given with the -d option -$node->command_fails_like( - [ 'pg_restore', - "$tempdir/format_custom", '-C', - '--format' => 'custom', - '-d' => 'dbpq', ], - qr/\Qpg_restore: error: could not connect to database "dbpq"\E/, - 'When non-existent database is given with -d option in pg_restore with dump of pg_dumpall'); - -$node->stop('fast'); - -done_testing(); diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c index 56c2ad55d4a..e80edb7077e 100644 --- a/src/bin/pg_rewind/libpq_source.c +++ b/src/bin/pg_rewind/libpq_source.c @@ -215,7 +215,7 @@ libpq_get_current_wal_insert_lsn(rewind_source *source) val = run_simple_query(conn, "SELECT pg_current_wal_insert_lsn()"); - if (sscanf(val, "%X/%X", &hi, &lo) != 2) + if (sscanf(val, "%X/%08X", &hi, &lo) != 2) pg_fatal("unrecognized result \"%s\" for current WAL insert location", val); result = ((uint64) hi) << 32 | lo; diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 2cd44625ca3..8f4b282c6b1 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -89,11 +89,11 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr errptr = xlogreader->EndRecPtr; if (errormsg) - pg_fatal("could not read WAL record at %X/%X: %s", + pg_fatal("could not read WAL record at %X/%08X: %s", LSN_FORMAT_ARGS(errptr), errormsg); else - pg_fatal("could not read WAL record at %X/%X", + pg_fatal("could not read WAL record at %X/%08X", LSN_FORMAT_ARGS(errptr)); } @@ -105,7 +105,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, * messed up. 
*/ if (xlogreader->EndRecPtr != endpoint) - pg_fatal("end pointer %X/%X is not a valid end point; expected %X/%X", + pg_fatal("end pointer %X/%08X is not a valid end point; expected %X/%08X", LSN_FORMAT_ARGS(endpoint), LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); XLogReaderFree(xlogreader); @@ -143,10 +143,10 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, if (record == NULL) { if (errormsg) - pg_fatal("could not read WAL record at %X/%X: %s", + pg_fatal("could not read WAL record at %X/%08X: %s", LSN_FORMAT_ARGS(ptr), errormsg); else - pg_fatal("could not read WAL record at %X/%X", + pg_fatal("could not read WAL record at %X/%08X", LSN_FORMAT_ARGS(ptr)); } endptr = xlogreader->EndRecPtr; @@ -211,11 +211,11 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, if (record == NULL) { if (errormsg) - pg_fatal("could not find previous WAL record at %X/%X: %s", + pg_fatal("could not find previous WAL record at %X/%08X: %s", LSN_FORMAT_ARGS(searchptr), errormsg); else - pg_fatal("could not find previous WAL record at %X/%X", + pg_fatal("could not find previous WAL record at %X/%08X", LSN_FORMAT_ARGS(searchptr)); } @@ -458,8 +458,8 @@ extractPageInfo(XLogReaderState *record) * we don't recognize the type. That's bad - we don't know how to * track that change. */ - pg_fatal("WAL record modifies a relation, but record type is not recognized: " - "lsn: %X/%X, rmid: %d, rmgr: %s, info: %02X", + pg_fatal("WAL record modifies a relation, but record type is not recognized:\n" + "lsn: %X/%08X, rmid: %d, rmgr: %s, info: %02X", LSN_FORMAT_ARGS(record->ReadRecPtr), rmid, RmgrName(rmid), info); } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 9d16c1e6b47..0c68dd4235e 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -393,7 +393,7 @@ main(int argc, char **argv) targetHistory, targetNentries, &divergerec, &lastcommontliIndex); - pg_log_info("servers diverged at WAL location %X/%X on timeline %u", + pg_log_info("servers diverged at WAL location %X/%08X on timeline %u", LSN_FORMAT_ARGS(divergerec), targetHistory[lastcommontliIndex].tli); @@ -461,7 +461,7 @@ main(int argc, char **argv) findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex, &chkptrec, &chkpttli, &chkptredo, restore_command); - pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u", + pg_log_info("rewinding from last common checkpoint at %X/%08X on timeline %u", LSN_FORMAT_ARGS(chkptrec), chkpttli); /* Initialize the hash table to track the status of each file */ @@ -902,7 +902,7 @@ getTimelineHistory(TimeLineID tli, bool is_source, int *nentries) TimeLineHistoryEntry *entry; entry = &history[i]; - pg_log_debug("%u: %X/%X - %X/%X", entry->tli, + pg_log_debug("%u: %X/%08X - %X/%08X", entry->tli, LSN_FORMAT_ARGS(entry->begin), LSN_FORMAT_ARGS(entry->end)); } @@ -981,8 +981,8 @@ createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpo strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp); len = snprintf(buf, sizeof(buf), - "START WAL LOCATION: %X/%X (file %s)\n" - "CHECKPOINT LOCATION: %X/%X\n" + "START WAL LOCATION: %X/%08X (file %s)\n" + "CHECKPOINT LOCATION: %X/%08X\n" "BACKUP METHOD: pg_rewind\n" "BACKUP FROM: standby\n" "START TIME: %s\n", diff --git a/src/bin/pg_rewind/t/RewindTest.pm b/src/bin/pg_rewind/t/RewindTest.pm index 3efab831797..b0234ebfaf2 100644 --- a/src/bin/pg_rewind/t/RewindTest.pm +++ b/src/bin/pg_rewind/t/RewindTest.pm @@ -285,7 +285,7 @@ sub run_pg_rewind # 
Check that pg_rewind with dbname and --write-recovery-conf # wrote the dbname in the generated primary_conninfo value. like(slurp_file("$primary_pgdata/postgresql.auto.conf"), - qr/dbname=postgres/m, 'recovery conf file sets dbname'); + qr/dbname=postgres/m, 'recovery conf file sets dbname'); # Check that standby.signal is here as recovery configuration # was requested. diff --git a/src/bin/pg_rewind/timeline.c b/src/bin/pg_rewind/timeline.c index 4d9f0d8301b..6784969951f 100644 --- a/src/bin/pg_rewind/timeline.c +++ b/src/bin/pg_rewind/timeline.c @@ -66,7 +66,7 @@ rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries) if (*ptr == '\0' || *ptr == '#') continue; - nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo); + nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo); if (nfields < 1) { diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index ce7aad4b25a..a5621251afc 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -9,19 +9,30 @@ #include <limits.h> #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "portability/instr_time.h" static const char *progname; static unsigned int test_duration = 3; +static double max_rprct = 99.99; + +/* record duration in powers of 2 nanoseconds */ +static long long int histogram[32]; + +/* record counts of first 10K durations directly */ +#define NUM_DIRECT 10000 +static long long int direct_histogram[NUM_DIRECT]; + +/* separately record highest observed duration */ +static int32 largest_diff; +static long long int largest_diff_count; + static void handle_args(int argc, char *argv[]); static uint64 test_timing(unsigned int duration); static void output(uint64 loop_count); -/* record duration in powers of 2 microseconds */ -static long long int histogram[32]; - int main(int argc, char *argv[]) { @@ -44,6 +55,7 @@ handle_args(int argc, char *argv[]) { static struct option long_options[] = { {"duration", required_argument, NULL, 'd'}, + {"cutoff", required_argument, NULL, 'c'}, {NULL, 0, NULL, 0} }; @@ -56,7 +68,7 @@ handle_args(int argc, char *argv[]) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { - printf(_("Usage: %s [-d DURATION]\n"), progname); + printf(_("Usage: %s [-d DURATION] [-c CUTOFF]\n"), progname); exit(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) @@ -66,7 +78,7 @@ handle_args(int argc, char *argv[]) } } - while ((option = getopt_long(argc, argv, "d:", + while ((option = getopt_long(argc, argv, "d:c:", long_options, &optindex)) != -1) { switch (option) @@ -93,6 +105,26 @@ handle_args(int argc, char *argv[]) } break; + case 'c': + errno = 0; + max_rprct = strtod(optarg, &endptr); + + if (endptr == optarg || *endptr != '\0' || errno != 0) + { + fprintf(stderr, _("%s: invalid argument for option %s\n"), + progname, "--cutoff"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + + if (max_rprct < 0 || max_rprct > 100) + { + fprintf(stderr, _("%s: %s must be in range %u..%u\n"), + progname, "--cutoff", 0, 100); + exit(1); + } + break; + default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); @@ -111,7 +143,6 @@ handle_args(int argc, char *argv[]) exit(1); } - printf(ngettext("Testing timing overhead for %u second.\n", "Testing timing overhead for %u seconds.\n", test_duration), @@ -130,23 +161,33 @@ test_timing(unsigned int duration) end_time, 
temp; - total_time = duration > 0 ? duration * INT64CONST(1000000) : 0; + /* + * Pre-zero the statistics data structures. They're already zero by + * default, but this helps bring them into processor cache and avoid + * possible timing glitches due to COW behavior. + */ + memset(direct_histogram, 0, sizeof(direct_histogram)); + memset(histogram, 0, sizeof(histogram)); + largest_diff = 0; + largest_diff_count = 0; + + total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0; INSTR_TIME_SET_CURRENT(start_time); - cur = INSTR_TIME_GET_MICROSEC(start_time); + cur = INSTR_TIME_GET_NANOSEC(start_time); while (time_elapsed < total_time) { int32 diff, - bits = 0; + bits; prev = cur; INSTR_TIME_SET_CURRENT(temp); - cur = INSTR_TIME_GET_MICROSEC(temp); + cur = INSTR_TIME_GET_NANOSEC(temp); diff = cur - prev; /* Did time go backwards? */ - if (diff < 0) + if (unlikely(diff < 0)) { fprintf(stderr, _("Detected clock going backwards in time.\n")); fprintf(stderr, _("Time warp: %d ns\n"), diff); @@ -154,25 +195,37 @@ test_timing(unsigned int duration) } /* What is the highest bit in the time diff? */ - while (diff) - { - diff >>= 1; - bits++; - } + if (diff > 0) + bits = pg_leftmost_one_pos32(diff) + 1; + else + bits = 0; /* Update appropriate duration bucket */ histogram[bits]++; + /* Update direct histogram of time diffs */ + if (diff < NUM_DIRECT) + direct_histogram[diff]++; + + /* Also track the largest observed duration, even if >= NUM_DIRECT */ + if (diff > largest_diff) + { + largest_diff = diff; + largest_diff_count = 1; + } + else if (diff == largest_diff) + largest_diff_count++; + loop_count++; INSTR_TIME_SUBTRACT(temp, start_time); - time_elapsed = INSTR_TIME_GET_MICROSEC(temp); + time_elapsed = INSTR_TIME_GET_NANOSEC(temp); } INSTR_TIME_SET_CURRENT(end_time); INSTR_TIME_SUBTRACT(end_time, start_time); - printf(_("Per loop time including overhead: %0.2f ns\n"), + printf(_("Average loop time including overhead: %0.2f ns\n"), INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count); return loop_count; @@ -181,28 +234,95 @@ test_timing(unsigned int duration) static void output(uint64 loop_count) { - int64 max_bit = 31, - i; - char *header1 = _("< us"); - char *header2 = /* xgettext:no-c-format */ _("% of total"); - char *header3 = _("count"); + int max_bit = 31; + const char *header1 = _("<= ns"); + const char *header1b = _("ns"); + const char *header2 = /* xgettext:no-c-format */ _("% of total"); + const char *header3 = /* xgettext:no-c-format */ _("running %"); + const char *header4 = _("count"); int len1 = strlen(header1); int len2 = strlen(header2); int len3 = strlen(header3); + int len4 = strlen(header4); + double rprct; + bool stopped = false; /* find highest bit value */ while (max_bit > 0 && histogram[max_bit] == 0) max_bit--; + /* set minimum column widths */ + len1 = Max(8, len1); + len2 = Max(10, len2); + len3 = Max(10, len3); + len4 = Max(10, len4); + printf(_("Histogram of timing durations:\n")); - printf("%*s %*s %*s\n", - Max(6, len1), header1, - Max(10, len2), header2, - Max(10, len3), header3); - - for (i = 0; i <= max_bit; i++) - printf("%*ld %*.5f %*lld\n", - Max(6, len1), 1l << i, - Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count, - Max(10, len3), histogram[i]); + printf("%*s %*s %*s %*s\n", + len1, header1, + len2, header2, + len3, header3, + len4, header4); + + rprct = 0; + for (int i = 0; i <= max_bit; i++) + { + double prct = (double) histogram[i] * 100 / loop_count; + + rprct += prct; + printf("%*ld %*.4f %*.4f %*lld\n", + len1, (1L << i) - 1, + len2, 
prct, + len3, rprct, + len4, histogram[i]); + } + + printf(_("\nObserved timing durations up to %.4f%%:\n"), max_rprct); + printf("%*s %*s %*s %*s\n", + len1, header1b, + len2, header2, + len3, header3, + len4, header4); + + rprct = 0; + for (int i = 0; i < NUM_DIRECT; i++) + { + if (direct_histogram[i]) + { + double prct = (double) direct_histogram[i] * 100 / loop_count; + bool print_it = !stopped; + + rprct += prct; + + /* if largest diff is < NUM_DIRECT, be sure we print it */ + if (i == largest_diff) + { + if (stopped) + printf("...\n"); + print_it = true; + } + + if (print_it) + printf("%*d %*.4f %*.4f %*lld\n", + len1, i, + len2, prct, + len3, rprct, + len4, direct_histogram[i]); + if (rprct >= max_rprct) + stopped = true; + } + } + + /* print largest diff when it's outside the array range */ + if (largest_diff >= NUM_DIRECT) + { + double prct = (double) largest_diff_count * 100 / loop_count; + + printf("...\n"); + printf("%*d %*.4f %*.4f %*lld\n", + len1, largest_diff, + len2, prct, + len3, 100.0, + len4, largest_diff_count); + } } diff --git a/src/bin/pg_test_timing/t/001_basic.pl b/src/bin/pg_test_timing/t/001_basic.pl index 6554cd981af..9912acc052a 100644 --- a/src/bin/pg_test_timing/t/001_basic.pl +++ b/src/bin/pg_test_timing/t/001_basic.pl @@ -25,5 +25,22 @@ command_fails_like( [ 'pg_test_timing', '--duration' => '0' ], qr/\Qpg_test_timing: --duration must be in range 1..4294967295\E/, 'pg_test_timing: --duration must be in range'); +command_fails_like( + [ 'pg_test_timing', '--cutoff' => '101' ], + qr/\Qpg_test_timing: --cutoff must be in range 0..100\E/, + 'pg_test_timing: --cutoff must be in range'); + +######################################### +# We obviously can't check for specific output, but we can +# do a simple run and make sure it produces something. 
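To make the pg_test_timing histogram above concrete: each loop's nanosecond diff lands in bucket pg_leftmost_one_pos32(diff) + 1 (bucket 0 for a zero diff), and each report row is labeled with the bucket's inclusive upper bound, (1L << i) - 1. Below is a standalone sketch of that bucketing, with assumed sample values; the bit loop is the portable equivalent that the patch replaced with pg_leftmost_one_pos32():

#include <stdio.h>

/* equivalent of the old "while (diff) { diff >>= 1; bits++; }" loop */
static int
duration_bucket(unsigned int diff)
{
    int     bits = 0;

    while (diff)
    {
        diff >>= 1;
        bits++;
    }
    return bits;
}

int
main(void)
{
    long long    histogram[32] = {0};
    unsigned int samples[] = {0, 1, 17, 40, 1023};  /* assumed ns diffs */

    for (int i = 0; i < 5; i++)
        histogram[duration_bucket(samples[i])]++;

    /* row label is the bucket's inclusive upper bound, as in the patch */
    for (int i = 0; i <= 10; i++)
        if (histogram[i])
            printf("<= %ld ns: %lld\n", (1L << i) - 1, histogram[i]);
    return 0;
}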
+command_like( + [ 'pg_test_timing', '--duration' => '1' ], + qr/ +\QTesting timing overhead for 1 second.\E.* +\QHistogram of timing durations:\E.* +\QObserved timing durations up to 99.9900%:\E +/sx, + 'pg_test_timing: sanity check'); done_testing(); diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 940fc77fc2e..67eedbae265 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -23,11 +23,12 @@ static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster); static void check_for_user_defined_postfix_ops(ClusterInfo *cluster); static void check_for_incompatible_polymorphics(ClusterInfo *cluster); static void check_for_tables_with_oids(ClusterInfo *cluster); +static void check_for_not_null_inheritance(ClusterInfo *cluster); static void check_for_pg_role_prefix(ClusterInfo *cluster); static void check_for_new_tablespace_dir(void); static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); static void check_for_unicode_update(ClusterInfo *cluster); -static void check_new_cluster_logical_replication_slots(void); +static void check_new_cluster_replication_slots(void); static void check_new_cluster_subscription_configuration(void); static void check_old_cluster_for_valid_slots(void); static void check_old_cluster_subscription_state(void); @@ -168,6 +169,7 @@ static DataTypesUsageChecks data_types_usage_checks[] = /* pg_class.oid is preserved, so 'regclass' is OK */ " 'regcollation', " " 'regconfig', " + /* pg_database.oid is preserved, so 'regdatabase' is OK */ " 'regdictionary', " " 'regnamespace', " " 'regoper', " @@ -419,7 +421,7 @@ process_data_type_check(DbInfo *dbinfo, PGresult *res, void *arg) if (!state->result) { pg_log(PG_REPORT, "failed check: %s", _(state->check->status)); - appendPQExpBuffer(*state->report, "\n%s\n%s %s\n", + appendPQExpBuffer(*state->report, "\n%s\n%s\n %s\n", _(state->check->report_text), _("A list of the problem columns is in the file:"), output_path); @@ -629,7 +631,7 @@ check_and_dump_old_cluster(void) * Before that the logical slots are not upgraded, so we will not be * able to upgrade the logical replication clusters completely. */ - get_subscription_count(&old_cluster); + get_subscription_info(&old_cluster); check_old_cluster_subscription_state(); } @@ -672,6 +674,14 @@ check_and_dump_old_cluster(void) check_for_tables_with_oids(&old_cluster); /* + * Pre-PG 18 allowed child tables to omit not-null constraints that their + * parents' columns have, but schema restore fails for them. Verify there + * are none, if applicable. 
+ */ + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1800) + check_for_not_null_inheritance(&old_cluster); + + /* * Pre-PG 10 allowed tables with 'unknown' type columns and non WAL logged * hash indexes */ @@ -754,7 +764,7 @@ check_new_cluster(void) check_for_new_tablespace_dir(); - check_new_cluster_logical_replication_slots(); + check_new_cluster_replication_slots(); check_new_cluster_subscription_configuration(); } @@ -885,7 +895,7 @@ check_cluster_versions(void) */ if (GET_MAJOR_VERSION(old_cluster.major_version) >= 1800 && user_opts.char_signedness != -1) - pg_fatal("%s option cannot be used to upgrade from PostgreSQL %s and later.", + pg_fatal("The option %s cannot be used for upgrades from PostgreSQL %s and later.", "--set-char-signedness", "18"); check_ok(); @@ -946,12 +956,12 @@ check_for_new_tablespace_dir(void) prep_status("Checking for new cluster tablespace directories"); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < new_cluster.num_tablespaces; tblnum++) { struct stat statbuf; snprintf(new_tablespace_dir, MAXPGPATH, "%s%s", - os_info.old_tablespaces[tblnum], + new_cluster.tablespaces[tblnum], new_cluster.tablespace_suffix); if (stat(new_tablespace_dir, &statbuf) == 0 || errno != ENOENT) @@ -1003,17 +1013,17 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name) * directory. We can't create a proper old cluster delete script in that * case. */ - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < new_cluster.num_tablespaces; tblnum++) { - char old_tablespace_dir[MAXPGPATH]; + char new_tablespace_dir[MAXPGPATH]; - strlcpy(old_tablespace_dir, os_info.old_tablespaces[tblnum], MAXPGPATH); - canonicalize_path(old_tablespace_dir); - if (path_is_prefix_of_path(old_cluster_pgdata, old_tablespace_dir)) + strlcpy(new_tablespace_dir, new_cluster.tablespaces[tblnum], MAXPGPATH); + canonicalize_path(new_tablespace_dir); + if (path_is_prefix_of_path(old_cluster_pgdata, new_tablespace_dir)) { /* reproduce warning from CREATE TABLESPACE that is in the log */ pg_log(PG_WARNING, - "\nWARNING: user-defined tablespace locations should not be inside the data directory, i.e. %s", old_tablespace_dir); + "\nWARNING: user-defined tablespace locations should not be inside the data directory, i.e. %s", new_tablespace_dir); /* Unlink file in case it is left over from a previous run. */ unlink(*deletion_script_file_name); @@ -1041,9 +1051,9 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name) /* delete old cluster's alternate tablespaces */ old_tblspc_suffix = pg_strdup(old_cluster.tablespace_suffix); fix_path_separator(old_tblspc_suffix); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) fprintf(script, RMDIR_CMD " %c%s%s%c\n", PATH_QUOTE, - fix_path_separator(os_info.old_tablespaces[tblnum]), + fix_path_separator(old_cluster.tablespaces[tblnum]), old_tblspc_suffix, PATH_QUOTE); pfree(old_tblspc_suffix); @@ -1623,6 +1633,93 @@ check_for_tables_with_oids(ClusterInfo *cluster) check_ok(); } +/* + * Callback function for processing results of query for + * check_for_not_null_inheritance. 
+ */ +static void +process_inconsistent_notnull(DbInfo *dbinfo, PGresult *res, void *arg) +{ + UpgradeTaskReport *report = (UpgradeTaskReport *) arg; + int ntups = PQntuples(res); + int i_nspname = PQfnumber(res, "nspname"); + int i_relname = PQfnumber(res, "relname"); + int i_attname = PQfnumber(res, "attname"); + + AssertVariableIsOfType(&process_inconsistent_notnull, + UpgradeTaskProcessCB); + + if (ntups == 0) + return; + + if (report->file == NULL && + (report->file = fopen_priv(report->path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %m", report->path); + + fprintf(report->file, "In database: %s\n", dbinfo->db_name); + + for (int rowno = 0; rowno < ntups; rowno++) + { + fprintf(report->file, " %s.%s.%s\n", + PQgetvalue(res, rowno, i_nspname), + PQgetvalue(res, rowno, i_relname), + PQgetvalue(res, rowno, i_attname)); + } +} + +/* + * check_for_not_null_inheritance() + * + * An attempt to create child tables lacking not-null constraints that are + * present in their parents errors out. This can no longer occur since 18, + * but previously there were various ways for that to happen. Check that + * the cluster to be upgraded doesn't have any of those problems. + */ +static void +check_for_not_null_inheritance(ClusterInfo *cluster) +{ + UpgradeTaskReport report; + UpgradeTask *task; + const char *query; + + prep_status("Checking for not-null constraint inconsistencies"); + + report.file = NULL; + snprintf(report.path, sizeof(report.path), "%s/%s", + log_opts.basedir, + "not_null_inconsistent_columns.txt"); + + query = "SELECT nspname, cc.relname, ac.attname " + "FROM pg_catalog.pg_inherits i, pg_catalog.pg_attribute ac, " + " pg_catalog.pg_attribute ap, pg_catalog.pg_class cc, " + " pg_catalog.pg_namespace nc " + "WHERE cc.oid = ac.attrelid AND i.inhrelid = ac.attrelid " + " AND i.inhparent = ap.attrelid AND ac.attname = ap.attname " + " AND cc.relnamespace = nc.oid " + " AND ap.attnum > 0 and ap.attnotnull AND NOT ac.attnotnull"; + + task = upgrade_task_create(); + upgrade_task_add_step(task, query, + process_inconsistent_notnull, + true, &report); + upgrade_task_run(task, cluster); + upgrade_task_free(task); + + if (report.file) + { + fclose(report.file); + pg_log(PG_REPORT, "fatal"); + pg_fatal("Your installation contains inconsistent NOT NULL constraints.\n" + "If the parent column(s) are NOT NULL, then the child column must\n" + "also be marked NOT NULL, or the upgrade will fail.\n" + "You can fix this by running\n" + " ALTER TABLE tablename ALTER column SET NOT NULL;\n" + "on each column listed in the file:\n" + " %s", report.path); + } + else + check_ok(); +} /* * check_for_pg_role_prefix() @@ -1934,7 +2031,7 @@ check_for_unicode_update(ClusterInfo *cluster) { fclose(report.file); report_status(PG_WARNING, "warning"); - pg_log(PG_WARNING, "Your installation contains relations that may be affected by a new version of Unicode.\n" + pg_log(PG_WARNING, "Your installation contains relations that might be affected by a new version of Unicode.\n" "A list of potentially-affected relations is in the file:\n" " %s", report.path); } @@ -1943,48 +2040,80 @@ check_for_unicode_update(ClusterInfo *cluster) } /* - * check_new_cluster_logical_replication_slots() + * check_new_cluster_replication_slots() * - * Verify that there are no logical replication slots on the new cluster and - * that the parameter settings necessary for creating slots are sufficient. 
+ * Validate the new cluster's readiness for migrating replication slots: + * - Ensure there are no existing logical replication slots on the new + * cluster when migrating logical slots. + * - Ensure the conflict detection slot does not exist on the new cluster + * when migrating subscriptions with retain_dead_tuples enabled. + * - Ensure that the parameter settings on the new cluster necessary for + * creating slots are sufficient. */ static void -check_new_cluster_logical_replication_slots(void) +check_new_cluster_replication_slots(void) { PGresult *res; PGconn *conn; int nslots_on_old; int nslots_on_new; + int rdt_slot_on_new; int max_replication_slots; char *wal_level; + int i_nslots_on_new; + int i_rdt_slot_on_new; - /* Logical slots can be migrated since PG17. */ + /* + * Logical slots can be migrated since PG17 and the physical slot + * CONFLICT_DETECTION_SLOT can be migrated since PG19. + */ if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1600) return; nslots_on_old = count_old_cluster_logical_slots(); - /* Quick return if there are no logical slots to be migrated. */ - if (nslots_on_old == 0) + /* + * Quick return if there are no slots to be migrated and no subscriptions + * have the retain_dead_tuples option enabled. + */ + if (nslots_on_old == 0 && !old_cluster.sub_retain_dead_tuples) return; conn = connectToServer(&new_cluster, "template1"); - prep_status("Checking for new cluster logical replication slots"); + prep_status("Checking for new cluster replication slots"); - res = executeQueryOrDie(conn, "SELECT count(*) " - "FROM pg_catalog.pg_replication_slots " - "WHERE slot_type = 'logical' AND " - "temporary IS FALSE;"); + res = executeQueryOrDie(conn, "SELECT %s AS nslots_on_new, %s AS rdt_slot_on_new " + "FROM pg_catalog.pg_replication_slots", + nslots_on_old > 0 + ? "COUNT(*) FILTER (WHERE slot_type = 'logical' AND temporary IS FALSE)" + : "0", + old_cluster.sub_retain_dead_tuples + ? 
"COUNT(*) FILTER (WHERE slot_name = 'pg_conflict_detection')" + : "0"); if (PQntuples(res) != 1) - pg_fatal("could not count the number of logical replication slots"); + pg_fatal("could not count the number of replication slots"); - nslots_on_new = atoi(PQgetvalue(res, 0, 0)); + i_nslots_on_new = PQfnumber(res, "nslots_on_new"); + i_rdt_slot_on_new = PQfnumber(res, "rdt_slot_on_new"); + + nslots_on_new = atoi(PQgetvalue(res, 0, i_nslots_on_new)); if (nslots_on_new) + { + Assert(nslots_on_old); pg_fatal("expected 0 logical replication slots but found %d", nslots_on_new); + } + + rdt_slot_on_new = atoi(PQgetvalue(res, 0, i_rdt_slot_on_new)); + + if (rdt_slot_on_new) + { + Assert(old_cluster.sub_retain_dead_tuples); + pg_fatal("The replication slot \"pg_conflict_detection\" already exists on the new cluster"); + } PQclear(res); @@ -1997,12 +2126,24 @@ check_new_cluster_logical_replication_slots(void) wal_level = PQgetvalue(res, 0, 0); - if (strcmp(wal_level, "logical") != 0) + if (nslots_on_old > 0 && strcmp(wal_level, "logical") != 0) pg_fatal("\"wal_level\" must be \"logical\" but is set to \"%s\"", wal_level); + if (old_cluster.sub_retain_dead_tuples && + strcmp(wal_level, "minimal") == 0) + pg_fatal("\"wal_level\" must be \"replica\" or \"logical\" but is set to \"%s\"", + wal_level); + max_replication_slots = atoi(PQgetvalue(res, 1, 0)); + if (old_cluster.sub_retain_dead_tuples && + nslots_on_old + 1 > max_replication_slots) + pg_fatal("\"max_replication_slots\" (%d) must be greater than or equal to the number of " + "logical replication slots on the old cluster plus one additional slot required " + "for retaining conflict detection information (%d)", + max_replication_slots, nslots_on_old + 1); + if (nslots_on_old > max_replication_slots) pg_fatal("\"max_replication_slots\" (%d) must be greater than or equal to the number of " "logical replication slots (%d) on the old cluster", @@ -2114,6 +2255,22 @@ check_old_cluster_for_valid_slots(void) "The slot \"%s\" has not consumed the WAL yet\n", slot->slotname); } + + /* + * The name "pg_conflict_detection" (defined as + * CONFLICT_DETECTION_SLOT) has been reserved for logical + * replication conflict detection slot since PG19. + */ + if (strcmp(slot->slotname, "pg_conflict_detection") == 0) + { + if (script == NULL && + (script = fopen_priv(output_path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %m", output_path); + + fprintf(script, + "The slot name \"%s\" is reserved\n", + slot->slotname); + } } } diff --git a/src/bin/pg_upgrade/dump.c b/src/bin/pg_upgrade/dump.c index 23cb08e8347..55f6e7b4d9c 100644 --- a/src/bin/pg_upgrade/dump.c +++ b/src/bin/pg_upgrade/dump.c @@ -58,7 +58,7 @@ generate_old_dump(void) (user_opts.transfer_mode == TRANSFER_MODE_SWAP) ? "" : "--sequence-data", log_opts.verbose ? "--verbose" : "", - user_opts.do_statistics ? "" : "--no-statistics", + user_opts.do_statistics ? 
"--statistics" : "--no-statistics", log_opts.dumpdir, sql_file_name, escaped_connstr.data); diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index 4b7a56f5b3b..c39eb077c2f 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -443,10 +443,26 @@ get_db_infos(ClusterInfo *cluster) for (tupnum = 0; tupnum < ntups; tupnum++) { + char *spcloc = PQgetvalue(res, tupnum, i_spclocation); + bool inplace = spcloc[0] && !is_absolute_path(spcloc); + dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid)); dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname)); - snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s", - PQgetvalue(res, tupnum, i_spclocation)); + + /* + * The tablespace location might be "", meaning the cluster default + * location, i.e. pg_default or pg_global. For in-place tablespaces, + * pg_tablespace_location() returns a path relative to the data + * directory. + */ + if (inplace) + snprintf(dbinfos[tupnum].db_tablespace, + sizeof(dbinfos[tupnum].db_tablespace), + "%s/%s", cluster->pgdata, spcloc); + else + snprintf(dbinfos[tupnum].db_tablespace, + sizeof(dbinfos[tupnum].db_tablespace), + "%s", spcloc); } PQclear(res); @@ -616,11 +632,21 @@ process_rel_infos(DbInfo *dbinfo, PGresult *res, void *arg) /* Is the tablespace oid non-default? */ if (atooid(PQgetvalue(res, relnum, i_reltablespace)) != 0) { + char *spcloc = PQgetvalue(res, relnum, i_spclocation); + bool inplace = spcloc[0] && !is_absolute_path(spcloc); + /* * The tablespace location might be "", meaning the cluster - * default location, i.e. pg_default or pg_global. + * default location, i.e. pg_default or pg_global. For in-place + * tablespaces, pg_tablespace_location() returns a path relative + * to the data directory. */ - tablespace = PQgetvalue(res, relnum, i_spclocation); + if (inplace) + tablespace = psprintf("%s/%s", + os_info.running_cluster->pgdata, + spcloc); + else + tablespace = spcloc; /* Can we reuse the previous string allocation? */ if (last_tablespace && strcmp(tablespace, last_tablespace) == 0) @@ -630,6 +656,10 @@ process_rel_infos(DbInfo *dbinfo, PGresult *res, void *arg) last_tablespace = curr->tablespace = pg_strdup(tablespace); curr->tblsp_alloc = true; } + + /* Free palloc'd string for in-place tablespaces. */ + if (inplace) + pfree(tablespace); } else /* A zero reltablespace oid indicates the database tablespace. */ @@ -752,20 +782,33 @@ count_old_cluster_logical_slots(void) } /* - * get_subscription_count() + * get_subscription_info() * - * Gets the number of subscriptions in the cluster. + * Gets the information of subscriptions in the cluster. 
*/ void -get_subscription_count(ClusterInfo *cluster) +get_subscription_info(ClusterInfo *cluster) { PGconn *conn; PGresult *res; + int i_nsub; + int i_retain_dead_tuples; conn = connectToServer(cluster, "template1"); - res = executeQueryOrDie(conn, "SELECT count(*) " - "FROM pg_catalog.pg_subscription"); - cluster->nsubs = atoi(PQgetvalue(res, 0, 0)); + if (GET_MAJOR_VERSION(cluster->major_version) >= 1900) + res = executeQueryOrDie(conn, "SELECT count(*) AS nsub," + "COUNT(CASE WHEN subretaindeadtuples THEN 1 END) > 0 AS retain_dead_tuples " + "FROM pg_catalog.pg_subscription"); + else + res = executeQueryOrDie(conn, "SELECT count(*) AS nsub," + "'f' AS retain_dead_tuples " + "FROM pg_catalog.pg_subscription"); + + i_nsub = PQfnumber(res, "nsub"); + i_retain_dead_tuples = PQfnumber(res, "retain_dead_tuples"); + + cluster->nsubs = atoi(PQgetvalue(res, 0, i_nsub)); + cluster->sub_retain_dead_tuples = (strcmp(PQgetvalue(res, 0, i_retain_dead_tuples), "t") == 0); PQclear(res); PQfinish(conn); diff --git a/src/bin/pg_upgrade/parallel.c b/src/bin/pg_upgrade/parallel.c index 056aa2edaee..6d7941844a7 100644 --- a/src/bin/pg_upgrade/parallel.c +++ b/src/bin/pg_upgrade/parallel.c @@ -40,6 +40,7 @@ typedef struct char *old_pgdata; char *new_pgdata; char *old_tablespace; + char *new_tablespace; } transfer_thread_arg; static exec_thread_arg **exec_thread_args; @@ -171,7 +172,7 @@ win32_exec_prog(exec_thread_arg *args) void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata, - char *old_tablespace) + char *old_tablespace, char *new_tablespace) { #ifndef WIN32 pid_t child; @@ -181,7 +182,7 @@ parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, #endif if (user_opts.jobs <= 1) - transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, NULL); + transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, NULL, NULL); else { /* parallel */ @@ -225,7 +226,7 @@ parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, if (child == 0) { transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, - old_tablespace); + old_tablespace, new_tablespace); /* if we take another exit path, it will be non-zero */ /* use _exit to skip atexit() functions */ _exit(0); @@ -246,6 +247,7 @@ parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, new_arg->new_pgdata = pg_strdup(new_pgdata); pg_free(new_arg->old_tablespace); new_arg->old_tablespace = old_tablespace ? pg_strdup(old_tablespace) : NULL; + new_arg->new_tablespace = new_tablespace ? 
pg_strdup(new_tablespace) : NULL; child = (HANDLE) _beginthreadex(NULL, 0, (void *) win32_transfer_all_new_dbs, new_arg, 0, NULL); @@ -263,7 +265,8 @@ DWORD win32_transfer_all_new_dbs(transfer_thread_arg *args) { transfer_all_new_dbs(args->old_db_arr, args->new_db_arr, args->old_pgdata, - args->new_pgdata, args->old_tablespace); + args->new_pgdata, args->old_tablespace, + args->new_tablespace); /* terminates thread */ return 0; diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 536e49d2616..d5cd5bf0b3a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -67,6 +67,7 @@ static void set_frozenxids(bool minmxid_only); static void make_outputdirs(char *pgdata); static void setup(char *argv0); static void create_logical_replication_slots(void); +static void create_conflict_detection_slot(void); ClusterInfo old_cluster, new_cluster; @@ -88,6 +89,7 @@ int main(int argc, char **argv) { char *deletion_script_file_name = NULL; + bool migrate_logical_slots; /* * pg_upgrade doesn't currently use common/logging.c, but initialize it @@ -198,18 +200,39 @@ main(int argc, char **argv) new_cluster.pgdata); check_ok(); + migrate_logical_slots = count_old_cluster_logical_slots(); + /* - * Migrate the logical slots to the new cluster. Note that we need to do - * this after resetting WAL because otherwise the required WAL would be - * removed and slots would become unusable. There is a possibility that - * background processes might generate some WAL before we could create the - * slots in the new cluster but we can ignore that WAL as that won't be - * required downstream. + * Migrate replication slots to the new cluster. + * + * Note that we must migrate logical slots after resetting WAL because + * otherwise the required WAL would be removed and slots would become + * unusable. There is a possibility that background processes might + * generate some WAL before we could create the slots in the new cluster + * but we can ignore that WAL as that won't be required downstream. + * + * The conflict detection slot is not affected by concerns related to WALs + * as it only retains the dead tuples. It is created here for consistency. + * Note that the new conflict detection slot uses the latest transaction + * ID as xmin, so it cannot protect dead tuples that existed before the + * upgrade. Additionally, commit timestamps and origin data are not + * preserved during the upgrade. So, even after creating the slot, the + * upgraded subscriber may be unable to detect conflicts or log relevant + * commit timestamps and origins when applying changes from the publisher + * that occurred before the upgrade, especially if those changes were not + * replicated. It can only protect tuples that might be deleted after the + * new cluster starts. */ - if (count_old_cluster_logical_slots()) + if (migrate_logical_slots || old_cluster.sub_retain_dead_tuples) { start_postmaster(&new_cluster, true); - create_logical_replication_slots(); + + if (migrate_logical_slots) + create_logical_replication_slots(); + + if (old_cluster.sub_retain_dead_tuples) + create_conflict_detection_slot(); + stop_postmaster(false); } @@ -1025,3 +1048,24 @@ create_logical_replication_slots(void) return; } + +/* + * create_conflict_detection_slot() + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins, for migrated + * subscriptions with retain_dead_tuples enabled. 
+ */ +static void +create_conflict_detection_slot(void) +{ + PGconn *conn_new_template1; + + prep_status("Creating the replication conflict detection slot"); + + conn_new_template1 = connectToServer(&new_cluster, "template1"); + PQclear(executeQueryOrDie(conn_new_template1, "SELECT pg_catalog.binary_upgrade_create_conflict_detection_slot()")); + PQfinish(conn_new_template1); + + check_ok(); +} diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 69c965bb7d0..0ef47be0dc1 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -300,8 +300,12 @@ typedef struct uint32 major_version; /* PG_VERSION of cluster */ char major_version_str[64]; /* string PG_VERSION of cluster */ uint32 bin_version; /* version returned from pg_ctl */ + char **tablespaces; /* tablespace directories */ + int num_tablespaces; const char *tablespace_suffix; /* directory specification */ int nsubs; /* number of subscriptions */ + bool sub_retain_dead_tuples; /* whether a subscription enables + * retain_dead_tuples. */ } ClusterInfo; @@ -354,8 +358,6 @@ typedef struct const char *progname; /* complete pathname for this program */ char *user; /* username for clusters */ bool user_specified; /* user specified on command-line */ - char **old_tablespaces; /* tablespaces */ - int num_old_tablespaces; LibraryInfo *libraries; /* loadable libraries */ int num_libraries; ClusterInfo *running_cluster; @@ -441,7 +443,7 @@ FileNameMap *gen_db_file_maps(DbInfo *old_db, const char *new_pgdata); void get_db_rel_and_slot_infos(ClusterInfo *cluster); int count_old_cluster_logical_slots(void); -void get_subscription_count(ClusterInfo *cluster); +void get_subscription_info(ClusterInfo *cluster); /* option.c */ @@ -455,7 +457,7 @@ void transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata); void transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata, - char *old_tablespace); + char *old_tablespace, char *new_tablespace); /* tablespace.c */ @@ -503,7 +505,7 @@ void parallel_exec_prog(const char *log_file, const char *opt_log_file, const char *fmt,...) 
pg_attribute_printf(3, 4); void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata, - char *old_tablespace); + char *old_tablespace, char *new_tablespace); bool reap_child(bool wait_for_child); /* task.c */ diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 2959c07f0b8..38c17ceabf2 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -17,7 +17,7 @@ #include "common/logging.h" #include "pg_upgrade.h" -static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); +static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace, char *new_tablespace); static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); /* @@ -136,21 +136,22 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, */ if (user_opts.jobs <= 1) parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, - new_pgdata, NULL); + new_pgdata, NULL, NULL); else { int tblnum; /* transfer default tablespace */ parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, - new_pgdata, old_pgdata); + new_pgdata, old_pgdata, new_pgdata); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum], + new_cluster.tablespaces[tblnum]); /* reap all children */ while (reap_child(true) == true) ; @@ -169,7 +170,8 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, */ void transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, - char *old_pgdata, char *new_pgdata, char *old_tablespace) + char *old_pgdata, char *new_pgdata, + char *old_tablespace, char *new_tablespace) { int old_dbnum, new_dbnum; @@ -204,7 +206,7 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, new_pgdata); if (n_maps) { - transfer_single_new_db(mappings, n_maps, old_tablespace); + transfer_single_new_db(mappings, n_maps, old_tablespace, new_tablespace); } /* We allocate something even for n_maps == 0 */ pg_free(mappings); @@ -234,10 +236,10 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, * moved_db_dir: Destination for the pg_restore-generated database directory. */ static bool -prepare_for_swap(const char *old_tablespace, Oid db_oid, - char *old_catalog_dir, char *new_db_dir, char *moved_db_dir) +prepare_for_swap(const char *old_tablespace, const char *new_tablespace, + Oid db_oid, char *old_catalog_dir, char *new_db_dir, + char *moved_db_dir) { - const char *new_tablespace; const char *old_tblspc_suffix; const char *new_tblspc_suffix; char old_tblspc[MAXPGPATH]; @@ -247,24 +249,14 @@ prepare_for_swap(const char *old_tablespace, Oid db_oid, struct stat st; if (strcmp(old_tablespace, old_cluster.pgdata) == 0) - { - new_tablespace = new_cluster.pgdata; - new_tblspc_suffix = "/base"; old_tblspc_suffix = "/base"; - } else - { - /* - * XXX: The below line is a hack to deal with the fact that we - * presently don't have an easy way to find the corresponding new - * tablespace's path. This will need to be fixed if/when we add - * pg_upgrade support for in-place tablespaces. 
- */ - new_tablespace = old_tablespace; + old_tblspc_suffix = old_cluster.tablespace_suffix; + if (strcmp(new_tablespace, new_cluster.pgdata) == 0) + new_tblspc_suffix = "/base"; + else new_tblspc_suffix = new_cluster.tablespace_suffix; - old_tblspc_suffix = old_cluster.tablespace_suffix; - } /* Old and new cluster paths. */ snprintf(old_tblspc, sizeof(old_tblspc), "%s%s", old_tablespace, old_tblspc_suffix); @@ -290,19 +282,19 @@ prepare_for_swap(const char *old_tablespace, Oid db_oid, /* Create directory for stuff that is moved aside. */ if (pg_mkdir_p(moved_tblspc, pg_dir_create_mode) != 0 && errno != EEXIST) - pg_fatal("could not create directory \"%s\"", moved_tblspc); + pg_fatal("could not create directory \"%s\": %m", moved_tblspc); /* Create directory for old catalog files. */ if (pg_mkdir_p(old_catalog_dir, pg_dir_create_mode) != 0) - pg_fatal("could not create directory \"%s\"", old_catalog_dir); + pg_fatal("could not create directory \"%s\": %m", old_catalog_dir); /* Move the new cluster's database directory aside. */ if (rename(new_db_dir, moved_db_dir) != 0) - pg_fatal("could not rename \"%s\" to \"%s\"", new_db_dir, moved_db_dir); + pg_fatal("could not rename directory \"%s\" to \"%s\": %m", new_db_dir, moved_db_dir); /* Move the old cluster's database directory into place. */ if (rename(old_db_dir, new_db_dir) != 0) - pg_fatal("could not rename \"%s\" to \"%s\"", old_db_dir, new_db_dir); + pg_fatal("could not rename directory \"%s\" to \"%s\": %m", old_db_dir, new_db_dir); return true; } @@ -390,7 +382,7 @@ swap_catalog_files(FileNameMap *maps, int size, const char *old_catalog_dir, snprintf(dest, sizeof(dest), "%s/%s", old_catalog_dir, de->d_name); if (rename(path, dest) != 0) - pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + pg_fatal("could not rename file \"%s\" to \"%s\": %m", path, dest); } if (errno) pg_fatal("could not read directory \"%s\": %m", new_db_dir); @@ -417,7 +409,7 @@ swap_catalog_files(FileNameMap *maps, int size, const char *old_catalog_dir, snprintf(dest, sizeof(dest), "%s/%s", new_db_dir, de->d_name); if (rename(path, dest) != 0) - pg_fatal("could not rename \"%s\" to \"%s\": %m", path, dest); + pg_fatal("could not rename file \"%s\" to \"%s\": %m", path, dest); /* * We don't fsync() the database files in the file synchronization @@ -450,7 +442,7 @@ swap_catalog_files(FileNameMap *maps, int size, const char *old_catalog_dir, * during pg_restore. 
*/ static void -do_swap(FileNameMap *maps, int size, char *old_tablespace) +do_swap(FileNameMap *maps, int size, char *old_tablespace, char *new_tablespace) { char old_catalog_dir[MAXPGPATH]; char new_db_dir[MAXPGPATH]; @@ -470,21 +462,23 @@ do_swap(FileNameMap *maps, int size, char *old_tablespace) */ if (old_tablespace) { - if (prepare_for_swap(old_tablespace, maps[0].db_oid, + if (prepare_for_swap(old_tablespace, new_tablespace, maps[0].db_oid, old_catalog_dir, new_db_dir, moved_db_dir)) swap_catalog_files(maps, size, old_catalog_dir, new_db_dir, moved_db_dir); } else { - if (prepare_for_swap(old_cluster.pgdata, maps[0].db_oid, + if (prepare_for_swap(old_cluster.pgdata, new_cluster.pgdata, maps[0].db_oid, old_catalog_dir, new_db_dir, moved_db_dir)) swap_catalog_files(maps, size, old_catalog_dir, new_db_dir, moved_db_dir); - for (int tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (int tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) { - if (prepare_for_swap(os_info.old_tablespaces[tblnum], maps[0].db_oid, + if (prepare_for_swap(old_cluster.tablespaces[tblnum], + new_cluster.tablespaces[tblnum], + maps[0].db_oid, old_catalog_dir, new_db_dir, moved_db_dir)) swap_catalog_files(maps, size, old_catalog_dir, new_db_dir, moved_db_dir); @@ -498,7 +492,8 @@ do_swap(FileNameMap *maps, int size, char *old_tablespace) * create links for mappings stored in "maps" array. */ static void -transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) +transfer_single_new_db(FileNameMap *maps, int size, + char *old_tablespace, char *new_tablespace) { int mapnum; bool vm_must_add_frozenbit = false; @@ -520,7 +515,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) */ Assert(!vm_must_add_frozenbit); - do_swap(maps, size, old_tablespace); + do_swap(maps, size, old_tablespace, new_tablespace); return; } diff --git a/src/bin/pg_upgrade/server.c b/src/bin/pg_upgrade/server.c index 873e5b5117b..7eb15bc7d5a 100644 --- a/src/bin/pg_upgrade/server.c +++ b/src/bin/pg_upgrade/server.c @@ -242,24 +242,6 @@ start_postmaster(ClusterInfo *cluster, bool report_and_exit_on_error) appendPQExpBufferStr(&pgoptions, " -c synchronous_commit=off -c fsync=off -c full_page_writes=off"); /* - * Use max_slot_wal_keep_size as -1 to prevent the WAL removal by the - * checkpointer process. If WALs required by logical replication slots - * are removed, the slots are unusable. This setting prevents the - * invalidation of slots during the upgrade. We set this option when - * cluster is PG17 or later because logical replication slots can only be - * migrated since then. Besides, max_slot_wal_keep_size is added in PG13. - */ - if (GET_MAJOR_VERSION(cluster->major_version) >= 1700) - appendPQExpBufferStr(&pgoptions, " -c max_slot_wal_keep_size=-1"); - - /* - * Use idle_replication_slot_timeout=0 to prevent slot invalidation due to - * idle_timeout by checkpointer process during upgrade. - */ - if (GET_MAJOR_VERSION(cluster->major_version) >= 1800) - appendPQExpBufferStr(&pgoptions, " -c idle_replication_slot_timeout=0"); - - /* * Use -b to disable autovacuum and logical replication launcher * (effective in PG17 or later for the latter). 
*/ diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index 7d82593879d..0b15e38297e 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -375,6 +375,9 @@ SKIP: { my $dstnode = PostgreSQL::Test::Cluster->new('dst_node'); + skip "regress_dump_restore not enabled in PG_TEST_EXTRA" + if (!$ENV{PG_TEST_EXTRA} + || $ENV{PG_TEST_EXTRA} !~ /\bregress_dump_restore\b/); skip "different Postgres versions" if ($oldnode->pg_version != $dstnode->pg_version); skip "source node not using default install" diff --git a/src/bin/pg_upgrade/t/004_subscription.pl b/src/bin/pg_upgrade/t/004_subscription.pl index c545abf6581..77387be0f9d 100644 --- a/src/bin/pg_upgrade/t/004_subscription.pl +++ b/src/bin/pg_upgrade/t/004_subscription.pl @@ -22,13 +22,13 @@ $publisher->start; # Initialize the old subscriber node my $old_sub = PostgreSQL::Test::Cluster->new('old_sub'); -$old_sub->init; +$old_sub->init(allows_streaming => 'physical'); $old_sub->start; my $oldbindir = $old_sub->config_data('--bindir'); # Initialize the new subscriber my $new_sub = PostgreSQL::Test::Cluster->new('new_sub'); -$new_sub->init; +$new_sub->init(allows_streaming => 'physical'); my $newbindir = $new_sub->config_data('--bindir'); # In a VPATH build, we'll be started in the source directory, but we want @@ -53,7 +53,8 @@ $old_sub->safe_psql('postgres', $old_sub->stop; -$new_sub->append_conf('postgresql.conf', "max_active_replication_origins = 0"); +$new_sub->append_conf('postgresql.conf', + "max_active_replication_origins = 0"); # pg_upgrade will fail because the new cluster has insufficient # max_active_replication_origins. @@ -80,7 +81,56 @@ command_checks_all( ); # Reset max_active_replication_origins -$new_sub->append_conf('postgresql.conf', "max_active_replication_origins = 10"); +$new_sub->append_conf('postgresql.conf', + "max_active_replication_origins = 10"); + +# Cleanup +$publisher->safe_psql('postgres', "DROP PUBLICATION regress_pub1"); +$old_sub->start; +$old_sub->safe_psql('postgres', "DROP SUBSCRIPTION regress_sub1;"); + +# ------------------------------------------------------ +# Check that pg_upgrade fails when max_replication_slots configured in the new +# cluster is less than the number of logical slots in the old cluster + 1 when +# subscription's retain_dead_tuples option is enabled. +# ------------------------------------------------------ +# It is sufficient to use disabled subscription to test upgrade failure. + +$publisher->safe_psql('postgres', "CREATE PUBLICATION regress_pub1"); +$old_sub->safe_psql('postgres', + "CREATE SUBSCRIPTION regress_sub1 CONNECTION '$connstr' PUBLICATION regress_pub1 WITH (enabled = false, retain_dead_tuples = true)" +); + +$old_sub->stop; + +$new_sub->append_conf('postgresql.conf', 'max_replication_slots = 0'); + +# pg_upgrade will fail because the new cluster has insufficient +# max_replication_slots. 
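+# (Worked example: the old subscriber cluster has no logical replication +# slots and exactly one subscription with retain_dead_tuples enabled, so the +# new cluster must allow at least 0 + 1 = 1 replication slot to hold the +# pg_conflict_detection slot; the max_replication_slots = 0 configured above +# is therefore insufficient.)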
+command_checks_all( + [ + 'pg_upgrade', + '--no-sync', + '--old-datadir' => $old_sub->data_dir, + '--new-datadir' => $new_sub->data_dir, + '--old-bindir' => $oldbindir, + '--new-bindir' => $newbindir, + '--socketdir' => $new_sub->host, + '--old-port' => $old_sub->port, + '--new-port' => $new_sub->port, + $mode, + '--check', + ], + 1, + [ + qr/"max_replication_slots" \(0\) must be greater than or equal to the number of logical replication slots on the old cluster plus one additional slot required for retaining conflict detection information \(1\)/ + ], + [qr//], + 'run of pg_upgrade where the new cluster has insufficient max_replication_slots' +); + +# Reset max_replication_slots +$new_sub->append_conf('postgresql.conf', 'max_replication_slots = 10'); # Cleanup $publisher->safe_psql('postgres', "DROP PUBLICATION regress_pub1"); @@ -198,8 +248,9 @@ $old_sub->safe_psql( rmtree($new_sub->data_dir . "/pg_upgrade_output.d"); # Verify that the upgrade should be successful with tables in 'ready'/'init' -# state along with retaining the replication origin's remote lsn, subscription's -# running status, and failover option. +# state along with retaining the replication origin's remote lsn, +# subscription's running status, failover option, and retain_dead_tuples +# option. $publisher->safe_psql( 'postgres', qq[ CREATE TABLE tab_upgraded1(id int); @@ -209,7 +260,7 @@ $publisher->safe_psql( $old_sub->safe_psql( 'postgres', qq[ CREATE TABLE tab_upgraded1(id int); - CREATE SUBSCRIPTION regress_sub4 CONNECTION '$connstr' PUBLICATION regress_pub4 WITH (failover = true); + CREATE SUBSCRIPTION regress_sub4 CONNECTION '$connstr' PUBLICATION regress_pub4 WITH (failover = true, retain_dead_tuples = true); ]); # Wait till the table tab_upgraded1 reaches 'ready' state @@ -268,7 +319,8 @@ $new_sub->append_conf('postgresql.conf', # Check that pg_upgrade is successful when all tables are in ready or in # init state (tab_upgraded1 table is in ready state and tab_upgraded2 table is # in init state) along with retaining the replication origin's remote lsn, -# subscription's running status, and failover option. +# subscription's running status, failover option, and retain_dead_tuples +# option. # ------------------------------------------------------ command_ok( [ @@ -291,7 +343,8 @@ ok( !-d $new_sub->data_dir . "/pg_upgrade_output.d", # ------------------------------------------------------ # Check that the data inserted to the publisher when the new subscriber is down # will be replicated once it is started. Also check that the old subscription -# states and relations origins are all preserved. +# states and relation origins are all preserved, and that the conflict +# detection slot is created. # ------------------------------------------------------ $publisher->safe_psql( 'postgres', qq[ @@ -301,15 +354,16 @@ $publisher->safe_psql( $new_sub->start; -# The subscription's running status and failover option should be preserved -# in the upgraded instance. So regress_sub4 should still have subenabled and -# subfailover set to true, while regress_sub5 should have both set to false. +# The subscription's running status, failover option, and retain_dead_tuples +# option should be preserved in the upgraded instance. So regress_sub4 should +# still have subenabled, subfailover, and subretaindeadtuples set to true, +# while regress_sub5 should have all three set to false. 
$result = $new_sub->safe_psql('postgres', - "SELECT subname, subenabled, subfailover FROM pg_subscription ORDER BY subname" + "SELECT subname, subenabled, subfailover, subretaindeadtuples FROM pg_subscription ORDER BY subname" ); -is( $result, qq(regress_sub4|t|t -regress_sub5|f|f), - "check that the subscription's running status and failover are preserved" +is( $result, qq(regress_sub4|t|t|t +regress_sub5|f|f|f), + "check that the subscription's running status, failover, and retain_dead_tuples are preserved" ); # Subscription relations should be preserved @@ -328,6 +382,11 @@ $result = $new_sub->safe_psql('postgres', ); is($result, qq($remote_lsn), "remote_lsn should have been preserved"); +# The conflict detection slot should be created +$result = $new_sub->safe_psql('postgres', + "SELECT xmin IS NOT NULL from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'"); +is($result, qq(t), "conflict detection slot exists"); + # Resume the initial sync and wait until all tables of subscription # 'regress_sub5' are synchronized $new_sub->append_conf('postgresql.conf', diff --git a/src/bin/pg_upgrade/t/005_char_signedness.pl b/src/bin/pg_upgrade/t/005_char_signedness.pl index 17fa0d48b15..cd8cff6f513 100644 --- a/src/bin/pg_upgrade/t/005_char_signedness.pl +++ b/src/bin/pg_upgrade/t/005_char_signedness.pl @@ -65,7 +65,7 @@ command_checks_all( $mode ], 1, - [qr/--set-char-signedness option cannot be used/], + [qr/option --set-char-signedness cannot be used/], [], '--set-char-signedness option cannot be used for upgrading from v18 or later' ); diff --git a/src/bin/pg_upgrade/t/006_transfer_modes.pl b/src/bin/pg_upgrade/t/006_transfer_modes.pl index 550a63fdf7d..348f4021462 100644 --- a/src/bin/pg_upgrade/t/006_transfer_modes.pl +++ b/src/bin/pg_upgrade/t/006_transfer_modes.pl @@ -13,7 +13,8 @@ sub test_mode { my ($mode) = @_; - my $old = PostgreSQL::Test::Cluster->new('old', install_path => $ENV{oldinstall}); + my $old = + PostgreSQL::Test::Cluster->new('old', install_path => $ENV{oldinstall}); my $new = PostgreSQL::Test::Cluster->new('new'); # --swap can't be used to upgrade from versions older than 10, so just skip @@ -37,24 +38,50 @@ sub test_mode } $new->init(); + # allow_in_place_tablespaces is available as far back as v10. + if ($old->pg_version >= 10) + { + $new->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); + $old->append_conf('postgresql.conf', "allow_in_place_tablespaces = true"); + } + # Create a small variety of simple test objects on the old cluster. We'll # check that these reach the new version after upgrading. $old->start; - $old->safe_psql('postgres', "CREATE TABLE test1 AS SELECT generate_series(1, 100)"); + $old->safe_psql('postgres', + "CREATE TABLE test1 AS SELECT generate_series(1, 100)"); $old->safe_psql('postgres', "CREATE DATABASE testdb1"); - $old->safe_psql('testdb1', "CREATE TABLE test2 AS SELECT generate_series(200, 300)"); + $old->safe_psql('testdb1', + "CREATE TABLE test2 AS SELECT generate_series(200, 300)"); $old->safe_psql('testdb1', "VACUUM FULL test2"); $old->safe_psql('testdb1', "CREATE SEQUENCE testseq START 5432"); - # For cross-version tests, we can also check that pg_upgrade handles - # tablespaces. + # If an old installation is provided, we can test non-in-place tablespaces. 
if (defined($ENV{oldinstall})) { my $tblspc = PostgreSQL::Test::Utils::tempdir_short(); - $old->safe_psql('postgres', "CREATE TABLESPACE test_tblspc LOCATION '$tblspc'"); - $old->safe_psql('postgres', "CREATE DATABASE testdb2 TABLESPACE test_tblspc"); - $old->safe_psql('postgres', "CREATE TABLE test3 TABLESPACE test_tblspc AS SELECT generate_series(300, 401)"); - $old->safe_psql('testdb2', "CREATE TABLE test4 AS SELECT generate_series(400, 502)"); + $old->safe_psql('postgres', + "CREATE TABLESPACE test_tblspc LOCATION '$tblspc'"); + $old->safe_psql('postgres', + "CREATE DATABASE testdb2 TABLESPACE test_tblspc"); + $old->safe_psql('postgres', + "CREATE TABLE test3 TABLESPACE test_tblspc AS SELECT generate_series(300, 401)" + ); + $old->safe_psql('testdb2', + "CREATE TABLE test4 AS SELECT generate_series(400, 502)"); + } + + # If the old cluster is >= v10, we can test in-place tablespaces. + if ($old->pg_version >= 10) + { + $old->safe_psql('postgres', + "CREATE TABLESPACE inplc_tblspc LOCATION ''"); + $old->safe_psql('postgres', + "CREATE DATABASE testdb3 TABLESPACE inplc_tblspc"); + $old->safe_psql('postgres', + "CREATE TABLE test5 TABLESPACE inplc_tblspc AS SELECT generate_series(503, 606)"); + $old->safe_psql('testdb3', + "CREATE TABLE test6 AS SELECT generate_series(607, 711)"); } $old->stop; @@ -86,15 +113,25 @@ sub test_mode $result = $new->safe_psql('testdb1', "SELECT nextval('testseq')"); is($result, '5432', "sequence data after pg_upgrade $mode"); - # For cross-version tests, we should have some objects in a non-default - # tablespace. + # Tests for non-in-place tablespaces. if (defined($ENV{oldinstall})) { - $result = $new->safe_psql('postgres', "SELECT COUNT(*) FROM test3"); + $result = + $new->safe_psql('postgres', "SELECT COUNT(*) FROM test3"); is($result, '102', "test3 data after pg_upgrade $mode"); - $result = $new->safe_psql('testdb2', "SELECT COUNT(*) FROM test4"); + $result = + $new->safe_psql('testdb2', "SELECT COUNT(*) FROM test4"); is($result, '103', "test4 data after pg_upgrade $mode"); } + + # Tests for in-place tablespaces. + if ($old->pg_version >= 10) + { + $result = $new->safe_psql('postgres', "SELECT COUNT(*) FROM test5"); + is($result, '104', "test5 data after pg_upgrade $mode"); + $result = $new->safe_psql('testdb3', "SELECT COUNT(*) FROM test6"); + is($result, '105', "test6 data after pg_upgrade $mode"); + } $new->stop; } diff --git a/src/bin/pg_upgrade/tablespace.c b/src/bin/pg_upgrade/tablespace.c index 3520a75ba31..151d74e1734 100644 --- a/src/bin/pg_upgrade/tablespace.c +++ b/src/bin/pg_upgrade/tablespace.c @@ -23,10 +23,20 @@ init_tablespaces(void) set_tablespace_directory_suffix(&old_cluster); set_tablespace_directory_suffix(&new_cluster); - if (os_info.num_old_tablespaces > 0 && + if (old_cluster.num_tablespaces > 0 && strcmp(old_cluster.tablespace_suffix, new_cluster.tablespace_suffix) == 0) - pg_fatal("Cannot upgrade to/from the same system catalog version when\n" - "using tablespaces."); + { + for (int i = 0; i < old_cluster.num_tablespaces; i++) + { + /* + * In-place tablespaces are okay for same-version upgrades because + * their paths will differ between clusters. 
+ */ + if (strcmp(old_cluster.tablespaces[i], new_cluster.tablespaces[i]) == 0) + pg_fatal("Cannot upgrade to/from the same system catalog version when\n" + "using tablespaces."); + } + } } @@ -53,19 +63,48 @@ get_tablespace_paths(void) res = executeQueryOrDie(conn, "%s", query); - if ((os_info.num_old_tablespaces = PQntuples(res)) != 0) - os_info.old_tablespaces = - (char **) pg_malloc(os_info.num_old_tablespaces * sizeof(char *)); + old_cluster.num_tablespaces = PQntuples(res); + new_cluster.num_tablespaces = PQntuples(res); + + if (PQntuples(res) != 0) + { + old_cluster.tablespaces = + (char **) pg_malloc(old_cluster.num_tablespaces * sizeof(char *)); + new_cluster.tablespaces = + (char **) pg_malloc(new_cluster.num_tablespaces * sizeof(char *)); + } else - os_info.old_tablespaces = NULL; + { + old_cluster.tablespaces = NULL; + new_cluster.tablespaces = NULL; + } i_spclocation = PQfnumber(res, "spclocation"); - for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) + for (tblnum = 0; tblnum < old_cluster.num_tablespaces; tblnum++) { struct stat statBuf; + char *spcloc = PQgetvalue(res, tblnum, i_spclocation); - os_info.old_tablespaces[tblnum] = pg_strdup(PQgetvalue(res, tblnum, i_spclocation)); + /* + * For now, we do not expect non-in-place tablespaces to move during + * upgrade. If that changes, it will likely become necessary to run + * the above query on the new cluster, too. + * + * pg_tablespace_location() returns absolute paths for non-in-place + * tablespaces and relative paths for in-place ones, so we use + * is_absolute_path() to distinguish between them. + */ + if (is_absolute_path(PQgetvalue(res, tblnum, i_spclocation))) + { + old_cluster.tablespaces[tblnum] = pg_strdup(spcloc); + new_cluster.tablespaces[tblnum] = old_cluster.tablespaces[tblnum]; + } + else + { + old_cluster.tablespaces[tblnum] = psprintf("%s/%s", old_cluster.pgdata, spcloc); + new_cluster.tablespaces[tblnum] = psprintf("%s/%s", new_cluster.pgdata, spcloc); + } /* * Check that the tablespace path exists and is a directory. @@ -76,21 +115,21 @@ get_tablespace_paths(void) * that contains user tablespaces is moved as part of pg_upgrade * preparation and the symbolic links are not updated. */ - if (stat(os_info.old_tablespaces[tblnum], &statBuf) != 0) + if (stat(old_cluster.tablespaces[tblnum], &statBuf) != 0) { if (errno == ENOENT) report_status(PG_FATAL, "tablespace directory \"%s\" does not exist", - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum]); else report_status(PG_FATAL, "could not stat tablespace directory \"%s\": %m", - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum]); } if (!S_ISDIR(statBuf.st_mode)) report_status(PG_FATAL, "tablespace path \"%s\" is not a directory", - os_info.old_tablespaces[tblnum]); + old_cluster.tablespaces[tblnum]); } PQclear(res); diff --git a/src/bin/pg_upgrade/task.c b/src/bin/pg_upgrade/task.c index a48d5691390..ee0e2457152 100644 --- a/src/bin/pg_upgrade/task.c +++ b/src/bin/pg_upgrade/task.c @@ -192,8 +192,7 @@ start_conn(const ClusterInfo *cluster, UpgradeTaskSlot *slot) slot->conn = PQconnectStart(conn_opts.data); if (!slot->conn) - pg_fatal("failed to create connection with connection string: \"%s\"", - conn_opts.data); + pg_fatal("out of memory"); termPQExpBuffer(&conn_opts); } @@ -402,7 +401,7 @@ wait_on_slots(UpgradeTaskSlot *slots, int numslots) * If we found socket(s) to wait on, wait. 
*/ if (select_loop(maxFd, &input, &output) == -1) - pg_fatal("select() failed: %m"); + pg_fatal("%s() failed: %m", "select"); /* * Mark which sockets appear to be ready. diff --git a/src/bin/pg_verifybackup/meson.build b/src/bin/pg_verifybackup/meson.build index 9567d55500d..f45ea790d8e 100644 --- a/src/bin/pg_verifybackup/meson.build +++ b/src/bin/pg_verifybackup/meson.build @@ -23,10 +23,10 @@ tests += { 'sd': meson.current_source_dir(), 'bd': meson.current_build_dir(), 'tap': { - 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.path() : '', - 'TAR': tar.found() ? tar.path() : '', - 'LZ4': program_lz4.found() ? program_lz4.path() : '', - 'ZSTD': program_zstd.found() ? program_zstd.path() : ''}, + 'env': {'GZIP_PROGRAM': gzip.found() ? gzip.full_path() : '', + 'TAR': tar.found() ? tar.full_path() : '', + 'LZ4': program_lz4.found() ? program_lz4.full_path() : '', + 'ZSTD': program_zstd.found() ? program_zstd.full_path() : ''}, 'tests': [ 't/001_basic.pl', 't/002_algorithm.pl', diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c index 48994ef9bc6..5e6c13bb921 100644 --- a/src/bin/pg_verifybackup/pg_verifybackup.c +++ b/src/bin/pg_verifybackup/pg_verifybackup.c @@ -1207,7 +1207,7 @@ parse_required_wal(verifier_context *context, char *pg_waldump_path, { char *pg_waldump_cmd; - pg_waldump_cmd = psprintf("\"%s\" --quiet --path=\"%s\" --timeline=%u --start=%X/%X --end=%X/%X\n", + pg_waldump_cmd = psprintf("\"%s\" --quiet --path=\"%s\" --timeline=%u --start=%X/%08X --end=%X/%08X\n", pg_waldump_path, wal_directory, this_wal_range->tli, LSN_FORMAT_ARGS(this_wal_range->start_lsn), LSN_FORMAT_ARGS(this_wal_range->end_lsn)); diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl index deed3ec247d..bc3d6b352ad 100644 --- a/src/bin/pg_verifybackup/t/008_untar.pl +++ b/src/bin/pg_verifybackup/t/008_untar.pl @@ -16,6 +16,22 @@ my $primary = PostgreSQL::Test::Cluster->new('primary'); $primary->init(allows_streaming => 1); $primary->start; +# Create file with some random data and an arbitrary size, useful to check +# the solidity of the compression and decompression logic. The size of the +# file is chosen to be around 640kB. This has proven to be large enough to +# detect some issues related to LZ4, and low enough to not impact the runtime +# of the test significantly. +my $junk_data = $primary->safe_psql( + 'postgres', qq( + SELECT string_agg(encode(sha256(i::bytea), 'hex'), '') + FROM generate_series(1, 10240) s(i);)); +my $data_dir = $primary->data_dir; +my $junk_file = "$data_dir/junk"; +open my $jf, '>', $junk_file + or die "Could not create junk file: $!"; +print $jf $junk_data; +close $jf; + # Create a tablespace directory. 
my $source_ts_path = PostgreSQL::Test::Utils::tempdir_short(); @@ -53,6 +69,12 @@ my @test_configuration = ( 'enabled' => check_pg_config("#define USE_LZ4 1") }, { + 'compression_method' => 'lz4', + 'backup_flags' => [ '--compress', 'server-lz4:5' ], + 'backup_archive' => [ 'base.tar.lz4', "$tsoid.tar.lz4" ], + 'enabled' => check_pg_config("#define USE_LZ4 1") + }, + { 'compression_method' => 'zstd', 'backup_flags' => [ '--compress', 'server-zstd' ], 'backup_archive' => [ 'base.tar.zst', "$tsoid.tar.zst" ], diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl index d8d2b06c7ee..b62faeb5acf 100644 --- a/src/bin/pg_verifybackup/t/010_client_untar.pl +++ b/src/bin/pg_verifybackup/t/010_client_untar.pl @@ -15,6 +15,22 @@ my $primary = PostgreSQL::Test::Cluster->new('primary'); $primary->init(allows_streaming => 1); $primary->start; +# Create file with some random data and an arbitrary size, useful to check +# the solidity of the compression and decompression logic. The size of the +# file is chosen to be around 640kB. This has proven to be large enough to +# detect some issues related to LZ4, and low enough to not impact the runtime +# of the test significantly. +my $junk_data = $primary->safe_psql( + 'postgres', qq( + SELECT string_agg(encode(sha256(i::bytea), 'hex'), '') + FROM generate_series(1, 10240) s(i);)); +my $data_dir = $primary->data_dir; +my $junk_file = "$data_dir/junk"; +open my $jf, '>', $junk_file + or die "Could not create junk file: $!"; +print $jf $junk_data; +close $jf; + my $backup_path = $primary->backup_dir . '/client-backup'; my $extract_path = $primary->backup_dir . '/extracted-backup'; @@ -38,6 +54,12 @@ my @test_configuration = ( 'enabled' => check_pg_config("#define USE_LZ4 1") }, { + 'compression_method' => 'lz4', + 'backup_flags' => [ '--compress', 'client-lz4:1' ], + 'backup_archive' => 'base.tar.lz4', + 'enabled' => check_pg_config("#define USE_LZ4 1") + }, + { 'compression_method' => 'zstd', 'backup_flags' => [ '--compress', 'client-zstd:5' ], 'backup_archive' => 'base.tar.zst', diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 51fb76efc48..13d3ec2f5be 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -656,7 +656,7 @@ XLogDumpDisplayStats(XLogDumpConfig *config, XLogStats *stats) } total_len = total_rec_len + total_fpi_len; - printf("WAL statistics between %X/%X and %X/%X:\n", + printf("WAL statistics between %X/%08X and %X/%08X:\n", LSN_FORMAT_ARGS(stats->startptr), LSN_FORMAT_ARGS(stats->endptr)); /* @@ -904,7 +904,7 @@ main(int argc, char **argv) config.filter_by_extended = true; break; case 'e': - if (sscanf(optarg, "%X/%X", &xlogid, &xrecoff) != 2) + if (sscanf(optarg, "%X/%08X", &xlogid, &xrecoff) != 2) { pg_log_error("invalid WAL location: \"%s\"", optarg); @@ -1002,7 +1002,7 @@ main(int argc, char **argv) config.filter_by_extended = true; break; case 's': - if (sscanf(optarg, "%X/%X", &xlogid, &xrecoff) != 2) + if (sscanf(optarg, "%X/%08X", &xlogid, &xrecoff) != 2) { pg_log_error("invalid WAL location: \"%s\"", optarg); @@ -1140,7 +1140,7 @@ main(int argc, char **argv) XLogSegNoOffsetToRecPtr(segno, 0, WalSegSz, private.startptr); else if (!XLByteInSeg(private.startptr, segno, WalSegSz)) { - pg_log_error("start WAL location %X/%X is not inside file \"%s\"", + pg_log_error("start WAL location %X/%08X is not inside file \"%s\"", LSN_FORMAT_ARGS(private.startptr), fname); goto bad_argument; @@ -1182,7 +1182,7 @@ main(int argc, char 
**argv) if (!XLByteInSeg(private.endptr, segno, WalSegSz) && private.endptr != (segno + 1) * WalSegSz) { - pg_log_error("end WAL location %X/%X is not inside file \"%s\"", + pg_log_error("end WAL location %X/%08X is not inside file \"%s\"", LSN_FORMAT_ARGS(private.endptr), argv[argc - 1]); goto bad_argument; @@ -1214,7 +1214,7 @@ main(int argc, char **argv) first_record = XLogFindNextRecord(xlogreader_state, private.startptr); if (first_record == InvalidXLogRecPtr) - pg_fatal("could not find a valid record after %X/%X", + pg_fatal("could not find a valid record after %X/%08X", LSN_FORMAT_ARGS(private.startptr)); /* @@ -1224,8 +1224,8 @@ main(int argc, char **argv) */ if (first_record != private.startptr && XLogSegmentOffset(private.startptr, WalSegSz) != 0) - pg_log_info(ngettext("first record is after %X/%X, at %X/%X, skipping over %u byte", - "first record is after %X/%X, at %X/%X, skipping over %u bytes", + pg_log_info(ngettext("first record is after %X/%08X, at %X/%08X, skipping over %u byte", + "first record is after %X/%08X, at %X/%08X, skipping over %u bytes", (first_record - private.startptr)), LSN_FORMAT_ARGS(private.startptr), LSN_FORMAT_ARGS(first_record), @@ -1309,7 +1309,7 @@ main(int argc, char **argv) exit(0); if (errormsg) - pg_fatal("error in WAL record at %X/%X: %s", + pg_fatal("error in WAL record at %X/%08X: %s", LSN_FORMAT_ARGS(xlogreader_state->ReadRecPtr), errormsg); diff --git a/src/bin/pg_walsummary/t/002_blocks.pl b/src/bin/pg_walsummary/t/002_blocks.pl index 270332780a4..0f98c7df82e 100644 --- a/src/bin/pg_walsummary/t/002_blocks.pl +++ b/src/bin/pg_walsummary/t/002_blocks.pl @@ -47,11 +47,12 @@ EOM ok($result, "WAL summarization caught up after insert"); # The WAL summarizer should have generated some IO statistics. -my $stats_reads = $node1->safe_psql( +$node1->poll_query_until( 'postgres', - qq{SELECT sum(reads) > 0 FROM pg_stat_io - WHERE backend_type = 'walsummarizer' AND object = 'wal'}); -is($stats_reads, 't', "WAL summarizer generates statistics for WAL reads"); + q{SELECT sum(reads) > 0 FROM pg_stat_io + WHERE backend_type = 'walsummarizer' AND object = 'wal'}) + or die + "Timed out while waiting for WAL summarizer to generate statistics for WAL reads"; # Find the highest LSN that is summarized on disk. my $summarized_lsn = $node1->safe_psql('postgres', <<EOM); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 497a936c141..125f3c7bbbe 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -3495,6 +3495,8 @@ doRetry(CState *st, pg_time_usec_t *now) static int discardUntilSync(CState *st) { + bool received_sync = false; + /* send a sync */ if (!PQpipelineSync(st->con)) { @@ -3509,10 +3511,21 @@ discardUntilSync(CState *st) PGresult *res = PQgetResult(st->con); if (PQresultStatus(res) == PGRES_PIPELINE_SYNC) + received_sync = true; + else if (received_sync) { - PQclear(res); - res = PQgetResult(st->con); + /* + * PGRES_PIPELINE_SYNC must be followed by another + * PGRES_PIPELINE_SYNC or NULL; otherwise, assert failure. + */ Assert(res == NULL); + + /* + * Reset ongoing sync count to 0 since all PGRES_PIPELINE_SYNC + * results have been discarded. + */ + st->num_syncs = 0; + PQclear(res); break; } PQclear(res); diff --git a/src/bin/pgbench/t/002_pgbench_no_server.pl b/src/bin/pgbench/t/002_pgbench_no_server.pl index f975c73dd75..2cc59cc8140 100644 --- a/src/bin/pgbench/t/002_pgbench_no_server.pl +++ b/src/bin/pgbench/t/002_pgbench_no_server.pl @@ -233,21 +233,9 @@ for my $o (@options) 'pgbench option error: ' . 
$name); } -# Help -pgbench( - '--help', 0, - [ - qr{benchmarking tool for PostgreSQL}, - qr{Usage}, - qr{Initialization options:}, - qr{Common options:}, - qr{Report bugs to} - ], - [qr{^$}], - 'pgbench help'); - -# Version -pgbench('-V', 0, [qr{^pgbench .PostgreSQL. }], [qr{^$}], 'pgbench version'); +program_help_ok('pgbench'); +program_version_ok('pgbench'); +program_options_handling_ok('pgbench'); # list of builtins pgbench( diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c index 81a5ba844ba..0e00d73487c 100644 --- a/src/bin/psql/command.c +++ b/src/bin/psql/command.c @@ -67,8 +67,8 @@ static backslashResult exec_command_C(PsqlScanState scan_state, bool active_bran static backslashResult exec_command_connect(PsqlScanState scan_state, bool active_branch); static backslashResult exec_command_cd(PsqlScanState scan_state, bool active_branch, const char *cmd); -static backslashResult exec_command_close(PsqlScanState scan_state, bool active_branch, - const char *cmd); +static backslashResult exec_command_close_prepared(PsqlScanState scan_state, + bool active_branch, const char *cmd); static backslashResult exec_command_conninfo(PsqlScanState scan_state, bool active_branch); static backslashResult exec_command_copy(PsqlScanState scan_state, bool active_branch); static backslashResult exec_command_copyright(PsqlScanState scan_state, bool active_branch); @@ -330,8 +330,8 @@ exec_command(const char *cmd, status = exec_command_connect(scan_state, active_branch); else if (strcmp(cmd, "cd") == 0) status = exec_command_cd(scan_state, active_branch, cmd); - else if (strcmp(cmd, "close") == 0) - status = exec_command_close(scan_state, active_branch, cmd); + else if (strcmp(cmd, "close_prepared") == 0) + status = exec_command_close_prepared(scan_state, active_branch, cmd); else if (strcmp(cmd, "conninfo") == 0) status = exec_command_conninfo(scan_state, active_branch); else if (pg_strcasecmp(cmd, "copy") == 0) @@ -728,10 +728,10 @@ exec_command_cd(PsqlScanState scan_state, bool active_branch, const char *cmd) } /* - * \close -- close a previously prepared statement + * \close_prepared -- close a previously prepared statement */ static backslashResult -exec_command_close(PsqlScanState scan_state, bool active_branch, const char *cmd) +exec_command_close_prepared(PsqlScanState scan_state, bool active_branch, const char *cmd) { backslashResult status = PSQL_CMD_SKIP_LINE; @@ -778,6 +778,7 @@ exec_command_conninfo(PsqlScanState scan_state, bool active_branch) int ssl_in_use, password_used, gssapi_used; + int version_num; char *paramval; if (!active_branch) @@ -793,7 +794,9 @@ exec_command_conninfo(PsqlScanState scan_state, bool active_branch) /* Get values for the parameters */ host = PQhost(pset.db); hostaddr = PQhostaddr(pset.db); - protocol_version = psprintf("%d", PQprotocolVersion(pset.db)); + version_num = PQfullProtocolVersion(pset.db); + protocol_version = psprintf("%d.%d", version_num / 10000, + version_num % 10000); ssl_in_use = PQsslInUse(pset.db); password_used = PQconnectionUsedPassword(pset.db); gssapi_used = PQconnectionUsedGSSAPI(pset.db); @@ -874,11 +877,11 @@ exec_command_conninfo(PsqlScanState scan_state, bool active_branch) printTableAddCell(&cont, _("Backend PID"), false, false); printTableAddCell(&cont, backend_pid, false, false); - /* TLS Connection */ - printTableAddCell(&cont, _("TLS Connection"), false, false); + /* SSL Connection */ + printTableAddCell(&cont, _("SSL Connection"), false, false); printTableAddCell(&cont, ssl_in_use ? 
_("true") : _("false"), false, false); - /* TLS Information */ + /* SSL Information */ if (ssl_in_use) { char *library, @@ -895,19 +898,19 @@ exec_command_conninfo(PsqlScanState scan_state, bool active_branch) compression = (char *) PQsslAttribute(pset.db, "compression"); alpn = (char *) PQsslAttribute(pset.db, "alpn"); - printTableAddCell(&cont, _("TLS Library"), false, false); + printTableAddCell(&cont, _("SSL Library"), false, false); printTableAddCell(&cont, library ? library : _("unknown"), false, false); - printTableAddCell(&cont, _("TLS Protocol"), false, false); + printTableAddCell(&cont, _("SSL Protocol"), false, false); printTableAddCell(&cont, protocol ? protocol : _("unknown"), false, false); - printTableAddCell(&cont, _("TLS Key Bits"), false, false); + printTableAddCell(&cont, _("SSL Key Bits"), false, false); printTableAddCell(&cont, key_bits ? key_bits : _("unknown"), false, false); - printTableAddCell(&cont, _("TLS Cipher"), false, false); + printTableAddCell(&cont, _("SSL Cipher"), false, false); printTableAddCell(&cont, cipher ? cipher : _("unknown"), false, false); - printTableAddCell(&cont, _("TLS Compression"), false, false); + printTableAddCell(&cont, _("SSL Compression"), false, false); printTableAddCell(&cont, (compression && strcmp(compression, "off") != 0) ? _("true") : _("false"), false, false); @@ -1946,7 +1949,7 @@ exec_command_gexec(PsqlScanState scan_state, bool active_branch) { if (PQpipelineStatus(pset.db) != PQ_PIPELINE_OFF) { - pg_log_error("\\gexec not allowed in pipeline mode"); + pg_log_error("\\%s not allowed in pipeline mode", "gexec"); clean_extended_state(); return PSQL_CMD_ERROR; } @@ -1972,7 +1975,7 @@ exec_command_gset(PsqlScanState scan_state, bool active_branch) if (PQpipelineStatus(pset.db) != PQ_PIPELINE_OFF) { - pg_log_error("\\gset not allowed in pipeline mode"); + pg_log_error("\\%s not allowed in pipeline mode", "gset"); clean_extended_state(); return PSQL_CMD_ERROR; } @@ -3284,7 +3287,7 @@ exec_command_watch(PsqlScanState scan_state, bool active_branch, if (PQpipelineStatus(pset.db) != PQ_PIPELINE_OFF) { - pg_log_error("\\watch not allowed in pipeline mode"); + pg_log_error("\\%s not allowed in pipeline mode", "watch"); clean_extended_state(); success = false; } @@ -4477,6 +4480,8 @@ SyncVariables(void) { char vbuf[32]; const char *server_version; + char *service_name; + char *service_file; /* get stuff from connection */ pset.encoding = PQclientEncoding(pset.db); @@ -4486,12 +4491,21 @@ SyncVariables(void) setFmtEncoding(pset.encoding); SetVariable(pset.vars, "DBNAME", PQdb(pset.db)); - SetVariable(pset.vars, "SERVICE", PQservice(pset.db)); SetVariable(pset.vars, "USER", PQuser(pset.db)); SetVariable(pset.vars, "HOST", PQhost(pset.db)); SetVariable(pset.vars, "PORT", PQport(pset.db)); SetVariable(pset.vars, "ENCODING", pg_encoding_to_char(pset.encoding)); + service_name = get_conninfo_value("service"); + SetVariable(pset.vars, "SERVICE", service_name); + if (service_name) + pg_free(service_name); + + service_file = get_conninfo_value("servicefile"); + SetVariable(pset.vars, "SERVICEFILE", service_file); + if (service_file) + pg_free(service_file); + /* this bit should match connection_warnings(): */ /* Try to get full text form of version, might include "devel" etc */ server_version = PQparameterStatus(pset.db, "server_version"); @@ -4521,6 +4535,7 @@ UnsyncVariables(void) { SetVariable(pset.vars, "DBNAME", NULL); SetVariable(pset.vars, "SERVICE", NULL); + SetVariable(pset.vars, "SERVICEFILE", NULL); SetVariable(pset.vars, "USER", 
NULL); SetVariable(pset.vars, "HOST", NULL); SetVariable(pset.vars, "PORT", NULL); diff --git a/src/bin/psql/common.c b/src/bin/psql/common.c index 3e4e444f3fd..cd329ade12b 100644 --- a/src/bin/psql/common.c +++ b/src/bin/psql/common.c @@ -1867,6 +1867,33 @@ ExecQueryAndProcessResults(const char *query, { FILE *copy_stream = NULL; + if (PQpipelineStatus(pset.db) != PQ_PIPELINE_OFF) + { + /* + * Running COPY within a pipeline can break the protocol + * synchronisation in multiple ways, and psql shows its limits + * when it comes to tracking this information. + * + * While in COPY mode, the backend process ignores additional + * Sync messages and will not send the matching ReadyForQuery + * expected by the frontend. + * + * Additionally, libpq automatically sends a Sync with the + * Copy message, creating an unexpected synchronisation point. + * A failure during COPY would leave the pipeline in an + * aborted state while the backend would be in a clean state, + * ready to process commands. + * + * Improving those issues would require modifications in how + * libpq handles pipelines and COPY. Hence, for the time + * being, we forbid the use of COPY within a pipeline, + * aborting the connection to avoid an inconsistent state on + * psql side if trying to use a COPY command. + */ + pg_log_info("COPY in a pipeline is not supported, aborting connection"); + exit(EXIT_BADCONN); + } + /* * For COPY OUT, direct the output to the default place (probably * a pager pipe) for \watch, or to pset.copyStream for \copy, @@ -2504,6 +2531,41 @@ session_username(void) return PQuser(pset.db); } +/* + * Return the value of option for keyword in the current connection. + * + * The caller is responsible for freeing the result value allocated. + */ +char * +get_conninfo_value(const char *keyword) +{ + PQconninfoOption *opts; + PQconninfoOption *serviceopt = NULL; + char *res = NULL; + + if (pset.db == NULL) + return NULL; + + opts = PQconninfo(pset.db); + if (opts == NULL) + return NULL; + + for (PQconninfoOption *opt = opts; opt->keyword != NULL; ++opt) + { + if (strcmp(opt->keyword, keyword) == 0) + { + serviceopt = opt; + break; + } + } + + /* Take a copy of the value, as it is freed by PQconninfoFree(). 
*/ + if (serviceopt && serviceopt->val != NULL) + res = pg_strdup(serviceopt->val); + PQconninfoFree(opts); + + return res; +} /* expand_tilde * @@ -2601,7 +2663,7 @@ clean_extended_state(void) switch (pset.send_mode) { - case PSQL_SEND_EXTENDED_CLOSE: /* \close */ + case PSQL_SEND_EXTENDED_CLOSE: /* \close_prepared */ free(pset.stmtName); break; case PSQL_SEND_EXTENDED_PARSE: /* \parse */ diff --git a/src/bin/psql/common.h b/src/bin/psql/common.h index 7f1a23de1e8..64762ab9817 100644 --- a/src/bin/psql/common.h +++ b/src/bin/psql/common.h @@ -39,6 +39,7 @@ extern bool SendQuery(const char *query); extern bool is_superuser(void); extern bool standard_strings(void); extern const char *session_username(void); +extern char *get_conninfo_value(const char *keyword); extern void expand_tilde(char **filename); extern void clean_extended_state(void); diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 1d08268393e..7a06af48842 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -296,6 +296,7 @@ describeFunctions(const char *functypes, const char *func_pattern, char **arg_patterns, int num_arg_patterns, bool verbose, bool showSystem) { + const char *df_options = "anptwSx+"; bool showAggregate = strchr(functypes, 'a') != NULL; bool showNormal = strchr(functypes, 'n') != NULL; bool showProcedure = strchr(functypes, 'p') != NULL; @@ -310,9 +311,9 @@ describeFunctions(const char *functypes, const char *func_pattern, /* No "Parallel" column before 9.6 */ static const bool translate_columns_pre_96[] = {false, false, false, false, true, true, false, true, true, false, false, false, false}; - if (strlen(functypes) != strspn(functypes, "anptwSx+")) + if (strlen(functypes) != strspn(functypes, df_options)) { - pg_log_error("\\df only takes [anptwSx+] as options"); + pg_log_error("\\df only takes [%s] as options", df_options); return true; } @@ -6188,8 +6189,8 @@ listExtensions(const char *pattern) "FROM pg_catalog.pg_extension e " "LEFT JOIN pg_catalog.pg_namespace n ON n.oid = e.extnamespace " "LEFT JOIN pg_catalog.pg_description d ON d.objoid = e.oid " - "LEFT JOIN pg_catalog.pg_available_extensions() ae(name, default_version, comment) ON ae.name = e.extname " - "AND d.classoid = 'pg_catalog.pg_extension'::pg_catalog.regclass\n", + "AND d.classoid = 'pg_catalog.pg_extension'::pg_catalog.regclass " + "LEFT JOIN pg_catalog.pg_available_extensions() ae(name, default_version, comment) ON ae.name = e.extname\n", gettext_noop("Name"), gettext_noop("Version"), gettext_noop("Default version"), @@ -6745,7 +6746,7 @@ describeSubscriptions(const char *pattern, bool verbose) printQueryOpt myopt = pset.popt; static const bool translate_columns[] = {false, false, false, false, false, false, false, false, false, false, false, false, false, false, - false}; + false, false}; if (pset.sversion < 100000) { @@ -6813,6 +6814,10 @@ describeSubscriptions(const char *pattern, bool verbose) appendPQExpBuffer(&buf, ", subfailover AS \"%s\"\n", gettext_noop("Failover")); + if (pset.sversion >= 190000) + appendPQExpBuffer(&buf, + ", subretaindeadtuples AS \"%s\"\n", + gettext_noop("Retain dead tuples")); appendPQExpBuffer(&buf, ", subsynccommit AS \"%s\"\n" diff --git a/src/bin/psql/help.c b/src/bin/psql/help.c index 403b51325a7..8c62729a0d1 100644 --- a/src/bin/psql/help.c +++ b/src/bin/psql/help.c @@ -252,7 +252,8 @@ slashUsage(unsigned short int pager) HELP0(" \\dO[Sx+] [PATTERN] list collations\n"); HELP0(" \\dp[Sx] [PATTERN] list table, view, and sequence access privileges\n"); HELP0(" 
\\dP[itnx+] [PATTERN] list [only index/table] partitioned relations [n=nested]\n"); - HELP0(" \\drds[x] [ROLEPTRN [DBPTRN]] list per-database role settings\n"); + HELP0(" \\drds[x] [ROLEPTRN [DBPTRN]]\n" + " list per-database role settings\n"); HELP0(" \\drg[Sx] [PATTERN] list role grants\n"); HELP0(" \\dRp[x+] [PATTERN] list replication publications\n"); HELP0(" \\dRs[x+] [PATTERN] list replication subscriptions\n"); @@ -330,12 +331,12 @@ slashUsage(unsigned short int pager) HELP0(" \\bind [PARAM]... set query parameters\n"); HELP0(" \\bind_named STMT_NAME [PARAM]...\n" " set query parameters for an existing prepared statement\n"); - HELP0(" \\close STMT_NAME close an existing prepared statement\n"); + HELP0(" \\close_prepared STMT_NAME\n" + " close an existing prepared statement\n"); HELP0(" \\endpipeline exit pipeline mode\n"); HELP0(" \\flush flush output data to the server\n"); HELP0(" \\flushrequest send request to the server to flush its output buffer\n"); - HELP0(" \\getresults [NUM_RES] read NUM_RES pending results. All pending results are\n" - " read if no argument is provided\n"); + HELP0(" \\getresults [NUM_RES] read NUM_RES pending results, or all if no argument\n"); HELP0(" \\parse STMT_NAME create a prepared statement\n"); HELP0(" \\sendpipeline send an extended query to an ongoing pipeline\n"); HELP0(" \\startpipeline enter pipeline mode\n"); @@ -463,8 +464,9 @@ helpVariables(unsigned short int pager) " VERSION_NAME\n" " VERSION_NUM\n" " psql's version (in verbose string, short string, or numeric format)\n"); - HELP0(" WATCH_INTERVAL\n" - " if set to a number, overrides the default two second \\watch interval\n"); + HELPN(" WATCH_INTERVAL\n" + " number of seconds \\watch waits between executions (default %s)\n", + DEFAULT_WATCH_INTERVAL); HELP0("\nDisplay settings:\n"); HELP0("Usage:\n"); @@ -746,7 +748,7 @@ void print_copyright(void) { puts("PostgreSQL Database Management System\n" - "(formerly known as Postgres, then as Postgres95)\n\n" + "(also known as Postgres, formerly known as Postgres95)\n\n" "Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group\n\n" "Portions Copyright (c) 1994, The Regents of the University of California\n\n" "Permission to use, copy, modify, and distribute this software and its\n" diff --git a/src/bin/psql/prompt.c b/src/bin/psql/prompt.c index 3aa7d2d06c8..b08d7328fbf 100644 --- a/src/bin/psql/prompt.c +++ b/src/bin/psql/prompt.c @@ -169,8 +169,12 @@ get_prompt(promptStatus_t status, ConditionalStack cstack) break; /* service name */ case 's': - if (pset.db && PQservice(pset.db)) - strlcpy(buf, PQservice(pset.db), sizeof(buf)); + { + const char *service_name = GetVariable(pset.vars, "SERVICE"); + + if (service_name) + strlcpy(buf, service_name, sizeof(buf)); + } break; /* backend pid */ case 'p': diff --git a/src/bin/psql/t/001_basic.pl b/src/bin/psql/t/001_basic.pl index 4050f9a5e3e..f42c3961e09 100644 --- a/src/bin/psql/t/001_basic.pl +++ b/src/bin/psql/t/001_basic.pl @@ -483,8 +483,8 @@ psql_like($node, "copy (values ('foo'),('bar')) to stdout \\g | $pipe_cmd", my $c4 = slurp_file($g_file); like($c4, qr/foo.*bar/s); -# Tests with pipelines. These trigger FATAL failures in the backend, -# so they cannot be tested via SQL. +# Test COPY within pipelines. These abort the connection from +# the frontend so they cannot be tested via SQL. 
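Context for the tests that follow: the refusal they exercise is the PQpipelineStatus() guard added to ExecQueryAndProcessResults() above. A minimal libpq-level sketch of the same check; the connection string and error handling here are illustrative assumptions, not psql's code:

#include <stdio.h>
#include <stdlib.h>
#include <libpq-fe.h>

int
main(void)
{
	PGconn	   *conn = PQconnectdb("dbname=postgres");	/* placeholder */

	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
		PQfinish(conn);
		return EXIT_FAILURE;
	}

	if (PQenterPipelineMode(conn) != 1)
	{
		fprintf(stderr, "could not enter pipeline mode\n");
		PQfinish(conn);
		return EXIT_FAILURE;
	}

	/*
	 * Mirror psql's new guard: once the connection is in pipeline mode,
	 * refuse to dispatch COPY at all instead of risking a loss of protocol
	 * synchronisation mid-stream.
	 */
	if (PQpipelineStatus(conn) != PQ_PIPELINE_OFF)
		fprintf(stderr, "COPY in a pipeline is not supported\n");

	PQexitPipelineMode(conn);
	PQfinish(conn);
	return EXIT_SUCCESS;
}
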
$node->safe_psql('postgres', 'CREATE TABLE psql_pipeline()'); my $log_location = -s $node->logfile; psql_fails_like( @@ -493,35 +493,41 @@ psql_fails_like( COPY psql_pipeline FROM STDIN; SELECT 'val1'; \\syncpipeline -\\getresults \\endpipeline}, - qr/server closed the connection unexpectedly/, - 'protocol sync loss in pipeline: direct COPY, SELECT, sync and getresult' -); + qr/COPY in a pipeline is not supported, aborting connection/, + 'COPY FROM in pipeline: fails'); $node->wait_for_log( qr/FATAL: .*terminating connection because protocol synchronization was lost/, $log_location); +# Remove \syncpipeline here. psql_fails_like( $node, qq{\\startpipeline -COPY psql_pipeline FROM STDIN \\bind \\sendpipeline -SELECT 'val1' \\bind \\sendpipeline -\\syncpipeline -\\getresults +COPY psql_pipeline TO STDOUT; +SELECT 'val1'; \\endpipeline}, - qr/server closed the connection unexpectedly/, - 'protocol sync loss in pipeline: bind COPY, SELECT, sync and getresult'); + qr/COPY in a pipeline is not supported, aborting connection/, + 'COPY TO in pipeline: fails'); -# This time, test without the \getresults. psql_fails_like( $node, qq{\\startpipeline -COPY psql_pipeline FROM STDIN; +\\copy psql_pipeline from stdin; SELECT 'val1'; \\syncpipeline \\endpipeline}, - qr/server closed the connection unexpectedly/, - 'protocol sync loss in pipeline: COPY, SELECT and sync'); + qr/COPY in a pipeline is not supported, aborting connection/, + '\copy from in pipeline: fails'); + +# Sync attempt after a COPY TO/FROM. +psql_fails_like( + $node, + qq{\\startpipeline +\\copy psql_pipeline to stdout; +\\syncpipeline +\\endpipeline}, + qr/COPY in a pipeline is not supported, aborting connection/, + '\copy to in pipeline: fails'); done_testing(); diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c index ec65ab79fec..1f2ca946fc5 100644 --- a/src/bin/psql/tab-complete.in.c +++ b/src/bin/psql/tab-complete.in.c @@ -889,6 +889,14 @@ static const SchemaQuery Query_for_list_of_analyzables = { .result = "c.relname", }; +/* + * Relations supporting COPY TO/FROM are currently almost the same as + * those supporting ANALYZE. Although views with INSTEAD OF INSERT triggers + * can be used with COPY FROM, they are rarely used for this purpose, + * so plain views are intentionally excluded from this tab completion. 
+ */ +#define Query_for_list_of_tables_for_copy Query_for_list_of_analyzables + /* Relations supporting index creation */ static const SchemaQuery Query_for_list_of_indexables = { .catname = "pg_catalog.pg_class c", @@ -1002,7 +1010,7 @@ static const SchemaQuery Query_for_trigger_of_table = { #define Query_for_list_of_database_vars \ "SELECT conf FROM ("\ -" SELECT setdatabase, pg_catalog.split_part(unnest(setconfig),'=',1) conf"\ +" SELECT setdatabase, pg_catalog.split_part(pg_catalog.unnest(setconfig),'=',1) conf"\ " FROM pg_db_role_setting "\ " ) s, pg_database d "\ " WHERE s.setdatabase = d.oid "\ @@ -1078,9 +1086,12 @@ Keywords_for_list_of_owner_roles, "PUBLIC" " WHERE usename LIKE '%s'" #define Query_for_list_of_user_vars \ -" SELECT pg_catalog.split_part(pg_catalog.unnest(rolconfig),'=',1) "\ -" FROM pg_catalog.pg_roles "\ -" WHERE rolname LIKE '%s'" +"SELECT conf FROM ("\ +" SELECT rolname, pg_catalog.split_part(pg_catalog.unnest(rolconfig),'=',1) conf"\ +" FROM pg_catalog.pg_roles"\ +" ) s"\ +" WHERE s.conf like '%s' "\ +" AND s.rolname LIKE '%s'" #define Query_for_list_of_access_methods \ " SELECT amname "\ @@ -1190,6 +1201,19 @@ Alter_procedure_options, "COST", "IMMUTABLE", "LEAKPROOF", "NOT LEAKPROOF", \ Alter_routine_options, "CALLED ON NULL INPUT", "RETURNS NULL ON NULL INPUT", \ "STRICT", "SUPPORT" +/* COPY options shared between FROM and TO */ +#define Copy_common_options \ +"DELIMITER", "ENCODING", "ESCAPE", "FORMAT", "HEADER", "NULL", "QUOTE" + +/* COPY FROM options */ +#define Copy_from_options \ +Copy_common_options, "DEFAULT", "FORCE_NOT_NULL", "FORCE_NULL", "FREEZE", \ +"LOG_VERBOSITY", "ON_ERROR", "REJECT_LIMIT" + +/* COPY TO options */ +#define Copy_to_options \ +Copy_common_options, "FORCE_QUOTE" + /* * These object types were introduced later than our support cutoff of * server version 9.2. 
We use the VersionedQuery infrastructure so that @@ -1875,7 +1899,7 @@ psql_completion(const char *text, int start, int end) static const char *const backslash_commands[] = { "\\a", "\\bind", "\\bind_named", - "\\connect", "\\conninfo", "\\C", "\\cd", "\\close", "\\copy", + "\\connect", "\\conninfo", "\\C", "\\cd", "\\close_prepared", "\\copy", "\\copyright", "\\crosstabview", "\\d", "\\da", "\\dA", "\\dAc", "\\dAf", "\\dAo", "\\dAp", "\\db", "\\dc", "\\dconfig", "\\dC", "\\dd", "\\ddp", "\\dD", @@ -2298,8 +2322,9 @@ match_previous_words(int pattern_id, /* ALTER SUBSCRIPTION <name> SET ( */ else if (Matches("ALTER", "SUBSCRIPTION", MatchAny, MatchAnyN, "SET", "(")) COMPLETE_WITH("binary", "disable_on_error", "failover", "origin", - "password_required", "run_as_owner", "slot_name", - "streaming", "synchronous_commit", "two_phase"); + "password_required", "retain_dead_tuples", + "run_as_owner", "slot_name", "streaming", + "synchronous_commit", "two_phase"); /* ALTER SUBSCRIPTION <name> SKIP ( */ else if (Matches("ALTER", "SUBSCRIPTION", MatchAny, MatchAnyN, "SKIP", "(")) COMPLETE_WITH("lsn"); @@ -2495,7 +2520,10 @@ match_previous_words(int pattern_id, /* ALTER USER,ROLE <name> RESET */ else if (Matches("ALTER", "USER|ROLE", MatchAny, "RESET")) + { + set_completion_reference(prev2_wd); COMPLETE_WITH_QUERY_PLUS(Query_for_list_of_user_vars, "ALL"); + } /* ALTER USER,ROLE <name> WITH */ else if (Matches("ALTER", "USER|ROLE", MatchAny, "WITH")) @@ -2725,17 +2753,24 @@ match_previous_words(int pattern_id, /* ALTER TABLE xxx ADD */ else if (Matches("ALTER", "TABLE", MatchAny, "ADD")) { - /* make sure to keep this list and the !Matches() below in sync */ - COMPLETE_WITH("COLUMN", "CONSTRAINT", "CHECK", "UNIQUE", "PRIMARY KEY", - "EXCLUDE", "FOREIGN KEY"); + /* + * make sure to keep this list and the MatchAnyExcept() below in sync + */ + COMPLETE_WITH("COLUMN", "CONSTRAINT", "CHECK (", "NOT NULL", "UNIQUE", + "PRIMARY KEY", "EXCLUDE", "FOREIGN KEY"); } /* ALTER TABLE xxx ADD [COLUMN] yyy */ else if (Matches("ALTER", "TABLE", MatchAny, "ADD", "COLUMN", MatchAny) || - Matches("ALTER", "TABLE", MatchAny, "ADD", MatchAnyExcept("COLUMN|CONSTRAINT|CHECK|UNIQUE|PRIMARY|EXCLUDE|FOREIGN"))) + Matches("ALTER", "TABLE", MatchAny, "ADD", MatchAnyExcept("COLUMN|CONSTRAINT|CHECK|UNIQUE|PRIMARY|NOT|EXCLUDE|FOREIGN"))) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_datatypes); /* ALTER TABLE xxx ADD CONSTRAINT yyy */ else if (Matches("ALTER", "TABLE", MatchAny, "ADD", "CONSTRAINT", MatchAny)) - COMPLETE_WITH("CHECK", "UNIQUE", "PRIMARY KEY", "EXCLUDE", "FOREIGN KEY"); + COMPLETE_WITH("CHECK (", "NOT NULL", "UNIQUE", "PRIMARY KEY", "EXCLUDE", "FOREIGN KEY"); + /* ALTER TABLE xxx ADD NOT NULL */ + else if (Matches("ALTER", "TABLE", MatchAny, "ADD", "NOT", "NULL")) + COMPLETE_WITH_ATTR(prev4_wd); + else if (Matches("ALTER", "TABLE", MatchAny, "ADD", "CONSTRAINT", MatchAny, "NOT", "NULL")) + COMPLETE_WITH_ATTR(prev6_wd); /* ALTER TABLE xxx ADD [CONSTRAINT yyy] (PRIMARY KEY|UNIQUE) */ else if (Matches("ALTER", "TABLE", MatchAny, "ADD", "PRIMARY", "KEY") || Matches("ALTER", "TABLE", MatchAny, "ADD", "UNIQUE") || @@ -3125,6 +3160,22 @@ match_previous_words(int pattern_id, COMPLETE_WITH_VERSIONED_SCHEMA_QUERY(Query_for_list_of_procedures); else if (Matches("CALL", MatchAny)) COMPLETE_WITH("("); +/* CHECKPOINT */ + else if (Matches("CHECKPOINT")) + COMPLETE_WITH("("); + else if (HeadMatches("CHECKPOINT", "(*") && + !HeadMatches("CHECKPOINT", "(*)")) + { + /* + * This fires if we're in an unfinished parenthesized option list. 
+ * get_previous_words treats a completed parenthesized option list as + * one word, so the above test is correct. + */ + if (ends_with(prev_wd, '(') || ends_with(prev_wd, ',')) + COMPLETE_WITH("MODE", "FLUSH_UNLOGGED"); + else if (TailMatches("MODE")) + COMPLETE_WITH("FAST", "SPREAD"); + } /* CLOSE */ else if (Matches("CLOSE")) COMPLETE_WITH_QUERY_PLUS(Query_for_list_of_cursors, @@ -3255,7 +3306,7 @@ match_previous_words(int pattern_id, * backslash command). */ else if (Matches("COPY|\\copy")) - COMPLETE_WITH_SCHEMA_QUERY_PLUS(Query_for_list_of_tables, "("); + COMPLETE_WITH_SCHEMA_QUERY_PLUS(Query_for_list_of_tables_for_copy, "("); /* Complete COPY ( with legal query commands */ else if (Matches("COPY|\\copy", "(")) COMPLETE_WITH("SELECT", "TABLE", "VALUES", "INSERT INTO", "UPDATE", "DELETE FROM", "MERGE INTO", "WITH"); @@ -3284,23 +3335,24 @@ match_previous_words(int pattern_id, else if (Matches("COPY|\\copy", MatchAny, "FROM", MatchAny)) COMPLETE_WITH("WITH (", "WHERE"); - /* Complete COPY <sth> FROM|TO filename WITH ( */ - else if (Matches("COPY|\\copy", MatchAny, "FROM|TO", MatchAny, "WITH", "(")) - COMPLETE_WITH("FORMAT", "FREEZE", "DELIMITER", "NULL", - "HEADER", "QUOTE", "ESCAPE", "FORCE_QUOTE", - "FORCE_NOT_NULL", "FORCE_NULL", "ENCODING", "DEFAULT", - "ON_ERROR", "LOG_VERBOSITY"); + /* Complete COPY <sth> FROM filename WITH ( */ + else if (Matches("COPY|\\copy", MatchAny, "FROM", MatchAny, "WITH", "(")) + COMPLETE_WITH(Copy_from_options); + + /* Complete COPY <sth> TO filename WITH ( */ + else if (Matches("COPY|\\copy", MatchAny, "TO", MatchAny, "WITH", "(")) + COMPLETE_WITH(Copy_to_options); /* Complete COPY <sth> FROM|TO filename WITH (FORMAT */ else if (Matches("COPY|\\copy", MatchAny, "FROM|TO", MatchAny, "WITH", "(", "FORMAT")) COMPLETE_WITH("binary", "csv", "text"); /* Complete COPY <sth> FROM filename WITH (ON_ERROR */ - else if (Matches("COPY|\\copy", MatchAny, "FROM|TO", MatchAny, "WITH", "(", "ON_ERROR")) + else if (Matches("COPY|\\copy", MatchAny, "FROM", MatchAny, "WITH", "(", "ON_ERROR")) COMPLETE_WITH("stop", "ignore"); /* Complete COPY <sth> FROM filename WITH (LOG_VERBOSITY */ - else if (Matches("COPY|\\copy", MatchAny, "FROM|TO", MatchAny, "WITH", "(", "LOG_VERBOSITY")) + else if (Matches("COPY|\\copy", MatchAny, "FROM", MatchAny, "WITH", "(", "LOG_VERBOSITY")) COMPLETE_WITH("silent", "default", "verbose"); /* Complete COPY <sth> FROM <sth> WITH (<options>) */ @@ -3664,9 +3716,10 @@ match_previous_words(int pattern_id, TailMatches("CREATE", "TEMP|TEMPORARY|UNLOGGED", "TABLE", MatchAny, "(*)", "AS")) COMPLETE_WITH("EXECUTE", "SELECT", "TABLE", "VALUES", "WITH"); /* Complete CREATE TABLE name (...) 
with supported options */ - else if (TailMatches("CREATE", "TABLE", MatchAny, "(*)") || - TailMatches("CREATE", "UNLOGGED", "TABLE", MatchAny, "(*)")) + else if (TailMatches("CREATE", "TABLE", MatchAny, "(*)")) COMPLETE_WITH("AS", "INHERITS (", "PARTITION BY", "USING", "TABLESPACE", "WITH ("); + else if (TailMatches("CREATE", "UNLOGGED", "TABLE", MatchAny, "(*)")) + COMPLETE_WITH("AS", "INHERITS (", "USING", "TABLESPACE", "WITH ("); else if (TailMatches("CREATE", "TEMP|TEMPORARY", "TABLE", MatchAny, "(*)")) COMPLETE_WITH("AS", "INHERITS (", "ON COMMIT", "PARTITION BY", "USING", "TABLESPACE", "WITH ("); @@ -3728,8 +3781,9 @@ match_previous_words(int pattern_id, else if (Matches("CREATE", "SUBSCRIPTION", MatchAnyN, "WITH", "(")) COMPLETE_WITH("binary", "connect", "copy_data", "create_slot", "disable_on_error", "enabled", "failover", "origin", - "password_required", "run_as_owner", "slot_name", - "streaming", "synchronous_commit", "two_phase"); + "password_required", "retain_dead_tuples", + "run_as_owner", "slot_name", "streaming", + "synchronous_commit", "two_phase"); /* CREATE TRIGGER --- is allowed inside CREATE SCHEMA, so use TailMatches */ @@ -4573,10 +4627,14 @@ match_previous_words(int pattern_id, else if (Matches("ALTER", "DEFAULT", "PRIVILEGES", MatchAnyN, "TO", MatchAny)) COMPLETE_WITH("WITH GRANT OPTION"); /* Complete "GRANT/REVOKE ... ON * *" with TO/FROM */ - else if (Matches("GRANT", MatchAnyN, "ON", MatchAny, MatchAny)) - COMPLETE_WITH("TO"); - else if (Matches("REVOKE", MatchAnyN, "ON", MatchAny, MatchAny)) - COMPLETE_WITH("FROM"); + else if (Matches("GRANT|REVOKE", MatchAnyN, "ON", MatchAny, MatchAny) && + !TailMatches("FOREIGN", "SERVER") && !TailMatches("LARGE", "OBJECT")) + { + if (Matches("GRANT", MatchAnyN, "ON", MatchAny, MatchAny)) + COMPLETE_WITH("TO"); + else + COMPLETE_WITH("FROM"); + } /* Complete "GRANT/REVOKE * ON ALL * IN SCHEMA *" with TO/FROM */ else if (TailMatches("GRANT|REVOKE", MatchAny, "ON", "ALL", MatchAny, "IN", "SCHEMA", MatchAny) || @@ -4608,6 +4666,26 @@ match_previous_words(int pattern_id, COMPLETE_WITH("FROM"); } + /* Complete "GRANT/REVOKE * ON LARGE OBJECT *" with TO/FROM */ + else if (TailMatches("GRANT|REVOKE", MatchAny, "ON", "LARGE", "OBJECT", MatchAny) || + TailMatches("REVOKE", "GRANT", "OPTION", "FOR", MatchAny, "ON", "LARGE", "OBJECT", MatchAny)) + { + if (TailMatches("GRANT", MatchAny, MatchAny, MatchAny, MatchAny, MatchAny)) + COMPLETE_WITH("TO"); + else + COMPLETE_WITH("FROM"); + } + + /* Complete "GRANT/REVOKE * ON LARGE OBJECTS" with TO/FROM */ + else if (TailMatches("GRANT|REVOKE", MatchAny, "ON", "LARGE", "OBJECTS") || + TailMatches("REVOKE", "GRANT", "OPTION", "FOR", MatchAny, "ON", "LARGE", "OBJECTS")) + { + if (TailMatches("GRANT", MatchAny, MatchAny, MatchAny, MatchAny)) + COMPLETE_WITH("TO"); + else + COMPLETE_WITH("FROM"); + } + /* GROUP BY */ else if (TailMatches("FROM", MatchAny, "GROUP")) COMPLETE_WITH("BY"); @@ -4943,7 +5021,7 @@ match_previous_words(int pattern_id, /* Complete with a variable name */ else if (TailMatches("SET|RESET") && !TailMatches("UPDATE", MatchAny, "SET") && - !TailMatches("ALTER", "DATABASE", MatchAny, "RESET")) + !TailMatches("ALTER", "DATABASE|USER|ROLE", MatchAny, "RESET")) COMPLETE_WITH_QUERY_VERBATIM_PLUS(Query_for_list_of_set_vars, "CONSTRAINTS", "TRANSACTION", diff --git a/src/bin/psql/variables.c b/src/bin/psql/variables.c index ae2d0e5ed3f..6b64302ebca 100644 --- a/src/bin/psql/variables.c +++ b/src/bin/psql/variables.c @@ -204,7 +204,7 @@ ParseVariableDouble(const char *value, const char 
*name, double *result, double if ((value == NULL) || (*value == '\0')) { if (name) - pg_log_error("invalid input syntax for \"%s\"", name); + pg_log_error("invalid input syntax for variable \"%s\"", name); return false; } @@ -215,14 +215,14 @@ ParseVariableDouble(const char *value, const char *name, double *result, double if (dblval < min) { if (name) - pg_log_error("invalid value \"%s\" for \"%s\": must be greater than %.2f", + pg_log_error("invalid value \"%s\" for variable \"%s\": must be greater than %.2f", value, name, min); return false; } else if (dblval > max) { if (name) - pg_log_error("invalid value \"%s\" for \"%s\": must be less than %.2f", + pg_log_error("invalid value \"%s\" for variable \"%s\": must be less than %.2f", value, name, max); } *result = dblval; @@ -238,13 +238,13 @@ ParseVariableDouble(const char *value, const char *name, double *result, double (dblval == 0.0 || dblval >= HUGE_VAL || dblval <= -HUGE_VAL)) { if (name) - pg_log_error("\"%s\" is out of range for \"%s\"", value, name); + pg_log_error("value \"%s\" is out of range for variable \"%s\"", value, name); return false; } else { if (name) - pg_log_error("invalid value \"%s\" for \"%s\"", value, name); + pg_log_error("invalid value \"%s\" for variable \"%s\"", value, name); return false; } } diff --git a/src/bin/scripts/t/100_vacuumdb.pl b/src/bin/scripts/t/100_vacuumdb.pl index 75ac24a7a55..ff56a13b46b 100644 --- a/src/bin/scripts/t/100_vacuumdb.pl +++ b/src/bin/scripts/t/100_vacuumdb.pl @@ -238,62 +238,105 @@ $node->command_fails_like( 'cannot use option --all and a dbname as argument at the same time'); $node->safe_psql('postgres', - 'CREATE TABLE regression_vacuumdb_test AS select generate_series(1, 10) a, generate_series(2, 11) b;'); + 'CREATE TABLE regression_vacuumdb_test AS select generate_series(1, 10) a, generate_series(2, 11) b;' +); $node->issues_sql_like( - [ 'vacuumdb', '--analyze-only', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-only', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with missing stats'); $node->issues_sql_unlike( - [ 'vacuumdb', '--analyze-only', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-only', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with no missing stats'); $node->safe_psql('postgres', - 'CREATE INDEX regression_vacuumdb_test_idx ON regression_vacuumdb_test (mod(a, 2));'); + 'CREATE INDEX regression_vacuumdb_test_idx ON regression_vacuumdb_test (mod(a, 2));' +); $node->issues_sql_like( - [ 'vacuumdb', '--analyze-in-stages', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-in-stages', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with missing index expression stats'); $node->issues_sql_unlike( - [ 'vacuumdb', '--analyze-in-stages', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-in-stages', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with no missing index expression stats'); $node->safe_psql('postgres', - 'CREATE STATISTICS regression_vacuumdb_test_stat ON a, b FROM regression_vacuumdb_test;'); + 'CREATE STATISTICS 
regression_vacuumdb_test_stat ON a, b FROM regression_vacuumdb_test;' +); $node->issues_sql_like( - [ 'vacuumdb', '--analyze-only', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-only', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with missing extended stats'); $node->issues_sql_unlike( - [ 'vacuumdb', '--analyze-only', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-only', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with no missing extended stats'); $node->safe_psql('postgres', "CREATE TABLE regression_vacuumdb_child (a INT) INHERITS (regression_vacuumdb_test);\n" - . "INSERT INTO regression_vacuumdb_child VALUES (1, 2);\n" - . "ANALYZE regression_vacuumdb_child;\n"); + . "INSERT INTO regression_vacuumdb_child VALUES (1, 2);\n" + . "ANALYZE regression_vacuumdb_child;\n"); $node->issues_sql_like( - [ 'vacuumdb', '--analyze-in-stages', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-in-stages', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with missing inherited stats'); $node->issues_sql_unlike( - [ 'vacuumdb', '--analyze-in-stages', '--missing-stats-only', '-t', 'regression_vacuumdb_test', 'postgres' ], + [ + 'vacuumdb', '--analyze-in-stages', + '--missing-stats-only', '-t', + 'regression_vacuumdb_test', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with no missing inherited stats'); $node->safe_psql('postgres', "CREATE TABLE regression_vacuumdb_parted (a INT) PARTITION BY LIST (a);\n" - . "CREATE TABLE regression_vacuumdb_part1 PARTITION OF regression_vacuumdb_parted FOR VALUES IN (1);\n" - . "INSERT INTO regression_vacuumdb_parted VALUES (1);\n" - . "ANALYZE regression_vacuumdb_part1;\n"); + . "CREATE TABLE regression_vacuumdb_part1 PARTITION OF regression_vacuumdb_parted FOR VALUES IN (1);\n" + . "INSERT INTO regression_vacuumdb_parted VALUES (1);\n" + . "ANALYZE regression_vacuumdb_part1;\n"); $node->issues_sql_like( - [ 'vacuumdb', '--analyze-only', '--missing-stats-only', '-t', 'regression_vacuumdb_parted', 'postgres' ], + [ + 'vacuumdb', '--analyze-only', + '--missing-stats-only', '-t', + 'regression_vacuumdb_parted', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with missing partition stats'); $node->issues_sql_unlike( - [ 'vacuumdb', '--analyze-only', '--missing-stats-only', '-t', 'regression_vacuumdb_parted', 'postgres' ], + [ + 'vacuumdb', '--analyze-only', + '--missing-stats-only', '-t', + 'regression_vacuumdb_parted', 'postgres' + ], qr/statement:\ ANALYZE/sx, '--missing-stats-only with no missing partition stats'); diff --git a/src/common/Makefile b/src/common/Makefile index 1e2b91c83c4..2c720caa509 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -163,7 +163,7 @@ libpgcommon_shlib.a: $(OBJS_SHLIB) # The JSON API normally exits on out-of-memory; disable that behavior for shared # library builds. This requires libpq's pqexpbuffer.h. 
jsonapi_shlib.o: override CPPFLAGS += -DJSONAPI_USE_PQEXPBUFFER -jsonapi_shlib.o: override CPPFLAGS += -I$(libpq_srcdir) +jsonapi_shlib.o: override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) # Because this uses its own compilation rule, it doesn't use the # dependency tracking logic from Makefile.global. To make sure that diff --git a/src/common/parse_manifest.c b/src/common/parse_manifest.c index 71973af199b..58e0948100f 100644 --- a/src/common/parse_manifest.c +++ b/src/common/parse_manifest.c @@ -942,7 +942,7 @@ parse_xlogrecptr(XLogRecPtr *result, char *input) uint32 hi; uint32 lo; - if (sscanf(input, "%X/%X", &hi, &lo) != 2) + if (sscanf(input, "%X/%08X", &hi, &lo) != 2) return false; *result = ((uint64) hi) << 32 | lo; return true; diff --git a/src/fe_utils/astreamer_lz4.c b/src/fe_utils/astreamer_lz4.c index 781aaf99f38..5f581d1de37 100644 --- a/src/fe_utils/astreamer_lz4.c +++ b/src/fe_utils/astreamer_lz4.c @@ -322,9 +322,9 @@ astreamer_lz4_decompressor_content(astreamer *streamer, mystreamer = (astreamer_lz4_frame *) streamer; next_in = (uint8 *) data; - next_out = (uint8 *) mystreamer->base.bbs_buffer.data; + next_out = (uint8 *) mystreamer->base.bbs_buffer.data + mystreamer->bytes_written; avail_in = len; - avail_out = mystreamer->base.bbs_buffer.maxlen; + avail_out = mystreamer->base.bbs_buffer.maxlen - mystreamer->bytes_written; while (avail_in > 0) { diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 52916bab7a3..70949de56ac 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -293,7 +293,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; - aminsertcleanup_function aminsertcleanup; + aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h index b8294e41b97..dc39e7dd32c 100644 --- a/src/include/access/commit_ts.h +++ b/src/include/access/commit_ts.h @@ -46,17 +46,6 @@ extern int committssyncfiletag(const FileTag *ftag, char *path); #define COMMIT_TS_ZEROPAGE 0x00 #define COMMIT_TS_TRUNCATE 0x10 -typedef struct xl_commit_ts_set -{ - TimestampTz timestamp; - RepOriginId nodeid; - TransactionId mainxid; - /* subxact Xids follow */ -} xl_commit_ts_set; - -#define SizeOfCommitTsSet (offsetof(xl_commit_ts_set, mainxid) + \ - sizeof(TransactionId)) - typedef struct xl_commit_ts_truncate { int64 pageno; diff --git a/src/include/access/gist.h b/src/include/access/gist.h index db78e60eeab..b3f4e02cbfd 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -40,7 +40,7 @@ #define GIST_FETCH_PROC 9 #define GIST_OPTIONS_PROC 10 #define GIST_SORTSUPPORT_PROC 11 -#define GIST_STRATNUM_PROC 12 +#define GIST_TRANSLATE_CMPTYPE_PROC 12 #define GISTNProcs 12 /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index e48fe434cd3..a2bd5a897f8 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -21,6 +21,7 @@ #include "access/skey.h" #include "access/table.h" /* for backward compatibility */ #include "access/tableam.h" +#include "commands/vacuum.h" #include "nodes/lockoptions.h" #include "nodes/primnodes.h" #include "storage/bufpage.h" @@ -96,7 +97,7 @@ typedef struct HeapScanDescData uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ 
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ -} HeapScanDescData; +} HeapScanDescData; typedef struct HeapScanDescData *HeapScanDesc; typedef struct BitmapHeapScanDescData @@ -396,9 +397,8 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, OffsetNumber *unused, int nunused); /* in heap/vacuumlazy.c */ -struct VacuumParams; extern void heap_vacuum_rel(Relation rel, - struct VacuumParams *params, BufferAccessStrategy bstrategy); + const VacuumParams params, BufferAccessStrategy bstrategy); /* in heap/heapam_visibility.c */ extern bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 4e6b0eec2ff..b876e98f46e 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -11,6 +11,7 @@ #ifndef MULTIXACT_H #define MULTIXACT_H +#include "access/transam.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/sync.h" @@ -119,7 +120,7 @@ extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path); extern void AtEOXact_MultiXact(void); extern void AtPrepare_MultiXact(void); -extern void PostPrepare_MultiXact(TransactionId xid); +extern void PostPrepare_MultiXact(FullTransactionId fxid); extern Size MultiXactShmemSize(void); extern void MultiXactShmemInit(void); @@ -145,11 +146,11 @@ extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); extern int MultiXactMemberFreezeThreshold(void); -extern void multixact_twophase_recover(TransactionId xid, uint16 info, +extern void multixact_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); -extern void multixact_twophase_postcommit(TransactionId xid, uint16 info, +extern void multixact_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); -extern void multixact_twophase_postabort(TransactionId xid, uint16 info, +extern void multixact_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); extern void multixact_redo(XLogReaderState *record); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index ebca02588d3..e709d2e0afe 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -939,7 +939,7 @@ typedef BTVacuumPostingData *BTVacuumPosting; * processing. This approach minimizes lock/unlock traffic. We must always * drop the lock to make it okay for caller to process the returned items. * Whether or not we can also release the pin during this window will vary. - * We drop the pin eagerly (when safe) to avoid blocking progress by VACUUM + * We drop the pin (when so->dropPin) to avoid blocking progress by VACUUM * (see nbtree/README section about making concurrent TID recycling safe). * We'll always release both the lock and the pin on the current page before * moving on to its sibling page. 
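An aside on the parse_manifest.c hunk above: it tightens the sscanf() format to the zero-padded %X/%08X convention that LSN_FORMAT_ARGS() emits for the low word. A self-contained round-trip sketch of that convention, using plain C and no PostgreSQL headers (the sample LSN value is arbitrary):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t	lsn = (UINT64_C(0x1) << 32) | 0x2AB0;
	unsigned int hi;
	unsigned int lo;
	char		buf[32];

	/* High half unpadded, low half zero-padded to 8 hex digits. */
	snprintf(buf, sizeof(buf), "%X/%08X",
			 (unsigned int) (lsn >> 32), (unsigned int) lsn);

	/* Parse it back the way parse_xlogrecptr() now does. */
	if (sscanf(buf, "%X/%08X", &hi, &lo) != 2)
		return 1;

	printf("%s -> %" PRIx64 "\n", buf, ((uint64_t) hi) << 32 | lo);
	return 0;
}
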
@@ -967,7 +967,7 @@ typedef struct BTScanPosData BlockNumber currPage; /* page referenced by items array */ BlockNumber prevPage; /* currPage's left link */ BlockNumber nextPage; /* currPage's right link */ - XLogRecPtr lsn; /* currPage's LSN */ + XLogRecPtr lsn; /* currPage's LSN (when so->dropPin) */ /* scan direction for the saved position's call to _bt_readpage */ ScanDirection dir; @@ -1070,6 +1070,7 @@ typedef struct BTScanOpaqueData /* info about killed items if any (killedItems is NULL if never used) */ int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ + bool dropPin; /* drop leaf pin before btgettuple returns? */ /* * If we are doing an index-only scan, these are the tuple storage diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index dfbb4c85460..a604a4702c3 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -233,7 +233,7 @@ extern void add_local_string_reloption(local_relopts *relopts, const char *name, fill_string_relopt filler, int offset); extern Datum transformRelOptions(Datum oldOptions, List *defList, - const char *namspace, const char *const validnsps[], + const char *nameSpace, const char *const validnsps[], bool acceptOidsOff, bool isReset); extern List *untransformRelOptions(Datum options); extern bytea *extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, diff --git a/src/include/access/slru.h b/src/include/access/slru.h index e142800aab2..20dbd1e0070 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -187,6 +187,7 @@ extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names); extern int SimpleLruZeroPage(SlruCtl ctl, int64 pageno); +extern void SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno); extern int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid); extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8713e12cbfb..1c9e802a6b1 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -20,6 +20,7 @@ #include "access/relscan.h" #include "access/sdir.h" #include "access/xact.h" +#include "commands/vacuum.h" #include "executor/tuptable.h" #include "storage/read_stream.h" #include "utils/rel.h" @@ -36,7 +37,6 @@ extern PGDLLIMPORT bool synchronize_seqscans; struct BulkInsertStateData; struct IndexInfo; struct SampleScanState; -struct VacuumParams; struct ValidateIndexState; /* @@ -645,7 +645,7 @@ typedef struct TableAmRoutine * integrate with autovacuum's scheduling. */ void (*relation_vacuum) (Relation rel, - struct VacuumParams *params, + const VacuumParams params, BufferAccessStrategy bstrategy); /* @@ -1664,7 +1664,7 @@ table_relation_copy_for_cluster(Relation OldTable, Relation NewTable, * routine, even if (for ANALYZE) it is part of the same VACUUM command. 
*/ static inline void -table_relation_vacuum(Relation rel, struct VacuumParams *params, +table_relation_vacuum(Relation rel, const VacuumParams params, BufferAccessStrategy bstrategy) { rel->rd_tableam->relation_vacuum(rel, params, bstrategy); diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 9fa82355033..509bdad9a5d 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -36,10 +36,10 @@ extern void PostPrepare_Twophase(void); extern TransactionId TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid, bool *have_more); -extern PGPROC *TwoPhaseGetDummyProc(TransactionId xid, bool lock_held); -extern int TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held); +extern PGPROC *TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held); +extern int TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held); -extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, +extern GlobalTransaction MarkAsPreparing(FullTransactionId fxid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid); @@ -56,8 +56,9 @@ extern void CheckPointTwoPhase(XLogRecPtr redo_horizon); extern void FinishPreparedTransaction(const char *gid, bool isCommit); -extern void PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, - XLogRecPtr end_lsn, RepOriginId origin_id); +extern void PrepareRedoAdd(FullTransactionId fxid, char *buf, + XLogRecPtr start_lsn, XLogRecPtr end_lsn, + RepOriginId origin_id); extern void PrepareRedoRemove(TransactionId xid, bool giveWarning); extern void restoreTwoPhaseData(void); extern bool LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn, diff --git a/src/include/access/twophase_rmgr.h b/src/include/access/twophase_rmgr.h index 3ed154bb231..8f576402e36 100644 --- a/src/include/access/twophase_rmgr.h +++ b/src/include/access/twophase_rmgr.h @@ -14,7 +14,9 @@ #ifndef TWOPHASE_RMGR_H #define TWOPHASE_RMGR_H -typedef void (*TwoPhaseCallback) (TransactionId xid, uint16 info, +#include "access/transam.h" + +typedef void (*TwoPhaseCallback) (FullTransactionId fxid, uint16 info, void *recdata, uint32 len); typedef uint8 TwoPhaseRmgrId; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d313099c027..d12798be3d8 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -139,10 +139,9 @@ extern PGDLLIMPORT bool XLOG_DEBUG; #define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */ #define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but * issued at end of WAL recovery */ -#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */ +#define CHECKPOINT_FAST 0x0004 /* Do it without delays */ #define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */ -#define CHECKPOINT_FLUSH_ALL 0x0010 /* Flush all pages, including those - * belonging to unlogged tables */ +#define CHECKPOINT_FLUSH_UNLOGGED 0x0010 /* Flush unlogged tables */ /* These are important to RequestCheckpoint */ #define CHECKPOINT_WAIT 0x0020 /* Wait for completion */ #define CHECKPOINT_REQUESTED 0x0040 /* Checkpoint request has been made */ diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 2cf8d55d706..cc06fc29ab2 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -316,16 +316,6 @@ typedef struct XLogRecData uint32 len; /* length of rmgr data to include */ } XLogRecData; -/* - * Recovery target action. 
- */ -typedef enum -{ - RECOVERY_TARGET_ACTION_PAUSE, - RECOVERY_TARGET_ACTION_PROMOTE, - RECOVERY_TARGET_ACTION_SHUTDOWN, -} RecoveryTargetAction; - struct LogicalDecodingContext; struct XLogRecordBuffer; diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index 9e41c9f6e84..514f03df0b6 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -38,7 +38,10 @@ typedef uint64 XLogRecPtr; /* * Handy macro for printing XLogRecPtr in conventional format, e.g., * - * printf("%X/%X", LSN_FORMAT_ARGS(lsn)); + * printf("%X/%08X", LSN_FORMAT_ARGS(lsn)); + * + * To avoid breaking translatable messages, we're directly applying the + * LSN format instead of using a macro. */ #define LSN_FORMAT_ARGS(lsn) (AssertVariableIsOfTypeMacro((lsn), XLogRecPtr), (uint32) ((lsn) >> 32)), ((uint32) (lsn)) diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index cf057f033a2..d6a71415d4f 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -44,6 +44,7 @@ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); +extern XLogRecPtr XLogSimpleInsertInt64(RmgrId rmid, uint8 info, int64 value); extern void XLogEnsureRecordSpace(int max_block_id, int ndatas); extern void XLogRegisterData(const void *data, uint32 len); extern void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags); diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 91446303024..8e475e266d1 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -40,6 +40,16 @@ typedef enum RECOVERY_TARGET_TIMELINE_NUMERIC, } RecoveryTargetTimeLineGoal; +/* + * Recovery target action. + */ +typedef enum +{ + RECOVERY_TARGET_ACTION_PAUSE, + RECOVERY_TARGET_ACTION_PROMOTE, + RECOVERY_TARGET_ACTION_SHUTDOWN, +} RecoveryTargetAction; + /* Recovery pause states */ typedef enum RecoveryPauseState { diff --git a/src/include/backup/basebackup_sink.h b/src/include/backup/basebackup_sink.h index 8a5ee996a45..310d92b8b9d 100644 --- a/src/include/backup/basebackup_sink.h +++ b/src/include/backup/basebackup_sink.h @@ -287,7 +287,8 @@ extern bbsink *bbsink_copystream_new(bool send_to_client); extern bbsink *bbsink_gzip_new(bbsink *next, pg_compress_specification *); extern bbsink *bbsink_lz4_new(bbsink *next, pg_compress_specification *); extern bbsink *bbsink_zstd_new(bbsink *next, pg_compress_specification *); -extern bbsink *bbsink_progress_new(bbsink *next, bool estimate_backup_size); +extern bbsink *bbsink_progress_new(bbsink *next, bool estimate_backup_size, + bool incremental); extern bbsink *bbsink_server_new(bbsink *next, char *pathname); extern bbsink *bbsink_throttle_new(bbsink *next, uint32 maxrate); diff --git a/src/include/c.h b/src/include/c.h index 8cdc16a0f4a..bbdaa88c63a 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -333,6 +333,36 @@ #endif /* + * pg_assume(expr) states that we assume `expr` to evaluate to true. In assert + * enabled builds pg_assume() is turned into an assertion, in optimized builds + * we try to clue the compiler into the fact that `expr` is true. + * + * This is useful for two purposes: + * + * 1) Avoid compiler warnings by telling the compiler about assumptions the + * code makes. This is particularly useful when building with optimizations + * and w/o assertions.
+ * + * 2) Help the compiler to generate more efficient code + * + * It is unspecified whether `expr` is evaluated, therefore it better be + * side-effect free. + */ +#if defined(USE_ASSERT_CHECKING) +#define pg_assume(expr) Assert(expr) +#elif defined(HAVE__BUILTIN_UNREACHABLE) +#define pg_assume(expr) \ + do { \ + if (!(expr)) \ + __builtin_unreachable(); \ + } while (0) +#elif defined(_MSC_VER) +#define pg_assume(expr) __assume(expr) +#else +#define pg_assume(expr) ((void) 0) +#endif + +/* * Hints to the compiler about the likelihood of a branch. Both likely() and * unlikely() return the boolean value of the contained expression. * @@ -376,25 +406,7 @@ * pretty trivial: VA_ARGS_NARGS_() returns its 64th argument, and we set up * the call so that that is the appropriate one of the list of constants. * This idea is due to Laurent Deniau. - * - * MSVC has an implementation of __VA_ARGS__ that doesn't conform to the - * standard unless you use the /Zc:preprocessor compiler flag, but that - * isn't available before Visual Studio 2019. For now, use a different - * definition that also works on older compilers. */ -#ifdef _MSC_VER -#define EXPAND(args) args -#define VA_ARGS_NARGS(...) \ - VA_ARGS_NARGS_ EXPAND((__VA_ARGS__, \ - 63,62,61,60, \ - 59,58,57,56,55,54,53,52,51,50, \ - 49,48,47,46,45,44,43,42,41,40, \ - 39,38,37,36,35,34,33,32,31,30, \ - 29,28,27,26,25,24,23,22,21,20, \ - 19,18,17,16,15,14,13,12,11,10, \ - 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) -#else - #define VA_ARGS_NARGS(...) \ VA_ARGS_NARGS_(__VA_ARGS__, \ 63,62,61,60, \ @@ -404,7 +416,6 @@ 29,28,27,26,25,24,23,22,21,20, \ 19,18,17,16,15,14,13,12,11,10, \ 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) -#endif #define VA_ARGS_NARGS_( \ _01,_02,_03,_04,_05,_06,_07,_08,_09,_10, \ @@ -519,8 +530,6 @@ typedef uint32 bits32; /* >= 32 bits */ /* snprintf format strings to use for 64-bit integers */ #define INT64_FORMAT "%" PRId64 #define UINT64_FORMAT "%" PRIu64 -#define INT64_HEX_FORMAT "%" PRIx64 -#define UINT64_HEX_FORMAT "%" PRIx64 /* * 128-bit signed and unsigned integers diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 82988d24433..c4fe8b991af 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202505071 +#define CATALOG_VERSION_NO 202508051 #endif diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 92505148998..e3477500baa 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -533,7 +533,7 @@ amprocrighttype => 'box', amprocnum => '8', amproc => 'gist_box_distance' }, { amprocfamily => 'gist/box_ops', amproclefttype => 'any', amprocrighttype => 'any', amprocnum => '12', - amproc => 'gist_stratnum_common' }, + amproc => 'gist_translate_cmptype_common' }, { amprocfamily => 'gist/poly_ops', amproclefttype => 'polygon', amprocrighttype => 'polygon', amprocnum => '1', amproc => 'gist_poly_consistent' }, @@ -555,7 +555,7 @@ amproc => 'gist_poly_distance' }, { amprocfamily => 'gist/poly_ops', amproclefttype => 'any', amprocrighttype => 'any', amprocnum => '12', - amproc => 'gist_stratnum_common' }, + amproc => 'gist_translate_cmptype_common' }, { amprocfamily => 'gist/circle_ops', amproclefttype => 'circle', amprocrighttype => 'circle', amprocnum => '1', amproc => 'gist_circle_consistent' }, @@ -576,7 +576,7 @@ amproc => 'gist_circle_distance' }, { amprocfamily => 'gist/circle_ops', amproclefttype => 'any', amprocrighttype => 'any', amprocnum => '12', - 
amproc => 'gist_stratnum_common' }, + amproc => 'gist_translate_cmptype_common' }, { amprocfamily => 'gist/tsvector_ops', amproclefttype => 'tsvector', amprocrighttype => 'tsvector', amprocnum => '1', amproc => 'gtsvector_consistent(internal,tsvector,int2,oid,internal)' }, @@ -636,7 +636,7 @@ amproc => 'range_sortsupport' }, { amprocfamily => 'gist/range_ops', amproclefttype => 'any', amprocrighttype => 'any', amprocnum => '12', - amproc => 'gist_stratnum_common' }, + amproc => 'gist_translate_cmptype_common' }, { amprocfamily => 'gist/network_ops', amproclefttype => 'inet', amprocrighttype => 'inet', amprocnum => '1', amproc => 'inet_gist_consistent' }, @@ -655,7 +655,7 @@ amprocrighttype => 'inet', amprocnum => '9', amproc => 'inet_gist_fetch' }, { amprocfamily => 'gist/network_ops', amproclefttype => 'any', amprocrighttype => 'any', amprocnum => '12', - amproc => 'gist_stratnum_common' }, + amproc => 'gist_translate_cmptype_common' }, { amprocfamily => 'gist/multirange_ops', amproclefttype => 'anymultirange', amprocrighttype => 'anymultirange', amprocnum => '1', amproc => 'multirange_gist_consistent' }, @@ -676,7 +676,7 @@ amproc => 'range_gist_same' }, { amprocfamily => 'gist/multirange_ops', amproclefttype => 'any', amprocrighttype => 'any', amprocnum => '12', - amproc => 'gist_stratnum_common' }, + amproc => 'gist_translate_cmptype_common' }, # gin { amprocfamily => 'gin/array_ops', amproclefttype => 'anyarray', diff --git a/src/include/catalog/pg_authid.dat b/src/include/catalog/pg_authid.dat index eb4dab5c6aa..c881c13adf1 100644 --- a/src/include/catalog/pg_authid.dat +++ b/src/include/catalog/pg_authid.dat @@ -99,7 +99,7 @@ rolcreaterole => 'f', rolcreatedb => 'f', rolcanlogin => 'f', rolreplication => 'f', rolbypassrls => 'f', rolconnlimit => '-1', rolpassword => '_null_', rolvaliduntil => '_null_' }, -{ oid => '8916', oid_symbol => 'ROLE_PG_SIGNAL_AUTOVACUUM_WORKER', +{ oid => '6392', oid_symbol => 'ROLE_PG_SIGNAL_AUTOVACUUM_WORKER', rolname => 'pg_signal_autovacuum_worker', rolsuper => 'f', rolinherit => 't', rolcreaterole => 'f', rolcreatedb => 'f', rolcanlogin => 'f', rolreplication => 'f', rolbypassrls => 'f', rolconnlimit => '-1', diff --git a/src/include/catalog/pg_cast.dat b/src/include/catalog/pg_cast.dat index ab46be606f0..fbfd669587f 100644 --- a/src/include/catalog/pg_cast.dat +++ b/src/include/catalog/pg_cast.dat @@ -281,6 +281,20 @@ castcontext => 'a', castmethod => 'f' }, { castsource => 'regnamespace', casttarget => 'int4', castfunc => '0', castcontext => 'a', castmethod => 'b' }, +{ castsource => 'oid', casttarget => 'regdatabase', castfunc => '0', + castcontext => 'i', castmethod => 'b' }, +{ castsource => 'regdatabase', casttarget => 'oid', castfunc => '0', + castcontext => 'i', castmethod => 'b' }, +{ castsource => 'int8', casttarget => 'regdatabase', castfunc => 'oid', + castcontext => 'i', castmethod => 'f' }, +{ castsource => 'int2', casttarget => 'regdatabase', castfunc => 'int4(int2)', + castcontext => 'i', castmethod => 'f' }, +{ castsource => 'int4', casttarget => 'regdatabase', castfunc => '0', + castcontext => 'i', castmethod => 'b' }, +{ castsource => 'regdatabase', casttarget => 'int8', castfunc => 'int8(oid)', + castcontext => 'a', castmethod => 'f' }, +{ castsource => 'regdatabase', casttarget => 'int4', castfunc => '0', + castcontext => 'a', castmethod => 'b' }, # String category { castsource => 'text', casttarget => 'bpchar', castfunc => '0', diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 
fb76c421931..8cfd09f0314 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -33,7 +33,8 @@ descr => 'sorts by Unicode code point; Unicode and POSIX character semantics', collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', colllocale => 'C.UTF-8', collversion => '1' }, -{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics', +{ oid => '6411', + descr => 'sorts by Unicode code point; Unicode character semantics', collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6', colllocale => 'PG_UNICODE_FAST', collversion => '1' }, diff --git a/src/include/catalog/pg_index.h b/src/include/catalog/pg_index.h index 4392b9d221d..731d3938169 100644 --- a/src/include/catalog/pg_index.h +++ b/src/include/catalog/pg_index.h @@ -69,7 +69,7 @@ CATALOG(pg_index,2610,IndexRelationId) BKI_SCHEMA_MACRO */ typedef FormData_pg_index *Form_pg_index; -DECLARE_TOAST_WITH_MACRO(pg_index, 8149, 8150, PgIndexToastTable, PgIndexToastIndex); +DECLARE_TOAST_WITH_MACRO(pg_index, 6351, 6352, PgIndexToastTable, PgIndexToastIndex); DECLARE_INDEX(pg_index_indrelid_index, 2678, IndexIndrelidIndexId, pg_index, btree(indrelid oid_ops)); DECLARE_UNIQUE_INDEX_PKEY(pg_index_indexrelid_index, 2679, IndexRelidIndexId, pg_index, btree(indexrelid oid_ops)); diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 62beb71da28..118d6da1ace 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -1004,7 +1004,7 @@ { oid => '3129', descr => 'sort support', proname => 'btint2sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint2sortsupport' }, -{ oid => '9290', descr => 'skip support', +{ oid => '6402', descr => 'skip support', proname => 'btint2skipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint2skipsupport' }, { oid => '351', descr => 'less-equal-greater', @@ -1013,7 +1013,7 @@ { oid => '3130', descr => 'sort support', proname => 'btint4sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint4sortsupport' }, -{ oid => '9291', descr => 'skip support', +{ oid => '6403', descr => 'skip support', proname => 'btint4skipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint4skipsupport' }, { oid => '842', descr => 'less-equal-greater', @@ -1022,7 +1022,7 @@ { oid => '3131', descr => 'sort support', proname => 'btint8sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint8sortsupport' }, -{ oid => '9292', descr => 'skip support', +{ oid => '6404', descr => 'skip support', proname => 'btint8skipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btint8skipsupport' }, { oid => '354', descr => 'less-equal-greater', @@ -1043,7 +1043,7 @@ { oid => '3134', descr => 'sort support', proname => 'btoidsortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btoidsortsupport' }, -{ oid => '9293', descr => 'skip support', +{ oid => '6405', descr => 'skip support', proname => 'btoidskipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btoidskipsupport' }, { oid => '404', descr => 'less-equal-greater', @@ -1052,7 +1052,7 @@ { oid => '358', descr => 'less-equal-greater', proname => 'btcharcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'char char', prosrc => 'btcharcmp' }, -{ oid => '9294', descr => 'skip support', +{ oid => '6406', descr => 'skip support', proname => 'btcharskipsupport', prorettype => 'void', 
proargtypes => 'internal', prosrc => 'btcharskipsupport' }, { oid => '359', descr => 'less-equal-greater', @@ -1180,24 +1180,24 @@ proname => 'name', proleakproof => 't', prorettype => 'name', proargtypes => 'bpchar', prosrc => 'bpchar_name' }, -{ oid => '8577', descr => 'convert int2 to bytea', +{ oid => '6367', descr => 'convert int2 to bytea', proname => 'bytea', proleakproof => 't', prorettype => 'bytea', proargtypes => 'int2', prosrc => 'int2_bytea' }, -{ oid => '8578', descr => 'convert int4 to bytea', +{ oid => '6368', descr => 'convert int4 to bytea', proname => 'bytea', proleakproof => 't', prorettype => 'bytea', proargtypes => 'int4', prosrc => 'int4_bytea' }, -{ oid => '8579', descr => 'convert int8 to bytea', +{ oid => '6369', descr => 'convert int8 to bytea', proname => 'bytea', proleakproof => 't', prorettype => 'bytea', proargtypes => 'int8', prosrc => 'int8_bytea' }, -{ oid => '8580', descr => 'convert bytea to int2', - proname => 'int2', prorettype => 'int2', - proargtypes => 'bytea', prosrc => 'bytea_int2' }, -{ oid => '8581', descr => 'convert bytea to int4', - proname => 'int4', prorettype => 'int4', - proargtypes => 'bytea', prosrc => 'bytea_int4' }, -{ oid => '8582', descr => 'convert bytea to int8', - proname => 'int8', prorettype => 'int8', - proargtypes => 'bytea', prosrc => 'bytea_int8' }, +{ oid => '6370', descr => 'convert bytea to int2', + proname => 'int2', prorettype => 'int2', proargtypes => 'bytea', + prosrc => 'bytea_int2' }, +{ oid => '6371', descr => 'convert bytea to int4', + proname => 'int4', prorettype => 'int4', proargtypes => 'bytea', + prosrc => 'bytea_int4' }, +{ oid => '6372', descr => 'convert bytea to int8', + proname => 'int8', prorettype => 'int8', proargtypes => 'bytea', + prosrc => 'bytea_int8' }, { oid => '449', descr => 'hash', proname => 'hashint2', prorettype => 'int4', proargtypes => 'int2', @@ -1259,10 +1259,10 @@ { oid => '772', descr => 'hash', proname => 'hashvarlenaextended', prorettype => 'int8', proargtypes => 'internal int8', prosrc => 'hashvarlenaextended' }, -{ oid => '9708', descr => 'hash', +{ oid => '6413', descr => 'hash', proname => 'hashbytea', prorettype => 'int4', proargtypes => 'bytea', prosrc => 'hashbytea' }, -{ oid => '9709', descr => 'hash', +{ oid => '6414', descr => 'hash', proname => 'hashbyteaextended', prorettype => 'int8', proargtypes => 'bytea int8', prosrc => 'hashbyteaextended' }, { oid => '457', descr => 'hash', @@ -1301,34 +1301,34 @@ { oid => '781', descr => 'hash', proname => 'hashmacaddr8extended', prorettype => 'int8', proargtypes => 'macaddr8 int8', prosrc => 'hashmacaddr8extended' }, -{ oid => '9710', descr => 'hash', +{ oid => '6415', descr => 'hash', proname => 'hashdate', prorettype => 'int4', proargtypes => 'date', prosrc => 'hashdate' }, -{ oid => '9711', descr => 'hash', +{ oid => '6416', descr => 'hash', proname => 'hashdateextended', prorettype => 'int8', proargtypes => 'date int8', prosrc => 'hashdateextended' }, -{ oid => '9712', descr => 'hash', +{ oid => '6417', descr => 'hash', proname => 'hashbool', prorettype => 'int4', proargtypes => 'bool', prosrc => 'hashbool' }, -{ oid => '9713', descr => 'hash', +{ oid => '6418', descr => 'hash', proname => 'hashboolextended', prorettype => 'int8', proargtypes => 'bool int8', prosrc => 'hashboolextended' }, -{ oid => '9714', descr => 'hash', +{ oid => '6419', descr => 'hash', proname => 'hashxid', prorettype => 'int4', proargtypes => 'xid', prosrc => 'hashxid' }, -{ oid => '9715', descr => 'hash', +{ oid => '6420', descr => 'hash', proname => 
'hashxidextended', prorettype => 'int8', proargtypes => 'xid int8', prosrc => 'hashxidextended' }, -{ oid => '9716', descr => 'hash', +{ oid => '6421', descr => 'hash', proname => 'hashxid8', prorettype => 'int4', proargtypes => 'xid8', prosrc => 'hashxid8' }, -{ oid => '9717', descr => 'hash', +{ oid => '6422', descr => 'hash', proname => 'hashxid8extended', prorettype => 'int8', proargtypes => 'xid8 int8', prosrc => 'hashxid8extended' }, -{ oid => '9718', descr => 'hash', +{ oid => '6423', descr => 'hash', proname => 'hashcid', prorettype => 'int4', proargtypes => 'cid', prosrc => 'hashcid' }, -{ oid => '9719', descr => 'hash', +{ oid => '6424', descr => 'hash', proname => 'hashcidextended', prorettype => 'int8', proargtypes => 'cid int8', prosrc => 'hashcidextended' }, @@ -1348,10 +1348,10 @@ proname => 'text_smaller', proleakproof => 't', prorettype => 'text', proargtypes => 'text text', prosrc => 'text_smaller' }, -{ oid => '8920', descr => 'larger of two', +{ oid => '6393', descr => 'larger of two', proname => 'bytea_larger', proleakproof => 't', prorettype => 'bytea', proargtypes => 'bytea bytea', prosrc => 'bytea_larger' }, -{ oid => '8921', descr => 'smaller of two', +{ oid => '6394', descr => 'smaller of two', proname => 'bytea_smaller', proleakproof => 't', prorettype => 'bytea', proargtypes => 'bytea bytea', prosrc => 'bytea_smaller' }, @@ -1533,7 +1533,7 @@ { oid => '6163', descr => 'number of set bits', proname => 'bit_count', prorettype => 'int8', proargtypes => 'bytea', prosrc => 'bytea_bit_count' }, -{ oid => '8694', descr => 'reverse bytea', +{ oid => '6382', descr => 'reverse bytea', proname => 'reverse', prorettype => 'bytea', proargtypes => 'bytea', prosrc => 'bytea_reverse' }, @@ -1638,7 +1638,7 @@ proname => 'array_append', prosupport => 'array_append_support', proisstrict => 'f', prorettype => 'anycompatiblearray', proargtypes => 'anycompatiblearray anycompatible', prosrc => 'array_append' }, -{ oid => '8680', descr => 'planner support for array_append', +{ oid => '6378', descr => 'planner support for array_append', proname => 'array_append_support', prorettype => 'internal', proargtypes => 'internal', prosrc => 'array_append_support' }, { oid => '379', descr => 'prepend element onto front of array', @@ -1646,7 +1646,7 @@ proisstrict => 'f', prorettype => 'anycompatiblearray', proargtypes => 'anycompatible anycompatiblearray', prosrc => 'array_prepend' }, -{ oid => '8681', descr => 'planner support for array_prepend', +{ oid => '6379', descr => 'planner support for array_prepend', proname => 'array_prepend_support', prorettype => 'internal', proargtypes => 'internal', prosrc => 'array_prepend_support' }, { oid => '383', @@ -1784,17 +1784,17 @@ { oid => '6216', descr => 'take samples from array', proname => 'array_sample', provolatile => 'v', prorettype => 'anyarray', proargtypes => 'anyarray int4', prosrc => 'array_sample' }, -{ oid => '8686', descr => 'reverse array', +{ oid => '6381', descr => 'reverse array', proname => 'array_reverse', prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'array_reverse' }, -{ oid => '8810', descr => 'sort array', +{ oid => '6388', descr => 'sort array', proname => 'array_sort', prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'array_sort' }, -{ oid => '8811', descr => 'sort array', +{ oid => '6389', descr => 'sort array', proname => 'array_sort', prorettype => 'anyarray', proargtypes => 'anyarray bool', proargnames => '{array,descending}', prosrc => 'array_sort_order' }, -{ oid => '8812', descr => 'sort 
array', +{ oid => '6390', descr => 'sort array', proname => 'array_sort', prorettype => 'anyarray', proargtypes => 'anyarray bool bool', proargnames => '{array,descending,nulls_first}', @@ -2315,7 +2315,7 @@ { oid => '3136', descr => 'sort support', proname => 'date_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'date_sortsupport' }, -{ oid => '9295', descr => 'skip support', +{ oid => '6407', descr => 'skip support', proname => 'date_skipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'date_skipsupport' }, { oid => '4133', descr => 'window RANGE support', @@ -3433,7 +3433,7 @@ proname => 'pg_sequence_last_value', provolatile => 'v', proparallel => 'u', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_sequence_last_value' }, -{ oid => '9876', descr => 'return sequence tuple, for use by pg_dump', +{ oid => '6427', descr => 'return sequence tuple, for use by pg_dump', proname => 'pg_get_sequence_data', provolatile => 'v', proparallel => 'u', prorettype => 'record', proargtypes => 'regclass', proallargtypes => '{regclass,int8,bool}', proargmodes => '{i,o,o}', @@ -3594,10 +3594,11 @@ proname => 'erfc', prorettype => 'float8', proargtypes => 'float8', prosrc => 'derfc' }, -{ oid => '8702', descr => 'gamma function', +{ oid => '6383', descr => 'gamma function', proname => 'gamma', prorettype => 'float8', proargtypes => 'float8', prosrc => 'dgamma' }, -{ oid => '8703', descr => 'natural logarithm of absolute value of gamma function', +{ oid => '6384', + descr => 'natural logarithm of absolute value of gamma function', proname => 'lgamma', prorettype => 'float8', proargtypes => 'float8', prosrc => 'dlgamma' }, @@ -3688,7 +3689,7 @@ { oid => '872', descr => 'capitalize each word', proname => 'initcap', prorettype => 'text', proargtypes => 'text', prosrc => 'initcap' }, -{ oid => '9569', descr => 'fold case', +{ oid => '6412', descr => 'fold case', proname => 'casefold', prorettype => 'text', proargtypes => 'text', prosrc => 'casefold' }, { oid => '873', descr => 'left-pad string to length', @@ -4515,7 +4516,7 @@ { oid => '1693', descr => 'less-equal-greater', proname => 'btboolcmp', proleakproof => 't', prorettype => 'int4', proargtypes => 'bool bool', prosrc => 'btboolcmp' }, -{ oid => '9296', descr => 'skip support', +{ oid => '6408', descr => 'skip support', proname => 'btboolskipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'btboolskipsupport' }, @@ -5450,17 +5451,17 @@ prorettype => 'bool', proargtypes => 'oid text', prosrc => 'has_any_column_privilege_id' }, -{ oid => '8048', +{ oid => '6348', descr => 'user privilege on large object by username, large object oid', proname => 'has_largeobject_privilege', procost => '10', provolatile => 's', prorettype => 'bool', proargtypes => 'name oid text', prosrc => 'has_largeobject_privilege_name_id' }, -{ oid => '8049', +{ oid => '6349', descr => 'current user privilege on large object by large object oid', proname => 'has_largeobject_privilege', procost => '10', provolatile => 's', prorettype => 'bool', proargtypes => 'oid text', prosrc => 'has_largeobject_privilege_id' }, -{ oid => '8050', +{ oid => '6350', descr => 'user privilege on large object by user oid, large object oid', proname => 'has_largeobject_privilege', procost => '10', provolatile => 's', prorettype => 'bool', proargtypes => 'oid oid text', @@ -5611,19 +5612,19 @@ proname => 'pg_stat_get_autoanalyze_count', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => 'oid', prosrc => 
'pg_stat_get_autoanalyze_count' }, -{ oid => '8406', descr => 'total vacuum time, in milliseconds', +{ oid => '6358', descr => 'total vacuum time, in milliseconds', proname => 'pg_stat_get_total_vacuum_time', provolatile => 's', proparallel => 'r', prorettype => 'float8', proargtypes => 'oid', prosrc => 'pg_stat_get_total_vacuum_time' }, -{ oid => '8407', descr => 'total autovacuum time, in milliseconds', +{ oid => '6359', descr => 'total autovacuum time, in milliseconds', proname => 'pg_stat_get_total_autovacuum_time', provolatile => 's', proparallel => 'r', prorettype => 'float8', proargtypes => 'oid', prosrc => 'pg_stat_get_total_autovacuum_time' }, -{ oid => '8408', descr => 'total analyze time, in milliseconds', +{ oid => '6360', descr => 'total analyze time, in milliseconds', proname => 'pg_stat_get_total_analyze_time', provolatile => 's', proparallel => 'r', prorettype => 'float8', proargtypes => 'oid', prosrc => 'pg_stat_get_total_analyze_time' }, -{ oid => '8409', descr => 'total autoanalyze time, in milliseconds', +{ oid => '6361', descr => 'total autoanalyze time, in milliseconds', proname => 'pg_stat_get_total_autoanalyze_time', provolatile => 's', proparallel => 'r', prorettype => 'float8', proargtypes => 'oid', prosrc => 'pg_stat_get_total_autoanalyze_time' }, @@ -5687,9 +5688,9 @@ { oid => '6231', descr => 'statistics: information about subscription stats', proname => 'pg_stat_get_subscription_stats', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => 'oid', - proallargtypes => '{oid,oid,int8,int8,int8,int8,int8,int8,int8,int8,int8,timestamptz}', - proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{subid,subid,apply_error_count,sync_error_count,confl_insert_exists,confl_update_origin_differs,confl_update_exists,confl_update_missing,confl_delete_origin_differs,confl_delete_missing,confl_multiple_unique_conflicts,stats_reset}', + proallargtypes => '{oid,oid,int8,int8,int8,int8,int8,int8,int8,int8,int8,int8,timestamptz}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{subid,subid,apply_error_count,sync_error_count,confl_insert_exists,confl_update_origin_differs,confl_update_exists,confl_update_deleted,confl_update_missing,confl_delete_origin_differs,confl_delete_missing,confl_multiple_unique_conflicts,stats_reset}', prosrc => 'pg_stat_get_subscription_stats' }, { oid => '6118', descr => 'statistics: information about subscription', proname => 'pg_stat_get_subscription', prorows => '10', proisstrict => 'f', @@ -5900,12 +5901,12 @@ proname => 'pg_stat_get_db_sessions_killed', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => 'oid', prosrc => 'pg_stat_get_db_sessions_killed' }, -{ oid => '8403', +{ oid => '6355', descr => 'statistics: number of parallel workers planned to be launched by queries', proname => 'pg_stat_get_db_parallel_workers_to_launch', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => 'oid', prosrc => 'pg_stat_get_db_parallel_workers_to_launch' }, -{ oid => '8404', +{ oid => '6356', descr => 'statistics: number of parallel workers effectively launched by queries', proname => 'pg_stat_get_db_parallel_workers_launched', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => 'oid', @@ -5927,7 +5928,7 @@ proname => 'pg_stat_get_checkpointer_num_requested', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_checkpointer_num_requested' }, -{ oid => '8599', +{ oid => '6377', descr => 
'statistics: number of checkpoints performed by the checkpointer', proname => 'pg_stat_get_checkpointer_num_performed', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', @@ -5954,7 +5955,7 @@ proname => 'pg_stat_get_checkpointer_buffers_written', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_checkpointer_buffers_written' }, -{ oid => '8573', +{ oid => '6366', descr => 'statistics: number of SLRU buffers written during checkpoints and restartpoints', proname => 'pg_stat_get_checkpointer_slru_written', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', @@ -6000,7 +6001,7 @@ proargnames => '{backend_type,object,context,reads,read_bytes,read_time,writes,write_bytes,write_time,writebacks,writeback_time,extends,extend_bytes,extend_time,hits,evictions,reuses,fsyncs,fsync_time,stats_reset}', prosrc => 'pg_stat_get_io' }, -{ oid => '8806', descr => 'statistics: backend IO statistics', +{ oid => '6386', descr => 'statistics: backend IO statistics', proname => 'pg_stat_get_backend_io', prorows => '5', proretset => 't', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => 'int4', @@ -6016,7 +6017,7 @@ proargmodes => '{o,o,o,o,o}', proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,stats_reset}', prosrc => 'pg_stat_get_wal' }, -{ oid => '8037', descr => 'statistics: backend WAL activity', +{ oid => '6313', descr => 'statistics: backend WAL activity', proname => 'pg_stat_get_backend_wal', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => 'int4', proallargtypes => '{int4,int8,int8,numeric,int8,timestamptz}', @@ -6155,7 +6156,7 @@ proname => 'pg_stat_reset_single_function_counters', provolatile => 'v', prorettype => 'void', proargtypes => 'oid', prosrc => 'pg_stat_reset_single_function_counters' }, -{ oid => '8807', descr => 'statistics: reset statistics for a single backend', +{ oid => '6387', descr => 'statistics: reset statistics for a single backend', proname => 'pg_stat_reset_backend_stats', provolatile => 'v', prorettype => 'void', proargtypes => 'int4', prosrc => 'pg_stat_reset_backend_stats' }, @@ -6369,10 +6370,10 @@ { oid => '3411', descr => 'hash', proname => 'timestamp_hash_extended', prorettype => 'int8', proargtypes => 'timestamp int8', prosrc => 'timestamp_hash_extended' }, -{ oid => '9720', descr => 'hash', +{ oid => '6425', descr => 'hash', proname => 'timestamptz_hash', prorettype => 'int4', proargtypes => 'timestamptz', prosrc => 'timestamptz_hash' }, -{ oid => '9721', descr => 'hash', +{ oid => '6426', descr => 'hash', proname => 'timestamptz_hash_extended', prorettype => 'int8', proargtypes => 'timestamptz int8', prosrc => 'timestamptz_hash_extended' }, { oid => '2041', descr => 'intervals overlap?', @@ -6397,7 +6398,7 @@ { oid => '3137', descr => 'sort support', proname => 'timestamp_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'timestamp_sortsupport' }, -{ oid => '9297', descr => 'skip support', +{ oid => '6409', descr => 'skip support', proname => 'timestamp_skipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'timestamp_skipsupport' }, @@ -6593,7 +6594,7 @@ proname => 'pg_describe_object', provolatile => 's', prorettype => 'text', proargtypes => 'oid oid int4', prosrc => 'pg_describe_object' }, -{ oid => '8730', descr => 'get ACL for SQL object', +{ oid => '6385', descr => 'get ACL for SQL object', proname => 'pg_get_acl', provolatile => 's', 
prorettype => '_aclitem', proargtypes => 'oid oid int4', proargnames => '{classid,objid,objsubid}', prosrc => 'pg_get_acl' }, @@ -6792,7 +6793,7 @@ proargnames => '{rm_id, rm_name, rm_builtin}', prosrc => 'pg_get_wal_resource_managers' }, -{ oid => '8303', descr => 'get info about loaded modules', +{ oid => '6353', descr => 'get info about loaded modules', proname => 'pg_get_loaded_modules', prorows => '10', proretset => 't', provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '', proallargtypes => '{text,text,text}', @@ -6992,7 +6993,7 @@ proname => 'max', prokind => 'a', proisstrict => 'f', prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'aggregate_dummy' }, -{ oid => '8595', descr => 'maximum value of all record input values', +{ oid => '6373', descr => 'maximum value of all record input values', proname => 'max', prokind => 'a', proisstrict => 'f', prorettype => 'record', proargtypes => 'record', prosrc => 'aggregate_dummy' }, { oid => '2244', descr => 'maximum value of all bpchar input values', @@ -7010,7 +7011,7 @@ { oid => '5099', descr => 'maximum value of all xid8 input values', proname => 'max', prokind => 'a', proisstrict => 'f', prorettype => 'xid8', proargtypes => 'xid8', prosrc => 'aggregate_dummy' }, -{ oid => '8922', descr => 'maximum value of all bytea input values', +{ oid => '6395', descr => 'maximum value of all bytea input values', proname => 'max', prokind => 'a', proisstrict => 'f', prorettype => 'bytea', proargtypes => 'bytea', prosrc => 'aggregate_dummy' }, @@ -7068,7 +7069,7 @@ proname => 'min', prokind => 'a', proisstrict => 'f', prorettype => 'anyarray', proargtypes => 'anyarray', prosrc => 'aggregate_dummy' }, -{ oid => '8596', descr => 'minimum value of all record input values', +{ oid => '6374', descr => 'minimum value of all record input values', proname => 'min', prokind => 'a', proisstrict => 'f', prorettype => 'record', proargtypes => 'record', prosrc => 'aggregate_dummy' }, { oid => '2245', descr => 'minimum value of all bpchar input values', @@ -7086,7 +7087,7 @@ { oid => '5100', descr => 'minimum value of all xid8 input values', proname => 'min', prokind => 'a', proisstrict => 'f', prorettype => 'xid8', proargtypes => 'xid8', prosrc => 'aggregate_dummy' }, -{ oid => '8923', descr => 'minimum value of all bytea input values', +{ oid => '6396', descr => 'minimum value of all bytea input values', proname => 'min', prokind => 'a', proisstrict => 'f', prorettype => 'bytea', proargtypes => 'bytea', prosrc => 'aggregate_dummy' }, @@ -7454,6 +7455,17 @@ prorettype => 'regnamespace', proargtypes => 'text', prosrc => 'to_regnamespace' }, +{ oid => '8321', descr => 'I/O', + proname => 'regdatabasein', provolatile => 's', prorettype => 'regdatabase', + proargtypes => 'cstring', prosrc => 'regdatabasein' }, +{ oid => '8322', descr => 'I/O', + proname => 'regdatabaseout', provolatile => 's', prorettype => 'cstring', + proargtypes => 'regdatabase', prosrc => 'regdatabaseout' }, +{ oid => '8323', descr => 'convert database name to regdatabase', + proname => 'to_regdatabase', provolatile => 's', + prorettype => 'regdatabase', proargtypes => 'text', + prosrc => 'to_regdatabase' }, + { oid => '6210', descr => 'test whether string is valid input for data type', proname => 'pg_input_is_valid', provolatile => 's', prorettype => 'bool', proargtypes => 'text text', prosrc => 'pg_input_is_valid' }, @@ -7949,10 +7961,10 @@ proargtypes => 'internal', prosrc => 'tsm_system_handler' }, # CRC variants -{ oid => '8571', descr => 'CRC-32 
value', +{ oid => '6364', descr => 'CRC-32 value', proname => 'crc32', proleakproof => 't', prorettype => 'int8', proargtypes => 'bytea', prosrc => 'crc32_bytea' }, -{ oid => '8572', descr => 'CRC-32C value', +{ oid => '6365', descr => 'CRC-32C value', proname => 'crc32c', proleakproof => 't', prorettype => 'int8', proargtypes => 'bytea', prosrc => 'crc32c_bytea' }, @@ -8312,6 +8324,12 @@ { oid => '4088', descr => 'I/O', proname => 'regnamespacesend', prorettype => 'bytea', proargtypes => 'regnamespace', prosrc => 'regnamespacesend' }, +{ oid => '8324', descr => 'I/O', + proname => 'regdatabaserecv', prorettype => 'regdatabase', + proargtypes => 'internal', prosrc => 'regdatabaserecv' }, +{ oid => '8325', descr => 'I/O', + proname => 'regdatabasesend', prorettype => 'bytea', + proargtypes => 'regdatabase', prosrc => 'regdatabasesend' }, { oid => '2456', descr => 'I/O', proname => 'bit_recv', prorettype => 'bit', proargtypes => 'internal oid int4', prosrc => 'bit_recv' }, @@ -8496,7 +8514,7 @@ proargmodes => '{o,o,o,o,o,o}', proargnames => '{name,statement,is_holdable,is_binary,is_scrollable,creation_time}', prosrc => 'pg_cursor' }, -{ oid => '9221', descr => 'get abbreviations from current timezone', +{ oid => '6401', descr => 'get abbreviations from current timezone', proname => 'pg_timezone_abbrevs_zone', prorows => '10', proretset => 't', provolatile => 's', prorettype => 'record', proargtypes => '', proallargtypes => '{text,interval,bool}', proargmodes => '{o,o,o}', @@ -8554,6 +8572,14 @@ proargnames => '{name,numa_node,size}', prosrc => 'pg_get_shmem_allocations_numa' }, +{ oid => '9314', + descr => 'shared memory allocations tracked in the DSM registry', + proname => 'pg_get_dsm_registry_allocations', prorows => '50', + proretset => 't', provolatile => 'v', prorettype => 'record', + proargtypes => '', proallargtypes => '{text,text,int8}', + proargmodes => '{o,o,o}', proargnames => '{name,type,size}', + prosrc => 'pg_get_dsm_registry_allocations' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', @@ -8571,16 +8597,6 @@ prorettype => 'bool', proargtypes => 'int4', prosrc => 'pg_log_backend_memory_contexts' }, -# publishing memory contexts of the specified postgres process -{ oid => '2173', descr => 'publish memory contexts of the specified backend', - proname => 'pg_get_process_memory_contexts', provolatile => 'v', - prorows => '100', proretset => 't', proparallel => 'r', - prorettype => 'record', proargtypes => 'int4 bool float8', - proallargtypes => '{int4,bool,float8,text,text,text,_int4,int4,int8,int8,int8,int8,int8,int4,timestamptz}', - proargmodes => '{i,i,i,o,o,o,o,o,o,o,o,o,o,o,o}', - proargnames => '{pid, summary, timeout, name, ident, type, path, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes, num_agg_contexts, stats_timestamp}', - prosrc => 'pg_get_process_memory_contexts' }, - # non-persistent series generator { oid => '1066', descr => 'non-persistent series generator', proname => 'generate_series', prorows => '1000', @@ -8618,7 +8634,7 @@ prosupport => 'generate_series_numeric_support', proretset => 't', prorettype => 'numeric', proargtypes => 'numeric numeric', prosrc => 'generate_series_numeric' }, -{ oid => '8405', descr => 'planner support for generate_series', +{ oid => '6357', descr => 'planner support for generate_series', proname => 'generate_series_numeric_support', prorettype => 'internal', proargtypes => 'internal', prosrc => 'generate_series_numeric_support' }, { oid 
=> '938', descr => 'non-persistent series generator', @@ -8638,7 +8654,7 @@ prorettype => 'timestamptz', proargtypes => 'timestamptz timestamptz interval text', prosrc => 'generate_series_timestamptz_at_zone' }, -{ oid => '8402', descr => 'planner support for generate_series', +{ oid => '6354', descr => 'planner support for generate_series', proname => 'generate_series_timestamp_support', prorettype => 'internal', proargtypes => 'internal', prosrc => 'generate_series_timestamp_support' }, @@ -9370,8 +9386,8 @@ proname => 'to_json', provolatile => 's', prorettype => 'json', proargtypes => 'anyelement', prosrc => 'to_json' }, { oid => '3261', descr => 'remove object fields with null values from json', - proname => 'json_strip_nulls', prorettype => 'json', proargtypes => 'json bool', - prosrc => 'json_strip_nulls' }, + proname => 'json_strip_nulls', prorettype => 'json', + proargtypes => 'json bool', prosrc => 'json_strip_nulls' }, { oid => '3947', proname => 'json_object_field', prorettype => 'json', @@ -9477,7 +9493,7 @@ { oid => '3300', descr => 'sort support', proname => 'uuid_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'uuid_sortsupport' }, -{ oid => '9298', descr => 'skip support', +{ oid => '6410', descr => 'skip support', proname => 'uuid_skipsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'uuid_skipsupport' }, { oid => '2961', descr => 'I/O', @@ -9493,17 +9509,19 @@ proname => 'uuid_hash_extended', prorettype => 'int8', proargtypes => 'uuid int8', prosrc => 'uuid_hash_extended' }, { oid => '3432', descr => 'generate random UUID', - proname => 'gen_random_uuid', provolatile => 'v', - prorettype => 'uuid', proargtypes => '', prosrc => 'gen_random_uuid' }, -{ oid => '9895', descr => 'generate UUID version 4', - proname => 'uuidv4', provolatile => 'v', - prorettype => 'uuid', proargtypes => '', prosrc => 'gen_random_uuid' }, -{ oid => '9896', descr => 'generate UUID version 7', - proname => 'uuidv7', provolatile => 'v', - prorettype => 'uuid', proargtypes => '', prosrc => 'uuidv7' }, -{ oid => '9897', descr => 'generate UUID version 7 with a timestamp shifted by specified interval', - proname => 'uuidv7', provolatile => 'v', proargnames => '{shift}', - prorettype => 'uuid', proargtypes => 'interval', prosrc => 'uuidv7_interval' }, + proname => 'gen_random_uuid', provolatile => 'v', prorettype => 'uuid', + proargtypes => '', prosrc => 'gen_random_uuid' }, +{ oid => '6428', descr => 'generate UUID version 4', + proname => 'uuidv4', provolatile => 'v', prorettype => 'uuid', + proargtypes => '', prosrc => 'gen_random_uuid' }, +{ oid => '6429', descr => 'generate UUID version 7', + proname => 'uuidv7', provolatile => 'v', prorettype => 'uuid', + proargtypes => '', prosrc => 'uuidv7' }, +{ oid => '6430', + descr => 'generate UUID version 7 with a timestamp shifted by specified interval', + proname => 'uuidv7', provolatile => 'v', prorettype => 'uuid', + proargtypes => 'interval', proargnames => '{shift}', + prosrc => 'uuidv7_interval' }, { oid => '6342', descr => 'extract timestamp from UUID', proname => 'uuid_extract_timestamp', proleakproof => 't', prorettype => 'timestamptz', proargtypes => 'uuid', @@ -10309,8 +10327,8 @@ prorettype => 'jsonb', proargtypes => '', prosrc => 'jsonb_build_object_noargs' }, { oid => '3262', descr => 'remove object fields with null values from jsonb', - proname => 'jsonb_strip_nulls', prorettype => 'jsonb', proargtypes => 'jsonb bool', - prosrc => 'jsonb_strip_nulls' }, + proname => 'jsonb_strip_nulls', 
prorettype => 'jsonb', + proargtypes => 'jsonb bool', prosrc => 'jsonb_strip_nulls' }, { oid => '3478', proname => 'jsonb_object_field', prorettype => 'jsonb', @@ -10661,10 +10679,10 @@ { oid => '2987', descr => 'less-equal-greater', proname => 'btrecordcmp', prorettype => 'int4', proargtypes => 'record record', prosrc => 'btrecordcmp' }, -{ oid => '8597', descr => 'larger of two', +{ oid => '6375', descr => 'larger of two', proname => 'record_larger', prorettype => 'record', proargtypes => 'record record', prosrc => 'record_larger' }, -{ oid => '8598', descr => 'smaller of two', +{ oid => '6376', descr => 'smaller of two', proname => 'record_smaller', prorettype => 'record', proargtypes => 'record record', prosrc => 'record_smaller' }, @@ -10904,7 +10922,7 @@ { oid => '3870', descr => 'less-equal-greater', proname => 'range_cmp', prorettype => 'int4', proargtypes => 'anyrange anyrange', prosrc => 'range_cmp' }, -{ oid => '8849', descr => 'sort support', +{ oid => '6391', descr => 'sort support', proname => 'range_sortsupport', prorettype => 'void', proargtypes => 'internal', prosrc => 'range_sortsupport' }, { oid => '3871', @@ -11783,6 +11801,10 @@ proname => 'binary_upgrade_replorigin_advance', proisstrict => 'f', provolatile => 'v', proparallel => 'u', prorettype => 'void', proargtypes => 'text pg_lsn', prosrc => 'binary_upgrade_replorigin_advance' }, +{ oid => '9159', descr => 'for use by pg_upgrade (conflict detection slot)', + proname => 'binary_upgrade_create_conflict_detection_slot', proisstrict => 'f', + provolatile => 'v', proparallel => 'u', prorettype => 'void', + proargtypes => '', prosrc => 'binary_upgrade_create_conflict_detection_slot' }, # conversion functions { oid => '4302', @@ -12323,7 +12345,7 @@ proname => 'array_subscript_handler', prosupport => 'array_subscript_handler_support', prorettype => 'internal', proargtypes => 'internal', prosrc => 'array_subscript_handler' }, -{ oid => '8682', descr => 'planner support for array_subscript_handler', +{ oid => '6380', descr => 'planner support for array_subscript_handler', proname => 'array_subscript_handler_support', prorettype => 'internal', proargtypes => 'internal', prosrc => 'array_subscript_handler_support' }, { oid => '6180', descr => 'raw array subscripting support', @@ -12362,7 +12384,7 @@ provolatile => 'v', prorettype => 'record', proargtypes => '', proallargtypes => '{text,int8,timestamptz}', proargmodes => '{o,o,o}', proargnames => '{name,size,modification}', prosrc => 'pg_ls_waldir' }, -{ oid => '9220', descr => 'list of files in the pg_wal/summaries directory', +{ oid => '6400', descr => 'list of files in the pg_wal/summaries directory', proname => 'pg_ls_summariesdir', procost => '10', prorows => '20', proretset => 't', provolatile => 'v', prorettype => 'record', proargtypes => '', proallargtypes => '{text,int8,timestamptz}', @@ -12518,49 +12540,37 @@ proargnames => '{summarized_tli,summarized_lsn,pending_lsn,summarizer_pid}', prosrc => 'pg_get_wal_summarizer_state' }, # Statistics Import -{ oid => '8459', - descr => 'restore statistics on relation', - proname => 'pg_restore_relation_stats', provolatile => 'v', proisstrict => 'f', - provariadic => 'any', - proparallel => 'u', prorettype => 'bool', - proargtypes => 'any', - proargnames => '{kwargs}', - proargmodes => '{v}', - prosrc => 'pg_restore_relation_stats' }, -{ oid => '9160', - descr => 'clear statistics on relation', - proname => 'pg_clear_relation_stats', provolatile => 'v', proisstrict => 'f', - proparallel => 'u', prorettype => 'void', - 
proargtypes => 'text text', - proargnames => '{schemaname,relname}', - prosrc => 'pg_clear_relation_stats' }, -{ oid => '8461', - descr => 'restore statistics on attribute', - proname => 'pg_restore_attribute_stats', provolatile => 'v', proisstrict => 'f', - provariadic => 'any', - proparallel => 'u', prorettype => 'bool', - proargtypes => 'any', - proargnames => '{kwargs}', - proargmodes => '{v}', - prosrc => 'pg_restore_attribute_stats' }, -{ oid => '9162', - descr => 'clear statistics on attribute', - proname => 'pg_clear_attribute_stats', provolatile => 'v', proisstrict => 'f', +{ oid => '6362', descr => 'restore statistics on relation', + proname => 'pg_restore_relation_stats', provariadic => 'any', + proisstrict => 'f', provolatile => 'v', proparallel => 'u', + prorettype => 'bool', proargtypes => 'any', proargmodes => '{v}', + proargnames => '{kwargs}', prosrc => 'pg_restore_relation_stats' }, +{ oid => '6397', descr => 'clear statistics on relation', + proname => 'pg_clear_relation_stats', proisstrict => 'f', provolatile => 'v', + proparallel => 'u', prorettype => 'void', proargtypes => 'text text', + proargnames => '{schemaname,relname}', prosrc => 'pg_clear_relation_stats' }, +{ oid => '6363', descr => 'restore statistics on attribute', + proname => 'pg_restore_attribute_stats', provariadic => 'any', + proisstrict => 'f', provolatile => 'v', proparallel => 'u', + prorettype => 'bool', proargtypes => 'any', proargmodes => '{v}', + proargnames => '{kwargs}', prosrc => 'pg_restore_attribute_stats' }, +{ oid => '6398', descr => 'clear statistics on attribute', + proname => 'pg_clear_attribute_stats', proisstrict => 'f', provolatile => 'v', proparallel => 'u', prorettype => 'void', proargtypes => 'text text text bool', proargnames => '{schemaname,relname,attname,inherited}', prosrc => 'pg_clear_attribute_stats' }, # GiST stratnum implementations -{ oid => '8047', descr => 'GiST support', - proname => 'gist_stratnum_common', prorettype => 'int2', - proargtypes => 'int4', - prosrc => 'gist_stratnum_common' }, +{ oid => '6347', descr => 'GiST support', + proname => 'gist_translate_cmptype_common', prorettype => 'int2', + proargtypes => 'int4', prosrc => 'gist_translate_cmptype_common' }, # AIO related functions -{ oid => '9200', descr => 'information about in-progress asynchronous IOs', +{ oid => '6399', descr => 'information about in-progress asynchronous IOs', proname => 'pg_get_aios', prorows => '100', proretset => 't', - provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '', + provolatile => 'v', proparallel => 'r', prorettype => 'record', + proargtypes => '', proallargtypes => '{int4,int4,int8,text,text,int8,int8,text,int2,int4,text,text,bool,bool,bool}', proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}', proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}', diff --git a/src/include/catalog/pg_publication.h b/src/include/catalog/pg_publication.h index 48c7d1a8615..6e074190fd2 100644 --- a/src/include/catalog/pg_publication.h +++ b/src/include/catalog/pg_publication.h @@ -146,7 +146,7 @@ extern Publication *GetPublicationByName(const char *pubname, bool missing_ok); extern List *GetRelationPublications(Oid relid); /*--------- - * Expected values for pub_partopt parameter of GetRelationPublications(), + * Expected values for pub_partopt parameter of GetPublicationRelations(), * which allows callers to specify which partitions of partitioned tables * mentioned 
in the publication they expect to see. * diff --git a/src/include/catalog/pg_subscription.h b/src/include/catalog/pg_subscription.h index 20fc329992d..231ef84ec9a 100644 --- a/src/include/catalog/pg_subscription.h +++ b/src/include/catalog/pg_subscription.h @@ -78,6 +78,9 @@ CATALOG(pg_subscription,6100,SubscriptionRelationId) BKI_SHARED_RELATION BKI_ROW * slots) in the upstream database are enabled * to be synchronized to the standbys. */ + bool subretaindeadtuples; /* True if dead tuples useful for + * conflict detection are retained */ + #ifdef CATALOG_VARLEN /* variable-length fields start here */ /* Connection string to the publisher */ text subconninfo BKI_FORCE_NOT_NULL; @@ -131,6 +134,8 @@ typedef struct Subscription * (i.e. the main slot and the table sync * slots) in the upstream database are enabled * to be synchronized to the standbys. */ + bool retaindeadtuples; /* True if dead tuples useful for conflict + * detection are retained */ char *conninfo; /* Connection string to the publisher */ char *slotname; /* Name of the replication slot */ char *synccommit; /* Synchronous commit setting for worker */ diff --git a/src/include/catalog/pg_subscription_rel.h b/src/include/catalog/pg_subscription_rel.h index c91797c869c..f458447a0e5 100644 --- a/src/include/catalog/pg_subscription_rel.h +++ b/src/include/catalog/pg_subscription_rel.h @@ -85,7 +85,7 @@ typedef struct SubscriptionRelState extern void AddSubscriptionRelState(Oid subid, Oid relid, char state, XLogRecPtr sublsn, bool retain_lock); extern void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, - XLogRecPtr sublsn); + XLogRecPtr sublsn, bool already_locked); extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn); extern void RemoveSubscriptionRel(Oid subid, Oid relid); diff --git a/src/include/catalog/pg_type.dat b/src/include/catalog/pg_type.dat index 6dca77e0a22..29e4ffffc98 100644 --- a/src/include/catalog/pg_type.dat +++ b/src/include/catalog/pg_type.dat @@ -399,6 +399,11 @@ typinput => 'regnamespacein', typoutput => 'regnamespaceout', typreceive => 'regnamespacerecv', typsend => 'regnamespacesend', typalign => 'i' }, +{ oid => '8326', array_type_oid => '8327', descr => 'registered database', + typname => 'regdatabase', typlen => '4', typbyval => 't', typcategory => 'N', + typinput => 'regdatabasein', typoutput => 'regdatabaseout', + typreceive => 'regdatabaserecv', typsend => 'regdatabasesend', + typalign => 'i' }, # uuid { oid => '2950', array_type_oid => '2951', descr => 'UUID', diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index 06dfdfef721..541176e1980 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -20,15 +20,12 @@ #include "tcop/dest.h" /* - * Represents whether a header line should be present, and whether it must - * match the actual names (which implies "true"). + * Represents whether a header line must match the actual names + * (which implies "true"), and whether it should be present. */ -typedef enum CopyHeaderChoice -{ - COPY_HEADER_FALSE = 0, - COPY_HEADER_TRUE, - COPY_HEADER_MATCH, -} CopyHeaderChoice; +#define COPY_HEADER_MATCH -1 +#define COPY_HEADER_FALSE 0 +#define COPY_HEADER_TRUE 1 /* * Represents where to save input processing errors. More values to be added @@ -64,7 +61,8 @@ typedef struct CopyFormatOptions bool binary; /* binary format? */ bool freeze; /* freeze rows on loading? */ bool csv_mode; /* Comma Separated Value format? */ - CopyHeaderChoice header_line; /* header line? 
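[An aside on the copy.h hunk here: collapsing the CopyHeaderChoice enum into a plain int lets a single field encode both the old tri-state header choice and a count of initial lines to skip. A hedged sketch of how a consumer might decode the new field — header_lines_to_skip is an invented name; the macro values come straight from the hunk:]

#include "postgres.h"
#include "commands/copy.h"

/*
 * Interpret CopyFormatOptions.header_line under the new representation:
 * COPY_HEADER_MATCH (-1) means read one header line and verify the column
 * names, COPY_HEADER_FALSE (0) means no header, and any positive value is
 * a number of initial lines to skip (COPY_HEADER_TRUE == 1 is plain HEADER).
 */
static int
header_lines_to_skip(int header_line)
{
    if (header_line == COPY_HEADER_MATCH)
        return 1;               /* one line, with column-name verification */
    if (header_line >= COPY_HEADER_TRUE)
        return header_line;     /* skip that many lines */
    return 0;                   /* COPY_HEADER_FALSE */
}

[The copy.h diff continues below.]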
*/ + int header_line; /* number of lines to skip or COPY_HEADER_XXX + * value (see the above) */ char *null_print; /* NULL marker string (server encoding!) */ int null_print_len; /* length of same */ char *null_print_client; /* same converted to file encoding */ diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 03c5b3d73e5..3b122f79ed8 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -63,10 +63,8 @@ extern void ExplainOneUtility(Node *utilityStmt, IntoClause *into, struct ExplainState *es, ParseState *pstate, ParamListInfo params); -extern void ExplainOnePlan(PlannedStmt *plannedstmt, CachedPlan *cplan, - CachedPlanSource *plansource, int query_index, - IntoClause *into, struct ExplainState *es, - const char *queryString, +extern void ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, + struct ExplainState *es, const char *queryString, ParamListInfo params, QueryEnvironment *queryEnv, const instr_time *planduration, const BufferUsage *bufusage, diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 7c736e7b03b..1cde4bd9bcf 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -130,6 +130,7 @@ #define PROGRESS_BASEBACKUP_BACKUP_STREAMED 2 #define PROGRESS_BASEBACKUP_TBLSPC_TOTAL 3 #define PROGRESS_BASEBACKUP_TBLSPC_STREAMED 4 +#define PROGRESS_BASEBACKUP_BACKUP_TYPE 5 /* Phases of pg_basebackup (as advertised via PROGRESS_BASEBACKUP_PHASE) */ #define PROGRESS_BASEBACKUP_PHASE_WAIT_CHECKPOINT 1 @@ -138,6 +139,10 @@ #define PROGRESS_BASEBACKUP_PHASE_WAIT_WAL_ARCHIVE 4 #define PROGRESS_BASEBACKUP_PHASE_TRANSFER_WAL 5 +/* Types of pg_basebackup (as advertised via PROGRESS_BASEBACKUP_BACKUP_TYPE) */ +#define PROGRESS_BASEBACKUP_BACKUP_TYPE_FULL 1 +#define PROGRESS_BASEBACKUP_BACKUP_TYPE_INCREMENTAL 2 + /* Progress parameters for PROGRESS_COPY */ #define PROGRESS_COPY_BYTES_PROCESSED 0 #define PROGRESS_COPY_BYTES_TOTAL 1 diff --git a/src/include/commands/subscriptioncmds.h b/src/include/commands/subscriptioncmds.h index c2262e46a7f..9b288ad22a6 100644 --- a/src/include/commands/subscriptioncmds.h +++ b/src/include/commands/subscriptioncmds.h @@ -28,4 +28,9 @@ extern void AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId); extern char defGetStreamingMode(DefElem *def); +extern ObjectAddress AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, bool isTopLevel); + +extern void CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled); + #endif /* SUBSCRIPTIONCMDS_H */ diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 4180601dcd4..cfd7daa20ed 100644 --- a/src/include/commands/trigger.h +++ b/src/include/commands/trigger.h @@ -213,7 +213,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd); + TM_FailureData *tmfd, + bool is_merge_delete); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, ItemPointer tupleid, @@ -235,7 +236,8 @@ extern bool ExecBRUpdateTriggers(EState *estate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd); + TM_FailureData *tmfd, + bool is_merge_update); extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, @@ -258,7 +260,6 @@ extern void ExecASTruncateTriggers(EState *estate, extern void AfterTriggerBeginXact(void); extern void 
AfterTriggerBeginQuery(void); extern void AfterTriggerEndQuery(EState *estate); -extern void AfterTriggerAbortQuery(void); extern void AfterTriggerFireDeferred(void); extern void AfterTriggerEndXact(bool isCommit); extern void AfterTriggerBeginSubXact(void); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index bc37a80dc74..14eeccbd718 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -336,7 +336,7 @@ extern PGDLLIMPORT int64 parallel_vacuum_worker_delay_ns; /* in commands/vacuum.c */ extern void ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel); -extern void vacuum(List *relations, VacuumParams *params, +extern void vacuum(List *relations, const VacuumParams params, BufferAccessStrategy bstrategy, MemoryContext vac_context, bool isTopLevel); extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, @@ -357,7 +357,7 @@ extern void vac_update_relstats(Relation relation, bool *frozenxid_updated, bool *minmulti_updated, bool in_outer_xact); -extern bool vacuum_get_cutoffs(Relation rel, const VacuumParams *params, +extern bool vacuum_get_cutoffs(Relation rel, const VacuumParams params, struct VacuumCutoffs *cutoffs); extern bool vacuum_xid_failsafe_check(const struct VacuumCutoffs *cutoffs); extern void vac_update_datfrozenxid(void); @@ -398,7 +398,7 @@ extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc); /* in commands/analyze.c */ extern void analyze_rel(Oid relid, RangeVar *relation, - VacuumParams *params, List *va_cols, bool in_outer_xact, + const VacuumParams params, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy); extern bool std_typanalyze(VacAttrStats *stats); diff --git a/src/include/common/int128.h b/src/include/common/int128.h index a50f5709c29..addef99bfa5 100644 --- a/src/include/common/int128.h +++ b/src/include/common/int128.h @@ -6,7 +6,7 @@ * We make use of the native int128 type if there is one, otherwise * implement things the hard way based on two int64 halves. * - * See src/tools/testint128.c for a simple test harness for this file. + * See src/test/modules/test_int128 for a simple test harness for this file. * * Copyright (c) 2017-2025, PostgreSQL Global Development Group * @@ -29,81 +29,21 @@ #endif #endif - -#if USE_NATIVE_INT128 - -typedef int128 INT128; - -/* - * Add an unsigned int64 value into an INT128 variable. - */ -static inline void -int128_add_uint64(INT128 *i128, uint64 v) -{ - *i128 += v; -} - /* - * Add a signed int64 value into an INT128 variable. - */ -static inline void -int128_add_int64(INT128 *i128, int64 v) -{ - *i128 += v; -} - -/* - * Add the 128-bit product of two int64 values into an INT128 variable. + * If native int128 support is enabled, INT128 is just int128. Otherwise, it + * is a structure with separate 64-bit high and low parts. * - * XXX with a stupid compiler, this could actually be less efficient than - * the other implementation; maybe we should do it by hand always? - */ -static inline void -int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y) -{ - *i128 += (int128) x * (int128) y; -} - -/* - * Compare two INT128 values, return -1, 0, or +1. - */ -static inline int -int128_compare(INT128 x, INT128 y) -{ - if (x < y) - return -1; - if (x > y) - return 1; - return 0; -} - -/* - * Widen int64 to INT128. - */ -static inline INT128 -int64_to_int128(int64 v) -{ - return (INT128) v; -} - -/* - * Convert INT128 to int64 (losing any high-order bits). - * This also works fine for casting down to uint64. 
- */ -static inline int64 -int128_to_int64(INT128 val) -{ - return (int64) val; -} - -#else /* !USE_NATIVE_INT128 */ - -/* * We lay out the INT128 structure with the same content and byte ordering * that a native int128 type would (probably) have. This makes no difference * for ordinary use of INT128, but allows union'ing INT128 with int128 for * testing purposes. */ +#if USE_NATIVE_INT128 + +typedef int128 INT128; + +#else + typedef struct { #ifdef WORDS_BIGENDIAN @@ -115,25 +55,31 @@ typedef struct #endif } INT128; +#endif + /* * Add an unsigned int64 value into an INT128 variable. */ static inline void int128_add_uint64(INT128 *i128, uint64 v) { +#if USE_NATIVE_INT128 + *i128 += v; +#else /* * First add the value to the .lo part, then check to see if a carry needs - * to be propagated into the .hi part. A carry is needed if both inputs - * have high bits set, or if just one input has high bit set while the new - * .lo part doesn't. Remember that .lo part is unsigned; we cast to - * signed here just as a cheap way to check the high bit. + * to be propagated into the .hi part. Since this is unsigned integer + * arithmetic, which is just modular arithmetic, a carry is needed if the + * new .lo part is less than the old .lo part (i.e., if modular + * wrap-around occurred). Writing this in the form below, rather than + * using an "if" statement causes modern compilers to produce branchless + * machine code identical to the native code. */ uint64 oldlo = i128->lo; i128->lo += v; - if (((int64) v < 0 && (int64) oldlo < 0) || - (((int64) v < 0 || (int64) oldlo < 0) && (int64) i128->lo >= 0)) - i128->hi++; + i128->hi += (i128->lo < oldlo); +#endif } /* @@ -142,33 +88,33 @@ int128_add_uint64(INT128 *i128, uint64 v) static inline void int128_add_int64(INT128 *i128, int64 v) { +#if USE_NATIVE_INT128 + *i128 += v; +#else /* * This is much like the above except that the carry logic differs for - * negative v. Ordinarily we'd need to subtract 1 from the .hi part - * (corresponding to adding the sign-extended bits of v to it); but if - * there is a carry out of the .lo part, that cancels and we do nothing. + * negative v -- we need to subtract 1 from the .hi part if the new .lo + * value is greater than the old .lo value. That can be achieved without + * any branching by adding the sign bit from v (v >> 63 = 0 or -1) to the + * previous result (for negative v, if the new .lo value is less than the + * old .lo value, the two terms cancel and we leave the .hi part + * unchanged, otherwise we subtract 1 from the .hi part). With modern + * compilers this often produces machine code identical to the native + * code. */ uint64 oldlo = i128->lo; i128->lo += v; - if (v >= 0) - { - if ((int64) oldlo < 0 && (int64) i128->lo >= 0) - i128->hi++; - } - else - { - if (!((int64) oldlo < 0 || (int64) i128->lo >= 0)) - i128->hi--; - } + i128->hi += (i128->lo < oldlo) + (v >> 63); +#endif } /* - * INT64_AU32 extracts the most significant 32 bits of int64 as int64, while - * INT64_AL32 extracts the least significant 32 bits as uint64. + * INT64_HI_INT32 extracts the most significant 32 bits of int64 as int32. + * INT64_LO_UINT32 extracts the least significant 32 bits as uint32. */ -#define INT64_AU32(i64) ((i64) >> 32) -#define INT64_AL32(i64) ((i64) & UINT64CONST(0xFFFFFFFF)) +#define INT64_HI_INT32(i64) ((int32) ((i64) >> 32)) +#define INT64_LO_UINT32(i64) ((uint32) (i64)) /* * Add the 128-bit product of two int64 values into an INT128 variable. 
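[The two additions above now detect a carry with a single unsigned comparison instead of sign-bit tests, as the new comments explain. Here is a self-contained cross-check of those exact carry expressions against native 128-bit arithmetic — a sketch, not part of the patch; i128, add_u64, and add_i64 are invented demo names, and it assumes a compiler providing __int128 (gcc/clang on 64-bit):]

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Two-word accumulator mirroring the non-native INT128 layout. */
typedef struct
{
    int64_t     hi;
    uint64_t    lo;
} i128;

/* Unsigned add: a carry out of .lo occurred iff the new .lo wrapped around. */
static void
add_u64(i128 *a, uint64_t v)
{
    uint64_t    oldlo = a->lo;

    a->lo += v;
    a->hi += (a->lo < oldlo);
}

/* Signed add: (v >> 63) is 0 or -1 and offsets the carry term, exactly as
 * in the patched int128_add_int64. */
static void
add_i64(i128 *a, int64_t v)
{
    uint64_t    oldlo = a->lo;

    a->lo += (uint64_t) v;
    a->hi += (a->lo < oldlo) + (v >> 63);
}

int
main(void)
{
    __int128    ref = 0;
    i128        acc = {0, 0};
    int64_t     samples[] = {0, 1, -1, INT64_MAX, INT64_MIN, 424242};

    for (int i = 0; i < (int) (sizeof(samples) / sizeof(samples[0])); i++)
    {
        add_i64(&acc, samples[i]);
        ref += samples[i];
        assert(acc.hi == (int64_t) (ref >> 64));
        assert(acc.lo == (uint64_t) ref);
    }
    add_u64(&acc, UINT64_MAX);
    ref += (__int128) UINT64_MAX;
    assert(acc.hi == (int64_t) (ref >> 64) && acc.lo == (uint64_t) ref);
    printf("branchless carries match native __int128\n");
    return 0;
}

[Note that the (v >> 63) term relies on arithmetic right shift of a negative int64 — the same assumption the header itself static-asserts just below.]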
@@ -176,7 +122,14 @@ int128_add_int64(INT128 *i128, int64 v) static inline void int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y) { - /* INT64_AU32 must use arithmetic right shift */ +#if USE_NATIVE_INT128 + /* + * XXX with a stupid compiler, this could actually be less efficient than + * the non-native implementation; maybe we should do it by hand always? + */ + *i128 += (int128) x * (int128) y; +#else + /* INT64_HI_INT32 must use arithmetic right shift */ StaticAssertDecl(((int64) -1 >> 1) == (int64) -1, "arithmetic right shift is needed"); @@ -201,34 +154,29 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y) /* No need to work hard if product must be zero */ if (x != 0 && y != 0) { - int64 x_u32 = INT64_AU32(x); - uint64 x_l32 = INT64_AL32(x); - int64 y_u32 = INT64_AU32(y); - uint64 y_l32 = INT64_AL32(y); + int32 x_hi = INT64_HI_INT32(x); + uint32 x_lo = INT64_LO_UINT32(x); + int32 y_hi = INT64_HI_INT32(y); + uint32 y_lo = INT64_LO_UINT32(y); int64 tmp; /* the first term */ - i128->hi += x_u32 * y_u32; + i128->hi += (int64) x_hi * (int64) y_hi; - /* the second term: sign-extend it only if x is negative */ - tmp = x_u32 * y_l32; - if (x < 0) - i128->hi += INT64_AU32(tmp); - else - i128->hi += ((uint64) tmp) >> 32; - int128_add_uint64(i128, ((uint64) INT64_AL32(tmp)) << 32); + /* the second term: sign-extended with the sign of x */ + tmp = (int64) x_hi * (int64) y_lo; + i128->hi += INT64_HI_INT32(tmp); + int128_add_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32); - /* the third term: sign-extend it only if y is negative */ - tmp = x_l32 * y_u32; - if (y < 0) - i128->hi += INT64_AU32(tmp); - else - i128->hi += ((uint64) tmp) >> 32; - int128_add_uint64(i128, ((uint64) INT64_AL32(tmp)) << 32); + /* the third term: sign-extended with the sign of y */ + tmp = (int64) x_lo * (int64) y_hi; + i128->hi += INT64_HI_INT32(tmp); + int128_add_uint64(i128, ((uint64) INT64_LO_UINT32(tmp)) << 32); /* the fourth term: always unsigned */ - int128_add_uint64(i128, x_l32 * y_l32); + int128_add_uint64(i128, (uint64) x_lo * (uint64) y_lo); } +#endif } /* @@ -237,6 +185,13 @@ int128_add_int64_mul_int64(INT128 *i128, int64 x, int64 y) static inline int int128_compare(INT128 x, INT128 y) { +#if USE_NATIVE_INT128 + if (x < y) + return -1; + if (x > y) + return 1; + return 0; +#else if (x.hi < y.hi) return -1; if (x.hi > y.hi) @@ -246,6 +201,7 @@ int128_compare(INT128 x, INT128 y) if (x.lo > y.lo) return 1; return 0; +#endif } /* @@ -254,11 +210,15 @@ int128_compare(INT128 x, INT128 y) static inline INT128 int64_to_int128(int64 v) { +#if USE_NATIVE_INT128 + return (INT128) v; +#else INT128 val; val.lo = (uint64) v; val.hi = (v < 0) ? -INT64CONST(1) : INT64CONST(0); return val; +#endif } /* @@ -268,9 +228,11 @@ int64_to_int128(int64 v) static inline int64 int128_to_int64(INT128 val) { +#if USE_NATIVE_INT128 + return (int64) val; +#else return (int64) val.lo; +#endif } -#endif /* USE_NATIVE_INT128 */ - #endif /* INT128_H */ diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index ba53305ad42..86db3dc8d0d 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -35,7 +35,6 @@ typedef struct QueryDesc /* These fields are provided by CreateQueryDesc */ CmdType operation; /* CMD_SELECT, CMD_UPDATE, etc. 
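[Before moving on, it is worth spelling out the identity behind the four partial products in the rewritten int128_add_int64_mul_int64 above. Splitting each operand at bit 32, with $x_{hi} = x \gg 32$ (arithmetic shift) and $x_{lo}$ the low 32 bits taken unsigned:

$$x \cdot y \;=\; 2^{64}\,x_{hi} y_{hi} \;+\; 2^{32}\,x_{hi} y_{lo} \;+\; 2^{32}\,x_{lo} y_{hi} \;+\; x_{lo} y_{lo}$$

The first term affects only the .hi word. Each cross term is an int32-by-uint32 product whose magnitude stays below $2^{63}$ and therefore fits in int64; its sign-extended high half (via INT64_HI_INT32) is added to .hi, while its low half is shifted into place and added with carry. The last term is purely unsigned, which is why it goes through int128_add_uint64.]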
*/ PlannedStmt *plannedstmt; /* planner's output (could be utility, too) */ - CachedPlan *cplan; /* CachedPlan that supplies the plannedstmt */ const char *sourceText; /* source text of the query */ Snapshot snapshot; /* snapshot to use for query */ Snapshot crosscheck_snapshot; /* crosscheck for RI update/delete */ @@ -58,7 +57,6 @@ typedef struct QueryDesc /* in pquery.c */ extern QueryDesc *CreateQueryDesc(PlannedStmt *plannedstmt, - CachedPlan *cplan, const char *sourceText, Snapshot snapshot, Snapshot crosscheck_snapshot, diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index ae99407db89..a71502efeed 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -14,12 +14,12 @@ #ifndef EXECUTOR_H #define EXECUTOR_H +#include "datatype/timestamp.h" #include "executor/execdesc.h" #include "fmgr.h" #include "nodes/lockoptions.h" #include "nodes/parsenodes.h" #include "utils/memutils.h" -#include "utils/plancache.h" /* @@ -73,7 +73,7 @@ /* Hook for plugins to get control in ExecutorStart() */ -typedef bool (*ExecutorStart_hook_type) (QueryDesc *queryDesc, int eflags); +typedef void (*ExecutorStart_hook_type) (QueryDesc *queryDesc, int eflags); extern PGDLLIMPORT ExecutorStart_hook_type ExecutorStart_hook; /* Hook for plugins to get control in ExecutorRun() */ @@ -229,11 +229,8 @@ ExecGetJunkAttribute(TupleTableSlot *slot, AttrNumber attno, bool *isNull) /* * prototypes from functions in execMain.c */ -extern bool ExecutorStart(QueryDesc *queryDesc, int eflags); -extern void ExecutorStartCachedPlan(QueryDesc *queryDesc, int eflags, - CachedPlanSource *plansource, - int query_index); -extern bool standard_ExecutorStart(QueryDesc *queryDesc, int eflags); +extern void ExecutorStart(QueryDesc *queryDesc, int eflags); +extern void standard_ExecutorStart(QueryDesc *queryDesc, int eflags); extern void ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count); extern void standard_ExecutorRun(QueryDesc *queryDesc, @@ -300,30 +297,6 @@ extern void ExecEndNode(PlanState *node); extern void ExecShutdownNode(PlanState *node); extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node); -/* - * Is the CachedPlan in es_cachedplan still valid? - * - * Called from InitPlan() because invalidation messages that affect the plan - * might be received after locks have been taken on runtime-prunable relations. - * The caller should take appropriate action if the plan has become invalid. - */ -static inline bool -ExecPlanStillValid(EState *estate) -{ - return estate->es_cachedplan == NULL ? true : - CachedPlanValid(estate->es_cachedplan); -} - -/* - * Locks are needed only if running a cached plan that might contain unlocked - * relations, such as a reused generic plan. - */ -static inline bool -ExecShouldLockRelations(EState *estate) -{ - return estate->es_cachedplan == NULL ? 
false : - CachedPlanRequiresLocking(estate->es_cachedplan); -} /* ---------------------------------------------------------------- * ExecProcNode @@ -787,7 +760,18 @@ extern bool RelationFindReplTupleByIndex(Relation rel, Oid idxoid, TupleTableSlot *outslot); extern bool RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode, TupleTableSlot *searchslot, TupleTableSlot *outslot); - +extern bool RelationFindDeletedTupleInfoSeq(Relation rel, + TupleTableSlot *searchslot, + TransactionId oldestxmin, + TransactionId *delete_xid, + RepOriginId *delete_origin, + TimestampTz *delete_time); +extern bool RelationFindDeletedTupleInfoByIndex(Relation rel, Oid idxoid, + TupleTableSlot *searchslot, + TransactionId oldestxmin, + TransactionId *delete_xid, + RepOriginId *delete_origin, + TimestampTz *delete_time); extern void ExecSimpleRelationInsert(ResultRelInfo *resultRelInfo, EState *estate, TupleTableSlot *slot); extern void ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h index 34b82d0f5d1..6c4891bbaeb 100644 --- a/src/include/executor/nodeAgg.h +++ b/src/include/executor/nodeAgg.h @@ -264,7 +264,7 @@ typedef struct AggStatePerGroupData * NULL and not auto-replace it with a later input value. Only the first * non-NULL input will be auto-substituted. */ -} AggStatePerGroupData; +} AggStatePerGroupData; /* * AggStatePerPhaseData - per-grouping-set-phase state diff --git a/src/include/libpq/libpq-be-fe-helpers.h b/src/include/libpq/libpq-be-fe-helpers.h index 16205b824fa..1c4a342090c 100644 --- a/src/include/libpq/libpq-be-fe-helpers.h +++ b/src/include/libpq/libpq-be-fe-helpers.h @@ -30,17 +30,7 @@ #ifndef LIBPQ_BE_FE_HELPERS_H #define LIBPQ_BE_FE_HELPERS_H -/* - * Despite the name, BUILDING_DLL is set only when building code directly part - * of the backend. Which also is where libpq isn't allowed to be - * used. Obviously this doesn't protect against libpq-fe.h getting included - * otherwise, but perhaps still protects against a few mistakes... - */ -#ifdef BUILDING_DLL -#error "libpq may not be used code directly built into the backend" -#endif - -#include "libpq-fe.h" +#include "libpq/libpq-be-fe.h" #include "miscadmin.h" #include "storage/fd.h" #include "storage/latch.h" @@ -289,41 +279,30 @@ libpqsrv_exec_params(PGconn *conn, static inline PGresult * libpqsrv_get_result_last(PGconn *conn, uint32 wait_event_info) { - PGresult *volatile lastResult = NULL; + PGresult *lastResult = NULL; - /* In what follows, do not leak any PGresults on an error. */ - PG_TRY(); + for (;;) { - for (;;) - { - /* Wait for, and collect, the next PGresult. */ - PGresult *result; - - result = libpqsrv_get_result(conn, wait_event_info); - if (result == NULL) - break; /* query is complete, or failure */ + /* Wait for, and collect, the next PGresult. */ + PGresult *result; - /* - * Emulate PQexec()'s behavior of returning the last result when - * there are many. - */ - PQclear(lastResult); - lastResult = result; + result = libpqsrv_get_result(conn, wait_event_info); + if (result == NULL) + break; /* query is complete, or failure */ - if (PQresultStatus(lastResult) == PGRES_COPY_IN || - PQresultStatus(lastResult) == PGRES_COPY_OUT || - PQresultStatus(lastResult) == PGRES_COPY_BOTH || - PQstatus(conn) == CONNECTION_BAD) - break; - } - } - PG_CATCH(); - { + /* + * Emulate PQexec()'s behavior of returning the last result when there + * are many. 
+ */ PQclear(lastResult); - PG_RE_THROW(); - } - PG_END_TRY(); + lastResult = result; + if (PQresultStatus(lastResult) == PGRES_COPY_IN || + PQresultStatus(lastResult) == PGRES_COPY_OUT || + PQresultStatus(lastResult) == PGRES_COPY_BOTH || + PQstatus(conn) == CONNECTION_BAD) + break; + } return lastResult; } @@ -454,4 +433,45 @@ exit: ; return error; } +/* + * libpqsrv_notice_receiver + * + * Custom notice receiver for libpq connections. + * + * This function is intended to be set via PQsetNoticeReceiver() so that + * NOTICE, WARNING, and similar messages from the connection are reported via + * ereport(), instead of being printed to stderr. + * + * Because this will be called from libpq with a "real" (not wrapped) + * PGresult, we need to temporarily ignore libpq-be-fe.h's wrapper macros + * for PGresult and also PQresultErrorMessage, and put back the wrappers + * afterwards. That's not pretty, but there seems no better alternative. + */ +#undef PGresult +#undef PQresultErrorMessage + +static inline void +libpqsrv_notice_receiver(void *arg, const PGresult *res) +{ + const char *message; + int len; + const char *prefix = (const char *) arg; + + /* + * Trim the trailing newline from the message text returned from + * PQresultErrorMessage(), as it always includes one, to produce cleaner + * log output. + */ + message = PQresultErrorMessage(res); + len = strlen(message); + if (len > 0 && message[len - 1] == '\n') + len--; + + ereport(LOG, + errmsg_internal("%s: %.*s", prefix, len, message)); +} + +#define PGresult libpqsrv_PGresult +#define PQresultErrorMessage libpqsrv_PQresultErrorMessage + #endif /* LIBPQ_BE_FE_HELPERS_H */ diff --git a/src/include/libpq/libpq-be-fe.h b/src/include/libpq/libpq-be-fe.h new file mode 100644 index 00000000000..e3f796b0230 --- /dev/null +++ b/src/include/libpq/libpq-be-fe.h @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * libpq-be-fe.h + * Wrapper functions for using libpq in extensions + * + * Code built directly into the backend is not allowed to link to libpq + * directly. Extension code is allowed to use libpq however. One of the + * main risks in doing so is leaking the malloc-allocated structures + * returned by libpq, causing a process-lifespan memory leak. + * + * This file provides wrapper objects to help in building memory-safe code. + * A PGresult object wrapped this way acts much as if it were palloc'd: + * it will go away when the specified context is reset or deleted. + * We might later extend the concept to other objects such as PGconns. + * + * See also the libpq-be-fe-helpers.h file, which provides additional + * facilities built on top of this one. + * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/libpq/libpq-be-fe.h + * + *------------------------------------------------------------------------- + */ +#ifndef LIBPQ_BE_FE_H +#define LIBPQ_BE_FE_H + +/* + * Despite the name, BUILDING_DLL is set only when building code directly part + * of the backend. Which also is where libpq isn't allowed to be + * used. Obviously this doesn't protect against libpq-fe.h getting included + * otherwise, but perhaps still protects against a few mistakes... + */ +#ifdef BUILDING_DLL +#error "libpq may not be used in code directly built into the backend" +#endif + +#include "libpq-fe.h" + +/* + * Memory-context-safe wrapper object for a PGresult. 
+ */ +typedef struct libpqsrv_PGresult +{ + PGresult *res; /* the wrapped PGresult */ + MemoryContext ctx; /* the MemoryContext it's attached to */ + MemoryContextCallback cb; /* the callback that implements freeing */ +} libpqsrv_PGresult; + + +/* + * Wrap the given PGresult in a libpqsrv_PGresult object, so that it will + * go away automatically if the current memory context is reset or deleted. + * + * To avoid potential memory leaks, backend code must always apply this + * immediately to the output of any PGresult-yielding libpq function. + */ +static inline libpqsrv_PGresult * +libpqsrv_PQwrap(PGresult *res) +{ + libpqsrv_PGresult *bres; + MemoryContext ctx = CurrentMemoryContext; + + /* We pass through a NULL result as-is, since there's nothing to free */ + if (res == NULL) + return NULL; + /* Attempt to allocate the wrapper ... this had better not throw error */ + bres = (libpqsrv_PGresult *) + MemoryContextAllocExtended(ctx, + sizeof(libpqsrv_PGresult), + MCXT_ALLOC_NO_OOM); + /* If we failed to allocate a wrapper, free the PGresult before failing */ + if (bres == NULL) + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + /* Okay, set up the wrapper */ + bres->res = res; + bres->ctx = ctx; + bres->cb.func = (MemoryContextCallbackFunction) PQclear; + bres->cb.arg = res; + MemoryContextRegisterResetCallback(ctx, &bres->cb); + return bres; +} + +/* + * Free a wrapped PGresult, after detaching it from the memory context. + * Like PQclear(), allow the argument to be NULL. + */ +static inline void +libpqsrv_PQclear(libpqsrv_PGresult *bres) +{ + if (bres) + { + MemoryContextUnregisterResetCallback(bres->ctx, &bres->cb); + PQclear(bres->res); + pfree(bres); + } +} + +/* + * Move a wrapped PGresult to have a different parent context. + */ +static inline libpqsrv_PGresult * +libpqsrv_PGresultSetParent(libpqsrv_PGresult *bres, MemoryContext ctx) +{ + libpqsrv_PGresult *newres; + + /* We pass through a NULL result as-is */ + if (bres == NULL) + return NULL; + /* Make a new wrapper in the target context, raising error on OOM */ + newres = (libpqsrv_PGresult *) + MemoryContextAlloc(ctx, sizeof(libpqsrv_PGresult)); + /* Okay, set up the new wrapper */ + newres->res = bres->res; + newres->ctx = ctx; + newres->cb.func = (MemoryContextCallbackFunction) PQclear; + newres->cb.arg = bres->res; + MemoryContextRegisterResetCallback(ctx, &newres->cb); + /* Disarm and delete the old wrapper */ + MemoryContextUnregisterResetCallback(bres->ctx, &bres->cb); + pfree(bres); + return newres; +} + +/* + * Convenience wrapper for PQgetResult. + * + * We could supply wrappers for other PGresult-returning functions too, + * but at present there's no need. + */ +static inline libpqsrv_PGresult * +libpqsrv_PQgetResult(PGconn *conn) +{ + return libpqsrv_PQwrap(PQgetResult(conn)); +} + +/* + * Accessor functions for libpqsrv_PGresult. While it's not necessary to use + * these, they emulate the behavior of the underlying libpq functions when + * passed a NULL pointer. This is particularly important for PQresultStatus, + * which is often the first check on a result. 
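A minimal sketch of the intended usage, assuming an established PGconn *conn: the NULL-tolerant accessors that follow make the usual check-then-report pattern safe without an explicit NULL test, and the wrapper's reset callback ensures nothing leaks if ereport() throws.

PGresult   *res = PQgetResult(conn);	/* really a libpqsrv_PGresult */

if (PQresultStatus(res) != PGRES_TUPLES_OK)	/* safe even if res is NULL */
	ereport(ERROR,
			errmsg("remote query failed: %s",
				   PQresultErrorMessage(res)));

/* Explicit free on success; on the ERROR path the context reset callback
 * registered by libpqsrv_PQwrap() frees the underlying PGresult. */
PQclear(res);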
+ */ + +static inline ExecStatusType +libpqsrv_PQresultStatus(const libpqsrv_PGresult *res) +{ + if (!res) + return PGRES_FATAL_ERROR; + return PQresultStatus(res->res); +} + +static inline const char * +libpqsrv_PQresultErrorMessage(const libpqsrv_PGresult *res) +{ + if (!res) + return ""; + return PQresultErrorMessage(res->res); +} + +static inline char * +libpqsrv_PQresultErrorField(const libpqsrv_PGresult *res, int fieldcode) +{ + if (!res) + return NULL; + return PQresultErrorField(res->res, fieldcode); +} + +static inline char * +libpqsrv_PQcmdStatus(const libpqsrv_PGresult *res) +{ + if (!res) + return NULL; + return PQcmdStatus(res->res); +} + +static inline int +libpqsrv_PQntuples(const libpqsrv_PGresult *res) +{ + if (!res) + return 0; + return PQntuples(res->res); +} + +static inline int +libpqsrv_PQnfields(const libpqsrv_PGresult *res) +{ + if (!res) + return 0; + return PQnfields(res->res); +} + +static inline char * +libpqsrv_PQgetvalue(const libpqsrv_PGresult *res, int tup_num, int field_num) +{ + if (!res) + return NULL; + return PQgetvalue(res->res, tup_num, field_num); +} + +static inline int +libpqsrv_PQgetlength(const libpqsrv_PGresult *res, int tup_num, int field_num) +{ + if (!res) + return 0; + return PQgetlength(res->res, tup_num, field_num); +} + +static inline int +libpqsrv_PQgetisnull(const libpqsrv_PGresult *res, int tup_num, int field_num) +{ + if (!res) + return 1; /* pretend it is null */ + return PQgetisnull(res->res, tup_num, field_num); +} + +static inline char * +libpqsrv_PQfname(const libpqsrv_PGresult *res, int field_num) +{ + if (!res) + return NULL; + return PQfname(res->res, field_num); +} + +static inline const char * +libpqsrv_PQcmdTuples(const libpqsrv_PGresult *res) +{ + if (!res) + return ""; + return PQcmdTuples(res->res); +} + +/* + * Redefine these libpq entry point names concerned with PGresults so that + * they will operate on libpqsrv_PGresults instead. This avoids needing to + * convert a lot of pre-existing code, and reduces the notational differences + * between frontend and backend libpq-using code. + */ +#define PGresult libpqsrv_PGresult +#define PQclear libpqsrv_PQclear +#define PQgetResult libpqsrv_PQgetResult +#define PQresultStatus libpqsrv_PQresultStatus +#define PQresultErrorMessage libpqsrv_PQresultErrorMessage +#define PQresultErrorField libpqsrv_PQresultErrorField +#define PQcmdStatus libpqsrv_PQcmdStatus +#define PQntuples libpqsrv_PQntuples +#define PQnfields libpqsrv_PQnfields +#define PQgetvalue libpqsrv_PQgetvalue +#define PQgetlength libpqsrv_PQgetlength +#define PQgetisnull libpqsrv_PQgetisnull +#define PQfname libpqsrv_PQfname +#define PQcmdTuples libpqsrv_PQcmdTuples + +#endif /* LIBPQ_BE_FE_H */ diff --git a/src/include/libpq/protocol.h b/src/include/libpq/protocol.h index b0bcb3cdc26..c64e628628d 100644 --- a/src/include/libpq/protocol.h +++ b/src/include/libpq/protocol.h @@ -69,6 +69,27 @@ #define PqMsg_Progress 'P' +/* Replication codes sent by the primary (wrapped in CopyData messages). */ + +#define PqReplMsg_Keepalive 'k' +#define PqReplMsg_PrimaryStatusUpdate 's' +#define PqReplMsg_WALData 'w' + + +/* Replication codes sent by the standby (wrapped in CopyData messages). */ + +#define PqReplMsg_HotStandbyFeedback 'h' +#define PqReplMsg_PrimaryStatusRequest 'p' +#define PqReplMsg_StandbyStatusUpdate 'r' + + +/* Codes used for backups via COPY OUT (wrapped in CopyData messages). 
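For context, a hedged sketch (buf is an assumed pointer to a received CopyData payload) of the kind of dispatch these one-byte symbolic names are meant to serve, where bare character literals were used before:

switch (buf[0])
{
	case PqReplMsg_WALData:
		/* append the streamed WAL data */
		break;
	case PqReplMsg_Keepalive:
		/* optionally reply with a PqReplMsg_StandbyStatusUpdate message */
		break;
	default:
		ereport(ERROR,
				errmsg("invalid replication message type %d", buf[0]));
}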
*/ + +#define PqBackupMsg_Manifest 'm' +#define PqBackupMsg_NewArchive 'n' +#define PqBackupMsg_ProgressReport 'p' + + /* These are the authentication request codes sent by the backend. */ #define AUTH_REQ_OK 0 /* User is authenticated */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1e59a7f910f..1bef98471c3 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -96,7 +96,6 @@ extern PGDLLIMPORT volatile sig_atomic_t IdleSessionTimeoutPending; extern PGDLLIMPORT volatile sig_atomic_t ProcSignalBarrierPending; extern PGDLLIMPORT volatile sig_atomic_t LogMemoryContextPending; extern PGDLLIMPORT volatile sig_atomic_t IdleStatsUpdateTimeoutPending; -extern PGDLLIMPORT volatile sig_atomic_t PublishMemoryContextPending; extern PGDLLIMPORT volatile sig_atomic_t CheckClientConnectionPending; extern PGDLLIMPORT volatile sig_atomic_t ClientConnectionLost; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5b6cadb5a6c..e107d6e5f81 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -42,7 +42,6 @@ #include "storage/condition_variable.h" #include "utils/hsearch.h" #include "utils/queryenvironment.h" -#include "utils/plancache.h" #include "utils/reltrigger.h" #include "utils/sharedtuplestore.h" #include "utils/snapshot.h" @@ -158,34 +157,6 @@ typedef struct ExprState * entries for a particular index. Used for both index_build and * retail creation of index entries. * - * NumIndexAttrs total number of columns in this index - * NumIndexKeyAttrs number of key columns in index - * IndexAttrNumbers underlying-rel attribute numbers used as keys - * (zeroes indicate expressions). It also contains - * info about included columns. - * Expressions expr trees for expression entries, or NIL if none - * ExpressionsState exec state for expressions, or NIL if none - * Predicate partial-index predicate, or NIL if none - * PredicateState exec state for predicate, or NIL if none - * ExclusionOps Per-column exclusion operators, or NULL if none - * ExclusionProcs Underlying function OIDs for ExclusionOps - * ExclusionStrats Opclass strategy numbers for ExclusionOps - * UniqueOps These are like Exclusion*, but for unique indexes - * UniqueProcs - * UniqueStrats - * Unique is it a unique index? - * OpclassOptions opclass-specific options, or NULL if none - * ReadyForInserts is it valid for inserts? - * CheckedUnchanged IndexUnchanged status determined yet? - * IndexUnchanged aminsert hint, cached for retail inserts - * Concurrent are we doing a concurrent index build? - * BrokenHotChain did we detect any broken HOT chains? - * Summarizing is it a summarizing index? - * ParallelWorkers # of workers requested (excludes leader) - * Am Oid of index AM - * AmCache private cache area for index AM - * Context memory context holding this IndexInfo - * * ii_Concurrent, ii_BrokenHotChain, and ii_ParallelWorkers are used only * during index build; they're conventionally zeroed otherwise. * ---------------- @@ -193,31 +164,67 @@ typedef struct ExprState typedef struct IndexInfo { NodeTag type; - int ii_NumIndexAttrs; /* total number of columns in index */ - int ii_NumIndexKeyAttrs; /* number of key columns in index */ + + /* total number of columns in index */ + int ii_NumIndexAttrs; + /* number of key columns in index */ + int ii_NumIndexKeyAttrs; + + /* + * Underlying-rel attribute numbers used as keys (zeroes indicate + * expressions). It also contains info about included columns. 
+ */ AttrNumber ii_IndexAttrNumbers[INDEX_MAX_KEYS]; + + /* expr trees for expression entries, or NIL if none */ List *ii_Expressions; /* list of Expr */ + /* exec state for expressions, or NIL if none */ List *ii_ExpressionsState; /* list of ExprState */ + + /* partial-index predicate, or NIL if none */ List *ii_Predicate; /* list of Expr */ + /* exec state for predicate, or NIL if none */ ExprState *ii_PredicateState; + + /* Per-column exclusion operators, or NULL if none */ Oid *ii_ExclusionOps; /* array with one entry per column */ + /* Underlying function OIDs for ExclusionOps */ Oid *ii_ExclusionProcs; /* array with one entry per column */ + /* Opclass strategy numbers for ExclusionOps */ uint16 *ii_ExclusionStrats; /* array with one entry per column */ + + /* These are like Exclusion*, but for unique indexes */ Oid *ii_UniqueOps; /* array with one entry per column */ Oid *ii_UniqueProcs; /* array with one entry per column */ uint16 *ii_UniqueStrats; /* array with one entry per column */ + + /* is it a unique index? */ bool ii_Unique; + /* is NULLS NOT DISTINCT? */ bool ii_NullsNotDistinct; + /* is it valid for inserts? */ bool ii_ReadyForInserts; + /* IndexUnchanged status determined yet? */ bool ii_CheckedUnchanged; + /* aminsert hint, cached for retail inserts */ bool ii_IndexUnchanged; + /* are we doing a concurrent index build? */ bool ii_Concurrent; + /* did we detect any broken HOT chains? */ bool ii_BrokenHotChain; + /* is it a summarizing index? */ bool ii_Summarizing; + /* is it a WITHOUT OVERLAPS index? */ bool ii_WithoutOverlaps; + /* # of workers requested (excludes leader) */ int ii_ParallelWorkers; + + /* Oid of index AM */ Oid ii_Am; + /* private cache area for index AM */ void *ii_AmCache; + + /* memory context holding this IndexInfo */ MemoryContext ii_Context; } IndexInfo; @@ -664,7 +671,6 @@ typedef struct EState * ExecRowMarks, or NULL if none */ List *es_rteperminfos; /* List of RTEPermissionInfo */ PlannedStmt *es_plannedstmt; /* link to top of plan tree */ - CachedPlan *es_cachedplan; /* CachedPlan providing the plan tree */ List *es_part_prune_infos; /* List of PartitionPruneInfo */ List *es_part_prune_states; /* List of PartitionPruneState */ List *es_part_prune_results; /* List of Bitmapset */ @@ -717,7 +723,6 @@ typedef struct EState int es_top_eflags; /* eflags passed to ExecutorStart */ int es_instrument; /* OR of InstrumentOption flags */ bool es_finished; /* true when ExecutorFinish is done */ - bool es_aborted; /* true when execution was aborted */ List *es_exprcontexts; /* List of ExprContexts within EState */ diff --git a/src/include/nodes/meson.build b/src/include/nodes/meson.build index d1ca24dd32f..ea36cb0fda4 100644 --- a/src/include/nodes/meson.build +++ b/src/include/nodes/meson.build @@ -28,7 +28,7 @@ node_support_input_i = [ node_support_input = [] foreach i : node_support_input_i - node_support_input += meson.source_root() / 'src' / 'include' / i + node_support_input += meson.project_source_root() / 'src' / 'include' / i endforeach node_support_output = [ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 4610fc61293..86a236bd58b 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -127,8 +127,13 @@ typedef struct Query * query identifier (can be set by plugins); ignored for equal, as it * might not be set; also not stored. This is the result of the query * jumble, hence ignored. 
+ * + * We store this as a signed value as this is the form it's displayed to + * users in places such as EXPLAIN and pg_stat_statements. Primarily this + * is done due to lack of an SQL type to represent the full range of + * uint64. */ - uint64 queryId pg_node_attr(equal_ignore, query_jumble_ignore, read_write_ignore, read_as(0)); + int64 queryId pg_node_attr(equal_ignore, query_jumble_ignore, read_write_ignore, read_as(0)); /* do I set the command result tag? */ bool canSetTag pg_node_attr(query_jumble_ignore); @@ -346,6 +351,14 @@ typedef struct A_Expr List *name; /* possibly-qualified name of operator */ Node *lexpr; /* left argument, or NULL if none */ Node *rexpr; /* right argument, or NULL if none */ + + /* + * If rexpr is a list of some kind, we separately track its starting and + * ending location; it's not the same as the starting and ending location + * of the token itself. + */ + ParseLoc rexpr_list_start; + ParseLoc rexpr_list_end; ParseLoc location; /* token location, or -1 if unknown */ } A_Expr; @@ -501,6 +514,8 @@ typedef struct A_ArrayExpr { NodeTag type; List *elements; /* array element expressions */ + ParseLoc list_start; /* start of the element list */ + ParseLoc list_end; /* end of the elements list */ ParseLoc location; /* token location, or -1 if unknown */ } A_ArrayExpr; @@ -2095,8 +2110,6 @@ typedef struct InsertStmt ReturningClause *returningClause; /* RETURNING clause */ WithClause *withClause; /* WITH clause */ OverridingKind override; /* OVERRIDING clause */ - ParseLoc stmt_location; /* start location, or -1 if unknown */ - ParseLoc stmt_len; /* length in bytes; 0 means "rest of string" */ } InsertStmt; /* ---------------------- @@ -2111,8 +2124,6 @@ typedef struct DeleteStmt Node *whereClause; /* qualifications */ ReturningClause *returningClause; /* RETURNING clause */ WithClause *withClause; /* WITH clause */ - ParseLoc stmt_location; /* start location, or -1 if unknown */ - ParseLoc stmt_len; /* length in bytes; 0 means "rest of string" */ } DeleteStmt; /* ---------------------- @@ -2128,8 +2139,6 @@ typedef struct UpdateStmt List *fromClause; /* optional from clause for more tables */ ReturningClause *returningClause; /* RETURNING clause */ WithClause *withClause; /* WITH clause */ - ParseLoc stmt_location; /* start location, or -1 if unknown */ - ParseLoc stmt_len; /* length in bytes; 0 means "rest of string" */ } UpdateStmt; /* ---------------------- @@ -2145,8 +2154,6 @@ typedef struct MergeStmt List *mergeWhenClauses; /* list of MergeWhenClause(es) */ ReturningClause *returningClause; /* RETURNING clause */ WithClause *withClause; /* WITH clause */ - ParseLoc stmt_location; /* start location, or -1 if unknown */ - ParseLoc stmt_len; /* length in bytes; 0 means "rest of string" */ } MergeStmt; /* ---------------------- @@ -2216,8 +2223,6 @@ typedef struct SelectStmt bool all; /* ALL specified? */ struct SelectStmt *larg; /* left child */ struct SelectStmt *rarg; /* right child */ - ParseLoc stmt_location; /* start location, or -1 if unknown */ - ParseLoc stmt_len; /* length in bytes; 0 means "rest of string" */ /* Eventually add fields for CORRESPONDING spec here */ } SelectStmt; @@ -2531,17 +2536,20 @@ typedef struct AlterCollationStmt * this command. 
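A speculative illustration of the benefit of the enum introduced just below (stmt is an assumed AlterDomainStmt pointer): with named values instead of bare characters, a switch with no default draws a compiler warning if a new subtype is ever added.

switch (stmt->subtype)
{
	case AD_AlterDefault:
	case AD_DropNotNull:
	case AD_SetNotNull:
		/* ... handle column-level changes ... */
		break;
	case AD_AddConstraint:
	case AD_DropConstraint:
	case AD_ValidateConstraint:
		/* ... handle constraint-level changes ... */
		break;
}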
* ---------------------- */ +typedef enum AlterDomainType +{ + AD_AlterDefault = 'T', /* SET|DROP DEFAULT */ + AD_DropNotNull = 'N', /* DROP NOT NULL */ + AD_SetNotNull = 'O', /* SET NOT NULL */ + AD_AddConstraint = 'C', /* ADD CONSTRAINT */ + AD_DropConstraint = 'X', /* DROP CONSTRAINT */ + AD_ValidateConstraint = 'V', /* VALIDATE CONSTRAINT */ +} AlterDomainType; + typedef struct AlterDomainStmt { NodeTag type; - char subtype; /*------------ - * T = alter column default - * N = alter column drop not null - * O = alter column set not null - * C = add constraint - * X = drop constraint - *------------ - */ + AlterDomainType subtype; /* subtype of command */ List *typeName; /* domain to work on */ char *name; /* column or constraint name to act on */ Node *def; /* definition of default or constraint */ @@ -3417,15 +3425,44 @@ typedef enum FetchDirection FETCH_RELATIVE, } FetchDirection; +typedef enum FetchDirectionKeywords +{ + FETCH_KEYWORD_NONE = 0, + FETCH_KEYWORD_NEXT, + FETCH_KEYWORD_PRIOR, + FETCH_KEYWORD_FIRST, + FETCH_KEYWORD_LAST, + FETCH_KEYWORD_ABSOLUTE, + FETCH_KEYWORD_RELATIVE, + FETCH_KEYWORD_ALL, + FETCH_KEYWORD_FORWARD, + FETCH_KEYWORD_FORWARD_ALL, + FETCH_KEYWORD_BACKWARD, + FETCH_KEYWORD_BACKWARD_ALL, +} FetchDirectionKeywords; + #define FETCH_ALL LONG_MAX typedef struct FetchStmt { NodeTag type; FetchDirection direction; /* see above */ - long howMany; /* number of rows, or position argument */ - char *portalname; /* name of portal (cursor) */ - bool ismove; /* true if MOVE */ + /* number of rows, or position argument */ + long howMany pg_node_attr(query_jumble_ignore); + /* name of portal (cursor) */ + char *portalname; + /* true if MOVE */ + bool ismove; + + /* + * Set when a direction_keyword (e.g., FETCH FORWARD) is used, to + * distinguish it from a numeric variant (e.g., FETCH 1) for the purpose + * of query jumbling. 
+ */ + FetchDirectionKeywords direction_keyword; + + /* token location, or -1 if unknown */ + ParseLoc location pg_node_attr(query_jumble_location); } FetchStmt; /* ---------------------- @@ -4010,6 +4047,7 @@ typedef struct RefreshMatViewStmt typedef struct CheckPointStmt { NodeTag type; + List *options; /* list of DefElem nodes */ } CheckPointStmt; /* ---------------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 1dd2d1560cb..ad2726f026f 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -138,9 +138,6 @@ typedef struct PlannerGlobal /* "flat" list of integer RT indexes */ List *resultRelations; - /* "flat" list of integer RT indexes (one per ModifyTable node) */ - List *firstResultRels; - /* "flat" list of AppendRelInfos */ List *appendRelations; @@ -182,6 +179,9 @@ typedef struct PlannerGlobal /* partition descriptors */ PartitionDirectory partition_directory pg_node_attr(read_write_ignore); + + /* hash table for NOT NULL attnums of relations */ + struct HTAB *rel_notnullatts_hash pg_node_attr(read_write_ignore); } PlannerGlobal; /* macro for fetching the Plan associated with a SubPlan node */ @@ -722,6 +722,9 @@ typedef struct PartitionSchemeData *PartitionScheme; * the attribute is needed as part of final targetlist * attr_widths - cache space for per-attribute width estimates; * zero means not computed yet + * notnullattnums - zero-based set containing attnums of NOT NULL + * columns (not populated for rels corresponding to + * non-partitioned inh==true RTEs) * nulling_relids - relids of outer joins that can null this rel * lateral_vars - lateral cross-references of rel, if any (list of * Vars and PlaceHolderVars) @@ -955,11 +958,7 @@ typedef struct RelOptInfo Relids *attr_needed pg_node_attr(read_write_ignore); /* array indexed [min_attr .. max_attr] */ int32 *attr_widths pg_node_attr(read_write_ignore); - - /* - * Zero-based set containing attnums of NOT NULL columns. Not populated - * for rels corresponding to non-partitioned inh==true RTEs. - */ + /* zero-based set containing attnums of NOT NULL columns */ Bitmapset *notnullattnums; /* relids of outer joins that can null this baserel */ Relids nulling_relids; @@ -2134,10 +2133,12 @@ typedef struct MemoizePath * complete after caching the first record. */ bool binary_mode; /* true when cache key should be compared bit * by bit, false when using hash equality ops */ - Cardinality calls; /* expected number of rescans */ uint32 est_entries; /* The maximum number of entries that the * planner expects will fit in the cache, or 0 * if unknown */ + Cardinality est_calls; /* expected number of rescans */ + Cardinality est_unique_keys; /* estimated unique keys, for EXPLAIN */ + double est_hit_ratio; /* estimated cache hit ratio, for EXPLAIN */ } MemoizePath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 658d76225e4..29d7732d6a0 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -29,6 +29,21 @@ */ /* ---------------- + * PlannedStmtOrigin + * + * PlannedStmtOrigin identifies where a PlannedStmt comes from. 
+ * ---------------- + */ +typedef enum PlannedStmtOrigin +{ + PLAN_STMT_UNKNOWN = 0, /* plan origin is not yet known */ + PLAN_STMT_INTERNAL, /* generated internally by a query */ + PLAN_STMT_STANDARD, /* standard planned statement */ + PLAN_STMT_CACHE_GENERIC, /* Generic cached plan */ + PLAN_STMT_CACHE_CUSTOM, /* Custom cached plan */ +} PlannedStmtOrigin; + +/* ---------------- * PlannedStmt node * * The output of the planner is a Plan tree headed by a PlannedStmt node. @@ -53,10 +68,13 @@ typedef struct PlannedStmt CmdType commandType; /* query identifier (copied from Query) */ - uint64 queryId; + int64 queryId; /* plan identifier (can be set by plugins) */ - uint64 planId; + int64 planId; + + /* origin of plan */ + PlannedStmtOrigin planOrigin; /* is it insert|update|delete|merge RETURNING? */ bool hasReturning; @@ -105,13 +123,6 @@ typedef struct PlannedStmt /* integer list of RT indexes, or NIL */ List *resultRelations; - /* - * rtable indexes of first target relation in each ModifyTable node in the - * plan for INSERT/UPDATE/DELETE/MERGE - */ - /* integer list of RT indexes, or NIL */ - List *firstResultRels; - /* list of AppendRelInfo nodes */ List *appendRelations; @@ -1063,6 +1074,16 @@ typedef struct Memoize /* paramids from param_exprs */ Bitmapset *keyparamids; + + /* Estimated number of rescans, for EXPLAIN */ + Cardinality est_calls; + + /* Estimated number of distinct lookup keys, for EXPLAIN */ + Cardinality est_unique_keys; + + /* Estimated cache hit ratio, for EXPLAIN */ + double est_hit_ratio; + } Memoize; /* ---------------- diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index 7d3b4198f26..6dfca3cb35b 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -389,14 +389,16 @@ typedef enum ParamKind typedef struct Param { + pg_node_attr(custom_query_jumble) + Expr xpr; ParamKind paramkind; /* kind of parameter. See above */ int paramid; /* numeric ID for parameter */ Oid paramtype; /* pg_type OID of parameter's datatype */ /* typmod value, if known */ - int32 paramtypmod pg_node_attr(query_jumble_ignore); + int32 paramtypmod; /* OID of collation, or InvalidOid if none */ - Oid paramcollid pg_node_attr(query_jumble_ignore); + Oid paramcollid; /* token location, or -1 if unknown */ ParseLoc location; } Param; @@ -1397,6 +1399,10 @@ typedef struct ArrayExpr List *elements pg_node_attr(query_jumble_squash); /* true if elements are sub-arrays */ bool multidims pg_node_attr(query_jumble_ignore); + /* location of the start of the elements list */ + ParseLoc list_start; + /* location of the end of the elements list */ + ParseLoc list_end; /* token location, or -1 if unknown */ ParseLoc location; } ArrayExpr; diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h index da7c7abed2e..dcb36dcb44f 100644 --- a/src/include/nodes/queryjumble.h +++ b/src/include/nodes/queryjumble.h @@ -24,11 +24,11 @@ typedef struct LocationLen int location; /* start offset in query text */ int length; /* length in bytes, or -1 to ignore */ - /* - * Indicates that this location represents the beginning or end of a run - * of squashed constants. - */ + /* Does this location represent a squashed list? */ bool squashed; + + /* Is this location a PARAM_EXTERN parameter? 
*/ + bool extern_param; } LocationLen; /* @@ -52,9 +52,18 @@ typedef struct JumbleState /* Current number of valid entries in clocations array */ int clocations_count; - /* highest Param id we've seen, in order to start normalization correctly */ + /* + * ID of the highest PARAM_EXTERN parameter we've seen in the query; used + * to start normalization correctly. However, if there are any squashed + * lists in the query, we disregard query-supplied parameter numbers and + * renumber everything. This is to avoid possible gaps caused by + * squashing in case any params are in squashed lists. + */ int highest_extern_param_id; + /* Whether squashable lists are present */ + bool has_squashed_lists; + /* * Count of the number of NULL nodes seen since last appending a value. * These are flushed out to the jumble buffer before subsequent appends diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index d397fe27dc1..b523bcda8f3 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -118,7 +118,7 @@ extern void cost_incremental_sort(Path *path, Cost input_startup_cost, Cost input_total_cost, double input_tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples); -extern void cost_append(AppendPath *apath); +extern void cost_append(AppendPath *apath, PlannerInfo *root); extern void cost_merge_append(Path *path, PlannerInfo *root, List *pathkeys, int n_streams, int input_disabled_nodes, diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 546828b54bd..37bc13c2cbd 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -154,6 +154,8 @@ extern Node *estimate_expression_value(PlannerInfo *root, Node *node); extern Expr *evaluate_expr(Expr *expr, Oid result_type, int32 result_typmod, Oid result_collation); +extern bool var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info); + extern List *expand_function_arguments(List *args, bool include_out_arguments, Oid result_type, struct HeapTupleData *func_tuple); diff --git a/src/include/optimizer/paramassign.h b/src/include/optimizer/paramassign.h index 59dcb1ff053..bbf7214289b 100644 --- a/src/include/optimizer/paramassign.h +++ b/src/include/optimizer/paramassign.h @@ -30,7 +30,8 @@ extern Param *replace_nestloop_param_placeholdervar(PlannerInfo *root, extern void process_subquery_nestloop_params(PlannerInfo *root, List *subplan_params); extern List *identify_current_nestloop_params(PlannerInfo *root, - Relids leftrelids); + Relids leftrelids, + Relids outerrelids); extern Param *generate_new_exec_param(PlannerInfo *root, Oid paramtype, int32 paramtypmod, Oid paramcollation); extern int assign_special_exec_param(PlannerInfo *root); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 60dcdb77e41..58936e963cb 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -90,7 +90,7 @@ extern MemoizePath *create_memoize_path(PlannerInfo *root, List *hash_operators, bool singlerow, bool binary_mode, - double calls); + Cardinality est_calls); extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, SpecialJoinInfo *sjinfo); extern GatherPath *create_gather_path(PlannerInfo *root, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index a48c9721797..8410531f2d6 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -109,8 +109,6 @@ extern Relids add_outer_joins_to_relids(PlannerInfo 
*root, Relids input_relids, List **pushed_down_joins); extern bool have_join_order_restriction(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2); -extern bool have_dangerous_phv(PlannerInfo *root, - Relids outer_relids, Relids inner_params); extern void mark_dummy_rel(RelOptInfo *rel); extern void init_dummy_sjinfo(SpecialJoinInfo *sjinfo, Relids left_relids, Relids right_relids); diff --git a/src/include/optimizer/placeholder.h b/src/include/optimizer/placeholder.h index d351045e2e0..db92d8861ba 100644 --- a/src/include/optimizer/placeholder.h +++ b/src/include/optimizer/placeholder.h @@ -30,5 +30,7 @@ extern void add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, SpecialJoinInfo *sjinfo); extern bool contain_placeholder_references_to(PlannerInfo *root, Node *clause, int relid); +extern Relids get_placeholder_nulling_relids(PlannerInfo *root, + PlaceHolderInfo *phinfo); #endif /* PLACEHOLDER_H */ diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index cd74e4b1e8b..d6f6f4ad2d7 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -28,6 +28,10 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); +extern void get_relation_notnullatts(PlannerInfo *root, Relation relation); + +extern Relids find_relation_notnullatts(PlannerInfo *root, Oid relid); + extern List *infer_arbiter_indexes(PlannerInfo *root); extern void estimate_rel_size(Relation rel, int32 *attr_widths, diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index df56202777c..4fbecdb4462 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -22,10 +22,10 @@ * prototypes for prepjointree.c */ extern void transform_MERGE_to_join(Query *parse); +extern Query *preprocess_relation_rtes(PlannerInfo *root); extern void replace_empty_jointree(Query *parse); extern void pull_up_sublinks(PlannerInfo *root); extern void preprocess_function_rtes(PlannerInfo *root); -extern Query *expand_virtual_generated_columns(PlannerInfo *root); extern void pull_up_subqueries(PlannerInfo *root); extern void flatten_simple_union_all(PlannerInfo *root); extern void reduce_outer_joins(PlannerInfo *root); diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 994284019fb..f7d07c84542 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -108,20 +108,6 @@ typedef Node *(*CoerceParamHook) (ParseState *pstate, Param *param, * byte-wise locations in parse structures to character-wise cursor * positions.) * - * p_stmt_location: location of the top level RawStmt's start. During - * transformation, the Query's location will be set to the statement's - * location if available. Otherwise, the RawStmt's start location will - * be used. Propagating the location through ParseState is needed for - * the Query length calculation (see p_stmt_len below). - * - * p_stmt_len: length of the top level RawStmt. Most of the time, the - * statement's length is not provided by the parser, with the exception - * of SelectStmt within parentheses and PreparableStmt in COPY. If the - * statement's location is provided by the parser, the top-level location - * and length are needed to accurately compute the Query's length. If the - * statement's location is not provided, the RawStmt's length can be used - * directly. 
- * * p_rtable: list of RTEs that will become the rangetable of the query. * Note that neither relname nor refname of these entries are necessarily * unique; searching the rtable by name is a bad idea. @@ -207,8 +193,6 @@ struct ParseState { ParseState *parentParseState; /* stack link */ const char *p_sourcetext; /* source text, or NULL if not available */ - ParseLoc p_stmt_location; /* start location, or -1 if unknown */ - ParseLoc p_stmt_len; /* length in bytes; 0 means "rest of string" */ List *p_rtable; /* range table so far */ List *p_rteperminfos; /* list of RTEPermissionInfo nodes for each * RTE_RELATION entry in rtable */ diff --git a/src/include/pch/meson.build b/src/include/pch/meson.build index f6babee6f6d..603add1a351 100644 --- a/src/include/pch/meson.build +++ b/src/include/pch/meson.build @@ -1,6 +1,6 @@ # Copyright (c) 2022-2025, PostgreSQL Global Development Group # See https://github.com/mesonbuild/meson/issues/10338 -pch_c_h = meson.source_root() / meson.current_source_dir() / 'c_pch.h' -pch_postgres_h = meson.source_root() / meson.current_source_dir() / 'postgres_pch.h' -pch_postgres_fe_h = meson.source_root() / meson.current_source_dir() / 'postgres_fe_pch.h' +pch_c_h = meson.project_source_root() / meson.current_source_dir() / 'c_pch.h' +pch_postgres_h = meson.project_source_root() / meson.current_source_dir() / 'postgres_pch.h' +pch_postgres_fe_h = meson.project_source_root() / meson.current_source_dir() / 'postgres_fe_pch.h' diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 726a7c1be1f..c4dc5d72bdb 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -229,6 +229,9 @@ /* Define to 1 if you have the global variable 'int timezone'. */ #undef HAVE_INT_TIMEZONE +/* Define to 1 if you have the `io_uring_queue_init_mem' function. */ +#undef HAVE_IO_URING_QUEUE_INIT_MEM + /* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */ #undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 378f2f2c2ba..202bd2d5ace 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -718,9 +718,9 @@ extern void pgstat_count_heap_delete(Relation rel); extern void pgstat_count_truncate(Relation rel); extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); -extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info, +extern void pgstat_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); -extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, +extern void pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 82313bb7fcf..ae008118ea8 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -72,7 +72,7 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) { if (__builtin_constant_p(len) && len < 32) { - const unsigned char *p = data; + const unsigned char *p = (const unsigned char *) data; /* * For small constant inputs, inline the computation to avoid a diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h index df40c7208be..90be3af449d 100644 --- a/src/include/port/pg_iovec.h +++ b/src/include/port/pg_iovec.h @@ -21,9 +21,6 @@ #else -/* POSIX requires at least 16 as a maximum iovcnt. */ -#define IOV_MAX 16 - /* Define our own POSIX-compatible iovec struct. 
*/ struct iovec { @@ -34,6 +31,15 @@ struct iovec #endif /* + * If <limits.h> didn't define IOV_MAX, define our own. X/Open requires at + * least 16. (GNU Hurd apparently feel that they're not bound by X/Open, + * because they don't define this symbol at all.) + */ +#ifndef IOV_MAX +#define IOV_MAX 16 +#endif + +/* * Define a reasonable maximum that is safe to use on the stack in arrays of * struct iovec and other small types. The operating system could limit us to * a number as low as 16, but most systems have 1024. diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 40f1d324dcf..9d1ea6d0db8 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -24,12 +24,17 @@ extern PGDLLIMPORT int pg_numa_get_max_node(void); * This is required on Linux, before pg_numa_query_pages() as we * need to page-fault before move_pages(2) syscall returns valid results. */ -#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ - ro_volatile_var = *(volatile uint64 *) ptr +static inline void +pg_numa_touch_mem_if_required(void *ptr) +{ + volatile uint64 touch pg_attribute_unused(); + + touch = *(volatile uint64 *) ptr; +} #else -#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \ +#define pg_numa_touch_mem_if_required(ptr) \ do {} while(0) #endif diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h index e63a3bd824d..8ff40007c7f 100644 --- a/src/include/port/solaris.h +++ b/src/include/port/solaris.h @@ -24,3 +24,12 @@ #if defined(__i386__) #include <sys/isa_defs.h> #endif + +/* + * On original Solaris, PAM conversation procs lack a "const" in their + * declaration; but recent OpenIndiana versions put it there by default. + * The least messy way to deal with this is to define _PAM_LEGACY_NONCONST, + * which causes OpenIndiana to declare pam_conv per the Solaris tradition, + * and also use that symbol to control omitting the "const" in our own code. 
+ */ +#define _PAM_LEGACY_NONCONST 1 diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index 800ecbfd13b..97001f4e7f6 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -15,6 +15,7 @@ #ifndef _BGWRITER_H #define _BGWRITER_H +#include "parser/parse_node.h" #include "storage/block.h" #include "storage/relfilelocator.h" #include "storage/smgr.h" @@ -30,6 +31,7 @@ extern PGDLLIMPORT double CheckPointCompletionTarget; pg_noreturn extern void BackgroundWriterMain(const void *startup_data, size_t startup_data_len); pg_noreturn extern void CheckpointerMain(const void *startup_data, size_t startup_data_len); +extern void ExecCheckpoint(ParseState *pstate, CheckPointStmt *stmt); extern void RequestCheckpoint(int flags); extern void CheckpointWriteDelay(int flags, double progress); diff --git a/src/include/replication/conflict.h b/src/include/replication/conflict.h index 6c59125f256..ff3cb8416ec 100644 --- a/src/include/replication/conflict.h +++ b/src/include/replication/conflict.h @@ -32,6 +32,9 @@ typedef enum /* The updated row value violates unique constraint */ CT_UPDATE_EXISTS, + /* The row to be updated was concurrently deleted by a different origin */ + CT_UPDATE_DELETED, + /* The row to be updated is missing */ CT_UPDATE_MISSING, diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index 82b202f3305..b29453e8e4f 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -25,8 +25,11 @@ extern void ApplyLauncherShmemInit(void); extern void ApplyLauncherForgetWorkerStartTime(Oid subid); extern void ApplyLauncherWakeupAtCommit(void); +extern void ApplyLauncherWakeup(void); extern void AtEOXact_ApplyLauncher(bool isCommit); +extern void CreateConflictDetectionSlot(void); + extern bool IsLogicalLauncher(void); extern pid_t GetLeaderApplyWorkerPid(pid_t pid); diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 24e88c409ba..fa0745552f8 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -176,6 +176,7 @@ typedef struct ReorderBufferChange #define RBTXN_SENT_PREPARE 0x0200 #define RBTXN_IS_COMMITTED 0x0400 #define RBTXN_IS_ABORTED 0x0800 +#define RBTXN_DISTR_INVAL_OVERFLOWED 0x1000 #define RBTXN_PREPARE_STATUS_MASK (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE | RBTXN_SENT_PREPARE) @@ -265,6 +266,12 @@ typedef struct ReorderBufferChange ((txn)->txn_flags & RBTXN_SKIPPED_PREPARE) != 0 \ ) +/* Is the array of distributed inval messages overflowed? */ +#define rbtxn_distr_inval_overflowed(txn) \ +( \ + ((txn)->txn_flags & RBTXN_DISTR_INVAL_OVERFLOWED) != 0 \ +) + /* Is this a top-level transaction? */ #define rbtxn_is_toptxn(txn) \ ( \ @@ -422,6 +429,12 @@ typedef struct ReorderBufferTXN uint32 ninvalidations; SharedInvalidationMessage *invalidations; + /* + * Stores cache invalidation messages distributed by other transactions. 
+ */ + uint32 ninvalidations_distributed; + SharedInvalidationMessage *invalidations_distributed; + /* --- * Position in one of two lists: * * list of subtransactions if we are *known* to be subxact @@ -738,6 +751,9 @@ extern void ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, CommandId cmin, CommandId cmax, CommandId combocid); extern void ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, Size nmsgs, SharedInvalidationMessage *msgs); +extern void ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs); extern void ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations, SharedInvalidationMessage *invalidations); extern void ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn); diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index eb0b93b1114..e8fc342d1a9 100644 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -21,6 +21,13 @@ #define PG_REPLSLOT_DIR "pg_replslot" /* + * The reserved name for a replication slot used to retain dead tuples for + * conflict detection in logical replication. See + * maybe_advance_nonremovable_xid() for detail. + */ +#define CONFLICT_DETECTION_SLOT "pg_conflict_detection" + +/* * Behaviour of replication slots, upon release or crash. * * Slots marked as PERSISTENT are crash-safe and will not be dropped when @@ -215,6 +222,33 @@ typedef struct ReplicationSlot * recently stopped. */ TimestampTz inactive_since; + + /* + * Latest restart_lsn that has been flushed to disk. For persistent slots + * the flushed LSN should be taken into account when calculating the + * oldest LSN for WAL segments removal. + * + * Do not assume that restart_lsn will always move forward, i.e., that the + * previously flushed restart_lsn is always behind data.restart_lsn. In + * streaming replication using a physical slot, the restart_lsn is updated + * based on the flushed WAL position reported by the walreceiver. + * + * This replication mode allows duplicate WAL records to be received and + * overwritten. If the walreceiver receives older WAL records and then + * reports them as flushed to the walsender, the restart_lsn may appear to + * move backward. + * + * This typically occurs at the beginning of replication. One reason is + * that streaming replication starts at the beginning of a segment, so, if + * restart_lsn is in the middle of a segment, it will be updated to an + * earlier LSN, see RequestXLogStreaming. Another reason is that the + * walreceiver chooses its startpoint based on the replayed LSN, so, if + * some records have been received but not yet applied, they will be + * received again, which leads to updating the restart_lsn to an earlier + * position. 
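A hedged sketch of the consequence for WAL retention, assuming slot points at a ReplicationSlot like the one being extended here: since the in-memory restart_lsn can briefly move backward, segment removal should trust only the flushed value.

XLogRecPtr	keep_lsn = slot->data.restart_lsn;

if (slot->data.persistency == RS_PERSISTENT &&
	!XLogRecPtrIsInvalid(slot->last_saved_restart_lsn) &&
	slot->last_saved_restart_lsn < keep_lsn)
	keep_lsn = slot->last_saved_restart_lsn;	/* only the on-disk value is safe */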
+ */ + XLogRecPtr last_saved_restart_lsn; + } ReplicationSlot; #define SlotIsPhysical(slot) ((slot)->data.database == InvalidOid) @@ -258,7 +292,7 @@ extern PGDLLIMPORT ReplicationSlot *MyReplicationSlot; /* GUCs */ extern PGDLLIMPORT int max_replication_slots; extern PGDLLIMPORT char *synchronized_standby_slots; -extern PGDLLIMPORT int idle_replication_slot_timeout_mins; +extern PGDLLIMPORT int idle_replication_slot_timeout_secs; /* shmem initialization functions */ extern Size ReplicationSlotsShmemSize(void); @@ -284,7 +318,9 @@ extern void ReplicationSlotMarkDirty(void); /* misc stuff */ extern void ReplicationSlotInitialize(void); -extern bool ReplicationSlotValidateName(const char *name, int elevel); +extern bool ReplicationSlotValidateName(const char *name, + bool allow_reserved_name, + int elevel); extern void ReplicationSlotReserveWal(void); extern void ReplicationSlotsComputeRequiredXmin(bool already_locked); extern void ReplicationSlotsComputeRequiredLSN(void); diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 30b2775952c..7c0204dd6f4 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -86,6 +86,17 @@ typedef struct LogicalRepWorker /* Indicates whether apply can be performed in parallel. */ bool parallel_apply; + /* + * Changes made by this transaction and subsequent ones must be preserved. + * This ensures that update_deleted conflicts can be accurately detected + * during the apply phase of logical replication by this worker. + * + * The logical replication launcher manages an internal replication slot + * named "pg_conflict_detection". It asynchronously collects this ID to + * decide when to advance the xmin value of the slot. + */ + TransactionId oldest_nonremovable_xid; + /* Stats. */ XLogRecPtr last_lsn; TimestampTz last_send_time; @@ -245,7 +256,8 @@ extern List *logicalrep_workers_find(Oid subid, bool only_running, extern bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, Oid relid, - dsm_handle subworker_dsm); + dsm_handle subworker_dsm, + bool retain_dead_tuples); extern void logicalrep_worker_stop(Oid subid, Oid relid); extern void logicalrep_pa_worker_stop(ParallelApplyWorkerInfo *winfo); extern void logicalrep_worker_wakeup(Oid subid, Oid relid); diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index f3726bc3dc5..2933eea0649 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -36,7 +36,7 @@ typedef enum IoMethod #ifdef IOMETHOD_IO_URING_ENABLED IOMETHOD_IO_URING, #endif -} IoMethod; +} IoMethod; /* We'll default to worker based execution. 
*/ #define DEFAULT_IO_METHOD IOMETHOD_WORKER @@ -201,7 +201,7 @@ typedef enum PgAioHandleCallbackID } PgAioHandleCallbackID; #define PGAIO_HCB_MAX PGAIO_HCB_LOCAL_BUFFER_READV -StaticAssertDecl(PGAIO_HCB_MAX <= (1 << PGAIO_RESULT_ID_BITS), +StaticAssertDecl(PGAIO_HCB_MAX < (1 << PGAIO_RESULT_ID_BITS), "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS"); diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h index 18183366077..afee85c787b 100644 --- a/src/include/storage/aio_types.h +++ b/src/include/storage/aio_types.h @@ -107,7 +107,7 @@ typedef struct PgAioResult /* of type PgAioResultStatus, see above */ uint32 status:PGAIO_RESULT_STATUS_BITS; - /* meaning defined by callback->error */ + /* meaning defined by callback->report */ uint32 error_data:PGAIO_RESULT_ERROR_BITS; int32 result; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 0dec7d93b3b..52a71b138f7 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -486,8 +486,8 @@ extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait); extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln); extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced); extern void DropRelationLocalBuffers(RelFileLocator rlocator, - ForkNumber forkNum, - BlockNumber firstDelBlock); + ForkNumber *forkNum, int nforks, + BlockNumber *firstDelBlock); extern void DropRelationAllLocalBuffers(RelFileLocator rlocator); extern void AtEOXact_LocalBuffers(bool isCommit); diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h index 940d74462d1..f1d7beeed1a 100644 --- a/src/include/storage/copydir.h +++ b/src/include/storage/copydir.h @@ -17,7 +17,7 @@ typedef enum FileCopyMethod { FILE_COPY_METHOD_COPY, FILE_COPY_METHOD_CLONE, -} FileCopyMethod; +} FileCopyMethod; /* GUC parameters */ extern PGDLLIMPORT int file_copy_method; diff --git a/src/include/storage/dsm_registry.h b/src/include/storage/dsm_registry.h index b381e44bc9d..4871ed509eb 100644 --- a/src/include/storage/dsm_registry.h +++ b/src/include/storage/dsm_registry.h @@ -13,10 +13,15 @@ #ifndef DSM_REGISTRY_H #define DSM_REGISTRY_H +#include "lib/dshash.h" + extern void *GetNamedDSMSegment(const char *name, size_t size, void (*init_callback) (void *ptr), bool *found); - +extern dsa_area *GetNamedDSA(const char *name, bool *found); +extern dshash_table *GetNamedDSHash(const char *name, + const dshash_parameters *params, + bool *found); extern Size DSMRegistryShmemSize(void); extern void DSMRegistryShmemInit(void); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index 6f2108a44e8..826cf28fdbd 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -18,6 +18,7 @@ #error "lock.h may not be included from frontend code" #endif +#include "access/transam.h" #include "lib/ilist.h" #include "storage/lockdefs.h" #include "storage/lwlock.h" @@ -30,7 +31,7 @@ typedef struct PGPROC PGPROC; /* GUC variables */ extern PGDLLIMPORT int max_locks_per_xact; -extern PGDLLIMPORT bool log_lock_failure; +extern PGDLLIMPORT bool log_lock_failures; #ifdef LOCK_DEBUG extern PGDLLIMPORT int Trace_lock_oidmin; @@ -581,7 +582,7 @@ extern bool LockHasWaiters(const LOCKTAG *locktag, extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp); extern void AtPrepare_Locks(void); -extern void PostPrepare_Locks(TransactionId xid); +extern void PostPrepare_Locks(FullTransactionId fxid); 
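As a sketch of what the FullTransactionId conversion above implies for callers (epoch and xid are assumed inputs, e.g. read from a two-phase state file header): the 64-bit value can be reconstructed with an existing transam.h helper.

FullTransactionId fxid = FullTransactionIdFromEpochAndXid(epoch, xid);

PostPrepare_Locks(fxid);	/* previously took a bare TransactionId */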
extern bool LockCheckConflicts(LockMethod lockMethodTable, LOCKMODE lockmode, LOCK *lock, PROCLOCK *proclock); @@ -597,13 +598,13 @@ extern BlockedProcsData *GetBlockerStatusData(int blocked_pid); extern xl_standby_lock *GetRunningTransactionLocks(int *nlocks); extern const char *GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode); -extern void lock_twophase_recover(TransactionId xid, uint16 info, +extern void lock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); -extern void lock_twophase_postcommit(TransactionId xid, uint16 info, +extern void lock_twophase_postcommit(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); -extern void lock_twophase_postabort(TransactionId xid, uint16 info, +extern void lock_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); -extern void lock_twophase_standby_recover(TransactionId xid, uint16 info, +extern void lock_twophase_standby_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); extern DeadLockState DeadLockCheck(PGPROC *proc); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 2b4cbda39a5..5e717765764 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -176,53 +176,23 @@ extern void LWLockInitialize(LWLock *lock, int tranche_id); * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also, * we reserve additional tranche IDs for builtin tranches not included in * the set of individual LWLocks. A call to LWLockNewTrancheId will never - * return a value less than LWTRANCHE_FIRST_USER_DEFINED. + * return a value less than LWTRANCHE_FIRST_USER_DEFINED. The actual list of + * built-in tranches is kept in lwlocklist.h. */ typedef enum BuiltinTrancheIds { - LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS, - LWTRANCHE_COMMITTS_BUFFER, - LWTRANCHE_SUBTRANS_BUFFER, - LWTRANCHE_MULTIXACTOFFSET_BUFFER, - LWTRANCHE_MULTIXACTMEMBER_BUFFER, - LWTRANCHE_NOTIFY_BUFFER, - LWTRANCHE_SERIAL_BUFFER, - LWTRANCHE_WAL_INSERT, - LWTRANCHE_BUFFER_CONTENT, - LWTRANCHE_REPLICATION_ORIGIN_STATE, - LWTRANCHE_REPLICATION_SLOT_IO, - LWTRANCHE_LOCK_FASTPATH, - LWTRANCHE_BUFFER_MAPPING, - LWTRANCHE_LOCK_MANAGER, - LWTRANCHE_PREDICATE_LOCK_MANAGER, - LWTRANCHE_PARALLEL_HASH_JOIN, - LWTRANCHE_PARALLEL_BTREE_SCAN, - LWTRANCHE_PARALLEL_QUERY_DSA, - LWTRANCHE_PER_SESSION_DSA, - LWTRANCHE_PER_SESSION_RECORD_TYPE, - LWTRANCHE_PER_SESSION_RECORD_TYPMOD, - LWTRANCHE_SHARED_TUPLESTORE, - LWTRANCHE_SHARED_TIDBITMAP, - LWTRANCHE_PARALLEL_APPEND, - LWTRANCHE_PER_XACT_PREDICATE_LIST, - LWTRANCHE_PGSTATS_DSA, - LWTRANCHE_PGSTATS_HASH, - LWTRANCHE_PGSTATS_DATA, - LWTRANCHE_LAUNCHER_DSA, - LWTRANCHE_LAUNCHER_HASH, - LWTRANCHE_DSM_REGISTRY_DSA, - LWTRANCHE_DSM_REGISTRY_HASH, - LWTRANCHE_COMMITTS_SLRU, - LWTRANCHE_MULTIXACTMEMBER_SLRU, - LWTRANCHE_MULTIXACTOFFSET_SLRU, - LWTRANCHE_NOTIFY_SLRU, - LWTRANCHE_SERIAL_SLRU, - LWTRANCHE_SUBTRANS_SLRU, - LWTRANCHE_XACT_SLRU, - LWTRANCHE_PARALLEL_VACUUM_DSA, - LWTRANCHE_AIO_URING_COMPLETION, - LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE, - LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC, + /* + * LWTRANCHE_INVALID is an unused value that only exists to initialize the + * rest of the tranches to appropriate values. 
+ */ + LWTRANCHE_INVALID = NUM_INDIVIDUAL_LWLOCKS - 1, + +#define PG_LWLOCK(id, name) +#define PG_LWLOCKTRANCHE(id, name) LWTRANCHE_##id, +#include "storage/lwlocklist.h" +#undef PG_LWLOCK +#undef PG_LWLOCKTRANCHE + LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index a9681738146..208d2e3a8ed 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -2,9 +2,10 @@ * * lwlocklist.h * - * The predefined LWLock list is kept in its own source file for use by - * automatic tools. The exact representation of a keyword is determined by - * the PG_LWLOCK macro, which is not defined in this file; it can be + * The list of predefined LWLocks and built-in LWLock tranches is kept in + * its own source file for use by automatic tools. The exact + * representation of a keyword is determined by the PG_LWLOCK and + * PG_LWLOCKTRANCHE macros, which are not defined in this file; they can be * defined by the caller for special purposes. * * Also, generate-lwlocknames.pl processes this file to create lwlocknames.h. @@ -84,3 +85,53 @@ PG_LWLOCK(50, DSMRegistry) PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) + +/* + * There also exist several built-in LWLock tranches. As with the predefined + * LWLocks, be sure to update the WaitEventLWLock section of + * src/backend/utils/activity/wait_event_names.txt when modifying this list. + * + * Note that the IDs here (the first value) don't include the LWTRANCHE_ + * prefix. It's added elsewhere. + */ +PG_LWLOCKTRANCHE(XACT_BUFFER, XactBuffer) +PG_LWLOCKTRANCHE(COMMITTS_BUFFER, CommitTsBuffer) +PG_LWLOCKTRANCHE(SUBTRANS_BUFFER, SubtransBuffer) +PG_LWLOCKTRANCHE(MULTIXACTOFFSET_BUFFER, MultiXactOffsetBuffer) +PG_LWLOCKTRANCHE(MULTIXACTMEMBER_BUFFER, MultiXactMemberBuffer) +PG_LWLOCKTRANCHE(NOTIFY_BUFFER, NotifyBuffer) +PG_LWLOCKTRANCHE(SERIAL_BUFFER, SerialBuffer) +PG_LWLOCKTRANCHE(WAL_INSERT, WALInsert) +PG_LWLOCKTRANCHE(BUFFER_CONTENT, BufferContent) +PG_LWLOCKTRANCHE(REPLICATION_ORIGIN_STATE, ReplicationOriginState) +PG_LWLOCKTRANCHE(REPLICATION_SLOT_IO, ReplicationSlotIO) +PG_LWLOCKTRANCHE(LOCK_FASTPATH, LockFastPath) +PG_LWLOCKTRANCHE(BUFFER_MAPPING, BufferMapping) +PG_LWLOCKTRANCHE(LOCK_MANAGER, LockManager) +PG_LWLOCKTRANCHE(PREDICATE_LOCK_MANAGER, PredicateLockManager) +PG_LWLOCKTRANCHE(PARALLEL_HASH_JOIN, ParallelHashJoin) +PG_LWLOCKTRANCHE(PARALLEL_BTREE_SCAN, ParallelBtreeScan) +PG_LWLOCKTRANCHE(PARALLEL_QUERY_DSA, ParallelQueryDSA) +PG_LWLOCKTRANCHE(PER_SESSION_DSA, PerSessionDSA) +PG_LWLOCKTRANCHE(PER_SESSION_RECORD_TYPE, PerSessionRecordType) +PG_LWLOCKTRANCHE(PER_SESSION_RECORD_TYPMOD, PerSessionRecordTypmod) +PG_LWLOCKTRANCHE(SHARED_TUPLESTORE, SharedTupleStore) +PG_LWLOCKTRANCHE(SHARED_TIDBITMAP, SharedTidBitmap) +PG_LWLOCKTRANCHE(PARALLEL_APPEND, ParallelAppend) +PG_LWLOCKTRANCHE(PER_XACT_PREDICATE_LIST, PerXactPredicateList) +PG_LWLOCKTRANCHE(PGSTATS_DSA, PgStatsDSA) +PG_LWLOCKTRANCHE(PGSTATS_HASH, PgStatsHash) +PG_LWLOCKTRANCHE(PGSTATS_DATA, PgStatsData) +PG_LWLOCKTRANCHE(LAUNCHER_DSA, LogicalRepLauncherDSA) +PG_LWLOCKTRANCHE(LAUNCHER_HASH, LogicalRepLauncherHash) +PG_LWLOCKTRANCHE(DSM_REGISTRY_DSA, DSMRegistryDSA) +PG_LWLOCKTRANCHE(DSM_REGISTRY_HASH, DSMRegistryHash) +PG_LWLOCKTRANCHE(COMMITTS_SLRU, CommitTsSLRU) +PG_LWLOCKTRANCHE(MULTIXACTOFFSET_SLRU, MultiXactOffsetSLRU) +PG_LWLOCKTRANCHE(MULTIXACTMEMBER_SLRU, MultiXactMemberSLRU) +PG_LWLOCKTRANCHE(NOTIFY_SLRU, NotifySLRU) 
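To illustrate the two-macro trick used above (the tranche list itself resumes just after this aside), a hypothetical consumer could redefine the macros to build a name table indexed by tranche ID; this mirrors the enum construction and is only a sketch:

static const char *const BuiltinTrancheNames[] = {
#define PG_LWLOCK(id, name)
#define PG_LWLOCKTRANCHE(id, name) [LWTRANCHE_##id] = #name,
#include "storage/lwlocklist.h"
#undef PG_LWLOCK
#undef PG_LWLOCKTRANCHE
};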
+PG_LWLOCKTRANCHE(SERIAL_SLRU, SerialSLRU) +PG_LWLOCKTRANCHE(SUBTRANS_SLRU, SubtransSLRU) +PG_LWLOCKTRANCHE(XACT_SLRU, XactSLRU) +PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA) +PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion) diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index 267d5d90e94..c1e3a4d9f64 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -72,9 +72,9 @@ extern void PreCommit_CheckForSerializationFailure(void); /* two-phase commit support */ extern void AtPrepare_PredicateLocks(void); -extern void PostPrepare_PredicateLocks(TransactionId xid); -extern void PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); -extern void predicatelock_twophase_recover(TransactionId xid, uint16 info, +extern void PostPrepare_PredicateLocks(FullTransactionId fxid); +extern void PredicateLockTwoPhaseFinish(FullTransactionId xid, bool isCommit); +extern void predicatelock_twophase_recover(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); /* parallel query support */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f9b3fcfbf1..c6f5ebceefd 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -130,9 +130,17 @@ extern PGDLLIMPORT int FastPathLockGroupsPerBackend; * the checkpoint are actually destroyed on disk. Replay can cope with a file * or block that doesn't exist, but not with a block that has the wrong * contents. + * + * Setting DELAY_CHKPT_IN_COMMIT is similar to setting DELAY_CHKPT_START, but + * it explicitly indicates that the reason for delaying the checkpoint is due + * to a transaction being within a critical commit section. We need this new + * flag to ensure all the transactions that have acquired a commit timestamp + * are finished before we allow the logical replication client to advance its + * xid, which is used to hold back dead rows for conflict detection. 
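A small sketch of the bit relationship established by the defines that follow (proc is an assumed PGPROC pointer, e.g. one the checkpointer is scanning): because DELAY_CHKPT_IN_COMMIT includes the DELAY_CHKPT_START bit, pre-existing checks of the start flag also see commit-critical sections, with no caller changes.

/* true for both plain DELAY_CHKPT_START and DELAY_CHKPT_IN_COMMIT holders */
if (proc->delayChkptFlags & DELAY_CHKPT_START)
{
	/* the start of the next checkpoint must wait for this backend */
}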
*/ #define DELAY_CHKPT_START (1<<0) #define DELAY_CHKPT_COMPLETE (1<<1) +#define DELAY_CHKPT_IN_COMMIT (DELAY_CHKPT_START | 1<<2) typedef enum { diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index ef0b733ebe8..2f4ae06c279 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -53,10 +53,10 @@ extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); extern RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsInProgress(TransactionId xid); -extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestNonRemovableTransactionId(Relation rel); extern TransactionId GetOldestTransactionIdConsideredRunning(void); -extern TransactionId GetOldestActiveTransactionId(void); +extern TransactionId GetOldestActiveTransactionId(bool inCommitOnly, + bool allDbs); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); extern void GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 345d5a0ecb1..afeeb1ca019 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -35,7 +35,6 @@ typedef enum PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */ PROCSIG_BARRIER, /* global barrier interrupt */ PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */ - PROCSIG_GET_MEMORY_CONTEXT, /* ask backend to send the memory contexts */ PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */ /* Recovery conflict reasons */ diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h index 5dc5aafe5c9..845a5851b57 100644 --- a/src/include/storage/sinval.h +++ b/src/include/storage/sinval.h @@ -119,7 +119,7 @@ typedef struct Oid dbId; /* database ID */ Oid relid; /* relation ID, or 0 if whole * RelationSyncCache */ -} SharedInvalRelSyncMsg; +} SharedInvalRelSyncMsg; typedef union { diff --git a/src/include/tcop/backend_startup.h b/src/include/tcop/backend_startup.h index dcb9d056643..e8639688c00 100644 --- a/src/include/tcop/backend_startup.h +++ b/src/include/tcop/backend_startup.h @@ -86,7 +86,7 @@ typedef enum LogConnectionOption LOG_CONNECTION_AUTHENTICATION | LOG_CONNECTION_AUTHORIZATION | LOG_CONNECTION_SETUP_DURATIONS, -} LogConnectionOption; +} LogConnectionOption; /* * A collection of timings of various stages of connection establishment and diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h index 430ccd7d78e..3016501ac05 100644 --- a/src/include/utils/backend_status.h +++ b/src/include/utils/backend_status.h @@ -170,10 +170,10 @@ typedef struct PgBackendStatus int64 st_progress_param[PGSTAT_NUM_PROGRESS_PARAM]; /* query identifier, optionally computed using post_parse_analyze_hook */ - uint64 st_query_id; + int64 st_query_id; /* plan identifier, optionally computed using planner_hook */ - uint64 st_plan_id; + int64 st_plan_id; } PgBackendStatus; @@ -321,16 +321,16 @@ extern void pgstat_clear_backend_activity_snapshot(void); /* Activity reporting functions */ extern void pgstat_report_activity(BackendState state, const char *cmd_str); -extern void pgstat_report_query_id(uint64 query_id, bool force); -extern void pgstat_report_plan_id(uint64 plan_id, bool force); +extern void pgstat_report_query_id(int64 query_id, bool force); +extern void pgstat_report_plan_id(int64 plan_id, bool force); extern void pgstat_report_tempfile(size_t 
filesize); extern void pgstat_report_appname(const char *appname); extern void pgstat_report_xact_timestamp(TimestampTz tstamp); extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser); extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen); -extern uint64 pgstat_get_my_query_id(void); -extern uint64 pgstat_get_my_plan_id(void); +extern int64 pgstat_get_my_query_id(void); +extern int64 pgstat_get_my_plan_id(void); extern BackendType pgstat_get_backend_type_by_proc_number(ProcNumber procNumber); diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h index 277ec33c00b..00808e23f49 100644 --- a/src/include/utils/catcache.h +++ b/src/include/utils/catcache.h @@ -87,6 +87,14 @@ typedef struct catcache typedef struct catctup { + /* + * Each tuple in a cache is a member of a dlist that stores the elements + * of its hash bucket. We keep each dlist in LRU order to speed repeated + * lookups. Keep the dlist_node field first so that Valgrind understands + * the struct is reachable. + */ + dlist_node cache_elem; /* list member of per-bucket list */ + int ct_magic; /* for identifying CatCTup entries */ #define CT_MAGIC 0x57261502 @@ -99,13 +107,6 @@ typedef struct catctup Datum keys[CATCACHE_MAXKEYS]; /* - * Each tuple in a cache is a member of a dlist that stores the elements - * of its hash bucket. We keep each dlist in LRU order to speed repeated - * lookups. - */ - dlist_node cache_elem; /* list member of per-bucket list */ - - /* * A tuple marked "dead" must not be returned by subsequent searches. * However, it won't be physically deleted from the cache until its * refcount goes to zero. (If it's a member of a CatCList, the list's @@ -158,13 +159,17 @@ typedef struct catctup */ typedef struct catclist { + /* + * Keep the dlist_node field first so that Valgrind understands the struct + * is reachable. + */ + dlist_node cache_elem; /* list member of per-catcache list */ + int cl_magic; /* for identifying CatCList entries */ #define CL_MAGIC 0x52765103 uint32 hash_value; /* hash value for lookup keys */ - dlist_node cache_elem; /* list member of per-catcache list */ - /* * Lookup keys for the entry, with the first nkeys elements being valid. * All by-reference are separately allocated. 
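The two catcache.h hunks above move cache_elem to the front of catctup and catclist for Valgrind's benefit: the leak checker treats a block as fully "reachable" only when some retained pointer points at the block's start, and with the dlist_node first, the pointer the hash bucket's list keeps into the entry is exactly that. A minimal sketch of the layout trick, using a simplified dlist_node and a hypothetical catctup-like struct rather than the real definitions:

/*
 * Sketch only, not PostgreSQL source.  With the node first,
 * &tup->cache_elem == (void *) tup, so a list that stores node pointers
 * also stores start-of-block pointers, which Valgrind counts as
 * "reachable" instead of "possibly lost".
 */
#include <stddef.h>

typedef struct dlist_node
{
    struct dlist_node *prev;
    struct dlist_node *next;
} dlist_node;

typedef struct catctup_like     /* hypothetical stand-in for catctup */
{
    dlist_node  cache_elem;     /* first member: node address == struct address */
    int         ct_magic;
} catctup_like;

/* recover the entry from its node, as dlist_container() does */
#define tup_from_node(node) \
    ((catctup_like *) ((char *) (node) - offsetof(catctup_like, cache_elem)))

Were cache_elem placed anywhere later in the struct, the list would hold only interior pointers, which Valgrind reports as merely "possibly lost".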
diff --git a/src/include/utils/date.h b/src/include/utils/date.h index bb5c1e57b07..abfda0b1ae9 100644 --- a/src/include/utils/date.h +++ b/src/include/utils/date.h @@ -100,6 +100,8 @@ extern int32 anytime_typmod_check(bool istz, int32 typmod); extern double date2timestamp_no_overflow(DateADT dateVal); extern Timestamp date2timestamp_opt_overflow(DateADT dateVal, int *overflow); extern TimestampTz date2timestamptz_opt_overflow(DateADT dateVal, int *overflow); +extern DateADT timestamp2date_opt_overflow(Timestamp timestamp, int *overflow); +extern DateADT timestamptz2date_opt_overflow(TimestampTz timestamp, int *overflow); extern int32 date_cmp_timestamp_internal(DateADT dateVal, Timestamp dt2); extern int32 date_cmp_timestamptz_internal(DateADT dateVal, TimestampTz dt2); diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h index 9eca8788908..0a6067be628 100644 --- a/src/include/utils/dsa.h +++ b/src/include/utils/dsa.h @@ -145,6 +145,7 @@ extern dsa_area *dsa_create_in_place_ext(void *place, size_t size, size_t init_segment_size, size_t max_segment_size); extern dsa_area *dsa_attach(dsa_handle handle); +extern bool dsa_is_attached(dsa_handle handle); extern dsa_area *dsa_attach_in_place(void *place, dsm_segment *segment); extern void dsa_release_in_place(void *place); extern void dsa_on_dsm_detach_release_in_place(dsm_segment *, Datum); diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index 5eac0e16970..675f4f5f469 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -485,7 +485,7 @@ typedef enum PGERROR_TERSE, /* single-line error messages */ PGERROR_DEFAULT, /* recommended style */ PGERROR_VERBOSE, /* all the facts, ma'am */ -} PGErrorVerbosity; +} PGErrorVerbosity; extern PGDLLIMPORT int Log_error_verbosity; extern PGDLLIMPORT char *Log_line_prefix; diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 799fa7ace68..82ac8646a8d 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -84,8 +84,6 @@ extern const char *show_log_timezone(void); extern void assign_maintenance_io_concurrency(int newval, void *extra); extern void assign_io_max_combine_limit(int newval, void *extra); extern void assign_io_combine_limit(int newval, void *extra); -extern bool check_max_slot_wal_keep_size(int *newval, void **extra, - GucSource source); extern void assign_max_wal_size(int newval, void *extra); extern bool check_max_stack_depth(int *newval, void **extra, GucSource source); extern void assign_max_stack_depth(int newval, void *extra); @@ -176,7 +174,5 @@ extern void assign_wal_sync_method(int new_wal_sync_method, void *extra); extern bool check_synchronized_standby_slots(char **newval, void **extra, GucSource source); extern void assign_synchronized_standby_slots(const char *newval, void *extra); -extern bool check_idle_replication_slot_timeout(int *newval, void **extra, - GucSource source); #endif /* GUC_HOOKS_H */ diff --git a/src/include/utils/injection_point.h b/src/include/utils/injection_point.h index a37958e1835..fd5bc061b7b 100644 --- a/src/include/utils/injection_point.h +++ b/src/include/utils/injection_point.h @@ -11,6 +11,19 @@ #ifndef INJECTION_POINT_H #define INJECTION_POINT_H +#include "nodes/pg_list.h" + +/* + * Injection point data, used when retrieving a list of all the attached + * injection points. + */ +typedef struct InjectionPointData +{ + const char *name; + const char *library; + const char *function; +} InjectionPointData; + /* * Injection points require --enable-injection-points. 
*/ @@ -47,6 +60,9 @@ extern void InjectionPointCached(const char *name, void *arg); extern bool IsInjectionPointAttached(const char *name); extern bool InjectionPointDetach(const char *name); +/* Get the current set of injection points attached */ +extern List *InjectionPointList(void); + #ifdef EXEC_BACKEND extern PGDLLIMPORT struct InjectionPointsCtl *ActiveInjectionPoints; #endif diff --git a/src/include/utils/memdebug.h b/src/include/utils/memdebug.h index 7309271834b..80692dcef93 100644 --- a/src/include/utils/memdebug.h +++ b/src/include/utils/memdebug.h @@ -29,6 +29,7 @@ #define VALGRIND_MEMPOOL_ALLOC(context, addr, size) do {} while (0) #define VALGRIND_MEMPOOL_FREE(context, addr) do {} while (0) #define VALGRIND_MEMPOOL_CHANGE(context, optr, nptr, size) do {} while (0) +#define VALGRIND_MEMPOOL_TRIM(context, addr, size) do {} while (0) #endif diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index c0987dca155..8abc26abce2 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -18,9 +18,6 @@ #define MEMUTILS_H #include "nodes/memnodes.h" -#include "storage/condition_variable.h" -#include "storage/lmgr.h" -#include "utils/dsa.h" /* @@ -51,23 +48,6 @@ #define AllocHugeSizeIsValid(size) ((Size) (size) <= MaxAllocHugeSize) -/* - * Memory Context reporting size limits. - */ - -/* Max length of context name and ident */ -#define MEMORY_CONTEXT_IDENT_SHMEM_SIZE 64 -/* Maximum size (in bytes) of DSA area per process */ -#define MEMORY_CONTEXT_REPORT_MAX_PER_BACKEND ((size_t) (1 * 1024 * 1024)) - -/* - * Maximum size per context. Actual size may be lower as this assumes the worst - * case of deepest path and longest identifiers (name and ident, thus the - * multiplication by 2). The path depth is limited to 100 like for memory - * context logging. - */ -#define MAX_MEMORY_CONTEXT_STATS_SIZE (sizeof(MemoryStatsEntry) + \ - (100 * sizeof(int)) + (2 * MEMORY_CONTEXT_IDENT_SHMEM_SIZE)) /* * Standard top-level memory contexts. @@ -339,66 +319,4 @@ pg_memory_is_all_zeros(const void *ptr, size_t len) return true; } -/* Dynamic shared memory state for statistics per context */ -typedef struct MemoryStatsEntry -{ - dsa_pointer name; - dsa_pointer ident; - dsa_pointer path; - NodeTag type; - int path_length; - int levels; - int64 totalspace; - int64 nblocks; - int64 freespace; - int64 freechunks; - int num_agg_stats; -} MemoryStatsEntry; - -/* - * Static shared memory state representing the DSA area created for memory - * context statistics reporting. A single DSA area is created and used by all - * the processes, each having its specific DSA allocations for sharing memory - * statistics, tracked by per backend static shared memory state. - */ -typedef struct MemoryStatsCtl -{ - dsa_handle memstats_dsa_handle; - LWLock lw_lock; -} MemoryStatsCtl; - -/* - * Per backend static shared memory state for memory context statistics - * reporting. 
- */ -typedef struct MemoryStatsBackendState -{ - ConditionVariable memcxt_cv; - LWLock lw_lock; - int proc_id; - int total_stats; - bool summary; - dsa_pointer memstats_dsa_pointer; - TimestampTz stats_timestamp; -} MemoryStatsBackendState; - - -/* - * Used for storage of transient identifiers for pg_get_backend_memory_contexts - */ -typedef struct MemoryStatsContextId -{ - MemoryContext context; - int context_id; -} MemoryStatsContextId; - -extern PGDLLIMPORT MemoryStatsBackendState *memCxtState; -extern PGDLLIMPORT MemoryStatsCtl *memCxtArea; -extern PGDLLIMPORT dsa_area *MemoryStatsDsaArea; -extern void ProcessGetMemoryContextInterrupt(void); -extern const char *ContextTypeToString(NodeTag type); -extern void HandleGetMemoryContextInterrupt(void); -extern Size MemoryContextReportingShmemSize(void); -extern void MemoryContextReportingShmemInit(void); -extern void AtProcExit_memstats_cleanup(int code, Datum arg); #endif /* MEMUTILS_H */ diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h index e1b42267b22..039b9cba61a 100644 --- a/src/include/utils/palloc.h +++ b/src/include/utils/palloc.h @@ -133,6 +133,8 @@ MemoryContextSwitchTo(MemoryContext context) /* Registration of memory context reset/delete callbacks */ extern void MemoryContextRegisterResetCallback(MemoryContext context, MemoryContextCallback *cb); +extern void MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb); /* * These are like standard strdup() except the copied string is diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 7b8cbf58d2c..2b072cafb4d 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -12,7 +12,14 @@ #ifndef _PG_LOCALE_ #define _PG_LOCALE_ +#include "mb/pg_wchar.h" + #ifdef USE_ICU +/* only include the C APIs, to avoid errors in cpluspluscheck */ +#undef U_SHOW_CPLUSPLUS_API +#define U_SHOW_CPLUSPLUS_API 0 +#undef U_SHOW_CPLUSPLUS_HEADER_API +#define U_SHOW_CPLUSPLUS_HEADER_API 0 #include <unicode/ucol.h> #endif @@ -77,6 +84,52 @@ struct collate_methods bool strxfrm_is_safe; }; +struct ctype_methods +{ + /* case mapping: LOWER()/INITCAP()/UPPER() */ + size_t (*strlower) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strtitle) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strupper) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strfold) (char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + + /* required */ + bool (*wc_isdigit) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isalpha) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isalnum) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isupper) (pg_wchar wc, pg_locale_t locale); + bool (*wc_islower) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isgraph) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isprint) (pg_wchar wc, pg_locale_t locale); + bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale); + bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale); + + /* required */ + bool (*char_is_cased) (char ch, pg_locale_t locale); + + /* + * Optional. If defined, will only be called for single-byte encodings. If + * not defined, or if the encoding is multibyte, will fall back to + * pg_strlower(). 
+ */ + char (*char_tolower) (unsigned char ch, pg_locale_t locale); + + /* + * For regex and pattern matching efficiency, the maximum char value + * supported by the above methods. If zero, limit is set by regex code. + */ + pg_wchar max_chr; +}; + /* * We use a discriminated union to hold either a locale_t or an ICU collator. * pg_locale_t is occasionally checked for truth, so make it a pointer. @@ -95,13 +148,13 @@ struct collate_methods */ struct pg_locale_struct { - char provider; bool deterministic; bool collate_is_c; bool ctype_is_c; bool is_default; const struct collate_methods *collate; /* NULL if collate_is_c */ + const struct ctype_methods *ctype; /* NULL if ctype_is_c */ union { @@ -125,6 +178,10 @@ extern void init_database_collation(void); extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); + +extern bool char_is_cased(char ch, pg_locale_t locale); +extern bool char_tolower_enabled(pg_locale_t locale); +extern char char_tolower(unsigned char ch, pg_locale_t locale); extern size_t pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -159,8 +216,8 @@ extern void report_newlocale_failure(const char *localename); /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, - pg_locale_t locale); + locale_t loc); extern size_t char2wchar(wchar_t *to, size_t tolen, - const char *from, size_t fromlen, pg_locale_t locale); + const char *from, size_t fromlen, locale_t loc); #endif /* _PG_LOCALE_ */ diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index d5557e6e998..6cf00008f63 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -295,18 +295,11 @@ typedef struct PgStat_KindInfo * * Returns true if some of the stats could not be flushed, due to lock * contention for example. Optional. - */ - bool (*flush_static_cb) (bool nowait); - - /* - * For fixed-numbered or variable-numbered statistics: Check for pending - * stats in need of flush with flush_static_cb, when these do not use - * PgStat_EntryRef->pending. * - * Returns true if there are any stats pending for flush, triggering - * flush_static_cb. Optional. + * "pgstat_report_fixed" needs to be set to trigger the flush of pending + * stats. */ - bool (*have_static_pending_cb) (void); + bool (*flush_static_cb) (bool nowait); /* * For fixed-numbered statistics: Reset All. 
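The pgstat_internal.h changes in this hunk and the ones that follow fold the old have_static_pending_cb test into a single flag, pgstat_report_fixed (documented further down in this file's diff). The contract, roughly: the reporting path raises the flag when it accumulates backend-local pending data, and pgstat_report_stat() then invokes each kind's flush_static_cb. A self-contained sketch under those assumptions; everything here except pgstat_report_fixed and the callback signature is hypothetical:

#include <stdbool.h>

bool        pgstat_report_fixed = false;    /* really lives in pgstat.c */
static int  pending_writes = 0;             /* hypothetical pending counter */

/* reporting path: accumulate locally, then flag that a flush is needed */
static void
example_count_write(void)
{
    pending_writes++;
    pgstat_report_fixed = true;     /* pgstat_report_stat() will act on this */
}

/* flush_static_cb: return true if some stats could not be flushed yet */
static bool
example_flush_static_cb(bool nowait)
{
    /*
     * A real callback would try to lock the shared stats here and, when
     * nowait is set and the lock is busy, return true so the flush is
     * retried later.  The flag itself is reset by pgstat_report_stat(),
     * never by the callback, per the comment added below.
     */
    pending_writes = 0;             /* pretend this was copied to shmem */
    return false;
}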
@@ -627,7 +620,6 @@ extern void pgstat_archiver_snapshot_cb(void); extern bool pgstat_flush_backend(bool nowait, bits32 flags); extern bool pgstat_backend_flush_cb(bool nowait); -extern bool pgstat_backend_have_pending_cb(void); extern void pgstat_backend_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); @@ -676,7 +668,6 @@ extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); extern void pgstat_flush_io(bool nowait); -extern bool pgstat_io_have_pending_cb(void); extern bool pgstat_io_flush_cb(bool nowait); extern void pgstat_io_init_shmem_cb(void *stats); extern void pgstat_io_reset_all_cb(TimestampTz ts); @@ -738,7 +729,6 @@ extern PgStatShared_Common *pgstat_init_entry(PgStat_Kind kind, * Functions in pgstat_slru.c */ -extern bool pgstat_slru_have_pending_cb(void); extern bool pgstat_slru_flush_cb(bool nowait); extern void pgstat_slru_init_shmem_cb(void *stats); extern void pgstat_slru_reset_all_cb(TimestampTz ts); @@ -750,7 +740,6 @@ extern void pgstat_slru_snapshot_cb(void); */ extern void pgstat_wal_init_backend_cb(void); -extern bool pgstat_wal_have_pending_cb(void); extern bool pgstat_wal_flush_cb(bool nowait); extern void pgstat_wal_init_shmem_cb(void *stats); extern void pgstat_wal_reset_all_cb(TimestampTz ts); @@ -778,8 +767,23 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, uint64 obji * Variables in pgstat.c */ -extern PGDLLIMPORT PgStat_LocalState pgStatLocal; +/* + * Track if *any* pending fixed-numbered statistics should be flushed to + * shared memory. + * + * This flag can be switched to true by fixed-numbered statistics to let + * pgstat_report_stat() know if it needs to go through one round of + * reports, calling flush_static_cb for each fixed-numbered statistics + * kind. When this flag is not set, pgstat_report_stat() is able to do + * a fast exit, knowing that there are no pending fixed-numbered statistics. + * + * Statistics callbacks should never reset this flag; pgstat_report_stat() + * is in charge of doing that. + */ +extern PGDLLIMPORT bool pgstat_report_fixed; +/* Backend-local stats state */ +extern PGDLLIMPORT PgStat_LocalState pgStatLocal; /* * Implementation of inline functions declared above. diff --git a/src/include/utils/pgstat_kind.h b/src/include/utils/pgstat_kind.h index f44169fd5a3..eb5f0b3ae6d 100644 --- a/src/include/utils/pgstat_kind.h +++ b/src/include/utils/pgstat_kind.h @@ -18,7 +18,7 @@ /* Range of IDs allowed, for built-in and custom kinds */ #define PGSTAT_KIND_MIN 1 /* Minimum ID allowed */ -#define PGSTAT_KIND_MAX 256 /* Maximum ID allowed */ +#define PGSTAT_KIND_MAX 32 /* Maximum ID allowed */ /* use 0 for INVALID, to catch zero-initialized data */ #define PGSTAT_KIND_INVALID 0 @@ -46,7 +46,7 @@ /* Custom stats kinds */ /* Range of IDs allowed for custom stats kinds */ -#define PGSTAT_KIND_CUSTOM_MIN 128 +#define PGSTAT_KIND_CUSTOM_MIN 24 #define PGSTAT_KIND_CUSTOM_MAX PGSTAT_KIND_MAX #define PGSTAT_KIND_CUSTOM_SIZE (PGSTAT_KIND_CUSTOM_MAX - PGSTAT_KIND_CUSTOM_MIN + 1) @@ -55,7 +55,7 @@ * development and have not reserved their own unique kind ID yet. 
See: * https://wiki.postgresql.org/wiki/CustomCumulativeStats */ -#define PGSTAT_KIND_EXPERIMENTAL 128 +#define PGSTAT_KIND_EXPERIMENTAL 24 static inline bool pgstat_is_kind_builtin(PgStat_Kind kind) diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index 07ec5318db7..1baa6d50bfd 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -18,8 +18,6 @@ #include "access/tupdesc.h" #include "lib/ilist.h" #include "nodes/params.h" -#include "nodes/parsenodes.h" -#include "nodes/plannodes.h" #include "tcop/cmdtag.h" #include "utils/queryenvironment.h" #include "utils/resowner.h" @@ -153,11 +151,10 @@ typedef struct CachedPlanSource * The reference count includes both the link from the parent CachedPlanSource * (if any), and any active plan executions, so the plan can be discarded * exactly when refcount goes to zero. Both the struct itself and the - * subsidiary data, except the PlannedStmts in stmt_list live in the context - * denoted by the context field; the PlannedStmts live in the context denoted - * by stmt_context. Separate contexts makes it easy to free a no-longer-needed - * cached plan. (However, if is_oneshot is true, the context does not belong - * solely to the CachedPlan so no freeing is possible.) + * subsidiary data live in the context denoted by the context field. + * This makes it easy to free a no-longer-needed cached plan. (However, + * if is_oneshot is true, the context does not belong solely to the CachedPlan + * so no freeing is possible.) */ typedef struct CachedPlan { @@ -165,7 +162,6 @@ typedef struct CachedPlan List *stmt_list; /* list of PlannedStmts */ bool is_oneshot; /* is it a "oneshot" plan? */ bool is_saved; /* is CachedPlan in a long-lived context? */ - bool is_reused; /* is it a reused generic plan? */ bool is_valid; /* is the stmt_list currently valid? */ Oid planRoleId; /* Role ID the plan was created for */ bool dependsOnRole; /* is plan specific to that role? */ @@ -174,10 +170,6 @@ typedef struct CachedPlan int generation; /* parent's generation number for this plan */ int refcount; /* count of live references to this struct */ MemoryContext context; /* context containing this CachedPlan */ - MemoryContext stmt_context; /* context containing the PlannedStmts in - * stmt_list, but not the List itself which is - * in the above context; NULL if is_oneshot is - * true. */ } CachedPlan; /* @@ -249,10 +241,6 @@ extern CachedPlan *GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, ResourceOwner owner, QueryEnvironment *queryEnv); -extern PlannedStmt *UpdateCachedPlan(CachedPlanSource *plansource, - int query_index, - QueryEnvironment *queryEnv); - extern void ReleaseCachedPlan(CachedPlan *plan, ResourceOwner owner); extern bool CachedPlanAllowsSimpleValidityCheck(CachedPlanSource *plansource, @@ -265,30 +253,4 @@ extern bool CachedPlanIsSimplyValid(CachedPlanSource *plansource, extern CachedExpression *GetCachedExpression(Node *expr); extern void FreeCachedExpression(CachedExpression *cexpr); -/* - * CachedPlanRequiresLocking: should the executor acquire additional locks? - * - * If the plan is a saved generic plan, the executor must acquire locks for - * relations that are not covered by AcquireExecutorLocks(), such as partitions - * that are subject to initial runtime pruning. - */ -static inline bool -CachedPlanRequiresLocking(CachedPlan *cplan) -{ - return !cplan->is_oneshot && cplan->is_reused; -} - -/* - * CachedPlanValid - * Returns whether a cached generic plan is still valid. 
- * - * Invoked by the executor to check if the plan has not been invalidated after - * taking locks during the initialization of the plan. - */ -static inline bool -CachedPlanValid(CachedPlan *cplan) -{ - return cplan->is_valid; -} - #endif /* PLANCACHE_H */ diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index ddee031f551..0b62143af8b 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -138,7 +138,6 @@ typedef struct PortalData QueryCompletion qc; /* command completion data for executed query */ List *stmts; /* list of PlannedStmts */ CachedPlan *cplan; /* CachedPlan, if stmts are from one */ - CachedPlanSource *plansource; /* CachedPlanSource, for cplan */ ParamListInfo portalParams; /* params to pass to query */ QueryEnvironment *queryEnv; /* environment for query */ @@ -241,8 +240,7 @@ extern void PortalDefineQuery(Portal portal, const char *sourceText, CommandTag commandTag, List *stmts, - CachedPlan *cplan, - CachedPlanSource *plansource); + CachedPlan *cplan); extern PlannedStmt *PortalGetPrimaryStmt(Portal portal); extern void PortalCreateHoldStore(Portal portal); extern void PortalHashTableDeleteAll(void); diff --git a/src/include/utils/skipsupport.h b/src/include/utils/skipsupport.h index bc51847cf61..c42be001fb5 100644 --- a/src/include/utils/skipsupport.h +++ b/src/include/utils/skipsupport.h @@ -90,7 +90,7 @@ typedef struct SkipSupportData */ SkipSupportIncDec decrement; SkipSupportIncDec increment; -} SkipSupportData; +} SkipSupportData; extern SkipSupport PrepareSkipSupportFromOpclass(Oid opfamily, Oid opcintype, bool reverse); diff --git a/src/include/utils/timestamp.h b/src/include/utils/timestamp.h index 8c205859c3b..93531732b08 100644 --- a/src/include/utils/timestamp.h +++ b/src/include/utils/timestamp.h @@ -144,6 +144,9 @@ extern int timestamp_cmp_internal(Timestamp dt1, Timestamp dt2); extern TimestampTz timestamp2timestamptz_opt_overflow(Timestamp timestamp, int *overflow); +extern Timestamp timestamptz2timestamp_opt_overflow(TimestampTz timestamp, + int *overflow); + extern int32 timestamp_cmp_timestamptz_internal(Timestamp timestampVal, TimestampTz dt2); diff --git a/src/include/varatt.h b/src/include/varatt.h index 2e8564d4998..aeeabf9145b 100644 --- a/src/include/varatt.h +++ b/src/include/varatt.h @@ -89,20 +89,35 @@ typedef enum vartag_external VARTAG_ONDISK = 18 } vartag_external; +/* Is a TOAST pointer either type of expanded-object pointer? */ /* this test relies on the specific tag values above */ -#define VARTAG_IS_EXPANDED(tag) \ - (((tag) & ~1) == VARTAG_EXPANDED_RO) +static inline bool +VARTAG_IS_EXPANDED(vartag_external tag) +{ + return ((tag & ~1) == VARTAG_EXPANDED_RO); +} -#define VARTAG_SIZE(tag) \ - ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ - VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ - (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ - (AssertMacro(false), 0)) +/* Size of the data part of a "TOAST pointer" datum */ +static inline Size +VARTAG_SIZE(vartag_external tag) +{ + if (tag == VARTAG_INDIRECT) + return sizeof(varatt_indirect); + else if (VARTAG_IS_EXPANDED(tag)) + return sizeof(varatt_expanded); + else if (tag == VARTAG_ONDISK) + return sizeof(varatt_external); + else + { + Assert(false); + return 0; + } +} /* * These structs describe the header of a varlena object that may have been * TOASTed. Generally, don't reference these structs directly, but use the - * macros below. + * functions and macros below. 
* * We use separate structs for the aligned and unaligned cases because the * compiler might otherwise think it could generate code that assumes @@ -166,7 +181,9 @@ typedef struct /* * Endian-dependent macros. These are considered internal --- use the - * external macros below instead of using these directly. + * external functions below instead of using these directly. All of these + * expect an argument that is a pointer, not a Datum. Some of them have + * multiple-evaluation hazards, too. * * Note: IS_1B is true for external toast records but VARSIZE_1B will return 0 * for such records. Hence you should usually check for IS_EXTERNAL before @@ -194,7 +211,7 @@ typedef struct #define VARSIZE_1B(PTR) \ (((varattrib_1b *) (PTR))->va_header & 0x7F) #define VARTAG_1B_E(PTR) \ - (((varattrib_1b_e *) (PTR))->va_tag) + ((vartag_external) ((varattrib_1b_e *) (PTR))->va_tag) #define SET_VARSIZE_4B(PTR,len) \ (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF) @@ -227,7 +244,7 @@ typedef struct #define VARSIZE_1B(PTR) \ ((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F) #define VARTAG_1B_E(PTR) \ - (((varattrib_1b_e *) (PTR))->va_tag) + ((vartag_external) ((varattrib_1b_e *) (PTR))->va_tag) #define SET_VARSIZE_4B(PTR,len) \ (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2)) @@ -247,19 +264,19 @@ typedef struct #define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data) /* - * Externally visible TOAST macros begin here. + * Externally visible TOAST functions and macros begin here. All of these + * were originally macros, accounting for the upper-case naming. + * + * Most of these functions accept a pointer to a value of a toastable data + * type. The caller's variable might be declared "text *" or the like, + * so we use "void *" here. Callers that are working with a Datum variable + * must apply DatumGetPointer before calling these functions. */ #define VARHDRSZ_EXTERNAL offsetof(varattrib_1b_e, va_data) #define VARHDRSZ_COMPRESSED offsetof(varattrib_4b, va_compressed.va_data) #define VARHDRSZ_SHORT offsetof(varattrib_1b, va_data) - #define VARATT_SHORT_MAX 0x7F -#define VARATT_CAN_MAKE_SHORT(PTR) \ - (VARATT_IS_4B_U(PTR) && \ - (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX) -#define VARATT_CONVERTED_SHORT_SIZE(PTR) \ - (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) /* * In consumers oblivious to data alignment, call PG_DETOAST_DATUM_PACKED(), @@ -272,70 +289,234 @@ typedef struct * Code assembling a new datum should call VARDATA() and SET_VARSIZE(). * (Datums begin life untoasted.) * - * Other macros here should usually be used only by tuple assembly/disassembly + * Other functions here should usually be used only by tuple assembly/disassembly * code and code that specifically wants to work with still-toasted Datums. 
*/ -#define VARDATA(PTR) VARDATA_4B(PTR) -#define VARSIZE(PTR) VARSIZE_4B(PTR) - -#define VARSIZE_SHORT(PTR) VARSIZE_1B(PTR) -#define VARDATA_SHORT(PTR) VARDATA_1B(PTR) - -#define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) -#define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) - -#define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) -#define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) -#define VARATT_IS_EXTERNAL_ONDISK(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) -#define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT) -#define VARATT_IS_EXTERNAL_EXPANDED_RO(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RO) -#define VARATT_IS_EXTERNAL_EXPANDED_RW(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RW) -#define VARATT_IS_EXTERNAL_EXPANDED(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) -#define VARATT_IS_EXTERNAL_NON_EXPANDED(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && !VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) -#define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) -#define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) - -#define SET_VARSIZE(PTR, len) SET_VARSIZE_4B(PTR, len) -#define SET_VARSIZE_SHORT(PTR, len) SET_VARSIZE_1B(PTR, len) -#define SET_VARSIZE_COMPRESSED(PTR, len) SET_VARSIZE_4B_C(PTR, len) - -#define SET_VARTAG_EXTERNAL(PTR, tag) SET_VARTAG_1B_E(PTR, tag) - -#define VARSIZE_ANY(PTR) \ - (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) : \ - (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) : \ - VARSIZE_4B(PTR))) - -/* Size of a varlena data, excluding header */ -#define VARSIZE_ANY_EXHDR(PTR) \ - (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL : \ - (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR)-VARHDRSZ_SHORT : \ - VARSIZE_4B(PTR)-VARHDRSZ)) +/* Size of a known-not-toasted varlena datum, including header */ +static inline Size +VARSIZE(const void *PTR) +{ + return VARSIZE_4B(PTR); +} + +/* Start of data area of a known-not-toasted varlena datum */ +static inline char * +VARDATA(const void *PTR) +{ + return VARDATA_4B(PTR); +} + +/* Size of a known-short-header varlena datum, including header */ +static inline Size +VARSIZE_SHORT(const void *PTR) +{ + return VARSIZE_1B(PTR); +} + +/* Start of data area of a known-short-header varlena datum */ +static inline char * +VARDATA_SHORT(const void *PTR) +{ + return VARDATA_1B(PTR); +} + +/* Type tag of a "TOAST pointer" datum */ +static inline vartag_external +VARTAG_EXTERNAL(const void *PTR) +{ + return VARTAG_1B_E(PTR); +} + +/* Size of a "TOAST pointer" datum, including header */ +static inline Size +VARSIZE_EXTERNAL(const void *PTR) +{ + return VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR)); +} + +/* Start of data area of a "TOAST pointer" datum */ +static inline char * +VARDATA_EXTERNAL(const void *PTR) +{ + return VARDATA_1B_E(PTR); +} + +/* Is varlena datum in inline-compressed format? */ +static inline bool +VARATT_IS_COMPRESSED(const void *PTR) +{ + return VARATT_IS_4B_C(PTR); +} + +/* Is varlena datum a "TOAST pointer" datum? */ +static inline bool +VARATT_IS_EXTERNAL(const void *PTR) +{ + return VARATT_IS_1B_E(PTR); +} + +/* Is varlena datum a pointer to on-disk toasted data? */ +static inline bool +VARATT_IS_EXTERNAL_ONDISK(const void *PTR) +{ + return VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK; +} + +/* Is varlena datum an indirect pointer? 
*/ +static inline bool +VARATT_IS_EXTERNAL_INDIRECT(const void *PTR) +{ + return VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT; +} + +/* Is varlena datum a read-only pointer to an expanded object? */ +static inline bool +VARATT_IS_EXTERNAL_EXPANDED_RO(const void *PTR) +{ + return VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RO; +} + +/* Is varlena datum a read-write pointer to an expanded object? */ +static inline bool +VARATT_IS_EXTERNAL_EXPANDED_RW(const void *PTR) +{ + return VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RW; +} + +/* Is varlena datum either type of pointer to an expanded object? */ +static inline bool +VARATT_IS_EXTERNAL_EXPANDED(const void *PTR) +{ + return VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR)); +} + +/* Is varlena datum a "TOAST pointer", but not for an expanded object? */ +static inline bool +VARATT_IS_EXTERNAL_NON_EXPANDED(const void *PTR) +{ + return VARATT_IS_EXTERNAL(PTR) && !VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR)); +} + +/* Is varlena datum a short-header datum? */ +static inline bool +VARATT_IS_SHORT(const void *PTR) +{ + return VARATT_IS_1B(PTR); +} + +/* Is varlena datum not in traditional (4-byte-header, uncompressed) format? */ +static inline bool +VARATT_IS_EXTENDED(const void *PTR) +{ + return !VARATT_IS_4B_U(PTR); +} + +/* Is varlena datum short enough to convert to short-header format? */ +static inline bool +VARATT_CAN_MAKE_SHORT(const void *PTR) +{ + return VARATT_IS_4B_U(PTR) && + (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX; +} + +/* Size that datum will have in short-header format, including header */ +static inline Size +VARATT_CONVERTED_SHORT_SIZE(const void *PTR) +{ + return VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT; +} + +/* Set the size (including header) of a 4-byte-header varlena datum */ +static inline void +SET_VARSIZE(void *PTR, Size len) +{ + SET_VARSIZE_4B(PTR, len); +} + +/* Set the size (including header) of a short-header varlena datum */ +static inline void +SET_VARSIZE_SHORT(void *PTR, Size len) +{ + SET_VARSIZE_1B(PTR, len); +} + +/* Set the size (including header) of an inline-compressed varlena datum */ +static inline void +SET_VARSIZE_COMPRESSED(void *PTR, Size len) +{ + SET_VARSIZE_4B_C(PTR, len); +} + +/* Set the type tag of a "TOAST pointer" datum */ +static inline void +SET_VARTAG_EXTERNAL(void *PTR, vartag_external tag) +{ + SET_VARTAG_1B_E(PTR, tag); +} + +/* Size of a varlena datum of any format, including header */ +static inline Size +VARSIZE_ANY(const void *PTR) +{ + if (VARATT_IS_1B_E(PTR)) + return VARSIZE_EXTERNAL(PTR); + else if (VARATT_IS_1B(PTR)) + return VARSIZE_1B(PTR); + else + return VARSIZE_4B(PTR); +} + +/* Size of a varlena datum of any format, excluding header */ +static inline Size +VARSIZE_ANY_EXHDR(const void *PTR) +{ + if (VARATT_IS_1B_E(PTR)) + return VARSIZE_EXTERNAL(PTR) - VARHDRSZ_EXTERNAL; + else if (VARATT_IS_1B(PTR)) + return VARSIZE_1B(PTR) - VARHDRSZ_SHORT; + else + return VARSIZE_4B(PTR) - VARHDRSZ; +} + +/* Start of data area of a plain or short-header varlena datum */ /* caution: this will not work on an external or compressed-in-line Datum */ /* caution: this will return a possibly unaligned pointer */ -#define VARDATA_ANY(PTR) \ - (VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR)) +static inline char * +VARDATA_ANY(const void *PTR) +{ + return VARATT_IS_1B(PTR) ? 
VARDATA_1B(PTR) : VARDATA_4B(PTR); +} -/* Decompressed size and compression method of a compressed-in-line Datum */ -#define VARDATA_COMPRESSED_GET_EXTSIZE(PTR) \ - (((varattrib_4b *) (PTR))->va_compressed.va_tcinfo & VARLENA_EXTSIZE_MASK) -#define VARDATA_COMPRESSED_GET_COMPRESS_METHOD(PTR) \ - (((varattrib_4b *) (PTR))->va_compressed.va_tcinfo >> VARLENA_EXTSIZE_BITS) +/* Decompressed size of a compressed-in-line varlena datum */ +static inline Size +VARDATA_COMPRESSED_GET_EXTSIZE(const void *PTR) +{ + return ((varattrib_4b *) PTR)->va_compressed.va_tcinfo & VARLENA_EXTSIZE_MASK; +} + +/* Compression method of a compressed-in-line varlena datum */ +static inline uint32 +VARDATA_COMPRESSED_GET_COMPRESS_METHOD(const void *PTR) +{ + return ((varattrib_4b *) PTR)->va_compressed.va_tcinfo >> VARLENA_EXTSIZE_BITS; +} /* Same for external Datums; but note argument is a struct varatt_external */ -#define VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer) \ - ((toast_pointer).va_extinfo & VARLENA_EXTSIZE_MASK) -#define VARATT_EXTERNAL_GET_COMPRESS_METHOD(toast_pointer) \ - ((toast_pointer).va_extinfo >> VARLENA_EXTSIZE_BITS) +static inline Size +VARATT_EXTERNAL_GET_EXTSIZE(struct varatt_external toast_pointer) +{ + return toast_pointer.va_extinfo & VARLENA_EXTSIZE_MASK; +} +static inline uint32 +VARATT_EXTERNAL_GET_COMPRESS_METHOD(struct varatt_external toast_pointer) +{ + return toast_pointer.va_extinfo >> VARLENA_EXTSIZE_BITS; +} + +/* Set size and compress method of an externally-stored varlena datum */ +/* This has to remain a macro; beware multiple evaluations! */ #define VARATT_EXTERNAL_SET_SIZE_AND_COMPRESS_METHOD(toast_pointer, len, cm) \ do { \ Assert((cm) == TOAST_PGLZ_COMPRESSION_ID || \ @@ -351,8 +532,11 @@ typedef struct * VARHDRSZ overhead, the former doesn't. We never use compression unless it * actually saves space, so we expect either equality or less-than. */ -#define VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer) \ - (VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer) < \ - (toast_pointer).va_rawsize - VARHDRSZ) +static inline bool +VARATT_EXTERNAL_IS_COMPRESSED(struct varatt_external toast_pointer) +{ + return VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer) < + (Size) (toast_pointer.va_rawsize - VARHDRSZ); +} #endif diff --git a/src/interfaces/ecpg/compatlib/informix.c b/src/interfaces/ecpg/compatlib/informix.c index e829d722f22..ca11a81f1bc 100644 --- a/src/interfaces/ecpg/compatlib/informix.c +++ b/src/interfaces/ecpg/compatlib/informix.c @@ -807,8 +807,10 @@ rfmtlong(long lng_val, const char *fmt, char *outbuf) if (strchr(fmt, (int) '(') && strchr(fmt, (int) ')')) brackets_ok = 1; - /* get position of the right-most dot in the format-string */ - /* and fill the temp-string wit '0's up to there. */ + /* + * get position of the right-most dot in the format-string and fill the + * temp-string with '0's up to there. + */ dotpos = getRightMostDot(fmt); /* start to parse the format-string */ diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index 2bbb70333dc..78de9f298ba 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -58,7 +58,12 @@ ecpg_get_connection_nr(const char *connection_name) for (con = all_connections; con != NULL; con = con->next) { - if (strcmp(connection_name, con->name) == 0) + /* + * Check for the case of a NULL connection name, stored as such in + * the connection information by ECPGconnect() when the database + * name is not specified by its caller. 
+ */ + if (con->name != NULL && strcmp(connection_name, con->name) == 0) break; } ret = con; @@ -259,7 +264,8 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p struct connection *this; int i, connect_params = 0; - char *dbname = name ? ecpg_strdup(name, lineno) : NULL, + bool alloc_failed = (sqlca == NULL); + char *dbname = name ? ecpg_strdup(name, lineno, &alloc_failed) : NULL, *host = NULL, *tmp, *port = NULL, @@ -268,11 +274,12 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p const char **conn_keywords; const char **conn_values; - if (sqlca == NULL) + if (alloc_failed) { ecpg_raise(lineno, ECPG_OUT_OF_MEMORY, ECPG_SQLSTATE_ECPG_OUT_OF_MEMORY, NULL); - ecpg_free(dbname); + if (dbname) + ecpg_free(dbname); return false; } @@ -297,7 +304,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (envname) { ecpg_free(dbname); - dbname = ecpg_strdup(envname, lineno); + dbname = ecpg_strdup(envname, lineno, &alloc_failed); } } @@ -349,7 +356,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p tmp = strrchr(dbname + offset, '?'); if (tmp != NULL) /* options given */ { - options = ecpg_strdup(tmp + 1, lineno); + options = ecpg_strdup(tmp + 1, lineno, &alloc_failed); *tmp = '\0'; } @@ -358,7 +365,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p { if (tmp[1] != '\0') /* non-empty database name */ { - realname = ecpg_strdup(tmp + 1, lineno); + realname = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; } *tmp = '\0'; @@ -368,7 +375,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p if (tmp != NULL) /* port number given */ { *tmp = '\0'; - port = ecpg_strdup(tmp + 1, lineno); + port = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; } @@ -402,7 +409,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p { if (*(dbname + offset) != '\0') { - host = ecpg_strdup(dbname + offset, lineno); + host = ecpg_strdup(dbname + offset, lineno, &alloc_failed); connect_params++; } } @@ -414,7 +421,7 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p tmp = strrchr(dbname, ':'); if (tmp != NULL) /* port number given */ { - port = ecpg_strdup(tmp + 1, lineno); + port = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; *tmp = '\0'; } @@ -422,14 +429,14 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p tmp = strrchr(dbname, '@'); if (tmp != NULL) /* host name given */ { - host = ecpg_strdup(tmp + 1, lineno); + host = ecpg_strdup(tmp + 1, lineno, &alloc_failed); connect_params++; *tmp = '\0'; } if (strlen(dbname) > 0) { - realname = ecpg_strdup(dbname, lineno); + realname = ecpg_strdup(dbname, lineno, &alloc_failed); connect_params++; } else @@ -460,7 +467,18 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p */ conn_keywords = (const char **) ecpg_alloc((connect_params + 1) * sizeof(char *), lineno); conn_values = (const char **) ecpg_alloc(connect_params * sizeof(char *), lineno); - if (conn_keywords == NULL || conn_values == NULL) + + /* Decide on a connection name */ + if (connection_name != NULL || realname != NULL) + { + this->name = ecpg_strdup(connection_name ? 
connection_name : realname, + lineno, &alloc_failed); + } + else + this->name = NULL; + + /* Deal with any failed allocations above */ + if (conn_keywords == NULL || conn_values == NULL || alloc_failed) { if (host) ecpg_free(host); @@ -476,6 +494,8 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p ecpg_free(conn_keywords); if (conn_values) ecpg_free(conn_values); + if (this->name) + ecpg_free(this->name); free(this); return false; } @@ -510,17 +530,14 @@ ECPGconnect(int lineno, int c, const char *name, const char *user, const char *p ecpg_free(conn_keywords); if (conn_values) ecpg_free(conn_values); + if (this->name) + ecpg_free(this->name); free(this); return false; } } #endif - if (connection_name != NULL) - this->name = ecpg_strdup(connection_name, lineno); - else - this->name = ecpg_strdup(realname, lineno); - this->cache_head = NULL; this->prep_stmts = NULL; diff --git a/src/interfaces/ecpg/ecpglib/descriptor.c b/src/interfaces/ecpg/ecpglib/descriptor.c index 651d5c8b2ed..466428edfeb 100644 --- a/src/interfaces/ecpg/ecpglib/descriptor.c +++ b/src/interfaces/ecpg/ecpglib/descriptor.c @@ -240,8 +240,9 @@ ECPGget_desc(int lineno, const char *desc_name, int index,...) act_tuple; struct variable data_var; struct sqlca_t *sqlca = ECPGget_sqlca(); + bool alloc_failed = (sqlca == NULL); - if (sqlca == NULL) + if (alloc_failed) { ecpg_raise(lineno, ECPG_OUT_OF_MEMORY, ECPG_SQLSTATE_ECPG_OUT_OF_MEMORY, NULL); @@ -493,7 +494,14 @@ ECPGget_desc(int lineno, const char *desc_name, int index,...) #ifdef WIN32 stmt.oldthreadlocale = _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); #endif - stmt.oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), lineno); + stmt.oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), + lineno, &alloc_failed); + if (alloc_failed) + { + va_end(args); + return false; + } + setlocale(LC_NUMERIC, "C"); #endif diff --git a/src/interfaces/ecpg/ecpglib/ecpglib_extern.h b/src/interfaces/ecpg/ecpglib/ecpglib_extern.h index 75cc68275bd..949ff66cefc 100644 --- a/src/interfaces/ecpg/ecpglib/ecpglib_extern.h +++ b/src/interfaces/ecpg/ecpglib/ecpglib_extern.h @@ -175,7 +175,7 @@ void ecpg_free(void *ptr); bool ecpg_init(const struct connection *con, const char *connection_name, const int lineno); -char *ecpg_strdup(const char *string, int lineno); +char *ecpg_strdup(const char *string, int lineno, bool *alloc_failed); const char *ecpg_type_name(enum ECPGttype typ); int ecpg_dynamic_type(Oid type); int sqlda_dynamic_type(Oid type, enum COMPAT_MODE compat); diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c index f52da06de9a..84a4a9fc578 100644 --- a/src/interfaces/ecpg/ecpglib/execute.c +++ b/src/interfaces/ecpg/ecpglib/execute.c @@ -860,9 +860,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari numeric *nval; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -923,9 +923,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari int slen; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -970,9 +970,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct 
vari int slen; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -1017,9 +1017,9 @@ ecpg_store_input(const int lineno, const bool force_indicator, const struct vari int slen; if (var->arrsize > 1) - mallocedval = ecpg_strdup("{", lineno); + mallocedval = ecpg_strdup("{", lineno, NULL); else - mallocedval = ecpg_strdup("", lineno); + mallocedval = ecpg_strdup("", lineno, NULL); if (!mallocedval) return false; @@ -2001,7 +2001,8 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, return false; } #endif - stmt->oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), lineno); + stmt->oldlocale = ecpg_strdup(setlocale(LC_NUMERIC, NULL), lineno, + NULL); if (stmt->oldlocale == NULL) { ecpg_do_epilogue(stmt); @@ -2030,7 +2031,14 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, statement_type = ECPGst_execute; } else - stmt->command = ecpg_strdup(query, lineno); + { + stmt->command = ecpg_strdup(query, lineno, NULL); + if (!stmt->command) + { + ecpg_do_epilogue(stmt); + return false; + } + } stmt->name = NULL; @@ -2042,7 +2050,12 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, if (command) { stmt->name = stmt->command; - stmt->command = ecpg_strdup(command, lineno); + stmt->command = ecpg_strdup(command, lineno, NULL); + if (!stmt->command) + { + ecpg_do_epilogue(stmt); + return false; + } } else { @@ -2175,7 +2188,12 @@ ecpg_do_prologue(int lineno, const int compat, const int force_indicator, if (!is_prepared_name_set && stmt->statement_type == ECPGst_prepare) { - stmt->name = ecpg_strdup(var->value, lineno); + stmt->name = ecpg_strdup(var->value, lineno, NULL); + if (!stmt->name) + { + ecpg_do_epilogue(stmt); + return false; + } is_prepared_name_set = true; } } diff --git a/src/interfaces/ecpg/ecpglib/memory.c b/src/interfaces/ecpg/ecpglib/memory.c index 6979be2c988..2112e55b6e4 100644 --- a/src/interfaces/ecpg/ecpglib/memory.c +++ b/src/interfaces/ecpg/ecpglib/memory.c @@ -43,8 +43,15 @@ ecpg_realloc(void *ptr, long size, int lineno) return new; } +/* + * Wrapper for strdup(), with NULL in input treated as a correct case. + * + * "alloc_failed" can be optionally specified by the caller to check for + * allocation failures. The caller is responsible for its initialization, + * as ecpg_strdup() may be called repeatedly across multiple allocations. 
+ */ char * -ecpg_strdup(const char *string, int lineno) +ecpg_strdup(const char *string, int lineno, bool *alloc_failed) { char *new; @@ -54,6 +61,8 @@ ecpg_strdup(const char *string, int lineno) new = strdup(string); if (!new) { + if (alloc_failed) + *alloc_failed = true; ecpg_raise(lineno, ECPG_OUT_OF_MEMORY, ECPG_SQLSTATE_ECPG_OUT_OF_MEMORY, NULL); return NULL; } diff --git a/src/interfaces/ecpg/ecpglib/prepare.c b/src/interfaces/ecpg/ecpglib/prepare.c index ea1146f520f..06f0135813b 100644 --- a/src/interfaces/ecpg/ecpglib/prepare.c +++ b/src/interfaces/ecpg/ecpglib/prepare.c @@ -85,9 +85,22 @@ ecpg_register_prepared_stmt(struct statement *stmt) /* create statement */ prep_stmt->lineno = lineno; prep_stmt->connection = con; - prep_stmt->command = ecpg_strdup(stmt->command, lineno); + prep_stmt->command = ecpg_strdup(stmt->command, lineno, NULL); + if (!prep_stmt->command) + { + ecpg_free(prep_stmt); + ecpg_free(this); + return false; + } prep_stmt->inlist = prep_stmt->outlist = NULL; - this->name = ecpg_strdup(stmt->name, lineno); + this->name = ecpg_strdup(stmt->name, lineno, NULL); + if (!this->name) + { + ecpg_free(prep_stmt->command); + ecpg_free(prep_stmt); + ecpg_free(this); + return false; + } this->stmt = prep_stmt; this->prepared = true; @@ -177,14 +190,27 @@ prepare_common(int lineno, struct connection *con, const char *name, const char /* create statement */ stmt->lineno = lineno; stmt->connection = con; - stmt->command = ecpg_strdup(variable, lineno); + stmt->command = ecpg_strdup(variable, lineno, NULL); + if (!stmt->command) + { + ecpg_free(stmt); + ecpg_free(this); + return false; + } stmt->inlist = stmt->outlist = NULL; /* if we have C variables in our statement replace them with '?' */ replace_variables(&(stmt->command), lineno); /* add prepared statement to our list */ - this->name = ecpg_strdup(name, lineno); + this->name = ecpg_strdup(name, lineno, NULL); + if (!this->name) + { + ecpg_free(stmt->command); + ecpg_free(stmt); + ecpg_free(this); + return false; + } this->stmt = stmt; /* and finally really prepare the statement */ @@ -540,7 +566,9 @@ AddStmtToCache(int lineno, /* line # of statement */ /* add the query to the entry */ entry = &stmtCacheEntries[entNo]; entry->lineno = lineno; - entry->ecpgQuery = ecpg_strdup(ecpgQuery, lineno); + entry->ecpgQuery = ecpg_strdup(ecpgQuery, lineno, NULL); + if (!entry->ecpgQuery) + return -1; entry->connection = connection; entry->execs = 0; memcpy(entry->stmtID, stmtID, sizeof(entry->stmtID)); @@ -567,14 +595,19 @@ ecpg_auto_prepare(int lineno, const char *connection_name, const int compat, cha ecpg_log("ecpg_auto_prepare on line %d: statement found in cache; entry %d\n", lineno, entNo); stmtID = stmtCacheEntries[entNo].stmtID; + *name = ecpg_strdup(stmtID, lineno, NULL); + if (*name == NULL) + return false; con = ecpg_get_connection(connection_name); prep = ecpg_find_prepared_statement(stmtID, con, NULL); /* This prepared name doesn't exist on this connection. 
*/ if (!prep && !prepare_common(lineno, con, stmtID, query)) + { + ecpg_free(*name); return false; + } - *name = ecpg_strdup(stmtID, lineno); } else { @@ -584,15 +617,22 @@ ecpg_auto_prepare(int lineno, const char *connection_name, const int compat, cha /* generate a statement ID */ sprintf(stmtID, "ecpg%d", nextStmtID++); + *name = ecpg_strdup(stmtID, lineno, NULL); + if (*name == NULL) + return false; if (!ECPGprepare(lineno, connection_name, 0, stmtID, query)) + { + ecpg_free(*name); return false; + } entNo = AddStmtToCache(lineno, stmtID, connection_name, compat, query); if (entNo < 0) + { + ecpg_free(*name); return false; - - *name = ecpg_strdup(stmtID, lineno); + } } /* increase usage counter */ diff --git a/src/interfaces/libpq-oauth/.gitignore b/src/interfaces/libpq-oauth/.gitignore new file mode 100644 index 00000000000..a4afe7c1c68 --- /dev/null +++ b/src/interfaces/libpq-oauth/.gitignore @@ -0,0 +1 @@ +/exports.list diff --git a/src/interfaces/libpq-oauth/Makefile b/src/interfaces/libpq-oauth/Makefile index 270fc0cf2d9..682f17413b3 100644 --- a/src/interfaces/libpq-oauth/Makefile +++ b/src/interfaces/libpq-oauth/Makefile @@ -24,7 +24,7 @@ NAME = pq-oauth-$(MAJORVERSION) override shlib := lib$(NAME)$(DLSUFFIX) override stlib := libpq-oauth.a -override CPPFLAGS := -I$(libpq_srcdir) -I$(top_builddir)/src/port $(LIBCURL_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS := -I$(libpq_srcdir) -I$(top_builddir)/src/port $(CPPFLAGS) $(LIBCURL_CPPFLAGS) OBJS = \ $(WIN32RES) diff --git a/src/interfaces/libpq-oauth/oauth-curl.c b/src/interfaces/libpq-oauth/oauth-curl.c index d13b9cbabb4..dba9a684fa8 100644 --- a/src/interfaces/libpq-oauth/oauth-curl.c +++ b/src/interfaces/libpq-oauth/oauth-curl.c @@ -83,6 +83,20 @@ #define MAX_OAUTH_RESPONSE_SIZE (256 * 1024) /* + * Similarly, a limit on the maximum JSON nesting level keeps a server from + * running us out of stack space. A common nesting level in practice is 2 (for a + * top-level object containing arrays of strings). As of May 2025, the maximum + * depth for standard server metadata appears to be 6, if the document contains + * a full JSON Web Key Set in its "jwks" parameter. + * + * Since it's easy to nest JSON, and the number of parameters and key types + * keeps growing, take a healthy buffer of 16. (If this ever proves to be a + * problem in practice, we may want to switch over to the incremental JSON + * parser instead of playing with this parameter.) + */ +#define MAX_OAUTH_NESTING_LEVEL 16 + +/* * Parsed JSON Representations * * As a general rule, we parse and cache only the fields we're currently using. 
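To make the MAX_OAUTH_NESTING_LEVEL accounting concrete: the object_start/array_start callbacks below increment ctx->nested, so the top-level object sits at level 1 and every contained object or array adds one. An illustrative fragment (not a real discovery document) annotated with those levels:

{                                        <- level 1
    "issuer": "https://example.com",
    "scopes_supported": ["openid"],      <- array at level 2
    "jwks": {                            <- level 2
        "keys": [                        <- level 3
            { "x5c": ["..."] }           <- object at 4, array at 5
        ]
    }
}

A full JSON Web Key Set can push this to the depth of 6 mentioned above, still comfortably under the limit of 16.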
@@ -495,6 +509,12 @@ oauth_json_object_start(void *state) } ++ctx->nested; + if (ctx->nested > MAX_OAUTH_NESTING_LEVEL) + { + oauth_parse_set_error(ctx, "JSON is too deeply nested"); + return JSON_SEM_ACTION_FAILED; + } + return JSON_SUCCESS; } @@ -599,6 +619,12 @@ oauth_json_array_start(void *state) } ++ctx->nested; + if (ctx->nested > MAX_OAUTH_NESTING_LEVEL) + { + oauth_parse_set_error(ctx, "JSON is too deeply nested"); + return JSON_SEM_ACTION_FAILED; + } + return JSON_SUCCESS; } diff --git a/src/interfaces/libpq/Makefile b/src/interfaces/libpq/Makefile index c6fe5fec7f6..da6650066d4 100644 --- a/src/interfaces/libpq/Makefile +++ b/src/interfaces/libpq/Makefile @@ -24,7 +24,7 @@ NAME= pq SO_MAJOR_VERSION= 5 SO_MINOR_VERSION= $(MAJORVERSION) -override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) -I$(top_builddir)/src/port -I$(top_srcdir)/src/port +override CPPFLAGS := -I$(srcdir) -I$(top_builddir)/src/port -I$(top_srcdir)/src/port $(CPPFLAGS) ifneq ($(PORTNAME), win32) override CFLAGS += $(PTHREAD_CFLAGS) endif @@ -87,7 +87,7 @@ endif # that are built correctly for use in a shlib. SHLIB_LINK_INTERNAL = -lpgcommon_shlib -lpgport_shlib ifneq ($(PORTNAME), win32) -SHLIB_LINK += $(filter -lcrypt -ldes -lcom_err -lcrypto -lk5crypto -lkrb5 -lgssapi_krb5 -lgss -lgssapi -lssl -lsocket -lnsl -lresolv -lintl -lm, $(LIBS)) $(LDAP_LIBS_FE) $(PTHREAD_LIBS) +SHLIB_LINK += $(filter -lcrypt -ldes -lcom_err -lcrypto -lk5crypto -lkrb5 -lgssapi_krb5 -lgss -lgssapi -lssl -lsocket -lnsl -lresolv -lintl -ldl -lm, $(LIBS)) $(LDAP_LIBS_FE) $(PTHREAD_LIBS) else SHLIB_LINK += $(filter -lcrypt -ldes -lcom_err -lcrypto -lk5crypto -lkrb5 -lgssapi32 -lssl -lsocket -lnsl -lresolv -lintl -lm $(PTHREAD_LIBS), $(LIBS)) $(LDAP_LIBS_FE) endif @@ -98,14 +98,21 @@ SHLIB_PREREQS = submake-libpgport SHLIB_EXPORTS = exports.txt +# Appends to a comma-separated list. +comma := , +define add_to_list +$(eval $1 := $(if $($1),$($1)$(comma) $2,$2)) +endef + ifeq ($(with_ssl),openssl) -PKG_CONFIG_REQUIRES_PRIVATE = libssl, libcrypto +$(call add_to_list,PKG_CONFIG_REQUIRES_PRIVATE,libssl) +$(call add_to_list,PKG_CONFIG_REQUIRES_PRIVATE,libcrypto) endif ifeq ($(with_libcurl),yes) # libpq.so doesn't link against libcurl, but libpq.a needs libpq-oauth, and # libpq-oauth needs libcurl. Put both into *.private. -PKG_CONFIG_REQUIRES_PRIVATE += libcurl +$(call add_to_list,PKG_CONFIG_REQUIRES_PRIVATE,libcurl) %.pc: override SHLIB_LINK_INTERNAL += -lpq-oauth endif diff --git a/src/interfaces/libpq/exports.txt b/src/interfaces/libpq/exports.txt index 0625cf39e9a..dbbae642d76 100644 --- a/src/interfaces/libpq/exports.txt +++ b/src/interfaces/libpq/exports.txt @@ -205,9 +205,8 @@ PQcancelFinish 202 PQsocketPoll 203 PQsetChunkedRowsMode 204 PQgetCurrentTimeUSec 205 -PQservice 206 -PQsetAuthDataHook 207 -PQgetAuthDataHook 208 -PQdefaultAuthDataHook 209 -PQfullProtocolVersion 210 -appendPQExpBufferVA 211 +PQsetAuthDataHook 206 +PQgetAuthDataHook 207 +PQdefaultAuthDataHook 208 +PQfullProtocolVersion 209 +appendPQExpBufferVA 210 diff --git a/src/interfaces/libpq/fe-auth-oauth.c b/src/interfaces/libpq/fe-auth-oauth.c index 9fbff89a21d..d146c5f567c 100644 --- a/src/interfaces/libpq/fe-auth-oauth.c +++ b/src/interfaces/libpq/fe-auth-oauth.c @@ -157,6 +157,14 @@ client_initial_response(PGconn *conn, bool discover) #define ERROR_SCOPE_FIELD "scope" #define ERROR_OPENID_CONFIGURATION_FIELD "openid-configuration" +/* + * Limit the maximum number of nested objects/arrays. 
Because OAUTHBEARER + * doesn't have any defined extensions for its JSON yet, we can be much more + * conservative here than with libpq-oauth's MAX_OAUTH_NESTING_LEVEL; we expect + * a nesting level of 1 in practice. + */ +#define MAX_SASL_NESTING_LEVEL 8 + struct json_ctx { char *errmsg; /* any non-NULL value stops all processing */ @@ -196,6 +204,9 @@ oauth_json_object_start(void *state) } ++ctx->nested; + if (ctx->nested > MAX_SASL_NESTING_LEVEL) + oauth_json_set_error(ctx, libpq_gettext("JSON is too deeply nested")); + return oauth_json_has_error(ctx) ? JSON_SEM_ACTION_FAILED : JSON_SUCCESS; } @@ -254,10 +265,23 @@ oauth_json_array_start(void *state) ctx->target_field_name); } + ++ctx->nested; + if (ctx->nested > MAX_SASL_NESTING_LEVEL) + oauth_json_set_error(ctx, libpq_gettext("JSON is too deeply nested")); + return oauth_json_has_error(ctx) ? JSON_SEM_ACTION_FAILED : JSON_SUCCESS; } static JsonParseErrorType +oauth_json_array_end(void *state) +{ + struct json_ctx *ctx = state; + + --ctx->nested; + return JSON_SUCCESS; +} + +static JsonParseErrorType oauth_json_scalar(void *state, char *token, JsonTokenType type) { struct json_ctx *ctx = state; @@ -519,6 +543,7 @@ handle_oauth_sasl_error(PGconn *conn, const char *msg, int msglen) sem.object_end = oauth_json_object_end; sem.object_field_start = oauth_json_object_field_start; sem.array_start = oauth_json_array_start; + sem.array_end = oauth_json_array_end; sem.scalar = oauth_json_scalar; err = pg_parse_json(lex, &sem); diff --git a/src/interfaces/libpq/fe-cancel.c b/src/interfaces/libpq/fe-cancel.c index 8c7c198a530..c872a0267f0 100644 --- a/src/interfaces/libpq/fe-cancel.c +++ b/src/interfaces/libpq/fe-cancel.c @@ -114,7 +114,7 @@ PQcancelCreate(PGconn *conn) if (conn->be_cancel_key != NULL) { cancelConn->be_cancel_key = malloc(conn->be_cancel_key_len); - if (!conn->be_cancel_key) + if (cancelConn->be_cancel_key == NULL) goto oom_error; memcpy(cancelConn->be_cancel_key, conn->be_cancel_key, conn->be_cancel_key_len); } @@ -137,6 +137,7 @@ PQcancelCreate(PGconn *conn) goto oom_error; originalHost = conn->connhost[conn->whichhost]; + cancelConn->connhost[0].type = originalHost.type; if (originalHost.host) { cancelConn->connhost[0].host = strdup(originalHost.host); @@ -378,7 +379,24 @@ PQgetCancel(PGconn *conn) /* Check that we have received a cancellation key */ if (conn->be_cancel_key_len == 0) - return NULL; + { + /* + * In case there is no cancel key, return an all-zero PGcancel object. + * Actually calling PQcancel on this will fail, but we allow creating + * the PGcancel object anyway. Arguably it would be better to return + * NULL to indicate that cancellation is not possible, but there'd be + * no way for the caller to distinguish "out of memory" from "server + * did not send a cancel key". Also, this is how PQgetCancel() has + * always behaved, and if we changed it, some clients would stop + * working altogether with servers that don't support cancellation. + * (The modern PQcancelCreate() function returns a failed connection + * object instead.) + * + * The returned dummy object has cancel_pkt_len == 0; we check for + * that in PQcancel() to identify it as a dummy. 
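+ *
+ * For example (illustrative caller code, not part of this patch), the
+ * dummy is only detected when the caller actually attempts the cancel:
+ *
+ *     PGcancel *c = PQgetCancel(conn);
+ *     char errbuf[256];
+ *
+ *     if (c != NULL && !PQcancel(c, errbuf, sizeof(errbuf)))
+ *         fprintf(stderr, "%s\n", errbuf);
+ *     PQfreeCancel(c);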
+ */ + return calloc(1, sizeof(PGcancel)); + } cancel_req_len = offsetof(CancelRequestPacket, cancelAuthCode) + conn->be_cancel_key_len; cancel = malloc(offsetof(PGcancel, cancel_req) + cancel_req_len); @@ -543,6 +561,15 @@ PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) return false; } + if (cancel->cancel_pkt_len == 0) + { + /* This is a dummy PGcancel object, see PQgetCancel */ + strlcpy(errbuf, "PQcancel() -- no cancellation key received", errbufsize); + /* strlcpy probably doesn't change errno, but be paranoid */ + SOCK_ERRNO_SET(save_errno); + return false; + } + /* * We need to open a temporary connection to the postmaster. Do this with * only kernel calls. diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 430c0fa4442..afa85d9fca9 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -201,6 +201,10 @@ static const internalPQconninfoOption PQconninfoOptions[] = { "Database-Service", "", 20, offsetof(struct pg_conn, pgservice)}, + {"servicefile", "PGSERVICEFILE", NULL, NULL, + "Database-Service-File", "", 64, + offsetof(struct pg_conn, pgservicefile)}, + {"user", "PGUSER", NULL, NULL, "Database-User", "", 20, offsetof(struct pg_conn, pguser)}, @@ -2027,13 +2031,11 @@ pqConnectOptions2(PGconn *conn) if (len < 0) { libpq_append_conn_error(conn, "invalid SCRAM client key"); - free(conn->scram_client_key_binary); return false; } if (len != SCRAM_MAX_KEY_LEN) { libpq_append_conn_error(conn, "invalid SCRAM client key length: %d", len); - free(conn->scram_client_key_binary); return false; } conn->scram_client_key_len = len; @@ -2052,13 +2054,11 @@ pqConnectOptions2(PGconn *conn) if (len < 0) { libpq_append_conn_error(conn, "invalid SCRAM server key"); - free(conn->scram_server_key_binary); return false; } if (len != SCRAM_MAX_KEY_LEN) { libpq_append_conn_error(conn, "invalid SCRAM server key length: %d", len); - free(conn->scram_server_key_binary); return false; } conn->scram_server_key_len = len; @@ -2145,7 +2145,7 @@ pqConnectOptions2(PGconn *conn) if (conn->min_pversion > conn->max_pversion) { conn->status = CONNECTION_BAD; - libpq_append_conn_error(conn, "min_protocol_version is greater than max_protocol_version"); + libpq_append_conn_error(conn, "\"%s\" is greater than \"%s\"", "min_protocol_version", "max_protocol_version"); return false; } @@ -5053,21 +5053,20 @@ freePGconn(PGconn *conn) free(conn->events[i].name); } - release_conn_addrinfo(conn); - pqReleaseConnHosts(conn); - - free(conn->client_encoding_initial); - free(conn->events); + /* free everything not freed in pqClosePGconn */ free(conn->pghost); free(conn->pghostaddr); free(conn->pgport); free(conn->connect_timeout); free(conn->pgtcp_user_timeout); + free(conn->client_encoding_initial); free(conn->pgoptions); free(conn->appname); free(conn->fbappname); free(conn->dbName); free(conn->replication); + free(conn->pgservice); + free(conn->pgservicefile); free(conn->pguser); if (conn->pgpass) { @@ -5082,8 +5081,9 @@ freePGconn(PGconn *conn) free(conn->keepalives_count); free(conn->sslmode); free(conn->sslnegotiation); - free(conn->sslcert); + free(conn->sslcompression); free(conn->sslkey); + free(conn->sslcert); if (conn->sslpassword) { explicit_bzero(conn->sslpassword, strlen(conn->sslpassword)); @@ -5093,32 +5093,40 @@ freePGconn(PGconn *conn) free(conn->sslrootcert); free(conn->sslcrl); free(conn->sslcrldir); - free(conn->sslcompression); free(conn->sslsni); free(conn->requirepeer); - free(conn->require_auth); - 
free(conn->ssl_min_protocol_version); - free(conn->ssl_max_protocol_version); free(conn->gssencmode); free(conn->krbsrvname); free(conn->gsslib); free(conn->gssdelegation); - free(conn->connip); - /* Note that conn->Pfdebug is not ours to close or free */ - free(conn->write_err_msg); - free(conn->inBuffer); - free(conn->outBuffer); - free(conn->rowBuf); + free(conn->min_protocol_version); + free(conn->max_protocol_version); + free(conn->ssl_min_protocol_version); + free(conn->ssl_max_protocol_version); free(conn->target_session_attrs); + free(conn->require_auth); free(conn->load_balance_hosts); free(conn->scram_client_key); free(conn->scram_server_key); + free(conn->sslkeylogfile); free(conn->oauth_issuer); free(conn->oauth_issuer_id); free(conn->oauth_discovery_uri); free(conn->oauth_client_id); free(conn->oauth_client_secret); free(conn->oauth_scope); + /* Note that conn->Pfdebug is not ours to close or free */ + free(conn->events); + pqReleaseConnHosts(conn); + free(conn->connip); + release_conn_addrinfo(conn); + free(conn->scram_client_key_binary); + free(conn->scram_server_key_binary); + /* if this is a cancel connection, be_cancel_key may still be allocated */ + free(conn->be_cancel_key); + free(conn->inBuffer); + free(conn->outBuffer); + free(conn->rowBuf); termPQExpBuffer(&conn->errorMessage); termPQExpBuffer(&conn->workBuffer); @@ -5147,6 +5155,7 @@ pqReleaseConnHosts(PGconn *conn) } } free(conn->connhost); + conn->connhost = NULL; } } @@ -5910,6 +5919,7 @@ static int parseServiceInfo(PQconninfoOption *options, PQExpBuffer errorMessage) { const char *service = conninfo_getval(options, "service"); + const char *service_fname = conninfo_getval(options, "servicefile"); char serviceFile[MAXPGPATH]; char *env; bool group_found = false; @@ -5929,10 +5939,13 @@ parseServiceInfo(PQconninfoOption *options, PQExpBuffer errorMessage) return 0; /* - * Try PGSERVICEFILE if specified, else try ~/.pg_service.conf (if that - * exists). + * First, try the "servicefile" option in the connection string. Then, + * try the PGSERVICEFILE environment variable. Finally, check + * ~/.pg_service.conf (if that exists). */ - if ((env = getenv("PGSERVICEFILE")) != NULL) + if (service_fname != NULL) + strlcpy(serviceFile, service_fname, sizeof(serviceFile)); + else if ((env = getenv("PGSERVICEFILE")) != NULL) strlcpy(serviceFile, env, sizeof(serviceFile)); else { @@ -6088,7 +6101,17 @@ parseServiceFile(const char *serviceFile, if (strcmp(key, "service") == 0) { libpq_append_error(errorMessage, - "nested service specifications not supported in service file \"%s\", line %d", + "nested \"service\" specifications not supported in service file \"%s\", line %d", serviceFile, linenr); result = 3; goto exit; } + + if (strcmp(key, "servicefile") == 0) + { + libpq_append_error(errorMessage, + "nested \"servicefile\" specifications not supported in service file \"%s\", line %d", serviceFile, linenr); result = 3; goto exit; + } @@ -6131,6 +6154,33 @@ parseServiceFile(const char *serviceFile, } exit: + + /* + * If a service has been successfully found, set the "servicefile" option + * if not already set. This matters when we use a default service file or + * PGSERVICEFILE, where we want to be able to track the value. 
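+ * (Once stored, the value appears among the connection's options, so a
+ * caller inspecting them, e.g. with PQconninfo(), can tell which service
+ * file actually supplied the definition.)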
+ */ + if (*group_found && result == 0) + { + for (i = 0; options[i].keyword; i++) + { + if (strcmp(options[i].keyword, "servicefile") != 0) + continue; + + /* If value is already set, nothing to do */ + if (options[i].val != NULL) + break; + + options[i].val = strdup(serviceFile); + if (options[i].val == NULL) + { + libpq_append_error(errorMessage, "out of memory"); + result = 3; + } + break; + } + } + fclose(f); return result; @@ -7458,14 +7508,6 @@ PQdb(const PGconn *conn) } char * -PQservice(const PGconn *conn) -{ - if (!conn) - return NULL; - return conn->pgservice; -} - -char * PQuser(const PGconn *conn) { if (!conn) @@ -7532,10 +7574,12 @@ PQport(const PGconn *conn) if (!conn) return NULL; - if (conn->connhost != NULL) + if (conn->connhost != NULL && + conn->connhost[conn->whichhost].port != NULL && + conn->connhost[conn->whichhost].port[0] != '\0') return conn->connhost[conn->whichhost].port; - return ""; + return DEF_PGPORT_STR; } /* diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c index c14e3c95250..dca44fdc5d2 100644 --- a/src/interfaces/libpq/fe-misc.c +++ b/src/interfaces/libpq/fe-misc.c @@ -553,9 +553,35 @@ pqPutMsgEnd(PGconn *conn) /* Make message eligible to send */ conn->outCount = conn->outMsgEnd; + /* If appropriate, try to push out some data */ if (conn->outCount >= 8192) { - int toSend = conn->outCount - (conn->outCount % 8192); + int toSend = conn->outCount; + + /* + * On Unix-pipe connections, it seems profitable to prefer sending + * pipe-buffer-sized packets not randomly-sized ones, so retain the + * last partial-8K chunk in our buffer for now. On TCP connections, + * the advantage of that is far less clear. Moreover, it flat out + * isn't safe when using SSL or GSSAPI, because those code paths have + * API stipulations that if they fail to send all the data that was + * offered in the previous write attempt, we mustn't offer less data + * in this write attempt. The previous write attempt might've been + * pqFlush attempting to send everything in the buffer, so we mustn't + * offer less now. (Presently, we won't try to use SSL or GSSAPI on + * Unix connections, so those checks are just Asserts. They'll have + * to become part of the regular if-test if we ever change that.) 
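+ *
+ * For example, with 20000 bytes buffered on a Unix-socket connection we
+ * would send 16384 bytes now and retain 3616; on a TCP, SSL, or GSSAPI
+ * connection we would offer all 20000 bytes.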
+ */ + if (conn->raddr.addr.ss_family == AF_UNIX) + { +#ifdef USE_SSL + Assert(!conn->ssl_in_use); +#endif +#ifdef ENABLE_GSS + Assert(!conn->gssenc); +#endif + toSend -= toSend % 8192; + } if (pqSendSome(conn, toSend) < 0) return EOF; diff --git a/src/interfaces/libpq/fe-protocol3.c b/src/interfaces/libpq/fe-protocol3.c index beb1c889aad..1599de757d1 100644 --- a/src/interfaces/libpq/fe-protocol3.c +++ b/src/interfaces/libpq/fe-protocol3.c @@ -1434,7 +1434,7 @@ pqGetNegotiateProtocolVersion3(PGconn *conn) /* 3.1 never existed, we went straight from 3.0 to 3.2 */ if (their_version == PG_PROTOCOL(3, 1)) { - libpq_append_conn_error(conn, "received invalid protocol negotiation message: server requests downgrade to non-existent 3.1 protocol version"); + libpq_append_conn_error(conn, "received invalid protocol negotiation message: server requested downgrade to non-existent 3.1 protocol version"); goto failure; } @@ -1452,9 +1452,10 @@ pqGetNegotiateProtocolVersion3(PGconn *conn) if (their_version < conn->min_pversion) { - libpq_append_conn_error(conn, "server only supports protocol version %d.%d, but min_protocol_version was set to %d.%d", + libpq_append_conn_error(conn, "server only supports protocol version %d.%d, but \"%s\" was set to %d.%d", PG_PROTOCOL_MAJOR(their_version), PG_PROTOCOL_MINOR(their_version), + "min_protocol_version", PG_PROTOCOL_MAJOR(conn->min_pversion), PG_PROTOCOL_MINOR(conn->min_pversion)); @@ -1476,7 +1477,7 @@ pqGetNegotiateProtocolVersion3(PGconn *conn) } if (strncmp(conn->workBuffer.data, "_pq_.", 5) != 0) { - libpq_append_conn_error(conn, "received invalid protocol negotiation message: server reported unsupported parameter name without a _pq_. prefix (\"%s\")", conn->workBuffer.data); + libpq_append_conn_error(conn, "received invalid protocol negotiation message: server reported unsupported parameter name without a \"%s\" prefix (\"%s\")", "_pq_.", conn->workBuffer.data); goto failure; } libpq_append_conn_error(conn, "received invalid protocol negotiation message: server reported an unsupported parameter that was not requested (\"%s\")", conn->workBuffer.data); diff --git a/src/interfaces/libpq/fe-secure-gssapi.c b/src/interfaces/libpq/fe-secure-gssapi.c index ce183bc04b4..bc9e1ce06fa 100644 --- a/src/interfaces/libpq/fe-secure-gssapi.c +++ b/src/interfaces/libpq/fe-secure-gssapi.c @@ -47,11 +47,18 @@ * don't want the other side to send arbitrarily huge packets as we * would have to allocate memory for them to then pass them to GSSAPI. * - * Therefore, these two #define's are effectively part of the protocol + * Therefore, this #define is effectively part of the protocol * spec and can't ever be changed. */ -#define PQ_GSS_SEND_BUFFER_SIZE 16384 -#define PQ_GSS_RECV_BUFFER_SIZE 16384 +#define PQ_GSS_MAX_PACKET_SIZE 16384 /* includes uint32 header word */ + +/* + * However, during the authentication exchange we must cope with whatever + * message size the GSSAPI library wants to send (because our protocol + * doesn't support splitting those messages). Depending on configuration + * those messages might be as much as 64kB. + */ +#define PQ_GSS_AUTH_BUFFER_SIZE 65536 /* includes uint32 header word */ /* * We need these state variables per-connection. To allow the functions @@ -105,9 +112,9 @@ pg_GSS_write(PGconn *conn, const void *ptr, size_t len) * again, so if it offers a len less than that, something is wrong. * * Note: it may seem attractive to report partial write completion once - * we've successfully sent any encrypted packets. 
However, that can cause - * problems for callers; notably, pqPutMsgEnd's heuristic to send only - * full 8K blocks interacts badly with such a hack. We won't save much, + * we've successfully sent any encrypted packets. However, doing that + * expands the state space of this processing and has been responsible for + * bugs in the past (cf. commit d053a879b). We won't save much, * typically, by letting callers discard data early, so don't risk it. */ if (len < PqGSSSendConsumed) @@ -203,11 +210,11 @@ pg_GSS_write(PGconn *conn, const void *ptr, size_t len) goto cleanup; } - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { libpq_append_conn_error(conn, "client tried to send oversize GSSAPI packet (%zu > %zu)", (size_t) output.length, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)); errno = EIO; /* for lack of a better idea */ goto cleanup; } @@ -342,11 +349,11 @@ pg_GSS_read(PGconn *conn, void *ptr, size_t len) /* Decode the packet length and check for overlength packet */ input.length = pg_ntoh32(*(uint32 *) PqGSSRecvBuffer); - if (input.length > PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)) + if (input.length > PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)) { libpq_append_conn_error(conn, "oversize GSSAPI packet sent by the server (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)); + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32)); errno = EIO; /* for lack of a better idea */ return -1; } @@ -485,12 +492,15 @@ pqsecure_open_gss(PGconn *conn) * initialize state variables. By malloc'ing the buffers separately, we * ensure that they are sufficiently aligned for the length-word accesses * that we do in some places in this file. + * + * We'll use PQ_GSS_AUTH_BUFFER_SIZE-sized buffers until transport + * negotiation is complete, then switch to PQ_GSS_MAX_PACKET_SIZE. */ if (PqGSSSendBuffer == NULL) { - PqGSSSendBuffer = malloc(PQ_GSS_SEND_BUFFER_SIZE); - PqGSSRecvBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); - PqGSSResultBuffer = malloc(PQ_GSS_RECV_BUFFER_SIZE); + PqGSSSendBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_AUTH_BUFFER_SIZE); if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) { libpq_append_conn_error(conn, "out of memory"); @@ -564,13 +574,13 @@ pqsecure_open_gss(PGconn *conn) * so leave a spot at the end for a NULL byte too) and report that * back to the caller. 
*/ - result = gss_read(conn, PqGSSRecvBuffer + PqGSSRecvLength, PQ_GSS_RECV_BUFFER_SIZE - PqGSSRecvLength - 1, &ret); + result = gss_read(conn, PqGSSRecvBuffer + PqGSSRecvLength, PQ_GSS_AUTH_BUFFER_SIZE - PqGSSRecvLength - 1, &ret); if (result != PGRES_POLLING_OK) return result; PqGSSRecvLength += ret; - Assert(PqGSSRecvLength < PQ_GSS_RECV_BUFFER_SIZE); + Assert(PqGSSRecvLength < PQ_GSS_AUTH_BUFFER_SIZE); PqGSSRecvBuffer[PqGSSRecvLength] = '\0'; appendPQExpBuffer(&conn->errorMessage, "%s\n", PqGSSRecvBuffer + 1); @@ -584,11 +594,11 @@ pqsecure_open_gss(PGconn *conn) /* Get the length and check for over-length packet */ input.length = pg_ntoh32(*(uint32 *) PqGSSRecvBuffer); - if (input.length > PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)) + if (input.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { libpq_append_conn_error(conn, "oversize GSSAPI packet sent by the server (%zu > %zu)", (size_t) input.length, - PQ_GSS_RECV_BUFFER_SIZE - sizeof(uint32)); + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)); return PGRES_POLLING_FAILED; } @@ -669,11 +679,32 @@ pqsecure_open_gss(PGconn *conn) gss_release_buffer(&minor, &output); /* + * Release the large authentication buffers and allocate the ones we + * want for normal operation. (This maneuver is safe only because + * pqDropConnection will drop the buffers; otherwise, during a + * reconnection we'd be at risk of using undersized buffers during + * negotiation.) + */ + free(PqGSSSendBuffer); + free(PqGSSRecvBuffer); + free(PqGSSResultBuffer); + PqGSSSendBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSRecvBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + PqGSSResultBuffer = malloc(PQ_GSS_MAX_PACKET_SIZE); + if (!PqGSSSendBuffer || !PqGSSRecvBuffer || !PqGSSResultBuffer) + { + libpq_append_conn_error(conn, "out of memory"); + return PGRES_POLLING_FAILED; + } + PqGSSSendLength = PqGSSSendNext = PqGSSSendConsumed = 0; + PqGSSRecvLength = PqGSSResultLength = PqGSSResultNext = 0; + + /* * Determine the max packet size which will fit in our buffer, after * accounting for the length. pg_GSS_write will need this. */ major = gss_wrap_size_limit(&minor, conn->gctx, 1, GSS_C_QOP_DEFAULT, - PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32), + PQ_GSS_MAX_PACKET_SIZE - sizeof(uint32), &PqGSSMaxPktSize); if (GSS_ERROR(major)) @@ -687,10 +718,11 @@ pqsecure_open_gss(PGconn *conn) } /* Must have output.length > 0 */ - if (output.length > PQ_GSS_SEND_BUFFER_SIZE - sizeof(uint32)) + if (output.length > PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)) { - pg_GSS_error(libpq_gettext("GSSAPI context establishment error"), - conn, major, minor); + libpq_append_conn_error(conn, "client tried to send oversize GSSAPI packet (%zu > %zu)", + (size_t) output.length, + PQ_GSS_AUTH_BUFFER_SIZE - sizeof(uint32)); gss_release_buffer(&minor, &output); return PGRES_POLLING_FAILED; } diff --git a/src/interfaces/libpq/fe-secure-openssl.c b/src/interfaces/libpq/fe-secure-openssl.c index 78f9e84eb35..51dd7b9fec0 100644 --- a/src/interfaces/libpq/fe-secure-openssl.c +++ b/src/interfaces/libpq/fe-secure-openssl.c @@ -693,34 +693,35 @@ static unsigned char alpn_protos[] = PG_ALPN_PROTOCOL_VECTOR; * purposes. The file will be written using the NSS keylog format. LibreSSL * 3.5 introduced a stub function to set the callback for OpenSSL compatibility * but the callback is never invoked. + * + * Error messages added to the connection object won't be printed anywhere if + * the connection is successful. Errors that occur while writing the key log + * are therefore printed to stderr. 
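+ *
+ * (The log file is opened with O_APPEND and created with mode 0600, so
+ * keys from successive connections are appended rather than clobbered,
+ * and the file is not world-readable.)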
*/ static void SSL_CTX_keylog_cb(const SSL *ssl, const char *line) { int fd; - mode_t old_umask; ssize_t rc; PGconn *conn = SSL_get_app_data(ssl); if (conn == NULL) return; - old_umask = umask(077); fd = open(conn->sslkeylogfile, O_WRONLY | O_APPEND | O_CREAT, 0600); - umask(old_umask); if (fd == -1) { - libpq_append_conn_error(conn, "could not open ssl keylog file \"%s\": %s", - conn->sslkeylogfile, pg_strerror(errno)); + fprintf(stderr, libpq_gettext("WARNING: could not open SSL key logging file \"%s\": %m\n"), + conn->sslkeylogfile); return; } /* line is guaranteed by OpenSSL to be NUL terminated */ rc = write(fd, line, strlen(line)); if (rc < 0) - libpq_append_conn_error(conn, "could not write to ssl keylog file \"%s\": %s", - conn->sslkeylogfile, pg_strerror(errno)); + fprintf(stderr, libpq_gettext("WARNING: could not write to SSL key logging file \"%s\": %m\n"), + conn->sslkeylogfile); else rc = write(fd, "\n", 1); (void) rc; /* silence compiler warnings */ @@ -1044,6 +1045,10 @@ initialize_SSL(PGconn *conn) } conn->ssl_in_use = true; + /* + * If SSL key logging is requested, set up the callback if a compatible + * version of OpenSSL is used and libpq was compiled to support it. + */ if (conn->sslkeylogfile && strlen(conn->sslkeylogfile) > 0) { #ifdef HAVE_SSL_CTX_SET_KEYLOG_CALLBACK @@ -1057,7 +1062,6 @@ initialize_SSL(PGconn *conn) #endif } - /* * SSL contexts are reference counted by OpenSSL. We can free it as soon * as we have created the SSL object, and it will stick around for as long diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h index 7d3a9df6fd5..af8004f952a 100644 --- a/src/interfaces/libpq/libpq-fe.h +++ b/src/interfaces/libpq/libpq-fe.h @@ -400,7 +400,6 @@ extern int PQrequestCancel(PGconn *conn); /* Accessor functions for PGconn objects */ extern char *PQdb(const PGconn *conn); -extern char *PQservice(const PGconn *conn); extern char *PQuser(const PGconn *conn); extern char *PQpass(const PGconn *conn); extern char *PQhost(const PGconn *conn); diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index a6cfd7f5c9d..a701c25038a 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -357,7 +357,8 @@ typedef struct pg_conn_host pg_conn_host_type type; /* type of host address */ char *host; /* host name or socket path */ char *hostaddr; /* host numeric IP address */ - char *port; /* port number (always provided) */ + char *port; /* port number (if NULL or empty, use + * DEF_PGPORT[_STR]) */ char *password; /* password for this host, read from the * password file; NULL if not sought or not * found in password file. */ @@ -389,6 +390,8 @@ struct pg_conn char *dbName; /* database name */ char *replication; /* connect as the replication standby? 
*/ char *pgservice; /* Postgres service, if any */ + char *pgservicefile; /* path to a service file containing + * service(s) */ char *pguser; /* Postgres username and password, if any */ char *pgpass; char *pgpassfile; /* path to a file containing password(s) */ diff --git a/src/interfaces/libpq/t/005_negotiate_encryption.pl b/src/interfaces/libpq/t/005_negotiate_encryption.pl index f6a453c1b41..ac6d8bcb4a6 100644 --- a/src/interfaces/libpq/t/005_negotiate_encryption.pl +++ b/src/interfaces/libpq/t/005_negotiate_encryption.pl @@ -107,7 +107,7 @@ $node->append_conf( listen_addresses = '$hostaddr' # Capturing the EVENTS that occur during tests requires these settings -log_connections = on +log_connections = 'receipt,authentication,authorization' log_disconnections = on trace_connection_negotiation = on lc_messages = 'C' diff --git a/src/interfaces/libpq/t/006_service.pl b/src/interfaces/libpq/t/006_service.pl index 4fe5adc5c2a..797e6232b8f 100644 --- a/src/interfaces/libpq/t/006_service.pl +++ b/src/interfaces/libpq/t/006_service.pl @@ -47,6 +47,19 @@ my $srvfile_default = "$td/pg_service.conf"; # Missing service file. my $srvfile_missing = "$td/pg_service_missing.conf"; +# Service file with nested "service" defined. +my $srvfile_nested = "$td/pg_service_nested.conf"; +copy($srvfile_valid, $srvfile_nested) + or die "Could not copy $srvfile_valid to $srvfile_nested: $!"; +append_to_file($srvfile_nested, 'service=invalid_srv' . $newline); + +# Service file with nested "servicefile" defined. +my $srvfile_nested_2 = "$td/pg_service_nested_2.conf"; +copy($srvfile_valid, $srvfile_nested_2) + or die "Could not copy $srvfile_valid to $srvfile_nested_2: $!"; +append_to_file($srvfile_nested_2, + 'servicefile=' . $srvfile_default . $newline); + # Set the fallback directory lookup of the service file to the temporary # directory of this test. PGSYSCONFDIR is used if the service file # defined in PGSERVICEFILE cannot be found, or when a service file is @@ -146,6 +159,85 @@ local $ENV{PGSERVICEFILE} = "$srvfile_empty"; unlink($srvfile_default); } +# Checks nested service file contents. +{ + local $ENV{PGSERVICEFILE} = $srvfile_nested; + + $dummy_node->connect_fails( + 'service=my_srv', + 'connection with "service" in nested service file', + expected_stderr => + qr/nested "service" specifications not supported in service file/); + + local $ENV{PGSERVICEFILE} = $srvfile_nested_2; + + $dummy_node->connect_fails( + 'service=my_srv', + 'connection with "servicefile" in nested service file', + expected_stderr => + qr/nested "servicefile" specifications not supported in service file/ + ); +} + +# Properly escape backslashes in the path, to ensure the generation of +# correct connection strings. +my $srvfile_win_cared = $srvfile_valid; +$srvfile_win_cared =~ s/\\/\\\\/g; + +# Checks that the "servicefile" option works as expected +{ + $dummy_node->connect_ok( + q{service=my_srv servicefile='} . $srvfile_win_cared . q{'}, + 'connection with valid servicefile in connection string', + sql => "SELECT 'connect3_1'", + expected_stdout => qr/connect3_1/); + + # Encode slashes and backslash + my $encoded_srvfile = $srvfile_valid =~ s{([\\/])}{ + $1 eq '/' ? '%2F' : '%5C' + }ger; + + # Additionally encode a colon in servicefile path of Windows + $encoded_srvfile =~ s/:/%3A/g; + + $dummy_node->connect_ok( + 'postgresql:///?service=my_srv&servicefile=' . 
$encoded_srvfile, + 'connection with valid servicefile in URI', + sql => "SELECT 'connect3_2'", + expected_stdout => qr/connect3_2/); + + local $ENV{PGSERVICE} = 'my_srv'; + $dummy_node->connect_ok( + q{servicefile='} . $srvfile_win_cared . q{'}, + 'connection with PGSERVICE and servicefile in connection string', + sql => "SELECT 'connect3_3'", + expected_stdout => qr/connect3_3/); + + $dummy_node->connect_ok( + 'postgresql://?servicefile=' . $encoded_srvfile, + 'connection with PGSERVICE and servicefile in URI', + sql => "SELECT 'connect3_4'", + expected_stdout => qr/connect3_4/); +} + +# Check that the "servicefile" option takes priority over the PGSERVICEFILE +# environment variable. +{ + local $ENV{PGSERVICEFILE} = 'non-existent-file.conf'; + + $dummy_node->connect_fails( + 'service=my_srv', + 'connection with invalid PGSERVICEFILE', + expected_stderr => + qr/service file "non-existent-file\.conf" not found/); + + $dummy_node->connect_ok( + q{service=my_srv servicefile='} . $srvfile_win_cared . q{'}, + 'connection with both servicefile and PGSERVICEFILE', + sql => "SELECT 'connect4_1'", + expected_stdout => qr/connect4_1/); +} + $node->teardown_node; done_testing(); diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index 91a8de1ee9b..54dbc059ada 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -6,7 +6,7 @@ # Emulation of PGAC_CHECK_STRIP strip_bin = find_program(get_option('STRIP'), required: false, native: true) -strip_cmd = strip_bin.found() ? [strip_bin.path()] : [':'] +strip_cmd = strip_bin.found() ? [strip_bin.full_path()] : [':'] working_strip = false if strip_bin.found() @@ -49,8 +49,8 @@ pgxs_kv = { 'PORTNAME': portname, 'PG_SYSROOT': pg_sysroot, - 'abs_top_builddir': meson.build_root(), - 'abs_top_srcdir': meson.source_root(), + 'abs_top_builddir': meson.project_build_root(), + 'abs_top_srcdir': meson.project_source_root(), 'enable_rpath': get_option('rpath') ? 'yes' : 'no', 'enable_nls': libintl.found() ? 'yes' : 'no', @@ -123,7 +123,7 @@ pgxs_kv = { if llvm.found() pgxs_kv += { - 'CLANG': clang.path(), + 'CLANG': clang.full_path(), 'CXX': ' '.join(cpp.cmd_array()), 'LLVM_BINPATH': llvm_binpath, } @@ -258,7 +258,7 @@ pgxs_deps = { pgxs_cdata = configuration_data(pgxs_kv) foreach b, p : pgxs_bins - pgxs_cdata.set(b, p.found() ? p.path() : '') + pgxs_cdata.set(b, p.found() ? 
p.full_path() : '') endforeach foreach pe : pgxs_empty diff --git a/src/makefiles/pgxs.mk b/src/makefiles/pgxs.mk index 0de3737e789..039cee3dfe5 100644 --- a/src/makefiles/pgxs.mk +++ b/src/makefiles/pgxs.mk @@ -376,10 +376,7 @@ endif ifdef REGRESS # things created by various check targets rm -rf $(pg_regress_clean_files) -ifeq ($(PORTNAME), win) - rm -f regress.def endif -endif # REGRESS ifdef TAP_TESTS rm -rf tmp_check/ endif diff --git a/src/pl/plperl/meson.build b/src/pl/plperl/meson.build index b463d4d56c5..7c4081c3460 100644 --- a/src/pl/plperl/meson.build +++ b/src/pl/plperl/meson.build @@ -96,7 +96,7 @@ tests += { 'plperl_transaction', 'plperl_env', ], - 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], + 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], }, } diff --git a/src/pl/plpgsql/src/expected/plpgsql_misc.out b/src/pl/plpgsql/src/expected/plpgsql_misc.out index a6511df08ec..ffb377f5f54 100644 --- a/src/pl/plpgsql/src/expected/plpgsql_misc.out +++ b/src/pl/plpgsql/src/expected/plpgsql_misc.out @@ -65,3 +65,39 @@ do $$ declare x public.foo%rowtype; begin end $$; ERROR: relation "public.foo" does not exist CONTEXT: compilation of PL/pgSQL function "inline_code_block" near line 1 do $$ declare x public.misc_table%rowtype; begin end $$; +-- Test handling of an unreserved keyword as a variable name +-- and record field name. +do $$ +declare + execute int; + r record; +begin + execute := 10; + raise notice 'execute = %', execute; + select 1 as strict into r; + raise notice 'r.strict = %', r.strict; +end $$; +NOTICE: execute = 10 +NOTICE: r.strict = 1 +-- Test handling of a reserved keyword as a record field name. +do $$ declare r record; +begin + select 1 as x, 2 as foreach into r; + raise notice 'r.x = %', r.x; + raise notice 'r.foreach = %', r.foreach; -- fails +end $$; +NOTICE: r.x = 1 +ERROR: field name "foreach" is a reserved key word +LINE 1: r.foreach + ^ +HINT: Use double quotes to quote it. +QUERY: r.foreach +CONTEXT: PL/pgSQL function inline_code_block line 5 at RAISE +do $$ declare r record; +begin + select 1 as x, 2 as foreach into r; + raise notice 'r.x = %', r.x; + raise notice 'r."foreach" = %', r."foreach"; -- ok +end $$; +NOTICE: r.x = 1 +NOTICE: r."foreach" = 2 diff --git a/src/pl/plpgsql/src/pl_comp.c b/src/pl/plpgsql/src/pl_comp.c index 519f7695d7c..f6976689a69 100644 --- a/src/pl/plpgsql/src/pl_comp.c +++ b/src/pl/plpgsql/src/pl_comp.c @@ -177,6 +177,7 @@ plpgsql_compile_callback(FunctionCallInfo fcinfo, yyscan_t scanner; Datum prosrcdatum; char *proc_source; + char *proc_signature; HeapTuple typeTup; Form_pg_type typeStruct; PLpgSQL_variable *var; @@ -223,16 +224,24 @@ plpgsql_compile_callback(FunctionCallInfo fcinfo, plpgsql_check_syntax = forValidator; plpgsql_curr_compile = function; + /* format_procedure leaks memory, so run it in temp context */ + proc_signature = format_procedure(fcinfo->flinfo->fn_oid); + /* * All the permanent output of compilation (e.g. parse tree) is kept in a * per-function memory context, so it can be reclaimed easily. + * + * While the func_cxt needs to be long-lived, we initially make it a child + * of the assumed-short-lived caller's context, and reparent it under + * CacheMemoryContext only upon success. This arrangement avoids memory + * leakage during compilation of a faulty function. 
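+ *
+ * (Deleting a memory context recursively deletes its children, so if
+ * compilation fails, cleanup of the caller's short-lived context
+ * reclaims the still-attached func_cxt automatically.)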
*/ - func_cxt = AllocSetContextCreate(TopMemoryContext, + func_cxt = AllocSetContextCreate(CurrentMemoryContext, "PL/pgSQL function", ALLOCSET_DEFAULT_SIZES); plpgsql_compile_tmp_cxt = MemoryContextSwitchTo(func_cxt); - function->fn_signature = format_procedure(fcinfo->flinfo->fn_oid); + function->fn_signature = pstrdup(proc_signature); MemoryContextSetIdentifier(func_cxt, function->fn_signature); function->fn_oid = fcinfo->flinfo->fn_oid; function->fn_input_collation = fcinfo->fncollation; @@ -704,6 +713,11 @@ plpgsql_compile_callback(FunctionCallInfo fcinfo, plpgsql_dumptree(function); /* + * All is well, so make the func_cxt long-lived + */ + MemoryContextSetParent(func_cxt, CacheMemoryContext); + + /* * Pop the error context stack */ error_context_stack = plerrcontext.previous; @@ -1201,17 +1215,22 @@ resolve_column_ref(ParseState *pstate, PLpgSQL_expr *expr, } /* - * We should not get here, because a RECFIELD datum should - * have been built at parse time for every possible qualified - * reference to fields of this record. But if we do, handle - * it like field-not-found: throw error or return NULL. + * Ideally we'd never get here, because a RECFIELD datum + * should have been built at parse time for every qualified + * reference to a field of this record that appears in the + * source text. However, plpgsql_yylex will not build such a + * datum unless the field name lexes as token type IDENT. + * Hence, if the would-be field name is a PL/pgSQL reserved + * word, we lose. Assume that that's what happened and tell + * the user to quote it, unless the caller prefers we just + * return NULL. */ if (error_if_no_field) ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("record \"%s\" has no field \"%s\"", - (nnames_field == 1) ? name1 : name2, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("field name \"%s\" is a reserved key word", colname), + errhint("Use double quotes to quote it."), parser_errposition(pstate, cref->location))); } break; @@ -1658,6 +1677,11 @@ plpgsql_parse_wordrowtype(char *ident) { Oid classOid; Oid typOid; + TypeName *typName; + MemoryContext oldCxt; + + /* Avoid memory leaks in long-term function context */ + oldCxt = MemoryContextSwitchTo(plpgsql_compile_tmp_cxt); /* * Look up the relation. Note that because relation rowtypes have the @@ -1680,9 +1704,12 @@ plpgsql_parse_wordrowtype(char *ident) errmsg("relation \"%s\" does not have a composite type", ident))); + typName = makeTypeName(ident); + + MemoryContextSwitchTo(oldCxt); + /* Build and return the row type struct */ - return plpgsql_build_datatype(typOid, -1, InvalidOid, - makeTypeName(ident)); + return plpgsql_build_datatype(typOid, -1, InvalidOid, typName); } /* ---------- @@ -1696,6 +1723,7 @@ plpgsql_parse_cwordrowtype(List *idents) Oid classOid; Oid typOid; RangeVar *relvar; + TypeName *typName; MemoryContext oldCxt; /* @@ -1718,11 +1746,12 @@ plpgsql_parse_cwordrowtype(List *idents) errmsg("relation \"%s\" does not have a composite type", relvar->relname))); + typName = makeTypeNameFromNameList(idents); + MemoryContextSwitchTo(oldCxt); /* Build and return the row type struct */ - return plpgsql_build_datatype(typOid, -1, InvalidOid, - makeTypeNameFromNameList(idents)); + return plpgsql_build_datatype(typOid, -1, InvalidOid, typName); } /* @@ -1937,6 +1966,8 @@ plpgsql_build_recfield(PLpgSQL_rec *rec, const char *fldname) * origtypname is the parsed form of what the user wrote as the type name. 
* It can be NULL if the type could not be a composite type, or if it was * identified by OID to begin with (e.g., it's a function argument type). + * origtypname is in short-lived storage and must be copied if we choose + * to incorporate it into the function's parse tree. */ PLpgSQL_type * plpgsql_build_datatype(Oid typeOid, int32 typmod, @@ -2055,7 +2086,7 @@ build_datatype(HeapTuple typeTup, int32 typmod, errmsg("type %s is not composite", format_type_be(typ->typoid)))); - typ->origtypname = origtypname; + typ->origtypname = copyObject(origtypname); typ->tcache = typentry; typ->tupdesc_id = typentry->tupDesc_identifier; } diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index bb99781c56e..d19425b7a71 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -5703,7 +5703,7 @@ exec_eval_expr(PLpgSQL_execstate *estate, /* * Else do it the hard way via exec_run_select */ - rc = exec_run_select(estate, expr, 2, NULL); + rc = exec_run_select(estate, expr, 0, NULL); if (rc != SPI_OK_SELECT) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -5757,6 +5757,10 @@ exec_eval_expr(PLpgSQL_execstate *estate, /* ---------- * exec_run_select Execute a select query + * + * Note: passing maxtuples different from 0 ("return all tuples") is + * deprecated because it will prevent parallel execution of the query. + * However, we retain the parameter in case we need it someday. * ---------- */ static int @@ -8606,6 +8610,15 @@ exec_set_found(PLpgSQL_execstate *estate, bool state) PLpgSQL_var *var; var = (PLpgSQL_var *) (estate->datums[estate->found_varno]); + + /* + * Use pg_assume() to avoid a spurious warning with some compilers, by + * telling the compiler that the VARATT_IS_EXTERNAL_NON_EXPANDED() branch + * in assign_simple_var() will never be reached when called from here, due + * to "found" being a boolean (i.e. a byvalue type), not a varlena. 
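+ *
+ * (Unlike Assert, pg_assume() performs no runtime check in non-assert
+ * builds; it only licenses the compiler to assume the condition holds.)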
+ */ + pg_assume(var->datatype->typlen != -1); + assign_simple_var(estate, var, BoolGetDatum(state), false, false); } diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y index 5612e66d023..17568d82554 100644 --- a/src/pl/plpgsql/src/pl_gram.y +++ b/src/pl/plpgsql/src/pl_gram.y @@ -1368,7 +1368,8 @@ for_control : for_variable K_IN int tok = yylex(&yylval, &yylloc, yyscanner); int tokloc = yylloc; - if (tok == K_EXECUTE) + if (tok_is_keyword(tok, &yylval, + K_EXECUTE, "execute")) { /* EXECUTE means it's a dynamic FOR loop */ PLpgSQL_stmt_dynfors *new; @@ -2135,7 +2136,8 @@ stmt_open : K_OPEN cursor_variable yyerror(&yylloc, NULL, yyscanner, "syntax error, expected \"FOR\""); tok = yylex(&yylval, &yylloc, yyscanner); - if (tok == K_EXECUTE) + if (tok_is_keyword(tok, &yylval, + K_EXECUTE, "execute")) { int endtoken; @@ -2536,6 +2538,7 @@ unreserved_keyword : | K_ERRCODE | K_ERROR | K_EXCEPTION + | K_EXECUTE | K_EXIT | K_FETCH | K_FIRST @@ -2581,6 +2584,7 @@ unreserved_keyword : | K_SLICE | K_SQLSTATE | K_STACKED + | K_STRICT | K_TABLE | K_TABLE_NAME | K_TYPE @@ -3514,7 +3518,8 @@ make_return_query_stmt(int location, YYSTYPE *yylvalp, YYLTYPE *yyllocp, yyscan_ new->stmtid = ++plpgsql_curr_compile->nstatements; /* check for RETURN QUERY EXECUTE */ - if ((tok = yylex(yylvalp, yyllocp, yyscanner)) != K_EXECUTE) + tok = yylex(yylvalp, yyllocp, yyscanner); + if (!tok_is_keyword(tok, yylvalp, K_EXECUTE, "execute")) { /* ordinary static query */ plpgsql_push_back_token(tok, yylvalp, yyllocp, yyscanner); @@ -3597,7 +3602,7 @@ read_into_target(PLpgSQL_variable **target, bool *strict, YYSTYPE *yylvalp, YYLT *strict = false; tok = yylex(yylvalp, yyllocp, yyscanner); - if (strict && tok == K_STRICT) + if (strict && tok_is_keyword(tok, yylvalp, K_STRICT, "strict")) { *strict = true; tok = yylex(yylvalp, yyllocp, yyscanner); @@ -3848,6 +3853,7 @@ parse_datatype(const char *string, int location, yyscan_t yyscanner) int32 typmod; sql_error_callback_arg cbarg; ErrorContextCallback syntax_errcontext; + MemoryContext oldCxt; cbarg.location = location; cbarg.yyscanner = yyscanner; @@ -3857,9 +3863,14 @@ parse_datatype(const char *string, int location, yyscan_t yyscanner) syntax_errcontext.previous = error_context_stack; error_context_stack = &syntax_errcontext; - /* Let the main parser try to parse it under standard SQL rules */ + /* + * Let the main parser try to parse it under standard SQL rules. The + * parser leaks memory, so run it in temp context. 
+ */ + oldCxt = MemoryContextSwitchTo(plpgsql_compile_tmp_cxt); typeName = typeStringToTypeName(string, NULL); typenameTypeIdAndMod(NULL, typeName, &type_id, &typmod); + MemoryContextSwitchTo(oldCxt); /* Restore former ereport callback */ error_context_stack = syntax_errcontext.previous; diff --git a/src/pl/plpgsql/src/pl_reserved_kwlist.h b/src/pl/plpgsql/src/pl_reserved_kwlist.h index ce7b0c9d331..f3ef2cbd8d7 100644 --- a/src/pl/plpgsql/src/pl_reserved_kwlist.h +++ b/src/pl/plpgsql/src/pl_reserved_kwlist.h @@ -33,7 +33,6 @@ PG_KEYWORD("case", K_CASE) PG_KEYWORD("declare", K_DECLARE) PG_KEYWORD("else", K_ELSE) PG_KEYWORD("end", K_END) -PG_KEYWORD("execute", K_EXECUTE) PG_KEYWORD("for", K_FOR) PG_KEYWORD("foreach", K_FOREACH) PG_KEYWORD("from", K_FROM) @@ -44,7 +43,6 @@ PG_KEYWORD("loop", K_LOOP) PG_KEYWORD("not", K_NOT) PG_KEYWORD("null", K_NULL) PG_KEYWORD("or", K_OR) -PG_KEYWORD("strict", K_STRICT) PG_KEYWORD("then", K_THEN) PG_KEYWORD("to", K_TO) PG_KEYWORD("using", K_USING) diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c index d08187dafcb..19825e5c718 100644 --- a/src/pl/plpgsql/src/pl_scanner.c +++ b/src/pl/plpgsql/src/pl_scanner.c @@ -53,7 +53,7 @@ IdentifierLookup plpgsql_IdentifierLookup = IDENTIFIER_LOOKUP_NORMAL; * We try to avoid reserving more keywords than we have to; but there's * little point in not reserving a word if it's reserved in the core grammar. * Currently, the following words are reserved here but not in the core: - * BEGIN BY DECLARE EXECUTE FOREACH IF LOOP STRICT WHILE + * BEGIN BY DECLARE FOREACH IF LOOP WHILE */ /* ScanKeywordList lookup data for PL/pgSQL keywords */ diff --git a/src/pl/plpgsql/src/pl_unreserved_kwlist.h b/src/pl/plpgsql/src/pl_unreserved_kwlist.h index 98f99ec470c..b48c5a645ff 100644 --- a/src/pl/plpgsql/src/pl_unreserved_kwlist.h +++ b/src/pl/plpgsql/src/pl_unreserved_kwlist.h @@ -58,6 +58,7 @@ PG_KEYWORD("elsif", K_ELSIF) PG_KEYWORD("errcode", K_ERRCODE) PG_KEYWORD("error", K_ERROR) PG_KEYWORD("exception", K_EXCEPTION) +PG_KEYWORD("execute", K_EXECUTE) PG_KEYWORD("exit", K_EXIT) PG_KEYWORD("fetch", K_FETCH) PG_KEYWORD("first", K_FIRST) @@ -103,6 +104,7 @@ PG_KEYWORD("scroll", K_SCROLL) PG_KEYWORD("slice", K_SLICE) PG_KEYWORD("sqlstate", K_SQLSTATE) PG_KEYWORD("stacked", K_STACKED) +PG_KEYWORD("strict", K_STRICT) PG_KEYWORD("table", K_TABLE) PG_KEYWORD("table_name", K_TABLE_NAME) PG_KEYWORD("type", K_TYPE) diff --git a/src/pl/plpgsql/src/sql/plpgsql_misc.sql b/src/pl/plpgsql/src/sql/plpgsql_misc.sql index d3a7f703a75..0bc39fcf325 100644 --- a/src/pl/plpgsql/src/sql/plpgsql_misc.sql +++ b/src/pl/plpgsql/src/sql/plpgsql_misc.sql @@ -37,3 +37,32 @@ do $$ declare x foo.bar%rowtype; begin end $$; do $$ declare x foo.bar.baz%rowtype; begin end $$; do $$ declare x public.foo%rowtype; begin end $$; do $$ declare x public.misc_table%rowtype; begin end $$; + +-- Test handling of an unreserved keyword as a variable name +-- and record field name. +do $$ +declare + execute int; + r record; +begin + execute := 10; + raise notice 'execute = %', execute; + select 1 as strict into r; + raise notice 'r.strict = %', r.strict; +end $$; + +-- Test handling of a reserved keyword as a record field name. 
+ +do $$ declare r record; +begin + select 1 as x, 2 as foreach into r; + raise notice 'r.x = %', r.x; + raise notice 'r.foreach = %', r.foreach; -- fails +end $$; + +do $$ declare r record; +begin + select 1 as x, 2 as foreach into r; + raise notice 'r.x = %', r.x; + raise notice 'r."foreach" = %', r."foreach"; -- ok +end $$; diff --git a/src/pl/plpython/Makefile b/src/pl/plpython/Makefile index f959083a0bd..25f295c3709 100644 --- a/src/pl/plpython/Makefile +++ b/src/pl/plpython/Makefile @@ -11,7 +11,7 @@ ifeq ($(PORTNAME), win32) override python_libspec = endif -override CPPFLAGS := -I. -I$(srcdir) $(python_includespec) $(CPPFLAGS) +override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) $(python_includespec) rpathdir = $(python_libdir) diff --git a/src/pl/plpython/expected/README b/src/pl/plpython/expected/README deleted file mode 100644 index 388c553a589..00000000000 --- a/src/pl/plpython/expected/README +++ /dev/null @@ -1,3 +0,0 @@ -Guide to alternative expected files: - -plpython_error_5.out Python 3.5 and newer diff --git a/src/pl/plpython/expected/plpython_error.out b/src/pl/plpython/expected/plpython_error.out index 68722b00097..fd9cd73be74 100644 --- a/src/pl/plpython/expected/plpython_error.out +++ b/src/pl/plpython/expected/plpython_error.out @@ -243,7 +243,7 @@ $$ plpy.nonexistent $$ LANGUAGE plpython3u; SELECT toplevel_attribute_error(); -ERROR: AttributeError: 'module' object has no attribute 'nonexistent' +ERROR: AttributeError: module 'plpy' has no attribute 'nonexistent' CONTEXT: Traceback (most recent call last): PL/Python function "toplevel_attribute_error", line 2, in <module> plpy.nonexistent diff --git a/src/pl/plpython/expected/plpython_error_5.out b/src/pl/plpython/expected/plpython_error_5.out deleted file mode 100644 index fd9cd73be74..00000000000 --- a/src/pl/plpython/expected/plpython_error_5.out +++ /dev/null @@ -1,460 +0,0 @@ --- test error handling, i forgot to restore Warn_restart in --- the trigger handler once. the errors and subsequent core dump were --- interesting. 
-/* Flat out Python syntax error - */ -CREATE FUNCTION python_syntax_error() RETURNS text - AS -'.syntaxerror' - LANGUAGE plpython3u; -ERROR: could not compile PL/Python function "python_syntax_error" -DETAIL: SyntaxError: invalid syntax (<string>, line 2) -/* With check_function_bodies = false the function should get defined - * and the error reported when called - */ -SET check_function_bodies = false; -CREATE FUNCTION python_syntax_error() RETURNS text - AS -'.syntaxerror' - LANGUAGE plpython3u; -SELECT python_syntax_error(); -ERROR: could not compile PL/Python function "python_syntax_error" -DETAIL: SyntaxError: invalid syntax (<string>, line 2) -/* Run the function twice to check if the hashtable entry gets cleaned up */ -SELECT python_syntax_error(); -ERROR: could not compile PL/Python function "python_syntax_error" -DETAIL: SyntaxError: invalid syntax (<string>, line 2) -RESET check_function_bodies; -/* Flat out syntax error - */ -CREATE FUNCTION sql_syntax_error() RETURNS text - AS -'plpy.execute("syntax error")' - LANGUAGE plpython3u; -SELECT sql_syntax_error(); -ERROR: spiexceptions.SyntaxError: syntax error at or near "syntax" -LINE 1: syntax error - ^ -QUERY: syntax error -CONTEXT: Traceback (most recent call last): - PL/Python function "sql_syntax_error", line 1, in <module> - plpy.execute("syntax error") -PL/Python function "sql_syntax_error" -/* check the handling of uncaught python exceptions - */ -CREATE FUNCTION exception_index_invalid(text) RETURNS text - AS -'return args[1]' - LANGUAGE plpython3u; -SELECT exception_index_invalid('test'); -ERROR: IndexError: list index out of range -CONTEXT: Traceback (most recent call last): - PL/Python function "exception_index_invalid", line 1, in <module> - return args[1] -PL/Python function "exception_index_invalid" -/* check handling of nested exceptions - */ -CREATE FUNCTION exception_index_invalid_nested() RETURNS text - AS -'rv = plpy.execute("SELECT test5(''foo'')") -return rv[0]' - LANGUAGE plpython3u; -SELECT exception_index_invalid_nested(); -ERROR: spiexceptions.UndefinedFunction: function test5(unknown) does not exist -LINE 1: SELECT test5('foo') - ^ -HINT: No function matches the given name and argument types. You might need to add explicit type casts. 
-QUERY: SELECT test5('foo') -CONTEXT: Traceback (most recent call last): - PL/Python function "exception_index_invalid_nested", line 1, in <module> - rv = plpy.execute("SELECT test5('foo')") -PL/Python function "exception_index_invalid_nested" -/* a typo - */ -CREATE FUNCTION invalid_type_uncaught(a text) RETURNS text - AS -'if "plan" not in SD: - q = "SELECT fname FROM users WHERE lname = $1" - SD["plan"] = plpy.prepare(q, [ "test" ]) -rv = plpy.execute(SD["plan"], [ a ]) -if len(rv): - return rv[0]["fname"] -return None -' - LANGUAGE plpython3u; -SELECT invalid_type_uncaught('rick'); -ERROR: spiexceptions.UndefinedObject: type "test" does not exist -CONTEXT: Traceback (most recent call last): - PL/Python function "invalid_type_uncaught", line 3, in <module> - SD["plan"] = plpy.prepare(q, [ "test" ]) -PL/Python function "invalid_type_uncaught" -/* for what it's worth catch the exception generated by - * the typo, and return None - */ -CREATE FUNCTION invalid_type_caught(a text) RETURNS text - AS -'if "plan" not in SD: - q = "SELECT fname FROM users WHERE lname = $1" - try: - SD["plan"] = plpy.prepare(q, [ "test" ]) - except plpy.SPIError as ex: - plpy.notice(str(ex)) - return None -rv = plpy.execute(SD["plan"], [ a ]) -if len(rv): - return rv[0]["fname"] -return None -' - LANGUAGE plpython3u; -SELECT invalid_type_caught('rick'); -NOTICE: type "test" does not exist - invalid_type_caught ---------------------- - -(1 row) - -/* for what it's worth catch the exception generated by - * the typo, and reraise it as a plain error - */ -CREATE FUNCTION invalid_type_reraised(a text) RETURNS text - AS -'if "plan" not in SD: - q = "SELECT fname FROM users WHERE lname = $1" - try: - SD["plan"] = plpy.prepare(q, [ "test" ]) - except plpy.SPIError as ex: - plpy.error(str(ex)) -rv = plpy.execute(SD["plan"], [ a ]) -if len(rv): - return rv[0]["fname"] -return None -' - LANGUAGE plpython3u; -SELECT invalid_type_reraised('rick'); -ERROR: plpy.Error: type "test" does not exist -CONTEXT: Traceback (most recent call last): - PL/Python function "invalid_type_reraised", line 6, in <module> - plpy.error(str(ex)) -PL/Python function "invalid_type_reraised" -/* no typo no messing about - */ -CREATE FUNCTION valid_type(a text) RETURNS text - AS -'if "plan" not in SD: - SD["plan"] = plpy.prepare("SELECT fname FROM users WHERE lname = $1", [ "text" ]) -rv = plpy.execute(SD["plan"], [ a ]) -if len(rv): - return rv[0]["fname"] -return None -' - LANGUAGE plpython3u; -SELECT valid_type('rick'); - valid_type ------------- - -(1 row) - -/* error in nested functions to get a traceback -*/ -CREATE FUNCTION nested_error() RETURNS text - AS -'def fun1(): - plpy.error("boom") - -def fun2(): - fun1() - -def fun3(): - fun2() - -fun3() -return "not reached" -' - LANGUAGE plpython3u; -SELECT nested_error(); -ERROR: plpy.Error: boom -CONTEXT: Traceback (most recent call last): - PL/Python function "nested_error", line 10, in <module> - fun3() - PL/Python function "nested_error", line 8, in fun3 - fun2() - PL/Python function "nested_error", line 5, in fun2 - fun1() - PL/Python function "nested_error", line 2, in fun1 - plpy.error("boom") -PL/Python function "nested_error" -/* raising plpy.Error is just like calling plpy.error -*/ -CREATE FUNCTION nested_error_raise() RETURNS text - AS -'def fun1(): - raise plpy.Error("boom") - -def fun2(): - fun1() - -def fun3(): - fun2() - -fun3() -return "not reached" -' - LANGUAGE plpython3u; -SELECT nested_error_raise(); -ERROR: plpy.Error: boom -CONTEXT: Traceback (most recent call last): - 
PL/Python function "nested_error_raise", line 10, in <module> - fun3() - PL/Python function "nested_error_raise", line 8, in fun3 - fun2() - PL/Python function "nested_error_raise", line 5, in fun2 - fun1() - PL/Python function "nested_error_raise", line 2, in fun1 - raise plpy.Error("boom") -PL/Python function "nested_error_raise" -/* using plpy.warning should not produce a traceback -*/ -CREATE FUNCTION nested_warning() RETURNS text - AS -'def fun1(): - plpy.warning("boom") - -def fun2(): - fun1() - -def fun3(): - fun2() - -fun3() -return "you''ve been warned" -' - LANGUAGE plpython3u; -SELECT nested_warning(); -WARNING: boom - nested_warning --------------------- - you've been warned -(1 row) - -/* AttributeError at toplevel used to give segfaults with the traceback -*/ -CREATE FUNCTION toplevel_attribute_error() RETURNS void AS -$$ -plpy.nonexistent -$$ LANGUAGE plpython3u; -SELECT toplevel_attribute_error(); -ERROR: AttributeError: module 'plpy' has no attribute 'nonexistent' -CONTEXT: Traceback (most recent call last): - PL/Python function "toplevel_attribute_error", line 2, in <module> - plpy.nonexistent -PL/Python function "toplevel_attribute_error" -/* Calling PL/Python functions from SQL and vice versa should not lose context. - */ -CREATE OR REPLACE FUNCTION python_traceback() RETURNS void AS $$ -def first(): - second() - -def second(): - third() - -def third(): - plpy.execute("select sql_error()") - -first() -$$ LANGUAGE plpython3u; -CREATE OR REPLACE FUNCTION sql_error() RETURNS void AS $$ -begin - select 1/0; -end -$$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION python_from_sql_error() RETURNS void AS $$ -begin - select python_traceback(); -end -$$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION sql_from_python_error() RETURNS void AS $$ -plpy.execute("select sql_error()") -$$ LANGUAGE plpython3u; -SELECT python_traceback(); -ERROR: spiexceptions.DivisionByZero: division by zero -CONTEXT: Traceback (most recent call last): - PL/Python function "python_traceback", line 11, in <module> - first() - PL/Python function "python_traceback", line 3, in first - second() - PL/Python function "python_traceback", line 6, in second - third() - PL/Python function "python_traceback", line 9, in third - plpy.execute("select sql_error()") -PL/Python function "python_traceback" -SELECT sql_error(); -ERROR: division by zero -CONTEXT: SQL statement "select 1/0" -PL/pgSQL function sql_error() line 3 at SQL statement -SELECT python_from_sql_error(); -ERROR: spiexceptions.DivisionByZero: division by zero -CONTEXT: Traceback (most recent call last): - PL/Python function "python_traceback", line 11, in <module> - first() - PL/Python function "python_traceback", line 3, in first - second() - PL/Python function "python_traceback", line 6, in second - third() - PL/Python function "python_traceback", line 9, in third - plpy.execute("select sql_error()") -PL/Python function "python_traceback" -SQL statement "select python_traceback()" -PL/pgSQL function python_from_sql_error() line 3 at SQL statement -SELECT sql_from_python_error(); -ERROR: spiexceptions.DivisionByZero: division by zero -CONTEXT: Traceback (most recent call last): - PL/Python function "sql_from_python_error", line 2, in <module> - plpy.execute("select sql_error()") -PL/Python function "sql_from_python_error" -/* check catching specific types of exceptions - */ -CREATE TABLE specific ( - i integer PRIMARY KEY -); -CREATE FUNCTION specific_exception(i integer) RETURNS void AS -$$ -from plpy import spiexceptions -try: - plpy.execute("insert 
into specific values (%s)" % (i or "NULL")); -except spiexceptions.NotNullViolation as e: - plpy.notice("Violated the NOT NULL constraint, sqlstate %s" % e.sqlstate) -except spiexceptions.UniqueViolation as e: - plpy.notice("Violated the UNIQUE constraint, sqlstate %s" % e.sqlstate) -$$ LANGUAGE plpython3u; -SELECT specific_exception(2); - specific_exception --------------------- - -(1 row) - -SELECT specific_exception(NULL); -NOTICE: Violated the NOT NULL constraint, sqlstate 23502 - specific_exception --------------------- - -(1 row) - -SELECT specific_exception(2); -NOTICE: Violated the UNIQUE constraint, sqlstate 23505 - specific_exception --------------------- - -(1 row) - -/* SPI errors in PL/Python functions should preserve the SQLSTATE value - */ -CREATE FUNCTION python_unique_violation() RETURNS void AS $$ -plpy.execute("insert into specific values (1)") -plpy.execute("insert into specific values (1)") -$$ LANGUAGE plpython3u; -CREATE FUNCTION catch_python_unique_violation() RETURNS text AS $$ -begin - begin - perform python_unique_violation(); - exception when unique_violation then - return 'ok'; - end; - return 'not reached'; -end; -$$ language plpgsql; -SELECT catch_python_unique_violation(); - catch_python_unique_violation -------------------------------- - ok -(1 row) - -/* manually starting subtransactions - a bad idea - */ -CREATE FUNCTION manual_subxact() RETURNS void AS $$ -plpy.execute("savepoint save") -plpy.execute("create table foo(x integer)") -plpy.execute("rollback to save") -$$ LANGUAGE plpython3u; -SELECT manual_subxact(); -ERROR: plpy.SPIError: SPI_execute failed: SPI_ERROR_TRANSACTION -CONTEXT: Traceback (most recent call last): - PL/Python function "manual_subxact", line 2, in <module> - plpy.execute("savepoint save") -PL/Python function "manual_subxact" -/* same for prepared plans - */ -CREATE FUNCTION manual_subxact_prepared() RETURNS void AS $$ -save = plpy.prepare("savepoint save") -rollback = plpy.prepare("rollback to save") -plpy.execute(save) -plpy.execute("create table foo(x integer)") -plpy.execute(rollback) -$$ LANGUAGE plpython3u; -SELECT manual_subxact_prepared(); -ERROR: plpy.SPIError: SPI_execute_plan failed: SPI_ERROR_TRANSACTION -CONTEXT: Traceback (most recent call last): - PL/Python function "manual_subxact_prepared", line 4, in <module> - plpy.execute(save) -PL/Python function "manual_subxact_prepared" -/* raising plpy.spiexception.* from python code should preserve sqlstate - */ -CREATE FUNCTION plpy_raise_spiexception() RETURNS void AS $$ -raise plpy.spiexceptions.DivisionByZero() -$$ LANGUAGE plpython3u; -DO $$ -BEGIN - SELECT plpy_raise_spiexception(); -EXCEPTION WHEN division_by_zero THEN - -- NOOP -END -$$ LANGUAGE plpgsql; -/* setting a custom sqlstate should be handled - */ -CREATE FUNCTION plpy_raise_spiexception_override() RETURNS void AS $$ -exc = plpy.spiexceptions.DivisionByZero() -exc.sqlstate = 'SILLY' -raise exc -$$ LANGUAGE plpython3u; -DO $$ -BEGIN - SELECT plpy_raise_spiexception_override(); -EXCEPTION WHEN SQLSTATE 'SILLY' THEN - -- NOOP -END -$$ LANGUAGE plpgsql; -/* test the context stack trace for nested execution levels - */ -CREATE FUNCTION notice_innerfunc() RETURNS int AS $$ -plpy.execute("DO LANGUAGE plpython3u $x$ plpy.notice('inside DO') $x$") -return 1 -$$ LANGUAGE plpython3u; -CREATE FUNCTION notice_outerfunc() RETURNS int AS $$ -plpy.execute("SELECT notice_innerfunc()") -return 1 -$$ LANGUAGE plpython3u; -\set SHOW_CONTEXT always -SELECT notice_outerfunc(); -NOTICE: inside DO -CONTEXT: PL/Python anonymous 
code block -SQL statement "DO LANGUAGE plpython3u $x$ plpy.notice('inside DO') $x$" -PL/Python function "notice_innerfunc" -SQL statement "SELECT notice_innerfunc()" -PL/Python function "notice_outerfunc" - notice_outerfunc ------------------- - 1 -(1 row) - -/* test error logged with an underlying exception that includes a detail - * string (bug #18070). - */ -CREATE FUNCTION python_error_detail() RETURNS SETOF text AS $$ - plan = plpy.prepare("SELECT to_date('xy', 'DD') d") - for row in plpy.cursor(plan): - yield row['d'] -$$ LANGUAGE plpython3u; -SELECT python_error_detail(); -ERROR: error fetching next item from iterator -DETAIL: spiexceptions.InvalidDatetimeFormat: invalid value "xy" for "DD" -CONTEXT: Traceback (most recent call last): -PL/Python function "python_error_detail" diff --git a/src/pl/plpython/plpy_cursorobject.c b/src/pl/plpython/plpy_cursorobject.c index 37d7efca77c..cc74c4df6ba 100644 --- a/src/pl/plpython/plpy_cursorobject.c +++ b/src/pl/plpython/plpy_cursorobject.c @@ -58,9 +58,9 @@ static PyType_Slot PLyCursor_slots[] = static PyType_Spec PLyCursor_spec = { .name = "PLyCursor", - .basicsize = sizeof(PLyCursorObject), - .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, - .slots = PLyCursor_slots, + .basicsize = sizeof(PLyCursorObject), + .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, + .slots = PLyCursor_slots, }; static PyTypeObject *PLy_CursorType; diff --git a/src/pl/plpython/plpy_elog.c b/src/pl/plpython/plpy_elog.c index ddf3573f0e7..f6d10045e5c 100644 --- a/src/pl/plpython/plpy_elog.c +++ b/src/pl/plpython/plpy_elog.c @@ -18,7 +18,8 @@ PyObject *PLy_exc_spi_error = NULL; static void PLy_traceback(PyObject *e, PyObject *v, PyObject *tb, - char **xmsg, char **tbmsg, int *tb_depth); + char *volatile *xmsg, char *volatile *tbmsg, + int *tb_depth); static void PLy_get_spi_error_data(PyObject *exc, int *sqlerrcode, char **detail, char **hint, char **query, int *position, char **schema_name, char **table_name, char **column_name, @@ -43,78 +44,82 @@ void PLy_elog_impl(int elevel, const char *fmt,...) 
{ int save_errno = errno; - char *xmsg; - char *tbmsg; + char *volatile xmsg = NULL; + char *volatile tbmsg = NULL; int tb_depth; StringInfoData emsg; PyObject *exc, *val, *tb; - const char *primary = NULL; - int sqlerrcode = 0; - char *detail = NULL; - char *hint = NULL; - char *query = NULL; - int position = 0; - char *schema_name = NULL; - char *table_name = NULL; - char *column_name = NULL; - char *datatype_name = NULL; - char *constraint_name = NULL; + + /* If we'll need emsg, must initialize it before entering PG_TRY */ + if (fmt) + initStringInfo(&emsg); PyErr_Fetch(&exc, &val, &tb); - if (exc != NULL) + /* Use a PG_TRY block to ensure we release the PyObjects just acquired */ + PG_TRY(); { - PyErr_NormalizeException(&exc, &val, &tb); - - if (PyErr_GivenExceptionMatches(val, PLy_exc_spi_error)) - PLy_get_spi_error_data(val, &sqlerrcode, - &detail, &hint, &query, &position, + const char *primary = NULL; + int sqlerrcode = 0; + char *detail = NULL; + char *hint = NULL; + char *query = NULL; + int position = 0; + char *schema_name = NULL; + char *table_name = NULL; + char *column_name = NULL; + char *datatype_name = NULL; + char *constraint_name = NULL; + + if (exc != NULL) + { + PyErr_NormalizeException(&exc, &val, &tb); + + if (PyErr_GivenExceptionMatches(val, PLy_exc_spi_error)) + PLy_get_spi_error_data(val, &sqlerrcode, + &detail, &hint, &query, &position, + &schema_name, &table_name, &column_name, + &datatype_name, &constraint_name); + else if (PyErr_GivenExceptionMatches(val, PLy_exc_error)) + PLy_get_error_data(val, &sqlerrcode, &detail, &hint, &schema_name, &table_name, &column_name, &datatype_name, &constraint_name); - else if (PyErr_GivenExceptionMatches(val, PLy_exc_error)) - PLy_get_error_data(val, &sqlerrcode, &detail, &hint, - &schema_name, &table_name, &column_name, - &datatype_name, &constraint_name); - else if (PyErr_GivenExceptionMatches(val, PLy_exc_fatal)) - elevel = FATAL; - } + else if (PyErr_GivenExceptionMatches(val, PLy_exc_fatal)) + elevel = FATAL; + } - /* this releases our refcount on tb! */ - PLy_traceback(exc, val, tb, - &xmsg, &tbmsg, &tb_depth); + PLy_traceback(exc, val, tb, + &xmsg, &tbmsg, &tb_depth); - if (fmt) - { - initStringInfo(&emsg); - for (;;) + if (fmt) { - va_list ap; - int needed; - - errno = save_errno; - va_start(ap, fmt); - needed = appendStringInfoVA(&emsg, dgettext(TEXTDOMAIN, fmt), ap); - va_end(ap); - if (needed == 0) - break; - enlargeStringInfo(&emsg, needed); - } - primary = emsg.data; + for (;;) + { + va_list ap; + int needed; + + errno = save_errno; + va_start(ap, fmt); + needed = appendStringInfoVA(&emsg, dgettext(TEXTDOMAIN, fmt), ap); + va_end(ap); + if (needed == 0) + break; + enlargeStringInfo(&emsg, needed); + } + primary = emsg.data; - /* If there's an exception message, it goes in the detail. */ - if (xmsg) - detail = xmsg; - } - else - { - if (xmsg) - primary = xmsg; - } + /* If there's an exception message, it goes in the detail. */ + if (xmsg) + detail = xmsg; + } + else + { + if (xmsg) + primary = xmsg; + } - PG_TRY(); - { ereport(elevel, (errcode(sqlerrcode ? sqlerrcode : ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), errmsg_internal("%s", primary ? primary : "no exception data"), @@ -136,14 +141,23 @@ PLy_elog_impl(int elevel, const char *fmt,...) 
} PG_FINALLY(); { + Py_XDECREF(exc); + Py_XDECREF(val); + /* Must release all the objects in the traceback stack */ + while (tb != NULL && tb != Py_None) + { + PyObject *tb_prev = tb; + + tb = PyObject_GetAttrString(tb, "tb_next"); + Py_DECREF(tb_prev); + } + /* For neatness' sake, also release our string buffers */ if (fmt) pfree(emsg.data); if (xmsg) pfree(xmsg); if (tbmsg) pfree(tbmsg); - Py_XDECREF(exc); - Py_XDECREF(val); } PG_END_TRY(); } @@ -154,21 +168,14 @@ PLy_elog_impl(int elevel, const char *fmt,...) * The exception error message is returned in xmsg, the traceback in * tbmsg (both as palloc'd strings) and the traceback depth in * tb_depth. - * - * We release refcounts on all the Python objects in the traceback stack, - * but not on e or v. */ static void PLy_traceback(PyObject *e, PyObject *v, PyObject *tb, - char **xmsg, char **tbmsg, int *tb_depth) + char *volatile *xmsg, char *volatile *tbmsg, int *tb_depth) { - PyObject *e_type_o; - PyObject *e_module_o; - char *e_type_s = NULL; - char *e_module_s = NULL; - PyObject *vob = NULL; - char *vstr; - StringInfoData xstr; + PyObject *volatile e_type_o = NULL; + PyObject *volatile e_module_o = NULL; + PyObject *volatile vob = NULL; StringInfoData tbstr; /* @@ -186,47 +193,59 @@ PLy_traceback(PyObject *e, PyObject *v, PyObject *tb, /* * Format the exception and its value and put it in xmsg. */ - - e_type_o = PyObject_GetAttrString(e, "__name__"); - e_module_o = PyObject_GetAttrString(e, "__module__"); - if (e_type_o) - e_type_s = PLyUnicode_AsString(e_type_o); - if (e_type_s) - e_module_s = PLyUnicode_AsString(e_module_o); - - if (v && ((vob = PyObject_Str(v)) != NULL)) - vstr = PLyUnicode_AsString(vob); - else - vstr = "unknown"; - - initStringInfo(&xstr); - if (!e_type_s || !e_module_s) + PG_TRY(); { - /* shouldn't happen */ - appendStringInfoString(&xstr, "unrecognized exception"); + char *e_type_s = NULL; + char *e_module_s = NULL; + const char *vstr; + StringInfoData xstr; + + e_type_o = PyObject_GetAttrString(e, "__name__"); + e_module_o = PyObject_GetAttrString(e, "__module__"); + if (e_type_o) + e_type_s = PLyUnicode_AsString(e_type_o); + if (e_module_o) + e_module_s = PLyUnicode_AsString(e_module_o); + + if (v && ((vob = PyObject_Str(v)) != NULL)) + vstr = PLyUnicode_AsString(vob); + else + vstr = "unknown"; + + initStringInfo(&xstr); + if (!e_type_s || !e_module_s) + { + /* shouldn't happen */ + appendStringInfoString(&xstr, "unrecognized exception"); + } + /* mimics behavior of traceback.format_exception_only */ + else if (strcmp(e_module_s, "builtins") == 0 + || strcmp(e_module_s, "__main__") == 0 + || strcmp(e_module_s, "exceptions") == 0) + appendStringInfoString(&xstr, e_type_s); + else + appendStringInfo(&xstr, "%s.%s", e_module_s, e_type_s); + appendStringInfo(&xstr, ": %s", vstr); + + *xmsg = xstr.data; } - /* mimics behavior of traceback.format_exception_only */ - else if (strcmp(e_module_s, "builtins") == 0 - || strcmp(e_module_s, "__main__") == 0 - || strcmp(e_module_s, "exceptions") == 0) - appendStringInfoString(&xstr, e_type_s); - else - appendStringInfo(&xstr, "%s.%s", e_module_s, e_type_s); - appendStringInfo(&xstr, ": %s", vstr); - - *xmsg = xstr.data; + PG_FINALLY(); + { + Py_XDECREF(e_type_o); + Py_XDECREF(e_module_o); + Py_XDECREF(vob); + } + PG_END_TRY(); /* * Now format the traceback and put it in tbmsg. */ - *tb_depth = 0; initStringInfo(&tbstr); /* Mimic Python traceback reporting as close as possible. 
*/ appendStringInfoString(&tbstr, "Traceback (most recent call last):"); while (tb != NULL && tb != Py_None) { - PyObject *volatile tb_prev = NULL; PyObject *volatile frame = NULL; PyObject *volatile code = NULL; PyObject *volatile name = NULL; @@ -254,84 +273,74 @@ PLy_traceback(PyObject *e, PyObject *v, PyObject *tb, filename = PyObject_GetAttrString(code, "co_filename"); if (filename == NULL) elog(ERROR, "could not get file name from Python code object"); + + /* The first frame always points at <module>, skip it. */ + if (*tb_depth > 0) + { + PLyExecutionContext *exec_ctx = PLy_current_execution_context(); + char *proname; + char *fname; + char *line; + char *plain_filename; + long plain_lineno; + + /* + * The second frame points at the internal function, but to + * mimic Python error reporting we want to say <module>. + */ + if (*tb_depth == 1) + fname = "<module>"; + else + fname = PLyUnicode_AsString(name); + + proname = PLy_procedure_name(exec_ctx->curr_proc); + plain_filename = PLyUnicode_AsString(filename); + plain_lineno = PyLong_AsLong(lineno); + + if (proname == NULL) + appendStringInfo(&tbstr, "\n PL/Python anonymous code block, line %ld, in %s", + plain_lineno - 1, fname); + else + appendStringInfo(&tbstr, "\n PL/Python function \"%s\", line %ld, in %s", + proname, plain_lineno - 1, fname); + + /* + * function code object was compiled with "<string>" as the + * filename + */ + if (exec_ctx->curr_proc && plain_filename != NULL && + strcmp(plain_filename, "<string>") == 0) + { + /* + * If we know the current procedure, append the exact line + * from the source, again mimicking Python's traceback.py + * module behavior. We could store the already line-split + * source to avoid splitting it every time, but producing + * a traceback is not the most important scenario to + * optimize for. But we do not go as far as traceback.py + * in reading the source of imported modules. + */ + line = get_source_line(exec_ctx->curr_proc->src, plain_lineno); + if (line) + { + appendStringInfo(&tbstr, "\n %s", line); + pfree(line); + } + } + } } - PG_CATCH(); + PG_FINALLY(); { Py_XDECREF(frame); Py_XDECREF(code); Py_XDECREF(name); Py_XDECREF(lineno); Py_XDECREF(filename); - PG_RE_THROW(); } PG_END_TRY(); - /* The first frame always points at <module>, skip it. */ - if (*tb_depth > 0) - { - PLyExecutionContext *exec_ctx = PLy_current_execution_context(); - char *proname; - char *fname; - char *line; - char *plain_filename; - long plain_lineno; - - /* - * The second frame points at the internal function, but to mimic - * Python error reporting we want to say <module>. - */ - if (*tb_depth == 1) - fname = "<module>"; - else - fname = PLyUnicode_AsString(name); - - proname = PLy_procedure_name(exec_ctx->curr_proc); - plain_filename = PLyUnicode_AsString(filename); - plain_lineno = PyLong_AsLong(lineno); - - if (proname == NULL) - appendStringInfo(&tbstr, "\n PL/Python anonymous code block, line %ld, in %s", - plain_lineno - 1, fname); - else - appendStringInfo(&tbstr, "\n PL/Python function \"%s\", line %ld, in %s", - proname, plain_lineno - 1, fname); - - /* - * function code object was compiled with "<string>" as the - * filename - */ - if (exec_ctx->curr_proc && plain_filename != NULL && - strcmp(plain_filename, "<string>") == 0) - { - /* - * If we know the current procedure, append the exact line - * from the source, again mimicking Python's traceback.py - * module behavior. 
We could store the already line-split - * source to avoid splitting it every time, but producing a - * traceback is not the most important scenario to optimize - * for. But we do not go as far as traceback.py in reading - * the source of imported modules. - */ - line = get_source_line(exec_ctx->curr_proc->src, plain_lineno); - if (line) - { - appendStringInfo(&tbstr, "\n %s", line); - pfree(line); - } - } - } - - Py_DECREF(frame); - Py_DECREF(code); - Py_DECREF(name); - Py_DECREF(lineno); - Py_DECREF(filename); - - /* Release the current frame and go to the next one. */ - tb_prev = tb; + /* Advance to the next frame. */ tb = PyObject_GetAttrString(tb, "tb_next"); - Assert(tb_prev != Py_None); - Py_DECREF(tb_prev); if (tb == NULL) elog(ERROR, "could not traverse Python traceback"); (*tb_depth)++; @@ -339,10 +348,6 @@ PLy_traceback(PyObject *e, PyObject *v, PyObject *tb, /* Return the traceback. */ *tbmsg = tbstr.data; - - Py_XDECREF(e_type_o); - Py_XDECREF(e_module_o); - Py_XDECREF(vob); } /* diff --git a/src/pl/plpython/plpy_planobject.c b/src/pl/plpython/plpy_planobject.c index 6044893afdd..edfb76c8770 100644 --- a/src/pl/plpython/plpy_planobject.c +++ b/src/pl/plpython/plpy_planobject.c @@ -45,9 +45,9 @@ static PyType_Slot PLyPlan_slots[] = static PyType_Spec PLyPlan_spec = { .name = "PLyPlan", - .basicsize = sizeof(PLyPlanObject), - .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, - .slots = PLyPlan_slots, + .basicsize = sizeof(PLyPlanObject), + .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, + .slots = PLyPlan_slots, }; static PyTypeObject *PLy_PlanType; diff --git a/src/pl/plpython/plpy_resultobject.c b/src/pl/plpython/plpy_resultobject.c index 0d9997cbaa3..d433929b360 100644 --- a/src/pl/plpython/plpy_resultobject.c +++ b/src/pl/plpython/plpy_resultobject.c @@ -70,9 +70,9 @@ static PyType_Slot PLyResult_slots[] = static PyType_Spec PLyResult_spec = { .name = "PLyResult", - .basicsize = sizeof(PLyResultObject), - .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, - .slots = PLyResult_slots, + .basicsize = sizeof(PLyResultObject), + .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, + .slots = PLyResult_slots, }; static PyTypeObject *PLy_ResultType; diff --git a/src/pl/plpython/plpy_subxactobject.c b/src/pl/plpython/plpy_subxactobject.c index c2484a99b4a..c225b652ab4 100644 --- a/src/pl/plpython/plpy_subxactobject.c +++ b/src/pl/plpython/plpy_subxactobject.c @@ -46,9 +46,9 @@ static PyType_Slot PLySubtransaction_slots[] = static PyType_Spec PLySubtransaction_spec = { .name = "PLySubtransaction", - .basicsize = sizeof(PLySubtransactionObject), - .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, - .slots = PLySubtransaction_slots, + .basicsize = sizeof(PLySubtransactionObject), + .flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, + .slots = PLySubtransaction_slots, }; static PyTypeObject *PLy_SubtransactionType; diff --git a/src/pl/tcl/Makefile b/src/pl/tcl/Makefile index ea52a2efc22..dd57f7d694c 100644 --- a/src/pl/tcl/Makefile +++ b/src/pl/tcl/Makefile @@ -11,7 +11,7 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -override CPPFLAGS := -I. -I$(srcdir) $(TCL_INCLUDE_SPEC) $(CPPFLAGS) +override CPPFLAGS := -I. 
-I$(srcdir) $(CPPFLAGS) $(TCL_INCLUDE_SPEC) # On Windows, we don't link directly with the Tcl library; see below ifneq ($(PORTNAME), win32) diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 9af3474a6ca..1a717255355 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -123,7 +123,7 @@ pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len) __m512i k; k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); - x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); + x0 = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(crc0)), x0); buf += 64; /* Main loop. */ diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c index 4b487a2a4e8..3368a43a338 100644 --- a/src/port/pg_numa.c +++ b/src/port/pg_numa.c @@ -16,6 +16,7 @@ #include "c.h" #include <unistd.h> +#include "miscadmin.h" #include "port/pg_numa.h" /* @@ -29,6 +30,19 @@ #include <numa.h> #include <numaif.h> +/* + * numa_move_pages() chunk size; it has to be <= 16 to work around a kernel bug + * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same + * chunk size, we make it work even on unfixed kernels. + * + * 64-bit systems are not affected by the bug, and so use much larger chunks. + */ +#if SIZEOF_SIZE_T == 4 +#define NUMA_QUERY_CHUNK_SIZE 16 +#else +#define NUMA_QUERY_CHUNK_SIZE 1024 +#endif + /* libnuma requires initialization as per numa(3) on Linux */ int pg_numa_init(void) @@ -42,11 +56,48 @@ pg_numa_init(void) * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the * first one allows us to batch and query about many memory pages in one single * giant system call that is way faster. + * + * We call numa_move_pages() for smaller chunks of the whole array, both to + * work around the kernel bug and to allow interrupting the query between the + * calls (with many pointers, processing the whole array can take a long + * time). */ int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) { - return numa_move_pages(pid, count, pages, NULL, status, 0); + unsigned long next = 0; + int ret = 0; + + /* + * Chunk the pointers passed to numa_move_pages() into NUMA_QUERY_CHUNK_SIZE + * items, to work around a kernel bug in do_pages_stat(). + */ + while (next < count) + { + unsigned long count_chunk = Min(count - next, + NUMA_QUERY_CHUNK_SIZE); + + CHECK_FOR_INTERRUPTS(); + + /* + * Bail out if any of the chunks errors out (ret<0). We ignore ret>0, + * which is used to return the number of nonmigrated pages, but we're + * not migrating any pages here.
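+ *
+ * (Editor's note: the chunking matters for responsiveness as well as
+ * correctness; for example, inspecting 1GB of shared buffers with 4kB
+ * pages means ~262144 page pointers, i.e. ~16384 numa_move_pages()
+ * calls of 16 items each on a 32-bit build, and the
+ * CHECK_FOR_INTERRUPTS() above keeps that long loop cancelable.)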
+ */ + ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0); + if (ret < 0) + { + /* plain error, return as is */ + return ret; + } + + next += count_chunk; + } + + /* should have consumed the input array exactly */ + Assert(next == count); + + return 0; } int diff --git a/src/port/pgmkdirp.c b/src/port/pgmkdirp.c index d943559760d..7d7cea4dd0e 100644 --- a/src/port/pgmkdirp.c +++ b/src/port/pgmkdirp.c @@ -73,7 +73,7 @@ pg_mkdir_p(char *path, int omode) if (p[0] == '/' && p[1] == '/') { /* network drive */ - p = strstr(p + 2, "/"); + p = strchr(p + 2, '/'); if (p == NULL) { errno = EINVAL; diff --git a/src/test/authentication/t/001_password.pl b/src/test/authentication/t/001_password.pl index 37d96d95a1a..a16e9a563f3 100644 --- a/src/test/authentication/t/001_password.pl +++ b/src/test/authentication/t/001_password.pl @@ -79,39 +79,40 @@ $node->start; # other tests are added to this file in the future $node->safe_psql('postgres', "CREATE DATABASE test_log_connections"); -my $log_connections = $node->safe_psql('test_log_connections', q(SHOW log_connections;)); +my $log_connections = + $node->safe_psql('test_log_connections', q(SHOW log_connections;)); is($log_connections, 'on', qq(check log connections has expected value 'on')); -$node->connect_ok('test_log_connections', +$node->connect_ok( + 'test_log_connections', qq(log_connections 'on' works as expected for backwards compatibility), log_like => [ qr/connection received/, qr/connection authenticated/, qr/connection authorized: user=\S+ database=test_log_connections/, ], - log_unlike => [ - qr/connection ready/, - ],); + log_unlike => [ qr/connection ready/, ],); -$node->safe_psql('test_log_connections', +$node->safe_psql( + 'test_log_connections', q[ALTER SYSTEM SET log_connections = receipt,authorization,setup_durations; SELECT pg_reload_conf();]); -$node->connect_ok('test_log_connections', +$node->connect_ok( + 'test_log_connections', q(log_connections with subset of specified options logs only those aspects), log_like => [ qr/connection received/, qr/connection authorized: user=\S+ database=test_log_connections/, qr/connection ready/, ], - log_unlike => [ - qr/connection authenticated/, - ],); + log_unlike => [ qr/connection authenticated/, ],); $node->safe_psql('test_log_connections', qq(ALTER SYSTEM SET log_connections = 'all'; SELECT pg_reload_conf();)); -$node->connect_ok('test_log_connections', +$node->connect_ok( + 'test_log_connections', qq(log_connections 'all' logs all available connection aspects), log_like => [ qr/connection received/, diff --git a/src/test/authentication/t/003_peer.pl b/src/test/authentication/t/003_peer.pl index 2879800eacf..c751fbdbaa5 100644 --- a/src/test/authentication/t/003_peer.pl +++ b/src/test/authentication/t/003_peer.pl @@ -71,7 +71,7 @@ sub test_role my $node = PostgreSQL::Test::Cluster->new('node'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', "log_connections = authentication\n"); # Needed to allow connect_fails to inspect postmaster log: $node->append_conf('postgresql.conf', "log_min_messages = debug2"); $node->start; @@ -171,7 +171,8 @@ test_role( # Test with regular expression in user name map. # Extract the last 3 characters from the system_user -# or the entire system_user (if its length is <= -3). +# or the entire system_user name (if its length is <= 3). +# We trust this will not include any regex metacharacters. 
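+# (Editor's note: for instance, a hypothetical system user "gnat"
+# yields $regex_test_string = "nat"; the map regex /^.*(nat)$ captures
+# "nat", so 'test\1map\1user' expands to "testnatmapnatuser", which is
+# exactly the $mapped_name role created below.)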
my $regex_test_string = substr($system_user, -3); # Success as the system user regular expression matches. @@ -210,12 +211,17 @@ test_role( log_like => [qr/connection authenticated: identity="$system_user" method=peer/]); +# Create target role for \1 tests. +my $mapped_name = "test${regex_test_string}map${regex_test_string}user"; +$node->safe_psql('postgres', "CREATE ROLE $mapped_name LOGIN"); + # Success as the regular expression matches and \1 is replaced in the given # subexpression. -reset_pg_ident($node, 'mypeermap', qq{/^$system_user(.*)\$}, 'test\1mapuser'); +reset_pg_ident($node, 'mypeermap', qq{/^.*($regex_test_string)\$}, + 'test\1map\1user'); test_role( $node, - qq{testmapuser}, + $mapped_name, 'peer', 0, 'with regular expression in user name map with \1 replaced', @@ -224,11 +230,11 @@ test_role( # Success as the regular expression matches and \1 is replaced in the given # subexpression, even if quoted. -reset_pg_ident($node, 'mypeermap', qq{/^$system_user(.*)\$}, - '"test\1mapuser"'); +reset_pg_ident($node, 'mypeermap', qq{/^.*($regex_test_string)\$}, + '"test\1map\1user"'); test_role( $node, - qq{testmapuser}, + $mapped_name, 'peer', 0, 'with regular expression in user name map with quoted \1 replaced', diff --git a/src/test/authentication/t/005_sspi.pl b/src/test/authentication/t/005_sspi.pl index b480b702590..cb3e169002f 100644 --- a/src/test/authentication/t/005_sspi.pl +++ b/src/test/authentication/t/005_sspi.pl @@ -18,7 +18,7 @@ if (!$windows_os || $use_unix_sockets) # Initialize primary node my $node = PostgreSQL::Test::Cluster->new('primary'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', "log_connections = authentication\n"); $node->start; my $huge_pages_status = diff --git a/src/test/authentication/t/007_pre_auth.pl b/src/test/authentication/t/007_pre_auth.pl index 12e40dc722c..7b3765e6d25 100644 --- a/src/test/authentication/t/007_pre_auth.pl +++ b/src/test/authentication/t/007_pre_auth.pl @@ -20,7 +20,7 @@ my $node = PostgreSQL::Test::Cluster->new('primary'); $node->init; $node->append_conf( 'postgresql.conf', q[ -log_connections = on +log_connections = 'receipt,authentication' ]); $node->start; diff --git a/src/test/isolation/expected/merge-match-recheck.out b/src/test/isolation/expected/merge-match-recheck.out index 9a44a595927..90300f1db5a 100644 --- a/src/test/isolation/expected/merge-match-recheck.out +++ b/src/test/isolation/expected/merge-match-recheck.out @@ -241,19 +241,28 @@ starting permutation: update_bal1_tg merge_bal_tg c2 select1_tg c1 s2: NOTICE: Update: (1,160,s1,setup) -> (1,50,s1,"setup updated by update_bal1_tg") step update_bal1_tg: UPDATE target_tg t SET balance = 50, val = t.val || ' updated by update_bal1_tg' WHERE t.key = 1; step merge_bal_tg: - MERGE INTO target_tg t - USING (SELECT 1 as key) s - ON s.key = t.key - WHEN MATCHED AND balance < 100 THEN - UPDATE SET balance = balance * 2, val = t.val || ' when1' - WHEN MATCHED AND balance < 200 THEN - UPDATE SET balance = balance * 4, val = t.val || ' when2' - WHEN MATCHED AND balance < 300 THEN - UPDATE SET balance = balance * 8, val = t.val || ' when3'; + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' 
when3' + RETURNING t.* + ) + SELECT * FROM t; <waiting ...> step c2: COMMIT; s1: NOTICE: Update: (1,50,s1,"setup updated by update_bal1_tg") -> (1,100,s1,"setup updated by update_bal1_tg when1") step merge_bal_tg: <... completed> +key|balance|status|val +---+-------+------+------------------------------------- + 1| 100|s1 |setup updated by update_bal1_tg when1 +(1 row) + step select1_tg: SELECT * FROM target_tg; key|balance|status|val ---+-------+------+------------------------------------- diff --git a/src/test/isolation/specs/merge-match-recheck.spec b/src/test/isolation/specs/merge-match-recheck.spec index 26266b8c297..15226e40c9e 100644 --- a/src/test/isolation/specs/merge-match-recheck.spec +++ b/src/test/isolation/specs/merge-match-recheck.spec @@ -99,15 +99,19 @@ step "merge_bal_pa" } step "merge_bal_tg" { - MERGE INTO target_tg t - USING (SELECT 1 as key) s - ON s.key = t.key - WHEN MATCHED AND balance < 100 THEN - UPDATE SET balance = balance * 2, val = t.val || ' when1' - WHEN MATCHED AND balance < 200 THEN - UPDATE SET balance = balance * 4, val = t.val || ' when2' - WHEN MATCHED AND balance < 300 THEN - UPDATE SET balance = balance * 8, val = t.val || ' when3'; + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; } step "merge_delete" diff --git a/src/test/kerberos/t/001_auth.pl b/src/test/kerberos/t/001_auth.pl index 2dc6bec9b89..b0be96f2beb 100644 --- a/src/test/kerberos/t/001_auth.pl +++ b/src/test/kerberos/t/001_auth.pl @@ -65,7 +65,7 @@ $node->append_conf( 'postgresql.conf', qq{ listen_addresses = '$hostaddr' krb_server_keyfile = '$krb->{keytab}' -log_connections = on +log_connections = all log_min_messages = debug2 lc_messages = 'C' }); diff --git a/src/test/ldap/t/001_auth.pl b/src/test/ldap/t/001_auth.pl index d1315ed5351..440c30b7ddd 100644 --- a/src/test/ldap/t/001_auth.pl +++ b/src/test/ldap/t/001_auth.pl @@ -47,7 +47,7 @@ note "setting up PostgreSQL instance"; my $node = PostgreSQL::Test::Cluster->new('node'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', "log_connections = all\n"); # Needed to allow connect_fails to inspect postmaster log: $node->append_conf('postgresql.conf', "log_min_messages = debug2"); $node->start; diff --git a/src/test/ldap/t/002_bindpasswd.pl b/src/test/ldap/t/002_bindpasswd.pl index f8beba2b279..642bb2d9a77 100644 --- a/src/test/ldap/t/002_bindpasswd.pl +++ b/src/test/ldap/t/002_bindpasswd.pl @@ -43,7 +43,7 @@ note "setting up PostgreSQL instance"; my $node = PostgreSQL::Test::Cluster->new('node'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', "log_connections = all\n"); $node->start; $node->safe_psql('postgres', 'CREATE USER test0;'); diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index aa1d27bbed3..903a8ac151a 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -15,6 +15,7 @@ SUBDIRS = \ plsample \ spgist_name_ops \ test_aio \ + test_binaryheap \ test_bloomfilter \ test_copy_callbacks \ test_custom_rmgrs \ @@ -24,6 +25,7 @@ SUBDIRS = \ test_escape \ test_extensions \ test_ginpostinglist \ + 
test_int128 \ test_integerset \ test_json_parser \ test_lfind \ diff --git a/src/test/modules/commit_ts/t/001_base.pl b/src/test/modules/commit_ts/t/001_base.pl index 1953b18f6b3..50e79ce6409 100644 --- a/src/test/modules/commit_ts/t/001_base.pl +++ b/src/test/modules/commit_ts/t/001_base.pl @@ -11,8 +11,7 @@ use Test::More; use PostgreSQL::Test::Cluster; my $node = PostgreSQL::Test::Cluster->new('foxtrot'); -$node->init; -$node->append_conf('postgresql.conf', 'track_commit_timestamp = on'); +$node->init(extra => [ '-c', "track_commit_timestamp=on" ]); $node->start; # Create a table, compare "now()" to the commit TS of its xmin diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index e680991f8d4..fc82cd67f6c 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -11,7 +11,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc +REGRESS = injection_points hashagg reindex_conc vacuum REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic inplace syscache-update-pruned diff --git a/src/test/modules/injection_points/expected/injection_points.out b/src/test/modules/injection_points/expected/injection_points.out index 43bcdd01582..382f3b0bf88 100644 --- a/src/test/modules/injection_points/expected/injection_points.out +++ b/src/test/modules/injection_points/expected/injection_points.out @@ -39,6 +39,15 @@ SELECT injection_points_attach('TestInjectionLog2', 'notice'); (1 row) +SELECT point_name, library, function FROM injection_points_list() + ORDER BY point_name COLLATE "C"; + point_name | library | function +--------------------+------------------+------------------ + TestInjectionError | injection_points | injection_error + TestInjectionLog | injection_points | injection_notice + TestInjectionLog2 | injection_points | injection_notice +(3 rows) + SELECT injection_points_run('TestInjectionBooh'); -- nothing injection_points_run ---------------------- @@ -298,5 +307,12 @@ SELECT injection_points_detach('TestConditionLocal1'); (1 row) +-- No points should be left around. 
+SELECT point_name, library, function FROM injection_points_list() + ORDER BY point_name COLLATE "C"; + point_name | library | function +------------+---------+---------- +(0 rows) + DROP EXTENSION injection_points; DROP FUNCTION wait_pid; diff --git a/src/test/modules/injection_points/expected/vacuum.out b/src/test/modules/injection_points/expected/vacuum.out new file mode 100644 index 00000000000..58df59fa927 --- /dev/null +++ b/src/test/modules/injection_points/expected/vacuum.out @@ -0,0 +1,122 @@ +-- Tests for VACUUM +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('vacuum-index-cleanup-auto', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('vacuum-index-cleanup-disabled', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('vacuum-index-cleanup-enabled', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('vacuum-truncate-auto', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('vacuum-truncate-disabled', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('vacuum-truncate-enabled', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +-- Check state of index_cleanup and truncate in VACUUM. +CREATE TABLE vac_tab_on_toast_off(i int, j text) WITH + (autovacuum_enabled=false, + vacuum_index_cleanup=true, toast.vacuum_index_cleanup=false, + vacuum_truncate=true, toast.vacuum_truncate=false); +CREATE TABLE vac_tab_off_toast_on(i int, j text) WITH + (autovacuum_enabled=false, + vacuum_index_cleanup=false, toast.vacuum_index_cleanup=true, + vacuum_truncate=false, toast.vacuum_truncate=true); +-- Multiple relations should use their options in isolation. +VACUUM vac_tab_on_toast_off, vac_tab_off_toast_on; +NOTICE: notice triggered for injection point vacuum-index-cleanup-enabled +NOTICE: notice triggered for injection point vacuum-truncate-enabled +NOTICE: notice triggered for injection point vacuum-index-cleanup-disabled +NOTICE: notice triggered for injection point vacuum-truncate-disabled +NOTICE: notice triggered for injection point vacuum-index-cleanup-disabled +NOTICE: notice triggered for injection point vacuum-truncate-disabled +NOTICE: notice triggered for injection point vacuum-index-cleanup-enabled +NOTICE: notice triggered for injection point vacuum-truncate-enabled +-- Check "auto" case of index_cleanup and "truncate" controlled by +-- its GUC. 
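+-- (Editor's note: "auto" defers the index_cleanup decision to VACUUM's
+-- own heuristics, and since no vacuum_truncate reloption is set on this
+-- table, truncation follows the vacuum_truncate GUC; the
+-- disabled/enabled notices below track the SET commands.)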
+CREATE TABLE vac_tab_auto(i int, j text) WITH + (autovacuum_enabled=false, + vacuum_index_cleanup=auto, toast.vacuum_index_cleanup=auto); +SET vacuum_truncate = false; +VACUUM vac_tab_auto; +NOTICE: notice triggered for injection point vacuum-index-cleanup-auto +NOTICE: notice triggered for injection point vacuum-truncate-disabled +NOTICE: notice triggered for injection point vacuum-index-cleanup-auto +NOTICE: notice triggered for injection point vacuum-truncate-disabled +SET vacuum_truncate = true; +VACUUM vac_tab_auto; +NOTICE: notice triggered for injection point vacuum-index-cleanup-auto +NOTICE: notice triggered for injection point vacuum-truncate-enabled +NOTICE: notice triggered for injection point vacuum-index-cleanup-auto +NOTICE: notice triggered for injection point vacuum-truncate-enabled +RESET vacuum_truncate; +DROP TABLE vac_tab_auto; +DROP TABLE vac_tab_on_toast_off; +DROP TABLE vac_tab_off_toast_on; +-- Cleanup +SELECT injection_points_detach('vacuum-index-cleanup-auto'); + injection_points_detach +------------------------- + +(1 row) + +SELECT injection_points_detach('vacuum-index-cleanup-disabled'); + injection_points_detach +------------------------- + +(1 row) + +SELECT injection_points_detach('vacuum-index-cleanup-enabled'); + injection_points_detach +------------------------- + +(1 row) + +SELECT injection_points_detach('vacuum-truncate-auto'); + injection_points_detach +------------------------- + +(1 row) + +SELECT injection_points_detach('vacuum-truncate-disabled'); + injection_points_detach +------------------------- + +(1 row) + +SELECT injection_points_detach('vacuum-truncate-enabled'); + injection_points_detach +------------------------- + +(1 row) + +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/injection_points--1.0.sql b/src/test/modules/injection_points/injection_points--1.0.sql index cc76b1bf99a..5f5657b2043 100644 --- a/src/test/modules/injection_points/injection_points--1.0.sql +++ b/src/test/modules/injection_points/injection_points--1.0.sql @@ -78,6 +78,18 @@ AS 'MODULE_PATHNAME', 'injection_points_detach' LANGUAGE C STRICT PARALLEL UNSAFE; -- +-- injection_points_list() +-- +-- List of all the injection points currently attached. +-- +CREATE FUNCTION injection_points_list(OUT point_name text, + OUT library text, + OUT function text) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'injection_points_list' +LANGUAGE C STRICT VOLATILE PARALLEL RESTRICTED; + +-- -- injection_points_stats_numcalls() -- -- Reports statistics, if any, related to the given injection point. diff --git a/src/test/modules/injection_points/injection_points.c b/src/test/modules/injection_points/injection_points.c index 3da0cbc10e0..31138301117 100644 --- a/src/test/modules/injection_points/injection_points.c +++ b/src/test/modules/injection_points/injection_points.c @@ -18,6 +18,7 @@ #include "postgres.h" #include "fmgr.h" +#include "funcapi.h" #include "injection_stats.h" #include "miscadmin.h" #include "nodes/pg_list.h" @@ -545,6 +546,44 @@ injection_points_detach(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +/* + * SQL function for listing all the injection points attached. 
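+ *
+ * (Editor's note: this follows the materialized set-returning-function
+ * recipe: InitMaterializedSRF() sets up a tuplestore in
+ * rsinfo->setResult, and each tuplestore_putvalues() call appends one
+ * row, so the full result is handed back in a single call.)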
+ */ +PG_FUNCTION_INFO_V1(injection_points_list); +Datum +injection_points_list(PG_FUNCTION_ARGS) +{ +#define NUM_INJECTION_POINTS_LIST 3 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + List *inj_points; + ListCell *lc; + + /* Build a tuplestore to return our results in */ + InitMaterializedSRF(fcinfo, 0); + + inj_points = InjectionPointList(); + + foreach(lc, inj_points) + { + Datum values[NUM_INJECTION_POINTS_LIST]; + bool nulls[NUM_INJECTION_POINTS_LIST]; + InjectionPointData *inj_point = lfirst(lc); + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(inj_point->name)); + values[1] = PointerGetDatum(cstring_to_text(inj_point->library)); + values[2] = PointerGetDatum(cstring_to_text(inj_point->function)); + + /* shove row into tuplestore */ + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + return (Datum) 0; +#undef NUM_INJECTION_POINTS_LIST +} + void _PG_init(void) diff --git a/src/test/modules/injection_points/injection_stats.c b/src/test/modules/injection_points/injection_stats.c index 14903c629e0..e3947b23ba5 100644 --- a/src/test/modules/injection_points/injection_stats.c +++ b/src/test/modules/injection_points/injection_stats.c @@ -59,7 +59,7 @@ static const PgStat_KindInfo injection_stats = { /* * Kind ID reserved for statistics of injection points. */ -#define PGSTAT_KIND_INJECTION 129 +#define PGSTAT_KIND_INJECTION 25 /* Track if stats are loaded */ static bool inj_stats_loaded = false; diff --git a/src/test/modules/injection_points/injection_stats_fixed.c b/src/test/modules/injection_points/injection_stats_fixed.c index 3d0c01bdd05..bc54c79d190 100644 --- a/src/test/modules/injection_points/injection_stats_fixed.c +++ b/src/test/modules/injection_points/injection_stats_fixed.c @@ -64,7 +64,7 @@ static const PgStat_KindInfo injection_stats_fixed = { /* * Kind ID reserved for statistics of injection points. 
*/ -#define PGSTAT_KIND_INJECTION_FIXED 130 +#define PGSTAT_KIND_INJECTION_FIXED 26 /* Track if fixed-numbered stats are loaded */ static bool inj_fixed_loaded = false; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index d61149712fd..20390d6b4bf 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -37,8 +37,9 @@ tests += { 'injection_points', 'hashagg', 'reindex_conc', + 'vacuum', ], - 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], + 'regress_args': ['--dlpath', meson.project_build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck 'runningcheck': false, }, diff --git a/src/test/modules/injection_points/sql/injection_points.sql b/src/test/modules/injection_points/sql/injection_points.sql index d9748331c77..874421e9c11 100644 --- a/src/test/modules/injection_points/sql/injection_points.sql +++ b/src/test/modules/injection_points/sql/injection_points.sql @@ -18,6 +18,9 @@ SELECT injection_points_attach('TestInjectionError', 'error'); SELECT injection_points_attach('TestInjectionLog', 'notice'); SELECT injection_points_attach('TestInjectionLog2', 'notice'); +SELECT point_name, library, function FROM injection_points_list() + ORDER BY point_name COLLATE "C"; + SELECT injection_points_run('TestInjectionBooh'); -- nothing SELECT injection_points_run('TestInjectionLog2'); -- notice SELECT injection_points_run('TestInjectionLog2', NULL); -- notice @@ -85,5 +88,9 @@ SELECT injection_points_detach('TestConditionError'); SELECT injection_points_attach('TestConditionLocal1', 'error'); SELECT injection_points_detach('TestConditionLocal1'); +-- No points should be left around. +SELECT point_name, library, function FROM injection_points_list() + ORDER BY point_name COLLATE "C"; + DROP EXTENSION injection_points; DROP FUNCTION wait_pid; diff --git a/src/test/modules/injection_points/sql/vacuum.sql b/src/test/modules/injection_points/sql/vacuum.sql new file mode 100644 index 00000000000..23760dd0f38 --- /dev/null +++ b/src/test/modules/injection_points/sql/vacuum.sql @@ -0,0 +1,47 @@ +-- Tests for VACUUM + +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('vacuum-index-cleanup-auto', 'notice'); +SELECT injection_points_attach('vacuum-index-cleanup-disabled', 'notice'); +SELECT injection_points_attach('vacuum-index-cleanup-enabled', 'notice'); +SELECT injection_points_attach('vacuum-truncate-auto', 'notice'); +SELECT injection_points_attach('vacuum-truncate-disabled', 'notice'); +SELECT injection_points_attach('vacuum-truncate-enabled', 'notice'); + +-- Check state of index_cleanup and truncate in VACUUM. +CREATE TABLE vac_tab_on_toast_off(i int, j text) WITH + (autovacuum_enabled=false, + vacuum_index_cleanup=true, toast.vacuum_index_cleanup=false, + vacuum_truncate=true, toast.vacuum_truncate=false); +CREATE TABLE vac_tab_off_toast_on(i int, j text) WITH + (autovacuum_enabled=false, + vacuum_index_cleanup=false, toast.vacuum_index_cleanup=true, + vacuum_truncate=false, toast.vacuum_truncate=true); +-- Multiple relations should use their options in isolation. +VACUUM vac_tab_on_toast_off, vac_tab_off_toast_on; + +-- Check "auto" case of index_cleanup and "truncate" controlled by +-- its GUC. 
+CREATE TABLE vac_tab_auto(i int, j text) WITH + (autovacuum_enabled=false, + vacuum_index_cleanup=auto, toast.vacuum_index_cleanup=auto); +SET vacuum_truncate = false; +VACUUM vac_tab_auto; +SET vacuum_truncate = true; +VACUUM vac_tab_auto; +RESET vacuum_truncate; + +DROP TABLE vac_tab_auto; +DROP TABLE vac_tab_on_toast_off; +DROP TABLE vac_tab_off_toast_on; + +-- Cleanup +SELECT injection_points_detach('vacuum-index-cleanup-auto'); +SELECT injection_points_detach('vacuum-index-cleanup-disabled'); +SELECT injection_points_detach('vacuum-index-cleanup-enabled'); +SELECT injection_points_detach('vacuum-truncate-auto'); +SELECT injection_points_detach('vacuum-truncate-disabled'); +SELECT injection_points_detach('vacuum-truncate-enabled'); +DROP EXTENSION injection_points; diff --git a/src/test/modules/ldap_password_func/t/001_mutated_bindpasswd.pl b/src/test/modules/ldap_password_func/t/001_mutated_bindpasswd.pl index 9b062e1c800..5dc1e442d29 100644 --- a/src/test/modules/ldap_password_func/t/001_mutated_bindpasswd.pl +++ b/src/test/modules/ldap_password_func/t/001_mutated_bindpasswd.pl @@ -42,7 +42,8 @@ note "setting up PostgreSQL instance"; my $node = PostgreSQL::Test::Cluster->new('node'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', + "log_connections = 'receipt,authentication,authorization'\n"); $node->append_conf('postgresql.conf', "shared_preload_libraries = 'ldap_password_func'"); $node->start; diff --git a/src/test/modules/libpq_pipeline/t/001_libpq_pipeline.pl b/src/test/modules/libpq_pipeline/t/001_libpq_pipeline.pl index 61524bdbd8f..f9678853070 100644 --- a/src/test/modules/libpq_pipeline/t/001_libpq_pipeline.pl +++ b/src/test/modules/libpq_pipeline/t/001_libpq_pipeline.pl @@ -53,7 +53,8 @@ for my $testname (@tests) $node->command_ok( [ 'libpq_pipeline', @extraargs, - $testname, $node->connstr('postgres') . " max_protocol_version=latest" + $testname, + $node->connstr('postgres') . " max_protocol_version=latest" ], "libpq_pipeline $testname"); @@ -76,7 +77,8 @@ for my $testname (@tests) # test separately that it still works the old protocol version too. $node->command_ok( [ - 'libpq_pipeline', 'cancel', $node->connstr('postgres') . " max_protocol_version=3.0" + 'libpq_pipeline', 'cancel', + $node->connstr('postgres') . " max_protocol_version=3.0" ], "libpq_pipeline cancel with protocol 3.0"); diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 9de0057bd1d..93be0f57289 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -14,6 +14,7 @@ subdir('plsample') subdir('spgist_name_ops') subdir('ssl_passphrase_callback') subdir('test_aio') +subdir('test_binaryheap') subdir('test_bloomfilter') subdir('test_copy_callbacks') subdir('test_custom_rmgrs') @@ -23,6 +24,7 @@ subdir('test_dsm_registry') subdir('test_escape') subdir('test_extensions') subdir('test_ginpostinglist') +subdir('test_int128') subdir('test_integerset') subdir('test_json_parser') subdir('test_lfind') diff --git a/src/test/modules/oauth_validator/meson.build b/src/test/modules/oauth_validator/meson.build index e190f9cf15a..a6f937fd7d7 100644 --- a/src/test/modules/oauth_validator/meson.build +++ b/src/test/modules/oauth_validator/meson.build @@ -77,7 +77,7 @@ tests += { 't/002_client.pl', ], 'env': { - 'PYTHON': python.path(), + 'PYTHON': python.full_path(), 'with_libcurl': oauth_flow_supported ? 
'yes' : 'no', 'with_python': 'yes', }, diff --git a/src/test/modules/oauth_validator/t/001_server.pl b/src/test/modules/oauth_validator/t/001_server.pl index 4f035417a40..41672ebd5c6 100644 --- a/src/test/modules/oauth_validator/t/001_server.pl +++ b/src/test/modules/oauth_validator/t/001_server.pl @@ -45,7 +45,7 @@ if ($ENV{with_python} ne 'yes') my $node = PostgreSQL::Test::Cluster->new('primary'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', "log_connections = all\n"); $node->append_conf('postgresql.conf', "oauth_validator_libraries = 'validator'\n"); # Needed to allow connect_fails to inspect postmaster log: @@ -295,6 +295,26 @@ $node->connect_fails( expected_stderr => qr/failed to obtain access token: response is too large/); +my $nesting_limit = 16; +$node->connect_ok( + connstr( + stage => 'device', + nested_array => $nesting_limit, + nested_object => $nesting_limit), + "nested arrays and objects, up to parse limit", + expected_stderr => + qr@Visit https://example\.com/ and enter the code: postgresuser@); +$node->connect_fails( + connstr(stage => 'device', nested_array => $nesting_limit + 1), + "bad discovery response: overly nested JSON array", + expected_stderr => + qr/failed to parse device authorization: JSON is too deeply nested/); +$node->connect_fails( + connstr(stage => 'device', nested_object => $nesting_limit + 1), + "bad discovery response: overly nested JSON object", + expected_stderr => + qr/failed to parse device authorization: JSON is too deeply nested/); + $node->connect_fails( connstr(stage => 'device', content_type => 'text/plain'), "bad device authz response: wrong content type", diff --git a/src/test/modules/oauth_validator/t/002_client.pl b/src/test/modules/oauth_validator/t/002_client.pl index 21d4acc1926..aac0220d215 100644 --- a/src/test/modules/oauth_validator/t/002_client.pl +++ b/src/test/modules/oauth_validator/t/002_client.pl @@ -26,7 +26,7 @@ if (!$ENV{PG_TEST_EXTRA} || $ENV{PG_TEST_EXTRA} !~ /\boauth\b/) my $node = PostgreSQL::Test::Cluster->new('primary'); $node->init; -$node->append_conf('postgresql.conf', "log_connections = on\n"); +$node->append_conf('postgresql.conf', "log_connections = all\n"); $node->append_conf('postgresql.conf', "oauth_validator_libraries = 'validator'\n"); $node->start; diff --git a/src/test/modules/oauth_validator/t/oauth_server.py b/src/test/modules/oauth_validator/t/oauth_server.py index 20b3a9506cb..0f8836aadf3 100755 --- a/src/test/modules/oauth_validator/t/oauth_server.py +++ b/src/test/modules/oauth_validator/t/oauth_server.py @@ -7,6 +7,7 @@ # import base64 +import functools import http.server import json import os @@ -213,14 +214,32 @@ class OAuthHandler(http.server.BaseHTTPRequestHandler): @property def _response_padding(self): """ - If the huge_response test parameter is set to True, returns a dict - containing a gigantic string value, which can then be folded into a JSON - response. 
+ Returns a dict with any additional entries that should be folded into a + JSON response, as determined by test parameters provided by the client: + + - huge_response: if set to True, the dict will contain a gigantic string + value + + - nested_array: if set to nonzero, the dict will contain a deeply nested + array so that the top-level object has the given depth + + - nested_object: if set to nonzero, the dict will contain a deeply + nested JSON object so that the top-level object has the given depth """ - if not self._get_param("huge_response", False): - return dict() + ret = dict() + + if self._get_param("huge_response", False): + ret["_pad_"] = "x" * 1024 * 1024 + + depth = self._get_param("nested_array", 0) + if depth: + ret["_arr_"] = functools.reduce(lambda x, _: [x], range(depth)) + + depth = self._get_param("nested_object", 0) + if depth: + ret["_obj_"] = functools.reduce(lambda x, _: {"": x}, range(depth)) - return {"_pad_": "x" * 1024 * 1024} + return ret @property def _access_token(self): diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl index 4527c70785d..82ffffc058f 100644 --- a/src/test/modules/test_aio/t/001_aio.pl +++ b/src/test/modules/test_aio/t/001_aio.pl @@ -1123,7 +1123,8 @@ COMMIT; { # Create a corruption and then read the block without waiting for # completion. - $psql_a->query(qq( + $psql_a->query( + qq( SELECT modify_rel_block('tbl_zero', 1, corrupt_header=>true); SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>true) )); @@ -1133,7 +1134,8 @@ SELECT read_rel_block_ll('tbl_zero', 1, wait_complete=>false, zero_on_error=>tru $psql_b, "$persistency: test completing read by other session doesn't generate warning", qq(SELECT count(*) > 0 FROM tbl_zero;), - qr/^t$/, qr/^$/); + qr/^t$/, + qr/^$/); } # Clean up @@ -1355,18 +1357,24 @@ SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true); )); $psql->query_safe($invalidate_sql); - psql_like($io_method, $psql, + psql_like( + $io_method, + $psql, "reading block w/ wrong checksum with ignore_checksum_failure=off fails", - $count_sql, qr/^$/, qr/ERROR: invalid page in block/); + $count_sql, + qr/^$/, + qr/ERROR: invalid page in block/); $psql->query_safe("SET ignore_checksum_failure=on"); $psql->query_safe($invalidate_sql); - psql_like($io_method, $psql, - "reading block w/ wrong checksum with ignore_checksum_failure=off succeeds", - $count_sql, - qr/^$expect$/, - qr/WARNING: ignoring (checksum failure|\d checksum failures)/); + psql_like( + $io_method, + $psql, + "reading block w/ wrong checksum with ignore_checksum_failure=on succeeds", + $count_sql, + qr/^$expect$/, + qr/WARNING: ignoring (checksum failure|\d checksum failures)/); # Verify that ignore_checksum_failure=off works in multi-block reads @@ -1432,19 +1440,22 @@ SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>5, zero_on_error=>true);), # file.
$node->wait_for_log(qr/LOG: ignoring checksum failure in block 2/, - $log_location); + $log_location); ok(1, "$io_method: found information about checksum failure in block 2"); - $node->wait_for_log(qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/, - $log_location); + $node->wait_for_log( + qr/LOG: invalid page in block 3 of relation base.*; zeroing out page/, + $log_location); ok(1, "$io_method: found information about invalid page in block 3"); - $node->wait_for_log(qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/, - $log_location); + $node->wait_for_log( + qr/LOG: invalid page in block 4 of relation base.*; zeroing out page/, + $log_location); ok(1, "$io_method: found information about checksum failure in block 4"); - $node->wait_for_log(qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/, - $log_location); + $node->wait_for_log( + qr/LOG: invalid page in block 5 of relation base.*; zeroing out page/, + $log_location); ok(1, "$io_method: found information about checksum failure in block 5"); @@ -1462,8 +1473,7 @@ SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header qq( SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);), qr/^$/, - qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation/ - ); + qr/^psql:<stdin>:\d+: ERROR: invalid page in block 3 of relation/); psql_like( $io_method, diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c index 5cdfb89210b..c55cf6c0aac 100644 --- a/src/test/modules/test_aio/test_aio.c +++ b/src/test/modules/test_aio/test_aio.c @@ -42,9 +42,9 @@ typedef struct InjIoErrorState bool short_read_result_set; int short_read_result; -} InjIoErrorState; +} InjIoErrorState; -static InjIoErrorState * inj_io_error_state; +static InjIoErrorState *inj_io_error_state; /* Shared memory init callbacks */ static shmem_request_hook_type prev_shmem_request_hook = NULL; diff --git a/src/test/modules/test_binaryheap/.gitignore b/src/test/modules/test_binaryheap/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/src/test/modules/test_binaryheap/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_binaryheap/Makefile b/src/test/modules/test_binaryheap/Makefile new file mode 100644 index 00000000000..d310fbc9e88 --- /dev/null +++ b/src/test/modules/test_binaryheap/Makefile @@ -0,0 +1,24 @@ +# src/test/modules/test_binaryheap/Makefile + +MODULE_big = test_binaryheap +OBJS = \ + $(WIN32RES) \ + test_binaryheap.o + +PGFILEDESC = "test_binaryheap - test code for binaryheap" + +EXTENSION = test_binaryheap +DATA = test_binaryheap--1.0.sql + +REGRESS = test_binaryheap + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_binaryheap +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_binaryheap/expected/test_binaryheap.out b/src/test/modules/test_binaryheap/expected/test_binaryheap.out new file mode 100644 index 00000000000..16ce07875e3 --- /dev/null +++ b/src/test/modules/test_binaryheap/expected/test_binaryheap.out @@ -0,0 +1,12 @@ +CREATE EXTENSION test_binaryheap; +-- +-- These tests don't produce any interesting output. 
We're checking that +-- the operations complete without crashing or hanging and that none of their +-- internal sanity tests fail. +-- +SELECT test_binaryheap(); + test_binaryheap +----------------- + +(1 row) + diff --git a/src/test/modules/test_binaryheap/meson.build b/src/test/modules/test_binaryheap/meson.build new file mode 100644 index 00000000000..816a43c93e9 --- /dev/null +++ b/src/test/modules/test_binaryheap/meson.build @@ -0,0 +1,33 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +test_binaryheap_sources = files( + 'test_binaryheap.c', +) + +if host_system == 'windows' + test_binaryheap_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_binaryheap', + '--FILEDESC', 'test_binaryheap - test code for binaryheap',]) +endif + +test_binaryheap = shared_module('test_binaryheap', + test_binaryheap_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_binaryheap + +test_install_data += files( + 'test_binaryheap.control', + 'test_binaryheap--1.0.sql', +) + +tests += { + 'name': 'test_binaryheap', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_binaryheap', + ], + }, +} diff --git a/src/test/modules/test_binaryheap/sql/test_binaryheap.sql b/src/test/modules/test_binaryheap/sql/test_binaryheap.sql new file mode 100644 index 00000000000..8439545815b --- /dev/null +++ b/src/test/modules/test_binaryheap/sql/test_binaryheap.sql @@ -0,0 +1,8 @@ +CREATE EXTENSION test_binaryheap; + +-- +-- These tests don't produce any interesting output. We're checking that +-- the operations complete without crashing or hanging and that none of their +-- internal sanity tests fail. +-- +SELECT test_binaryheap(); diff --git a/src/test/modules/test_binaryheap/test_binaryheap--1.0.sql b/src/test/modules/test_binaryheap/test_binaryheap--1.0.sql new file mode 100644 index 00000000000..cddceeee603 --- /dev/null +++ b/src/test/modules/test_binaryheap/test_binaryheap--1.0.sql @@ -0,0 +1,7 @@ +/* src/test/modules/test_binaryheap/test_binaryheap--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_binaryheap" to load this file. \quit + +CREATE FUNCTION test_binaryheap() RETURNS VOID + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_binaryheap/test_binaryheap.c b/src/test/modules/test_binaryheap/test_binaryheap.c new file mode 100644 index 00000000000..583dae1da30 --- /dev/null +++ b/src/test/modules/test_binaryheap/test_binaryheap.c @@ -0,0 +1,275 @@ +/*-------------------------------------------------------------------------- + * + * test_binaryheap.c + * Test correctness of binary heap implementation. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_binaryheap/test_binaryheap.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/int.h" +#include "common/pg_prng.h" +#include "fmgr.h" +#include "lib/binaryheap.h" + +PG_MODULE_MAGIC; + +/* + * Test binaryheap_comparator for max-heap of integers. + */ +static int +int_cmp(Datum a, Datum b, void *arg) +{ + return pg_cmp_s32(DatumGetInt32(a), DatumGetInt32(b)); +} + +/* + * Loops through all nodes and returns the maximum value. 
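+ * The result is cross-checked against binaryheap_first() and
+ * binaryheap_remove_first() in the tests below.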
+ */ +static int +get_max_from_heap(binaryheap *heap) +{ + int max = -1; + + for (int i = 0; i < binaryheap_size(heap); i++) + max = Max(max, DatumGetInt32(binaryheap_get_node(heap, i))); + + return max; +} + +/* + * Generate a random permutation of the integers 0..size-1. + */ +static int * +get_permutation(int size) +{ + int *permutation = (int *) palloc(size * sizeof(int)); + + permutation[0] = 0; + + /* + * This is the "inside-out" variant of the Fisher-Yates shuffle algorithm. + * Notionally, we append each new value to the array and then swap it with + * a randomly-chosen array element (possibly including itself, else we + * fail to generate permutations with the last integer last). The swap + * step can be optimized by combining it with the insertion. + */ + for (int i = 1; i < size; i++) + { + int j = pg_prng_uint64_range(&pg_global_prng_state, 0, i); + + if (j < i) /* avoid fetching undefined data if j=i */ + permutation[i] = permutation[j]; + permutation[j] = i; + } + + return permutation; +} + +/* + * Ensure that the heap property holds for the given heap, i.e., each parent is + * greater than or equal to its children. + */ +static void +verify_heap_property(binaryheap *heap) +{ + for (int i = 0; i < binaryheap_size(heap); i++) + { + int left = 2 * i + 1; + int right = 2 * i + 2; + int parent_val = DatumGetInt32(binaryheap_get_node(heap, i)); + + if (left < binaryheap_size(heap) && + parent_val < DatumGetInt32(binaryheap_get_node(heap, left))) + elog(ERROR, "parent node less than left child"); + + if (right < binaryheap_size(heap) && + parent_val < DatumGetInt32(binaryheap_get_node(heap, right))) + elog(ERROR, "parent node less than right child"); + } +} + +/* + * Check correctness of basic operations. + */ +static void +test_basic(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int *permutation = get_permutation(size); + + if (!binaryheap_empty(heap)) + elog(ERROR, "new heap not empty"); + if (binaryheap_size(heap) != 0) + elog(ERROR, "wrong size for new heap"); + + for (int i = 0; i < size; i++) + { + binaryheap_add(heap, Int32GetDatum(permutation[i])); + verify_heap_property(heap); + } + + if (binaryheap_empty(heap)) + elog(ERROR, "heap empty after adding values"); + if (binaryheap_size(heap) != size) + elog(ERROR, "wrong size for heap after adding values"); + + if (DatumGetInt32(binaryheap_first(heap)) != get_max_from_heap(heap)) + elog(ERROR, "incorrect root node after adding values"); + + for (int i = 0; i < size; i++) + { + int expected = get_max_from_heap(heap); + int actual = DatumGetInt32(binaryheap_remove_first(heap)); + + if (actual != expected) + elog(ERROR, "incorrect root node after removing root"); + verify_heap_property(heap); + } + + if (!binaryheap_empty(heap)) + elog(ERROR, "heap not empty after removing all nodes"); +} + +/* + * Test building heap after unordered additions. + */ +static void +test_build(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int *permutation = get_permutation(size); + + for (int i = 0; i < size; i++) + binaryheap_add_unordered(heap, Int32GetDatum(permutation[i])); + + if (binaryheap_size(heap) != size) + elog(ERROR, "wrong size for heap after unordered additions"); + + binaryheap_build(heap); + verify_heap_property(heap); +} + +/* + * Test removing nodes. 
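+ * A random number of nodes is removed from random positions, and the heap
+ * property is re-verified after each removal.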
+ */ +static void +test_remove_node(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int *permutation = get_permutation(size); + int remove_count = pg_prng_uint64_range(&pg_global_prng_state, + 0, size - 1); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(permutation[i])); + + for (int i = 0; i < remove_count; i++) + { + int idx = pg_prng_uint64_range(&pg_global_prng_state, + 0, binaryheap_size(heap) - 1); + + binaryheap_remove_node(heap, idx); + verify_heap_property(heap); + } + + if (binaryheap_size(heap) != size - remove_count) + elog(ERROR, "wrong size after removing nodes"); +} + +/* + * Test replacing the root node. + */ +static void +test_replace_first(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(i)); + + /* + * Replace root with a value smaller than everything in the heap. + */ + binaryheap_replace_first(heap, Int32GetDatum(-1)); + verify_heap_property(heap); + + /* + * Replace root with a value in the middle of the heap. + */ + binaryheap_replace_first(heap, Int32GetDatum(size / 2)); + verify_heap_property(heap); + + /* + * Replace root with a larger value than everything in the heap. + */ + binaryheap_replace_first(heap, Int32GetDatum(size + 1)); + verify_heap_property(heap); +} + +/* + * Test duplicate values. + */ +static void +test_duplicates(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + int dup = pg_prng_uint64_range(&pg_global_prng_state, 0, size - 1); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(dup)); + + for (int i = 0; i < size; i++) + { + if (DatumGetInt32(binaryheap_remove_first(heap)) != dup) + elog(ERROR, "unexpected value in heap with duplicates"); + } +} + +/* + * Test resetting. + */ +static void +test_reset(int size) +{ + binaryheap *heap = binaryheap_allocate(size, int_cmp, NULL); + + for (int i = 0; i < size; i++) + binaryheap_add(heap, Int32GetDatum(i)); + + binaryheap_reset(heap); + + if (!binaryheap_empty(heap)) + elog(ERROR, "heap not empty after resetting"); +} + +/* + * SQL-callable entry point to perform all tests. 
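+ * Each test is repeated for a range of heap sizes, covering trivial one-,
+ * two-, and three-element heaps as well as larger ones.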
+ */ +PG_FUNCTION_INFO_V1(test_binaryheap); + +Datum +test_binaryheap(PG_FUNCTION_ARGS) +{ + static const int test_sizes[] = {1, 2, 3, 10, 100, 1000}; + + for (int i = 0; i < sizeof(test_sizes) / sizeof(int); i++) + { + int size = test_sizes[i]; + + test_basic(size); + test_build(size); + test_remove_node(size); + test_replace_first(size); + test_duplicates(size); + test_reset(size); + } + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_binaryheap/test_binaryheap.control b/src/test/modules/test_binaryheap/test_binaryheap.control new file mode 100644 index 00000000000..dd0785e05bd --- /dev/null +++ b/src/test/modules/test_binaryheap/test_binaryheap.control @@ -0,0 +1,5 @@ +# test_binaryheap extension +comment = 'Test code for binaryheap' +default_version = '1.0' +module_pathname = '$libdir/test_binaryheap' +relocatable = true diff --git a/src/test/modules/test_dsm_registry/expected/test_dsm_registry.out b/src/test/modules/test_dsm_registry/expected/test_dsm_registry.out index 8ffbd343a05..ca8abbb377e 100644 --- a/src/test/modules/test_dsm_registry/expected/test_dsm_registry.out +++ b/src/test/modules/test_dsm_registry/expected/test_dsm_registry.out @@ -1,3 +1,10 @@ +SELECT name, type, size IS DISTINCT FROM 0 AS size +FROM pg_dsm_registry_allocations +WHERE name like 'test_dsm_registry%' ORDER BY name; + name | type | size +------+------+------ +(0 rows) + CREATE EXTENSION test_dsm_registry; SELECT set_val_in_shmem(1236); set_val_in_shmem @@ -5,6 +12,12 @@ SELECT set_val_in_shmem(1236); (1 row) +SELECT set_val_in_hash('test', '1414'); + set_val_in_hash +----------------- + +(1 row) + \c SELECT get_val_in_shmem(); get_val_in_shmem @@ -12,3 +25,20 @@ SELECT get_val_in_shmem(); 1236 (1 row) +SELECT get_val_in_hash('test'); + get_val_in_hash +----------------- + 1414 +(1 row) + +\c +SELECT name, type, size IS DISTINCT FROM 0 AS size +FROM pg_dsm_registry_allocations +WHERE name like 'test_dsm_registry%' ORDER BY name; + name | type | size +------------------------+---------+------ + test_dsm_registry_dsa | area | t + test_dsm_registry_dsm | segment | t + test_dsm_registry_hash | hash | t +(3 rows) + diff --git a/src/test/modules/test_dsm_registry/sql/test_dsm_registry.sql b/src/test/modules/test_dsm_registry/sql/test_dsm_registry.sql index b3351be0a16..965a3f1ebb6 100644 --- a/src/test/modules/test_dsm_registry/sql/test_dsm_registry.sql +++ b/src/test/modules/test_dsm_registry/sql/test_dsm_registry.sql @@ -1,4 +1,13 @@ +SELECT name, type, size IS DISTINCT FROM 0 AS size +FROM pg_dsm_registry_allocations +WHERE name like 'test_dsm_registry%' ORDER BY name; CREATE EXTENSION test_dsm_registry; SELECT set_val_in_shmem(1236); +SELECT set_val_in_hash('test', '1414'); \c SELECT get_val_in_shmem(); +SELECT get_val_in_hash('test'); +\c +SELECT name, type, size IS DISTINCT FROM 0 AS size +FROM pg_dsm_registry_allocations +WHERE name like 'test_dsm_registry%' ORDER BY name; diff --git a/src/test/modules/test_dsm_registry/test_dsm_registry--1.0.sql b/src/test/modules/test_dsm_registry/test_dsm_registry--1.0.sql index 8c55b0919b1..5da45155be9 100644 --- a/src/test/modules/test_dsm_registry/test_dsm_registry--1.0.sql +++ b/src/test/modules/test_dsm_registry/test_dsm_registry--1.0.sql @@ -8,3 +8,9 @@ CREATE FUNCTION set_val_in_shmem(val INT) RETURNS VOID CREATE FUNCTION get_val_in_shmem() RETURNS INT AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION set_val_in_hash(key TEXT, val TEXT) RETURNS VOID + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE FUNCTION get_val_in_hash(key TEXT) RETURNS TEXT + AS 
'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_dsm_registry/test_dsm_registry.c b/src/test/modules/test_dsm_registry/test_dsm_registry.c index 462a80f8790..141c8ed1b34 100644 --- a/src/test/modules/test_dsm_registry/test_dsm_registry.c +++ b/src/test/modules/test_dsm_registry/test_dsm_registry.c @@ -15,6 +15,7 @@ #include "fmgr.h" #include "storage/dsm_registry.h" #include "storage/lwlock.h" +#include "utils/builtins.h" PG_MODULE_MAGIC; @@ -24,15 +25,31 @@ typedef struct TestDSMRegistryStruct LWLock lck; } TestDSMRegistryStruct; -static TestDSMRegistryStruct *tdr_state; +typedef struct TestDSMRegistryHashEntry +{ + char key[64]; + dsa_pointer val; +} TestDSMRegistryHashEntry; + +static TestDSMRegistryStruct *tdr_dsm; +static dsa_area *tdr_dsa; +static dshash_table *tdr_hash; + +static const dshash_parameters dsh_params = { + offsetof(TestDSMRegistryHashEntry, val), + sizeof(TestDSMRegistryHashEntry), + dshash_strcmp, + dshash_strhash, + dshash_strcpy +}; static void -tdr_init_shmem(void *ptr) +init_tdr_dsm(void *ptr) { - TestDSMRegistryStruct *state = (TestDSMRegistryStruct *) ptr; + TestDSMRegistryStruct *dsm = (TestDSMRegistryStruct *) ptr; - LWLockInitialize(&state->lck, LWLockNewTrancheId()); - state->val = 0; + LWLockInitialize(&dsm->lck, LWLockNewTrancheId()); + dsm->val = 0; } static void @@ -40,11 +57,17 @@ tdr_attach_shmem(void) { bool found; - tdr_state = GetNamedDSMSegment("test_dsm_registry", - sizeof(TestDSMRegistryStruct), - tdr_init_shmem, - &found); - LWLockRegisterTranche(tdr_state->lck.tranche, "test_dsm_registry"); + tdr_dsm = GetNamedDSMSegment("test_dsm_registry_dsm", + sizeof(TestDSMRegistryStruct), + init_tdr_dsm, + &found); + LWLockRegisterTranche(tdr_dsm->lck.tranche, "test_dsm_registry"); + + if (tdr_dsa == NULL) + tdr_dsa = GetNamedDSA("test_dsm_registry_dsa", &found); + + if (tdr_hash == NULL) + tdr_hash = GetNamedDSHash("test_dsm_registry_hash", &dsh_params, &found); } PG_FUNCTION_INFO_V1(set_val_in_shmem); @@ -53,9 +76,9 @@ set_val_in_shmem(PG_FUNCTION_ARGS) { tdr_attach_shmem(); - LWLockAcquire(&tdr_state->lck, LW_EXCLUSIVE); - tdr_state->val = PG_GETARG_UINT32(0); - LWLockRelease(&tdr_state->lck); + LWLockAcquire(&tdr_dsm->lck, LW_EXCLUSIVE); + tdr_dsm->val = PG_GETARG_INT32(0); + LWLockRelease(&tdr_dsm->lck); PG_RETURN_VOID(); } @@ -68,9 +91,57 @@ get_val_in_shmem(PG_FUNCTION_ARGS) tdr_attach_shmem(); - LWLockAcquire(&tdr_state->lck, LW_SHARED); - ret = tdr_state->val; - LWLockRelease(&tdr_state->lck); + LWLockAcquire(&tdr_dsm->lck, LW_SHARED); + ret = tdr_dsm->val; + LWLockRelease(&tdr_dsm->lck); + + PG_RETURN_INT32(ret); +} + +PG_FUNCTION_INFO_V1(set_val_in_hash); +Datum +set_val_in_hash(PG_FUNCTION_ARGS) +{ + TestDSMRegistryHashEntry *entry; + char *key = TextDatumGetCString(PG_GETARG_DATUM(0)); + char *val = TextDatumGetCString(PG_GETARG_DATUM(1)); + bool found; + + if (strlen(key) >= offsetof(TestDSMRegistryHashEntry, val)) + ereport(ERROR, + (errmsg("key too long"))); + + tdr_attach_shmem(); + + entry = dshash_find_or_insert(tdr_hash, key, &found); + if (found) + dsa_free(tdr_dsa, entry->val); + + entry->val = dsa_allocate(tdr_dsa, strlen(val) + 1); + strcpy(dsa_get_address(tdr_dsa, entry->val), val); + + dshash_release_lock(tdr_hash, entry); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(get_val_in_hash); +Datum +get_val_in_hash(PG_FUNCTION_ARGS) +{ + TestDSMRegistryHashEntry *entry; + char *key = TextDatumGetCString(PG_GETARG_DATUM(0)); + text *val = NULL; + + tdr_attach_shmem(); + + entry = dshash_find(tdr_hash, key, false); + if 
(entry == NULL) + PG_RETURN_NULL(); + + val = cstring_to_text(dsa_get_address(tdr_dsa, entry->val)); + + dshash_release_lock(tdr_hash, entry); - PG_RETURN_UINT32(ret); + PG_RETURN_TEXT_P(val); } diff --git a/src/test/modules/test_int128/.gitignore b/src/test/modules/test_int128/.gitignore new file mode 100644 index 00000000000..277fec6ed2c --- /dev/null +++ b/src/test/modules/test_int128/.gitignore @@ -0,0 +1,2 @@ +/tmp_check/ +/test_int128 diff --git a/src/test/modules/test_int128/Makefile b/src/test/modules/test_int128/Makefile new file mode 100644 index 00000000000..2e86ee93a9d --- /dev/null +++ b/src/test/modules/test_int128/Makefile @@ -0,0 +1,23 @@ +# src/test/modules/test_int128/Makefile + +PGFILEDESC = "test_int128 - test 128-bit integer arithmetic" + +PROGRAM = test_int128 +OBJS = $(WIN32RES) test_int128.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +PG_LIBS_INTERNAL += $(libpq_pgport) + +NO_INSTALL = 1 +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_int128 +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_int128/meson.build b/src/test/modules/test_int128/meson.build new file mode 100644 index 00000000000..4c2be7a0326 --- /dev/null +++ b/src/test/modules/test_int128/meson.build @@ -0,0 +1,33 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +test_int128_sources = files( + 'test_int128.c', +) + +if host_system == 'windows' + test_int128_sources += rc_bin_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_int128', + '--FILEDESC', 'test int128 program',]) +endif + +test_int128 = executable('test_int128', + test_int128_sources, + dependencies: [frontend_code, libpq], + kwargs: default_bin_args + { + 'install': false, + }, +) +testprep_targets += test_int128 + + +tests += { + 'name': 'test_int128', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'tap': { + 'tests': [ + 't/001_test_int128.pl', + ], + 'deps': [test_int128], + }, +} diff --git a/src/test/modules/test_int128/t/001_test_int128.pl b/src/test/modules/test_int128/t/001_test_int128.pl new file mode 100644 index 00000000000..0c683869f34 --- /dev/null +++ b/src/test/modules/test_int128/t/001_test_int128.pl @@ -0,0 +1,27 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +# Test 128-bit integer arithmetic code in int128.h + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Utils; +use Test::More; + +# Run the test program with 1M iterations +my $exe = "test_int128"; +my $size = 1_000_000; + +note "testing executable $exe"; + +my ($stdout, $stderr) = run_command([ $exe, $size ]); + +SKIP: +{ + skip "no native int128 type", 2 if $stdout =~ /skipping tests/; + + is($stdout, "", "test_int128: no stdout"); + is($stderr, "", "test_int128: no stderr"); +} + +done_testing(); diff --git a/src/tools/testint128.c b/src/test/modules/test_int128/test_int128.c index a25631e277d..caa06541a1f 100644 --- a/src/tools/testint128.c +++ b/src/test/modules/test_int128/test_int128.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * testint128.c + * test_int128.c * Testbed for roll-our-own 128-bit integer arithmetic. 
* * This is a standalone test program that compares the behavior of an @@ -10,13 +10,18 @@ * * * IDENTIFICATION - * src/tools/testint128.c + * src/test/modules/test_int128/test_int128.c * *------------------------------------------------------------------------- */ #include "postgres_fe.h" +#include <time.h> + +/* Require a native int128 type */ +#ifdef HAVE_INT128 + /* * By default, we test the non-native implementation in int128.h; but * by predefining USE_NATIVE_INT128 to 1, you can test the native @@ -36,7 +41,7 @@ typedef union { int128 i128; INT128 I128; - union + struct { #ifdef WORDS_BIGENDIAN int64 hi; @@ -48,6 +53,7 @@ typedef union } hl; } test128; +#define INT128_HEX_FORMAT "%016" PRIx64 "%016" PRIx64 /* * Control version of comparator. @@ -75,7 +81,7 @@ main(int argc, char **argv) { long count; - pg_prng_seed(&pg_global_prng_state, 0); + pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL)); if (argc >= 2) count = strtol(argv[1], NULL, 0); @@ -99,9 +105,9 @@ main(int argc, char **argv) if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo) { - printf("%016lX%016lX + unsigned %lX\n", x, y, z); - printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo); - printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo); + printf(INT128_HEX_FORMAT " + unsigned %016" PRIx64 "\n", x, y, z); + printf("native = " INT128_HEX_FORMAT "\n", t1.hl.hi, t1.hl.lo); + printf("result = " INT128_HEX_FORMAT "\n", t2.hl.hi, t2.hl.lo); return 1; } @@ -114,9 +120,9 @@ main(int argc, char **argv) if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo) { - printf("%016lX%016lX + signed %lX\n", x, y, z); - printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo); - printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo); + printf(INT128_HEX_FORMAT " + signed %016" PRIx64 "\n", x, y, z); + printf("native = " INT128_HEX_FORMAT "\n", t1.hl.hi, t1.hl.lo); + printf("result = " INT128_HEX_FORMAT "\n", t2.hl.hi, t2.hl.lo); return 1; } @@ -128,9 +134,9 @@ main(int argc, char **argv) if (t1.hl.hi != t2.hl.hi || t1.hl.lo != t2.hl.lo) { - printf("%lX * %lX\n", x, y); - printf("native = %016lX%016lX\n", t1.hl.hi, t1.hl.lo); - printf("result = %016lX%016lX\n", t2.hl.hi, t2.hl.lo); + printf("%016" PRIx64 " * %016" PRIx64 "\n", x, y); + printf("native = " INT128_HEX_FORMAT "\n", t1.hl.hi, t1.hl.lo); + printf("result = " INT128_HEX_FORMAT "\n", t2.hl.hi, t2.hl.lo); return 1; } @@ -146,8 +152,8 @@ main(int argc, char **argv) printf("comparison failure: %d vs %d\n", my_int128_compare(t1.i128, t2.i128), int128_compare(t1.I128, t2.I128)); - printf("arg1 = %016lX%016lX\n", t1.hl.hi, t1.hl.lo); - printf("arg2 = %016lX%016lX\n", t2.hl.hi, t2.hl.lo); + printf("arg1 = " INT128_HEX_FORMAT "\n", t1.hl.hi, t1.hl.lo); + printf("arg2 = " INT128_HEX_FORMAT "\n", t2.hl.hi, t2.hl.lo); return 1; } @@ -160,11 +166,25 @@ main(int argc, char **argv) printf("comparison failure: %d vs %d\n", my_int128_compare(t1.i128, t2.i128), int128_compare(t1.I128, t2.I128)); - printf("arg1 = %016lX%016lX\n", t1.hl.hi, t1.hl.lo); - printf("arg2 = %016lX%016lX\n", t2.hl.hi, t2.hl.lo); + printf("arg1 = " INT128_HEX_FORMAT "\n", t1.hl.hi, t1.hl.lo); + printf("arg2 = " INT128_HEX_FORMAT "\n", t2.hl.hi, t2.hl.lo); return 1; } } return 0; } + +#else /* ! HAVE_INT128 */ + +/* + * For now, do nothing if we don't have a native int128 type. 
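+ * The TAP test recognizes this message and skips its checks.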
+ */ +int +main(int argc, char **argv) +{ + printf("skipping tests: no native int128 type\n"); + return 0; +} + +#endif diff --git a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c index 32de6a3123e..80ad0296164 100644 --- a/src/test/modules/test_radixtree/test_radixtree.c +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -44,7 +44,7 @@ uint64 _expected = (expected_expr); \ if (_result != _expected) \ elog(ERROR, \ - "%s yielded " UINT64_HEX_FORMAT ", expected " UINT64_HEX_FORMAT " (%s) in file \"%s\" line %u", \ + "%s yielded %" PRIx64 ", expected %" PRIx64 " (%s) in file \"%s\" line %u", \ #result_expr, _result, _expected, #expected_expr, __FILE__, __LINE__); \ } while (0) diff --git a/src/test/modules/test_shm_mq/worker.c b/src/test/modules/test_shm_mq/worker.c index 96cd304dbbc..c1d321b69a4 100644 --- a/src/test/modules/test_shm_mq/worker.c +++ b/src/test/modules/test_shm_mq/worker.c @@ -77,7 +77,7 @@ test_shm_mq_main(Datum main_arg) * exit, which is fine. If there were a ResourceOwner, it would acquire * ownership of the mapping, but we have no need for that. */ - seg = dsm_attach(DatumGetInt32(main_arg)); + seg = dsm_attach(DatumGetUInt32(main_arg)); if (seg == NULL) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), diff --git a/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm b/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm index 609275e2c26..7224c286e1d 100644 --- a/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm +++ b/src/test/perl/PostgreSQL/Test/AdjustUpgrade.pm @@ -251,6 +251,32 @@ sub adjust_database_contents 'drop operator if exists public.=> (bigint, NONE)'); } + # Version 19 changed the output format of pg_lsn. To avoid output + # differences, set all pg_lsn columns to NULL if the old version is + # older than 19. + if ($old_version < 19) + { + if ($old_version >= '9.5') + { + _add_st($result, 'regression', + "update brintest set lsncol = NULL"); + } + + if ($old_version >= 12) + { + _add_st($result, 'regression', + "update tab_core_types set pg_lsn = NULL"); + } + + if ($old_version >= 14) + { + _add_st($result, 'regression', + "update brintest_multi set lsncol = NULL"); + _add_st($result, 'regression', + "update brintest_bloom set lsncol = NULL"); + } + } + return $result; } @@ -538,6 +564,7 @@ my @_unused_view_qualifiers = ( { obj => 'VIEW public.limit_thousand_v_2', qual => 'onek' }, { obj => 'VIEW public.limit_thousand_v_3', qual => 'onek' }, { obj => 'VIEW public.limit_thousand_v_4', qual => 'onek' }, + { obj => 'VIEW public.limit_thousand_v_5', qual => 'onek' }, # Since 14 { obj => 'MATERIALIZED VIEW public.compressmv', qual => 'cmdata1' }); diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 1c11750ac1d..35413f14019 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -290,6 +290,33 @@ sub connstr =pod +=item $node->is_alive() + +Check if the node is alive, using pg_isready. +Returns 1 if successful, 0 on failure. + +=cut + +sub is_alive +{ + my ($self) = @_; + local %ENV = $self->_get_env(); + + my $ret = PostgreSQL::Test::Utils::system_log( + 'pg_isready', + '--timeout' => $PostgreSQL::Test::Utils::timeout_default, + '--host' => $self->host, + '--port' => $self->port); + + if ($ret != 0) + { + return 0; + } + return 1; +} + +=pod + =item $node->raw_connect() Open a raw TCP or Unix domain socket connection to the server. 
This is @@ -684,7 +711,7 @@ sub init print $conf "\n# Added by PostgreSQL::Test::Cluster.pm\n"; print $conf "fsync = off\n"; print $conf "restart_after_crash = off\n"; - print $conf "log_line_prefix = '%m [%p] %q%a '\n"; + print $conf "log_line_prefix = '%m %b[%p] %q%a '\n"; print $conf "log_statement = all\n"; print $conf "log_replication_commands = on\n"; print $conf "wal_retrieve_retry_interval = '500ms'\n"; @@ -2199,6 +2226,14 @@ sub psql $ret = $?; }; my $exc_save = $@; + + # we need a dummy $stderr from hereon, if we didn't collect it + if (! defined $stderr) + { + my $errtxt = "<not collected>"; + $stderr = \$errtxt; + } + if ($exc_save) { diff --git a/src/test/perl/PostgreSQL/Test/Utils.pm b/src/test/perl/PostgreSQL/Test/Utils.pm index 7d7ca83495f..85d36a3171e 100644 --- a/src/test/perl/PostgreSQL/Test/Utils.pm +++ b/src/test/perl/PostgreSQL/Test/Utils.pm @@ -108,6 +108,7 @@ BEGIN delete $ENV{LANGUAGE}; delete $ENV{LC_ALL}; $ENV{LC_MESSAGES} = 'C'; + $ENV{LC_NUMERIC} = 'C'; setlocale(LC_ALL, ""); # This list should be kept in sync with pg_regress.c. diff --git a/src/test/postmaster/t/002_connection_limits.pl b/src/test/postmaster/t/002_connection_limits.pl index 325a00efd47..4a7fb16261f 100644 --- a/src/test/postmaster/t/002_connection_limits.pl +++ b/src/test/postmaster/t/002_connection_limits.pl @@ -20,7 +20,8 @@ $node->init( $node->append_conf('postgresql.conf', "max_connections = 6"); $node->append_conf('postgresql.conf', "reserved_connections = 2"); $node->append_conf('postgresql.conf', "superuser_reserved_connections = 1"); -$node->append_conf('postgresql.conf', "log_connections = on"); +$node->append_conf('postgresql.conf', + "log_connections = 'receipt,authentication,authorization'"); $node->append_conf('postgresql.conf', "log_min_messages=debug2"); $node->start; @@ -67,7 +68,8 @@ sub connect_fails_wait my $log_location = -s $node->logfile; $node->connect_fails($connstr, $test_name, %params); - $node->wait_for_log(qr/DEBUG: (00000: )?client backend.*exited with exit code 1/, + $node->wait_for_log( + qr/DEBUG: (00000: )?client backend.*exited with exit code 1/, $log_location); ok(1, "$test_name: client backend process exited"); } diff --git a/src/test/postmaster/t/003_start_stop.pl b/src/test/postmaster/t/003_start_stop.pl index 4dc394139d9..58e7ba6cc42 100644 --- a/src/test/postmaster/t/003_start_stop.pl +++ b/src/test/postmaster/t/003_start_stop.pl @@ -33,7 +33,8 @@ $node->append_conf('postgresql.conf', "max_connections = 5"); $node->append_conf('postgresql.conf', "max_wal_senders = 0"); $node->append_conf('postgresql.conf', "autovacuum_max_workers = 1"); $node->append_conf('postgresql.conf', "max_worker_processes = 1"); -$node->append_conf('postgresql.conf', "log_connections = on"); +$node->append_conf('postgresql.conf', + "log_connections = 'receipt,authentication,authorization'"); $node->append_conf('postgresql.conf', "log_min_messages = debug2"); $node->append_conf('postgresql.conf', "authentication_timeout = '$authentication_timeout s'"); diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index cb983766c67..52993c32dbb 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -54,6 +54,9 @@ tests += { 't/043_no_contrecord_switch.pl', 't/044_invalidate_inactive_slots.pl', 't/045_archive_restartpoint.pl', + 't/046_checkpoint_logical_slot.pl', + 't/047_checkpoint_physical_slot.pl', + 't/048_vacuum_horizon_floor.pl' ], }, } diff --git a/src/test/recovery/t/003_recovery_targets.pl b/src/test/recovery/t/003_recovery_targets.pl 
index 0ae2e982727..f2109efa9b1 100644 --- a/src/test/recovery/t/003_recovery_targets.pl +++ b/src/test/recovery/t/003_recovery_targets.pl @@ -187,4 +187,54 @@ ok( $logfile =~ qr/FATAL: .* recovery ended before configured recovery target was reached/, 'recovery end before target reached is a fatal error'); +# Invalid timeline target +$node_standby = PostgreSQL::Test::Cluster->new('standby_9'); +$node_standby->init_from_backup($node_primary, 'my_backup', + has_restoring => 1); +$node_standby->append_conf('postgresql.conf', + "recovery_target_timeline = 'bogus'"); + +$res = run_log( + [ + 'pg_ctl', + '--pgdata' => $node_standby->data_dir, + '--log' => $node_standby->logfile, + 'start', + ]); +ok(!$res, 'invalid timeline target (bogus value)'); + +my $log_start = $node_standby->wait_for_log("is not a valid number"); + +# Timeline target out of min range +$node_standby->append_conf('postgresql.conf', + "recovery_target_timeline = '0'"); + +$res = run_log( + [ + 'pg_ctl', + '--pgdata' => $node_standby->data_dir, + '--log' => $node_standby->logfile, + 'start', + ]); +ok(!$res, 'invalid timeline target (lower bound check)'); + +$log_start = + $node_standby->wait_for_log("must be between 1 and 4294967295", $log_start); + +# Timeline target out of max range +$node_standby->append_conf('postgresql.conf', + "recovery_target_timeline = '4294967296'"); + +$res = run_log( + [ + 'pg_ctl', + '--pgdata' => $node_standby->data_dir, + '--log' => $node_standby->logfile, + 'start', + ]); +ok(!$res, 'invalid timeline target (upper bound check)'); + +$log_start = + $node_standby->wait_for_log("must be between 1 and 4294967295", $log_start); + done_testing(); diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl index 4e60806563f..4c5af018ee4 100644 --- a/src/test/recovery/t/013_crash_restart.pl +++ b/src/test/recovery/t/013_crash_restart.pl @@ -27,7 +27,7 @@ $node->start(); $node->safe_psql( 'postgres', q[ALTER SYSTEM SET restart_after_crash = 1; - ALTER SYSTEM SET log_connections = 1; + ALTER SYSTEM SET log_connections = receipt; SELECT pg_reload_conf();]); # Run psql, keeping session alive, so we have an alive backend to kill. @@ -228,6 +228,13 @@ is( $node->safe_psql( 'before-orderly-restart', 'can still write after crash restart'); +# Confirm that the logical replication launcher, a background worker +# without the never-restart flag, has also restarted successfully. 
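+# (A worker registered with bgw_restart_time = BGW_NEVER_RESTART is forgotten
+# during a crash-and-restart cycle; the launcher, by contrast, should come
+# back on its own.)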
+is($node->poll_query_until('postgres', + "SELECT count(*) = 1 FROM pg_stat_activity WHERE backend_type = 'logical replication launcher'"), + '1', + 'logical replication launcher restarted after crash'); + # Just to be sure, check that an orderly restart now still works $node->restart(); diff --git a/src/test/recovery/t/016_min_consistency.pl b/src/test/recovery/t/016_min_consistency.pl index 9a3b4866fce..b381d0c21b5 100644 --- a/src/test/recovery/t/016_min_consistency.pl +++ b/src/test/recovery/t/016_min_consistency.pl @@ -39,7 +39,7 @@ sub find_largest_lsn defined($len) or die "read error on $filename: $!"; close($fh); - return sprintf("%X/%X", $max_hi, $max_lo); + return sprintf("%X/%08X", $max_hi, $max_lo); } # Initialize primary node diff --git a/src/test/recovery/t/022_crash_temp_files.pl b/src/test/recovery/t/022_crash_temp_files.pl index 50def031c96..0b68860bd3e 100644 --- a/src/test/recovery/t/022_crash_temp_files.pl +++ b/src/test/recovery/t/022_crash_temp_files.pl @@ -26,7 +26,7 @@ $node->start(); $node->safe_psql( 'postgres', q[ALTER SYSTEM SET remove_temp_files_after_crash = on; - ALTER SYSTEM SET log_connections = 1; + ALTER SYSTEM SET log_connections = receipt; ALTER SYSTEM SET work_mem = '64kB'; ALTER SYSTEM SET restart_after_crash = on; SELECT pg_reload_conf();]); diff --git a/src/test/recovery/t/027_stream_regress.pl b/src/test/recovery/t/027_stream_regress.pl index 83def062d11..5d2c06ba06e 100644 --- a/src/test/recovery/t/027_stream_regress.pl +++ b/src/test/recovery/t/027_stream_regress.pl @@ -81,7 +81,14 @@ my $rc = . "--max-concurrent-tests=20 " . "--inputdir=../regress " . "--outputdir=\"$outputdir\""); -if ($rc != 0) + +# Regression diffs are only meaningful if both the primary and the standby +# are still alive after a regression test failure. A crash would cause a +# useless increase in the log quantity, mostly filled with information +# related to queries that could not run. +my $primary_alive = $node_primary->is_alive; +my $standby_alive = $node_standby_1->is_alive; +if ($rc != 0 && $primary_alive && $standby_alive) { # Dump out the regression diffs file, if there is one my $diffs = "$outputdir/regression.diffs"; @@ -93,6 +100,8 @@ if ($rc != 0) } } is($rc, 0, 'regression tests pass'); +is($primary_alive, 1, 'primary alive after regression test run'); +is($standby_alive, 1, 'standby alive after regression test run'); # Clobber all sequences with their next value, so that we don't have # differences between nodes due to caching. 
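For reference, a minimal sketch (not part of the patch) of how a TAP test could use the Cluster.pm is_alive() helper added above; the node name 'demo' and the test labels are illustrative only:

    use strict;
    use warnings FATAL => 'all';
    use PostgreSQL::Test::Cluster;
    use Test::More;

    # Illustrative node; is_alive() wraps pg_isready and returns 1 or 0.
    my $node = PostgreSQL::Test::Cluster->new('demo');
    $node->init;
    $node->start;
    is($node->is_alive, 1, 'node accepts connections after start');

    # After an abrupt stop, pg_isready should fail.
    $node->stop('immediate');
    is($node->is_alive, 0, 'node not reachable after immediate stop');

    done_testing();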
diff --git a/src/test/recovery/t/032_relfilenode_reuse.pl b/src/test/recovery/t/032_relfilenode_reuse.pl index 492ef115ba4..0c44883cc34 100644 --- a/src/test/recovery/t/032_relfilenode_reuse.pl +++ b/src/test/recovery/t/032_relfilenode_reuse.pl @@ -14,7 +14,7 @@ $node_primary->init(allows_streaming => 1); $node_primary->append_conf( 'postgresql.conf', q[ allow_in_place_tablespaces = true -log_connections=on +log_connections=receipt # to avoid "repairing" corruption full_page_writes=off log_min_messages=debug2 diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index 921813483e3..c9c182892cf 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -8,6 +8,7 @@ use warnings FATAL => 'all'; use PostgreSQL::Test::Cluster; use PostgreSQL::Test::Utils; +use Time::HiRes qw(usleep); use Test::More; if ($ENV{enable_injection_points} ne 'yes') @@ -623,7 +624,7 @@ ok( $stderr =~ /ERROR: cannot copy invalidated replication slot "vacuum_full_inactiveslot"/, "invalidated slot cannot be copied"); -# Turn hot_standby_feedback back on +# Set hot_standby_feedback to on change_hot_standby_feedback_and_wait_for_xmins(1, 1); ################################################## @@ -754,12 +755,12 @@ wait_until_vacuum_can_remove( # message should not be issued ok( !$node_standby->log_contains( - "invalidating obsolete slot \"no_conflict_inactiveslot\"", $logstart), + "invalidating obsolete replication slot \"no_conflict_inactiveslot\"", $logstart), 'inactiveslot slot invalidation is not logged with vacuum on conflict_test' ); ok( !$node_standby->log_contains( - "invalidating obsolete slot \"no_conflict_activeslot\"", $logstart), + "invalidating obsolete replication slot \"no_conflict_activeslot\"", $logstart), 'activeslot slot invalidation is not logged with vacuum on conflict_test' ); diff --git a/src/test/recovery/t/037_invalid_database.pl b/src/test/recovery/t/037_invalid_database.pl index bdf39397397..dc52c55c7af 100644 --- a/src/test/recovery/t/037_invalid_database.pl +++ b/src/test/recovery/t/037_invalid_database.pl @@ -15,7 +15,7 @@ $node->append_conf( autovacuum = off max_prepared_transactions=5 log_min_duration_statement=0 -log_connections=on +log_connections=receipt log_disconnections=on )); diff --git a/src/test/recovery/t/040_standby_failover_slots_sync.pl b/src/test/recovery/t/040_standby_failover_slots_sync.pl index 9c8b49e942d..2c61c51e914 100644 --- a/src/test/recovery/t/040_standby_failover_slots_sync.pl +++ b/src/test/recovery/t/040_standby_failover_slots_sync.pl @@ -941,8 +941,7 @@ is( $standby1->safe_psql( 'synced slot retained on the new primary'); # Commit the prepared transaction -$standby1->safe_psql('postgres', - "COMMIT PREPARED 'test_twophase_slotsync';"); +$standby1->safe_psql('postgres', "COMMIT PREPARED 'test_twophase_slotsync';"); $standby1->wait_for_catchup('regress_mysub1'); # Confirm that the prepared transaction is replicated to the subscriber diff --git a/src/test/recovery/t/041_checkpoint_at_promote.pl b/src/test/recovery/t/041_checkpoint_at_promote.pl index cb63ac8d5c9..12750ff7d4f 100644 --- a/src/test/recovery/t/041_checkpoint_at_promote.pl +++ b/src/test/recovery/t/041_checkpoint_at_promote.pl @@ -91,7 +91,7 @@ $node_standby->wait_for_event('checkpointer', 'create-restart-point'); # Check the logs that the restart point has started on standby. This is # optional, but let's be sure. 
ok( $node_standby->log_contains( - "restartpoint starting: immediate wait", $logstart), + "restartpoint starting: fast wait", $logstart), "restartpoint has started"); # Trigger promotion during the restart point. diff --git a/src/test/recovery/t/046_checkpoint_logical_slot.pl b/src/test/recovery/t/046_checkpoint_logical_slot.pl new file mode 100644 index 00000000000..4fd709e3a03 --- /dev/null +++ b/src/test/recovery/t/046_checkpoint_logical_slot.pl @@ -0,0 +1,142 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group +# +# This test verifies the case when the logical slot is advanced during +# checkpoint. The test checks that the logical slot's restart_lsn still refers +# to an existing WAL segment after immediate restart. +# +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my ($node, $result); + +$node = PostgreSQL::Test::Cluster->new('mike'); +$node->init; +$node->append_conf('postgresql.conf', "wal_level = 'logical'"); +$node->start; + +# Check if the extension injection_points is available, as it may be +# possible that this script is run with installcheck, where the module +# would not be installed by default. +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$node->safe_psql('postgres', q(CREATE EXTENSION injection_points)); + +# Create the two slots we'll need. +$node->safe_psql('postgres', + q{select pg_create_logical_replication_slot('slot_logical', 'test_decoding')} +); +$node->safe_psql('postgres', + q{select pg_create_physical_replication_slot('slot_physical', true)}); + +# Advance both slots to the current position just to have everything "valid". +$node->safe_psql('postgres', + q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null)} +); +$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Run checkpoint to flush current state to disk and set a baseline. +$node->safe_psql('postgres', q{checkpoint}); + +# Generate some transactions to get RUNNING_XACTS. +my $xacts = $node->background_psql('postgres'); +$xacts->query_until( + qr/run_xacts/, + q(\echo run_xacts +SELECT 1 \watch 0.1 +\q +)); + +$node->advance_wal(20); + +# Run another checkpoint to set a new restore LSN. +$node->safe_psql('postgres', q{checkpoint}); + +$node->advance_wal(20); + +# Run another checkpoint, this time in the background, and make it wait +# on the injection point so that the checkpoint stops right before +# removing old WAL segments. +note('starting checkpoint'); + +my $checkpoint = $node->background_psql('postgres'); +$checkpoint->query_safe( + q(select injection_points_attach('checkpoint-before-old-wal-removal','wait')) +); +$checkpoint->query_until( + qr/starting_checkpoint/, + q(\echo starting_checkpoint +checkpoint; +\q +)); + +# Wait until the checkpoint stops right before removing WAL segments. +note('waiting for injection_point'); +$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal'); +note('injection_point is reached'); + +# Try to advance the logical slot, but make it stop when it moves to the next +# WAL segment (this has to happen in the background, too).
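+# The injection point makes the advance pause as soon as it crosses into a
+# new segment, so the slot's restart_lsn keeps pointing at WAL that the
+# paused checkpoint is about to remove.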
+my $logical = $node->background_psql('postgres'); +$logical->query_safe( + q{select injection_points_attach('logical-replication-slot-advance-segment','wait');} +); +$logical->query_until( + qr/get_changes/, + q( +\echo get_changes +select count(*) from pg_logical_slot_get_changes('slot_logical', null, null) \watch 1 +\q +)); + +# Wait until the slot's restart_lsn points to the next WAL segment. +note('waiting for injection_point'); +$node->wait_for_event('client backend', + 'logical-replication-slot-advance-segment'); +note('injection_point is reached'); + +# OK, we're in the right situation: time to advance the physical slot, which +# recalculates the required LSN, and then unblock the checkpoint, which +# removes the WAL still needed by the logical slot. +$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Generate a long WAL record, spanning at least two pages for the follow-up +# post-recovery check. +$node->safe_psql('postgres', + q{select pg_logical_emit_message(false, '', repeat('123456789', 1000))}); + +# Continue the checkpoint and wait for its completion. +my $log_offset = -s $node->logfile; +$node->safe_psql('postgres', + q{select injection_points_wakeup('checkpoint-before-old-wal-removal')}); +$node->wait_for_log(qr/checkpoint complete/, $log_offset); + +# Abruptly stop the server. +$node->stop('immediate'); + +$node->start; + +eval { + $node->safe_psql('postgres', + q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null);} + ); +}; +is($@, '', "Logical slot still valid"); + +done_testing(); diff --git a/src/test/recovery/t/047_checkpoint_physical_slot.pl b/src/test/recovery/t/047_checkpoint_physical_slot.pl new file mode 100644 index 00000000000..9e98383e30e --- /dev/null +++ b/src/test/recovery/t/047_checkpoint_physical_slot.pl @@ -0,0 +1,133 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group +# +# This test verifies the case when the physical slot is advanced during +# checkpoint. The test checks that the physical slot's restart_lsn still refers +# to an existing WAL segment after immediate restart. +# +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; + +use Test::More; + +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +my ($node, $result); + +$node = PostgreSQL::Test::Cluster->new('mike'); +$node->init; +$node->append_conf('postgresql.conf', "wal_level = 'replica'"); +$node->start; + +# Check if the extension injection_points is available, as it may be +# possible that this script is run with installcheck, where the module +# would not be installed by default. +if (!$node->check_extension('injection_points')) +{ + plan skip_all => 'Extension injection_points not installed'; +} + +$node->safe_psql('postgres', q(CREATE EXTENSION injection_points)); + +# Create a physical replication slot. +$node->safe_psql('postgres', + q{select pg_create_physical_replication_slot('slot_physical', true)}); + +# Advance slot to the current position, just to have everything "valid". +$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Run checkpoint to flush current state to disk and set a baseline. +$node->safe_psql('postgres', q{checkpoint}); + +# Advance the WAL by about 20 segments.
+$node->advance_wal(20); + +# Advance slot to the current position, just to have everything "valid". +$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Run another checkpoint to set a new restore LSN. +$node->safe_psql('postgres', q{checkpoint}); + +# Advance the WAL by another ~20 segments. +$node->advance_wal(20); + +my $restart_lsn_init = $node->safe_psql('postgres', + q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'} +); +chomp($restart_lsn_init); +note("restart lsn before checkpoint: $restart_lsn_init"); + +# Run another checkpoint, this time in the background, and make it wait +# on the injection point so that the checkpoint stops right before +# removing old WAL segments. +note('starting checkpoint'); + +my $checkpoint = $node->background_psql('postgres'); +$checkpoint->query_safe( + q{select injection_points_attach('checkpoint-before-old-wal-removal','wait')} +); +$checkpoint->query_until( + qr/starting_checkpoint/, + q(\echo starting_checkpoint +checkpoint; +\q +)); + +# Wait until the checkpoint stops right before removing WAL segments. +note('waiting for injection_point'); +$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal'); +note('injection_point is reached'); + +# OK, we're in the right situation: time to advance the physical slot, which +# recalculates the required LSN and then unblock the checkpoint, which +# removes the WAL still needed by the physical slot. +$node->safe_psql('postgres', + q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())} +); + +# Continue the checkpoint and wait for its completion. +my $log_offset = -s $node->logfile; +$node->safe_psql('postgres', + q{select injection_points_wakeup('checkpoint-before-old-wal-removal')}); +$node->wait_for_log(qr/checkpoint complete/, $log_offset); + +my $restart_lsn_old = $node->safe_psql('postgres', + q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'} +); +chomp($restart_lsn_old); +note("restart lsn before stop: $restart_lsn_old"); + +# Abruptly stop the server. +$node->stop('immediate'); + +$node->start; + +# Get the restart_lsn of the slot right after restarting. +my $restart_lsn = $node->safe_psql('postgres', + q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'} +); +chomp($restart_lsn); +note("restart lsn: $restart_lsn"); + +# Get the WAL segment name for the slot's restart_lsn. +my $restart_lsn_segment = $node->safe_psql('postgres', + "SELECT pg_walfile_name('$restart_lsn'::pg_lsn)"); +chomp($restart_lsn_segment); + +# Check if the required WAL segment exists.
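+# Had the checkpoint removed WAL still needed by the slot, this segment
+# would be missing after the immediate restart.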
+note("required by slot segment name: $restart_lsn_segment"); +my $datadir = $node->data_dir; +ok( -f "$datadir/pg_wal/$restart_lsn_segment", + "WAL segment $restart_lsn_segment for physical slot's restart_lsn $restart_lsn exists" +); + +done_testing(); diff --git a/src/test/recovery/t/048_vacuum_horizon_floor.pl b/src/test/recovery/t/048_vacuum_horizon_floor.pl new file mode 100644 index 00000000000..e56fce59d58 --- /dev/null +++ b/src/test/recovery/t/048_vacuum_horizon_floor.pl @@ -0,0 +1,288 @@ +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use Test::More; + +# Test that vacuum prunes away all dead tuples killed before OldestXmin +# +# This test creates a table on a primary, updates the table to generate dead +# tuples for vacuum, and then, during the vacuum, uses the replica to force +# GlobalVisState->maybe_needed on the primary to move backwards and precede +# the value of OldestXmin set at the beginning of vacuuming the table. + +# Set up nodes +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 'physical'); + +# io_combine_limit is set to 1 to avoid pinning more than one buffer at a time +# to ensure test determinism. +$node_primary->append_conf( + 'postgresql.conf', qq[ +hot_standby_feedback = on +autovacuum = off +log_min_messages = INFO +maintenance_work_mem = 64 +io_combine_limit = 1 +]); +$node_primary->start; + +my $node_replica = PostgreSQL::Test::Cluster->new('standby'); + +$node_primary->backup('my_backup'); +$node_replica->init_from_backup($node_primary, 'my_backup', + has_streaming => 1); + +$node_replica->start; + +my $test_db = "test_db"; +$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db"); + +# Save the original connection info for later use +my $orig_conninfo = $node_primary->connstr(); + +my $table1 = "vac_horizon_floor_table"; + +# Long-running Primary Session A +my $psql_primaryA = + $node_primary->background_psql($test_db, on_error_stop => 1); + +# Long-running Primary Session B +my $psql_primaryB = + $node_primary->background_psql($test_db, on_error_stop => 1); + +# Our test relies on two rounds of index vacuuming for reasons elaborated +# later. To trigger two rounds of index vacuuming, we must fill up the +# TIDStore with dead items partway through a vacuum of the table. The number +# of rows is just enough to ensure we exceed maintenance_work_mem on all +# supported platforms, while keeping test runtime as short as we can. +my $nrows = 2000; + +# Because vacuum's first pass, pruning, is where we use the GlobalVisState to +# check tuple visibility, GlobalVisState->maybe_needed must move backwards +# during pruning before checking the visibility for a tuple which would have +# been considered HEAPTUPLE_DEAD prior to maybe_needed moving backwards but +# HEAPTUPLE_RECENTLY_DEAD compared to the new, older value of maybe_needed. +# +# We must not only force the horizon on the primary to move backwards but also +# force the vacuuming backend's GlobalVisState to be updated. GlobalVisState +# is forced to update during index vacuuming. +# +# _bt_pendingfsm_finalize() calls GetOldestNonRemovableTransactionId() at the +# end of a round of index vacuuming, updating the backend's GlobalVisState +# and, in our case, moving maybe_needed backwards. +# +# Then vacuum's first (pruning) pass will continue and pruning will find our +# later inserted and updated tuple HEAPTUPLE_RECENTLY_DEAD when compared to +# maybe_needed but HEAPTUPLE_DEAD when compared to OldestXmin. 
+# +# Thus, we must force at least two rounds of index vacuuming to ensure that +# some tuple visibility checks will happen after a round of index vacuuming. +# To accomplish this, we set maintenance_work_mem to its minimum value and +# insert and delete enough rows that we force at least one round of index +# vacuuming before getting to a dead tuple which was killed after the standby +# is disconnected. +$node_primary->safe_psql( + $test_db, qq[ + CREATE TABLE ${table1}(col1 int) + WITH (autovacuum_enabled=false, fillfactor=10); + INSERT INTO $table1 VALUES(7); + INSERT INTO $table1 SELECT generate_series(1, $nrows) % 3; + CREATE INDEX on ${table1}(col1); + DELETE FROM $table1 WHERE col1 = 0; + INSERT INTO $table1 VALUES(7); +]); + +# We will later move the primary forward while the standby is disconnected. +# For now, however, there is no reason not to wait for the standby to catch +# up. +my $primary_lsn = $node_primary->lsn('flush'); +$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn); + +# Test that the WAL receiver is up and running. +$node_replica->poll_query_until( + $test_db, qq[ + SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 't'); + +# Set primary_conninfo to something invalid on the replica and reload the +# config. Once the config is reloaded, the startup process will force the WAL +# receiver to restart and it will be unable to reconnect because of the +# invalid connection information. +$node_replica->safe_psql( + $test_db, qq[ + ALTER SYSTEM SET primary_conninfo = ''; + SELECT pg_reload_conf(); + ]); + +# Wait until the WAL receiver has shut down and been unable to start up again. +$node_replica->poll_query_until( + $test_db, qq[ + SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 'f'); + +# Now insert and update a tuple which will be visible to the vacuum on the +# primary but which will have xmax newer than the oldest xmin on the standby +# that was recently disconnected. +my $res = $psql_primaryA->query_safe( + qq[ + INSERT INTO $table1 VALUES (99); + UPDATE $table1 SET col1 = 100 WHERE col1 = 99; + SELECT 'after_update'; + ] +); + +# Make sure the UPDATE finished +like($res, qr/^after_update$/m, "UPDATE occurred on primary session A"); + +# Open a cursor on the primary whose pin will keep VACUUM from getting a +# cleanup lock on the first page of the relation. We want VACUUM to be able to +# start, calculate initial values for OldestXmin and GlobalVisState and then +# be unable to proceed with pruning our dead tuples. This will allow us to +# reconnect the standby and push the horizon back before we start actual +# pruning and vacuuming. +my $primary_cursor1 = "vac_horizon_floor_cursor1"; + +# The first value inserted into the table was a 7, so FETCH FORWARD should +# return a 7. That's how we know the cursor has a pin. +# Disable index scans so the cursor pins heap pages and not index pages. +$res = $psql_primaryB->query_safe( + qq[ + BEGIN; + SET enable_bitmapscan = off; + SET enable_indexscan = off; + SET enable_indexonlyscan = off; + DECLARE $primary_cursor1 CURSOR FOR SELECT * FROM $table1 WHERE col1 = 7; + FETCH $primary_cursor1; + ] +); + +is($res, 7, qq[Cursor query returned $res. Expected value 7.]); + +# Get the PID of the session which will run the VACUUM FREEZE so that we can +# use it to filter pg_stat_activity later. +my $vacuum_pid = $psql_primaryA->query_safe("SELECT pg_backend_pid();"); + +# Now start a VACUUM FREEZE on the primary. 
It will call vacuum_get_cutoffs() +# and establish values of OldestXmin and GlobalVisState which are newer than +# all of our dead tuples. Then it will be unable to get a cleanup lock to +# start pruning, so it will hang. +# +# We use VACUUM FREEZE because it will wait for a cleanup lock instead of +# skipping the page pinned by the cursor. Note that this works because the target +# tuple's xmax precedes OldestXmin which ensures that lazy_scan_noprune() will +# return false and we will wait for the cleanup lock. +# +# Disable any prefetching, parallelism, or other concurrent I/O by vacuum. The +# pages of the heap must be processed in order by a single worker to ensure +# test stability (PARALLEL 0 shouldn't be necessary but guards against the +# possibility of parallel heap vacuuming). +$psql_primaryA->{stdin} .= qq[ + SET maintenance_io_concurrency = 0; + VACUUM (VERBOSE, FREEZE, PARALLEL 0) $table1; + \\echo VACUUM + ]; + +# Make sure the VACUUM command makes it to the server. +$psql_primaryA->{run}->pump_nb(); + +# Make sure that the VACUUM has already called vacuum_get_cutoffs() and is +# just waiting on the lock to start vacuuming. We don't want the standby to +# re-establish a connection to the primary and push the horizon back until +# we've saved initial values in GlobalVisState and calculated OldestXmin. +$node_primary->poll_query_until( + $test_db, + qq[ + SELECT count(*) >= 1 FROM pg_stat_activity + WHERE pid = $vacuum_pid + AND wait_event = 'BufferPin'; + ], + 't'); + +# Ensure the WAL receiver is still not active on the replica. +$node_replica->poll_query_until( + $test_db, qq[ + SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 'f'); + +# Allow the WAL receiver connection to re-establish. +$node_replica->safe_psql( + $test_db, qq[ + ALTER SYSTEM SET primary_conninfo = '$orig_conninfo'; + SELECT pg_reload_conf(); + ]); + +# Ensure the new WAL receiver has connected. +$node_replica->poll_query_until( + $test_db, qq[ + SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 't'); + +# Once the WAL sender is shown on the primary, the replica should have +# connected with the primary and pushed the horizon backward. Primary Session +# A won't see that until the VACUUM FREEZE proceeds and does its first round +# of index vacuuming. +$node_primary->poll_query_until( + $test_db, qq[ + SELECT EXISTS (SELECT * FROM pg_stat_replication);], 't'); + +# Move the cursor forward to the next 7. We inserted the 7 much later, so +# advancing the cursor should allow vacuum to proceed vacuuming most pages of +# the relation. Because we set maintenance_work_mem sufficiently low, we +# expect that a round of index vacuuming has happened and that the vacuum is +# now waiting for the cursor to release its pin on the last page of the +# relation. +$res = $psql_primaryB->query_safe("FETCH $primary_cursor1"); +is($res, 7, + qq[Cursor query returned $res from second fetch. Expected value 7.]); + +# Prevent the test from incorrectly passing by confirming that we did indeed +# do a pass of index vacuuming. +$node_primary->poll_query_until( + $test_db, qq[ + SELECT index_vacuum_count > 0 + FROM pg_stat_progress_vacuum + WHERE datname='$test_db' AND relid::regclass = '$table1'::regclass; + ], 't'); + +# Commit the transaction with the open cursor so that the VACUUM can finish. +$psql_primaryB->query_until( + qr/^commit$/m, + qq[ + COMMIT; + \\echo commit + ] +); + +# VACUUM proceeds with pruning and does a visibility check on each tuple.
In +# older versions of Postgres, pruning found our final dead tuple +# non-removable (HEAPTUPLE_RECENTLY_DEAD) since its xmax is after the new +# value of maybe_needed. Then heap_prepare_freeze_tuple() would decide the +# tuple xmax should be frozen because it precedes OldestXmin. Vacuum would +# then error out in heap_pre_freeze_checks() with "cannot freeze committed +# xmax". This was fixed by changing pruning to treat all +# HEAPTUPLE_RECENTLY_DEAD tuples whose xmaxes precede OldestXmin as +# HEAPTUPLE_DEAD and to remove them. + +# With the fix, VACUUM should finish successfully, incrementing the table +# vacuum_count. +$node_primary->poll_query_until( + $test_db, + qq[ + SELECT vacuum_count > 0 + FROM pg_stat_all_tables WHERE relname = '${table1}'; + ] + , 't'); + +$primary_lsn = $node_primary->lsn('flush'); + +# Make sure something causes us to flush. +$node_primary->safe_psql($test_db, "INSERT INTO $table1 VALUES (1);"); + +# Nothing on the replica should cause a recovery conflict, so this should +# finish successfully. +$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn); + +## Shut down psqls +$psql_primaryA->quit; +$psql_primaryB->quit; + +$node_replica->stop(); +$node_primary->stop(); + +done_testing(); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 476266e3f4b..08984dd98f1 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -4745,6 +4745,21 @@ alter table attbl alter column p1 set data type bigint; alter table atref alter column c1 set data type bigint; drop table attbl, atref; /* End test case for bug #17409 */ +/* Test case for bug #18970 */ +create table attbl(a int); +create table atref(b attbl check ((b).a is not null)); +alter table attbl alter column a type numeric; -- someday this should work +ERROR: cannot alter table "attbl" because column "atref.b" uses its row type +alter table atref drop constraint atref_b_check; +create statistics atref_stat on ((b).a is not null) from atref; +alter table attbl alter column a type numeric; -- someday this should work +ERROR: cannot alter table "attbl" because column "atref.b" uses its row type +drop statistics atref_stat; +create index atref_idx on atref (((b).a)); +alter table attbl alter column a type numeric; -- someday this should work +ERROR: cannot alter table "attbl" because column "atref.b" uses its row type +drop table attbl, atref; +/* End test case for bug #18970 */ -- Test that ALTER TABLE rewrite preserves a clustered index -- for normal indexes and indexes on constraints. 
create table alttype_cluster (a int); diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index bfb1a286ea4..21dc9b5783a 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -195,54 +195,123 @@ ORDER BY proname DESC, proargtypes DESC, pronamespace DESC LIMIT 1; (1 row) -- --- Add coverage for RowCompare quals whose rhs row has a NULL that ends scan +-- Forwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- Index Only Scan using pg_proc_proname_args_nsp_index on pg_proc - Index Cond: ((ROW(proname, proargtypes) < ROW('abs'::name, NULL::oidvector)) AND (proname = 'abs'::name)) + Index Cond: ((ROW(proname, proargtypes) >= ROW('abs'::name, NULL::oidvector)) AND (proname <= 'abs'::name)) (2 rows) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; proname | proargtypes | pronamespace ---------+-------------+-------------- (0 rows) -- --- Add coverage for backwards scan RowCompare quals whose rhs row has a NULL --- that ends scan +-- Forwards scan RowCompare quals whose row arg has a NULL that ends scan -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Index Only Scan using pg_proc_proname_args_nsp_index on pg_proc + Index Cond: ((proname >= 'abs'::name) AND (ROW(proname, proargtypes) < ROW('abs'::name, NULL::oidvector))) +(2 rows) + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + proname | proargtypes | pronamespace +---------+-------------+-------------- +(0 rows) + +-- +-- Backwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Index Only Scan Backward using pg_proc_proname_args_nsp_index on pg_proc + Index Cond: ((proname >= 'abs'::name) AND (ROW(proname, proargtypes) <= ROW('abs'::name, NULL::oidvector))) +(2 rows) + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + proname | proargtypes | 
pronamespace +---------+-------------+-------------- +(0 rows) + +-- +-- Backwards scan RowCompare qual whose row arg has a NULL that ends scan +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; - QUERY PLAN ------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- Index Only Scan Backward using pg_proc_proname_args_nsp_index on pg_proc - Index Cond: ((ROW(proname, proargtypes) > ROW('abs'::name, NULL::oidvector)) AND (proname = 'abs'::name)) + Index Cond: ((ROW(proname, proargtypes) > ROW('abs'::name, NULL::oidvector)) AND (proname <= 'abs'::name)) (2 rows) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; proname | proargtypes | pronamespace ---------+-------------+-------------- (0 rows) +-- Makes B-Tree preprocessing deal with unmarking redundant keys that were +-- initially marked required (test case relies on current row compare +-- preprocessing limitations) +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Index Only Scan using pg_proc_proname_args_nsp_index on pg_proc + Index Cond: ((ROW(proname, proargtypes) > ROW('abs'::name, NULL::oidvector)) AND (proname = 'zzzzzz'::name) AND (proargtypes = ANY ('{"26 23",5077}'::oidvector[])) AND (pronamespace = ANY ('{1,2,3}'::oid[]))) +(2 rows) + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + proname | proargtypes | pronamespace +---------+-------------+-------------- +(0 rows) + -- --- Add coverage for recheck of > key following array advancement on previous --- (left sibling) page that used a high key whose attribute value corresponding --- to the > key was -inf (due to being truncated when the high key was created). +-- Performs a recheck of > key following array advancement on previous (left +-- sibling) page that used a high key whose attribute value corresponding to +-- the > key was -inf (due to being truncated when the high key was created). -- -- XXX This relies on the assumption that tenk1_thous_tenthous has a truncated -- high key "(183, -inf)" on the first page that we'll scan. The test will only diff --git a/src/test/regress/expected/compression.out b/src/test/regress/expected/compression.out index 4dd9ee7200d..09f198149aa 100644 --- a/src/test/regress/expected/compression.out +++ b/src/test/regress/expected/compression.out @@ -1,3 +1,7 @@ +-- Default set of tests for TOAST compression, independent of the compression +-- methods supported by the build. 
+CREATE SCHEMA pglz; +SET search_path TO pglz, public; \set HIDE_TOAST_COMPRESSION false -- ensure we get stable results regardless of installation's default SET default_toast_compression = 'pglz'; @@ -6,21 +10,13 @@ CREATE TABLE cmdata(f1 text COMPRESSION pglz); CREATE INDEX idx ON cmdata(f1); INSERT INTO cmdata VALUES(repeat('1234567890', 1000)); \d+ cmdata - Table "public.cmdata" + Table "pglz.cmdata" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | text | | | | extended | pglz | | Indexes: "idx" btree (f1) -CREATE TABLE cmdata1(f1 TEXT COMPRESSION lz4); -INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); -\d+ cmdata1 - Table "public.cmdata1" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | lz4 | | - -- verify stored compression method in the data SELECT pg_column_compression(f1) FROM cmdata; pg_column_compression @@ -28,12 +24,6 @@ SELECT pg_column_compression(f1) FROM cmdata; pglz (1 row) -SELECT pg_column_compression(f1) FROM cmdata1; - pg_column_compression ------------------------ - lz4 -(1 row) - -- decompress data slice SELECT SUBSTR(f1, 200, 5) FROM cmdata; substr @@ -41,16 +31,10 @@ SELECT SUBSTR(f1, 200, 5) FROM cmdata; 01234 (1 row) -SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; - substr ----------------------------------------------------- - 01234567890123456789012345678901234567890123456789 -(1 row) - -- copy with table creation SELECT * INTO cmmove1 FROM cmdata; \d+ cmmove1 - Table "public.cmmove1" + Table "pglz.cmmove1" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | text | | | | extended | | | @@ -61,45 +45,9 @@ SELECT pg_column_compression(f1) FROM cmmove1; pglz (1 row) --- copy to existing table -CREATE TABLE cmmove3(f1 text COMPRESSION pglz); -INSERT INTO cmmove3 SELECT * FROM cmdata; -INSERT INTO cmmove3 SELECT * FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove3; - pg_column_compression ------------------------ - pglz - lz4 -(2 rows) - --- test LIKE INCLUDING COMPRESSION -CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | lz4 | | - -DROP TABLE cmdata2; -- try setting compression for incompressible data type CREATE TABLE cmdata2 (f1 int COMPRESSION pglz); ERROR: column data type integer does not support compression --- update using datum from different table -CREATE TABLE cmmove2(f1 text COMPRESSION pglz); -INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - pglz -(1 row) - -UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - lz4 -(1 row) - -- test externally stored compressed data CREATE OR REPLACE FUNCTION large_val() RETURNS TEXT LANGUAGE SQL AS 'select 
array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; @@ -111,21 +59,6 @@ SELECT pg_column_compression(f1) FROM cmdata2; pglz (1 row) -INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); -SELECT pg_column_compression(f1) FROM cmdata1; - pg_column_compression ------------------------ - lz4 - lz4 -(2 rows) - -SELECT SUBSTR(f1, 200, 5) FROM cmdata1; - substr --------- - 01234 - 79026 -(2 rows) - SELECT SUBSTR(f1, 200, 5) FROM cmdata2; substr -------- @@ -136,21 +69,21 @@ DROP TABLE cmdata2; --test column type update varlena/non-varlena CREATE TABLE cmdata2 (f1 int); \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+---------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | integer | | | | plain | | | ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | character varying | | | | extended | | | ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE int USING f1::integer; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+---------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | integer | | | | plain | | | @@ -160,14 +93,14 @@ ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE int USING f1::integer; ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION pglz; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- f1 | character varying | | | | extended | pglz | | ALTER TABLE cmdata2 ALTER COLUMN f1 SET STORAGE plain; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | character varying | | | | plain | pglz | | @@ -179,164 +112,47 @@ SELECT pg_column_compression(f1) FROM cmdata2; (1 row) --- test compression with materialized view -CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; -\d+ compressmv - Materialized view "public.compressmv" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - x | text | | | | extended | | | -View definition: - SELECT f1 AS x - FROM cmdata1; - -SELECT pg_column_compression(f1) FROM cmdata1; - pg_column_compression ------------------------ - lz4 - lz4 -(2 rows) - -SELECT pg_column_compression(x) FROM compressmv; - pg_column_compression ------------------------ - lz4 - lz4 -(2 rows) - --- test compression with partition -CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); -CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); -CREATE TABLE cmpart2(f1 text COMPRESSION pglz); -ALTER TABLE 
cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; - pg_column_compression ------------------------ - lz4 -(1 row) - -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression ------------------------ - pglz -(1 row) - -- test compression with inheritance -CREATE TABLE cminh() INHERITS(cmdata, cmdata1); -- error -NOTICE: merging multiple inherited definitions of column "f1" -ERROR: column "f1" has a compression method conflict -DETAIL: pglz versus lz4 -CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata); -- error -NOTICE: merging column "f1" with inherited definition -ERROR: column "f1" has a compression method conflict -DETAIL: pglz versus lz4 CREATE TABLE cmdata3(f1 text); CREATE TABLE cminh() INHERITS (cmdata, cmdata3); NOTICE: merging multiple inherited definitions of column "f1" -- test default_toast_compression GUC +-- suppress machine-dependent details +\set VERBOSITY terse SET default_toast_compression = ''; ERROR: invalid value for parameter "default_toast_compression": "" -HINT: Available values: pglz, lz4. SET default_toast_compression = 'I do not exist compression'; ERROR: invalid value for parameter "default_toast_compression": "I do not exist compression" -HINT: Available values: pglz, lz4. -SET default_toast_compression = 'lz4'; SET default_toast_compression = 'pglz'; --- test alter compression method -ALTER TABLE cmdata ALTER COLUMN f1 SET COMPRESSION lz4; -INSERT INTO cmdata VALUES (repeat('123456789', 4004)); -\d+ cmdata - Table "public.cmdata" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | lz4 | | -Indexes: - "idx" btree (f1) -Child tables: cminh - -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - lz4 -(2 rows) - +\set VERBOSITY default ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION default; \d+ cmdata2 - Table "public.cmdata2" + Table "pglz.cmdata2" Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description --------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- f1 | character varying | | | | plain | | | --- test alter compression method for materialized views -ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; -\d+ compressmv - Materialized view "public.compressmv" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - x | text | | | | extended | lz4 | | -View definition: - SELECT f1 AS x - FROM cmdata1; - --- test alter compression method for partitioned tables -ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; -ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; --- new data should be compressed with the current compression method -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; - pg_column_compression ------------------------ - lz4 - pglz -(2 rows) - -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression 
------------------------ - pglz - lz4 -(2 rows) - +DROP TABLE cmdata2; -- VACUUM FULL does not recompress SELECT pg_column_compression(f1) FROM cmdata; pg_column_compression ----------------------- pglz - lz4 -(2 rows) +(1 row) VACUUM FULL cmdata; SELECT pg_column_compression(f1) FROM cmdata; pg_column_compression ----------------------- pglz - lz4 -(2 rows) +(1 row) --- test expression index -DROP TABLE cmdata2; -CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); -CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); -INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM -generate_series(1, 50) g), VERSION()); -- check data is ok SELECT length(f1) FROM cmdata; length -------- 10000 - 36036 -(2 rows) - -SELECT length(f1) FROM cmdata1; - length --------- - 10040 - 12449 -(2 rows) +(1 row) SELECT length(f1) FROM cmmove1; length @@ -344,19 +160,6 @@ SELECT length(f1) FROM cmmove1; 10000 (1 row) -SELECT length(f1) FROM cmmove2; - length --------- - 10040 -(1 row) - -SELECT length(f1) FROM cmmove3; - length --------- - 10000 - 10040 -(2 rows) - CREATE TABLE badcompresstbl (a text COMPRESSION I_Do_Not_Exist_Compression); -- fails ERROR: invalid compression method "i_do_not_exist_compression" CREATE TABLE badcompresstbl (a text); diff --git a/src/test/regress/expected/compression_1.out b/src/test/regress/expected/compression_1.out deleted file mode 100644 index 7bd7642b4b9..00000000000 --- a/src/test/regress/expected/compression_1.out +++ /dev/null @@ -1,360 +0,0 @@ -\set HIDE_TOAST_COMPRESSION false --- ensure we get stable results regardless of installation's default -SET default_toast_compression = 'pglz'; --- test creating table with compression method -CREATE TABLE cmdata(f1 text COMPRESSION pglz); -CREATE INDEX idx ON cmdata(f1); -INSERT INTO cmdata VALUES(repeat('1234567890', 1000)); -\d+ cmdata - Table "public.cmdata" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | pglz | | -Indexes: - "idx" btree (f1) - -CREATE TABLE cmdata1(f1 TEXT COMPRESSION lz4); -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. 
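A note on the "VACUUM FULL does not recompress" case shown above: existing datums keep whatever method they were stored with, so switching a column's compression setting only affects newly toasted values. Below is a minimal sketch of how one might force recompression by rewriting each value; the table name cm_demo and the concatenation trick are illustrative assumptions, not part of the regression suite, and the lz4 target requires an lz4-enabled build.

-- Assumes cm_demo(f1 text) already holds pglz-compressed rows.
ALTER TABLE cm_demo ALTER COLUMN f1 SET COMPRESSION lz4;
VACUUM FULL cm_demo;              -- copies datums verbatim; still pglz
UPDATE cm_demo SET f1 = f1 || ''; -- detoasts each value and re-stores it, now as lz4
SELECT DISTINCT pg_column_compression(f1) FROM cm_demo; -- expected: lz4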
-INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); -ERROR: relation "cmdata1" does not exist -LINE 1: INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); - ^ -\d+ cmdata1 --- verify stored compression method in the data -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz -(1 row) - -SELECT pg_column_compression(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmdata1; - ^ --- decompress data slice -SELECT SUBSTR(f1, 200, 5) FROM cmdata; - substr --------- - 01234 -(1 row) - -SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; - ^ --- copy with table creation -SELECT * INTO cmmove1 FROM cmdata; -\d+ cmmove1 - Table "public.cmmove1" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | | | - -SELECT pg_column_compression(f1) FROM cmmove1; - pg_column_compression ------------------------ - pglz -(1 row) - --- copy to existing table -CREATE TABLE cmmove3(f1 text COMPRESSION pglz); -INSERT INTO cmmove3 SELECT * FROM cmdata; -INSERT INTO cmmove3 SELECT * FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: INSERT INTO cmmove3 SELECT * FROM cmdata1; - ^ -SELECT pg_column_compression(f1) FROM cmmove3; - pg_column_compression ------------------------ - pglz -(1 row) - --- test LIKE INCLUDING COMPRESSION -CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); -ERROR: relation "cmdata1" does not exist -LINE 1: CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); - ^ -\d+ cmdata2 -DROP TABLE cmdata2; -ERROR: table "cmdata2" does not exist --- try setting compression for incompressible data type -CREATE TABLE cmdata2 (f1 int COMPRESSION pglz); -ERROR: column data type integer does not support compression --- update using datum from different table -CREATE TABLE cmmove2(f1 text COMPRESSION pglz); -INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - pglz -(1 row) - -UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; - ^ -SELECT pg_column_compression(f1) FROM cmmove2; - pg_column_compression ------------------------ - pglz -(1 row) - --- test externally stored compressed data -CREATE OR REPLACE FUNCTION large_val() RETURNS TEXT LANGUAGE SQL AS -'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; -CREATE TABLE cmdata2 (f1 text COMPRESSION pglz); -INSERT INTO cmdata2 SELECT large_val() || repeat('a', 4000); -SELECT pg_column_compression(f1) FROM cmdata2; - pg_column_compression ------------------------ - pglz -(1 row) - -INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); -ERROR: relation "cmdata1" does not exist -LINE 1: INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); - ^ -SELECT pg_column_compression(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmdata1; - ^ -SELECT SUBSTR(f1, 200, 5) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT SUBSTR(f1, 200, 5) FROM cmdata1; - ^ -SELECT SUBSTR(f1, 200, 5) FROM cmdata2; - substr --------- - 79026 -(1 row) - -DROP TABLE cmdata2; ---test 
column type update varlena/non-varlena -CREATE TABLE cmdata2 (f1 int); -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+---------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | integer | | | | plain | | | - -ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | character varying | | | | extended | | | - -ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE int USING f1::integer; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+---------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | integer | | | | plain | | | - ---changing column storage should not impact the compression method ---but the data should not be compressed -ALTER TABLE cmdata2 ALTER COLUMN f1 TYPE varchar; -ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION pglz; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | character varying | | | | extended | pglz | | - -ALTER TABLE cmdata2 ALTER COLUMN f1 SET STORAGE plain; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | character varying | | | | plain | pglz | | - -INSERT INTO cmdata2 VALUES (repeat('123456789', 800)); -SELECT pg_column_compression(f1) FROM cmdata2; - pg_column_compression ------------------------ - -(1 row) - --- test compression with materialized view -CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: ...TE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; - ^ -\d+ compressmv -SELECT pg_column_compression(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmdata1; - ^ -SELECT pg_column_compression(x) FROM compressmv; -ERROR: relation "compressmv" does not exist -LINE 1: SELECT pg_column_compression(x) FROM compressmv; - ^ --- test compression with partition -CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. 
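Every lz4-dependent statement in this deleted alternative file fails with the same "not supported" error, which is why the file had to mirror the entire test. The replacement test instead probes for lz4 support at run time and skips itself; this is the \gset/\if pattern that appears verbatim in compression_lz4.out later in this diff:

SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE
  name = 'default_toast_compression' \gset
\if :skip_test
  \echo '*** skipping TOAST tests with lz4 (not supported) ***'
  \quit
\endif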
-CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); -ERROR: relation "cmpart" does not exist -CREATE TABLE cmpart2(f1 text COMPRESSION pglz); -ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); -ERROR: relation "cmpart" does not exist -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 1004)); - ^ -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 4004)); - ^ -SELECT pg_column_compression(f1) FROM cmpart1; -ERROR: relation "cmpart1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmpart1; - ^ -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression ------------------------ -(0 rows) - --- test compression with inheritance -CREATE TABLE cminh() INHERITS(cmdata, cmdata1); -- error -ERROR: relation "cmdata1" does not exist -CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata); -- error -NOTICE: merging column "f1" with inherited definition -ERROR: column "f1" has a compression method conflict -DETAIL: pglz versus lz4 -CREATE TABLE cmdata3(f1 text); -CREATE TABLE cminh() INHERITS (cmdata, cmdata3); -NOTICE: merging multiple inherited definitions of column "f1" --- test default_toast_compression GUC -SET default_toast_compression = ''; -ERROR: invalid value for parameter "default_toast_compression": "" -HINT: Available values: pglz. -SET default_toast_compression = 'I do not exist compression'; -ERROR: invalid value for parameter "default_toast_compression": "I do not exist compression" -HINT: Available values: pglz. -SET default_toast_compression = 'lz4'; -ERROR: invalid value for parameter "default_toast_compression": "lz4" -HINT: Available values: pglz. -SET default_toast_compression = 'pglz'; --- test alter compression method -ALTER TABLE cmdata ALTER COLUMN f1 SET COMPRESSION lz4; -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. -INSERT INTO cmdata VALUES (repeat('123456789', 4004)); -\d+ cmdata - Table "public.cmdata" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+------+-----------+----------+---------+----------+-------------+--------------+------------- - f1 | text | | | | extended | pglz | | -Indexes: - "idx" btree (f1) -Child tables: cminh - -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - pglz -(2 rows) - -ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION default; -\d+ cmdata2 - Table "public.cmdata2" - Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description ---------+-------------------+-----------+----------+---------+---------+-------------+--------------+------------- - f1 | character varying | | | | plain | | | - --- test alter compression method for materialized views -ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; -ERROR: relation "compressmv" does not exist -\d+ compressmv --- test alter compression method for partitioned tables -ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; -ERROR: relation "cmpart1" does not exist -ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. 
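The default_toast_compression failures above reflect how a method is chosen when a value is stored: an explicit per-column COMPRESSION clause wins, and columns without one fall back to the GUC at the time the value is toasted. A small sketch, assuming an lz4-enabled build and the illustrative table name cm_pick:

SET default_toast_compression = 'pglz';
CREATE TABLE cm_pick (explicit_col text COMPRESSION lz4, implicit_col text);
INSERT INTO cm_pick VALUES (repeat('x', 10000), repeat('x', 10000));
SELECT pg_column_compression(explicit_col) AS explicit, -- lz4: the column setting wins
       pg_column_compression(implicit_col) AS implicit  -- pglz: falls back to the GUC
  FROM cm_pick;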
--- new data should be compressed with the current compression method -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 1004)); - ^ -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -ERROR: relation "cmpart" does not exist -LINE 1: INSERT INTO cmpart VALUES (repeat('123456789', 4004)); - ^ -SELECT pg_column_compression(f1) FROM cmpart1; -ERROR: relation "cmpart1" does not exist -LINE 1: SELECT pg_column_compression(f1) FROM cmpart1; - ^ -SELECT pg_column_compression(f1) FROM cmpart2; - pg_column_compression ------------------------ -(0 rows) - --- VACUUM FULL does not recompress -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - pglz -(2 rows) - -VACUUM FULL cmdata; -SELECT pg_column_compression(f1) FROM cmdata; - pg_column_compression ------------------------ - pglz - pglz -(2 rows) - --- test expression index -DROP TABLE cmdata2; -CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); -ERROR: compression method lz4 not supported -DETAIL: This functionality requires the server to be built with lz4 support. -CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); -ERROR: relation "cmdata2" does not exist -INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM -generate_series(1, 50) g), VERSION()); -ERROR: relation "cmdata2" does not exist -LINE 1: INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEX... - ^ --- check data is ok -SELECT length(f1) FROM cmdata; - length --------- - 10000 - 36036 -(2 rows) - -SELECT length(f1) FROM cmdata1; -ERROR: relation "cmdata1" does not exist -LINE 1: SELECT length(f1) FROM cmdata1; - ^ -SELECT length(f1) FROM cmmove1; - length --------- - 10000 -(1 row) - -SELECT length(f1) FROM cmmove2; - length --------- - 10040 -(1 row) - -SELECT length(f1) FROM cmmove3; - length --------- - 10000 -(1 row) - -CREATE TABLE badcompresstbl (a text COMPRESSION I_Do_Not_Exist_Compression); -- fails -ERROR: invalid compression method "i_do_not_exist_compression" -CREATE TABLE badcompresstbl (a text); -ALTER TABLE badcompresstbl ALTER a SET COMPRESSION I_Do_Not_Exist_Compression; -- fails -ERROR: invalid compression method "i_do_not_exist_compression" -DROP TABLE badcompresstbl; -\set HIDE_TOAST_COMPRESSION true diff --git a/src/test/regress/expected/compression_lz4.out b/src/test/regress/expected/compression_lz4.out new file mode 100644 index 00000000000..068dd7c3674 --- /dev/null +++ b/src/test/regress/expected/compression_lz4.out @@ -0,0 +1,249 @@ +-- Tests for TOAST compression with lz4 +SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE + name = 'default_toast_compression' \gset +\if :skip_test + \echo '*** skipping TOAST tests with lz4 (not supported) ***' + \quit +\endif +CREATE SCHEMA lz4; +SET search_path TO lz4, public; +\set HIDE_TOAST_COMPRESSION false +-- Ensure we get stable results regardless of the installation's default. +-- We rely on this GUC value for a few tests. 
+SET default_toast_compression = 'pglz'; +-- test creating table with compression method +CREATE TABLE cmdata_pglz(f1 text COMPRESSION pglz); +CREATE INDEX idx ON cmdata_pglz(f1); +INSERT INTO cmdata_pglz VALUES(repeat('1234567890', 1000)); +\d+ cmdata +CREATE TABLE cmdata_lz4(f1 TEXT COMPRESSION lz4); +INSERT INTO cmdata_lz4 VALUES(repeat('1234567890', 1004)); +\d+ cmdata1 +-- verify stored compression method in the data +SELECT pg_column_compression(f1) FROM cmdata_lz4; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- decompress data slice +SELECT SUBSTR(f1, 200, 5) FROM cmdata_pglz; + substr +-------- + 01234 +(1 row) + +SELECT SUBSTR(f1, 2000, 50) FROM cmdata_lz4; + substr +---------------------------------------------------- + 01234567890123456789012345678901234567890123456789 +(1 row) + +-- copy with table creation +SELECT * INTO cmmove1 FROM cmdata_lz4; +\d+ cmmove1 + Table "lz4.cmmove1" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | | | + +SELECT pg_column_compression(f1) FROM cmmove1; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- test LIKE INCLUDING COMPRESSION. The GUC default_toast_compression +-- has no effect; the compression method comes from the table being copied. +CREATE TABLE cmdata2 (LIKE cmdata_lz4 INCLUDING COMPRESSION); +\d+ cmdata2 + Table "lz4.cmdata2" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + f1 | text | | | | extended | lz4 | | + +DROP TABLE cmdata2; +-- copy to existing table +CREATE TABLE cmmove3(f1 text COMPRESSION pglz); +INSERT INTO cmmove3 SELECT * FROM cmdata_pglz; +INSERT INTO cmmove3 SELECT * FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove3; + pg_column_compression +----------------------- + pglz + lz4 +(2 rows) + +-- update using datum from different table with LZ4 data. 
+CREATE TABLE cmmove2(f1 text COMPRESSION pglz); +INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); +SELECT pg_column_compression(f1) FROM cmmove2; + pg_column_compression +----------------------- + pglz +(1 row) + +UPDATE cmmove2 SET f1 = cmdata_lz4.f1 FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove2; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- test externally stored compressed data +CREATE OR REPLACE FUNCTION large_val_lz4() RETURNS TEXT LANGUAGE SQL AS +'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; +CREATE TABLE cmdata2 (f1 text COMPRESSION lz4); +INSERT INTO cmdata2 SELECT large_val_lz4() || repeat('a', 4000); +SELECT pg_column_compression(f1) FROM cmdata2; + pg_column_compression +----------------------- + lz4 +(1 row) + +SELECT SUBSTR(f1, 200, 5) FROM cmdata2; + substr +-------- + 79026 +(1 row) + +DROP TABLE cmdata2; +DROP FUNCTION large_val_lz4; +-- test compression with materialized view +CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata_lz4; +\d+ compressmv + Materialized view "lz4.compressmv" + Column | Type | Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + x | text | | | | extended | | | +View definition: + SELECT f1 AS x + FROM cmdata_lz4; + +SELECT pg_column_compression(f1) FROM cmdata_lz4; + pg_column_compression +----------------------- + lz4 +(1 row) + +SELECT pg_column_compression(x) FROM compressmv; + pg_column_compression +----------------------- + lz4 +(1 row) + +-- test compression with partition +CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); +CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); +CREATE TABLE cmpart2(f1 text COMPRESSION pglz); +ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; + pg_column_compression +----------------------- + lz4 +(1 row) + +SELECT pg_column_compression(f1) FROM cmpart2; + pg_column_compression +----------------------- + pglz +(1 row) + +-- test compression with inheritance +CREATE TABLE cminh() INHERITS(cmdata_pglz, cmdata_lz4); -- error +NOTICE: merging multiple inherited definitions of column "f1" +ERROR: column "f1" has a compression method conflict +DETAIL: pglz versus lz4 +CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata_pglz); -- error +NOTICE: merging column "f1" with inherited definition +ERROR: column "f1" has a compression method conflict +DETAIL: pglz versus lz4 +CREATE TABLE cmdata3(f1 text); +CREATE TABLE cminh() INHERITS (cmdata_pglz, cmdata3); +NOTICE: merging multiple inherited definitions of column "f1" +-- test default_toast_compression GUC +SET default_toast_compression = 'lz4'; +-- test alter compression method +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION lz4; +INSERT INTO cmdata_pglz VALUES (repeat('123456789', 4004)); +\d+ cmdata +SELECT pg_column_compression(f1) FROM cmdata_pglz; + pg_column_compression +----------------------- + pglz + lz4 +(2 rows) + +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION pglz; +-- test alter compression method for materialized views +ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; +\d+ compressmv + Materialized view "lz4.compressmv" + Column | Type | 
Collation | Nullable | Default | Storage | Compression | Stats target | Description +--------+------+-----------+----------+---------+----------+-------------+--------------+------------- + x | text | | | | extended | lz4 | | +View definition: + SELECT f1 AS x + FROM cmdata_lz4; + +-- test alter compression method for partitioned tables +ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; +ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; +-- new data should be compressed with the current compression method +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; + pg_column_compression +----------------------- + lz4 + pglz +(2 rows) + +SELECT pg_column_compression(f1) FROM cmpart2; + pg_column_compression +----------------------- + pglz + lz4 +(2 rows) + +-- test expression index +CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); +CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); +INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM +generate_series(1, 50) g), VERSION()); +-- check data is ok +SELECT length(f1) FROM cmdata_pglz; + length +-------- + 10000 + 36036 +(2 rows) + +SELECT length(f1) FROM cmdata_lz4; + length +-------- + 10040 +(1 row) + +SELECT length(f1) FROM cmmove1; + length +-------- + 10040 +(1 row) + +SELECT length(f1) FROM cmmove2; + length +-------- + 10040 +(1 row) + +SELECT length(f1) FROM cmmove3; + length +-------- + 10000 + 10040 +(2 rows) + +\set HIDE_TOAST_COMPRESSION true diff --git a/src/test/regress/expected/compression_lz4_1.out b/src/test/regress/expected/compression_lz4_1.out new file mode 100644 index 00000000000..198056fa224 --- /dev/null +++ b/src/test/regress/expected/compression_lz4_1.out @@ -0,0 +1,7 @@ +-- Tests for TOAST compression with lz4 +SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE + name = 'default_toast_compression' \gset +\if :skip_test + \echo '*** skipping TOAST tests with lz4 (not supported) ***' +*** skipping TOAST tests with lz4 (not supported) *** + \quit diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index ad6aaab7385..3590d3274f0 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -748,6 +748,11 @@ ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key ENFORCED; ERROR: cannot alter enforceability of constraint "unique_tbl_i_key" of relation "unique_tbl" ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT ENFORCED; ERROR: cannot alter enforceability of constraint "unique_tbl_i_key" of relation "unique_tbl" +-- can't make an existing constraint NOT VALID +ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT VALID; +ERROR: constraints cannot be altered to be NOT VALID +LINE 1: ...ABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT VALID; + ^ DROP TABLE unique_tbl; -- -- EXCLUDE constraints @@ -1659,6 +1664,8 @@ EXECUTE get_nnconstraint_info('{constr_parent3, constr_child3}'); constr_parent3 | constr_parent3_a_not_null | t | t | 0 (2 rows) +COMMENT ON CONSTRAINT constr_parent2_a_not_null ON constr_parent2 IS 'this constraint is invalid'; +COMMENT ON CONSTRAINT constr_parent2_a_not_null ON constr_child2 IS 'this constraint is valid'; DEALLOCATE get_nnconstraint_info; -- end NOT NULL NOT VALID -- Comments @@ -1694,3 +1701,7 @@ DROP TABLE constraint_comments_tbl; DROP DOMAIN constraint_comments_dom; DROP ROLE 
regress_constraint_comments; DROP ROLE regress_constraint_comments_noaccess; +-- Leave some constraints for the pg_upgrade test to pick up +CREATE DOMAIN constraint_comments_dom AS int; +ALTER DOMAIN constraint_comments_dom ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID; +COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom IS 'comment on invalid constraint'; diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out index 8d5a06563c4..ac66eb55aee 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -81,6 +81,29 @@ copy copytest4 to stdout (header); c1 colname with tab: \t 1 a 2 b +-- test multi-line header line feature +create temp table copytest5 (c1 int); +copy copytest5 from stdin (format csv, header 2); +copy copytest5 to stdout (header); +c1 +1 +2 +truncate copytest5; +copy copytest5 from stdin (format csv, header 4); +select count(*) from copytest5; + count +------- + 0 +(1 row) + +truncate copytest5; +copy copytest5 from stdin (format csv, header 5); +select count(*) from copytest5; + count +------- + 0 +(1 row) + -- test copy from with a partitioned table create table parted_copytest ( a int, @@ -224,7 +247,7 @@ alter table header_copytest add column c text; copy header_copytest to stdout with (header match); ERROR: cannot use "match" with HEADER in COPY TO copy header_copytest from stdin with (header wrong_choice); -ERROR: header requires a Boolean value or "match" +ERROR: header requires a Boolean value, a non-negative integer, or the string "match" -- works copy header_copytest from stdin with (header match); copy header_copytest (c, a, b) from stdin with (header match); diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out index 64ea33aeae8..caa3c44f0d0 100644 --- a/src/test/regress/expected/copy2.out +++ b/src/test/regress/expected/copy2.out @@ -132,6 +132,12 @@ COPY x from stdin with (reject_limit 1); ERROR: COPY REJECT_LIMIT requires ON_ERROR to be set to IGNORE COPY x from stdin with (on_error ignore, reject_limit 0); ERROR: REJECT_LIMIT (0) must be greater than zero +COPY x from stdin with (header -1); +ERROR: a negative integer value cannot be specified for header +COPY x from stdin with (header 2.5); +ERROR: header requires a Boolean value, a non-negative integer, or the string "match" +COPY x to stdout with (header 2); +ERROR: cannot use multi-line header in COPY TO -- too many columns in column list: should fail COPY x (a, b, c, d, e, d, c) from stdin; ERROR: column "d" specified more than once diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 9ade7b835e6..98e68e972be 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1624,8 +1624,8 @@ DROP TABLE cwi_test; -- CREATE TABLE syscol_table (a INT); -- System columns cannot be indexed -CREATE INDEX ON syscolcol_table (ctid); -ERROR: relation "syscolcol_table" does not exist +CREATE INDEX ON syscol_table (ctid); +ERROR: index creation on system columns is not supported -- nor used in expressions CREATE INDEX ON syscol_table ((ctid >= '(1000,0)')); ERROR: index creation on system columns is not supported diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out index bf34289e984..29a779c2e90 100644 --- a/src/test/regress/expected/create_table_like.out +++ b/src/test/regress/expected/create_table_like.out @@ -332,9 +332,10 @@ COMMENT ON CONSTRAINT 
ctlt1_a_check ON ctlt1 IS 't1_a_check'; COMMENT ON INDEX ctlt1_pkey IS 'index pkey'; COMMENT ON INDEX ctlt1_b_key IS 'index b_key'; ALTER TABLE ctlt1 ALTER COLUMN a SET STORAGE MAIN; -CREATE TABLE ctlt2 (c text); +CREATE TABLE ctlt2 (c text NOT NULL); ALTER TABLE ctlt2 ALTER COLUMN c SET STORAGE EXTERNAL; COMMENT ON COLUMN ctlt2.c IS 'C'; +COMMENT ON CONSTRAINT ctlt2_c_not_null ON ctlt2 IS 't2_c_not_null'; CREATE TABLE ctlt3 (a text CHECK (length(a) < 5), c text CHECK (length(c) < 7)); ALTER TABLE ctlt3 ALTER COLUMN c SET STORAGE EXTERNAL; ALTER TABLE ctlt3 ALTER COLUMN a SET STORAGE MAIN; @@ -351,9 +352,10 @@ CREATE TABLE ctlt12_storage (LIKE ctlt1 INCLUDING STORAGE, LIKE ctlt2 INCLUDING --------+------+-----------+----------+---------+----------+--------------+------------- a | text | | not null | | main | | b | text | | | | extended | | - c | text | | | | external | | + c | text | | not null | | external | | Not-null constraints: "ctlt1_a_not_null" NOT NULL "a" + "ctlt2_c_not_null" NOT NULL "c" CREATE TABLE ctlt12_comments (LIKE ctlt1 INCLUDING COMMENTS, LIKE ctlt2 INCLUDING COMMENTS); \d+ ctlt12_comments @@ -362,9 +364,16 @@ CREATE TABLE ctlt12_comments (LIKE ctlt1 INCLUDING COMMENTS, LIKE ctlt2 INCLUDIN --------+------+-----------+----------+---------+----------+--------------+------------- a | text | | not null | | extended | | A b | text | | | | extended | | B - c | text | | | | extended | | C + c | text | | not null | | extended | | C Not-null constraints: "ctlt1_a_not_null" NOT NULL "a" + "ctlt2_c_not_null" NOT NULL "c" + +SELECT conname, description FROM pg_description, pg_constraint c WHERE classoid = 'pg_constraint'::regclass AND objoid = c.oid AND c.conrelid = 'ctlt12_comments'::regclass; + conname | description +------------------+--------------- + ctlt2_c_not_null | t2_c_not_null +(1 row) CREATE TABLE ctlt1_inh (LIKE ctlt1 INCLUDING CONSTRAINTS INCLUDING COMMENTS) INHERITS (ctlt1); NOTICE: merging column "a" with inherited definition @@ -529,7 +538,9 @@ NOTICE: drop cascades to table inhe -- LIKE must respect NO INHERIT property of constraints CREATE TABLE noinh_con_copy (a int CHECK (a > 0) NO INHERIT, b int not null, c int not null no inherit); -CREATE TABLE noinh_con_copy1 (LIKE noinh_con_copy INCLUDING CONSTRAINTS); +COMMENT ON CONSTRAINT noinh_con_copy_b_not_null ON noinh_con_copy IS 'not null b'; +COMMENT ON CONSTRAINT noinh_con_copy_c_not_null ON noinh_con_copy IS 'not null c no inherit'; +CREATE TABLE noinh_con_copy1 (LIKE noinh_con_copy INCLUDING CONSTRAINTS INCLUDING COMMENTS); \d+ noinh_con_copy1 Table "public.noinh_con_copy1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -543,6 +554,17 @@ Not-null constraints: "noinh_con_copy_b_not_null" NOT NULL "b" "noinh_con_copy_c_not_null" NOT NULL "c" NO INHERIT +SELECT conname, description +FROM pg_description, pg_constraint c +WHERE classoid = 'pg_constraint'::regclass +AND objoid = c.oid AND c.conrelid = 'noinh_con_copy1'::regclass +ORDER BY conname COLLATE "C"; + conname | description +---------------------------+----------------------- + noinh_con_copy_b_not_null | not null b + noinh_con_copy_c_not_null | not null c no inherit +(2 rows) + -- fail, as partitioned tables don't allow NO INHERIT constraints CREATE TABLE noinh_con_copy1_parted (LIKE noinh_con_copy INCLUDING ALL) PARTITION BY LIST (a); diff --git a/src/test/regress/expected/domain.out b/src/test/regress/expected/domain.out index ba6f05eeb7d..b5ea707df31 100644 --- a/src/test/regress/expected/domain.out +++ 
b/src/test/regress/expected/domain.out @@ -1019,6 +1019,11 @@ insert into domain_test values (1, 2); -- should fail alter table domain_test add column c str_domain; ERROR: domain str_domain does not allow null values +-- disallow duplicated not-null constraints +create domain int_domain1 as int constraint nn1 not null constraint nn2 not null; +ERROR: redundant NOT NULL constraint definition +LINE 1: ...domain int_domain1 as int constraint nn1 not null constraint... + ^ create domain str_domain2 as text check (value <> 'foo') default 'foo'; -- should fail alter table domain_test add column d str_domain2; diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 4f3f280a439..dc541d61adf 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1359,7 +1359,7 @@ LINE 1: ...e ALTER CONSTRAINT fktable_fk_fkey NOT DEFERRABLE INITIALLY ... ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NO INHERIT; ERROR: constraint "fktable_fk_fkey" of relation "fktable" is not a not-null constraint ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NOT VALID; -ERROR: FOREIGN KEY constraints cannot be marked NOT VALID +ERROR: constraints cannot be altered to be NOT VALID LINE 1: ...ER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NOT VALID; ^ ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey ENFORCED NOT ENFORCED; @@ -1750,7 +1750,7 @@ Indexes: Referenced by: TABLE "fk_partitioned_fk" CONSTRAINT "fk_partitioned_fk_a_b_fkey" FOREIGN KEY (a, b) REFERENCES fk_notpartitioned_pk(a, b) --- Check the exsting FK trigger +-- Check the existing FK trigger SELECT conname, tgrelid::regclass as tgrel, regexp_replace(tgname, '[0-9]+', 'N') as tgname, tgtype FROM pg_trigger t JOIN pg_constraint c ON (t.tgconstraint = c.oid) WHERE tgrelid IN (SELECT relid FROM pg_partition_tree('fk_partitioned_fk'::regclass) @@ -1895,29 +1895,76 @@ WHERE conrelid::regclass::text like 'fk_partitioned_fk%' ORDER BY oid::regclass: (5 rows) DROP TABLE fk_partitioned_fk, fk_notpartitioned_pk; --- NOT VALID foreign key on a non-partitioned table referencing a partitioned table +-- NOT VALID and NOT ENFORCED foreign key on a non-partitioned table +-- referencing a partitioned table CREATE TABLE fk_partitioned_pk (a int, b int, PRIMARY KEY (a, b)) PARTITION BY RANGE (a, b); CREATE TABLE fk_partitioned_pk_1 PARTITION OF fk_partitioned_pk FOR VALUES FROM (0,0) TO (1000,1000); +CREATE TABLE fk_partitioned_pk_2 PARTITION OF fk_partitioned_pk FOR VALUES FROM (1000,1000) TO (2000,2000); CREATE TABLE fk_notpartitioned_fk (b int, a int); -ALTER TABLE fk_notpartitioned_fk ADD FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT VALID; --- Constraint will be invalid. -SELECT conname, convalidated FROM pg_constraint +INSERT INTO fk_partitioned_pk VALUES(100,100), (1000,1000); +INSERT INTO fk_notpartitioned_fk VALUES(100,100), (1000,1000); +ALTER TABLE fk_notpartitioned_fk ADD CONSTRAINT fk_notpartitioned_fk_a_b_fkey + FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT VALID; +ALTER TABLE fk_notpartitioned_fk ADD CONSTRAINT fk_notpartitioned_fk_a_b_fkey2 + FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT ENFORCED; +-- All constraints will be invalid, and _fkey2 constraints will not be enforced. 
+SELECT conname, conenforced, convalidated FROM pg_constraint WHERE conrelid = 'fk_notpartitioned_fk'::regclass ORDER BY oid::regclass::text; - conname | convalidated ----------------------------------+-------------- - fk_notpartitioned_fk_a_b_fkey | f - fk_notpartitioned_fk_a_b_fkey_1 | f -(2 rows) + conname | conenforced | convalidated +----------------------------------+-------------+-------------- + fk_notpartitioned_fk_a_b_fkey | t | f + fk_notpartitioned_fk_a_b_fkey_1 | t | f + fk_notpartitioned_fk_a_b_fkey_2 | t | f + fk_notpartitioned_fk_a_b_fkey2 | f | f + fk_notpartitioned_fk_a_b_fkey2_1 | f | f + fk_notpartitioned_fk_a_b_fkey2_2 | f | f +(6 rows) ALTER TABLE fk_notpartitioned_fk VALIDATE CONSTRAINT fk_notpartitioned_fk_a_b_fkey; --- All constraints are now valid. -SELECT conname, convalidated FROM pg_constraint +ALTER TABLE fk_notpartitioned_fk ALTER CONSTRAINT fk_notpartitioned_fk_a_b_fkey2 ENFORCED; +-- All constraints are now valid and enforced. +SELECT conname, conenforced, convalidated FROM pg_constraint WHERE conrelid = 'fk_notpartitioned_fk'::regclass ORDER BY oid::regclass::text; - conname | convalidated ----------------------------------+-------------- - fk_notpartitioned_fk_a_b_fkey | t - fk_notpartitioned_fk_a_b_fkey_1 | t -(2 rows) + conname | conenforced | convalidated +----------------------------------+-------------+-------------- + fk_notpartitioned_fk_a_b_fkey | t | t + fk_notpartitioned_fk_a_b_fkey_1 | t | t + fk_notpartitioned_fk_a_b_fkey_2 | t | t + fk_notpartitioned_fk_a_b_fkey2 | t | t + fk_notpartitioned_fk_a_b_fkey2_1 | t | t + fk_notpartitioned_fk_a_b_fkey2_2 | t | t +(6 rows) + +-- test a self-referential FK +ALTER TABLE fk_partitioned_pk ADD CONSTRAINT selffk FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT VALID; +CREATE TABLE fk_partitioned_pk_3 PARTITION OF fk_partitioned_pk FOR VALUES FROM (2000,2000) TO (3000,3000) + PARTITION BY RANGE (a); +CREATE TABLE fk_partitioned_pk_3_1 PARTITION OF fk_partitioned_pk_3 FOR VALUES FROM (2000) TO (2100); +SELECT conname, conenforced, convalidated FROM pg_constraint +WHERE conrelid = 'fk_partitioned_pk'::regclass AND contype = 'f' +ORDER BY oid::regclass::text; + conname | conenforced | convalidated +------------+-------------+-------------- + selffk | t | f + selffk_1 | t | f + selffk_2 | t | f + selffk_3 | t | f + selffk_3_1 | t | f +(5 rows) + +ALTER TABLE fk_partitioned_pk_2 VALIDATE CONSTRAINT selffk; +ALTER TABLE fk_partitioned_pk VALIDATE CONSTRAINT selffk; +SELECT conname, conenforced, convalidated FROM pg_constraint +WHERE conrelid = 'fk_partitioned_pk'::regclass AND contype = 'f' +ORDER BY oid::regclass::text; + conname | conenforced | convalidated +------------+-------------+-------------- + selffk | t | t + selffk_1 | t | t + selffk_2 | t | t + selffk_3 | t | t + selffk_3_1 | t | t +(5 rows) DROP TABLE fk_notpartitioned_fk, fk_partitioned_pk; -- Test some other exotic foreign key features: MATCH SIMPLE, ON UPDATE/DELETE diff --git a/src/test/regress/expected/generated_stored.out b/src/test/regress/expected/generated_stored.out index 16de30ab191..adac2cedfb2 100644 --- a/src/test/regress/expected/generated_stored.out +++ b/src/test/regress/expected/generated_stored.out @@ -1313,6 +1313,18 @@ CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') STORED, c te CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +-- bug #18970: these cases are 
unsupported, but make sure they fail cleanly +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +DROP STATISTICS gtest31_2_stat; +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); +ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') STORED, c text) PARTITION BY LIST (a); diff --git a/src/test/regress/expected/generated_virtual.out b/src/test/regress/expected/generated_virtual.out index 6300e7c1d96..aca6347babe 100644 --- a/src/test/regress/expected/generated_virtual.out +++ b/src/test/regress/expected/generated_virtual.out @@ -553,15 +553,11 @@ CREATE TABLE gtest4 ( a int, b double_int GENERATED ALWAYS AS ((a * 2, a * 3)) VIRTUAL ); -INSERT INTO gtest4 VALUES (1), (6); -SELECT * FROM gtest4; - a | b ----+--------- - 1 | (2,3) - 6 | (12,18) -(2 rows) - -DROP TABLE gtest4; +ERROR: virtual generated column "b" cannot have a user-defined type +DETAIL: Virtual generated columns that make use of user-defined types are not yet supported. +--INSERT INTO gtest4 VALUES (1), (6); +--SELECT * FROM gtest4; +--DROP TABLE gtest4; DROP TYPE double_int; -- using tableoid is allowed CREATE TABLE gtest_tableoid ( @@ -604,9 +600,13 @@ INSERT INTO gtest11 VALUES (1, 10), (2, 20); GRANT SELECT (a, c) ON gtest11 TO regress_user11; CREATE FUNCTION gf1(a int) RETURNS int AS $$ SELECT a * 3 $$ IMMUTABLE LANGUAGE SQL; REVOKE ALL ON FUNCTION gf1(int) FROM PUBLIC; -CREATE TABLE gtest12 (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (gf1(b)) VIRTUAL); -INSERT INTO gtest12 VALUES (1, 10), (2, 20); -GRANT SELECT (a, c), INSERT ON gtest12 TO regress_user11; +CREATE TABLE gtest12 (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (gf1(b)) VIRTUAL); -- fails, user-defined function +ERROR: generation expression uses user-defined function +LINE 1: ...nt PRIMARY KEY, b int, c int GENERATED ALWAYS AS (gf1(b)) VI... + ^ +DETAIL: Virtual generated columns that make use of user-defined functions are not yet supported. 
+--INSERT INTO gtest12 VALUES (1, 10), (2, 20); +--GRANT SELECT (a, c), INSERT ON gtest12 TO regress_user11; SET ROLE regress_user11; SELECT a, b FROM gtest11; -- not allowed ERROR: permission denied for table gtest11 @@ -619,15 +619,12 @@ SELECT a, c FROM gtest11; -- allowed SELECT gf1(10); -- not allowed ERROR: permission denied for function gf1 -INSERT INTO gtest12 VALUES (3, 30), (4, 40); -- allowed (does not actually invoke the function) -SELECT a, c FROM gtest12; -- currently not allowed because of function permissions, should arguably be allowed -ERROR: permission denied for function gf1 +--INSERT INTO gtest12 VALUES (3, 30), (4, 40); -- allowed (does not actually invoke the function) +--SELECT a, c FROM gtest12; -- currently not allowed because of function permissions, should arguably be allowed RESET ROLE; -DROP FUNCTION gf1(int); -- fail -ERROR: cannot drop function gf1(integer) because other objects depend on it -DETAIL: column c of table gtest12 depends on function gf1(integer) -HINT: Use DROP ... CASCADE to drop the dependent objects too. -DROP TABLE gtest11, gtest12; +--DROP FUNCTION gf1(int); -- fail +DROP TABLE gtest11; +--DROP TABLE gtest12; DROP FUNCTION gf1(int); DROP USER regress_user11; -- check constraints @@ -637,10 +634,10 @@ INSERT INTO gtest20 (a) VALUES (30); -- violates constraint ERROR: new row for relation "gtest20" violates check constraint "gtest20_b_check" DETAIL: Failing row contains (30, virtual). ALTER TABLE gtest20 ALTER COLUMN b SET EXPRESSION AS (a * 100); -- violates constraint (currently not supported) -ERROR: ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables with check constraints +ERROR: ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables with check constraints DETAIL: Column "b" of relation "gtest20" is a virtual generated column. ALTER TABLE gtest20 ALTER COLUMN b SET EXPRESSION AS (a * 3); -- ok (currently not supported) -ERROR: ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables with check constraints +ERROR: ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables with check constraints DETAIL: Column "b" of relation "gtest20" is a virtual generated column. 
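The rule these expected-output changes encode: a virtual generated column's expression must currently use only built-in types and functions. A minimal sketch, with hypothetical table and function names:

-- accepted: built-in type and operator only
CREATE TABLE vgc_demo (a int, b int GENERATED ALWAYS AS (a * 2) VIRTUAL);
-- rejected with "generation expression uses user-defined function";
-- my_func stands in for any user-defined function:
-- CREATE TABLE vgc_demo2 (a int, b int GENERATED ALWAYS AS (my_func(a)) VIRTUAL);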
CREATE TABLE gtest20a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) VIRTUAL); INSERT INTO gtest20a (a) VALUES (10); @@ -800,11 +797,23 @@ CREATE TABLE gtest24r (a int PRIMARY KEY, b gtestdomain1range GENERATED ALWAYS A ERROR: virtual generated column "b" cannot have a domain type --INSERT INTO gtest24r (a) VALUES (4); -- ok --INSERT INTO gtest24r (a) VALUES (6); -- error +CREATE TABLE gtest24at (a int PRIMARY KEY); +ALTER TABLE gtest24at ADD COLUMN b gtestdomain1 GENERATED ALWAYS AS (a * 2) VIRTUAL; -- error +ERROR: virtual generated column "b" cannot have a domain type +CREATE TABLE gtest24ata (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) VIRTUAL); +ALTER TABLE gtest24ata ALTER COLUMN b TYPE gtestdomain1; -- error +ERROR: virtual generated column "b" cannot have a domain type CREATE DOMAIN gtestdomainnn AS int CHECK (VALUE IS NOT NULL); CREATE TABLE gtest24nn (a int, b gtestdomainnn GENERATED ALWAYS AS (a * 2) VIRTUAL); ERROR: virtual generated column "b" cannot have a domain type --INSERT INTO gtest24nn (a) VALUES (4); -- ok --INSERT INTO gtest24nn (a) VALUES (NULL); -- error +-- using user-defined type not yet supported +CREATE TABLE gtest24xxx (a gtestdomain1, b gtestdomain1, c int GENERATED ALWAYS AS (greatest(a, b)) VIRTUAL); -- error +ERROR: generation expression uses user-defined type +LINE 1: ...main1, b gtestdomain1, c int GENERATED ALWAYS AS (greatest(a... + ^ +DETAIL: Virtual generated columns that make use of user-defined types are not yet supported. -- typed tables (currently not supported) CREATE TYPE gtest_type AS (f1 integer, f2 text, f3 bigint); CREATE TABLE gtest28 OF gtest_type (f1 WITH OPTIONS GENERATED ALWAYS AS (f2 *2) VIRTUAL); @@ -1274,6 +1283,15 @@ CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') VIRTUAL, c t CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +-- bug #18970 +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +DROP STATISTICS gtest31_2_stat; +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') VIRTUAL, c text) PARTITION BY LIST (a); @@ -1470,7 +1488,8 @@ create table gtest32 ( a int primary key, b int generated always as (a * 2), c int generated always as (10 + 10), - d int generated always as (coalesce(a, 100)) + d int generated always as (coalesce(a, 100)), + e int ); insert into gtest32 values (1), (2); analyze gtest32; @@ -1531,11 +1550,11 @@ where coalesce(t2.b, 1) = 2; explain (costs off) select t1.a from gtest32 t1 left join gtest32 t2 on t1.a = t2.a where coalesce(t2.b, 1) = 2 or t1.a is null; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +----------------------------------------- Hash Left Join Hash Cond: (t1.a = t2.a) - Filter: ((COALESCE((t2.a * 2), 1) = 2) OR (t1.a IS NULL)) + Filter: (COALESCE((t2.a * 2), 1) = 2) -> Seq Scan on gtest32 t1 -> Hash -> Seq Scan on gtest32 t2 @@ -1554,41 +1573,66 @@ select t2.* from gtest32 t1 left join gtest32 t2 on false; 
QUERY PLAN ------------------------------------------------------ Nested Loop Left Join - Output: a, (a * 2), (20), (COALESCE(a, 100)) + Output: a, (a * 2), (20), (COALESCE(a, 100)), e Join Filter: false -> Seq Scan on generated_virtual_tests.gtest32 t1 - Output: t1.a, t1.b, t1.c, t1.d + Output: t1.a, t1.b, t1.c, t1.d, t1.e -> Result - Output: a, 20, COALESCE(a, 100) + Output: a, e, 20, COALESCE(a, 100) One-Time Filter: false (8 rows) select t2.* from gtest32 t1 left join gtest32 t2 on false; - a | b | c | d ----+---+---+--- - | | | - | | | + a | b | c | d | e +---+---+---+---+--- + | | | | + | | | | (2 rows) explain (verbose, costs off) -select * from gtest32 t group by grouping sets (a, b, c, d) having c = 20; +select * from gtest32 t group by grouping sets (a, b, c, d, e) having c = 20; QUERY PLAN ----------------------------------------------------- HashAggregate - Output: a, ((a * 2)), (20), (COALESCE(a, 100)) + Output: a, ((a * 2)), (20), (COALESCE(a, 100)), e Hash Key: t.a Hash Key: (t.a * 2) Hash Key: 20 Hash Key: COALESCE(t.a, 100) + Hash Key: t.e Filter: ((20) = 20) -> Seq Scan on generated_virtual_tests.gtest32 t - Output: a, (a * 2), 20, COALESCE(a, 100) -(9 rows) + Output: a, (a * 2), 20, COALESCE(a, 100), e +(10 rows) + +select * from gtest32 t group by grouping sets (a, b, c, d, e) having c = 20; + a | b | c | d | e +---+---+----+---+--- + | | 20 | | +(1 row) + +-- Ensure that the virtual generated columns in ALTER COLUMN TYPE USING expression are expanded +alter table gtest32 alter column e type bigint using b; +-- Ensure that virtual generated column references within SubLinks that should +-- be transformed into joins can get expanded +explain (costs off) +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + QUERY PLAN +------------------------------------- + Nested Loop Semi Join + Join Filter: (t1.a > t2.a) + -> Seq Scan on gtest32 t1 + -> Materialize + -> Seq Scan on gtest32 t2 + Filter: ((a * 2) = 2) +(6 rows) -select * from gtest32 t group by grouping sets (a, b, c, d) having c = 20; - a | b | c | d ----+---+----+--- - | | 20 | +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + ?column? +---------- + 1 (1 row) drop table gtest32; diff --git a/src/test/regress/expected/horology.out b/src/test/regress/expected/horology.out index b90bfcd794f..5ae93d8e8a5 100644 --- a/src/test/regress/expected/horology.out +++ b/src/test/regress/expected/horology.out @@ -467,6 +467,15 @@ SELECT timestamp with time zone 'Y2001M12D27H04MM05S06.789-08'; ERROR: invalid input syntax for type timestamp with time zone: "Y2001M12D27H04MM05S06.789-08" LINE 1: SELECT timestamp with time zone 'Y2001M12D27H04MM05S06.789-0... ^ +-- More examples we used to accept and should not +SELECT timestamp with time zone 'J2452271 T X03456-08'; +ERROR: invalid input syntax for type timestamp with time zone: "J2452271 T X03456-08" +LINE 1: SELECT timestamp with time zone 'J2452271 T X03456-08'; + ^ +SELECT timestamp with time zone 'J2452271 T X03456.001e6-08'; +ERROR: invalid input syntax for type timestamp with time zone: "J2452271 T X03456.001e6-08" +LINE 1: SELECT timestamp with time zone 'J2452271 T X03456.001e6-08'... 
+ ^ -- conflicting fields should throw errors SELECT date '1995-08-06 epoch'; ERROR: invalid input syntax for type date: "1995-08-06 epoch" diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index b00219643b9..5a1dd9fc022 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1722,3 +1722,43 @@ order by t1.four, t1.two limit 1; -> Seq Scan on tenk1 t2 (12 rows) +-- +-- Test incremental sort for Append/MergeAppend +-- +create table prt_tbl (a int, b int) partition by range (a); +create table prt_tbl_1 partition of prt_tbl for values from (0) to (100); +create table prt_tbl_2 partition of prt_tbl for values from (100) to (200); +insert into prt_tbl select i%200, i from generate_series(1,1000)i; +create index on prt_tbl_1(a); +create index on prt_tbl_2(a, b); +analyze prt_tbl; +set enable_seqscan to off; +set enable_bitmapscan to off; +-- Ensure we get an incremental sort for the subpath of Append +explain (costs off) select * from prt_tbl order by a, b; + QUERY PLAN +------------------------------------------------------------ + Append + -> Incremental Sort + Sort Key: prt_tbl_1.a, prt_tbl_1.b + Presorted Key: prt_tbl_1.a + -> Index Scan using prt_tbl_1_a_idx on prt_tbl_1 + -> Index Only Scan using prt_tbl_2_a_b_idx on prt_tbl_2 +(6 rows) + +-- Ensure we get an incremental sort for the subpath of MergeAppend +explain (costs off) select * from prt_tbl_1 union all select * from prt_tbl_2 order by a, b; + QUERY PLAN +------------------------------------------------------------ + Merge Append + Sort Key: prt_tbl_1.a, prt_tbl_1.b + -> Incremental Sort + Sort Key: prt_tbl_1.a, prt_tbl_1.b + Presorted Key: prt_tbl_1.a + -> Index Scan using prt_tbl_1_a_idx on prt_tbl_1 + -> Index Only Scan using prt_tbl_2_a_b_idx on prt_tbl_2 +(7 rows) + +reset enable_bitmapscan; +reset enable_seqscan; +drop table prt_tbl; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index f9b0c415cfd..5b5055babdc 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -1898,10 +1898,11 @@ ORDER BY thousand, tenthous; Merge Append Sort Key: tenk1.thousand, tenk1.tenthous -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Sort + -> Incremental Sort Sort Key: tenk1_1.thousand, tenk1_1.thousand + Presorted Key: tenk1_1.thousand -> Index Only Scan using tenk1_thous_tenthous on tenk1 tenk1_1 -(6 rows) +(7 rows) explain (costs off) SELECT thousand, tenthous, thousand+tenthous AS x FROM tenk1 @@ -1982,10 +1983,11 @@ ORDER BY x, y; Merge Append Sort Key: a.thousand, a.tenthous -> Index Only Scan using tenk1_thous_tenthous on tenk1 a - -> Sort + -> Incremental Sort Sort Key: b.unique2, b.unique2 + Presorted Key: b.unique2 -> Index Only Scan using tenk1_unique2 on tenk1 b -(6 rows) +(7 rows) -- exercise rescan code path via a repeatedly-evaluated subquery explain (costs off) @@ -2281,7 +2283,7 @@ Inherits: pp1, create table cc3 (a2 int not null no inherit) inherits (cc1); NOTICE: moving and merging column "a2" with inherited definition DETAIL: User-specified column moved to the position of the inherited column. -ERROR: cannot define not-null constraint on column "a2" with NO INHERIT +ERROR: cannot define not-null constraint with NO INHERIT on column "a2" DETAIL: The column has an inherited not-null constraint. 
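The Sort-to-Incremental-Sort plan changes in inherit.out follow from the new Append/MergeAppend behavior exercised in the incremental_sort.out tests above: when a child path is already ordered on a prefix of the requested sort keys, only the remaining suffix needs sorting. A condensed sketch of the prt_tbl setup, with hypothetical names:

CREATE TABLE prt (a int, b int) PARTITION BY RANGE (a);
CREATE TABLE prt_1 PARTITION OF prt FOR VALUES FROM (0) TO (100);
CREATE TABLE prt_2 PARTITION OF prt FOR VALUES FROM (100) TO (200);
CREATE INDEX ON prt_1 (a);      -- ordered on (a) only
CREATE INDEX ON prt_2 (a, b);   -- fully ordered
-- prt_1's subpath may now show: Incremental Sort ... Presorted Key: a
EXPLAIN (COSTS OFF) SELECT * FROM prt ORDER BY a, b;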
-- change NO INHERIT status of inherited constraint: no dice, it's inherited alter table cc2 add not null a2 no inherit; @@ -2530,7 +2532,7 @@ ERROR: conflicting NO INHERIT declaration for not-null constraint on column "a" CREATE TABLE inh_nn1 (a int not null); CREATE TABLE inh_nn2 (a int not null no inherit) INHERITS (inh_nn1); NOTICE: merging column "a" with inherited definition -ERROR: cannot define not-null constraint on column "a" with NO INHERIT +ERROR: cannot define not-null constraint with NO INHERIT on column "a" DETAIL: The column has an inherited not-null constraint. CREATE TABLE inh_nn3 (a int not null, b int, not null a no inherit); ERROR: conflicting NO INHERIT declaration for not-null constraint on column "a" diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index f35a0b18c37..4d5d35d0727 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3639,8 +3639,8 @@ from nt3 as nt3 ) as ss2 on ss2.id = nt3.nt2_id where nt3.id = 1 and ss2.b3; - QUERY PLAN ------------------------------------------------ + QUERY PLAN +---------------------------------------------- Nested Loop -> Nested Loop -> Index Scan using nt3_pkey on nt3 @@ -3649,7 +3649,7 @@ where nt3.id = 1 and ss2.b3; Index Cond: (id = nt3.nt2_id) -> Index Only Scan using nt1_pkey on nt1 Index Cond: (id = nt2.nt1_id) - Filter: (nt2.b1 AND (id IS NOT NULL)) + Filter: (nt2.b1 AND true) (9 rows) select nt3.id @@ -3946,6 +3946,59 @@ where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; (1 row) -- variant that isn't quite a star-schema case +explain (verbose, costs off) +select ss1.d1 from + tenk1 as t1 + inner join tenk1 as t2 + on t1.tenthous = t2.ten + inner join + int8_tbl as i8 + left join int4_tbl as i4 + inner join (select 64::information_schema.cardinal_number as d1 + from tenk1 t3, + lateral (select abs(t3.unique1) + random()) ss0(x) + where t3.fivethous < 0) as ss1 + on i4.f1 = ss1.d1 + on i8.q1 = i4.f1 + on t1.tenthous = ss1.d1 +where t1.unique1 < i4.f1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Nested Loop + Output: (64)::information_schema.cardinal_number + Join Filter: (t1.tenthous = ((64)::information_schema.cardinal_number)::integer) + -> Seq Scan on public.tenk1 t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Filter: (t3.fivethous < 0) + -> Nested Loop + Output: t1.tenthous, t2.ten + -> Nested Loop + Output: t1.tenthous, t2.ten, i4.f1 + Join Filter: (t1.unique1 < i4.f1) + -> Hash Join + Output: t1.tenthous, t1.unique1, t2.ten + Hash Cond: (t2.ten = t1.tenthous) + -> Seq Scan on public.tenk1 t2 + Output: t2.unique1, t2.unique2, t2.two, t2.four, t2.ten, t2.twenty, t2.hundred, t2.thousand, t2.twothousand, t2.fivethous, t2.tenthous, t2.odd, t2.even, t2.stringu1, t2.stringu2, t2.string4 + -> Hash + Output: t1.tenthous, t1.unique1 + -> Nested Loop + Output: t1.tenthous, t1.unique1 + -> Subquery Scan on ss0 + Output: ss0.x, (64)::information_schema.cardinal_number + -> Result + Output: ((abs(t3.unique1))::double precision + random()) + -> Index Scan using tenk1_thous_tenthous on public.tenk1 t1 + Output: t1.unique1, t1.unique2, t1.two, t1.four, t1.ten, t1.twenty, t1.hundred, t1.thousand, t1.twothousand, 
t1.fivethous, t1.tenthous, t1.odd, t1.even, t1.stringu1, t1.stringu2, t1.string4 + Index Cond: (t1.tenthous = (((64)::information_schema.cardinal_number))::integer) + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 + Filter: (i4.f1 = ((64)::information_schema.cardinal_number)::integer) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q1 = ((64)::information_schema.cardinal_number)::integer) +(33 rows) + select ss1.d1 from tenk1 as t1 inner join tenk1 as t2 @@ -4035,6 +4088,195 @@ select * from 1 | 2 | 2 (1 row) +-- This example demonstrates the folly of our old "have_dangerous_phv" logic +begin; +set local from_collapse_limit to 2; +explain (verbose, costs off) +select * from int8_tbl t1 + left join + (select coalesce(t2.q1 + x, 0) from int8_tbl t2, + lateral (select t3.q1 as x from int8_tbl t3, + lateral (select t2.q1, t3.q1 offset 0) s)) + on true; + QUERY PLAN +------------------------------------------------------------------ + Nested Loop Left Join + Output: t1.q1, t1.q2, (COALESCE((t2.q1 + t3.q1), '0'::bigint)) + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 + -> Materialize + Output: (COALESCE((t2.q1 + t3.q1), '0'::bigint)) + -> Nested Loop + Output: COALESCE((t2.q1 + t3.q1), '0'::bigint) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + -> Nested Loop + Output: t3.q1 + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + -> Result + Output: NULL::bigint, NULL::bigint +(16 rows) + +rollback; +-- ... not that the initial replacement didn't have some bugs too +begin; +create temp table t(i int primary key); +explain (verbose, costs off) +select * from t t1 + left join (select 1 as x, * from t t2(i2)) t2ss on t1.i = t2ss.i2 + left join t t3(i3) on false + left join t t4(i4) on t4.i4 > t2ss.x; + QUERY PLAN +---------------------------------------------------------- + Nested Loop Left Join + Output: t1.i, (1), t2.i2, i3, t4.i4 + -> Nested Loop Left Join + Output: t1.i, t2.i2, (1), i3 + Join Filter: false + -> Hash Left Join + Output: t1.i, t2.i2, (1) + Inner Unique: true + Hash Cond: (t1.i = t2.i2) + -> Seq Scan on pg_temp.t t1 + Output: t1.i + -> Hash + Output: t2.i2, (1) + -> Seq Scan on pg_temp.t t2 + Output: t2.i2, 1 + -> Result + Output: i3 + One-Time Filter: false + -> Memoize + Output: t4.i4 + Cache Key: (1) + Cache Mode: binary + -> Index Only Scan using t_pkey on pg_temp.t t4 + Output: t4.i4 + Index Cond: (t4.i4 > (1)) +(25 rows) + +explain (verbose, costs off) +select * from + (select k from + (select i, coalesce(i, j) as k from + (select i from t union all select 0) + join (select 1 as j limit 1) on i = j) + right join (select 2 as x) on true + join (select 3 as y) on i is not null + ), + lateral (select k as kl limit 1); + QUERY PLAN +------------------------------------------------------------------- + Nested Loop + Output: COALESCE(t.i, (1)), ((COALESCE(t.i, (1)))) + -> Limit + Output: 1 + -> Result + Output: 1 + -> Nested Loop + Output: t.i, ((COALESCE(t.i, (1)))) + -> Result + Output: t.i, COALESCE(t.i, (1)) + -> Append + -> Index Only Scan using t_pkey on pg_temp.t + Output: t.i + Index Cond: (t.i = (1)) + -> Result + Output: 0 + One-Time Filter: ((1) = 0) + -> Limit + Output: ((COALESCE(t.i, (1)))) + -> Result + Output: (COALESCE(t.i, (1))) +(21 rows) + +rollback; +-- PHVs containing SubLinks are quite tricky to get right +explain (verbose, costs off) +select * +from int8_tbl i8 + inner join + (select (select true) as x + from int4_tbl i4, lateral (select i4.f1 as y limit 1) ss1 + where i4.f1 = 0) ss2 on true + right 
join (select false as z) ss3 on true, + lateral (select i8.q2 as q2l where x limit 1) ss4 +where i8.q2 = 123; + QUERY PLAN +---------------------------------------------------------------- + Nested Loop + Output: i8.q1, i8.q2, (InitPlan 1).col1, false, (i8.q2) + InitPlan 1 + -> Result + Output: true + InitPlan 2 + -> Result + Output: true + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 + Filter: (i4.f1 = 0) + -> Nested Loop + Output: i8.q1, i8.q2, (i8.q2) + -> Subquery Scan on ss1 + Output: ss1.y, (InitPlan 1).col1 + -> Limit + Output: NULL::integer + -> Result + Output: NULL::integer + -> Nested Loop + Output: i8.q1, i8.q2, (i8.q2) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 123) + -> Limit + Output: (i8.q2) + -> Result + Output: i8.q2 + One-Time Filter: ((InitPlan 1).col1) +(29 rows) + +explain (verbose, costs off) +select * +from int8_tbl i8 + inner join + (select (select true) as x + from int4_tbl i4, lateral (select 1 as y limit 1) ss1 + where i4.f1 = 0) ss2 on true + right join (select false as z) ss3 on true, + lateral (select i8.q2 as q2l where x limit 1) ss4 +where i8.q2 = 123; + QUERY PLAN +---------------------------------------------------------------- + Nested Loop + Output: i8.q1, i8.q2, (InitPlan 1).col1, false, (i8.q2) + InitPlan 1 + -> Result + Output: true + InitPlan 2 + -> Result + Output: true + -> Limit + Output: NULL::integer + -> Result + Output: NULL::integer + -> Nested Loop + Output: i8.q1, i8.q2, (i8.q2) + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1, (InitPlan 1).col1 + Filter: (i4.f1 = 0) + -> Nested Loop + Output: i8.q1, i8.q2, (i8.q2) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 123) + -> Limit + Output: (i8.q2) + -> Result + Output: i8.q2 + One-Time Filter: ((InitPlan 1).col1) +(27 rows) + -- Test proper handling of appendrel PHVs during useless-RTE removal explain (costs off) select * from @@ -5384,14 +5626,14 @@ select * from (select 1 as id) as xx left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) - on (xx.id = coalesce(yy.id)); - QUERY PLAN ---------------------------------------- + on (xx.id = coalesce(yy.id, yy.id)); + QUERY PLAN +------------------------------------------ Nested Loop Left Join -> Result -> Hash Full Join Hash Cond: (a1.unique1 = (1)) - Filter: (1 = COALESCE((1))) + Filter: (1 = COALESCE((1), (1))) -> Seq Scan on tenk1 a1 -> Hash -> Result @@ -5401,7 +5643,7 @@ select * from (select 1 as id) as xx left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) - on (xx.id = coalesce(yy.id)); + on (xx.id = coalesce(yy.id, yy.id)); id | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | id ----+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---- 1 | 1 | 2838 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | EFEAAA | OOOOxx | 1 @@ -8169,20 +8411,20 @@ select * from int4_tbl i left join explain (verbose, costs off) select * from int4_tbl i left join - lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; - QUERY PLAN -------------------------------------- + lateral (select coalesce(i, i) from int2_tbl j where i.f1 = j.f1) k on true; + QUERY PLAN +------------------------------------------ Nested Loop Left Join - Output: i.f1, (COALESCE(i.*)) + Output: i.f1, (COALESCE(i.*, i.*)) -> Seq Scan on 
public.int4_tbl i Output: i.f1, i.* -> Seq Scan on public.int2_tbl j - Output: j.f1, COALESCE(i.*) + Output: j.f1, COALESCE(i.*, i.*) Filter: (i.f1 = j.f1) (7 rows) select * from int4_tbl i left join - lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; + lateral (select coalesce(i, i) from int2_tbl j where i.f1 = j.f1) k on true; f1 | coalesce -------------+---------- 0 | (0) @@ -9351,14 +9593,14 @@ CREATE STATISTICS group_tbl_stat (ndistinct) ON a, b FROM group_tbl; ANALYZE group_tbl; EXPLAIN (COSTS OFF) SELECT 1 FROM group_tbl t1 - LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE + LEFT JOIN (SELECT a c1, COALESCE(a, a) c2 FROM group_tbl t2) s ON TRUE GROUP BY s.c1, s.c2; - QUERY PLAN --------------------------------------------- + QUERY PLAN +------------------------------------------------ Group - Group Key: t2.a, (COALESCE(t2.a)) + Group Key: t2.a, (COALESCE(t2.a, t2.a)) -> Sort - Sort Key: t2.a, (COALESCE(t2.a)) + Sort Key: t2.a, (COALESCE(t2.a, t2.a)) -> Nested Loop Left Join -> Seq Scan on group_tbl t1 -> Seq Scan on group_tbl t2 diff --git a/src/test/regress/expected/limit.out b/src/test/regress/expected/limit.out index f4267c002d7..e3bcc680653 100644 --- a/src/test/regress/expected/limit.out +++ b/src/test/regress/expected/limit.out @@ -647,7 +647,7 @@ View definition: WHERE thousand < 995 ORDER BY thousand OFFSET 10 - FETCH FIRST 5 ROWS WITH TIES; + FETCH FIRST (5) ROWS WITH TIES; CREATE VIEW limit_thousand_v_2 AS SELECT thousand FROM onek WHERE thousand < 995 ORDER BY thousand OFFSET 10 FETCH FIRST 5 ROWS ONLY; @@ -679,10 +679,10 @@ View definition: FROM onek WHERE thousand < 995 ORDER BY thousand - FETCH FIRST (NULL::integer + 1) ROWS WITH TIES; + FETCH FIRST ((NULL::integer + 1)) ROWS WITH TIES; CREATE VIEW limit_thousand_v_4 AS SELECT thousand FROM onek WHERE thousand < 995 - ORDER BY thousand FETCH FIRST NULL ROWS ONLY; + ORDER BY thousand FETCH FIRST (5::bigint) ROWS WITH TIES; \d+ limit_thousand_v_4 View "public.limit_thousand_v_4" Column | Type | Collation | Nullable | Default | Storage | Description @@ -693,6 +693,20 @@ View definition: FROM onek WHERE thousand < 995 ORDER BY thousand + FETCH FIRST (5::bigint) ROWS WITH TIES; + +CREATE VIEW limit_thousand_v_5 AS SELECT thousand FROM onek WHERE thousand < 995 + ORDER BY thousand FETCH FIRST NULL ROWS ONLY; +\d+ limit_thousand_v_5 + View "public.limit_thousand_v_5" + Column | Type | Collation | Nullable | Default | Storage | Description +----------+---------+-----------+----------+---------+---------+------------- + thousand | integer | | | | plain | +View definition: + SELECT thousand + FROM onek + WHERE thousand < 995 + ORDER BY thousand LIMIT ALL; -- leave these views diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out index 54939ecc6b0..c56c9fa3a25 100644 --- a/src/test/regress/expected/matview.out +++ b/src/test/regress/expected/matview.out @@ -587,7 +587,7 @@ CREATE MATERIALIZED VIEW drop_idx_matview AS NOTICE: index "mvtest_drop_idx" does not exist, skipping CREATE UNIQUE INDEX mvtest_drop_idx ON drop_idx_matview (i); REFRESH MATERIALIZED VIEW CONCURRENTLY drop_idx_matview; -ERROR: could not find suitable unique index on materialized view +ERROR: could not find suitable unique index on materialized view "drop_idx_matview" DROP MATERIALIZED VIEW drop_idx_matview; -- clean up RESET search_path; -- make sure that create WITH NO DATA works via SPI diff --git a/src/test/regress/expected/memoize.out b/src/test/regress/expected/memoize.out 
index 38dfaf021c9..150dc1b44cf 100644 --- a/src/test/regress/expected/memoize.out +++ b/src/test/regress/expected/memoize.out @@ -25,6 +25,7 @@ begin ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N'); ln := regexp_replace(ln, 'loops=\d+', 'loops=N'); ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N'); + ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB'); return next ln; end loop; end; @@ -500,3 +501,62 @@ RESET max_parallel_workers_per_gather; RESET parallel_tuple_cost; RESET parallel_setup_cost; RESET min_parallel_table_scan_size; +-- Ensure memoize works for ANTI joins +CREATE TABLE tab_anti (a int, b boolean); +INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i; +ANALYZE tab_anti; +-- Ensure we get a Memoize plan for ANTI join +SELECT explain_memoize(' +SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN +LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2 +ON t1.a+1 = t2.a +WHERE t2.a IS NULL;', false); + explain_memoize +-------------------------------------------------------------------------------------------- + Aggregate (actual rows=1.00 loops=N) + -> Nested Loop Anti Join (actual rows=33.00 loops=N) + -> Seq Scan on tab_anti t1 (actual rows=100.00 loops=N) + -> Memoize (actual rows=0.67 loops=N) + Cache Key: (t1.a + 1), t1.a + Cache Mode: binary + Hits: 97 Misses: 3 Evictions: Zero Overflows: 0 Memory Usage: NkB + -> Subquery Scan on t2 (actual rows=0.67 loops=N) + Filter: ((t1.a + 1) = t2.a) + Rows Removed by Filter: 2 + -> Unique (actual rows=2.67 loops=N) + -> Sort (actual rows=67.33 loops=N) + Sort Key: t2_1.a + Sort Method: quicksort Memory: NkB + -> Seq Scan on tab_anti t2_1 (actual rows=100.00 loops=N) +(15 rows) + +-- And check we get the expected results. +SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN +LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2 +ON t1.a+1 = t2.a +WHERE t2.a IS NULL; + count +------- + 33 +(1 row) + +-- Ensure we do not add memoize node for SEMI join +EXPLAIN (COSTS OFF) +SELECT * FROM tab_anti t1 WHERE t1.a IN + (SELECT a FROM tab_anti t2 WHERE t2.b IN + (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0)); + QUERY PLAN +------------------------------------------------- + Nested Loop Semi Join + -> Seq Scan on tab_anti t1 + -> Nested Loop Semi Join + Join Filter: (t1.a = t2.a) + -> Seq Scan on tab_anti t2 + -> Subquery Scan on "ANY_subquery" + Filter: (t2.b = "ANY_subquery".b) + -> Result + One-Time Filter: (t2.a > 1) + -> Seq Scan on tab_anti t3 +(10 rows) + +DROP TABLE tab_anti; diff --git a/src/test/regress/expected/merge.out b/src/test/regress/expected/merge.out index bcd29668297..cf2219df754 100644 --- a/src/test/regress/expected/merge.out +++ b/src/test/regress/expected/merge.out @@ -2702,6 +2702,76 @@ SELECT * FROM new_measurement ORDER BY city_id, logdate; 1 | 01-17-2007 | | (2 rows) +-- MERGE into inheritance root table +DROP TRIGGER insert_measurement_trigger ON measurement; +ALTER TABLE measurement ADD CONSTRAINT mcheck CHECK (city_id = 0) NO INHERIT; +EXPLAIN (COSTS OFF) +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 100); + QUERY PLAN +-------------------------------------------------------------------------- + Merge on measurement m + Merge on measurement_y2007m01 m_1 + -> Nested Loop Left Join + -> Result + -> Seq Scan on measurement_y2007m01 m_1 + 
Filter: ((city_id = 1) AND (logdate = '01-17-2007'::date)) +(6 rows) + +BEGIN; +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 100); +SELECT * FROM ONLY measurement ORDER BY city_id, logdate; + city_id | logdate | peaktemp | unitsales +---------+------------+----------+----------- + 0 | 07-21-2005 | 25 | 35 + 0 | 01-17-2007 | 25 | 100 +(2 rows) + +ROLLBACK; +ALTER TABLE measurement ENABLE ROW LEVEL SECURITY; +ALTER TABLE measurement FORCE ROW LEVEL SECURITY; +CREATE POLICY measurement_p ON measurement USING (peaktemp IS NOT NULL); +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, NULL, 100); -- should fail +ERROR: new row violates row-level security policy for table "measurement" +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 100); -- ok +SELECT * FROM ONLY measurement ORDER BY city_id, logdate; + city_id | logdate | peaktemp | unitsales +---------+------------+----------+----------- + 0 | 07-21-2005 | 25 | 35 + 0 | 01-17-2007 | 25 | 100 +(2 rows) + +MERGE INTO measurement m + USING (VALUES (1, '01-18-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 200) +RETURNING merge_action(), m.*; + merge_action | city_id | logdate | peaktemp | unitsales +--------------+---------+------------+----------+----------- + INSERT | 0 | 01-18-2007 | 25 | 200 +(1 row) + DROP TABLE measurement, new_measurement CASCADE; NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to table measurement_y2006m02 diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index cc517ed5e90..c3b2b9d8603 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -890,17 +890,17 @@ SELECT pg_column_toast_chunk_id(a) IS NULL, DROP TABLE test_chunk_id; DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool); --- test stratnum support functions -SELECT gist_stratnum_common(7); - gist_stratnum_common ----------------------- - 3 +-- test stratnum translation support functions +SELECT gist_translate_cmptype_common(7); + gist_translate_cmptype_common +------------------------------- + 3 (1 row) -SELECT gist_stratnum_common(3); - gist_stratnum_common ----------------------- - 18 +SELECT gist_translate_cmptype_common(3); + gist_translate_cmptype_common +------------------------------- + 18 (1 row) -- relpath tests diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index 072d76ce131..c58e232a263 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -1464,9 +1464,21 @@ ERROR: count must be greater than zero SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888); ERROR: lower bound cannot equal upper bound SELECT width_bucket('NaN', 3.0, 4.0, 888); -ERROR: operand, lower bound, and upper bound 
cannot be NaN + width_bucket +-------------- + 889 +(1 row) + +SELECT width_bucket('NaN'::float8, 3.0::float8, 4.0::float8, 888); + width_bucket +-------------- + 889 +(1 row) + +SELECT width_bucket(0, 'NaN', 4.0, 888); +ERROR: lower and upper bounds cannot be NaN SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888); -ERROR: operand, lower bound, and upper bound cannot be NaN +ERROR: lower and upper bounds cannot be NaN SELECT width_bucket(2.0, 3.0, '-inf', 888); ERROR: lower and upper bounds must be finite SELECT width_bucket(0::float8, '-inf', 4.0::float8, 888); @@ -3860,15 +3872,15 @@ ERROR: factorial of a negative number is undefined -- Tests for pg_lsn() -- SELECT pg_lsn(23783416::numeric); - pg_lsn ------------ - 0/16AE7F8 + pg_lsn +------------ + 0/016AE7F8 (1 row) SELECT pg_lsn(0::numeric); - pg_lsn --------- - 0/0 + pg_lsn +------------ + 0/00000000 (1 row) SELECT pg_lsn(18446744073709551615::numeric); diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 0bf35260b46..d1966cd7d82 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -4553,16 +4553,18 @@ create view part_abc_view as select * from part_abc where b <> 'a' with check op prepare update_part_abc_view as update part_abc_view set b = $2 where a = $1 returning *; -- Only the unpruned partition should be shown in the list of relations to be -- updated -explain (costs off) execute update_part_abc_view (1, 'd'); - QUERY PLAN -------------------------------------------------------- - Update on part_abc - Update on part_abc_1 +explain (verbose, costs off) execute update_part_abc_view (1, 'd'); + QUERY PLAN +----------------------------------------------------------------------------- + Update on public.part_abc + Output: part_abc_1.a, part_abc_1.b, part_abc_1.c + Update on public.part_abc_1 -> Append Subplans Removed: 1 - -> Seq Scan on part_abc_1 - Filter: ((b <> 'a'::text) AND (a = $1)) -(6 rows) + -> Seq Scan on public.part_abc_1 + Output: $2, part_abc_1.tableoid, part_abc_1.ctid + Filter: ((part_abc_1.b <> 'a'::text) AND (part_abc_1.a = $1)) +(8 rows) execute update_part_abc_view (1, 'd'); a | b | c @@ -4570,28 +4572,31 @@ execute update_part_abc_view (1, 'd'); 1 | d | t (1 row) -explain (costs off) execute update_part_abc_view (2, 'a'); - QUERY PLAN -------------------------------------------------------- - Update on part_abc - Update on part_abc_2 part_abc_1 +explain (verbose, costs off) execute update_part_abc_view (2, 'a'); + QUERY PLAN +----------------------------------------------------------------------------- + Update on public.part_abc + Output: part_abc_1.a, part_abc_1.b, part_abc_1.c + Update on public.part_abc_2 -> Append Subplans Removed: 1 - -> Seq Scan on part_abc_2 part_abc_1 - Filter: ((b <> 'a'::text) AND (a = $1)) -(6 rows) + -> Seq Scan on public.part_abc_2 + Output: $2, part_abc_2.tableoid, part_abc_2.ctid + Filter: ((part_abc_2.b <> 'a'::text) AND (part_abc_2.a = $1)) +(8 rows) execute update_part_abc_view (2, 'a'); ERROR: new row violates check option for view "part_abc_view" DETAIL: Failing row contains (2, a, t). -- All pruned. 
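Two behavior changes condensed from the numeric tests above: width_bucket() now places a NaN operand in the overflow bucket (count + 1) for numeric as well as float8, while NaN bounds remain an error; and pg_lsn text output is now zero-padded to eight hex digits in the low word.

SELECT width_bucket('NaN'::numeric, 3.0, 4.0, 888);  -- 889, the overflow bucket
SELECT width_bucket(0, 'NaN', 4.0, 888);             -- ERROR: lower and upper bounds cannot be NaN
SELECT pg_lsn(23783416::numeric);                    -- 0/016AE7F8 (formerly 0/16AE7F8)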
-explain (costs off) execute update_part_abc_view (3, 'a'); - QUERY PLAN ------------------------------ - Update on part_abc +explain (verbose, costs off) execute update_part_abc_view (3, 'a'); + QUERY PLAN +---------------------------------------------------- + Update on public.part_abc + Output: part_abc_1.a, part_abc_1.b, part_abc_1.c -> Append Subplans Removed: 2 -(3 rows) +(4 rows) execute update_part_abc_view (3, 'a'); a | b | c diff --git a/src/test/regress/expected/pg_lsn.out b/src/test/regress/expected/pg_lsn.out index b27eec7c015..8ab59b2e445 100644 --- a/src/test/regress/expected/pg_lsn.out +++ b/src/test/regress/expected/pg_lsn.out @@ -41,9 +41,9 @@ SELECT * FROM pg_input_error_info('16AE7F7', 'pg_lsn'); -- Min/Max aggregation SELECT MIN(f1), MAX(f1) FROM PG_LSN_TBL; - min | max ------+------------------- - 0/0 | FFFFFFFF/FFFFFFFF + min | max +------------+------------------- + 0/00000000 | FFFFFFFF/FFFFFFFF (1 row) DROP TABLE PG_LSN_TBL; @@ -85,21 +85,21 @@ SELECT '0/16AE7F8'::pg_lsn - '0/16AE7F7'::pg_lsn; (1 row) SELECT '0/16AE7F7'::pg_lsn + 16::numeric; - ?column? ------------ - 0/16AE807 + ?column? +------------ + 0/016AE807 (1 row) SELECT 16::numeric + '0/16AE7F7'::pg_lsn; - ?column? ------------ - 0/16AE807 + ?column? +------------ + 0/016AE807 (1 row) SELECT '0/16AE7F7'::pg_lsn - 16::numeric; - ?column? ------------ - 0/16AE7E7 + ?column? +------------ + 0/016AE7E7 (1 row) SELECT 'FFFFFFFF/FFFFFFFE'::pg_lsn + 1::numeric; @@ -111,9 +111,9 @@ SELECT 'FFFFFFFF/FFFFFFFE'::pg_lsn + 1::numeric; SELECT 'FFFFFFFF/FFFFFFFE'::pg_lsn + 2::numeric; -- out of range error ERROR: pg_lsn out of range SELECT '0/1'::pg_lsn - 1::numeric; - ?column? ----------- - 0/0 + ?column? +------------ + 0/00000000 (1 row) SELECT '0/1'::pg_lsn - 2::numeric; -- out of range error @@ -125,9 +125,9 @@ SELECT '0/0'::pg_lsn + ('FFFFFFFF/FFFFFFFF'::pg_lsn - '0/0'::pg_lsn); (1 row) SELECT 'FFFFFFFF/FFFFFFFF'::pg_lsn - ('FFFFFFFF/FFFFFFFF'::pg_lsn - '0/0'::pg_lsn); - ?column? ----------- - 0/0 + ?column? 
+------------ + 0/00000000 (1 row) SELECT '0/16AE7F7'::pg_lsn + 'NaN'::numeric; @@ -164,107 +164,107 @@ SELECT DISTINCT (i || '/' || j)::pg_lsn f generate_series(1, 5) k WHERE i <= 10 AND j > 0 AND j <= 10 ORDER BY f; - f -------- - 1/1 - 1/2 - 1/3 - 1/4 - 1/5 - 1/6 - 1/7 - 1/8 - 1/9 - 1/10 - 2/1 - 2/2 - 2/3 - 2/4 - 2/5 - 2/6 - 2/7 - 2/8 - 2/9 - 2/10 - 3/1 - 3/2 - 3/3 - 3/4 - 3/5 - 3/6 - 3/7 - 3/8 - 3/9 - 3/10 - 4/1 - 4/2 - 4/3 - 4/4 - 4/5 - 4/6 - 4/7 - 4/8 - 4/9 - 4/10 - 5/1 - 5/2 - 5/3 - 5/4 - 5/5 - 5/6 - 5/7 - 5/8 - 5/9 - 5/10 - 6/1 - 6/2 - 6/3 - 6/4 - 6/5 - 6/6 - 6/7 - 6/8 - 6/9 - 6/10 - 7/1 - 7/2 - 7/3 - 7/4 - 7/5 - 7/6 - 7/7 - 7/8 - 7/9 - 7/10 - 8/1 - 8/2 - 8/3 - 8/4 - 8/5 - 8/6 - 8/7 - 8/8 - 8/9 - 8/10 - 9/1 - 9/2 - 9/3 - 9/4 - 9/5 - 9/6 - 9/7 - 9/8 - 9/9 - 9/10 - 10/1 - 10/2 - 10/3 - 10/4 - 10/5 - 10/6 - 10/7 - 10/8 - 10/9 - 10/10 + f +------------- + 1/00000001 + 1/00000002 + 1/00000003 + 1/00000004 + 1/00000005 + 1/00000006 + 1/00000007 + 1/00000008 + 1/00000009 + 1/00000010 + 2/00000001 + 2/00000002 + 2/00000003 + 2/00000004 + 2/00000005 + 2/00000006 + 2/00000007 + 2/00000008 + 2/00000009 + 2/00000010 + 3/00000001 + 3/00000002 + 3/00000003 + 3/00000004 + 3/00000005 + 3/00000006 + 3/00000007 + 3/00000008 + 3/00000009 + 3/00000010 + 4/00000001 + 4/00000002 + 4/00000003 + 4/00000004 + 4/00000005 + 4/00000006 + 4/00000007 + 4/00000008 + 4/00000009 + 4/00000010 + 5/00000001 + 5/00000002 + 5/00000003 + 5/00000004 + 5/00000005 + 5/00000006 + 5/00000007 + 5/00000008 + 5/00000009 + 5/00000010 + 6/00000001 + 6/00000002 + 6/00000003 + 6/00000004 + 6/00000005 + 6/00000006 + 6/00000007 + 6/00000008 + 6/00000009 + 6/00000010 + 7/00000001 + 7/00000002 + 7/00000003 + 7/00000004 + 7/00000005 + 7/00000006 + 7/00000007 + 7/00000008 + 7/00000009 + 7/00000010 + 8/00000001 + 8/00000002 + 8/00000003 + 8/00000004 + 8/00000005 + 8/00000006 + 8/00000007 + 8/00000008 + 8/00000009 + 8/00000010 + 9/00000001 + 9/00000002 + 9/00000003 + 9/00000004 + 9/00000005 + 9/00000006 + 9/00000007 + 9/00000008 + 9/00000009 + 9/00000010 + 10/00000001 + 10/00000002 + 10/00000003 + 10/00000004 + 10/00000005 + 10/00000006 + 10/00000007 + 10/00000008 + 10/00000009 + 10/00000010 (100 rows) diff --git a/src/test/regress/expected/predicate.out b/src/test/regress/expected/predicate.out index b79037748b7..59bfe33bb1c 100644 --- a/src/test/regress/expected/predicate.out +++ b/src/test/regress/expected/predicate.out @@ -84,10 +84,10 @@ SELECT * FROM pred_tab t WHERE t.a IS NULL OR t.c IS NULL; -- are provably false EXPLAIN (COSTS OFF) SELECT * FROM pred_tab t WHERE t.b IS NULL OR t.c IS NULL; - QUERY PLAN ----------------------------------------- + QUERY PLAN +------------------------ Seq Scan on pred_tab t - Filter: ((b IS NULL) OR (c IS NULL)) + Filter: (b IS NULL) (2 rows) -- @@ -231,6 +231,54 @@ SELECT * FROM pred_tab t1 -> Seq Scan on pred_tab t3 (9 rows) +-- +-- Tests for NullTest reduction in EXISTS sublink +-- +-- Ensure the IS_NOT_NULL qual is ignored +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NOT NULL); + QUERY PLAN +--------------------------------------------------------- + Nested Loop Left Join + Join Filter: EXISTS(SubPlan 1) + -> Seq Scan on pred_tab t1 + -> Materialize + -> Seq Scan on pred_tab t2 + SubPlan 1 + -> Nested Loop + -> Nested Loop + -> Nested Loop + -> Seq Scan on pred_tab t4 + -> Materialize + -> Seq Scan on pred_tab t3 + Filter: (t1.a = a) + -> Materialize + -> Seq 
Scan on pred_tab t5 + -> Materialize + -> Seq Scan on pred_tab t6 +(17 rows) + +-- Ensure the IS_NULL qual is reduced to constant-FALSE +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NULL); + QUERY PLAN +------------------------------------- + Nested Loop Left Join + Join Filter: (InitPlan 1).col1 + InitPlan 1 + -> Result + One-Time Filter: false + -> Seq Scan on pred_tab t1 + -> Materialize + -> Seq Scan on pred_tab t2 +(8 rows) + DROP TABLE pred_tab; -- Validate we handle IS NULL and IS NOT NULL quals correctly with inheritance -- parents. diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index c25062c288f..602a6b255bc 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -2568,6 +2568,26 @@ SELECT makeaclitem('regress_priv_user1'::regrole, 'regress_priv_user2'::regrole, SELECT makeaclitem('regress_priv_user1'::regrole, 'regress_priv_user2'::regrole, 'SELECT, fake_privilege', FALSE); -- error ERROR: unrecognized privilege type: "fake_privilege" +-- Test quoting and dequoting of user names in ACLs +CREATE ROLE "regress_""quoted"; +SELECT makeaclitem('regress_"quoted'::regrole, 'regress_"quoted'::regrole, + 'SELECT', TRUE); + makeaclitem +------------------------------------------ + "regress_""quoted"=r*/"regress_""quoted" +(1 row) + +SELECT '"regress_""quoted"=r*/"regress_""quoted"'::aclitem; + aclitem +------------------------------------------ + "regress_""quoted"=r*/"regress_""quoted" +(1 row) + +SELECT '""=r*/""'::aclitem; -- used to be misparsed as """" +ERROR: a name must follow the "/" sign +LINE 1: SELECT '""=r*/""'::aclitem; + ^ +DROP ROLE "regress_""quoted"; -- Test non-throwing aclitem I/O SELECT pg_input_is_valid('regress_priv_user1=r/regress_priv_user2', 'aclitem'); pg_input_is_valid @@ -3220,7 +3240,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; DROP TABLE lock_table; DROP USER regress_locktable_user; -- test to check privileges of system views pg_shmem_allocations, --- pg_shmem_allocations_numa and pg_backend_memory_contexts. +-- pg_shmem_allocations_numa, pg_dsm_registry_allocations, and +-- pg_backend_memory_contexts. 
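For context on the predicate.out additions above: inside an EXISTS sublink, a NullTest on a column that is provably non-null can now be folded away, so col IS NOT NULL quals are dropped and col IS NULL quals collapse the sublink to constant false. A minimal sketch, with a hypothetical table:

CREATE TABLE nn_demo (a int NOT NULL, b int);
EXPLAIN (COSTS OFF)
SELECT * FROM nn_demo t1 WHERE EXISTS
  (SELECT 1 FROM nn_demo t2 WHERE t1.b = t2.b AND t2.a IS NULL);
-- t2.a IS NULL is provably false, so the plan may show
-- One-Time Filter: false, as in the plans above.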
-- switch to superuser \c - CREATE ROLE regress_readallstats; @@ -3248,6 +3269,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','S f (1 row) +SELECT has_table_privilege('regress_readallstats','pg_dsm_registry_allocations','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes has_table_privilege @@ -3273,6 +3300,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','S t (1 row) +SELECT has_table_privilege('regress_readallstats','pg_dsm_registry_allocations','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; SELECT COUNT(*) >= 0 AS ok FROM pg_aios; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index cf48ae6d0c2..236eba2540e 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -160,12 +160,12 @@ LINE 1: SELECT $1, $2 foo4 | bar4 (1 row) --- \close (extended query protocol) -\close -\close: missing required argument -\close '' -\close stmt2 -\close stmt2 +-- \close_prepared (extended query protocol) +\close_prepared +\close_prepared: missing required argument +\close_prepared '' +\close_prepared stmt2 +\close_prepared stmt2 SELECT name, statement FROM pg_prepared_statements ORDER BY name; name | statement -------+---------------- @@ -4666,7 +4666,7 @@ bar 'bar' "bar" \C arg1 \c arg1 arg2 arg3 arg4 \cd arg1 - \close stmt1 + \close_prepared stmt1 \conninfo \copy arg1 arg2 arg3 arg4 arg5 arg6 \copyright diff --git a/src/test/regress/expected/psql_pipeline.out b/src/test/regress/expected/psql_pipeline.out index a30dec088b9..a0816fb10b6 100644 --- a/src/test/regress/expected/psql_pipeline.out +++ b/src/test/regress/expected/psql_pipeline.out @@ -228,192 +228,6 @@ BEGIN \bind \sendpipeline INSERT INTO psql_pipeline VALUES ($1) \bind 1 \sendpipeline COMMIT \bind \sendpipeline \endpipeline --- COPY FROM STDIN --- with \sendpipeline and \bind -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -COPY psql_pipeline FROM STDIN \bind \sendpipeline -\endpipeline - ?column? ----------- - val1 -(1 row) - --- with semicolon -\startpipeline -SELECT 'val1'; -COPY psql_pipeline FROM STDIN; -\endpipeline - ?column? ----------- - val1 -(1 row) - --- COPY FROM STDIN with \flushrequest + \getresults --- with \sendpipeline and \bind -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -COPY psql_pipeline FROM STDIN \bind \sendpipeline -\flushrequest -\getresults - ?column? ----------- - val1 -(1 row) - -message type 0x5a arrived from server while idle -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -COPY psql_pipeline FROM STDIN; -\flushrequest -\getresults - ?column? ----------- - val1 -(1 row) - -message type 0x5a arrived from server while idle -\endpipeline --- COPY FROM STDIN with \syncpipeline + \getresults --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -COPY psql_pipeline FROM STDIN \bind \sendpipeline -\syncpipeline -\getresults - ?column? ----------- - val1 -(1 row) - -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -COPY psql_pipeline FROM STDIN; -\syncpipeline -\getresults - ?column? 
----------- - val1 -(1 row) - -\endpipeline --- COPY TO STDOUT --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -copy psql_pipeline TO STDOUT \bind \sendpipeline -\endpipeline - ?column? ----------- - val1 -(1 row) - -1 \N -2 test2 -20 test2 -3 test3 -30 test3 -4 test4 -40 test4 --- with semicolon -\startpipeline -SELECT 'val1'; -copy psql_pipeline TO STDOUT; -\endpipeline - ?column? ----------- - val1 -(1 row) - -1 \N -2 test2 -20 test2 -3 test3 -30 test3 -4 test4 -40 test4 --- COPY TO STDOUT with \flushrequest + \getresults --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -copy psql_pipeline TO STDOUT \bind \sendpipeline -\flushrequest -\getresults - ?column? ----------- - val1 -(1 row) - -1 \N -2 test2 -20 test2 -3 test3 -30 test3 -4 test4 -40 test4 -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -copy psql_pipeline TO STDOUT; -\flushrequest -\getresults - ?column? ----------- - val1 -(1 row) - -1 \N -2 test2 -20 test2 -3 test3 -30 test3 -4 test4 -40 test4 -\endpipeline --- COPY TO STDOUT with \syncpipeline + \getresults --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -copy psql_pipeline TO STDOUT \bind \sendpipeline -\syncpipeline -\getresults - ?column? ----------- - val1 -(1 row) - -1 \N -2 test2 -20 test2 -3 test3 -30 test3 -4 test4 -40 test4 -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -copy psql_pipeline TO STDOUT; -\syncpipeline -\getresults - ?column? ----------- - val1 -(1 row) - -1 \N -2 test2 -20 test2 -3 test3 -30 test3 -4 test4 -40 test4 -\endpipeline -- Use \parse and \bind_named \startpipeline SELECT $1 \parse '' @@ -740,7 +554,7 @@ SELECT COUNT(*) FROM psql_pipeline \bind \sendpipeline count ------- - 7 + 1 (1 row) -- After an error, pipeline is aborted and requires \syncpipeline to be @@ -750,7 +564,7 @@ SELECT $1 \bind \sendpipeline SELECT $1 \bind 1 \sendpipeline SELECT $1 \parse a \bind_named a 1 \sendpipeline -\close a +\close_prepared a \flushrequest \getresults ERROR: bind message supplies 0 parameters, but prepared statement "" requires 1 @@ -758,7 +572,7 @@ ERROR: bind message supplies 0 parameters, but prepared statement "" requires 1 SELECT $1 \bind 1 \sendpipeline SELECT $1 \parse a \bind_named a 1 \sendpipeline -\close a +\close_prepared a -- Sync allows pipeline to recover. \syncpipeline \getresults @@ -766,7 +580,7 @@ Pipeline aborted, command did not run SELECT $1 \bind 1 \sendpipeline SELECT $1 \parse a \bind_named a 1 \sendpipeline -\close a +\close_prepared a \flushrequest \getresults ?column? diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out index 4de96c04f9d..53268059142 100644 --- a/src/test/regress/expected/publication.out +++ b/src/test/regress/expected/publication.out @@ -34,7 +34,11 @@ ERROR: conflicting or redundant options LINE 1: ...pub_xxx WITH (publish_generated_columns = stored, publish_ge... ^ CREATE PUBLICATION testpub_xxx WITH (publish_generated_columns = foo); -ERROR: publish_generated_columns requires a "none" or "stored" value +ERROR: invalid value for publication parameter "publish_generated_columns": "foo" +DETAIL: Valid values are "none" and "stored". +CREATE PUBLICATION testpub_xxx WITH (publish_generated_columns); +ERROR: invalid value for publication parameter "publish_generated_columns": "" +DETAIL: Valid values are "none" and "stored". 
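The psql rename above (\close to \close_prepared) makes explicit that the command closes a named prepared statement. The pipeline recovery pattern from these tests, condensed:

\startpipeline
SELECT $1 \parse a
\bind_named a 1 \sendpipeline
\close_prepared a
\syncpipeline
\getresults
\endpipeline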
\dRp List of publications Name | Owner | All tables | Inserts | Updates | Deletes | Truncates | Generated columns | Via root @@ -524,16 +528,22 @@ Tables from schemas: "testpub_rf_schema2" -- fail - virtual generated column uses user-defined function +-- (Actually, this already fails at CREATE TABLE rather than at CREATE +-- PUBLICATION, but let's keep the test in case the former gets +-- relaxed sometime.) CREATE TABLE testpub_rf_tbl6 (id int PRIMARY KEY, x int, y int GENERATED ALWAYS AS (x * testpub_rf_func2()) VIRTUAL); +ERROR: generation expression uses user-defined function +LINE 1: ...RIMARY KEY, x int, y int GENERATED ALWAYS AS (x * testpub_rf... + ^ +DETAIL: Virtual generated columns that make use of user-defined functions are not yet supported. CREATE PUBLICATION testpub7 FOR TABLE testpub_rf_tbl6 WHERE (y > 100); -ERROR: invalid publication WHERE expression -DETAIL: User-defined or built-in mutable functions are not allowed. +ERROR: relation "testpub_rf_tbl6" does not exist -- test that SET EXPRESSION is rejected, because it could affect a row filter SET client_min_messages = 'ERROR'; CREATE TABLE testpub_rf_tbl7 (id int PRIMARY KEY, x int, y int GENERATED ALWAYS AS (x * 111) VIRTUAL); CREATE PUBLICATION testpub8 FOR TABLE testpub_rf_tbl7 WHERE (y > 100); ALTER TABLE testpub_rf_tbl7 ALTER COLUMN y SET EXPRESSION AS (x * testpub_rf_func2()); -ERROR: ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns on tables that are part of a publication +ERROR: ALTER TABLE / SET EXPRESSION is not supported for virtual generated columns in tables that are part of a publication DETAIL: Column "y" of relation "testpub_rf_tbl7" is a virtual generated column. RESET client_min_messages; DROP TABLE testpub_rf_tbl1; @@ -541,7 +551,7 @@ DROP TABLE testpub_rf_tbl2; DROP TABLE testpub_rf_tbl3; DROP TABLE testpub_rf_tbl4; DROP TABLE testpub_rf_tbl5; -DROP TABLE testpub_rf_tbl6; +--DROP TABLE testpub_rf_tbl6; DROP TABLE testpub_rf_schema1.testpub_rf_tbl5; DROP TABLE testpub_rf_schema2.testpub_rf_tbl6; DROP SCHEMA testpub_rf_schema1; @@ -1837,8 +1847,7 @@ DROP SCHEMA sch1 cascade; DROP SCHEMA sch2 cascade; -- ====================================================== -- Test the 'publish_generated_columns' parameter with the following values: --- 'stored', 'none', and the default (no value specified), which defaults to --- 'stored'. +-- 'stored', 'none'. SET client_min_messages = 'ERROR'; CREATE PUBLICATION pub1 FOR ALL TABLES WITH (publish_generated_columns = stored); \dRp+ pub1 @@ -1856,17 +1865,8 @@ CREATE PUBLICATION pub2 FOR ALL TABLES WITH (publish_generated_columns = none); regress_publication_user | t | t | t | t | t | none | f (1 row) -CREATE PUBLICATION pub3 FOR ALL TABLES WITH (publish_generated_columns); -\dRp+ pub3 - Publication pub3 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Generated columns | Via root ---------------------------+------------+---------+---------+---------+-----------+-------------------+---------- - regress_publication_user | t | t | t | t | t | stored | f -(1 row) - DROP PUBLICATION pub1; DROP PUBLICATION pub2; -DROP PUBLICATION pub3; -- Test the 'publish_generated_columns' parameter as 'none' and 'stored' for -- different scenarios with/without generated columns in column lists. 
CREATE TABLE gencols (a int, gen1 int GENERATED ALWAYS AS (a * 2) STORED); @@ -1927,3 +1927,24 @@ RESET client_min_messages; RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; +-- stage objects for pg_dump tests +CREATE SCHEMA pubme CREATE TABLE t0 (c int, d int) CREATE TABLE t1 (c int); +CREATE SCHEMA pubme2 CREATE TABLE t0 (c int, d int); +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION dump_pub_qual_1ct FOR + TABLE ONLY pubme.t0 (c, d) WHERE (c > 0); +CREATE PUBLICATION dump_pub_qual_2ct FOR + TABLE ONLY pubme.t0 (c) WHERE (c > 0), + TABLE ONLY pubme.t1 (c); +CREATE PUBLICATION dump_pub_nsp_1ct FOR + TABLES IN SCHEMA pubme; +CREATE PUBLICATION dump_pub_nsp_2ct FOR + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2; +CREATE PUBLICATION dump_pub_all FOR + TABLE ONLY pubme.t0, + TABLE ONLY pubme.t1 WHERE (c < 0), + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2 + WITH (publish_via_partition_root = true); +RESET client_min_messages; diff --git a/src/test/regress/expected/regproc.out b/src/test/regress/expected/regproc.out index 97b917502ca..84c84aef420 100644 --- a/src/test/regress/expected/regproc.out +++ b/src/test/regress/expected/regproc.out @@ -192,6 +192,18 @@ SELECT regnamespace('"pg_catalog"'); pg_catalog (1 row) +SELECT regdatabase('template1'); + regdatabase +------------- + template1 +(1 row) + +SELECT regdatabase('"template1"'); + regdatabase +------------- + template1 +(1 row) + SELECT to_regrole('regress_regrole_test'); to_regrole ---------------------- @@ -216,6 +228,132 @@ SELECT to_regnamespace('"pg_catalog"'); pg_catalog (1 row) +SELECT to_regdatabase('template1'); + to_regdatabase +---------------- + template1 +(1 row) + +SELECT to_regdatabase('"template1"'); + to_regdatabase +---------------- + template1 +(1 row) + +-- special "single dash" case +SELECT regproc('-')::oid; + regproc +--------- + 0 +(1 row) + +SELECT regprocedure('-')::oid; + regprocedure +-------------- + 0 +(1 row) + +SELECT regclass('-')::oid; + regclass +---------- + 0 +(1 row) + +SELECT regcollation('-')::oid; + regcollation +-------------- + 0 +(1 row) + +SELECT regtype('-')::oid; + regtype +--------- + 0 +(1 row) + +SELECT regconfig('-')::oid; + regconfig +----------- + 0 +(1 row) + +SELECT regdictionary('-')::oid; + regdictionary +--------------- + 0 +(1 row) + +SELECT regrole('-')::oid; + regrole +--------- + 0 +(1 row) + +SELECT regnamespace('-')::oid; + regnamespace +-------------- + 0 +(1 row) + +SELECT regdatabase('-')::oid; + regdatabase +------------- + 0 +(1 row) + +SELECT to_regproc('-')::oid; + to_regproc +------------ + 0 +(1 row) + +SELECT to_regprocedure('-')::oid; + to_regprocedure +----------------- + 0 +(1 row) + +SELECT to_regclass('-')::oid; + to_regclass +------------- + 0 +(1 row) + +SELECT to_regcollation('-')::oid; + to_regcollation +----------------- + 0 +(1 row) + +SELECT to_regtype('-')::oid; + to_regtype +------------ + 0 +(1 row) + +SELECT to_regrole('-')::oid; + to_regrole +------------ + 0 +(1 row) + +SELECT to_regnamespace('-')::oid; + to_regnamespace +----------------- + 0 +(1 row) + +SELECT to_regdatabase('-')::oid; + to_regdatabase +---------------- + 0 +(1 row) + +-- constant cannot be used here +CREATE TABLE regrole_test (rolid OID DEFAULT 'regress_regrole_test'::regrole); +ERROR: constant of the type regrole cannot be used here +CREATE TABLE regdatabase_test (datid OID DEFAULT 'template1'::regdatabase); +ERROR: constant of the type regdatabase cannot be used here /* If 
objects don't exist, raise errors. */ DROP ROLE regress_regrole_test; -- without schemaname @@ -305,6 +443,18 @@ SELECT regnamespace('foo.bar'); ERROR: invalid name syntax LINE 1: SELECT regnamespace('foo.bar'); ^ +SELECT regdatabase('Nonexistent'); +ERROR: database "nonexistent" does not exist +LINE 1: SELECT regdatabase('Nonexistent'); + ^ +SELECT regdatabase('"Nonexistent"'); +ERROR: database "Nonexistent" does not exist +LINE 1: SELECT regdatabase('"Nonexistent"'); + ^ +SELECT regdatabase('foo.bar'); +ERROR: invalid name syntax +LINE 1: SELECT regdatabase('foo.bar'); + ^ /* If objects don't exist, return NULL with no error. */ -- without schemaname SELECT to_regoper('||//'); @@ -447,6 +597,24 @@ SELECT to_regnamespace('foo.bar'); (1 row) +SELECT to_regdatabase('Nonexistent'); + to_regdatabase +---------------- + +(1 row) + +SELECT to_regdatabase('"Nonexistent"'); + to_regdatabase +---------------- + +(1 row) + +SELECT to_regdatabase('foo.bar'); + to_regdatabase +---------------- + +(1 row) + -- Test to_regtypemod SELECT to_regtypemod('text'); to_regtypemod @@ -569,6 +737,12 @@ SELECT * FROM pg_input_error_info('no_such_type', 'regtype'); type "no_such_type" does not exist | | | 42704 (1 row) +SELECT * FROM pg_input_error_info('Nonexistent', 'regdatabase'); + message | detail | hint | sql_error_code +---------------------------------------+--------+------+---------------- + database "nonexistent" does not exist | | | 42704 +(1 row) + -- Some cases that should be soft errors, but are not yet SELECT * FROM pg_input_error_info('incorrect type name syntax', 'regtype'); ERROR: syntax error at or near "type" diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 6cf828ca8d0..35e8aad7701 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1340,6 +1340,10 @@ pg_cursors| SELECT name, is_scrollable, creation_time FROM pg_cursor() c(name, statement, is_holdable, is_binary, is_scrollable, creation_time); +pg_dsm_registry_allocations| SELECT name, + type, + size + FROM pg_get_dsm_registry_allocations() pg_get_dsm_registry_allocations(name, type, size); pg_file_settings| SELECT sourcefile, sourceline, seqno, @@ -1973,7 +1977,12 @@ pg_stat_progress_basebackup| SELECT pid, END AS backup_total, param3 AS backup_streamed, param4 AS tablespaces_total, - param5 AS tablespaces_streamed + param5 AS tablespaces_streamed, + CASE param6 + WHEN 1 THEN 'full'::text + WHEN 2 THEN 'incremental'::text + ELSE NULL::text + END AS backup_type FROM pg_stat_get_progress_info('BASEBACKUP'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20); pg_stat_progress_cluster| SELECT s.pid, s.datid, @@ -2175,13 +2184,14 @@ pg_stat_subscription_stats| SELECT ss.subid, ss.confl_insert_exists, ss.confl_update_origin_differs, ss.confl_update_exists, + ss.confl_update_deleted, ss.confl_update_missing, ss.confl_delete_origin_differs, ss.confl_delete_missing, ss.confl_multiple_unique_conflicts, ss.stats_reset FROM pg_subscription s, - LATERAL pg_stat_get_subscription_stats(s.oid) ss(subid, apply_error_count, sync_error_count, confl_insert_exists, confl_update_origin_differs, confl_update_exists, confl_update_missing, confl_delete_origin_differs, confl_delete_missing, confl_multiple_unique_conflicts, stats_reset); + LATERAL pg_stat_get_subscription_stats(s.oid) ss(subid, apply_error_count, sync_error_count, 
confl_insert_exists, confl_update_origin_differs, confl_update_exists, confl_update_deleted, confl_update_missing, confl_delete_origin_differs, confl_delete_missing, confl_multiple_unique_conflicts, stats_reset); pg_stat_sys_indexes| SELECT relid, indexrelid, schemaname, diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out index 776f1ad0e53..605f5070376 100644 --- a/src/test/regress/expected/stats.out +++ b/src/test/regress/expected/stats.out @@ -926,8 +926,19 @@ DROP TABLE test_stats_temp; -- Checkpoint twice: The checkpointer reports stats after reporting completion -- of the checkpoint. But after a second checkpoint we'll see at least the -- results of the first. -CHECKPOINT; -CHECKPOINT; +-- +-- While at it, test checkpoint options. Note that we don't test MODE SPREAD +-- because it would prolong the test. +CHECKPOINT (WRONG); +ERROR: unrecognized CHECKPOINT option "wrong" +LINE 1: CHECKPOINT (WRONG); + ^ +CHECKPOINT (MODE WRONG); +ERROR: unrecognized MODE option "wrong" +LINE 1: CHECKPOINT (MODE WRONG); + ^ +CHECKPOINT (MODE FAST, FLUSH_UNLOGGED FALSE); +CHECKPOINT (FLUSH_UNLOGGED); SELECT num_requested > :rqst_ckpts_before FROM pg_stat_checkpointer; ?column? ---------- diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 174f0a68331..ba302da51e7 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -236,6 +236,12 @@ SELECT E'De\\678dBeEf'::bytea; ERROR: invalid input syntax for type bytea LINE 1: SELECT E'De\\678dBeEf'::bytea; ^ +SELECT E'DeAd\\\\BeEf'::bytea; + bytea +---------------------- + \x446541645c42654566 +(1 row) + SELECT reverse(''::bytea); reverse --------- @@ -291,6 +297,12 @@ SELECT E'De\\123dBeEf'::bytea; DeSdBeEf (1 row) +SELECT E'DeAd\\\\BeEf'::bytea; + bytea +------------ + DeAd\\BeEf +(1 row) + -- Test non-error-throwing API too SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea'); pg_input_is_valid @@ -614,6 +626,73 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; ERROR: invalid escape string HINT: Escape string must be empty or one character. +-- Characters that should be left alone in character classes when a +-- SIMILAR TO regexp pattern is converted to POSIX style. +-- Underscore "_" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_'; + QUERY PLAN +------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text) +(2 rows) + +-- Percentage "%" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%'; + QUERY PLAN +-------------------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text) +(2 rows) + +-- Dot "." 
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].'; + QUERY PLAN +-------------------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text) +(2 rows) + +-- Dollar "$" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$'; + QUERY PLAN +-------------------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text) +(2 rows) + +-- Opening parenthesis "(" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '()[([:alnum:](]()'; + QUERY PLAN +------------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:(?:)[([:alnum:](](?:))$'::text) +(2 rows) + +-- Caret "^" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^'; + QUERY PLAN +------------------------------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text) +(2 rows) + +-- Closing square bracket "]" at the beginning of character class +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; + QUERY PLAN +------------------------------------------------ + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text) +(2 rows) + +-- Closing square bracket effective after two carets at the beginning +-- of character class. +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; + QUERY PLAN +--------------------------------------- + Seq Scan on text_tbl + Filter: (f1 ~ '^(?:[^^]\^)$'::text) +(2 rows) + -- Test backslash escapes in regexp_replace's replacement string SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); regexp_replace @@ -2012,6 +2091,40 @@ SELECT c FROM toasttest; (1 row) DROP TABLE toasttest; +-- test with short varlenas (up to 126 data bytes reduced to a 1-byte header) +-- being toasted. +CREATE TABLE toasttest (f1 text, f2 text); +ALTER TABLE toasttest SET (toast_tuple_target = 128); +ALTER TABLE toasttest ALTER COLUMN f1 SET STORAGE EXTERNAL; +ALTER TABLE toasttest ALTER COLUMN f2 SET STORAGE EXTERNAL; +-- Here, the first value is a varlena large enough to make it toasted and +-- stored uncompressed. The second value is a short varlena, toasted +-- and stored uncompressed. +INSERT INTO toasttest values(repeat('1234', 1000), repeat('5678', 30)); +SELECT reltoastrelid::regclass AS reltoastname FROM pg_class + WHERE oid = 'toasttest'::regclass \gset +-- There should be two values inserted in the toast relation. 
+SELECT count(*) FROM :reltoastname WHERE chunk_seq = 0; + count +------- + 2 +(1 row) + +SELECT substr(f1, 5, 10) AS f1_data, substr(f2, 5, 10) AS f2_data + FROM toasttest; + f1_data | f2_data +------------+------------ + 1234123412 | 5678567856 +(1 row) + +SELECT pg_column_compression(f1) AS f1_comp, pg_column_compression(f2) AS f2_comp + FROM toasttest; + f1_comp | f2_comp +---------+--------- + | +(1 row) + +DROP TABLE toasttest; -- -- test length -- diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out index 1443e1d9292..a98c97f7616 100644 --- a/src/test/regress/expected/subscription.out +++ b/src/test/regress/expected/subscription.out @@ -116,18 +116,18 @@ CREATE SUBSCRIPTION regress_testsub4 CONNECTION 'dbname=regress_doesnotexist' PU WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ regress_testsub4 - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN -------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | none | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | none | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub4 SET (origin = any); \dRs+ regress_testsub4 - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN -------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub4 | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub3; @@ -145,10 +145,10 @@ ALTER SUBSCRIPTION regress_testsub CONNECTION 'foobar'; ERROR: invalid connection string syntax: missing "=" after "foobar" in connection info string \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET PUBLICATION testpub2, testpub3 WITH (refresh = false); @@ -157,10 +157,10 @@ ALTER SUBSCRIPTION regress_testsub SET (slot_name = 'newname'); ALTER SUBSCRIPTION regress_testsub SET (password_required = false); ALTER SUBSCRIPTION regress_testsub SET (run_as_owner = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | f | t | f | off | dbname=regress_doesnotexist2 | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | f | t | f | f | off | dbname=regress_doesnotexist2 | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (password_required = true); @@ -176,10 +176,10 @@ ERROR: unrecognized subscription parameter: "create_slot" -- ok ALTER SUBSCRIPTION regress_testsub SKIP (lsn = '0/12345'); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist2 | 0/12345 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist2 | 0/00012345 (1 row) -- ok - with lsn = NONE @@ -188,10 +188,10 @@ ALTER SUBSCRIPTION regress_testsub SKIP (lsn = NONE); ALTER SUBSCRIPTION regress_testsub SKIP (lsn = '0/0'); ERROR: invalid WAL location (LSN): 0/0 \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist2 | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist2 | 0/00000000 (1 row) BEGIN; @@ -223,10 +223,10 @@ ALTER SUBSCRIPTION regress_testsub_foo SET (synchronous_commit = foobar); ERROR: invalid value for parameter "synchronous_commit": "foobar" HINT: Available values: local, remote_write, remote_apply, on, off. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ----------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+------------------------------+---------- - regress_testsub_foo | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | local | dbname=regress_doesnotexist2 | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +---------------------+---------------------------+---------+---------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+------------------------------+------------ + regress_testsub_foo | regress_subscription_user | f | {testpub2,testpub3} | f | parallel | d | f | any | t | f | f | f | local | dbname=regress_doesnotexist2 | 0/00000000 (1 row) -- rename back to keep the rest simple @@ -255,19 +255,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | t | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | t | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (binary = false); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub; @@ -279,27 +279,27 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | on | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (streaming = parallel); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (streaming = false); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) -- fail - publication already exists @@ -314,10 +314,10 @@ ALTER SUBSCRIPTION regress_testsub ADD PUBLICATION testpub1, testpub2 WITH (refr ALTER SUBSCRIPTION regress_testsub ADD PUBLICATION testpub1, testpub2 WITH (refresh = false); ERROR: publication "testpub1" is already in subscription "regress_testsub" \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-----------------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub,testpub1,testpub2} | f | off | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-----------------------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub,testpub1,testpub2} | f | off | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) -- fail - publication used more than once @@ -332,10 +332,10 @@ ERROR: publication "testpub3" is not in subscription "regress_testsub" -- ok - delete publications ALTER SUBSCRIPTION regress_testsub DROP PUBLICATION testpub1, testpub2 WITH (refresh = false); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | off | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) DROP SUBSCRIPTION regress_testsub; @@ -371,19 +371,19 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | p | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | p | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) -- we can alter streaming when two_phase enabled ALTER SUBSCRIPTION regress_testsub SET (streaming = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -393,10 +393,10 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | on | p | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); @@ -409,18 +409,34 @@ CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUB WARNING: subscription was created, but is not connected HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (disable_on_error = true); \dRs+ - List of subscriptions - Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Synchronous commit | Conninfo | Skip LSN ------------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+-----------------------------+---------- - regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | t | any | t | f | f | off | dbname=regress_doesnotexist | 0/0 + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? | Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | t | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 +(1 row) + +ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); +DROP SUBSCRIPTION regress_testsub; +-- fail - retain_dead_tuples must be boolean +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = foo); +ERROR: retain_dead_tuples requires a Boolean value +-- ok +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = false); +WARNING: subscription was created, but is not connected +HINT: To initiate replication, you must manually create the replication slot, enable the subscription, and refresh the subscription. +\dRs+ + List of subscriptions + Name | Owner | Enabled | Publication | Binary | Streaming | Two-phase commit | Disable on error | Origin | Password required | Run as owner? 
| Failover | Retain dead tuples | Synchronous commit | Conninfo | Skip LSN +-----------------+---------------------------+---------+-------------+--------+-----------+------------------+------------------+--------+-------------------+---------------+----------+--------------------+--------------------+-----------------------------+------------ + regress_testsub | regress_subscription_user | f | {testpub} | f | parallel | d | f | any | t | f | f | f | off | dbname=regress_doesnotexist | 0/00000000 (1 row) ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 40d8056fcea..18fed63e738 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2127,30 +2127,30 @@ explain (verbose, costs off) select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Sort - Output: (COALESCE(t3.q1)), t4.q1, t4.q2 - Sort Key: (COALESCE(t3.q1)), t4.q1, t4.q2 + Output: (COALESCE(t3.q1, t3.q1)), t4.q1, t4.q2 + Sort Key: (COALESCE(t3.q1, t3.q1)), t4.q1, t4.q2 -> Hash Right Join - Output: (COALESCE(t3.q1)), t4.q1, t4.q2 + Output: (COALESCE(t3.q1, t3.q1)), t4.q1, t4.q2 Hash Cond: (t4.q1 = t1.q2) -> Hash Join - Output: (COALESCE(t3.q1)), t4.q1, t4.q2 + Output: (COALESCE(t3.q1, t3.q1)), t4.q1, t4.q2 Hash Cond: (t2.q2 = t4.q1) -> Hash Left Join - Output: t2.q2, (COALESCE(t3.q1)) + Output: t2.q2, (COALESCE(t3.q1, t3.q1)) Hash Cond: (t2.q1 = t3.q2) -> Seq Scan on public.int8_tbl t2 Output: t2.q1, t2.q2 -> Hash - Output: t3.q2, (COALESCE(t3.q1)) + Output: t3.q2, (COALESCE(t3.q1, t3.q1)) -> Seq Scan on public.int8_tbl t3 - Output: t3.q2, COALESCE(t3.q1) + Output: t3.q2, COALESCE(t3.q1, t3.q1) -> Hash Output: t4.q1, t4.q2 -> Seq Scan on public.int8_tbl t4 @@ -2164,7 +2164,7 @@ order by 1, 2, 3; select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; @@ -2201,32 +2201,32 @@ explain (verbose, costs off) select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Sort - Output: ((COALESCE(t3.q1))), t4.q1, t4.q2 - Sort Key: ((COALESCE(t3.q1))), t4.q1, t4.q2 + Output: ((COALESCE(t3.q1, t3.q1))), t4.q1, t4.q2 + Sort Key: ((COALESCE(t3.q1, t3.q1))), t4.q1, t4.q2 -> Hash Right Join - Output: ((COALESCE(t3.q1))), t4.q1, t4.q2 + Output: ((COALESCE(t3.q1, t3.q1))), t4.q1, t4.q2 Hash Cond: (t4.q1 = t1.q2) -> Nested Loop - Output: t4.q1, t4.q2, 
((COALESCE(t3.q1))) + Output: t4.q1, t4.q2, ((COALESCE(t3.q1, t3.q1))) Join Filter: (t2.q2 = t4.q1) -> Hash Left Join - Output: t2.q2, (COALESCE(t3.q1)) + Output: t2.q2, (COALESCE(t3.q1, t3.q1)) Hash Cond: (t2.q1 = t3.q2) -> Seq Scan on public.int8_tbl t2 Output: t2.q1, t2.q2 -> Hash - Output: t3.q2, (COALESCE(t3.q1)) + Output: t3.q2, (COALESCE(t3.q1, t3.q1)) -> Seq Scan on public.int8_tbl t3 - Output: t3.q2, COALESCE(t3.q1) + Output: t3.q2, COALESCE(t3.q1, t3.q1) -> Seq Scan on public.int8_tbl t4 - Output: t4.q1, t4.q2, (COALESCE(t3.q1)) + Output: t4.q1, t4.q2, (COALESCE(t3.q1, t3.q1)) -> Hash Output: t1.q2 -> Seq Scan on public.int8_tbl t1 @@ -2236,7 +2236,7 @@ order by 1, 2, 3; select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index ae17d028ed3..83228cfca29 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -232,22 +232,3 @@ select * from pg_timezone_abbrevs where abbrev = 'LMT'; LMT | @ 7 hours 52 mins 58 secs ago | f (1 row) -DO $$ -DECLARE - bg_writer_pid int; - r RECORD; -BEGIN - SELECT pid from pg_stat_activity where backend_type='background writer' - INTO bg_writer_pid; - - select type, name, ident - from pg_get_process_memory_contexts(bg_writer_pid, false, 20) - where path = '{1}' into r; - RAISE NOTICE '%', r; - select type, name, ident - from pg_get_process_memory_contexts(pg_backend_pid(), false, 20) - where path = '{1}' into r; - RAISE NOTICE '%', r; -END $$; -NOTICE: (AllocSet,TopMemoryContext,) -NOTICE: (AllocSet,TopMemoryContext,) diff --git a/src/test/regress/expected/timestamp.out b/src/test/regress/expected/timestamp.out index 6aaa19c8f4e..14a9f5b56a6 100644 --- a/src/test/regress/expected/timestamp.out +++ b/src/test/regress/expected/timestamp.out @@ -591,6 +591,16 @@ SELECT date_trunc( 'week', timestamp '2004-02-29 15:44:17.71393' ) AS week_trunc Mon Feb 23 00:00:00 2004 (1 row) +SELECT date_trunc( 'week', timestamp 'infinity' ) AS inf_trunc; + inf_trunc +----------- + infinity +(1 row) + +SELECT date_trunc( 'timezone', timestamp '2004-02-29 15:44:17.71393' ) AS notsupp_trunc; +ERROR: unit "timezone" not supported for type timestamp without time zone +SELECT date_trunc( 'timezone', timestamp 'infinity' ) AS notsupp_inf_trunc; +ERROR: unit "timezone" not supported for type timestamp without time zone SELECT date_trunc( 'ago', timestamp 'infinity' ) AS invalid_trunc; ERROR: unit "ago" not recognized for type timestamp without time zone -- verify date_bin behaves the same as date_trunc for relevant intervals diff --git a/src/test/regress/expected/timestamptz.out b/src/test/regress/expected/timestamptz.out index 2a69953ff25..5dc8a621f6c 100644 --- a/src/test/regress/expected/timestamptz.out +++ b/src/test/regress/expected/timestamptz.out @@ -760,6 +760,16 @@ SELECT date_trunc( 'week', timestamp with time zone '2004-02-29 15:44:17.71393' Mon Feb 23 00:00:00 2004 PST (1 row) +SELECT date_trunc( 'week', timestamp with time zone 'infinity' ) AS inf_trunc; + inf_trunc +----------- + infinity +(1 row) + +SELECT date_trunc( 'timezone', timestamp with time zone '2004-02-29 15:44:17.71393' ) AS notsupp_trunc; +ERROR: unit "timezone" not supported for type 
timestamp with time zone +SELECT date_trunc( 'timezone', timestamp with time zone 'infinity' ) AS notsupp_inf_trunc; +ERROR: unit "timezone" not supported for type timestamp with time zone SELECT date_trunc( 'ago', timestamp with time zone 'infinity' ) AS invalid_trunc; ERROR: unit "ago" not recognized for type timestamp with time zone SELECT date_trunc('day', timestamp with time zone '2001-02-16 20:38:40+00', 'Australia/Sydney') as sydney_trunc; -- zone name @@ -780,6 +790,14 @@ SELECT date_trunc('day', timestamp with time zone '2001-02-16 20:38:40+00', 'VET Thu Feb 15 20:00:00 2001 PST (1 row) +SELECT date_trunc('timezone', timestamp with time zone 'infinity', 'GMT') AS notsupp_zone_trunc; +ERROR: unit "timezone" not supported for type timestamp with time zone +SELECT date_trunc( 'week', timestamp with time zone 'infinity', 'GMT') AS inf_zone_trunc; + inf_zone_trunc +---------------- + infinity +(1 row) + SELECT date_trunc('ago', timestamp with time zone 'infinity', 'GMT') AS invalid_zone_trunc; ERROR: unit "ago" not recognized for type timestamp with time zone -- verify date_bin behaves the same as date_trunc for relevant intervals diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out index f245d7f1549..872b9100e1a 100644 --- a/src/test/regress/expected/triggers.out +++ b/src/test/regress/expected/triggers.out @@ -2280,6 +2280,27 @@ select * from parted; drop table parted; drop function parted_trigfunc(); -- +-- Constraint triggers +-- +create constraint trigger crtr + after insert on foo not valid + for each row execute procedure foo (); +ERROR: constraint triggers cannot be marked NOT VALID +LINE 2: after insert on foo not valid + ^ +create constraint trigger crtr + after insert on foo no inherit + for each row execute procedure foo (); +ERROR: constraint triggers cannot be marked NO INHERIT +LINE 2: after insert on foo no inherit + ^ +create constraint trigger crtr + after insert on foo not enforced + for each row execute procedure foo (); +ERROR: constraint triggers cannot be marked NOT ENFORCED +LINE 2: after insert on foo not enforced + ^ +-- -- Constraint triggers and partitioned tables create table parted_constr_ancestor (a int, b text) partition by range (b); @@ -2294,7 +2315,7 @@ create constraint trigger parted_trig after insert on parted_constr_ancestor deferrable for each row execute procedure trigger_notice_ab(); create constraint trigger parted_trig_two after insert on parted_constr - deferrable initially deferred + deferrable initially deferred enforced for each row when (bark(new.b) AND new.a % 2 = 1) execute procedure trigger_notice_ab(); -- The immediate constraint is fired immediately; the WHEN clause of the @@ -3535,8 +3556,8 @@ drop table parent, child; drop function f(); -- Test who runs deferred trigger functions -- setup -create role regress_groot; -create role regress_outis; +create role regress_caller; +create role regress_fn_owner; create function whoami() returns trigger language plpgsql as $$ begin @@ -3544,7 +3565,7 @@ begin return null; end; $$; -alter function whoami() owner to regress_outis; +alter function whoami() owner to regress_fn_owner; create table defer_trig (id integer); grant insert on defer_trig to public; create constraint trigger whoami after insert on defer_trig @@ -3553,23 +3574,23 @@ create constraint trigger whoami after insert on defer_trig execute function whoami(); -- deferred triggers must run as the user that queued the trigger begin; -set role regress_groot; +set role regress_caller; insert 
into defer_trig values (1); reset role; -set role regress_outis; +set role regress_fn_owner; insert into defer_trig values (2); reset role; commit; -NOTICE: I am regress_groot -NOTICE: I am regress_outis +NOTICE: I am regress_caller +NOTICE: I am regress_fn_owner -- security definer functions override the user who queued the trigger alter function whoami() security definer; begin; -set role regress_groot; +set role regress_caller; insert into defer_trig values (3); reset role; commit; -NOTICE: I am regress_outis +NOTICE: I am regress_fn_owner alter function whoami() security invoker; -- make sure the current user is restored after error create or replace function whoami() returns trigger language plpgsql @@ -3581,11 +3602,11 @@ begin end; $$; begin; -set role regress_groot; +set role regress_caller; insert into defer_trig values (4); reset role; commit; -- error expected -NOTICE: I am regress_groot +NOTICE: I am regress_caller ERROR: division by zero CONTEXT: SQL statement "SELECT 1 / 0" PL/pgSQL function whoami() line 4 at PERFORM @@ -3598,5 +3619,5 @@ select current_user = session_user; -- clean up drop table defer_trig; drop function whoami(); -drop role regress_outis; -drop role regress_groot; +drop role regress_fn_owner; +drop role regress_caller; diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index dd0c52ab08b..943e56506bf 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -711,6 +711,7 @@ CREATE TABLE tab_core_types AS SELECT 'regtype'::regtype type, 'pg_monitor'::regrole, 'pg_class'::regclass::oid, + 'template1'::regdatabase, '(1,1)'::tid, '2'::xid, '3'::cid, '10:20:10,14,15'::txid_snapshot, '10:20:10,14,15'::pg_snapshot, diff --git a/src/test/regress/expected/without_overlaps.out b/src/test/regress/expected/without_overlaps.out index ea607bed0a4..f3144bdc39c 100644 --- a/src/test/regress/expected/without_overlaps.out +++ b/src/test/regress/expected/without_overlaps.out @@ -1426,7 +1426,7 @@ CREATE TABLE temporal_fk_rng2rng ( CONSTRAINT temporal_fk_rng2rng_fk FOREIGN KEY (parent_id, valid_at) REFERENCES temporal_rng (id, valid_at) ); -ERROR: foreign key must use PERIOD when referencing a primary using WITHOUT OVERLAPS +ERROR: foreign key must use PERIOD when referencing a primary key using WITHOUT OVERLAPS -- (parent_id, valid_at) REFERENCES (id, PERIOD valid_at) -- FOREIGN KEY part should specify PERIOD CREATE TABLE temporal_fk_rng2rng ( @@ -1900,7 +1900,7 @@ CREATE TABLE temporal_fk_mltrng2mltrng ( CONSTRAINT temporal_fk_mltrng2mltrng_fk FOREIGN KEY (parent_id, valid_at) REFERENCES temporal_mltrng (id, valid_at) ); -ERROR: foreign key must use PERIOD when referencing a primary using WITHOUT OVERLAPS +ERROR: foreign key must use PERIOD when referencing a primary key using WITHOUT OVERLAPS -- (parent_id, valid_at) REFERENCES (id, PERIOD valid_at) -- FOREIGN KEY part should specify PERIOD CREATE TABLE temporal_fk_mltrng2mltrng ( diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index a424be2a6bf..fbffc67ae60 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -123,7 +123,7 @@ test: plancache limit plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. 
# ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression compression_lz4 memoize stats predicate numa # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 5ce9d1e429f..fc6e36d0e78 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -3069,6 +3069,23 @@ drop table attbl, atref; /* End test case for bug #17409 */ +/* Test case for bug #18970 */ + +create table attbl(a int); +create table atref(b attbl check ((b).a is not null)); +alter table attbl alter column a type numeric; -- someday this should work +alter table atref drop constraint atref_b_check; + +create statistics atref_stat on ((b).a is not null) from atref; +alter table attbl alter column a type numeric; -- someday this should work +drop statistics atref_stat; + +create index atref_idx on atref (((b).a)); +alter table attbl alter column a type numeric; -- someday this should work +drop table attbl, atref; + +/* End test case for bug #18970 */ + -- Test that ALTER TABLE rewrite preserves a clustered index -- for normal indexes and indexes on constraints. create table alttype_cluster (a int); diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 68c61dbc7d1..6aaaa386abc 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -143,38 +143,83 @@ SELECT proname, proargtypes, pronamespace ORDER BY proname DESC, proargtypes DESC, pronamespace DESC LIMIT 1; -- --- Add coverage for RowCompare quals whose rhs row has a NULL that ends scan +-- Forwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; -- --- Add coverage for backwards scan RowCompare quals whose rhs row has a NULL --- that ends scan +-- Forwards scan RowCompare quals whose row arg has a NULL that ends scan -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + +-- +-- Backwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND 
(proname, proargtypes) > ('abs', NULL) + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; -- --- Add coverage for recheck of > key following array advancement on previous --- (left sibling) page that used a high key whose attribute value corresponding --- to the > key was -inf (due to being truncated when the high key was created). +-- Backwards scan RowCompare qual whose row arg has a NULL that ends scan +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + +-- Makes B-Tree preprocessing deal with unmarking redundant keys that were +-- initially marked required (test case relies on current row compare +-- preprocessing limitations) +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + +-- +-- Performs a recheck of > key following array advancement on previous (left +-- sibling) page that used a high key whose attribute value corresponding to +-- the > key was -inf (due to being truncated when the high key was created). +-- -- XXX This relies on the assumption that tenk1_thous_tenthous has a truncated -- high key "(183, -inf)" on the first page that we'll scan. The test will only diff --git a/src/test/regress/sql/compression.sql b/src/test/regress/sql/compression.sql index 490595fcfb2..ce5ea37a660 100644 --- a/src/test/regress/sql/compression.sql +++ b/src/test/regress/sql/compression.sql @@ -1,3 +1,8 @@ +-- Default set of tests for TOAST compression, independent of the compression +-- methods supported by the build.
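The rewritten header above marks the test split: compression.sql now keeps only the pglz cases that every build can run, while the lz4-dependent cases move to the new compression_lz4.sql below. A minimal sketch of the pattern both files lean on, assuming a stock build (table name illustrative):

CREATE TABLE cm_demo (f1 text COMPRESSION pglz);
INSERT INTO cm_demo VALUES (repeat('1234567890', 1000));
SELECT pg_column_compression(f1) FROM cm_demo;  -- reports pglz once the value is large enough to be compressed
DROP TABLE cm_demo;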
+ +CREATE SCHEMA pglz; +SET search_path TO pglz, public; \set HIDE_TOAST_COMPRESSION false -- ensure we get stable results regardless of installation's default @@ -8,53 +13,27 @@ CREATE TABLE cmdata(f1 text COMPRESSION pglz); CREATE INDEX idx ON cmdata(f1); INSERT INTO cmdata VALUES(repeat('1234567890', 1000)); \d+ cmdata -CREATE TABLE cmdata1(f1 TEXT COMPRESSION lz4); -INSERT INTO cmdata1 VALUES(repeat('1234567890', 1004)); -\d+ cmdata1 -- verify stored compression method in the data SELECT pg_column_compression(f1) FROM cmdata; -SELECT pg_column_compression(f1) FROM cmdata1; -- decompress data slice SELECT SUBSTR(f1, 200, 5) FROM cmdata; -SELECT SUBSTR(f1, 2000, 50) FROM cmdata1; -- copy with table creation SELECT * INTO cmmove1 FROM cmdata; \d+ cmmove1 SELECT pg_column_compression(f1) FROM cmmove1; --- copy to existing table -CREATE TABLE cmmove3(f1 text COMPRESSION pglz); -INSERT INTO cmmove3 SELECT * FROM cmdata; -INSERT INTO cmmove3 SELECT * FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove3; - --- test LIKE INCLUDING COMPRESSION -CREATE TABLE cmdata2 (LIKE cmdata1 INCLUDING COMPRESSION); -\d+ cmdata2 -DROP TABLE cmdata2; - -- try setting compression for incompressible data type CREATE TABLE cmdata2 (f1 int COMPRESSION pglz); --- update using datum from different table -CREATE TABLE cmmove2(f1 text COMPRESSION pglz); -INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); -SELECT pg_column_compression(f1) FROM cmmove2; -UPDATE cmmove2 SET f1 = cmdata1.f1 FROM cmdata1; -SELECT pg_column_compression(f1) FROM cmmove2; - -- test externally stored compressed data CREATE OR REPLACE FUNCTION large_val() RETURNS TEXT LANGUAGE SQL AS 'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; CREATE TABLE cmdata2 (f1 text COMPRESSION pglz); INSERT INTO cmdata2 SELECT large_val() || repeat('a', 4000); SELECT pg_column_compression(f1) FROM cmdata2; -INSERT INTO cmdata1 SELECT large_val() || repeat('a', 4000); -SELECT pg_column_compression(f1) FROM cmdata1; -SELECT SUBSTR(f1, 200, 5) FROM cmdata1; SELECT SUBSTR(f1, 200, 5) FROM cmdata2; DROP TABLE cmdata2; @@ -76,76 +55,31 @@ ALTER TABLE cmdata2 ALTER COLUMN f1 SET STORAGE plain; INSERT INTO cmdata2 VALUES (repeat('123456789', 800)); SELECT pg_column_compression(f1) FROM cmdata2; --- test compression with materialized view -CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata1; -\d+ compressmv -SELECT pg_column_compression(f1) FROM cmdata1; -SELECT pg_column_compression(x) FROM compressmv; - --- test compression with partition -CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); -CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); -CREATE TABLE cmpart2(f1 text COMPRESSION pglz); - -ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; -SELECT pg_column_compression(f1) FROM cmpart2; - -- test compression with inheritance -CREATE TABLE cminh() INHERITS(cmdata, cmdata1); -- error -CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata); -- error CREATE TABLE cmdata3(f1 text); CREATE TABLE cminh() INHERITS (cmdata, cmdata3); -- test default_toast_compression GUC +-- suppress machine-dependent details +\set VERBOSITY terse SET default_toast_compression = ''; SET default_toast_compression = 'I do not exist compression'; -SET default_toast_compression = 'lz4'; SET 
default_toast_compression = 'pglz'; - --- test alter compression method -ALTER TABLE cmdata ALTER COLUMN f1 SET COMPRESSION lz4; -INSERT INTO cmdata VALUES (repeat('123456789', 4004)); -\d+ cmdata -SELECT pg_column_compression(f1) FROM cmdata; +\set VERBOSITY default ALTER TABLE cmdata2 ALTER COLUMN f1 SET COMPRESSION default; \d+ cmdata2 --- test alter compression method for materialized views -ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; -\d+ compressmv - --- test alter compression method for partitioned tables -ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; -ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; - --- new data should be compressed with the current compression method -INSERT INTO cmpart VALUES (repeat('123456789', 1004)); -INSERT INTO cmpart VALUES (repeat('123456789', 4004)); -SELECT pg_column_compression(f1) FROM cmpart1; -SELECT pg_column_compression(f1) FROM cmpart2; +DROP TABLE cmdata2; -- VACUUM FULL does not recompress SELECT pg_column_compression(f1) FROM cmdata; VACUUM FULL cmdata; SELECT pg_column_compression(f1) FROM cmdata; --- test expression index -DROP TABLE cmdata2; -CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); -CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); -INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM -generate_series(1, 50) g), VERSION()); - -- check data is ok SELECT length(f1) FROM cmdata; -SELECT length(f1) FROM cmdata1; SELECT length(f1) FROM cmmove1; -SELECT length(f1) FROM cmmove2; -SELECT length(f1) FROM cmmove3; CREATE TABLE badcompresstbl (a text COMPRESSION I_Do_Not_Exist_Compression); -- fails CREATE TABLE badcompresstbl (a text); diff --git a/src/test/regress/sql/compression_lz4.sql b/src/test/regress/sql/compression_lz4.sql new file mode 100644 index 00000000000..3849f8618de --- /dev/null +++ b/src/test/regress/sql/compression_lz4.sql @@ -0,0 +1,129 @@ +-- Tests for TOAST compression with lz4 + +SELECT NOT(enumvals @> '{lz4}') AS skip_test FROM pg_settings WHERE + name = 'default_toast_compression' \gset +\if :skip_test + \echo '*** skipping TOAST tests with lz4 (not supported) ***' + \quit +\endif + +CREATE SCHEMA lz4; +SET search_path TO lz4, public; + +\set HIDE_TOAST_COMPRESSION false + +-- Ensure we get stable results regardless of the installation's default. +-- We rely on this GUC value for a few tests. +SET default_toast_compression = 'pglz'; + +-- test creating table with compression method +CREATE TABLE cmdata_pglz(f1 text COMPRESSION pglz); +CREATE INDEX idx ON cmdata_pglz(f1); +INSERT INTO cmdata_pglz VALUES(repeat('1234567890', 1000)); +\d+ cmdata_pglz +CREATE TABLE cmdata_lz4(f1 TEXT COMPRESSION lz4); +INSERT INTO cmdata_lz4 VALUES(repeat('1234567890', 1004)); +\d+ cmdata_lz4 + +-- verify stored compression method in the data +SELECT pg_column_compression(f1) FROM cmdata_lz4; + +-- decompress data slice +SELECT SUBSTR(f1, 200, 5) FROM cmdata_pglz; +SELECT SUBSTR(f1, 2000, 50) FROM cmdata_lz4; + +-- copy with table creation +SELECT * INTO cmmove1 FROM cmdata_lz4; +\d+ cmmove1 +SELECT pg_column_compression(f1) FROM cmmove1; + +-- test LIKE INCLUDING COMPRESSION. The GUC default_toast_compression +-- has no effect; the compression method comes from the table being copied.
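A minimal sketch of the LIKE INCLUDING COMPRESSION behavior that comment describes, assuming an lz4-enabled build (table names illustrative): the copied column keeps the source's method, and the pglz GUC default plays no part.

SET default_toast_compression = 'pglz';
CREATE TABLE cm_src (f1 text COMPRESSION lz4);
CREATE TABLE cm_dst (LIKE cm_src INCLUDING COMPRESSION);
\d+ cm_dst   -- f1 should still show lz4 compression
DROP TABLE cm_src, cm_dst;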
+CREATE TABLE cmdata2 (LIKE cmdata_lz4 INCLUDING COMPRESSION); +\d+ cmdata2 +DROP TABLE cmdata2; + +-- copy to existing table +CREATE TABLE cmmove3(f1 text COMPRESSION pglz); +INSERT INTO cmmove3 SELECT * FROM cmdata_pglz; +INSERT INTO cmmove3 SELECT * FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove3; + +-- update using datum from different table with LZ4 data. +CREATE TABLE cmmove2(f1 text COMPRESSION pglz); +INSERT INTO cmmove2 VALUES (repeat('1234567890', 1004)); +SELECT pg_column_compression(f1) FROM cmmove2; +UPDATE cmmove2 SET f1 = cmdata_lz4.f1 FROM cmdata_lz4; +SELECT pg_column_compression(f1) FROM cmmove2; + +-- test externally stored compressed data +CREATE OR REPLACE FUNCTION large_val_lz4() RETURNS TEXT LANGUAGE SQL AS +'select array_agg(fipshash(g::text))::text from generate_series(1, 256) g'; +CREATE TABLE cmdata2 (f1 text COMPRESSION lz4); +INSERT INTO cmdata2 SELECT large_val_lz4() || repeat('a', 4000); +SELECT pg_column_compression(f1) FROM cmdata2; +SELECT SUBSTR(f1, 200, 5) FROM cmdata2; +DROP TABLE cmdata2; +DROP FUNCTION large_val_lz4; + +-- test compression with materialized view +CREATE MATERIALIZED VIEW compressmv(x) AS SELECT * FROM cmdata_lz4; +\d+ compressmv +SELECT pg_column_compression(f1) FROM cmdata_lz4; +SELECT pg_column_compression(x) FROM compressmv; + +-- test compression with partition +CREATE TABLE cmpart(f1 text COMPRESSION lz4) PARTITION BY HASH(f1); +CREATE TABLE cmpart1 PARTITION OF cmpart FOR VALUES WITH (MODULUS 2, REMAINDER 0); +CREATE TABLE cmpart2(f1 text COMPRESSION pglz); + +ALTER TABLE cmpart ATTACH PARTITION cmpart2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; +SELECT pg_column_compression(f1) FROM cmpart2; + +-- test compression with inheritance +CREATE TABLE cminh() INHERITS(cmdata_pglz, cmdata_lz4); -- error +CREATE TABLE cminh(f1 TEXT COMPRESSION lz4) INHERITS(cmdata_pglz); -- error +CREATE TABLE cmdata3(f1 text); +CREATE TABLE cminh() INHERITS (cmdata_pglz, cmdata3); + +-- test default_toast_compression GUC +SET default_toast_compression = 'lz4'; + +-- test alter compression method +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION lz4; +INSERT INTO cmdata_pglz VALUES (repeat('123456789', 4004)); +\d+ cmdata_pglz +SELECT pg_column_compression(f1) FROM cmdata_pglz; +ALTER TABLE cmdata_pglz ALTER COLUMN f1 SET COMPRESSION pglz; + +-- test alter compression method for materialized views +ALTER MATERIALIZED VIEW compressmv ALTER COLUMN x SET COMPRESSION lz4; +\d+ compressmv + +-- test alter compression method for partitioned tables +ALTER TABLE cmpart1 ALTER COLUMN f1 SET COMPRESSION pglz; +ALTER TABLE cmpart2 ALTER COLUMN f1 SET COMPRESSION lz4; + +-- new data should be compressed with the current compression method +INSERT INTO cmpart VALUES (repeat('123456789', 1004)); +INSERT INTO cmpart VALUES (repeat('123456789', 4004)); +SELECT pg_column_compression(f1) FROM cmpart1; +SELECT pg_column_compression(f1) FROM cmpart2; + +-- test expression index +CREATE TABLE cmdata2 (f1 TEXT COMPRESSION pglz, f2 TEXT COMPRESSION lz4); +CREATE UNIQUE INDEX idx1 ON cmdata2 ((f1 || f2)); +INSERT INTO cmdata2 VALUES((SELECT array_agg(fipshash(g::TEXT))::TEXT FROM +generate_series(1, 50) g), VERSION()); + +-- check data is ok +SELECT length(f1) FROM cmdata_pglz; +SELECT length(f1) FROM cmdata_lz4; +SELECT length(f1) FROM cmmove1; +SELECT length(f1) FROM cmmove2; +SELECT length(f1) FROM
cmmove3; + +\set HIDE_TOAST_COMPRESSION true diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index 337baab7ced..1f6dc8fd69f 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -537,6 +537,9 @@ CREATE TABLE UNIQUE_NOTEN_TBL(i int UNIQUE NOT ENFORCED); ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key ENFORCED; ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT ENFORCED; +-- can't make an existing constraint NOT VALID +ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT VALID; + DROP TABLE unique_tbl; -- @@ -997,6 +1000,9 @@ create table constr_parent3 (a int not null); create table constr_child3 () inherits (constr_parent2, constr_parent3); EXECUTE get_nnconstraint_info('{constr_parent3, constr_child3}'); +COMMENT ON CONSTRAINT constr_parent2_a_not_null ON constr_parent2 IS 'this constraint is invalid'; +COMMENT ON CONSTRAINT constr_parent2_a_not_null ON constr_child2 IS 'this constraint is valid'; + DEALLOCATE get_nnconstraint_info; -- end NOT NULL NOT VALID @@ -1037,3 +1043,9 @@ DROP DOMAIN constraint_comments_dom; DROP ROLE regress_constraint_comments; DROP ROLE regress_constraint_comments_noaccess; + +-- Leave some constraints for the pg_upgrade test to pick up +CREATE DOMAIN constraint_comments_dom AS int; + +ALTER DOMAIN constraint_comments_dom ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID; +COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom IS 'comment on invalid constraint'; diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index f0b88a23db8..a1316c73bac 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -94,6 +94,36 @@ this is just a line full of junk that would error out if parsed copy copytest4 to stdout (header); +-- test multi-line header line feature + +create temp table copytest5 (c1 int); + +copy copytest5 from stdin (format csv, header 2); +this is a first header line. +this is a second header line. +1 +2 +\. +copy copytest5 to stdout (header); + +truncate copytest5; +copy copytest5 from stdin (format csv, header 4); +this is a first header line. +this is a second header line. +1 +2 +\. +select count(*) from copytest5; + +truncate copytest5; +copy copytest5 from stdin (format csv, header 5); +this is a first header line. +this is a second header line. +1 +2 +\. 
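The copy.sql hunk here exercises the extended HEADER option, which on input accepts an integer count of lines to skip rather than just a boolean; the header 4 and header 5 cases probe a count that meets or exceeds the entire input. A minimal sketch with illustrative names:

CREATE TEMP TABLE hdr_demo (c1 int);
COPY hdr_demo FROM STDIN (FORMAT csv, HEADER 2);  -- discard the first two input lines
first header line
second header line
1
2
\.
SELECT count(*) FROM hdr_demo;  -- 2 rows loaded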
+select count(*) from copytest5; + -- test copy from with a partitioned table create table parted_copytest ( a int, diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql index 45273557ce0..cef45868db5 100644 --- a/src/test/regress/sql/copy2.sql +++ b/src/test/regress/sql/copy2.sql @@ -90,6 +90,9 @@ COPY x to stdout (format BINARY, on_error unsupported); COPY x from stdin (log_verbosity unsupported); COPY x from stdin with (reject_limit 1); COPY x from stdin with (on_error ignore, reject_limit 0); +COPY x from stdin with (header -1); +COPY x from stdin with (header 2.5); +COPY x to stdout with (header 2); -- too many columns in column list: should fail COPY x (a, b, c, d, e, d, c) from stdin; diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index e21ff426519..eabc9623b20 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -635,7 +635,7 @@ DROP TABLE cwi_test; CREATE TABLE syscol_table (a INT); -- System columns cannot be indexed -CREATE INDEX ON syscolcol_table (ctid); +CREATE INDEX ON syscol_table (ctid); -- nor used in expressions CREATE INDEX ON syscol_table ((ctid >= '(1000,0)')); diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql index 6e21722aaeb..bf8702116a7 100644 --- a/src/test/regress/sql/create_table_like.sql +++ b/src/test/regress/sql/create_table_like.sql @@ -143,9 +143,10 @@ COMMENT ON INDEX ctlt1_pkey IS 'index pkey'; COMMENT ON INDEX ctlt1_b_key IS 'index b_key'; ALTER TABLE ctlt1 ALTER COLUMN a SET STORAGE MAIN; -CREATE TABLE ctlt2 (c text); +CREATE TABLE ctlt2 (c text NOT NULL); ALTER TABLE ctlt2 ALTER COLUMN c SET STORAGE EXTERNAL; COMMENT ON COLUMN ctlt2.c IS 'C'; +COMMENT ON CONSTRAINT ctlt2_c_not_null ON ctlt2 IS 't2_c_not_null'; CREATE TABLE ctlt3 (a text CHECK (length(a) < 5), c text CHECK (length(c) < 7)); ALTER TABLE ctlt3 ALTER COLUMN c SET STORAGE EXTERNAL; @@ -162,6 +163,7 @@ CREATE TABLE ctlt12_storage (LIKE ctlt1 INCLUDING STORAGE, LIKE ctlt2 INCLUDING \d+ ctlt12_storage CREATE TABLE ctlt12_comments (LIKE ctlt1 INCLUDING COMMENTS, LIKE ctlt2 INCLUDING COMMENTS); \d+ ctlt12_comments +SELECT conname, description FROM pg_description, pg_constraint c WHERE classoid = 'pg_constraint'::regclass AND objoid = c.oid AND c.conrelid = 'ctlt12_comments'::regclass; CREATE TABLE ctlt1_inh (LIKE ctlt1 INCLUDING CONSTRAINTS INCLUDING COMMENTS) INHERITS (ctlt1); \d+ ctlt1_inh SELECT description FROM pg_description, pg_constraint c WHERE classoid = 'pg_constraint'::regclass AND objoid = c.oid AND c.conrelid = 'ctlt1_inh'::regclass; @@ -197,9 +199,19 @@ DROP TABLE ctlt1, ctlt2, ctlt3, ctlt4, ctlt12_storage, ctlt12_comments, ctlt1_in -- LIKE must respect NO INHERIT property of constraints CREATE TABLE noinh_con_copy (a int CHECK (a > 0) NO INHERIT, b int not null, c int not null no inherit); -CREATE TABLE noinh_con_copy1 (LIKE noinh_con_copy INCLUDING CONSTRAINTS); + +COMMENT ON CONSTRAINT noinh_con_copy_b_not_null ON noinh_con_copy IS 'not null b'; +COMMENT ON CONSTRAINT noinh_con_copy_c_not_null ON noinh_con_copy IS 'not null c no inherit'; + +CREATE TABLE noinh_con_copy1 (LIKE noinh_con_copy INCLUDING CONSTRAINTS INCLUDING COMMENTS); \d+ noinh_con_copy1 +SELECT conname, description +FROM pg_description, pg_constraint c +WHERE classoid = 'pg_constraint'::regclass +AND objoid = c.oid AND c.conrelid = 'noinh_con_copy1'::regclass +ORDER BY conname COLLATE "C"; + -- fail, as partitioned tables don't allow NO INHERIT 
constraints CREATE TABLE noinh_con_copy1_parted (LIKE noinh_con_copy INCLUDING ALL) PARTITION BY LIST (a); diff --git a/src/test/regress/sql/domain.sql b/src/test/regress/sql/domain.sql index b752a63ab5f..b8f5a639712 100644 --- a/src/test/regress/sql/domain.sql +++ b/src/test/regress/sql/domain.sql @@ -602,6 +602,9 @@ insert into domain_test values (1, 2); -- should fail alter table domain_test add column c str_domain; +-- disallow duplicated not-null constraints +create domain int_domain1 as int constraint nn1 not null constraint nn2 not null; + create domain str_domain2 as text check (value <> 'foo') default 'foo'; -- should fail diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 8159e363022..39174ad1eb9 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1296,7 +1296,7 @@ UPDATE fk_notpartitioned_pk SET b = 2504 WHERE a = 2500; -- check psql behavior \d fk_notpartitioned_pk --- Check the exsting FK trigger +-- Check the existing FK trigger SELECT conname, tgrelid::regclass as tgrel, regexp_replace(tgname, '[0-9]+', 'N') as tgname, tgtype FROM pg_trigger t JOIN pg_constraint c ON (t.tgconstraint = c.oid) WHERE tgrelid IN (SELECT relid FROM pg_partition_tree('fk_partitioned_fk'::regclass) @@ -1389,22 +1389,44 @@ WHERE conrelid::regclass::text like 'fk_partitioned_fk%' ORDER BY oid::regclass: DROP TABLE fk_partitioned_fk, fk_notpartitioned_pk; --- NOT VALID foreign key on a non-partitioned table referencing a partitioned table +-- NOT VALID and NOT ENFORCED foreign key on a non-partitioned table +-- referencing a partitioned table CREATE TABLE fk_partitioned_pk (a int, b int, PRIMARY KEY (a, b)) PARTITION BY RANGE (a, b); CREATE TABLE fk_partitioned_pk_1 PARTITION OF fk_partitioned_pk FOR VALUES FROM (0,0) TO (1000,1000); +CREATE TABLE fk_partitioned_pk_2 PARTITION OF fk_partitioned_pk FOR VALUES FROM (1000,1000) TO (2000,2000); CREATE TABLE fk_notpartitioned_fk (b int, a int); -ALTER TABLE fk_notpartitioned_fk ADD FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT VALID; - --- Constraint will be invalid. -SELECT conname, convalidated FROM pg_constraint +INSERT INTO fk_partitioned_pk VALUES(100,100), (1000,1000); +INSERT INTO fk_notpartitioned_fk VALUES(100,100), (1000,1000); +ALTER TABLE fk_notpartitioned_fk ADD CONSTRAINT fk_notpartitioned_fk_a_b_fkey + FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT VALID; +ALTER TABLE fk_notpartitioned_fk ADD CONSTRAINT fk_notpartitioned_fk_a_b_fkey2 + FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT ENFORCED; + +-- All constraints will be invalid, and _fkey2 constraints will not be enforced. +SELECT conname, conenforced, convalidated FROM pg_constraint WHERE conrelid = 'fk_notpartitioned_fk'::regclass ORDER BY oid::regclass::text; ALTER TABLE fk_notpartitioned_fk VALIDATE CONSTRAINT fk_notpartitioned_fk_a_b_fkey; +ALTER TABLE fk_notpartitioned_fk ALTER CONSTRAINT fk_notpartitioned_fk_a_b_fkey2 ENFORCED; --- All constraints are now valid. -SELECT conname, convalidated FROM pg_constraint +-- All constraints are now valid and enforced. 
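A minimal sketch of the NOT ENFORCED lifecycle the foreign_key.sql hunk above probes, with illustrative tables. Per the hunk, ALTER CONSTRAINT ... ENFORCED both turns enforcement on and validates existing rows, so conenforced and convalidated flip together:

CREATE TABLE pk_demo (a int PRIMARY KEY);
CREATE TABLE fk_demo (a int, CONSTRAINT fk_demo_a_fkey FOREIGN KEY (a) REFERENCES pk_demo NOT ENFORCED);
INSERT INTO fk_demo VALUES (42);  -- accepted: nothing is checked while the FK is not enforced
ALTER TABLE fk_demo ALTER CONSTRAINT fk_demo_a_fkey ENFORCED;  -- should fail on the orphaned row
DELETE FROM fk_demo;
ALTER TABLE fk_demo ALTER CONSTRAINT fk_demo_a_fkey ENFORCED;  -- succeeds on clean data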
+SELECT conname, conenforced, convalidated FROM pg_constraint WHERE conrelid = 'fk_notpartitioned_fk'::regclass ORDER BY oid::regclass::text; +-- test a self-referential FK +ALTER TABLE fk_partitioned_pk ADD CONSTRAINT selffk FOREIGN KEY (a, b) REFERENCES fk_partitioned_pk NOT VALID; +CREATE TABLE fk_partitioned_pk_3 PARTITION OF fk_partitioned_pk FOR VALUES FROM (2000,2000) TO (3000,3000) + PARTITION BY RANGE (a); +CREATE TABLE fk_partitioned_pk_3_1 PARTITION OF fk_partitioned_pk_3 FOR VALUES FROM (2000) TO (2100); +SELECT conname, conenforced, convalidated FROM pg_constraint +WHERE conrelid = 'fk_partitioned_pk'::regclass AND contype = 'f' +ORDER BY oid::regclass::text; +ALTER TABLE fk_partitioned_pk_2 VALIDATE CONSTRAINT selffk; +ALTER TABLE fk_partitioned_pk VALIDATE CONSTRAINT selffk; +SELECT conname, conenforced, convalidated FROM pg_constraint +WHERE conrelid = 'fk_partitioned_pk'::regclass AND contype = 'f' +ORDER BY oid::regclass::text; + DROP TABLE fk_notpartitioned_fk, fk_partitioned_pk; -- Test some other exotic foreign key features: MATCH SIMPLE, ON UPDATE/DELETE diff --git a/src/test/regress/sql/generated_stored.sql b/src/test/regress/sql/generated_stored.sql index 4ec155f2da9..f56fde8d4e5 100644 --- a/src/test/regress/sql/generated_stored.sql +++ b/src/test/regress/sql/generated_stored.sql @@ -595,6 +595,19 @@ ALTER TABLE gtest30_1 ALTER COLUMN b DROP EXPRESSION; -- error CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') STORED, c text); CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails + +-- bug #18970: these cases are unsupported, but make sure they fail cleanly +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; + +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +DROP STATISTICS gtest31_2_stat; + +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); + DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too diff --git a/src/test/regress/sql/generated_virtual.sql b/src/test/regress/sql/generated_virtual.sql index b4eedeee2fb..ba19bc4c701 100644 --- a/src/test/regress/sql/generated_virtual.sql +++ b/src/test/regress/sql/generated_virtual.sql @@ -253,10 +253,10 @@ CREATE TABLE gtest4 ( a int, b double_int GENERATED ALWAYS AS ((a * 2, a * 3)) VIRTUAL ); -INSERT INTO gtest4 VALUES (1), (6); -SELECT * FROM gtest4; +--INSERT INTO gtest4 VALUES (1), (6); +--SELECT * FROM gtest4; -DROP TABLE gtest4; +--DROP TABLE gtest4; DROP TYPE double_int; -- using tableoid is allowed @@ -290,20 +290,21 @@ GRANT SELECT (a, c) ON gtest11 TO regress_user11; CREATE FUNCTION gf1(a int) RETURNS int AS $$ SELECT a * 3 $$ IMMUTABLE LANGUAGE SQL; REVOKE ALL ON FUNCTION gf1(int) FROM PUBLIC; -CREATE TABLE gtest12 (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (gf1(b)) VIRTUAL); -INSERT INTO gtest12 VALUES (1, 10), (2, 20); -GRANT SELECT (a, c), INSERT ON gtest12 TO regress_user11; +CREATE TABLE gtest12 (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (gf1(b)) VIRTUAL); -- fails, user-defined function +--INSERT INTO gtest12 VALUES (1, 10), (2, 20); +--GRANT SELECT (a, c), INSERT ON gtest12 TO regress_user11; SET ROLE regress_user11; SELECT a, b FROM gtest11; -- not allowed SELECT a, c FROM gtest11; -- allowed SELECT gf1(10); 
-- not allowed -INSERT INTO gtest12 VALUES (3, 30), (4, 40); -- allowed (does not actually invoke the function) -SELECT a, c FROM gtest12; -- currently not allowed because of function permissions, should arguably be allowed +--INSERT INTO gtest12 VALUES (3, 30), (4, 40); -- allowed (does not actually invoke the function) +--SELECT a, c FROM gtest12; -- currently not allowed because of function permissions, should arguably be allowed RESET ROLE; -DROP FUNCTION gf1(int); -- fail -DROP TABLE gtest11, gtest12; +--DROP FUNCTION gf1(int); -- fail +DROP TABLE gtest11; +--DROP TABLE gtest12; DROP FUNCTION gf1(int); DROP USER regress_user11; @@ -453,11 +454,19 @@ CREATE TABLE gtest24r (a int PRIMARY KEY, b gtestdomain1range GENERATED ALWAYS A --INSERT INTO gtest24r (a) VALUES (4); -- ok --INSERT INTO gtest24r (a) VALUES (6); -- error +CREATE TABLE gtest24at (a int PRIMARY KEY); +ALTER TABLE gtest24at ADD COLUMN b gtestdomain1 GENERATED ALWAYS AS (a * 2) VIRTUAL; -- error +CREATE TABLE gtest24ata (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) VIRTUAL); +ALTER TABLE gtest24ata ALTER COLUMN b TYPE gtestdomain1; -- error + CREATE DOMAIN gtestdomainnn AS int CHECK (VALUE IS NOT NULL); CREATE TABLE gtest24nn (a int, b gtestdomainnn GENERATED ALWAYS AS (a * 2) VIRTUAL); --INSERT INTO gtest24nn (a) VALUES (4); -- ok --INSERT INTO gtest24nn (a) VALUES (NULL); -- error +-- using user-defined type not yet supported +CREATE TABLE gtest24xxx (a gtestdomain1, b gtestdomain1, c int GENERATED ALWAYS AS (greatest(a, b)) VIRTUAL); -- error + -- typed tables (currently not supported) CREATE TYPE gtest_type AS (f1 integer, f2 text, f3 bigint); CREATE TABLE gtest28 OF gtest_type (f1 WITH OPTIONS GENERATED ALWAYS AS (f2 *2) VIRTUAL); @@ -637,6 +646,19 @@ ALTER TABLE gtest30_1 ALTER COLUMN b DROP EXPRESSION; -- error CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') VIRTUAL, c text); CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails + +-- bug #18970 +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; + +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +DROP STATISTICS gtest31_2_stat; + +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); + DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too @@ -788,7 +810,8 @@ create table gtest32 ( a int primary key, b int generated always as (a * 2), c int generated always as (10 + 10), - d int generated always as (coalesce(a, 100)) + d int generated always as (coalesce(a, 100)), + e int ); insert into gtest32 values (1), (2); @@ -829,7 +852,19 @@ select t2.* from gtest32 t1 left join gtest32 t2 on false; select t2.* from gtest32 t1 left join gtest32 t2 on false; explain (verbose, costs off) -select * from gtest32 t group by grouping sets (a, b, c, d) having c = 20; -select * from gtest32 t group by grouping sets (a, b, c, d) having c = 20; +select * from gtest32 t group by grouping sets (a, b, c, d, e) having c = 20; +select * from gtest32 t group by grouping sets (a, b, c, d, e) having c = 20; + +-- Ensure that the virtual generated columns in ALTER COLUMN TYPE USING expression are expanded +alter table gtest32 alter column e type bigint using b; + +-- Ensure that virtual generated column references 
within SubLinks that should +-- be transformed into joins can get expanded +explain (costs off) +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); + +select 1 from gtest32 t1 where exists + (select 1 from gtest32 t2 where t1.a > t2.a and t2.b = 2); drop table gtest32; diff --git a/src/test/regress/sql/horology.sql b/src/test/regress/sql/horology.sql index 1310b432773..8978249a5dc 100644 --- a/src/test/regress/sql/horology.sql +++ b/src/test/regress/sql/horology.sql @@ -102,6 +102,10 @@ SELECT date 'J J 1520447'; SELECT timestamp with time zone 'Y2001M12D27H04M05S06.789+08'; SELECT timestamp with time zone 'Y2001M12D27H04MM05S06.789-08'; +-- More examples we used to accept and should not +SELECT timestamp with time zone 'J2452271 T X03456-08'; +SELECT timestamp with time zone 'J2452271 T X03456.001e6-08'; + -- conflicting fields should throw errors SELECT date '1995-08-06 epoch'; SELECT date '1995-08-06 infinity'; diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql index f1f8fae5654..bbe658a7588 100644 --- a/src/test/regress/sql/incremental_sort.sql +++ b/src/test/regress/sql/incremental_sort.sql @@ -298,3 +298,27 @@ explain (costs off) select * from (select * from tenk1 order by four) t1 join tenk1 t2 on t1.four = t2.four and t1.two = t2.two order by t1.four, t1.two limit 1; + +-- +-- Test incremental sort for Append/MergeAppend +-- +create table prt_tbl (a int, b int) partition by range (a); +create table prt_tbl_1 partition of prt_tbl for values from (0) to (100); +create table prt_tbl_2 partition of prt_tbl for values from (100) to (200); +insert into prt_tbl select i%200, i from generate_series(1,1000)i; +create index on prt_tbl_1(a); +create index on prt_tbl_2(a, b); +analyze prt_tbl; + +set enable_seqscan to off; +set enable_bitmapscan to off; + +-- Ensure we get an incremental sort for the subpath of Append +explain (costs off) select * from prt_tbl order by a, b; + +-- Ensure we get an incremental sort for the subpath of MergeAppend +explain (costs off) select * from prt_tbl_1 union all select * from prt_tbl_2 order by a, b; + +reset enable_bitmapscan; +reset enable_seqscan; +drop table prt_tbl; diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index cc5128add4d..5f0a475894d 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1277,6 +1277,23 @@ where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; -- variant that isn't quite a star-schema case +explain (verbose, costs off) +select ss1.d1 from + tenk1 as t1 + inner join tenk1 as t2 + on t1.tenthous = t2.ten + inner join + int8_tbl as i8 + left join int4_tbl as i4 + inner join (select 64::information_schema.cardinal_number as d1 + from tenk1 t3, + lateral (select abs(t3.unique1) + random()) ss0(x) + where t3.fivethous < 0) as ss1 + on i4.f1 = ss1.d1 + on i8.q1 = i4.f1 + on t1.tenthous = ss1.d1 +where t1.unique1 < i4.f1; + select ss1.d1 from tenk1 as t1 inner join tenk1 as t2 @@ -1332,6 +1349,64 @@ select * from (select 1 as x) ss1 left join (select 2 as y) ss2 on (true), lateral (select ss2.y as z limit 1) ss3; +-- This example demonstrates the folly of our old "have_dangerous_phv" logic +begin; +set local from_collapse_limit to 2; +explain (verbose, costs off) +select * from int8_tbl t1 + left join + (select coalesce(t2.q1 + x, 0) from int8_tbl t2, + lateral (select t3.q1 as x from int8_tbl t3, + lateral (select t2.q1, t3.q1 offset 0) s)) + on true; +rollback; + +-- ... 
not that the initial replacement didn't have some bugs too +begin; +create temp table t(i int primary key); + +explain (verbose, costs off) +select * from t t1 + left join (select 1 as x, * from t t2(i2)) t2ss on t1.i = t2ss.i2 + left join t t3(i3) on false + left join t t4(i4) on t4.i4 > t2ss.x; + +explain (verbose, costs off) +select * from + (select k from + (select i, coalesce(i, j) as k from + (select i from t union all select 0) + join (select 1 as j limit 1) on i = j) + right join (select 2 as x) on true + join (select 3 as y) on i is not null + ), + lateral (select k as kl limit 1); + +rollback; + +-- PHVs containing SubLinks are quite tricky to get right +explain (verbose, costs off) +select * +from int8_tbl i8 + inner join + (select (select true) as x + from int4_tbl i4, lateral (select i4.f1 as y limit 1) ss1 + where i4.f1 = 0) ss2 on true + right join (select false as z) ss3 on true, + lateral (select i8.q2 as q2l where x limit 1) ss4 +where i8.q2 = 123; + +explain (verbose, costs off) +select * +from int8_tbl i8 + inner join + (select (select true) as x + from int4_tbl i4, lateral (select 1 as y limit 1) ss1 + where i4.f1 = 0) ss2 on true + right join (select false as z) ss3 on true, + lateral (select i8.q2 as q2l where x limit 1) ss4 +where i8.q2 = 123; + -- Test proper handling of appendrel PHVs during useless-RTE removal explain (costs off) select * from @@ -1902,13 +1977,13 @@ select * from (select 1 as id) as xx left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) - on (xx.id = coalesce(yy.id)); + on (xx.id = coalesce(yy.id, yy.id)); select * from (select 1 as id) as xx left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) - on (xx.id = coalesce(yy.id)); + on (xx.id = coalesce(yy.id, yy.id)); -- -- test ability to push constants through outer join clauses @@ -3094,9 +3169,9 @@ select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; explain (verbose, costs off) select * from int4_tbl i left join - lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; + lateral (select coalesce(i, i) from int2_tbl j where i.f1 = j.f1) k on true; select * from int4_tbl i left join - lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; + lateral (select coalesce(i, i) from int2_tbl j where i.f1 = j.f1) k on true; explain (verbose, costs off) select * from int4_tbl a, lateral ( @@ -3562,7 +3637,7 @@ ANALYZE group_tbl; EXPLAIN (COSTS OFF) SELECT 1 FROM group_tbl t1 - LEFT JOIN (SELECT a c1, COALESCE(a) c2 FROM group_tbl t2) s ON TRUE + LEFT JOIN (SELECT a c1, COALESCE(a, a) c2 FROM group_tbl t2) s ON TRUE GROUP BY s.c1, s.c2; DROP TABLE group_tbl; diff --git a/src/test/regress/sql/limit.sql b/src/test/regress/sql/limit.sql index 6f0cda98701..603910fe6d1 100644 --- a/src/test/regress/sql/limit.sql +++ b/src/test/regress/sql/limit.sql @@ -196,6 +196,9 @@ CREATE VIEW limit_thousand_v_3 AS SELECT thousand FROM onek WHERE thousand < 995 ORDER BY thousand FETCH FIRST (NULL+1) ROWS WITH TIES; \d+ limit_thousand_v_3 CREATE VIEW limit_thousand_v_4 AS SELECT thousand FROM onek WHERE thousand < 995 - ORDER BY thousand FETCH FIRST NULL ROWS ONLY; + ORDER BY thousand FETCH FIRST (5::bigint) ROWS WITH TIES; \d+ limit_thousand_v_4 +CREATE VIEW limit_thousand_v_5 AS SELECT thousand FROM onek WHERE thousand < 995 + ORDER BY thousand FETCH FIRST NULL ROWS ONLY; +\d+ limit_thousand_v_5 -- leave these views diff --git a/src/test/regress/sql/memoize.sql 
b/src/test/regress/sql/memoize.sql index c0d47fa875a..8d1cdd6990c 100644 --- a/src/test/regress/sql/memoize.sql +++ b/src/test/regress/sql/memoize.sql @@ -26,6 +26,7 @@ begin ln := regexp_replace(ln, 'Heap Fetches: \d+', 'Heap Fetches: N'); ln := regexp_replace(ln, 'loops=\d+', 'loops=N'); ln := regexp_replace(ln, 'Index Searches: \d+', 'Index Searches: N'); + ln := regexp_replace(ln, 'Memory: \d+kB', 'Memory: NkB'); return next ln; end loop; end; @@ -244,3 +245,29 @@ RESET max_parallel_workers_per_gather; RESET parallel_tuple_cost; RESET parallel_setup_cost; RESET min_parallel_table_scan_size; + +-- Ensure memoize works for ANTI joins +CREATE TABLE tab_anti (a int, b boolean); +INSERT INTO tab_anti SELECT i%3, false FROM generate_series(1,100)i; +ANALYZE tab_anti; + +-- Ensure we get a Memoize plan for ANTI join +SELECT explain_memoize(' +SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN +LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2 +ON t1.a+1 = t2.a +WHERE t2.a IS NULL;', false); + +-- And check we get the expected results. +SELECT COUNT(*) FROM tab_anti t1 LEFT JOIN +LATERAL (SELECT DISTINCT ON (a) a, b, t1.a AS x FROM tab_anti t2) t2 +ON t1.a+1 = t2.a +WHERE t2.a IS NULL; + +-- Ensure we do not add memoize node for SEMI join +EXPLAIN (COSTS OFF) +SELECT * FROM tab_anti t1 WHERE t1.a IN + (SELECT a FROM tab_anti t2 WHERE t2.b IN + (SELECT t1.b FROM tab_anti t3 WHERE t2.a > 1 OFFSET 0)); + +DROP TABLE tab_anti; diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql index f7a19c0e7dd..2660b19f238 100644 --- a/src/test/regress/sql/merge.sql +++ b/src/test/regress/sql/merge.sql @@ -1722,6 +1722,55 @@ WHEN MATCHED THEN DELETE; SELECT * FROM new_measurement ORDER BY city_id, logdate; +-- MERGE into inheritance root table +DROP TRIGGER insert_measurement_trigger ON measurement; +ALTER TABLE measurement ADD CONSTRAINT mcheck CHECK (city_id = 0) NO INHERIT; + +EXPLAIN (COSTS OFF) +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 100); + +BEGIN; +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 100); +SELECT * FROM ONLY measurement ORDER BY city_id, logdate; +ROLLBACK; + +ALTER TABLE measurement ENABLE ROW LEVEL SECURITY; +ALTER TABLE measurement FORCE ROW LEVEL SECURITY; +CREATE POLICY measurement_p ON measurement USING (peaktemp IS NOT NULL); + +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, NULL, 100); -- should fail + +MERGE INTO measurement m + USING (VALUES (1, '01-17-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id - 1, logdate, 25, 100); -- ok +SELECT * FROM ONLY measurement ORDER BY city_id, logdate; + +MERGE INTO measurement m + USING (VALUES (1, '01-18-2007'::date)) nm(city_id, logdate) ON + (m.city_id = nm.city_id and m.logdate=nm.logdate) +WHEN NOT MATCHED THEN INSERT + (city_id, logdate, peaktemp, unitsales) + VALUES (city_id 
- 1, logdate, 25, 200) +RETURNING merge_action(), m.*; + DROP TABLE measurement, new_measurement CASCADE; DROP FUNCTION measurement_insert_trigger(); diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index 5f9c77512d1..23792c4132a 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -400,9 +400,9 @@ SELECT pg_column_toast_chunk_id(a) IS NULL, DROP TABLE test_chunk_id; DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool); --- test stratnum support functions -SELECT gist_stratnum_common(7); -SELECT gist_stratnum_common(3); +-- test stratnum translation support functions +SELECT gist_translate_cmptype_common(7); +SELECT gist_translate_cmptype_common(3); -- relpath tests diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql index b98ae27df56..640c6d92f4c 100644 --- a/src/test/regress/sql/numeric.sql +++ b/src/test/regress/sql/numeric.sql @@ -869,6 +869,8 @@ SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, 0); SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, -5); SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888); SELECT width_bucket('NaN', 3.0, 4.0, 888); +SELECT width_bucket('NaN'::float8, 3.0::float8, 4.0::float8, 888); +SELECT width_bucket(0, 'NaN', 4.0, 888); SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888); SELECT width_bucket(2.0, 3.0, '-inf', 888); SELECT width_bucket(0::float8, '-inf', 4.0::float8, 888); diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index f6db9479f54..d93c0c03bab 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -1371,12 +1371,12 @@ create view part_abc_view as select * from part_abc where b <> 'a' with check op prepare update_part_abc_view as update part_abc_view set b = $2 where a = $1 returning *; -- Only the unpruned partition should be shown in the list of relations to be -- updated -explain (costs off) execute update_part_abc_view (1, 'd'); +explain (verbose, costs off) execute update_part_abc_view (1, 'd'); execute update_part_abc_view (1, 'd'); -explain (costs off) execute update_part_abc_view (2, 'a'); +explain (verbose, costs off) execute update_part_abc_view (2, 'a'); execute update_part_abc_view (2, 'a'); -- All pruned. 
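A minimal sketch of the plan-shape check those EXPLAIN calls make (the switch to VERBOSE exposes the surviving target relations), with an illustrative partitioned table: partitions whose bounds cannot match the parameter should vanish from the plan.

CREATE TABLE pp (a int) PARTITION BY LIST (a);
CREATE TABLE pp1 PARTITION OF pp FOR VALUES IN (1);
CREATE TABLE pp2 PARTITION OF pp FOR VALUES IN (2);
PREPARE pp_q AS SELECT * FROM pp WHERE a = $1;
EXPLAIN (COSTS OFF) EXECUTE pp_q (1);  -- only pp1 should appear
DEALLOCATE pp_q;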
-explain (costs off) execute update_part_abc_view (3, 'a'); +explain (verbose, costs off) execute update_part_abc_view (3, 'a'); execute update_part_abc_view (3, 'a'); deallocate update_part_abc_view; diff --git a/src/test/regress/sql/predicate.sql b/src/test/regress/sql/predicate.sql index 9dcb81b1bc5..d92277353a0 100644 --- a/src/test/regress/sql/predicate.sql +++ b/src/test/regress/sql/predicate.sql @@ -115,6 +115,24 @@ SELECT * FROM pred_tab t1 LEFT JOIN pred_tab t2 ON t1.a = 1 LEFT JOIN pred_tab t3 ON t2.a IS NULL OR t2.c IS NULL; +-- +-- Tests for NullTest reduction in EXISTS sublink +-- + +-- Ensure the IS_NOT_NULL qual is ignored +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NOT NULL); + +-- Ensure the IS_NULL qual is reduced to constant-FALSE +EXPLAIN (COSTS OFF) +SELECT * FROM pred_tab t1 + LEFT JOIN pred_tab t2 ON EXISTS + (SELECT 1 FROM pred_tab t3, pred_tab t4, pred_tab t5, pred_tab t6 + WHERE t1.a = t3.a AND t6.a IS NULL); + DROP TABLE pred_tab; -- Validate we handle IS NULL and IS NOT NULL quals correctly with inheritance diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index f337aa67c13..3eacc1340aa 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1544,6 +1544,14 @@ SELECT makeaclitem('regress_priv_user1'::regrole, 'regress_priv_user2'::regrole, SELECT makeaclitem('regress_priv_user1'::regrole, 'regress_priv_user2'::regrole, 'SELECT, fake_privilege', FALSE); -- error +-- Test quoting and dequoting of user names in ACLs +CREATE ROLE "regress_""quoted"; +SELECT makeaclitem('regress_"quoted'::regrole, 'regress_"quoted'::regrole, + 'SELECT', TRUE); +SELECT '"regress_""quoted"=r*/"regress_""quoted"'::aclitem; +SELECT '""=r*/""'::aclitem; -- used to be misparsed as """" +DROP ROLE "regress_""quoted"; + -- Test non-throwing aclitem I/O SELECT pg_input_is_valid('regress_priv_user1=r/regress_priv_user2', 'aclitem'); SELECT pg_input_is_valid('regress_priv_user1=r/', 'aclitem'); @@ -1948,7 +1956,8 @@ DROP TABLE lock_table; DROP USER regress_locktable_user; -- test to check privileges of system views pg_shmem_allocations, --- pg_shmem_allocations_numa and pg_backend_memory_contexts. +-- pg_shmem_allocations_numa, pg_dsm_registry_allocations, and +-- pg_backend_memory_contexts. 
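A minimal sketch of the privilege pattern being extended to pg_dsm_registry_allocations above (role name illustrative): SELECT on these monitoring views comes only through pg_read_all_stats membership.

CREATE ROLE demo_reader;
SELECT has_table_privilege('demo_reader', 'pg_dsm_registry_allocations', 'SELECT');  -- f
GRANT pg_read_all_stats TO demo_reader;
SELECT has_table_privilege('demo_reader', 'pg_dsm_registry_allocations', 'SELECT');  -- t
DROP ROLE demo_reader;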
-- switch to superuser \c - @@ -1959,6 +1968,7 @@ SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- no +SELECT has_table_privilege('regress_readallstats','pg_dsm_registry_allocations','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; @@ -1966,6 +1976,7 @@ SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations_numa','SELECT'); -- yes +SELECT has_table_privilege('regress_readallstats','pg_dsm_registry_allocations','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql index 1a8a83462f0..e2e31245439 100644 --- a/src/test/regress/sql/psql.sql +++ b/src/test/regress/sql/psql.sql @@ -68,11 +68,11 @@ SELECT $1, $2 \parse stmt3 -- Multiple \g calls mean multiple executions \bind_named stmt2 'foo3' \g \bind_named stmt3 'foo4' 'bar4' \g --- \close (extended query protocol) -\close -\close '' -\close stmt2 -\close stmt2 +-- \close_prepared (extended query protocol) +\close_prepared +\close_prepared '' +\close_prepared stmt2 +\close_prepared stmt2 SELECT name, statement FROM pg_prepared_statements ORDER BY name; -- \bind (extended query protocol) @@ -1035,7 +1035,7 @@ select \if false \\ (bogus \else \\ 42 \endif \\ forty_two; \C arg1 \c arg1 arg2 arg3 arg4 \cd arg1 - \close stmt1 + \close_prepared stmt1 \conninfo \copy arg1 arg2 arg3 arg4 arg5 arg6 \copyright diff --git a/src/test/regress/sql/psql_pipeline.sql b/src/test/regress/sql/psql_pipeline.sql index 16e1e1e84cd..6788dceee2e 100644 --- a/src/test/regress/sql/psql_pipeline.sql +++ b/src/test/regress/sql/psql_pipeline.sql @@ -105,106 +105,6 @@ INSERT INTO psql_pipeline VALUES ($1) \bind 1 \sendpipeline COMMIT \bind \sendpipeline \endpipeline --- COPY FROM STDIN --- with \sendpipeline and \bind -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -COPY psql_pipeline FROM STDIN \bind \sendpipeline -\endpipeline -2 test2 -\. --- with semicolon -\startpipeline -SELECT 'val1'; -COPY psql_pipeline FROM STDIN; -\endpipeline -20 test2 -\. - --- COPY FROM STDIN with \flushrequest + \getresults --- with \sendpipeline and \bind -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -COPY psql_pipeline FROM STDIN \bind \sendpipeline -\flushrequest -\getresults -3 test3 -\. -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -COPY psql_pipeline FROM STDIN; -\flushrequest -\getresults -30 test3 -\. -\endpipeline - --- COPY FROM STDIN with \syncpipeline + \getresults --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -COPY psql_pipeline FROM STDIN \bind \sendpipeline -\syncpipeline -\getresults -4 test4 -\. -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -COPY psql_pipeline FROM STDIN; -\syncpipeline -\getresults -40 test4 -\. 
-\endpipeline - --- COPY TO STDOUT --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -copy psql_pipeline TO STDOUT \bind \sendpipeline -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -copy psql_pipeline TO STDOUT; -\endpipeline - --- COPY TO STDOUT with \flushrequest + \getresults --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -copy psql_pipeline TO STDOUT \bind \sendpipeline -\flushrequest -\getresults -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -copy psql_pipeline TO STDOUT; -\flushrequest -\getresults -\endpipeline - --- COPY TO STDOUT with \syncpipeline + \getresults --- with \bind and \sendpipeline -\startpipeline -SELECT $1 \bind 'val1' \sendpipeline -copy psql_pipeline TO STDOUT \bind \sendpipeline -\syncpipeline -\getresults -\endpipeline --- with semicolon -\startpipeline -SELECT 'val1'; -copy psql_pipeline TO STDOUT; -\syncpipeline -\getresults -\endpipeline - -- Use \parse and \bind_named \startpipeline SELECT $1 \parse '' @@ -406,21 +306,21 @@ SELECT $1 \bind \sendpipeline SELECT $1 \bind 1 \sendpipeline SELECT $1 \parse a \bind_named a 1 \sendpipeline -\close a +\close_prepared a \flushrequest \getresults -- Pipeline is aborted. SELECT $1 \bind 1 \sendpipeline SELECT $1 \parse a \bind_named a 1 \sendpipeline -\close a +\close_prepared a -- Sync allows pipeline to recover. \syncpipeline \getresults SELECT $1 \bind 1 \sendpipeline SELECT $1 \parse a \bind_named a 1 \sendpipeline -\close a +\close_prepared a \flushrequest \getresults \endpipeline diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql index 68001de4000..deddf0da844 100644 --- a/src/test/regress/sql/publication.sql +++ b/src/test/regress/sql/publication.sql @@ -26,6 +26,7 @@ CREATE PUBLICATION testpub_xxx WITH (publish = 'cluster, vacuum'); CREATE PUBLICATION testpub_xxx WITH (publish_via_partition_root = 'true', publish_via_partition_root = '0'); CREATE PUBLICATION testpub_xxx WITH (publish_generated_columns = stored, publish_generated_columns = none); CREATE PUBLICATION testpub_xxx WITH (publish_generated_columns = foo); +CREATE PUBLICATION testpub_xxx WITH (publish_generated_columns); \dRp @@ -262,6 +263,9 @@ ALTER PUBLICATION testpub6 SET TABLES IN SCHEMA testpub_rf_schema2, TABLE testpu RESET client_min_messages; \dRp+ testpub6 -- fail - virtual generated column uses user-defined function +-- (Actually, this already fails at CREATE TABLE rather than at CREATE +-- PUBLICATION, but let's keep the test in case the former gets +-- relaxed sometime.) CREATE TABLE testpub_rf_tbl6 (id int PRIMARY KEY, x int, y int GENERATED ALWAYS AS (x * testpub_rf_func2()) VIRTUAL); CREATE PUBLICATION testpub7 FOR TABLE testpub_rf_tbl6 WHERE (y > 100); -- test that SET EXPRESSION is rejected, because it could affect a row filter @@ -276,7 +280,7 @@ DROP TABLE testpub_rf_tbl2; DROP TABLE testpub_rf_tbl3; DROP TABLE testpub_rf_tbl4; DROP TABLE testpub_rf_tbl5; -DROP TABLE testpub_rf_tbl6; +--DROP TABLE testpub_rf_tbl6; DROP TABLE testpub_rf_schema1.testpub_rf_tbl5; DROP TABLE testpub_rf_schema2.testpub_rf_tbl6; DROP SCHEMA testpub_rf_schema1; @@ -1180,19 +1184,15 @@ DROP SCHEMA sch2 cascade; -- ====================================================== -- Test the 'publish_generated_columns' parameter with the following values: --- 'stored', 'none', and the default (no value specified), which defaults to --- 'stored'. +-- 'stored', 'none'. 
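A minimal sketch of the two settings that remain valid for publish_generated_columns, with an illustrative table; per the hunks above, the bare form WITH (publish_generated_columns) is now exercised as an error case instead of silently defaulting to 'stored':

CREATE TABLE gen_demo (a int, b int GENERATED ALWAYS AS (a * 2) STORED);
CREATE PUBLICATION pub_stored FOR TABLE gen_demo WITH (publish_generated_columns = stored);
CREATE PUBLICATION pub_none FOR TABLE gen_demo WITH (publish_generated_columns = none);
DROP PUBLICATION pub_stored;
DROP PUBLICATION pub_none;
DROP TABLE gen_demo;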
SET client_min_messages = 'ERROR'; CREATE PUBLICATION pub1 FOR ALL TABLES WITH (publish_generated_columns = stored); \dRp+ pub1 CREATE PUBLICATION pub2 FOR ALL TABLES WITH (publish_generated_columns = none); \dRp+ pub2 -CREATE PUBLICATION pub3 FOR ALL TABLES WITH (publish_generated_columns); -\dRp+ pub3 DROP PUBLICATION pub1; DROP PUBLICATION pub2; -DROP PUBLICATION pub3; -- Test the 'publish_generated_columns' parameter as 'none' and 'stored' for -- different scenarios with/without generated columns in column lists. @@ -1226,3 +1226,25 @@ RESET client_min_messages; RESET SESSION AUTHORIZATION; DROP ROLE regress_publication_user, regress_publication_user2; DROP ROLE regress_publication_user_dummy; + +-- stage objects for pg_dump tests +CREATE SCHEMA pubme CREATE TABLE t0 (c int, d int) CREATE TABLE t1 (c int); +CREATE SCHEMA pubme2 CREATE TABLE t0 (c int, d int); +SET client_min_messages = 'ERROR'; +CREATE PUBLICATION dump_pub_qual_1ct FOR + TABLE ONLY pubme.t0 (c, d) WHERE (c > 0); +CREATE PUBLICATION dump_pub_qual_2ct FOR + TABLE ONLY pubme.t0 (c) WHERE (c > 0), + TABLE ONLY pubme.t1 (c); +CREATE PUBLICATION dump_pub_nsp_1ct FOR + TABLES IN SCHEMA pubme; +CREATE PUBLICATION dump_pub_nsp_2ct FOR + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2; +CREATE PUBLICATION dump_pub_all FOR + TABLE ONLY pubme.t0, + TABLE ONLY pubme.t1 WHERE (c < 0), + TABLES IN SCHEMA pubme, + TABLES IN SCHEMA pubme2 + WITH (publish_via_partition_root = true); +RESET client_min_messages; diff --git a/src/test/regress/sql/regproc.sql b/src/test/regress/sql/regproc.sql index 232289ac398..cfec8f8c754 100644 --- a/src/test/regress/sql/regproc.sql +++ b/src/test/regress/sql/regproc.sql @@ -47,11 +47,42 @@ SELECT regrole('regress_regrole_test'); SELECT regrole('"regress_regrole_test"'); SELECT regnamespace('pg_catalog'); SELECT regnamespace('"pg_catalog"'); +SELECT regdatabase('template1'); +SELECT regdatabase('"template1"'); SELECT to_regrole('regress_regrole_test'); SELECT to_regrole('"regress_regrole_test"'); SELECT to_regnamespace('pg_catalog'); SELECT to_regnamespace('"pg_catalog"'); +SELECT to_regdatabase('template1'); +SELECT to_regdatabase('"template1"'); + +-- special "single dash" case + +SELECT regproc('-')::oid; +SELECT regprocedure('-')::oid; +SELECT regclass('-')::oid; +SELECT regcollation('-')::oid; +SELECT regtype('-')::oid; +SELECT regconfig('-')::oid; +SELECT regdictionary('-')::oid; +SELECT regrole('-')::oid; +SELECT regnamespace('-')::oid; +SELECT regdatabase('-')::oid; + +SELECT to_regproc('-')::oid; +SELECT to_regprocedure('-')::oid; +SELECT to_regclass('-')::oid; +SELECT to_regcollation('-')::oid; +SELECT to_regtype('-')::oid; +SELECT to_regrole('-')::oid; +SELECT to_regnamespace('-')::oid; +SELECT to_regdatabase('-')::oid; + +-- constant cannot be used here + +CREATE TABLE regrole_test (rolid OID DEFAULT 'regress_regrole_test'::regrole); +CREATE TABLE regdatabase_test (datid OID DEFAULT 'template1'::regdatabase); /* If objects don't exist, raise errors. */ @@ -88,6 +119,9 @@ SELECT regrole('foo.bar'); SELECT regnamespace('Nonexistent'); SELECT regnamespace('"Nonexistent"'); SELECT regnamespace('foo.bar'); +SELECT regdatabase('Nonexistent'); +SELECT regdatabase('"Nonexistent"'); +SELECT regdatabase('foo.bar'); /* If objects don't exist, return NULL with no error. 
*/ @@ -122,6 +156,9 @@ SELECT to_regrole('foo.bar'); SELECT to_regnamespace('Nonexistent'); SELECT to_regnamespace('"Nonexistent"'); SELECT to_regnamespace('foo.bar'); +SELECT to_regdatabase('Nonexistent'); +SELECT to_regdatabase('"Nonexistent"'); +SELECT to_regdatabase('foo.bar'); -- Test to_regtypemod SELECT to_regtypemod('text'); @@ -147,6 +184,7 @@ SELECT * FROM pg_input_error_info('ng_catalog.abs(numeric)', 'regprocedure'); SELECT * FROM pg_input_error_info('ng_catalog.abs(numeric', 'regprocedure'); SELECT * FROM pg_input_error_info('regress_regrole_test', 'regrole'); SELECT * FROM pg_input_error_info('no_such_type', 'regtype'); +SELECT * FROM pg_input_error_info('Nonexistent', 'regdatabase'); -- Some cases that should be soft errors, but are not yet SELECT * FROM pg_input_error_info('incorrect type name syntax', 'regtype'); diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql index 232ab8db8fa..54e72866344 100644 --- a/src/test/regress/sql/stats.sql +++ b/src/test/regress/sql/stats.sql @@ -439,8 +439,13 @@ DROP TABLE test_stats_temp; -- Checkpoint twice: The checkpointer reports stats after reporting completion -- of the checkpoint. But after a second checkpoint we'll see at least the -- results of the first. -CHECKPOINT; -CHECKPOINT; +-- +-- While at it, test checkpoint options. Note that we don't test MODE SPREAD +-- because it would prolong the test. +CHECKPOINT (WRONG); +CHECKPOINT (MODE WRONG); +CHECKPOINT (MODE FAST, FLUSH_UNLOGGED FALSE); +CHECKPOINT (FLUSH_UNLOGGED); SELECT num_requested > :rqst_ckpts_before FROM pg_stat_checkpointer; SELECT wal_bytes > :wal_bytes_before FROM pg_stat_wal; diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index f7b325baadf..b94004cc08c 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -76,6 +76,7 @@ SELECT E'De\\000dBeEf'::bytea; SELECT E'De\123dBeEf'::bytea; SELECT E'De\\123dBeEf'::bytea; SELECT E'De\\678dBeEf'::bytea; +SELECT E'DeAd\\\\BeEf'::bytea; SELECT reverse(''::bytea); SELECT reverse('\xaa'::bytea); @@ -88,6 +89,7 @@ SELECT E'\\xDe00BeEf'::bytea; SELECT E'DeAdBeEf'::bytea; SELECT E'De\\000dBeEf'::bytea; SELECT E'De\\123dBeEf'::bytea; +SELECT E'DeAd\\\\BeEf'::bytea; -- Test non-error-throwing API too SELECT pg_input_is_valid(E'\\xDeAdBeE', 'bytea'); @@ -197,6 +199,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true; SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null; SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error; +-- Characters that should be left alone in character classes when a +-- SIMILAR TO regexp pattern is converted to POSIX style. +-- Underscore "_" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_'; +-- Percentage "%" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%'; +-- Dot "." +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].'; +-- Dollar "$" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$'; +-- Opening parenthesis "(" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '()[([:alnum:](]()'; +-- Caret "^" +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^'; +-- Closing square bracket "]" at the beginning of character class +EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%'; +-- Closing square bracket effective after two carets at the beginning +-- of character class. 
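Before the final pattern below: the SIMILAR TO-to-POSIX rewriting that these EXPLAIN plans expose can also be inspected directly with similar_to_escape(), assuming a release that exposes that conversion function at the SQL level; outputs elided here:

SELECT similar_to_escape('%[%[:alnum:]%]%');  -- '%' is a wildcard outside a class, literal inside it
SELECT similar_to_escape('[]%][^]%][^%]%');   -- ']' right after '[' or '[^' stays literal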
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^'; + -- Test backslash escapes in regexp_replace's replacement string SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3'); SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g'); @@ -628,6 +650,26 @@ SELECT length(c), c::text FROM toasttest; SELECT c FROM toasttest; DROP TABLE toasttest; +-- test with short varlenas (up to 126 data bytes reduced to a 1-byte header) +-- being toasted. +CREATE TABLE toasttest (f1 text, f2 text); +ALTER TABLE toasttest SET (toast_tuple_target = 128); +ALTER TABLE toasttest ALTER COLUMN f1 SET STORAGE EXTERNAL; +ALTER TABLE toasttest ALTER COLUMN f2 SET STORAGE EXTERNAL; +-- Here, the first value is a varlena large enough to make it toasted and +-- stored uncompressed. The second value is a short varlena, toasted +-- and stored uncompressed. +INSERT INTO toasttest values(repeat('1234', 1000), repeat('5678', 30)); +SELECT reltoastrelid::regclass AS reltoastname FROM pg_class + WHERE oid = 'toasttest'::regclass \gset +-- There should be two values inserted in the toast relation. +SELECT count(*) FROM :reltoastname WHERE chunk_seq = 0; +SELECT substr(f1, 5, 10) AS f1_data, substr(f2, 5, 10) AS f2_data + FROM toasttest; +SELECT pg_column_compression(f1) AS f1_comp, pg_column_compression(f2) AS f2_comp + FROM toasttest; +DROP TABLE toasttest; + -- -- test length -- diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql index 007c9e70374..f0f714fe747 100644 --- a/src/test/regress/sql/subscription.sql +++ b/src/test/regress/sql/subscription.sql @@ -287,6 +287,17 @@ ALTER SUBSCRIPTION regress_testsub SET (disable_on_error = true); ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); DROP SUBSCRIPTION regress_testsub; +-- fail - retain_dead_tuples must be boolean +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = foo); + +-- ok +CREATE SUBSCRIPTION regress_testsub CONNECTION 'dbname=regress_doesnotexist' PUBLICATION testpub WITH (connect = false, retain_dead_tuples = false); + +\dRs+ + +ALTER SUBSCRIPTION regress_testsub SET (slot_name = NONE); +DROP SUBSCRIPTION regress_testsub; + -- let's do some tests with pg_create_subscription rather than superuser SET SESSION AUTHORIZATION regress_subscription_user3; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index fec38ef85a6..d9a841fbc9f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -1041,7 +1041,7 @@ explain (verbose, costs off) select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; @@ -1049,7 +1049,7 @@ order by 1, 2, 3; select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 inner join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; @@ -1059,7 +1059,7 @@ explain (verbose, costs off) select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 
= ss1.q2 left join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; @@ -1067,7 +1067,7 @@ order by 1, 2, 3; select ss2.* from int8_tbl t1 left join (int8_tbl t2 left join - (select coalesce(q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join + (select coalesce(q1, q1) as x, * from int8_tbl t3) ss1 on t2.q1 = ss1.q2 left join lateral (select ss1.x as y, * from int8_tbl t4) ss2 on t2.q2 = ss2.q1) on t1.q2 = ss2.q1 order by 1, 2, 3; diff --git a/src/test/regress/sql/sysviews.sql b/src/test/regress/sql/sysviews.sql index d0917b6868e..66179f026b3 100644 --- a/src/test/regress/sql/sysviews.sql +++ b/src/test/regress/sql/sysviews.sql @@ -101,21 +101,3 @@ select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; -- One specific case we can check without much fear of breakage -- is the historical local-mean-time value used for America/Los_Angeles. select * from pg_timezone_abbrevs where abbrev = 'LMT'; - -DO $$ -DECLARE - bg_writer_pid int; - r RECORD; -BEGIN - SELECT pid from pg_stat_activity where backend_type='background writer' - INTO bg_writer_pid; - - select type, name, ident - from pg_get_process_memory_contexts(bg_writer_pid, false, 20) - where path = '{1}' into r; - RAISE NOTICE '%', r; - select type, name, ident - from pg_get_process_memory_contexts(pg_backend_pid(), false, 20) - where path = '{1}' into r; - RAISE NOTICE '%', r; -END $$; diff --git a/src/test/regress/sql/timestamp.sql b/src/test/regress/sql/timestamp.sql index 55f80530ea0..313757ed041 100644 --- a/src/test/regress/sql/timestamp.sql +++ b/src/test/regress/sql/timestamp.sql @@ -175,7 +175,9 @@ SELECT d1 - timestamp without time zone '1997-01-02' AS diff FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; SELECT date_trunc( 'week', timestamp '2004-02-29 15:44:17.71393' ) AS week_trunc; - +SELECT date_trunc( 'week', timestamp 'infinity' ) AS inf_trunc; +SELECT date_trunc( 'timezone', timestamp '2004-02-29 15:44:17.71393' ) AS notsupp_trunc; +SELECT date_trunc( 'timezone', timestamp 'infinity' ) AS notsupp_inf_trunc; SELECT date_trunc( 'ago', timestamp 'infinity' ) AS invalid_trunc; -- verify date_bin behaves the same as date_trunc for relevant intervals diff --git a/src/test/regress/sql/timestamptz.sql b/src/test/regress/sql/timestamptz.sql index caca3123f13..6ace851d169 100644 --- a/src/test/regress/sql/timestamptz.sql +++ b/src/test/regress/sql/timestamptz.sql @@ -217,15 +217,18 @@ SELECT d1 - timestamp with time zone '1997-01-02' AS diff FROM TIMESTAMPTZ_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; SELECT date_trunc( 'week', timestamp with time zone '2004-02-29 15:44:17.71393' ) AS week_trunc; +SELECT date_trunc( 'week', timestamp with time zone 'infinity' ) AS inf_trunc; +SELECT date_trunc( 'timezone', timestamp with time zone '2004-02-29 15:44:17.71393' ) AS notsupp_trunc; +SELECT date_trunc( 'timezone', timestamp with time zone 'infinity' ) AS notsupp_inf_trunc; SELECT date_trunc( 'ago', timestamp with time zone 'infinity' ) AS invalid_trunc; SELECT date_trunc('day', timestamp with time zone '2001-02-16 20:38:40+00', 'Australia/Sydney') as sydney_trunc; -- zone name SELECT date_trunc('day', timestamp with time zone '2001-02-16 20:38:40+00', 'GMT') as gmt_trunc; -- fixed-offset abbreviation SELECT date_trunc('day', timestamp with time zone '2001-02-16 20:38:40+00', 'VET') as vet_trunc; -- variable-offset abbreviation +SELECT 
date_trunc('timezone', timestamp with time zone 'infinity', 'GMT') AS notsupp_zone_trunc; +SELECT date_trunc( 'week', timestamp with time zone 'infinity', 'GMT') AS inf_zone_trunc; SELECT date_trunc('ago', timestamp with time zone 'infinity', 'GMT') AS invalid_zone_trunc; - - -- verify date_bin behaves the same as date_trunc for relevant intervals SELECT str, diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql index d3d242dd29b..d674b25c83b 100644 --- a/src/test/regress/sql/triggers.sql +++ b/src/test/regress/sql/triggers.sql @@ -1577,6 +1577,19 @@ drop table parted; drop function parted_trigfunc(); -- +-- Constraint triggers +-- +create constraint trigger crtr + after insert on foo not valid + for each row execute procedure foo (); +create constraint trigger crtr + after insert on foo no inherit + for each row execute procedure foo (); +create constraint trigger crtr + after insert on foo not enforced + for each row execute procedure foo (); + +-- -- Constraint triggers and partitioned tables create table parted_constr_ancestor (a int, b text) partition by range (b); @@ -1591,7 +1604,7 @@ create constraint trigger parted_trig after insert on parted_constr_ancestor deferrable for each row execute procedure trigger_notice_ab(); create constraint trigger parted_trig_two after insert on parted_constr - deferrable initially deferred + deferrable initially deferred enforced for each row when (bark(new.b) AND new.a % 2 = 1) execute procedure trigger_notice_ab(); @@ -2701,8 +2714,8 @@ drop function f(); -- Test who runs deferred trigger functions -- setup -create role regress_groot; -create role regress_outis; +create role regress_caller; +create role regress_fn_owner; create function whoami() returns trigger language plpgsql as $$ begin @@ -2710,7 +2723,7 @@ begin return null; end; $$; -alter function whoami() owner to regress_outis; +alter function whoami() owner to regress_fn_owner; create table defer_trig (id integer); grant insert on defer_trig to public; @@ -2721,10 +2734,10 @@ create constraint trigger whoami after insert on defer_trig -- deferred triggers must run as the user that queued the trigger begin; -set role regress_groot; +set role regress_caller; insert into defer_trig values (1); reset role; -set role regress_outis; +set role regress_fn_owner; insert into defer_trig values (2); reset role; commit; @@ -2732,7 +2745,7 @@ commit; -- security definer functions override the user who queued the trigger alter function whoami() security definer; begin; -set role regress_groot; +set role regress_caller; insert into defer_trig values (3); reset role; commit; @@ -2749,7 +2762,7 @@ end; $$; begin; -set role regress_groot; +set role regress_caller; insert into defer_trig values (4); reset role; commit; -- error expected @@ -2758,5 +2771,5 @@ select current_user = session_user; -- clean up drop table defer_trig; drop function whoami(); -drop role regress_outis; -drop role regress_groot; +drop role regress_fn_owner; +drop role regress_caller; diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql index c94dd83d306..df795759bb4 100644 --- a/src/test/regress/sql/type_sanity.sql +++ b/src/test/regress/sql/type_sanity.sql @@ -539,6 +539,7 @@ CREATE TABLE tab_core_types AS SELECT 'regtype'::regtype type, 'pg_monitor'::regrole, 'pg_class'::regclass::oid, + 'template1'::regdatabase, '(1,1)'::tid, '2'::xid, '3'::cid, '10:20:10,14,15'::txid_snapshot, '10:20:10,14,15'::pg_snapshot, diff --git a/src/test/ssl/meson.build 
b/src/test/ssl/meson.build index cf8b2b9303a..d8e0fb518e0 100644 --- a/src/test/ssl/meson.build +++ b/src/test/ssl/meson.build @@ -7,7 +7,7 @@ tests += { 'tap': { 'env': { 'with_ssl': ssl_library, - 'OPENSSL': openssl.found() ? openssl.path() : '', + 'OPENSSL': openssl.found() ? openssl.full_path() : '', }, 'tests': [ 't/001_ssltests.pl', diff --git a/src/test/ssl/t/001_ssltests.pl b/src/test/ssl/t/001_ssltests.pl index 2cb4d0ffd41..b2eb18d3e81 100644 --- a/src/test/ssl/t/001_ssltests.pl +++ b/src/test/ssl/t/001_ssltests.pl @@ -173,6 +173,13 @@ SKIP: ok( (@status = stat("$tempdir/key.txt")), "keylog file exists and returned status"); ok(@status && !($status[2] & 0006), "keylog file is not world readable"); + + # Connect should work with an incorrect sslkeylogfile, with the error to + # open the logfile printed to stderr + $node->connect_ok( + "$common_connstr sslrootcert=ssl/root+server_ca.crt sslkeylogfile=$tempdir/invalid/key.txt sslmode=require", + "connect with server root cert and incorrect sslkeylogfile path", + expected_stderr => qr/could not open/); } # The server should not accept non-SSL connections. diff --git a/src/test/ssl/t/SSL/Server.pm b/src/test/ssl/t/SSL/Server.pm index 33975b28e8c..efbd0dafaf6 100644 --- a/src/test/ssl/t/SSL/Server.pm +++ b/src/test/ssl/t/SSL/Server.pm @@ -200,7 +200,7 @@ sub configure_test_server_for_ssl $node->append_conf( 'postgresql.conf', <<EOF fsync=off -log_connections=on +log_connections=all log_hostname=on listen_addresses='$serverhost' log_statement=all @@ -318,7 +318,8 @@ sub switch_server_cert $node->append_conf('sslconfig.conf', "ssl=on"); $node->append_conf('sslconfig.conf', $backend->set_server_cert(\%params)); # use lists of ECDH curves and cipher suites for syntax testing - $node->append_conf('sslconfig.conf', 'ssl_groups=X25519:prime256v1:secp521r1'); + $node->append_conf('sslconfig.conf', + 'ssl_groups=X25519:prime256v1:secp521r1'); $node->append_conf('sslconfig.conf', 'ssl_tls13_ciphers=TLS_AES_256_GCM_SHA384:TLS_AES_128_GCM_SHA256'); diff --git a/src/test/subscription/t/007_ddl.pl b/src/test/subscription/t/007_ddl.pl index 7d12bcbddb6..2a45fb13739 100644 --- a/src/test/subscription/t/007_ddl.pl +++ b/src/test/subscription/t/007_ddl.pl @@ -70,7 +70,8 @@ ok( $stderr =~ ); # Cleanup -$node_publisher->safe_psql('postgres', qq[ +$node_publisher->safe_psql( + 'postgres', qq[ DROP PUBLICATION mypub; SELECT pg_drop_replication_slot('mysub'); ]); @@ -86,32 +87,38 @@ sub test_swap my ($table_name, $pubname, $appname) = @_; # Confirms tuples can be replicated - $node_publisher->safe_psql('postgres', "INSERT INTO $table_name VALUES (1);"); + $node_publisher->safe_psql('postgres', + "INSERT INTO $table_name VALUES (1);"); $node_publisher->wait_for_catchup($appname); my $result = - $node_subscriber->safe_psql('postgres', "SELECT a FROM $table_name"); - is($result, qq(1), 'check replication worked well before renaming a publication'); + $node_subscriber->safe_psql('postgres', "SELECT a FROM $table_name"); + is($result, qq(1), + 'check replication worked well before renaming a publication'); # Swap the name of publications; $pubname <-> pub_empty - $node_publisher->safe_psql('postgres', qq[ + $node_publisher->safe_psql( + 'postgres', qq[ ALTER PUBLICATION $pubname RENAME TO tap_pub_tmp; ALTER PUBLICATION pub_empty RENAME TO $pubname; ALTER PUBLICATION tap_pub_tmp RENAME TO pub_empty; ]); # Insert the data again - $node_publisher->safe_psql('postgres', "INSERT INTO $table_name VALUES (2);"); + $node_publisher->safe_psql('postgres', + "INSERT INTO 
$table_name VALUES (2);"); $node_publisher->wait_for_catchup($appname); # Confirms the second tuple won't be replicated because $pubname does not # contains relations anymore. $result = - $node_subscriber->safe_psql('postgres', "SELECT a FROM $table_name ORDER BY a"); + $node_subscriber->safe_psql('postgres', + "SELECT a FROM $table_name ORDER BY a"); is($result, qq(1), 'check the tuple inserted after the RENAME was not replicated'); # Restore the name of publications because it can be called several times - $node_publisher->safe_psql('postgres', qq[ + $node_publisher->safe_psql( + 'postgres', qq[ ALTER PUBLICATION $pubname RENAME TO tap_pub_tmp; ALTER PUBLICATION pub_empty RENAME TO $pubname; ALTER PUBLICATION tap_pub_tmp RENAME TO pub_empty; @@ -124,7 +131,8 @@ $node_publisher->safe_psql('postgres', $ddl); $node_subscriber->safe_psql('postgres', $ddl); # Create publications and a subscription -$node_publisher->safe_psql('postgres', qq[ +$node_publisher->safe_psql( + 'postgres', qq[ CREATE PUBLICATION pub_empty; CREATE PUBLICATION pub_for_tab FOR TABLE test1; CREATE PUBLICATION pub_for_all_tables FOR ALL TABLES; @@ -139,19 +147,20 @@ test_swap('test1', 'pub_for_tab', 'tap_sub'); # Switches a publication which includes all tables $node_subscriber->safe_psql('postgres', - "ALTER SUBSCRIPTION tap_sub SET PUBLICATION pub_for_all_tables;" -); + "ALTER SUBSCRIPTION tap_sub SET PUBLICATION pub_for_all_tables;"); $node_subscriber->wait_for_subscription_sync($node_publisher, 'tap_sub'); # Confirms RENAME command works well for ALL TABLES publication test_swap('test2', 'pub_for_all_tables', 'tap_sub'); # Cleanup -$node_publisher->safe_psql('postgres', qq[ +$node_publisher->safe_psql( + 'postgres', qq[ DROP PUBLICATION pub_empty, pub_for_tab, pub_for_all_tables; DROP TABLE test1, test2; ]); -$node_subscriber->safe_psql('postgres', qq[ +$node_subscriber->safe_psql( + 'postgres', qq[ DROP SUBSCRIPTION tap_sub; DROP TABLE test1, test2; ]); diff --git a/src/test/subscription/t/013_partition.pl b/src/test/subscription/t/013_partition.pl index 61b0cb4aa1a..4f78dd48815 100644 --- a/src/test/subscription/t/013_partition.pl +++ b/src/test/subscription/t/013_partition.pl @@ -51,8 +51,7 @@ $node_subscriber1->safe_psql('postgres', ); # make a BRIN index to test aminsertcleanup logic in subscriber $node_subscriber1->safe_psql('postgres', - "CREATE INDEX tab1_c_brin_idx ON tab1 USING brin (c)" -); + "CREATE INDEX tab1_c_brin_idx ON tab1 USING brin (c)"); $node_subscriber1->safe_psql('postgres', "CREATE TABLE tab1_1 (b text, c text DEFAULT 'sub1_tab1', a int NOT NULL)" ); diff --git a/src/test/subscription/t/021_twophase.pl b/src/test/subscription/t/021_twophase.pl index 61c427aed21..b8e4242d1f1 100644 --- a/src/test/subscription/t/021_twophase.pl +++ b/src/test/subscription/t/021_twophase.pl @@ -373,7 +373,14 @@ $result = $node_publisher->safe_psql('postgres', "SELECT count(*) FROM tab_copy;"); is($result, qq(6), 'publisher inserted data'); +# Wait for both subscribers to catchup $node_publisher->wait_for_catchup($appname_copy); +$node_publisher->wait_for_catchup($appname); + +# Make sure there are no prepared transactions on the subscriber +$result = $node_subscriber->safe_psql('postgres', + "SELECT count(*) FROM pg_prepared_xacts;"); +is($result, qq(0), 'should be no prepared transactions on subscriber'); $result = $node_subscriber->safe_psql('postgres', "SELECT count(*) FROM tab_copy;"); diff --git a/src/test/subscription/t/024_add_drop_pub.pl b/src/test/subscription/t/024_add_drop_pub.pl index 
e995d8b3839..b396abe5599 100644 --- a/src/test/subscription/t/024_add_drop_pub.pl +++ b/src/test/subscription/t/024_add_drop_pub.pl @@ -108,11 +108,12 @@ $node_publisher->poll_query_until('postgres', my $offset = -s $node_publisher->logfile; -$node_publisher->safe_psql('postgres',"INSERT INTO tab_3 values(1)"); +$node_publisher->safe_psql('postgres', "INSERT INTO tab_3 values(1)"); # Verify that a warning is logged. $node_publisher->wait_for_log( - qr/WARNING: ( [A-Z0-9]+:)? skipped loading publication: tap_pub_3/, $offset); + qr/WARNING: ( [A-Z0-9]+:)? skipped loading publication "tap_pub_3"/, + $offset); $node_publisher->safe_psql('postgres', "CREATE PUBLICATION tap_pub_3 FOR TABLE tab_3"); @@ -128,10 +129,11 @@ $node_publisher->wait_for_catchup('tap_sub'); # Verify that the insert operation gets replicated to subscriber after # publication is created. -$result = $node_subscriber->safe_psql('postgres', - "SELECT * FROM tab_3"); -is($result, qq(1 -2), 'check that the incremental data is replicated after the publication is created'); +$result = $node_subscriber->safe_psql('postgres', "SELECT * FROM tab_3"); +is( $result, qq(1 +2), + 'check that the incremental data is replicated after the publication is created' +); # shutdown $node_subscriber->stop('fast'); diff --git a/src/test/subscription/t/035_conflicts.pl b/src/test/subscription/t/035_conflicts.pl index 2a7a8239a29..36aeb14c563 100644 --- a/src/test/subscription/t/035_conflicts.pl +++ b/src/test/subscription/t/035_conflicts.pl @@ -1,6 +1,6 @@ # Copyright (c) 2025, PostgreSQL Global Development Group -# Test the conflict detection of conflict type 'multiple_unique_conflicts'. +# Test conflicts in logical replication use strict; use warnings FATAL => 'all'; use PostgreSQL::Test::Cluster; @@ -18,7 +18,7 @@ $node_publisher->start; # Create a subscriber node my $node_subscriber = PostgreSQL::Test::Cluster->new('subscriber'); -$node_subscriber->init; +$node_subscriber->init(allows_streaming => 'logical'); $node_subscriber->start; # Create a table on publisher @@ -26,7 +26,8 @@ $node_publisher->safe_psql('postgres', "CREATE TABLE conf_tab (a int PRIMARY KEY, b int UNIQUE, c int UNIQUE);"); $node_publisher->safe_psql('postgres', - "CREATE TABLE conf_tab_2 (a int PRIMARY KEY, b int UNIQUE, c int UNIQUE);"); + "CREATE TABLE conf_tab_2 (a int PRIMARY KEY, b int UNIQUE, c int UNIQUE);" +); # Create same table on subscriber $node_subscriber->safe_psql('postgres', @@ -145,4 +146,267 @@ $node_subscriber->wait_for_log( pass('multiple_unique_conflicts detected on a leaf partition during insert'); +############################################################################### +# Set up bidirectional logical replication between node_A & node_B +############################################################################### + +# Initialize nodes. Enable track_commit_timestamp on both nodes to detect +# the conflict when attempting to update a row that was previously modified by +# a different origin. + +# node_A. Increase the log_min_messages setting to DEBUG2 to debug test +# failures. Disable autovacuum to avoid generating xids that could affect the +# replication slot's xmin value.
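In SQL terms, the Perl below sets up one subscription in each direction, both with origin = none; only the subscription on node B (pulling from node A) retains dead tuples. Roughly (connection strings elided):

CREATE SUBSCRIPTION tap_sub_b_a CONNECTION '...' PUBLICATION tap_pub_A
    WITH (origin = none, retain_dead_tuples = true);
CREATE SUBSCRIPTION tap_sub_a_b CONNECTION '...' PUBLICATION tap_pub_B
    WITH (origin = none, copy_data = off);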
+my $node_A = $node_publisher; +$node_A->append_conf( + 'postgresql.conf', + qq{track_commit_timestamp = on + autovacuum = off + log_min_messages = 'debug2'}); +$node_A->restart; + +# node_B +my $node_B = $node_subscriber; +$node_B->append_conf('postgresql.conf', "track_commit_timestamp = on"); +$node_B->restart; + +# Create table on node_A +$node_A->safe_psql('postgres', "CREATE TABLE tab (a int PRIMARY KEY, b int)"); + +# Create the same table on node_B +$node_B->safe_psql('postgres', "CREATE TABLE tab (a int PRIMARY KEY, b int)"); + +my $subname_AB = 'tap_sub_a_b'; +my $subname_BA = 'tap_sub_b_a'; + +# Setup logical replication +# node_A (pub) -> node_B (sub) +my $node_A_connstr = $node_A->connstr . ' dbname=postgres'; +$node_A->safe_psql('postgres', "CREATE PUBLICATION tap_pub_A FOR TABLE tab"); +$node_B->safe_psql( + 'postgres', " + CREATE SUBSCRIPTION $subname_BA + CONNECTION '$node_A_connstr application_name=$subname_BA' + PUBLICATION tap_pub_A + WITH (origin = none, retain_dead_tuples = true)"); + +# node_B (pub) -> node_A (sub) +my $node_B_connstr = $node_B->connstr . ' dbname=postgres'; +$node_B->safe_psql('postgres', "CREATE PUBLICATION tap_pub_B FOR TABLE tab"); +$node_A->safe_psql( + 'postgres', " + CREATE SUBSCRIPTION $subname_AB + CONNECTION '$node_B_connstr application_name=$subname_AB' + PUBLICATION tap_pub_B + WITH (origin = none, copy_data = off)"); + +# Wait for initial table sync to finish +$node_A->wait_for_subscription_sync($node_B, $subname_AB); +$node_B->wait_for_subscription_sync($node_A, $subname_BA); + +is(1, 1, 'Bidirectional replication setup is complete'); + +# Confirm that the conflict detection slot is created on Node B and the xmin +# value is valid. +ok( $node_B->poll_query_until( + 'postgres', + "SELECT xmin IS NOT NULL from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is valid on Node B"); + +################################################## +# Check that the retain_dead_tuples option can be enabled only for disabled +# subscriptions. Validate the NOTICE message during the subscription DDL, and +# ensure the conflict detection slot is created upon enabling the +# retain_dead_tuples option. +################################################## + +# Alter retain_dead_tuples for enabled subscription +my ($cmdret, $stdout, $stderr) = $node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (retain_dead_tuples = true)"); +ok( $stderr =~ + /ERROR: cannot set option \"retain_dead_tuples\" for enabled subscription/, + "altering retain_dead_tuples is not allowed for enabled subscription"); + +# Disable the subscription +$node_A->psql('postgres', "ALTER SUBSCRIPTION $subname_AB DISABLE;"); + +# Wait for the apply worker to stop +$node_A->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" +); + +# Enable retain_dead_tuples for disabled subscription +($cmdret, $stdout, $stderr) = $node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (retain_dead_tuples = true);"); +ok( $stderr =~ + /NOTICE: deleted rows to detect conflicts would not be removed until the subscription is enabled/, + "altering retain_dead_tuples is allowed for disabled subscription"); + +# Re-enable the subscription +$node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB ENABLE;"); + +# Confirm that the conflict detection slot is created on Node A and the xmin +# value is valid. 
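The poll below runs the same probe already used for node B, this time on node A:

SELECT xmin IS NOT NULL
    FROM pg_replication_slots
    WHERE slot_name = 'pg_conflict_detection';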
+ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin IS NOT NULL from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is valid on Node A"); + +################################################## +# Check the WARNING when changing the origin to ANY, if retain_dead_tuples is +# enabled. This warns of the possibility of receiving changes from origins +# other than the publisher. +################################################## + +($cmdret, $stdout, $stderr) = $node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (origin = any);"); +ok( $stderr =~ + /WARNING: subscription "tap_sub_a_b" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins/, + "warn of the possibility of receiving changes from origins other than the publisher"); + +# Reset the origin to none +$node_A->psql('postgres', + "ALTER SUBSCRIPTION $subname_AB SET (origin = none);"); + +############################################################################### +# Check that dead tuples on node A cannot be cleaned by VACUUM until the +# concurrent transactions on Node B have been applied and flushed on Node A. +# Also, check that an update_deleted conflict is detected when updating a row +# that was deleted by a different origin. +############################################################################### + +# Insert a record +$node_A->safe_psql('postgres', "INSERT INTO tab VALUES (1, 1), (2, 2);"); +$node_A->wait_for_catchup($subname_BA); + +my $result = $node_B->safe_psql('postgres', "SELECT * FROM tab;"); +is($result, qq(1|1 +2|2), 'check replicated insert on node B'); + +# Disable the logical replication from node B to node A +$node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB DISABLE"); + +# Wait for the apply worker to stop +$node_A->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" +); + +my $log_location = -s $node_B->logfile; + +$node_B->safe_psql('postgres', "UPDATE tab SET b = 3 WHERE a = 1;"); +$node_A->safe_psql('postgres', "DELETE FROM tab WHERE a = 1;"); + +($cmdret, $stdout, $stderr) = $node_A->psql( + 'postgres', qq(VACUUM (verbose) public.tab;) +); + +ok( $stderr =~ + qr/1 are dead but not yet removable/, + 'the deleted column is non-removable'); + +# Ensure the DELETE is replayed on Node B +$node_A->wait_for_catchup($subname_BA); + +# Check the conflict detected on Node B +my $logfile = slurp_file($node_B->logfile(), $log_location); +ok( $logfile =~ + qr/conflict detected on relation "public.tab": conflict=delete_origin_differs.* +.*DETAIL:.* Deleting the row that was modified locally in transaction [0-9]+ at .* +.*Existing local tuple \(1, 3\); replica identity \(a\)=\(1\)/, + 'delete target row was modified in tab'); + +$log_location = -s $node_A->logfile; + +$node_A->safe_psql( + 'postgres', "ALTER SUBSCRIPTION $subname_AB ENABLE;"); +$node_B->wait_for_catchup($subname_AB); + +$logfile = slurp_file($node_A->logfile(), $log_location); +ok( $logfile =~ + qr/conflict detected on relation "public.tab": conflict=update_deleted.* +.*DETAIL:.* The row to be updated was deleted locally in transaction [0-9]+ at .* +.*Remote tuple \(1, 3\); replica identity \(a\)=\(1\)/, + 'update target row was deleted in tab'); + +# Remember the next transaction ID to be assigned +my $next_xid = $node_A->safe_psql('postgres', "SELECT txid_current() + 1;"); + +# Confirm that the xmin value is advanced to 
the latest nextXid. If no +# transactions are running, the apply worker selects nextXid as the candidate +# for the non-removable xid. See GetOldestActiveTransactionId(). +ok( $node_A->poll_query_until( + 'postgres', + "SELECT xmin = $next_xid from pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the xmin value of slot 'pg_conflict_detection' is updated on Node A"); + +# Confirm that the dead tuple can be removed now +($cmdret, $stdout, $stderr) = $node_A->psql( + 'postgres', qq(VACUUM (verbose) public.tab;) +); + +ok( $stderr =~ + qr/1 removed, 1 remain, 0 are dead but not yet removable/, + 'the deleted column is removed'); + +############################################################################### +# Ensure that the deleted tuple needed to detect an update_deleted conflict is +# accessible via a sequential table scan. +############################################################################### + +# Drop the primary key from tab on node A and set REPLICA IDENTITY to FULL to +# enforce sequential scanning of the table. +$node_A->safe_psql('postgres', "ALTER TABLE tab REPLICA IDENTITY FULL"); +$node_B->safe_psql('postgres', "ALTER TABLE tab REPLICA IDENTITY FULL"); +$node_A->safe_psql('postgres', "ALTER TABLE tab DROP CONSTRAINT tab_pkey;"); + +# Disable the logical replication from node B to node A +$node_A->safe_psql('postgres', "ALTER SUBSCRIPTION $subname_AB DISABLE"); + +# Wait for the apply worker to stop +$node_A->poll_query_until('postgres', + "SELECT count(*) = 0 FROM pg_stat_activity WHERE backend_type = 'logical replication apply worker'" +); + +$node_B->safe_psql('postgres', "UPDATE tab SET b = 4 WHERE a = 2;"); +$node_A->safe_psql('postgres', "DELETE FROM tab WHERE a = 2;"); + +$log_location = -s $node_A->logfile; + +$node_A->safe_psql( + 'postgres', "ALTER SUBSCRIPTION $subname_AB ENABLE;"); +$node_B->wait_for_catchup($subname_AB); + +$logfile = slurp_file($node_A->logfile(), $log_location); +ok( $logfile =~ + qr/conflict detected on relation "public.tab": conflict=update_deleted.* +.*DETAIL:.* The row to be updated was deleted locally in transaction [0-9]+ at .* +.*Remote tuple \(2, 4\); replica identity full \(2, 2\)/, + 'update target row was deleted in tab'); + +############################################################################### +# Check that the replication slot pg_conflict_detection is dropped after +# removing all the subscriptions. 
+############################################################################### + +$node_B->safe_psql( + 'postgres', "DROP SUBSCRIPTION $subname_BA"); + +ok( $node_B->poll_query_until( + 'postgres', + "SELECT count(*) = 0 FROM pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the slot 'pg_conflict_detection' has been dropped on Node B"); + +$node_A->safe_psql( + 'postgres', "DROP SUBSCRIPTION $subname_AB"); + +ok( $node_A->poll_query_until( + 'postgres', + "SELECT count(*) = 0 FROM pg_replication_slots WHERE slot_name = 'pg_conflict_detection'" + ), + "the slot 'pg_conflict_detection' has been dropped on Node A"); + done_testing(); diff --git a/src/tools/ci/pg_ci_base.conf b/src/tools/ci/pg_ci_base.conf index d8faa9c26c1..695e0a0d6ec 100644 --- a/src/tools/ci/pg_ci_base.conf +++ b/src/tools/ci/pg_ci_base.conf @@ -8,7 +8,7 @@ max_prepared_transactions = 10 # Settings that make logs more useful log_autovacuum_min_duration = 0 log_checkpoints = true -log_connections = true +log_connections = all log_disconnections = true -log_line_prefix = '%m [%p][%b] %q[%a][%v:%x] ' +log_line_prefix = '%m %b[%p] %q%a ' log_lock_waits = true diff --git a/src/tools/git_changelog b/src/tools/git_changelog index b8bd874f208..c25e399a87f 100755 --- a/src/tools/git_changelog +++ b/src/tools/git_changelog @@ -59,6 +59,7 @@ require IPC::Open2; # (We could get this from "git branches", but not worth the trouble.) # NB: master must be first! my @BRANCHES = qw(master + REL_18_STABLE REL_17_STABLE REL_16_STABLE REL_15_STABLE REL_14_STABLE REL_13_STABLE REL_12_STABLE REL_11_STABLE REL_10_STABLE REL9_6_STABLE REL9_5_STABLE REL9_4_STABLE REL9_3_STABLE REL9_2_STABLE REL9_1_STABLE REL9_0_STABLE diff --git a/src/tools/pgflex b/src/tools/pgflex index 3986b06874e..b8d9aa0086f 100755 --- a/src/tools/pgflex +++ b/src/tools/pgflex @@ -48,7 +48,7 @@ os.chdir(args.privatedir) # contents. Set FLEX_TMP_DIR to the target private directory to avoid # that. That environment variable isn't consulted on other platforms, so we # don't even need to make this conditional. -env = {'FLEX_TMP_DIR': args.privatedir} +os.environ['FLEX_TMP_DIR'] = args.privatedir # build flex invocation command = [args.flex, '-o', args.output_file] @@ -58,7 +58,7 @@ command += args.flex_flags command += [args.input_file] # create .c file from .l file -sp = subprocess.run(command, env=env) +sp = subprocess.run(command) if sp.returncode != 0: sys.exit(sp.returncode) diff --git a/src/tools/pgindent/pgindent b/src/tools/pgindent/pgindent index 54e138b598d..b7d71808924 100755 --- a/src/tools/pgindent/pgindent +++ b/src/tools/pgindent/pgindent @@ -73,11 +73,14 @@ if ($sourcedir) # might make them so. For the moment we just hardwire a list of names # to add and a list of names to exclude; eventually this may need to be # easier to configure. Note that the typedefs need trailing newlines. 
-my @additional = ("bool\n"); +my @additional = map { "$_\n" } qw( + bool regex_t regmatch_t regoff +); my %excluded = map { +"$_\n" => 1 } qw( - ANY FD_SET U abs allocfunc boolean date digit ilist interval iterator other - pointer printfunc reference string timestamp type wrap + FD_SET LookupSet boolean date duration + element_type inquiry iterator other + pointer reference rep string timestamp type wrap ); # globals diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9ea573fae21..e6f2e93b2d6 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -6,6 +6,7 @@ ASN1_INTEGER ASN1_OBJECT ASN1_OCTET_STRING ASN1_STRING +ATAlterConstraint AV A_ArrayExpr A_Const @@ -47,7 +48,6 @@ AggSplit AggState AggStatePerAgg AggStatePerGroup -AggStatePerGroupData AggStatePerHash AggStatePerPhase AggStatePerTrans @@ -55,9 +55,6 @@ AggStrategy AggTransInfo Aggref AggregateInstrumentation -AioWorkerControl -AioWorkerSlot -AioWorkerSubmissionQueue AlenState Alias AllocBlock @@ -74,6 +71,7 @@ AlterDatabaseSetStmt AlterDatabaseStmt AlterDefaultPrivilegesStmt AlterDomainStmt +AlterDomainType AlterEnumStmt AlterEventTrigStmt AlterExtensionContentsStmt @@ -161,7 +159,6 @@ ArrayType AsyncQueueControl AsyncQueueEntry AsyncRequest -ATAlterConstraint AttInMetadata AttStatsSlot AttoptCacheEntry @@ -174,8 +171,8 @@ AttrNumber AttributeOpts AuthRequest AuthToken -AutoPrewarmSharedState AutoPrewarmReadStreamData +AutoPrewarmSharedState AutoVacOpts AutoVacuumShmemStruct AutoVacuumWorkItem @@ -222,7 +219,6 @@ BTScanInsertData BTScanKeyPreproc BTScanOpaque BTScanOpaqueData -BTScanPos BTScanPosData BTScanPosItem BTShared @@ -270,8 +266,8 @@ BitmapAndPath BitmapAndState BitmapHeapPath BitmapHeapScan -BitmapHeapScanInstrumentation BitmapHeapScanDesc +BitmapHeapScanInstrumentation BitmapHeapScanState BitmapIndexScan BitmapIndexScanState @@ -341,8 +337,8 @@ BufFile Buffer BufferAccessStrategy BufferAccessStrategyType -BufferCacheNumaRec BufferCacheNumaContext +BufferCacheNumaRec BufferCachePagesContext BufferCachePagesRec BufferDesc @@ -382,6 +378,9 @@ CTEMaterialize CTESearchClause CURL CURLM +CURLMcode +CURLMsg +CURLcode CURLoption CV CachedExpression @@ -520,7 +519,6 @@ CopyFormatOptions CopyFromRoutine CopyFromState CopyFromStateData -CopyHeaderChoice CopyInsertMethod CopyLogVerbosityChoice CopyMethod @@ -600,6 +598,7 @@ DR_intorel DR_printtup DR_sqlfunction DR_transientrel +DSMREntryType DSMRegistryCtxStruct DSMRegistryEntry DWORD @@ -628,6 +627,7 @@ DefElem DefElemAction DefaultACLInfo DefineStmt +DefnDumperPtr DeleteStmt DependencyGenerator DependencyGeneratorData @@ -677,9 +677,8 @@ DumpableObjectType DumpableObjectWithAcl DynamicFileList DynamicZoneAbbrev -EC_KEY -ECDerivesKey ECDerivesEntry +ECDerivesKey EDGE ENGINE EOM_flatten_into_method @@ -761,10 +760,12 @@ ExpandedRange ExpandedRecordFieldInfo ExpandedRecordHeader ExplainDirectModify_function +ExplainExtensionOption ExplainForeignModify_function ExplainForeignScan_function ExplainFormat ExplainOneQuery_hook_type +ExplainOptionHandler ExplainSerializeOption ExplainState ExplainStmt @@ -792,6 +793,7 @@ FDWCollateState FD_SET FILE FILETIME +FPI FSMAddress FSMPage FSMPageData @@ -801,12 +803,12 @@ FastPathStrongRelationLockData FdwInfo FdwRoutine FetchDirection +FetchDirectionKeywords FetchStmt FieldSelect FieldStore File FileBackupMethod -FileCopyMethod FileFdwExecutionState FileFdwPlanState FileNameMap @@ -1190,6 +1192,7 @@ HeapCheckContext HeapCheckReadStreamData HeapPageFreeze HeapScanDesc 
+HeapScanDescData HeapTuple HeapTupleData HeapTupleFields @@ -1249,6 +1252,7 @@ IndexClause IndexClauseSet IndexDeleteCounts IndexDeletePrefetchState +IndexDoCheckCallback IndexElem IndexFetchHeapData IndexFetchTableData @@ -1279,13 +1283,15 @@ InheritableSocket InitSampleScan_function InitializeDSMForeignScan_function InitializeWorkerForeignScan_function +InjIoErrorState InjectionPointCacheEntry InjectionPointCallback InjectionPointCondition InjectionPointConditionType +InjectionPointData InjectionPointEntry -InjectionPointsCtl InjectionPointSharedState +InjectionPointsCtl InlineCodeBlock InsertStmt Instrumentation @@ -1302,7 +1308,6 @@ IntoClause InvalMessageArray InvalidationInfo InvalidationMsgsGroup -IoMethod IoMethodOps IpcMemoryId IpcMemoryKey @@ -1492,8 +1497,7 @@ LLVMOrcResourceTrackerRef LLVMOrcSymbolStringPoolRef LLVMOrcThreadSafeContextRef LLVMOrcThreadSafeModuleRef -LLVMPassManagerBuilderRef -LLVMPassManagerRef +LLVMPassBuilderOptionsRef LLVMTargetMachineRef LLVMTargetRef LLVMTypeRef @@ -1563,6 +1567,7 @@ LoadStmt LocalBufferLookupEnt LocalPgBackendStatus LocalTransactionId +Location LocationIndex LocationLen LockAcquireResult @@ -1582,7 +1587,6 @@ LockTupleMode LockViewRecurse_context LockWaitPolicy LockingClause -LogConnectionOption LogOpts LogStmtLevel LogicalDecodeBeginCB @@ -1633,6 +1637,7 @@ LogicalSlotInfo LogicalSlotInfoArr LogicalTape LogicalTapeSet +LookupSet LsnReadQueue LsnReadQueueNextFun LsnReadQueueNextStatus @@ -1657,8 +1662,8 @@ ManyTestResourceKind Material MaterialPath MaterialState -MdfdVec MdPathStr +MdfdVec Memoize MemoizeEntry MemoizeInstrumentation @@ -1672,12 +1677,9 @@ MemoryContextCallback MemoryContextCallbackFunction MemoryContextCounters MemoryContextData +MemoryContextId MemoryContextMethodID MemoryContextMethods -MemoryStatsBackendState -MemoryStatsContextId -MemoryStatsCtl -MemoryStatsEntry MemoryStatsPrintFunc MergeAction MergeActionState @@ -1734,6 +1736,9 @@ Name NameData NameHashEntry NamedArgExpr +NamedDSAState +NamedDSHState +NamedDSMState NamedLWLockTranche NamedLWLockTrancheRequest NamedTuplestoreScan @@ -1754,6 +1759,7 @@ NonEmptyRange Notification NotificationList NotifyStmt +NotnullHashEntry Nsrt NtDllRoutine NtFlushBuffersFileEx_t @@ -1769,6 +1775,7 @@ NumericSortSupport NumericSumAccum NumericVar OAuthValidatorCallbacks +OAuthValidatorModuleInit OM_uint32 OP OSAPerGroupState @@ -1838,7 +1845,6 @@ PGCALL2 PGCRYPTO_SHA_t PGChecksummablePage PGContextVisibility -PGErrorVerbosity PGEvent PGEventConnDestroy PGEventConnReset @@ -1876,7 +1882,6 @@ PGTargetServerType PGTernaryBool PGTransactionStatusType PGVerbosity -PG_Locale_Strategy PG_Lock_Status PG_init_t PGauthData @@ -1908,7 +1913,6 @@ PLpgSQL_exception PLpgSQL_exception_block PLpgSQL_execstate PLpgSQL_expr -PLpgSQL_func_hashkey PLpgSQL_function PLpgSQL_getdiag_kind PLpgSQL_if_elsif @@ -2159,10 +2163,10 @@ PermutationStepBlockerType PgAioBackend PgAioCtl PgAioHandle -PgAioHandleCallbackID -PgAioHandleCallbackStage PgAioHandleCallbackComplete +PgAioHandleCallbackID PgAioHandleCallbackReport +PgAioHandleCallbackStage PgAioHandleCallbacks PgAioHandleCallbacksEntry PgAioHandleFlags @@ -2175,8 +2179,12 @@ PgAioReturn PgAioTargetData PgAioTargetID PgAioTargetInfo +PgAioUringCaps PgAioUringContext PgAioWaitRef +PgAioWorkerControl +PgAioWorkerSlot +PgAioWorkerSubmissionQueue PgArchData PgBackendGSSStatus PgBackendSSLStatus @@ -2207,9 +2215,9 @@ PgStatShared_Common PgStatShared_Database PgStatShared_Function PgStatShared_HashEntry +PgStatShared_IO PgStatShared_InjectionPoint 
PgStatShared_InjectionPointFixed -PgStatShared_IO PgStatShared_Relation PgStatShared_ReplSlot PgStatShared_SLRU @@ -2230,7 +2238,6 @@ PgStat_FunctionCallUsage PgStat_FunctionCounts PgStat_HashKey PgStat_IO -PgStat_Kind PgStat_KindInfo PgStat_LocalState PgStat_PendingDroppedStatsItem @@ -2268,6 +2275,7 @@ PlanInvalItem PlanRowMark PlanState PlannedStmt +PlannedStmtOrigin PlannerGlobal PlannerInfo PlannerParamItem @@ -2358,12 +2366,12 @@ PushFilter PushFilterOps PushFunction PyCFunction -PyMappingMethods PyMethodDef PyModuleDef PyObject -PySequenceMethods PyTypeObject +PyType_Slot +PyType_Spec Py_ssize_t QPRS_STATE QTN2QTState @@ -2477,6 +2485,7 @@ RelOptInfo RelOptKind RelPathStr RelStatsInfo +RelSyncCallbackFunction RelToCheck RelToCluster RelabelType @@ -2558,6 +2567,8 @@ RestrictInfo Result ResultRelInfo ResultState +RetainDeadTuplesData +RetainDeadTuplesPhase ReturnSetInfo ReturnStmt ReturningClause @@ -2629,7 +2640,6 @@ SQLDropObject SQLFunctionCache SQLFunctionCachePtr SQLFunctionHashEntry -SQLFunctionLink SQLFunctionParseInfo SQLFunctionParseInfoPtr SQLValueFunction @@ -2641,6 +2651,7 @@ STARTUPINFO STRLEN SV SYNCHRONIZATION_BARRIER +SYSTEM_INFO SampleScan SampleScanGetSampleSize_function SampleScanState @@ -2728,6 +2739,7 @@ SharedIncrementalSortInfo SharedIndexScanInstrumentation SharedInvalCatalogMsg SharedInvalCatcacheMsg +SharedInvalRelSyncMsg SharedInvalRelcacheMsg SharedInvalRelmapMsg SharedInvalSmgrMsg @@ -2767,7 +2779,7 @@ SingleBoundSortItem Size SkipPages SkipSupport -SkipSupportData +SkipSupportIncDec SlabBlock SlabContext SlabSlot @@ -2993,6 +3005,7 @@ TarMethodData TarMethodFile TargetEntry TclExceptionNameMap +Tcl_CmdInfo Tcl_DString Tcl_FileProc Tcl_HashEntry @@ -3000,8 +3013,10 @@ Tcl_HashTable Tcl_Interp Tcl_NotifierProcs Tcl_Obj +Tcl_Size Tcl_Time TempNamespaceStatus +TestDSMRegistryHashEntry TestDSMRegistryStruct TestDecodingData TestDecodingTxnData @@ -3145,6 +3160,7 @@ UnicodeNormalizationQC Unique UniquePath UniquePathMethod +UniqueRelInfo UniqueState UnlistenStmt UnresolvedTup @@ -3175,8 +3191,11 @@ VacuumRelation VacuumStmt ValidIOData ValidateIndexState -ValidatorModuleState ValidatorModuleResult +ValidatorModuleState +ValidatorShutdownCB +ValidatorStartupCB +ValidatorValidateCB ValuesScan ValuesScanState Var @@ -3381,10 +3400,9 @@ _resultmap _stringlist access_vector_t acquireLocksOnSubLinks_context -add_nulling_relids_context addFkConstraintSides +add_nulling_relids_context adjust_appendrel_attrs_context -allocfunc amadjustmembers_function ambeginscan_function ambuild_function @@ -3396,6 +3414,7 @@ amcostestimate_function amendscan_function amestimateparallelscan_function amgetbitmap_function +amgettreeheight_function amgettuple_function aminitparallelscan_function aminsert_function @@ -3406,13 +3425,27 @@ amparallelrescan_function amproperty_function amrescan_function amrestrpos_function -amtranslate_strategy_function amtranslatestrategy; -amtranslate_cmptype_function amtranslatecmptype; +amtranslate_cmptype_function +amtranslate_strategy_function amvacuumcleanup_function amvalidate_function array_iter array_unnest_fctx assign_collations_context +astreamer +astreamer_archive_context +astreamer_extractor +astreamer_gzip_decompressor +astreamer_gzip_writer +astreamer_lz4_frame +astreamer_member +astreamer_ops +astreamer_plain_writer +astreamer_recovery_injector +astreamer_tar_archiver +astreamer_tar_parser +astreamer_verify +astreamer_zstd_frame auth_password_hook_typ autovac_table av_relation @@ -3439,20 +3472,6 @@ bbsink_shell bbsink_state bbsink_throttle 
bbsink_zstd -astreamer -astreamer_archive_context -astreamer_extractor -astreamer_gzip_decompressor -astreamer_gzip_writer -astreamer_lz4_frame -astreamer_member -astreamer_ops -astreamer_plain_writer -astreamer_recovery_injector -astreamer_tar_archiver -astreamer_tar_parser -astreamer_verify -astreamer_zstd_frame bgworker_main_type bh_node_type binaryheap @@ -3467,6 +3486,8 @@ bloom_filter boolKEY brin_column_state brin_serialize_callback_type +btree_gin_convert_function +btree_gin_leftmost_function bytea cached_re_str canonicalize_state @@ -3492,6 +3513,13 @@ colormaprange compare_context config_handle config_var_value +conn_errorMessage_func +conn_oauth_client_id_func +conn_oauth_client_secret_func +conn_oauth_discovery_uri_func +conn_oauth_issuer_id_func +conn_oauth_scope_func +conn_sasl_state_func contain_aggs_of_level_context contain_placeholder_references_context convert_testexpr_context @@ -3508,6 +3536,9 @@ create_upper_paths_hook_type createdb_failure_params crosstab_HashEnt crosstab_cat_desc +curl_infotype +curl_socket_t +curl_version_info_data datapagemap_iterator_t datapagemap_t dateKEY @@ -3519,9 +3550,8 @@ deparse_columns deparse_context deparse_expr_cxt deparse_namespace -destructor +derives_hash dev_t -digit disassembledLeaf dlist_head dlist_iter @@ -3559,18 +3589,23 @@ dsm_handle dsm_op dsm_segment dsm_segment_detach_callback +duration eLogType ean13 eary ec_matches_callback_type ec_member_foreign_arg ec_member_matches_arg +element_type emit_log_hook_type eval_const_expressions_context exec_thread_arg execution_state exit_function explain_get_index_name_hook_type +explain_per_node_hook_type +explain_per_plan_hook_type +explain_validate_options_hook_type f_smgr fasthash_state fd_set @@ -3653,7 +3688,6 @@ gss_key_value_set_desc gss_name_t gtrgm_consistent_cache gzFile -hashfunc hbaPort heap_page_items_state help_handler @@ -3675,17 +3709,21 @@ init_function inline_cte_walker_context inline_error_callback_arg ino_t +inquiry instr_time int128 int16 int16KEY +int16_t int2vector int32 int32KEY int32_t int64 int64KEY +int64_t int8 +int8_t int8x16_t internalPQconninfoOption intptr_t @@ -3717,7 +3755,9 @@ lclContext lclTocEntry leafSegmentInfo leaf_item +libpq_gettext_func libpq_source +libpqsrv_PGresult line_t lineno_t list_sort_comparator @@ -3773,6 +3813,7 @@ mxact mxtruncinfo needs_fmgr_hook_type network_sortsupport_state +nl_item nodeitem normal_rand_fctx nsphash_hash @@ -3790,6 +3831,7 @@ openssl_tls_init_hook_typ ossl_EVP_cipher_func other output_type +overexplain_options pagetable_hash pagetable_iterator pairingheap @@ -3809,7 +3851,6 @@ pg_atomic_flag pg_atomic_uint32 pg_atomic_uint64 pg_be_sasl_mech -pg_case_map pg_category_range pg_checksum_context pg_checksum_raw_context @@ -3833,7 +3874,6 @@ pg_funcptr_t pg_gssinfo pg_hmac_ctx pg_hmac_errno -pg_int64 pg_local_to_utf_combined pg_locale_t pg_mb_radix_tree @@ -3902,7 +3942,8 @@ plperl_query_entry plpgsql_CastExprHashEntry plpgsql_CastHashEntry plpgsql_CastHashKey -plpgsql_HashEnt +plpgsql_expr_walker_callback +plpgsql_stmt_walker_callback pltcl_call_state pltcl_interp_desc pltcl_proc_desc @@ -3925,7 +3966,6 @@ printTextLineFormat printTextLineWrap printTextRule printXheaderWidthType -printfunc priv_map process_file_callback_t process_sublinks_context @@ -3965,12 +4005,9 @@ reduce_outer_joins_pass1_state reduce_outer_joins_pass2_state reference regex_arc_t -regex_t regexp regexp_matches_ctx registered_buffer -regmatch_t -regoff_t regproc relopt_bool relopt_enum @@ -3989,6 +4026,7 @@ remoteConnHashEnt remoteDep 
remove_nulling_relids_context rendezvousHashEntry +rep replace_rte_variables_callback replace_rte_variables_context report_error_fn @@ -4007,6 +4045,7 @@ rt_node_class_test_elem rt_radix_tree saophash_hash save_buffer +save_locale_t scram_state scram_state_enum script_error_callback_arg @@ -4014,6 +4053,8 @@ security_class_t sem_t sepgsql_context_info_t sequence_magic +set_conn_altsock_func +set_conn_oauth_token_func set_join_pathlist_hook_type set_rel_pathlist_hook_type shared_ts_iter @@ -4134,6 +4175,7 @@ uint32_t uint32x4_t uint64 uint64_t +uint64x2_t uint8 uint8_t uint8x16_t @@ -4143,7 +4185,6 @@ unicodeStyleColumnFormat unicodeStyleFormat unicodeStyleRowFormat unicode_linestyle -UniqueRelInfo unit_conversion unlogged_relation_entry utf_local_conversion_func @@ -4286,6 +4327,7 @@ xmlGenericErrorFunc xmlNodePtr xmlNodeSetPtr xmlParserCtxtPtr +xmlParserErrors xmlParserInputPtr xmlSaveCtxt xmlSaveCtxtPtr @@ -4306,6 +4348,3 @@ yyscan_t z_stream z_streamp zic_t -ExplainExtensionOption -ExplainOptionHandler -overexplain_options diff --git a/src/tools/valgrind.supp b/src/tools/valgrind.supp index 7ea464c8094..3880007dfb3 100644 --- a/src/tools/valgrind.supp +++ b/src/tools/valgrind.supp @@ -180,3 +180,50 @@ Memcheck:Cond fun:PyObject_Realloc } + +# NUMA introspection requires touching memory first, and some of it may +# be marked as noaccess (e.g. unpinned buffers). So just ignore that. +{ + pg_numa_touch_mem_if_required + Memcheck:Addr4 + fun:pg_numa_touch_mem_if_required +} + +{ + pg_numa_touch_mem_if_required + Memcheck:Addr8 + fun:pg_numa_touch_mem_if_required +} + + +# Memory-leak suppressions +# Note that a suppression rule will silence complaints about memory blocks +# allocated in matching places, but it won't prevent "indirectly lost" +# complaints about blocks that are only reachable via the suppressed blocks. + +# Suppress complaints about stuff leaked during function cache loading. +# Both the PL/pgSQL and SQL-function parsing processes generate some cruft +# within the function's cache context, which doesn't seem worth the trouble +# to get rid of. Moreover, there are cases where CachedFunction structs +# are intentionally leaked because we're unsure if any fn_extra pointers +# remain. +{ + hide_function_cache_leaks + Memcheck:Leak + match-leak-kinds: definite,possible,indirect + + ... + fun:cached_function_compile +} + +# Suppress complaints about stuff leaked during TS dictionary loading. +# Not very much is typically lost there, and preventing it would +# require a risky API change for TS tmplinit functions. +{ + hide_ts_dictionary_leaks + Memcheck:Leak + match-leak-kinds: definite,possible,indirect + + ... + fun:lookup_ts_dictionary_cache +} diff --git a/src/tools/version_stamp.pl b/src/tools/version_stamp.pl index c3509474d83..a9d2d0910f3 100755 --- a/src/tools/version_stamp.pl +++ b/src/tools/version_stamp.pl @@ -25,7 +25,7 @@ use warnings FATAL => 'all'; # Major version is hard-wired into the script. We update it when we branch # a new development version. -my $majorversion = 18; +my $majorversion = 19; # Validate argument and compute derived variables my $minor = shift;