diff options
Diffstat (limited to 'src/backend')
25 files changed, 511 insertions, 367 deletions
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c index ffd0c78f905..020d00cd01c 100644 --- a/src/backend/access/common/tupdesc.c +++ b/src/backend/access/common/tupdesc.c @@ -142,11 +142,18 @@ void verify_compact_attribute(TupleDesc tupdesc, int attnum) { #ifdef USE_ASSERT_CHECKING - CompactAttribute *cattr = &tupdesc->compact_attrs[attnum]; + CompactAttribute cattr; Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum); CompactAttribute tmp; /* + * Make a temp copy of the TupleDesc's CompactAttribute. This may be a + * shared TupleDesc and the attcacheoff might get changed by another + * backend. + */ + memcpy(&cattr, &tupdesc->compact_attrs[attnum], sizeof(CompactAttribute)); + + /* * Populate the temporary CompactAttribute from the corresponding * Form_pg_attribute */ @@ -156,11 +163,11 @@ verify_compact_attribute(TupleDesc tupdesc, int attnum) * Make the attcacheoff match since it's been reset to -1 by * populate_compact_attribute_internal. Same with attnullability. */ - tmp.attcacheoff = cattr->attcacheoff; - tmp.attnullability = cattr->attnullability; + tmp.attcacheoff = cattr.attcacheoff; + tmp.attnullability = cattr.attnullability; /* Check the freshly populated CompactAttribute matches the TupleDesc's */ - Assert(memcmp(&tmp, cattr, sizeof(CompactAttribute)) == 0); + Assert(memcmp(&tmp, &cattr, sizeof(CompactAttribute)) == 0); #endif } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 03a1d7b027a..fdff960c130 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -417,6 +417,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * way, so we might as well avoid wasting cycles on acquiring page LSNs. * * See nbtree/README section on making concurrent TID recycling safe. + * + * Note: so->dropPin should never change across rescans. */ so->dropPin = (!scan->xs_want_itup && IsMVCCSnapshot(scan->xs_snapshot) && diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 070f14c8b91..36544ecfd58 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -2282,9 +2282,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * previously-saved right link or left link. lastcurrblkno is the page that * was current at the point where the blkno link was saved, which we use to * reason about concurrent page splits/page deletions during backwards scans. + * In the common case where seized=false, blkno is either so->currPos.nextPage + * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage. * - * On entry, caller shouldn't hold any locks or pins on any page (we work - * directly off of blkno and lastcurrblkno instead). Parallel scan callers + * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must + * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno + * won't need to be read again in almost all cases). Parallel scan callers * that seized the scan before calling here should pass seized=true; such a * caller's blkno and lastcurrblkno arguments come from the seized scan. * seized=false callers just pass us the blkno/lastcurrblkno taken from their @@ -2301,8 +2304,8 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir) * success exit (except during so->dropPin index scans, when we drop the pin * eagerly to avoid blocking VACUUM). * - * If there are no more matching records in the given direction, we drop all - * locks and pins, invalidate so->currPos, and return false. + * If there are no more matching records in the given direction, we invalidate + * so->currPos (while ensuring it retains no locks or pins), and return false. * * We always release the scan for a parallel scan caller, regardless of * success or failure; we'll call _bt_parallel_release as soon as possible. diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 29f0dca1b08..c71d1b6f2e1 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -63,7 +63,7 @@ static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool *continuescan); + ScanDirection dir, bool forcenonrequired, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, @@ -2902,10 +2902,8 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { - Assert(!forcenonrequired); /* forbidden by _bt_set_startikey */ - if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - continuescan)) + forcenonrequired, continuescan)) continue; return false; } @@ -3062,7 +3060,8 @@ _bt_check_compare(IndexScanDesc scan, ScanDirection dir, */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, bool *continuescan) + TupleDesc tupdesc, ScanDirection dir, + bool forcenonrequired, bool *continuescan) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; @@ -3102,7 +3101,11 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { - if (subkey->sk_flags & SK_BT_NULLS_FIRST) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if (subkey->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have @@ -3156,8 +3159,12 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, */ Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); subkey--; - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) @@ -3209,7 +3216,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, break; } - if (!result) + if (!result && !forcenonrequired) { /* * Tuple fails this qual. If it's a required qual for the current @@ -3323,24 +3330,26 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, * current page and killed tuples thereon (generally, this should only be * called if so->numKilled > 0). * - * The caller does not have a lock on the page and may or may not have the - * page pinned in a buffer. Note that read-lock is sufficient for setting - * LP_DEAD status (which is only a hint). - * - * We match items by heap TID before assuming they are the right ones to - * delete. We cope with cases where items have moved right due to insertions. - * If an item has moved off the current page due to a split, we'll fail to - * find it and do nothing (this is not an error case --- we assume the item - * will eventually get marked in a future indexscan). + * Caller should not have a lock on the so->currPos page, but must hold a + * buffer pin when !so->dropPin. When we return, it still won't be locked. + * It'll continue to hold whatever pins were held before calling here. * - * Note that if we hold a pin on the target page continuously from initially - * reading the items until applying this function, VACUUM cannot have deleted - * any items on the page, so the page's TIDs can't have been recycled by now. - * There's no risk that we'll confuse a new index tuple that happens to use a - * recycled TID with a now-removed tuple with the same TID (that used to be on - * this same page). We can't rely on that during scans that drop pins eagerly + * We match items by heap TID before assuming they are the right ones to set + * LP_DEAD. If the scan is one that holds a buffer pin on the target page + * continuously from initially reading the items until applying this function + * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the + * page, so the page's TIDs can't have been recycled by now. There's no risk + * that we'll confuse a new index tuple that happens to use a recycled TID + * with a now-removed tuple with the same TID (that used to be on this same + * page). We can't rely on that during scans that drop buffer pins eagerly * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on * the page LSN having not changed since back when _bt_readpage saw the page. + * We totally give up on setting LP_DEAD bits when the page LSN changed. + * + * We give up much less often during !so->dropPin scans, but it still happens. + * We cope with cases where items have moved right due to insertions. If an + * item has moved off the current page due to a split, we'll fail to find it + * and just give up on it. */ void _bt_killitems(IndexScanDesc scan) @@ -3353,6 +3362,7 @@ _bt_killitems(IndexScanDesc scan) OffsetNumber maxoff; int numKilled = so->numKilled; bool killedsomething = false; + Buffer buf; Assert(numKilled > 0); Assert(BTScanPosIsValid(so->currPos)); @@ -3369,11 +3379,11 @@ _bt_killitems(IndexScanDesc scan) * concurrent VACUUMs from recycling any of the TIDs on the page. */ Assert(BTScanPosIsPinned(so->currPos)); - _bt_lockbuf(rel, so->currPos.buf, BT_READ); + buf = so->currPos.buf; + _bt_lockbuf(rel, buf, BT_READ); } else { - Buffer buf; XLogRecPtr latestlsn; Assert(!BTScanPosIsPinned(so->currPos)); @@ -3391,10 +3401,9 @@ _bt_killitems(IndexScanDesc scan) } /* Unmodified, hinting is safe */ - so->currPos.buf = buf; } - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -3511,10 +3520,13 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + MarkBufferDirtyHint(buf, true); } - _bt_unlockbuf(rel, so->currPos.buf); + if (!so->dropPin) + _bt_unlockbuf(rel, buf); + else + _bt_relbuf(rel, buf); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1914859b2ee..47ffc0a2307 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7498,6 +7498,10 @@ CreateCheckPoint(int flags) if (PriorRedoPtr != InvalidXLogRecPtr) UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); +#ifdef USE_INJECTION_POINTS + INJECTION_POINT("checkpoint-before-old-wal-removal", NULL); +#endif + /* * Delete old log files, those no longer needed for last checkpoint to * prevent the disk holding the xlog from growing full. diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index f52f2477df1..f5fc346e201 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -1538,7 +1538,7 @@ GetDecimalFromHex(char hex) if (isdigit((unsigned char) hex)) return hex - '0'; else - return pg_ascii_tolower((unsigned char) hex) - 'a' + 10; + return tolower((unsigned char) hex) - 'a' + 10; } /* diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c index 255bd795361..b5400749353 100644 --- a/src/backend/executor/execGrouping.c +++ b/src/backend/executor/execGrouping.c @@ -144,7 +144,7 @@ execTuplesHashPrepare(int numCols, * hashfunctions: FmgrInfos of datatype-specific hashing functions to use * collations: collations to use in comparisons * nbuckets: initial estimate of hashtable size - * additionalsize: size of data stored in ->additional + * additionalsize: size of data that may be stored along with the hash entry * metacxt: memory context for long-lived allocation, but not per-entry data * tablecxt: memory context in which to store table entries * tempcxt: short-lived context for evaluation hash and comparison functions @@ -288,7 +288,7 @@ ResetTupleHashTable(TupleHashTable hashtable) * * If isnew isn't NULL, then a new entry is created if no existing entry * matches. On return, *isnew is true if the entry is newly created, - * false if it existed already. ->additional_data in the new entry has + * false if it existed already. The additional data in the new entry has * been zeroed. */ TupleHashEntry diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c index ab2eab9596e..26f7420b64b 100644 --- a/src/backend/executor/nodeTidrangescan.c +++ b/src/backend/executor/nodeTidrangescan.c @@ -128,9 +128,11 @@ TidExprListCreate(TidRangeScanState *tidrangestate) * TidRangeEval * * Compute and set node's block and offset range to scan by evaluating - * the trss_tidexprs. Returns false if we detect the range cannot + * node->trss_tidexprs. Returns false if we detect the range cannot * contain any tuples. Returns true if it's possible for the range to - * contain tuples. + * contain tuples. We don't bother validating that trss_mintid is less + * than or equal to trss_maxtid, as the scan_set_tidrange() table AM + * function will handle that. * ---------------------------------------------------------------- */ static bool diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index c8595109b0e..9ecddb14231 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -1329,7 +1329,7 @@ _jumble${n}(JumbleState *jstate, Node *node) # Node type. Squash constants if requested. if ($query_jumble_squash) { - print $jff "\tJUMBLE_ELEMENTS($f);\n" + print $jff "\tJUMBLE_ELEMENTS($f, node);\n" unless $query_jumble_ignore; } else diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 25e08ba3426..eaf391fc2ab 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -653,6 +653,8 @@ _outA_Expr(StringInfo str, const A_Expr *node) WRITE_NODE_FIELD(lexpr); WRITE_NODE_FIELD(rexpr); + WRITE_LOCATION_FIELD(rexpr_list_start); + WRITE_LOCATION_FIELD(rexpr_list_end); WRITE_LOCATION_FIELD(location); } diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index ac3cb3d9caf..fb33e6931ad 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -61,9 +61,9 @@ static void AppendJumble(JumbleState *jstate, const unsigned char *value, Size size); static void FlushPendingNulls(JumbleState *jstate); static void RecordConstLocation(JumbleState *jstate, - int location, bool squashed); + int location, int len); static void _jumbleNode(JumbleState *jstate, Node *node); -static void _jumbleElements(JumbleState *jstate, List *elements); +static void _jumbleElements(JumbleState *jstate, List *elements, Node *node); static void _jumbleA_Const(JumbleState *jstate, Node *node); static void _jumbleList(JumbleState *jstate, Node *node); static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node); @@ -373,15 +373,17 @@ FlushPendingNulls(JumbleState *jstate) /* - * Record location of constant within query string of query tree that is - * currently being walked. + * Record the location of some kind of constant within a query string. + * These are not only bare constants but also expressions that ultimately + * constitute a constant, such as those inside casts and simple function + * calls. * - * 'squashed' signals that the constant represents the first or the last - * element in a series of merged constants, and everything but the first/last - * element contributes nothing to the jumble hash. + * If length is -1, it indicates a single such constant element. If + * it's a positive integer, it indicates the length of a squashable + * list of them. */ static void -RecordConstLocation(JumbleState *jstate, int location, bool squashed) +RecordConstLocation(JumbleState *jstate, int location, int len) { /* -1 indicates unknown or undefined location */ if (location >= 0) @@ -396,9 +398,14 @@ RecordConstLocation(JumbleState *jstate, int location, bool squashed) sizeof(LocationLen)); } jstate->clocations[jstate->clocations_count].location = location; - /* initialize lengths to -1 to simplify third-party module usage */ - jstate->clocations[jstate->clocations_count].squashed = squashed; - jstate->clocations[jstate->clocations_count].length = -1; + + /* + * Lengths are either positive integers (indicating a squashable + * list), or -1. + */ + Assert(len > -1 || len == -1); + jstate->clocations[jstate->clocations_count].length = len; + jstate->clocations[jstate->clocations_count].squashed = (len > -1); jstate->clocations_count++; } } @@ -408,12 +415,12 @@ RecordConstLocation(JumbleState *jstate, int location, bool squashed) * deduce that the expression is a constant: * * - Ignore a possible wrapping RelabelType and CoerceViaIO. - * - If it's a FuncExpr, check that the function is an implicit + * - If it's a FuncExpr, check that the function is a builtin * cast and its arguments are Const. * - Otherwise test if the expression is a simple Const. */ static bool -IsSquashableConst(Node *element) +IsSquashableConstant(Node *element) { if (IsA(element, RelabelType)) element = (Node *) ((RelabelType *) element)->arg; @@ -421,32 +428,50 @@ IsSquashableConst(Node *element) if (IsA(element, CoerceViaIO)) element = (Node *) ((CoerceViaIO *) element)->arg; - if (IsA(element, FuncExpr)) + switch (nodeTag(element)) { - FuncExpr *func = (FuncExpr *) element; - ListCell *temp; + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) element; + ListCell *temp; - if (func->funcformat != COERCE_IMPLICIT_CAST && - func->funcformat != COERCE_EXPLICIT_CAST) - return false; + if (func->funcformat != COERCE_IMPLICIT_CAST && + func->funcformat != COERCE_EXPLICIT_CAST) + return false; - if (func->funcid > FirstGenbkiObjectId) - return false; + if (func->funcid > FirstGenbkiObjectId) + return false; - foreach(temp, func->args) - { - Node *arg = lfirst(temp); + /* + * We can check function arguments recursively, being careful + * about recursing too deep. At each recursion level it's + * enough to test the stack on the first element. (Note that + * I wasn't able to hit this without bloating the stack + * artificially in this function: the parser errors out before + * stack size becomes a problem here.) + */ + foreach(temp, func->args) + { + Node *arg = lfirst(temp); + + if (!IsA(arg, Const)) + { + if (foreach_current_index(temp) == 0 && + stack_is_too_deep()) + return false; + else if (!IsSquashableConstant(arg)) + return false; + } + } + + return true; + } - if (!IsA(arg, Const)) /* XXX we could recurse here instead */ + default: + if (!IsA(element, Const)) return false; - } - - return true; } - if (!IsA(element, Const)) - return false; - return true; } @@ -461,35 +486,29 @@ IsSquashableConst(Node *element) * expressions. */ static bool -IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr) +IsSquashableConstantList(List *elements) { ListCell *temp; - /* - * If squashing is disabled, or the list is too short, we don't try to - * squash it. - */ + /* If the list is too short, we don't try to squash it. */ if (list_length(elements) < 2) return false; foreach(temp, elements) { - if (!IsSquashableConst(lfirst(temp))) + if (!IsSquashableConstant(lfirst(temp))) return false; } - *firstExpr = linitial(elements); - *lastExpr = llast(elements); - return true; } #define JUMBLE_NODE(item) \ _jumbleNode(jstate, (Node *) expr->item) -#define JUMBLE_ELEMENTS(list) \ - _jumbleElements(jstate, (List *) expr->list) +#define JUMBLE_ELEMENTS(list, node) \ + _jumbleElements(jstate, (List *) expr->list, node) #define JUMBLE_LOCATION(location) \ - RecordConstLocation(jstate, expr->location, false) + RecordConstLocation(jstate, expr->location, -1) #define JUMBLE_FIELD(item) \ do { \ if (sizeof(expr->item) == 8) \ @@ -517,36 +536,36 @@ do { \ #include "queryjumblefuncs.funcs.c" /* - * We jumble lists of constant elements as one individual item regardless - * of how many elements are in the list. This means different queries - * jumble to the same query_id, if the only difference is the number of - * elements in the list. + * We try to jumble lists of expressions as one individual item regardless + * of how many elements are in the list. This is know as squashing, which + * results in different queries jumbling to the same query_id, if the only + * difference is the number of elements in the list. + * + * We allow constants to be squashed. To normalize such queries, we use + * the start and end locations of the list of elements in a list. */ static void -_jumbleElements(JumbleState *jstate, List *elements) +_jumbleElements(JumbleState *jstate, List *elements, Node *node) { - Node *first, - *last; + bool normalize_list = false; - if (IsSquashableConstList(elements, &first, &last)) + if (IsSquashableConstantList(elements)) { - /* - * If this list of elements is squashable, keep track of the location - * of its first and last elements. When reading back the locations - * array, we'll see two consecutive locations with ->squashed set to - * true, indicating the location of initial and final elements of this - * list. - * - * For the limited set of cases we support now (implicit coerce via - * FuncExpr, Const) it's fine to use exprLocation of the 'last' - * expression, but if more complex composite expressions are to be - * supported (e.g., OpExpr or FuncExpr as an explicit call), more - * sophisticated tracking will be needed. - */ - RecordConstLocation(jstate, exprLocation(first), true); - RecordConstLocation(jstate, exprLocation(last), true); + if (IsA(node, ArrayExpr)) + { + ArrayExpr *aexpr = (ArrayExpr *) node; + + if (aexpr->list_start > 0 && aexpr->list_end > 0) + { + RecordConstLocation(jstate, + aexpr->list_start + 1, + (aexpr->list_end - aexpr->list_start) - 1); + normalize_list = true; + } + } } - else + + if (!normalize_list) { _jumbleNode(jstate, (Node *) elements); } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 8c90ab54af8..48b5d13b9b6 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -526,6 +526,8 @@ _readA_Expr(void) READ_NODE_FIELD(lexpr); READ_NODE_FIELD(rexpr); + READ_LOCATION_FIELD(rexpr_list_start); + READ_LOCATION_FIELD(rexpr_list_end); READ_LOCATION_FIELD(location); READ_DONE(); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a16fdd65601..34f7c17f576 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -239,102 +239,23 @@ parse_sub_analyze(Node *parseTree, ParseState *parentParseState, } /* - * setQueryLocationAndLength - * Set query's location and length from statement and ParseState - * - * Some statements, like PreparableStmt, can be located within parentheses. - * For example "(SELECT 1)" or "COPY (UPDATE ...) to x;". For those, we - * cannot use the whole string from the statement's location or the SQL - * string would yield incorrectly. The parser will set stmt_len, reflecting - * the size of the statement within the parentheses. Thus, when stmt_len is - * available, we need to use it for the Query's stmt_len. - * - * For other cases, the parser can't provide the length of individual - * statements. However, we have the statement's location plus the length - * (p_stmt_len) and location (p_stmt_location) of the top level RawStmt, - * stored in pstate. Thus, the statement's length is the RawStmt's length - * minus how much we've advanced in the RawStmt's string. If p_stmt_len - * is 0, the SQL string is used up to its end. - */ -static void -setQueryLocationAndLength(ParseState *pstate, Query *qry, Node *parseTree) -{ - ParseLoc stmt_len = 0; - - switch (nodeTag(parseTree)) - { - case T_InsertStmt: - qry->stmt_location = ((InsertStmt *) parseTree)->stmt_location; - stmt_len = ((InsertStmt *) parseTree)->stmt_len; - break; - - case T_DeleteStmt: - qry->stmt_location = ((DeleteStmt *) parseTree)->stmt_location; - stmt_len = ((DeleteStmt *) parseTree)->stmt_len; - break; - - case T_UpdateStmt: - qry->stmt_location = ((UpdateStmt *) parseTree)->stmt_location; - stmt_len = ((UpdateStmt *) parseTree)->stmt_len; - break; - - case T_MergeStmt: - qry->stmt_location = ((MergeStmt *) parseTree)->stmt_location; - stmt_len = ((MergeStmt *) parseTree)->stmt_len; - break; - - case T_SelectStmt: - qry->stmt_location = ((SelectStmt *) parseTree)->stmt_location; - stmt_len = ((SelectStmt *) parseTree)->stmt_len; - break; - - case T_PLAssignStmt: - qry->stmt_location = ((PLAssignStmt *) parseTree)->location; - break; - - default: - qry->stmt_location = pstate->p_stmt_location; - break; - } - - if (stmt_len > 0) - { - /* Statement's length is known, use it */ - qry->stmt_len = stmt_len; - } - else if (pstate->p_stmt_len > 0) - { - /* - * The top RawStmt's length is known, so calculate the statement's - * length from the statement's location and the RawStmt's length and - * location. - */ - qry->stmt_len = pstate->p_stmt_len - (qry->stmt_location - pstate->p_stmt_location); - } - - /* The calculated statement length should be calculated as positive. */ - Assert(qry->stmt_len >= 0); -} - -/* * transformTopLevelStmt - * transform a Parse tree into a Query tree. * - * This function is just responsible for storing location data - * from the RawStmt into the ParseState. + * This function is just responsible for transferring statement location data + * from the RawStmt into the finished Query. */ Query * transformTopLevelStmt(ParseState *pstate, RawStmt *parseTree) { Query *result; - /* Store RawStmt's length and location in pstate */ - pstate->p_stmt_len = parseTree->stmt_len; - pstate->p_stmt_location = parseTree->stmt_location; - /* We're at top level, so allow SELECT INTO */ result = transformOptionalSelectInto(pstate, parseTree->stmt); + result->stmt_location = parseTree->stmt_location; + result->stmt_len = parseTree->stmt_len; + return result; } @@ -503,7 +424,6 @@ transformStmt(ParseState *pstate, Node *parseTree) /* Mark as original query until we learn differently */ result->querySource = QSRC_ORIGINAL; result->canSetTag = true; - setQueryLocationAndLength(pstate, result, parseTree); return result; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 0b5652071d1..50f53159d58 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -154,7 +154,6 @@ static void base_yyerror(YYLTYPE *yylloc, core_yyscan_t yyscanner, const char *msg); static RawStmt *makeRawStmt(Node *stmt, int stmt_location); static void updateRawStmtEnd(RawStmt *rs, int end_location); -static void updatePreparableStmtEnd(Node *n, int end_location); static Node *makeColumnRef(char *colname, List *indirection, int location, core_yyscan_t yyscanner); static Node *makeTypeCast(Node *arg, TypeName *typename, int location); @@ -178,13 +177,13 @@ static void insertSelectOptions(SelectStmt *stmt, SelectLimit *limitClause, WithClause *withClause, core_yyscan_t yyscanner); -static Node *makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location); +static Node *makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg); static Node *doNegate(Node *n, int location); static void doNegateFloat(Float *v); static Node *makeAndExpr(Node *lexpr, Node *rexpr, int location); static Node *makeOrExpr(Node *lexpr, Node *rexpr, int location); static Node *makeNotExpr(Node *expr, int location); -static Node *makeAArrayExpr(List *elements, int location); +static Node *makeAArrayExpr(List *elements, int location, int end_location); static Node *makeSQLValueFunction(SQLValueFunctionOp op, int32 typmod, int location); static Node *makeXmlExpr(XmlExprOp op, char *name, List *named_args, @@ -523,7 +522,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <defelt> def_elem reloption_elem old_aggr_elem operator_def_elem %type <node> def_arg columnElem where_clause where_or_current_clause a_expr b_expr c_expr AexprConst indirection_el opt_slice_bound - columnref in_expr having_clause func_table xmltable array_expr + columnref having_clause func_table xmltable array_expr OptWhereClause operator_def_arg %type <list> opt_column_and_period_list %type <list> rowsfrom_item rowsfrom_list opt_col_def_list @@ -3417,7 +3416,6 @@ CopyStmt: COPY opt_binary qualified_name opt_column_list { CopyStmt *n = makeNode(CopyStmt); - updatePreparableStmtEnd($3, @4); n->relation = NULL; n->query = $3; n->attlist = NIL; @@ -12240,7 +12238,6 @@ InsertStmt: $5->onConflictClause = $6; $5->returningClause = $7; $5->withClause = $1; - $5->stmt_location = @$; $$ = (Node *) $5; } ; @@ -12431,7 +12428,6 @@ DeleteStmt: opt_with_clause DELETE_P FROM relation_expr_opt_alias n->whereClause = $6; n->returningClause = $7; n->withClause = $1; - n->stmt_location = @$; $$ = (Node *) n; } ; @@ -12506,7 +12502,6 @@ UpdateStmt: opt_with_clause UPDATE relation_expr_opt_alias n->whereClause = $7; n->returningClause = $8; n->withClause = $1; - n->stmt_location = @$; $$ = (Node *) n; } ; @@ -12584,7 +12579,6 @@ MergeStmt: m->joinCondition = $8; m->mergeWhenClauses = $9; m->returningClause = $10; - m->stmt_location = @$; $$ = (Node *) m; } @@ -12825,20 +12819,7 @@ SelectStmt: select_no_parens %prec UMINUS ; select_with_parens: - '(' select_no_parens ')' - { - SelectStmt *n = (SelectStmt *) $2; - - /* - * As SelectStmt's location starts at the SELECT keyword, - * we need to track the length of the SelectStmt within - * parentheses to be able to extract the relevant part - * of the query. Without this, the RawStmt's length would - * be used and would include the closing parenthesis. - */ - n->stmt_len = @3 - @2; - $$ = $2; - } + '(' select_no_parens ')' { $$ = $2; } | '(' select_with_parens ')' { $$ = $2; } ; @@ -12960,7 +12941,6 @@ simple_select: n->groupDistinct = ($7)->distinct; n->havingClause = $8; n->windowClause = $9; - n->stmt_location = @1; $$ = (Node *) n; } | SELECT distinct_clause target_list @@ -12978,7 +12958,6 @@ simple_select: n->groupDistinct = ($7)->distinct; n->havingClause = $8; n->windowClause = $9; - n->stmt_location = @1; $$ = (Node *) n; } | values_clause { $$ = $1; } @@ -12999,20 +12978,19 @@ simple_select: n->targetList = list_make1(rt); n->fromClause = list_make1($2); - n->stmt_location = @1; $$ = (Node *) n; } | select_clause UNION set_quantifier select_clause { - $$ = makeSetOp(SETOP_UNION, $3 == SET_QUANTIFIER_ALL, $1, $4, @1); + $$ = makeSetOp(SETOP_UNION, $3 == SET_QUANTIFIER_ALL, $1, $4); } | select_clause INTERSECT set_quantifier select_clause { - $$ = makeSetOp(SETOP_INTERSECT, $3 == SET_QUANTIFIER_ALL, $1, $4, @1); + $$ = makeSetOp(SETOP_INTERSECT, $3 == SET_QUANTIFIER_ALL, $1, $4); } | select_clause EXCEPT set_quantifier select_clause { - $$ = makeSetOp(SETOP_EXCEPT, $3 == SET_QUANTIFIER_ALL, $1, $4, @1); + $$ = makeSetOp(SETOP_EXCEPT, $3 == SET_QUANTIFIER_ALL, $1, $4); } ; @@ -13590,7 +13568,6 @@ values_clause: { SelectStmt *n = makeNode(SelectStmt); - n->stmt_location = @1; n->valuesLists = list_make1($3); $$ = (Node *) n; } @@ -15287,49 +15264,50 @@ a_expr: c_expr { $$ = $1; } (Node *) list_make2($5, $7), @2); } - | a_expr IN_P in_expr + | a_expr IN_P select_with_parens { - /* in_expr returns a SubLink or a list of a_exprs */ - if (IsA($3, SubLink)) - { - /* generate foo = ANY (subquery) */ - SubLink *n = (SubLink *) $3; + /* generate foo = ANY (subquery) */ + SubLink *n = makeNode(SubLink); - n->subLinkType = ANY_SUBLINK; - n->subLinkId = 0; - n->testexpr = $1; - n->operName = NIL; /* show it's IN not = ANY */ - n->location = @2; - $$ = (Node *) n; - } - else - { - /* generate scalar IN expression */ - $$ = (Node *) makeSimpleA_Expr(AEXPR_IN, "=", $1, $3, @2); - } + n->subselect = $3; + n->subLinkType = ANY_SUBLINK; + n->subLinkId = 0; + n->testexpr = $1; + n->operName = NIL; /* show it's IN not = ANY */ + n->location = @2; + $$ = (Node *) n; } - | a_expr NOT_LA IN_P in_expr %prec NOT_LA + | a_expr IN_P '(' expr_list ')' { - /* in_expr returns a SubLink or a list of a_exprs */ - if (IsA($4, SubLink)) - { - /* generate NOT (foo = ANY (subquery)) */ - /* Make an = ANY node */ - SubLink *n = (SubLink *) $4; - - n->subLinkType = ANY_SUBLINK; - n->subLinkId = 0; - n->testexpr = $1; - n->operName = NIL; /* show it's IN not = ANY */ - n->location = @2; - /* Stick a NOT on top; must have same parse location */ - $$ = makeNotExpr((Node *) n, @2); - } - else - { - /* generate scalar NOT IN expression */ - $$ = (Node *) makeSimpleA_Expr(AEXPR_IN, "<>", $1, $4, @2); - } + /* generate scalar IN expression */ + A_Expr *n = makeSimpleA_Expr(AEXPR_IN, "=", $1, (Node *) $4, @2); + + n->rexpr_list_start = @3; + n->rexpr_list_end = @5; + $$ = (Node *) n; + } + | a_expr NOT_LA IN_P select_with_parens %prec NOT_LA + { + /* generate NOT (foo = ANY (subquery)) */ + SubLink *n = makeNode(SubLink); + + n->subselect = $4; + n->subLinkType = ANY_SUBLINK; + n->subLinkId = 0; + n->testexpr = $1; + n->operName = NIL; /* show it's IN not = ANY */ + n->location = @2; + /* Stick a NOT on top; must have same parse location */ + $$ = makeNotExpr((Node *) n, @2); + } + | a_expr NOT_LA IN_P '(' expr_list ')' + { + /* generate scalar NOT IN expression */ + A_Expr *n = makeSimpleA_Expr(AEXPR_IN, "<>", $1, (Node *) $5, @2); + + n->rexpr_list_start = @4; + n->rexpr_list_end = @6; + $$ = (Node *) n; } | a_expr subquery_Op sub_type select_with_parens %prec Op { @@ -16764,15 +16742,15 @@ type_list: Typename { $$ = list_make1($1); } array_expr: '[' expr_list ']' { - $$ = makeAArrayExpr($2, @1); + $$ = makeAArrayExpr($2, @1, @3); } | '[' array_expr_list ']' { - $$ = makeAArrayExpr($2, @1); + $$ = makeAArrayExpr($2, @1, @3); } | '[' ']' { - $$ = makeAArrayExpr(NIL, @1); + $$ = makeAArrayExpr(NIL, @1, @2); } ; @@ -16894,17 +16872,6 @@ trim_list: a_expr FROM expr_list { $$ = lappend($3, $1); } | expr_list { $$ = $1; } ; -in_expr: select_with_parens - { - SubLink *n = makeNode(SubLink); - - n->subselect = $1; - /* other fields will be filled later */ - $$ = (Node *) n; - } - | '(' expr_list ')' { $$ = (Node *) $2; } - ; - /* * Define SQL-style CASE clause. * - Full specification @@ -18748,47 +18715,6 @@ updateRawStmtEnd(RawStmt *rs, int end_location) rs->stmt_len = end_location - rs->stmt_location; } -/* - * Adjust a PreparableStmt to reflect that it doesn't run to the end of the - * string. - */ -static void -updatePreparableStmtEnd(Node *n, int end_location) -{ - if (IsA(n, SelectStmt)) - { - SelectStmt *stmt = (SelectStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, InsertStmt)) - { - InsertStmt *stmt = (InsertStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, UpdateStmt)) - { - UpdateStmt *stmt = (UpdateStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, DeleteStmt)) - { - DeleteStmt *stmt = (DeleteStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else if (IsA(n, MergeStmt)) - { - MergeStmt *stmt = (MergeStmt *) n; - - stmt->stmt_len = end_location - stmt->stmt_location; - } - else - elog(ERROR, "unexpected node type %d", (int) n->type); -} - static Node * makeColumnRef(char *colname, List *indirection, int location, core_yyscan_t yyscanner) @@ -19167,14 +19093,11 @@ insertSelectOptions(SelectStmt *stmt, errmsg("multiple WITH clauses not allowed"), parser_errposition(exprLocation((Node *) withClause)))); stmt->withClause = withClause; - - /* Update SelectStmt's location to the start of the WITH clause */ - stmt->stmt_location = withClause->location; } } static Node * -makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location) +makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg) { SelectStmt *n = makeNode(SelectStmt); @@ -19182,7 +19105,6 @@ makeSetOp(SetOperation op, bool all, Node *larg, Node *rarg, int location) n->all = all; n->larg = (SelectStmt *) larg; n->rarg = (SelectStmt *) rarg; - n->stmt_location = location; return (Node *) n; } @@ -19300,12 +19222,14 @@ makeNotExpr(Node *expr, int location) } static Node * -makeAArrayExpr(List *elements, int location) +makeAArrayExpr(List *elements, int location, int location_end) { A_ArrayExpr *n = makeNode(A_ArrayExpr); n->elements = elements; n->location = location; + n->list_start = location; + n->list_end = location_end; return (Node *) n; } diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 1f8e2d54673..d66276801c6 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -1223,6 +1223,8 @@ transformAExprIn(ParseState *pstate, A_Expr *a) newa->element_typeid = scalar_type; newa->elements = aexprs; newa->multidims = false; + newa->list_start = a->rexpr_list_start; + newa->list_end = a->rexpr_list_end; newa->location = -1; result = (Node *) make_scalar_array_op(pstate, @@ -2165,6 +2167,8 @@ transformArrayExpr(ParseState *pstate, A_ArrayExpr *a, /* array_collid will be set by parse_collate.c */ newa->element_typeid = element_type; newa->elements = newcoercedelems; + newa->list_start = a->list_start; + newa->list_end = a->list_end; newa->location = a->location; return (Node *) newa; diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 1d56d0c4ef3..f1eb798f3e9 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -29,6 +29,7 @@ #include "postgres.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "access/xlogutils.h" #include "fmgr.h" #include "miscadmin.h" @@ -41,6 +42,7 @@ #include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" +#include "utils/injection_point.h" #include "utils/inval.h" #include "utils/memutils.h" @@ -1825,9 +1827,13 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) { bool updated_xmin = false; bool updated_restart = false; + XLogRecPtr restart_lsn pg_attribute_unused(); SpinLockAcquire(&MyReplicationSlot->mutex); + /* remember the old restart lsn */ + restart_lsn = MyReplicationSlot->data.restart_lsn; + /* * Prevent moving the confirmed_flush backwards, as this could lead to * data duplication issues caused by replicating already replicated @@ -1881,6 +1887,18 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) /* first write new xmin to disk, so we know what's up after a crash */ if (updated_xmin || updated_restart) { +#ifdef USE_INJECTION_POINTS + XLogSegNo seg1, + seg2; + + XLByteToSeg(restart_lsn, seg1, wal_segment_size); + XLByteToSeg(MyReplicationSlot->data.restart_lsn, seg2, wal_segment_size); + + /* trigger injection point, but only if segment changes */ + if (seg1 != seg2) + INJECTION_POINT("logical-replication-slot-advance-segment", NULL); +#endif + ReplicationSlotMarkDirty(); ReplicationSlotSave(); elog(DEBUG1, "updated xmin: %u restart: %u", updated_xmin, updated_restart); diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 67655111875..c4299c76fb1 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -109,10 +109,22 @@ #include "storage/procarray.h" #include "storage/sinval.h" #include "utils/builtins.h" +#include "utils/inval.h" #include "utils/memutils.h" #include "utils/rel.h" #include "utils/relfilenumbermap.h" +/* + * Each transaction has an 8MB limit for invalidation messages distributed from + * other transactions. This limit is set considering scenarios with many + * concurrent logical decoding operations. When the distributed invalidation + * messages reach this threshold, the transaction is marked as + * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost + * some inval messages and hence don't know what needs to be invalidated. + */ +#define MAX_DISTR_INVAL_MSG_PER_TXN \ + ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage)) + /* entry for a hash table we use to map from xid to our transaction state */ typedef struct ReorderBufferTXNByIdEnt { @@ -472,6 +484,12 @@ ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) txn->invalidations = NULL; } + if (txn->invalidations_distributed) + { + pfree(txn->invalidations_distributed); + txn->invalidations_distributed = NULL; + } + /* Reset the toast hash */ ReorderBufferToastReset(rb, txn); @@ -2661,7 +2679,17 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + if (rbtxn_distr_inval_overflowed(txn)) + { + Assert(txn->ninvalidations_distributed == 0); + InvalidateSystemCaches(); + } + else + { + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed, + txn->invalidations_distributed); + } if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); @@ -2710,8 +2738,17 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, AbortCurrentTransaction(); /* make sure there's no cache pollution */ - ReorderBufferExecuteInvalidations(txn->ninvalidations, - txn->invalidations); + if (rbtxn_distr_inval_overflowed(txn)) + { + Assert(txn->ninvalidations_distributed == 0); + InvalidateSystemCaches(); + } + else + { + ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations); + ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed, + txn->invalidations_distributed); + } if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); @@ -3060,7 +3097,8 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, * We might have decoded changes for this transaction that could load * the cache as per the current transaction's view (consider DDL's * happened in this transaction). We don't want the decoding of future - * transactions to use those cache entries so execute invalidations. + * transactions to use those cache entries so execute only the inval + * messages in this transaction. */ if (txn->ninvalidations > 0) ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, @@ -3147,9 +3185,10 @@ ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) txn->final_lsn = lsn; /* - * Process cache invalidation messages if there are any. Even if we're not - * interested in the transaction's contents, it could have manipulated the - * catalog and we need to update the caches according to that. + * Process only cache invalidation messages in this transaction if there + * are any. Even if we're not interested in the transaction's contents, it + * could have manipulated the catalog and we need to update the caches + * according to that. */ if (txn->base_snapshot != NULL && txn->ninvalidations > 0) ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, @@ -3422,6 +3461,57 @@ ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid, } /* + * Add new invalidation messages to the reorder buffer queue. + */ +static void +ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferChange *change; + + change = ReorderBufferAllocChange(rb); + change->action = REORDER_BUFFER_CHANGE_INVALIDATION; + change->data.inval.ninvalidations = nmsgs; + change->data.inval.invalidations = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs); + memcpy(change->data.inval.invalidations, msgs, + sizeof(SharedInvalidationMessage) * nmsgs); + + ReorderBufferQueueChange(rb, xid, lsn, change, false); +} + +/* + * A helper function for ReorderBufferAddInvalidations() and + * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation + * messages to the **invals_out. + */ +static void +ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out, + uint32 *ninvals_out, + SharedInvalidationMessage *msgs_new, + Size nmsgs_new) +{ + if (*ninvals_out == 0) + { + *ninvals_out = nmsgs_new; + *invals_out = (SharedInvalidationMessage *) + palloc(sizeof(SharedInvalidationMessage) * nmsgs_new); + memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new); + } + else + { + /* Enlarge the array of inval messages */ + *invals_out = (SharedInvalidationMessage *) + repalloc(*invals_out, sizeof(SharedInvalidationMessage) * + (*ninvals_out + nmsgs_new)); + memcpy(*invals_out + *ninvals_out, msgs_new, + nmsgs_new * sizeof(SharedInvalidationMessage)); + *ninvals_out += nmsgs_new; + } +} + +/* * Accumulate the invalidations for executing them later. * * This needs to be called for each XLOG_XACT_INVALIDATIONS message and @@ -3441,7 +3531,6 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, { ReorderBufferTXN *txn; MemoryContext oldcontext; - ReorderBufferChange *change; txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); @@ -3456,35 +3545,76 @@ ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid, Assert(nmsgs > 0); - /* Accumulate invalidations. */ - if (txn->ninvalidations == 0) - { - txn->ninvalidations = nmsgs; - txn->invalidations = (SharedInvalidationMessage *) - palloc(sizeof(SharedInvalidationMessage) * nmsgs); - memcpy(txn->invalidations, msgs, - sizeof(SharedInvalidationMessage) * nmsgs); - } - else + ReorderBufferAccumulateInvalidations(&txn->invalidations, + &txn->ninvalidations, + msgs, nmsgs); + + ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs); + + MemoryContextSwitchTo(oldcontext); +} + +/* + * Accumulate the invalidations distributed by other committed transactions + * for executing them later. + * + * This function is similar to ReorderBufferAddInvalidations() but stores + * the given inval messages to the txn->invalidations_distributed with the + * overflow check. + * + * This needs to be called by committed transactions to distribute their + * inval messages to in-progress transactions. + */ +void +ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr lsn, Size nmsgs, + SharedInvalidationMessage *msgs) +{ + ReorderBufferTXN *txn; + MemoryContext oldcontext; + + txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + + oldcontext = MemoryContextSwitchTo(rb->context); + + /* + * Collect all the invalidations under the top transaction, if available, + * so that we can execute them all together. See comments + * ReorderBufferAddInvalidations. + */ + txn = rbtxn_get_toptxn(txn); + + Assert(nmsgs > 0); + + if (!rbtxn_distr_inval_overflowed(txn)) { - txn->invalidations = (SharedInvalidationMessage *) - repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) * - (txn->ninvalidations + nmsgs)); + /* + * Check the transaction has enough space for storing distributed + * invalidation messages. + */ + if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN) + { + /* + * Mark the invalidation message as overflowed and free up the + * messages accumulated so far. + */ + txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED; - memcpy(txn->invalidations + txn->ninvalidations, msgs, - nmsgs * sizeof(SharedInvalidationMessage)); - txn->ninvalidations += nmsgs; + if (txn->invalidations_distributed) + { + pfree(txn->invalidations_distributed); + txn->invalidations_distributed = NULL; + txn->ninvalidations_distributed = 0; + } + } + else + ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed, + &txn->ninvalidations_distributed, + msgs, nmsgs); } - change = ReorderBufferAllocChange(rb); - change->action = REORDER_BUFFER_CHANGE_INVALIDATION; - change->data.inval.ninvalidations = nmsgs; - change->data.inval.invalidations = (SharedInvalidationMessage *) - palloc(sizeof(SharedInvalidationMessage) * nmsgs); - memcpy(change->data.inval.invalidations, msgs, - sizeof(SharedInvalidationMessage) * nmsgs); - - ReorderBufferQueueChange(rb, xid, lsn, change, false); + /* Queue the invalidation messages into the transaction */ + ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs); MemoryContextSwitchTo(oldcontext); } diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 0d7bddbe4ed..adf18c397db 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -794,6 +794,13 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact * contents built by the current transaction even after its decoding, * which should have been invalidated due to concurrent catalog * changing transaction. + * + * Distribute only the invalidation messages generated by the current + * committed transaction. Invalidation messages received from other + * transactions would have already been propagated to the relevant + * in-progress transactions. This transaction would have processed + * those invalidations, ensuring that subsequent transactions observe + * a consistent cache state. */ if (txn->xid != xid) { @@ -807,8 +814,9 @@ SnapBuildDistributeSnapshotAndInval(SnapBuild *builder, XLogRecPtr lsn, Transact { Assert(msgs != NULL); - ReorderBufferAddInvalidations(builder->reorder, txn->xid, lsn, - ninvalidations, msgs); + ReorderBufferAddDistributedInvalidations(builder->reorder, + txn->xid, lsn, + ninvalidations, msgs); } } } diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 600b87fa9cb..c64f020742f 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -424,6 +424,7 @@ ReplicationSlotCreate(const char *name, bool db_specific, slot->candidate_restart_valid = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->last_saved_confirmed_flush = InvalidXLogRecPtr; + slot->last_saved_restart_lsn = InvalidXLogRecPtr; slot->inactive_since = 0; /* @@ -1165,20 +1166,41 @@ ReplicationSlotsComputeRequiredLSN(void) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; + XLogRecPtr last_saved_restart_lsn; bool invalidated; + ReplicationSlotPersistency persistency; if (!s->in_use) continue; SpinLockAcquire(&s->mutex); + persistency = s->data.persistency; restart_lsn = s->data.restart_lsn; invalidated = s->data.invalidated != RS_INVAL_NONE; + last_saved_restart_lsn = s->last_saved_restart_lsn; SpinLockRelease(&s->mutex); /* invalidated slots need not apply */ if (invalidated) continue; + /* + * For persistent slot use last_saved_restart_lsn to compute the + * oldest LSN for removal of WAL segments. The segments between + * last_saved_restart_lsn and restart_lsn might be needed by a + * persistent slot in the case of database crash. Non-persistent + * slots can't survive the database crash, so we don't care about + * last_saved_restart_lsn for them. + */ + if (persistency == RS_PERSISTENT) + { + if (last_saved_restart_lsn != InvalidXLogRecPtr && + restart_lsn > last_saved_restart_lsn) + { + restart_lsn = last_saved_restart_lsn; + } + } + if (restart_lsn != InvalidXLogRecPtr && (min_required == InvalidXLogRecPtr || restart_lsn < min_required)) @@ -1216,7 +1238,9 @@ ReplicationSlotsComputeLogicalRestartLSN(void) { ReplicationSlot *s; XLogRecPtr restart_lsn; + XLogRecPtr last_saved_restart_lsn; bool invalidated; + ReplicationSlotPersistency persistency; s = &ReplicationSlotCtl->replication_slots[i]; @@ -1230,14 +1254,33 @@ ReplicationSlotsComputeLogicalRestartLSN(void) /* read once, it's ok if it increases while we're checking */ SpinLockAcquire(&s->mutex); + persistency = s->data.persistency; restart_lsn = s->data.restart_lsn; invalidated = s->data.invalidated != RS_INVAL_NONE; + last_saved_restart_lsn = s->last_saved_restart_lsn; SpinLockRelease(&s->mutex); /* invalidated slots need not apply */ if (invalidated) continue; + /* + * For persistent slot use last_saved_restart_lsn to compute the + * oldest LSN for removal of WAL segments. The segments between + * last_saved_restart_lsn and restart_lsn might be needed by a + * persistent slot in the case of database crash. Non-persistent + * slots can't survive the database crash, so we don't care about + * last_saved_restart_lsn for them. + */ + if (persistency == RS_PERSISTENT) + { + if (last_saved_restart_lsn != InvalidXLogRecPtr && + restart_lsn > last_saved_restart_lsn) + { + restart_lsn = last_saved_restart_lsn; + } + } + if (restart_lsn == InvalidXLogRecPtr) continue; @@ -1455,6 +1498,7 @@ ReplicationSlotReserveWal(void) Assert(slot != NULL); Assert(slot->data.restart_lsn == InvalidXLogRecPtr); + Assert(slot->last_saved_restart_lsn == InvalidXLogRecPtr); /* * The replication slot mechanism is used to prevent removal of required @@ -1766,6 +1810,8 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, */ SpinLockAcquire(&s->mutex); + Assert(s->data.restart_lsn >= s->last_saved_restart_lsn); + restart_lsn = s->data.restart_lsn; /* we do nothing if the slot is already invalid */ @@ -1835,7 +1881,10 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes, * just rely on .invalidated. */ if (invalidation_cause == RS_INVAL_WAL_REMOVED) + { s->data.restart_lsn = InvalidXLogRecPtr; + s->last_saved_restart_lsn = InvalidXLogRecPtr; + } /* Let caller know */ *invalidated = true; @@ -2079,6 +2128,12 @@ CheckPointReplicationSlots(bool is_shutdown) SaveSlotToPath(s, path, LOG); } LWLockRelease(ReplicationSlotAllocationLock); + + /* + * Recompute the required LSN as SaveSlotToPath() updated + * last_saved_restart_lsn for slots. + */ + ReplicationSlotsComputeRequiredLSN(); } /* @@ -2354,6 +2409,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) if (!slot->just_dirtied) slot->dirty = false; slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush; + slot->last_saved_restart_lsn = cp.slotdata.restart_lsn; SpinLockRelease(&slot->mutex); LWLockRelease(&slot->io_in_progress_lock); @@ -2569,6 +2625,7 @@ RestoreSlotFromDisk(const char *name) slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->last_saved_confirmed_flush = cp.slotdata.confirmed_flush; + slot->last_saved_restart_lsn = cp.slotdata.restart_lsn; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index 6c6c0a908e2..3643f27ad6e 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -556,6 +556,13 @@ bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state) { *state = ioh->state; + + /* + * Ensure that we don't see an earlier state of the handle than ioh->state + * due to compiler or CPU reordering. This protects both ->generation as + * directly used here, and other fields in the handle accessed in the + * caller if the handle was not reused. + */ pg_read_barrier(); return ioh->generation != ref_generation; @@ -773,7 +780,12 @@ pgaio_io_wait_for_free(void) * Note that no interrupts are processed between the state check * and the call to reclaim - that's important as otherwise an * interrupt could have already reclaimed the handle. + * + * Need to ensure that there's no reordering, in the more common + * paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). */ + pg_read_barrier(); pgaio_io_reclaim(ioh); reclaimed++; } @@ -852,7 +864,12 @@ pgaio_io_wait_for_free(void) * check and the call to reclaim - that's important as * otherwise an interrupt could have already reclaimed the * handle. + * + * Need to ensure that there's no reordering, in the more + * common paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). */ + pg_read_barrier(); pgaio_io_reclaim(ioh); break; } diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c index 0ad9795bb7e..03c9bba0802 100644 --- a/src/backend/storage/aio/aio_callback.c +++ b/src/backend/storage/aio/aio_callback.c @@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_shared(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } ioh->distilled_result = result; @@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) /* start with distilled result from shared callback */ result = ioh->distilled_result; + Assert(result.status != PGAIO_RS_UNKNOWN); for (int i = ioh->num_callbacks; i > 0; i--) { @@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_local(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } /* diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index cc312b641ca..b78048328e1 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -400,9 +400,9 @@ pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation) while (true) { pgaio_debug_io(DEBUG3, ioh, - "wait_one io_gen: %llu, ref_gen: %llu, cycle %d", - (long long unsigned) ioh->generation, - (long long unsigned) ref_generation, + "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d", + ioh->generation, + ref_generation, waited); if (pgaio_io_was_recycled(ioh, ref_generation, &state) || diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 743cccc2acd..36be179678d 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -461,7 +461,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) int nwakeups = 0; int worker; - /* Try to get a job to do. */ + /* + * Try to get a job to do. + * + * The lwlock acquisition also provides the necessary memory barrier + * to ensure that we don't see an outdated data in the handle. + */ LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX) { diff --git a/src/backend/utils/adt/inet_net_pton.c b/src/backend/utils/adt/inet_net_pton.c index 3b0db2a3799..ef2236d9f04 100644 --- a/src/backend/utils/adt/inet_net_pton.c +++ b/src/backend/utils/adt/inet_net_pton.c @@ -115,7 +115,8 @@ inet_cidr_pton_ipv4(const char *src, u_char *dst, size_t size) src++; /* skip x or X. */ while ((ch = *src++) != '\0' && isxdigit((unsigned char) ch)) { - ch = pg_ascii_tolower((unsigned char) ch); + if (isupper((unsigned char) ch)) + ch = tolower((unsigned char) ch); n = strchr(xdigits, ch) - xdigits; assert(n >= 0 && n <= 15); if (dirty == 0) diff --git a/src/backend/utils/adt/mcxtfuncs.c b/src/backend/utils/adt/mcxtfuncs.c index 396c2f223b4..fe6dce9cba3 100644 --- a/src/backend/utils/adt/mcxtfuncs.c +++ b/src/backend/utils/adt/mcxtfuncs.c @@ -38,7 +38,7 @@ typedef struct MemoryContextId { MemoryContext context; int context_id; -} MemoryContextId; +} MemoryContextId; /* * int_list_to_array |