aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtutils.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtutils.c')
-rw-r--r--src/backend/access/nbtree/nbtutils.c390
1 files changed, 146 insertions, 244 deletions
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..9aed207995f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -44,7 +44,6 @@ static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *arra
static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
bool *skip_array_set);
-static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
bool readpagetup, int sktrig, bool *scanBehind);
@@ -52,7 +51,6 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
int sktrig, bool sktrig_required);
#ifdef USE_ASSERT_CHECKING
-static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
#endif
static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
@@ -1035,73 +1033,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
}
/*
- * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required
- *
- * Called when _bt_advance_array_keys decides to start a new primitive index
- * scan on the basis of the current scan position being before the position
- * that _bt_first is capable of repositioning the scan to by applying an
- * inequality operator required in the opposite-to-scan direction only.
- *
- * Although equality strategy scan keys (for both arrays and non-arrays alike)
- * are either marked required in both directions or in neither direction,
- * there is a sense in which non-required arrays behave like required arrays.
- * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
- * the scan key on "c" is non-required, but nevertheless enables positioning
- * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
- * first descent of the tree by _bt_first. Later on, there could also be a
- * second descent, that places the scan right before tuples >= "(200, 3, 5)".
- * _bt_first must never be allowed to build an insertion scan key whose "c"
- * entry is set to a value other than 5, the "c" array's first element/value.
- * (Actually, it's the first in the current scan direction. This example uses
- * a forward scan.)
- *
- * Calling here resets the array scan key elements for the scan's non-required
- * arrays. This is strictly necessary for correctness in a subset of cases
- * involving "required in opposite direction"-triggered primitive index scans.
- * Not all callers are at risk of _bt_first using a non-required array like
- * this, but advancement always resets the arrays when another primitive scan
- * is scheduled, just to keep things simple. Array advancement even makes
- * sure to reset non-required arrays during scans that have no inequalities.
- * (Advancement still won't call here when there are no inequalities, though
- * that's just because it's all handled indirectly instead.)
- *
- * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
- * everybody got this right.
- *
- * Note: In practice almost all SAOP arrays are marked required during
- * preprocessing (if necessary by generating skip arrays). It is hardly ever
- * truly necessary to call here, but consistently doing so is simpler.
- */
-static void
-_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
-{
- Relation rel = scan->indexRelation;
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int arrayidx = 0;
-
- for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
- {
- ScanKey cur = so->keyData + ikey;
- BTArrayKeyInfo *array = NULL;
-
- if (!(cur->sk_flags & SK_SEARCHARRAY) ||
- cur->sk_strategy != BTEqualStrategyNumber)
- continue;
-
- array = &so->arrayKeys[arrayidx++];
- Assert(array->scan_key == ikey);
-
- if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
- continue;
-
- Assert(array->num_elems != -1); /* No non-required skip arrays */
-
- _bt_array_set_low_or_high(rel, cur, array,
- ScanDirectionIsForward(dir));
- }
-}
-
-/*
* _bt_tuple_before_array_skeys() -- too early to advance required arrays?
*
* We always compare the tuple using the current array keys (which we assume
@@ -1380,8 +1311,6 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
*/
if (so->needPrimScan)
{
- Assert(_bt_verify_arrays_bt_first(scan, dir));
-
/*
* Flag was set -- must call _bt_first again, which will reset the
* scan's needPrimScan flag
@@ -2007,14 +1936,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
*/
else if (has_required_opposite_direction_only && pstate->finaltup &&
unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
- {
- /*
- * Make sure that any SAOP arrays that were not marked required by
- * preprocessing are reset to their first element for this direction
- */
- _bt_rewind_nonrequired_arrays(scan, dir);
goto new_prim_scan;
- }
continue_scan:
@@ -2045,8 +1967,6 @@ continue_scan:
*/
so->oppositeDirCheck = has_required_opposite_direction_only;
- _bt_rewind_nonrequired_arrays(scan, dir);
-
/*
* skip by setting "look ahead" mechanism's offnum for forwards scans
* (backwards scans check scanBehind flag directly instead)
@@ -2143,48 +2063,6 @@ end_toplevel_scan:
#ifdef USE_ASSERT_CHECKING
/*
- * Verify that the scan's qual state matches what we expect at the point that
- * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
- *
- * We enforce a rule against non-required array scan keys: they must start out
- * with whatever element is the first for the scan's current scan direction.
- * See _bt_rewind_nonrequired_arrays comments for an explanation.
- */
-static bool
-_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
-{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int arrayidx = 0;
-
- for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
- {
- ScanKey cur = so->keyData + ikey;
- BTArrayKeyInfo *array = NULL;
- int first_elem_dir;
-
- if (!(cur->sk_flags & SK_SEARCHARRAY) ||
- cur->sk_strategy != BTEqualStrategyNumber)
- continue;
-
- array = &so->arrayKeys[arrayidx++];
-
- if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
- ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
- continue;
-
- if (ScanDirectionIsForward(dir))
- first_elem_dir = 0;
- else
- first_elem_dir = array->num_elems - 1;
-
- if (array->cur_elem != first_elem_dir)
- return false;
- }
-
- return _bt_verify_keys_with_arraykeys(scan);
-}
-
-/*
* Verify that the scan's "so->keyData[]" scan keys are in agreement with
* its array key state
*/
@@ -2194,6 +2072,7 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
BTScanOpaque so = (BTScanOpaque) scan->opaque;
int last_sk_attno = InvalidAttrNumber,
arrayidx = 0;
+ bool nonrequiredseen = false;
if (!so->qual_ok)
return false;
@@ -2217,8 +2096,16 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
if (array->num_elems != -1 &&
cur->sk_argument != array->elem_values[array->cur_elem])
return false;
- if (last_sk_attno > cur->sk_attno)
- return false;
+ if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
+ {
+ if (last_sk_attno > cur->sk_attno)
+ return false;
+ if (nonrequiredseen)
+ return false;
+ }
+ else
+ nonrequiredseen = true;
+
last_sk_attno = cur->sk_attno;
}
@@ -2551,37 +2438,12 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
{
/* Scan key isn't marked required (corner case) */
- Assert(!(key->sk_flags & SK_ROW_HEADER));
break; /* unsafe */
}
if (key->sk_flags & SK_ROW_HEADER)
{
- /*
- * RowCompare inequality.
- *
- * Only the first subkey from a RowCompare can ever be marked
- * required (that happens when the row header is marked required).
- * There is no simple, general way for us to transitively deduce
- * whether or not every tuple on the page satisfies a RowCompare
- * key based only on firsttup and lasttup -- so we just give up.
- */
- if (!start_past_saop_eq && !so->skipScan)
- break; /* unsafe to go further */
-
- /*
- * We have to be even more careful with RowCompares that come
- * after an array: we assume it's unsafe to even bypass the array.
- * Calling _bt_start_array_keys to recover the scan's arrays
- * following use of forcenonrequired mode isn't compatible with
- * _bt_check_rowcompare's continuescan=false behavior with NULL
- * row compare members. _bt_advance_array_keys must not make a
- * decision on the basis of a key not being satisfied in the
- * opposite-to-scan direction until the scan reaches a leaf page
- * where the same key begins to be satisfied in scan direction.
- * The _bt_first !used_all_subkeys behavior makes this limitation
- * hard to work around some other way.
- */
- return; /* completely unsafe to set pstate.startikey */
+ /* RowCompare inequalities currently aren't supported */
+ break; /* "unsafe" */
}
if (key->sk_strategy != BTEqualStrategyNumber)
{
@@ -3078,6 +2940,31 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ /* When a NULL row member is compared, the row never matches */
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ /*
+ * Unlike the simple-scankey case, this isn't a disallowed case
+ * (except when it's the first row element that has the NULL arg).
+ * But it can never match. If all the earlier row comparison
+ * columns are required for the scan direction, we can stop the
+ * scan, because there can't be another tuple that will succeed.
+ */
+ Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
+ subkey--;
+ if (forcenonrequired)
+ {
+ /* treating scan's keys as non-required */
+ }
+ else if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ return false;
+ }
+
if (subkey->sk_attno > tupnatts)
{
/*
@@ -3087,11 +2974,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(BTreeTupleIsPivot(tuple));
- cmpresult = 0;
- if (subkey->sk_flags & SK_ROW_END)
- break;
- subkey++;
- continue;
+ return true;
}
datum = index_getattr(tuple,
@@ -3101,6 +2984,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
if (isNull)
{
+ int reqflags;
+
if (forcenonrequired)
{
/* treating scan's keys as non-required */
@@ -3111,15 +2996,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* Since NULLs are sorted before non-NULLs, we know we have
* reached the lower limit of the range of values for this
* index attr. On a backward scan, we can stop if this qual
- * is one of the "must match" subset. We can stop regardless
- * of whether the qual is > or <, so long as it's required,
- * because it's not possible for any future tuples to pass. On
- * a forward scan, however, we must keep going, because we may
- * have initially positioned to the start of the index.
- * (_bt_advance_array_keys also relies on this behavior during
- * forward scans.)
+ * is one of the "must match" subset. However, on a forwards
+ * scan, we must keep going, because we may have initially
+ * positioned to the start of the index.
+ *
+ * All required NULLS FIRST > row members can use NULL tuple
+ * values to end backwards scans, just like with other values.
+ * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a
+ * backwards scan upon reaching the index's rightmost "a = 9"
+ * tuple whose "b" column contains a NULL (if not sooner).
+ * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42.
*/
- if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ reqflags = SK_BT_REQBKWD;
+
+ /*
+ * When a most significant required NULLS FIRST < row compare
+ * member sees NULL tuple values during a backwards scan, it
+ * signals the end of matches for the whole row compare/scan.
+ * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a
+ * backwards scan upon reaching the rightmost tuple whose "a"
+ * column has a NULL. The "a" NULL value is "<" 9, and yet
+ * our < row compare will still end the scan. (This isn't
+ * safe with later/lower-order row members. Notice that it
+ * can only happen with an "a" NULL some time after the scan
+ * completely stops needing to use its "b" and "c" members.)
+ */
+ if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+ reqflags |= SK_BT_REQFWD; /* safe, first row member */
+
+ if ((subkey->sk_flags & reqflags) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
}
@@ -3129,15 +3034,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* Since NULLs are sorted after non-NULLs, we know we have
* reached the upper limit of the range of values for this
* index attr. On a forward scan, we can stop if this qual is
- * one of the "must match" subset. We can stop regardless of
- * whether the qual is > or <, so long as it's required,
- * because it's not possible for any future tuples to pass. On
- * a backward scan, however, we must keep going, because we
- * may have initially positioned to the end of the index.
- * (_bt_advance_array_keys also relies on this behavior during
- * backward scans.)
+ * one of the "must match" subset. However, on a backward
+ * scan, we must keep going, because we may have initially
+ * positioned to the end of the index.
+ *
+ * All required NULLS LAST < row members can use NULL tuple
+ * values to end forwards scans, just like with other values.
+ * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a
+ * forwards scan upon reaching the index's leftmost "a = 9"
+ * tuple whose "b" column contains a NULL (if not sooner).
+ * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42.
+ */
+ reqflags = SK_BT_REQFWD;
+
+ /*
+ * When a most significant required NULLS LAST > row compare
+ * member sees NULL tuple values during a forwards scan, it
+ * signals the end of matches for the whole row compare/scan.
+ * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a
+ * forwards scan upon reaching the leftmost tuple whose "a"
+ * column has a NULL. The "a" NULL value is ">" 9, and yet
+ * our > row compare will end the scan. (This isn't safe with
+ * later/lower-order row members. Notice that it can only
+ * happen with an "a" NULL some time after the scan completely
+ * stops needing to use its "b" and "c" members.)
*/
- if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+ reqflags |= SK_BT_REQBKWD; /* safe, first row member */
+
+ if ((subkey->sk_flags & reqflags) &&
ScanDirectionIsForward(dir))
*continuescan = false;
}
@@ -3148,30 +3073,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
return false;
}
- if (subkey->sk_flags & SK_ISNULL)
- {
- /*
- * Unlike the simple-scankey case, this isn't a disallowed case
- * (except when it's the first row element that has the NULL arg).
- * But it can never match. If all the earlier row comparison
- * columns are required for the scan direction, we can stop the
- * scan, because there can't be another tuple that will succeed.
- */
- Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
- subkey--;
- if (forcenonrequired)
- {
- /* treating scan's keys as non-required */
- }
- else if ((subkey->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
- *continuescan = false;
- else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
- ScanDirectionIsBackward(dir))
- *continuescan = false;
- return false;
- }
-
/* Perform the test --- three-way comparison not bool operator */
cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
subkey->sk_collation,
@@ -3330,87 +3231,85 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
* current page and killed tuples thereon (generally, this should only be
* called if so->numKilled > 0).
*
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer. Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
- *
- * We match items by heap TID before assuming they are the right ones to
- * delete. We cope with cases where items have moved right due to insertions.
- * If an item has moved off the current page due to a split, we'll fail to
- * find it and do nothing (this is not an error case --- we assume the item
- * will eventually get marked in a future indexscan).
+ * Caller should not have a lock on the so->currPos page, but must hold a
+ * buffer pin when !so->dropPin. When we return, it still won't be locked.
+ * It'll continue to hold whatever pins were held before calling here.
*
- * Note that if we hold a pin on the target page continuously from initially
- * reading the items until applying this function, VACUUM cannot have deleted
- * any items from the page, and so there is no need to search left from the
- * recorded offset. (This observation also guarantees that the item is still
- * the right one to delete, which might otherwise be questionable since heap
- * TIDs can get recycled.) This holds true even if the page has been modified
- * by inserts and page splits, so there is no need to consult the LSN.
+ * We match items by heap TID before assuming they are the right ones to set
+ * LP_DEAD. If the scan is one that holds a buffer pin on the target page
+ * continuously from initially reading the items until applying this function
+ * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
+ * page, so the page's TIDs can't have been recycled by now. There's no risk
+ * that we'll confuse a new index tuple that happens to use a recycled TID
+ * with a now-removed tuple with the same TID (that used to be on this same
+ * page). We can't rely on that during scans that drop buffer pins eagerly
+ * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * the page LSN having not changed since back when _bt_readpage saw the page.
+ * We totally give up on setting LP_DEAD bits when the page LSN changed.
*
- * If the pin was released after reading the page, then we re-read it. If it
- * has been modified since we read it (as determined by the LSN), we dare not
- * flag any entries because it is possible that the old entry was vacuumed
- * away and the TID was re-used by a completely different heap tuple.
+ * We give up much less often during !so->dropPin scans, but it still happens.
+ * We cope with cases where items have moved right due to insertions. If an
+ * item has moved off the current page due to a split, we'll fail to find it
+ * and just give up on it.
*/
void
_bt_killitems(IndexScanDesc scan)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int i;
int numKilled = so->numKilled;
bool killedsomething = false;
- bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
+ Buffer buf;
+ Assert(numKilled > 0);
Assert(BTScanPosIsValid(so->currPos));
+ Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
- /*
- * Always reset the scan state, so we don't look for same items on other
- * pages.
- */
+ /* Always invalidate so->killedItems[] before leaving so->currPos */
so->numKilled = 0;
- if (BTScanPosIsPinned(so->currPos))
+ if (!so->dropPin)
{
/*
* We have held the pin on this page since we read the index tuples,
* so all we need to do is lock it. The pin will have prevented
- * re-use of any TID on the page, so there is no need to check the
- * LSN.
+ * concurrent VACUUMs from recycling any of the TIDs on the page.
*/
- droppedpin = false;
- _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
-
- page = BufferGetPage(so->currPos.buf);
+ Assert(BTScanPosIsPinned(so->currPos));
+ buf = so->currPos.buf;
+ _bt_lockbuf(rel, buf, BT_READ);
}
else
{
- Buffer buf;
+ XLogRecPtr latestlsn;
- droppedpin = true;
- /* Attempt to re-read the buffer, getting pin and lock. */
- buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+ Assert(!BTScanPosIsPinned(so->currPos));
+ Assert(RelationNeedsWAL(rel));
+ buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
- page = BufferGetPage(buf);
- if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
- so->currPos.buf = buf;
- else
+ latestlsn = BufferGetLSNAtomic(buf);
+ Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+ Assert(so->currPos.lsn <= latestlsn);
+ if (so->currPos.lsn != latestlsn)
{
- /* Modified while not pinned means hinting is not safe. */
- _bt_relbuf(scan->indexRelation, buf);
+ /* Modified, give up on hinting */
+ _bt_relbuf(rel, buf);
return;
}
+
+ /* Unmodified, hinting is safe */
}
+ page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
- for (i = 0; i < numKilled; i++)
+ for (int i = 0; i < numKilled; i++)
{
int itemIndex = so->killedItems[i];
BTScanPosItem *kitem = &so->currPos.items[itemIndex];
@@ -3442,7 +3341,7 @@ _bt_killitems(IndexScanDesc scan)
* correctness.
*
* Note that the page may have been modified in almost any way
- * since we first read it (in the !droppedpin case), so it's
+ * since we first read it (in the !so->dropPin case), so it's
* possible that this posting list tuple wasn't a posting list
* tuple when we first encountered its heap TIDs.
*/
@@ -3458,7 +3357,7 @@ _bt_killitems(IndexScanDesc scan)
* though only in the common case where the page can't
* have been concurrently modified
*/
- Assert(kitem->indexOffset == offnum || !droppedpin);
+ Assert(kitem->indexOffset == offnum || !so->dropPin);
/*
* Read-ahead to later kitems here.
@@ -3522,10 +3421,13 @@ _bt_killitems(IndexScanDesc scan)
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
- MarkBufferDirtyHint(so->currPos.buf, true);
+ MarkBufferDirtyHint(buf, true);
}
- _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!so->dropPin)
+ _bt_unlockbuf(rel, buf);
+ else
+ _bt_relbuf(rel, buf);
}