aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/README31
-rw-r--r--src/backend/access/nbtree/nbtinsert.c7
-rw-r--r--src/backend/access/nbtree/nbtpage.c4
-rw-r--r--src/backend/access/nbtree/nbtsearch.c66
-rw-r--r--src/backend/access/nbtree/nbtutils.c17
5 files changed, 78 insertions, 47 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index b5033628348..76bbe01730a 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8 2003/11/29 19:51:40 pgsql Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -325,15 +325,26 @@ work sometimes, but could cause failures later on depending on
what else gets put on their page.
"ScanKey" data structures are used in two fundamentally different ways
-in this code. Searches for the initial position for a scan, as well as
-insertions, use scankeys in which the comparison function is a 3-way
-comparator (<0, =0, >0 result). These scankeys are built within the
-btree code (eg, by _bt_mkscankey()) and used by _bt_compare(). Once we
-are positioned, sequential examination of tuples in a scan is done by
-_bt_checkkeys() using scankeys in which the comparison functions return
-booleans --- for example, int4lt might be used. These scankeys are the
-ones originally passed in from outside the btree code. Same
-representation, but different comparison functions!
+in this code, which we describe as "search" scankeys and "insertion"
+scankeys. A search scankey is the kind passed to btbeginscan() or
+btrescan() from outside the btree code. The sk_func pointers in a search
+scankey point to comparison functions that return boolean, such as int4lt.
+There might be more than one scankey entry for a given index column, or
+none at all. (We require the keys to appear in index column order, but
+the order of multiple keys for a given column is unspecified.) An
+insertion scankey uses the same array-of-ScanKey data structure, but the
+sk_func pointers point to btree comparison support functions (ie, 3-way
+comparators that return int4 values interpreted as <0, =0, >0). In an
+insertion scankey there is exactly one entry per index column. Insertion
+scankeys are built within the btree code (eg, by _bt_mkscankey()) and are
+used to locate the starting point of a scan, as well as for locating the
+place to insert a new index tuple. (Note: in the case of an insertion
+scankey built from a search scankey, there might be fewer keys than
+index columns, indicating that we have no constraints for the remaining
+index columns.) After we have located the starting point of a scan, the
+original search scankey is consulted as each index entry is sequentially
+scanned to decide whether to return the entry and whether the scan can
+stop (see _bt_checkkeys()).
Notes about data representation
-------------------------------
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index f634889219b..c88b6a9fd2b 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.130 2006/01/11 08:43:11 neilc Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.131 2006/01/17 00:09:00 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -80,7 +80,7 @@ _bt_doinsert(Relation rel, BTItem btitem,
BTStack stack;
Buffer buf;
- /* we need a scan key to do our search, so build one */
+ /* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, itup);
top:
@@ -331,7 +331,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
* If 'afteritem' is >0 then the new tuple must be inserted after the
* existing item of that number, noplace else. If 'afteritem' is 0
* then the procedure finds the exact spot to insert it by searching.
- * (keysz and scankey parameters are used ONLY if afteritem == 0.)
+ * (keysz and scankey parameters are used ONLY if afteritem == 0.
+ * The scankey must be an insertion-type scankey.)
*
* NOTE: if the new key is equal to one or more existing keys, we can
* legitimately place it anywhere in the series of equal keys --- in fact,
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index c356dc082f0..9827971603f 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.90 2005/11/22 18:17:06 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.91 2006/01/17 00:09:01 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -813,7 +813,7 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
* better drop the target page lock first.
*/
_bt_relbuf(rel, buf);
- /* we need a scan key to do our search, so build one */
+ /* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup));
/* find the leftmost leaf page containing this key */
stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 4cd79997292..904629af30d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.99 2005/12/07 19:37:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.100 2006/01/17 00:09:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -29,6 +29,9 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* _bt_search() -- Search the tree for a particular scankey,
* or more precisely for the first leaf page it could be on.
*
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
* When nextkey is false (the usual case), we are looking for the first
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
@@ -127,15 +130,18 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* data that appeared on the page originally is either on the page
* or strictly to the right of it.
*
- * When nextkey is false (the usual case), we are looking for the first
- * item >= scankey. When nextkey is true, we are looking for the first
- * item strictly greater than scankey.
- *
* This routine decides whether or not we need to move right in the
* tree by examining the high key entry on the page. If that entry
* is strictly less than the scankey, or <= the scankey in the nextkey=true
* case, then we followed the wrong link and we need to move right.
*
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
+ * When nextkey is false (the usual case), we are looking for the first
+ * item >= scankey. When nextkey is true, we are looking for the first
+ * item strictly greater than scankey.
+ *
* On entry, we have the buffer pinned and a lock of the type specified by
* 'access'. If we move right, we release the buffer and lock and acquire
* the same on the right sibling. Return value is the buffer we stop at.
@@ -194,14 +200,13 @@ _bt_moveright(Relation rel,
/*
* _bt_binsrch() -- Do a binary search for a key on a particular page.
*
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
* When nextkey is false (the usual case), we are looking for the first
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
*
- * The scankey we get has the compare function stored in the procedure
- * entry of each data struct. We invoke this regproc to do the
- * comparison for every key in the scankey.
- *
* On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
* key >= given scankey, or > scankey if nextkey is true. (NOTE: in
* particular, this means it is possible to return a value 1 greater than the
@@ -301,8 +306,11 @@ _bt_binsrch(Relation rel,
/*----------
* _bt_compare() -- Compare scankey to a particular tuple on the page.
*
+ * The passed scankey must be an insertion-type scankey (see nbtree/README),
+ * but it can omit the rightmost column(s) of the index.
+ *
* keysz: number of key conditions to be checked (might be less than the
- * total length of the scan key!)
+ * number of index columns!)
* page/offnum: location of btree item to be compared to.
*
* This routine returns:
@@ -464,12 +472,17 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
/*
* _bt_first() -- Find the first item in a scan.
*
- * We need to be clever about the type of scan, the operation it's
- * performing, and the tree ordering. We find the
- * first item in the tree that satisfies the qualification
- * associated with the scan descriptor. On exit, the page containing
+ * We need to be clever about the direction of scan, the search
+ * conditions, and the tree ordering. We find the first item (or,
+ * if backwards scan, the last item) in the tree that satisfies the
+ * qualifications in the scan key. On exit, the page containing
* the current index tuple is read locked and pinned, and the scan's
* opaque data entry is updated to include the buffer.
+ *
+ * Note that scan->keyData[], and the so->keyData[] scankey built from it,
+ * are both search-type scankeys (see nbtree/README for more about this).
+ * Within this routine, we build a temporary insertion-type scankey to use
+ * in locating the scan start position.
*/
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
@@ -537,6 +550,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* equality quals survive preprocessing, however, it doesn't matter which
* one we use --- by definition, they are either redundant or
* contradictory.
+ *
+ * The selected scan keys (at most one per index column) are remembered by
+ * storing their addresses into the local startKeys[] array.
*----------
*/
strat_total = BTEqualStrategyNumber;
@@ -631,9 +647,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
return _bt_endpoint(scan, dir);
/*
- * We want to start the scan somewhere within the index. Set up a
- * 3-way-comparison scankey we can use to search for the boundary point we
- * identified above.
+ * We want to start the scan somewhere within the index. Set up an
+ * insertion scankey we can use to search for the boundary point we
+ * identified above. The insertion scankey is built in the local
+ * scankeys[] array, using the keys identified by startKeys[].
*/
Assert(keysCount <= INDEX_MAX_KEYS);
for (i = 0; i < keysCount; i++)
@@ -681,19 +698,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
}
- /*
+ /*----------
* Examine the selected initial-positioning strategy to determine exactly
* where we need to start the scan, and set flag variables to control the
* code below.
*
* If nextkey = false, _bt_search and _bt_binsrch will locate the first
- * item >= scan key. If nextkey = true, they will locate the first item >
- * scan key.
+ * item >= scan key. If nextkey = true, they will locate the first
+ * item > scan key.
*
- * If goback = true, we will then step back one item, while if goback =
- * false, we will start the scan on the located item.
+ * If goback = true, we will then step back one item, while if
+ * goback = false, we will start the scan on the located item.
*
* it's yet other place to add some code later for is(not)null ...
+ *----------
*/
switch (strat_total)
{
@@ -774,8 +792,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/*
- * Use the manufactured scan key to descend the tree and position
- * ourselves on the target leaf page.
+ * Use the manufactured insertion scan key to descend the tree and
+ * position ourselves on the target leaf page.
*/
stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7a18fcf0abb..ddd59444e28 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.67 2005/12/07 19:37:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.68 2006/01/17 00:09:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -23,7 +23,7 @@
/*
* _bt_mkscankey
- * Build a scan key that contains comparison data from itup
+ * Build an insertion scan key that contains comparison data from itup
* as well as comparator routines appropriate to the key datatypes.
*
* The result is intended for use with _bt_compare().
@@ -67,11 +67,12 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
/*
* _bt_mkscankey_nodata
- * Build a scan key that contains comparator routines appropriate to
- * the key datatypes, but no comparison data. The comparison data
- * ultimately used must match the key datatypes.
+ * Build an insertion scan key that contains 3-way comparator routines
+ * appropriate to the key datatypes, but no comparison data. The
+ * comparison data ultimately used must match the key datatypes.
*
- * The result cannot be used with _bt_compare(). Currently this
+ * The result cannot be used with _bt_compare(), unless comparison
+ * data is first stored into the key entries. Currently this
* routine is only called by nbtsort.c and tuplesort.c, which have
* their own comparison routines.
*/
@@ -160,7 +161,7 @@ _bt_formitem(IndexTuple itup)
/*----------
* _bt_preprocess_keys() -- Preprocess scan keys
*
- * The caller-supplied keys (in scan->keyData[]) are copied to
+ * The caller-supplied search-type keys (in scan->keyData[]) are copied to
* so->keyData[] with possible transformation. scan->numberOfKeys is
* the number of input keys, so->numberOfKeys gets the number of output
* keys (possibly less, never greater).
@@ -485,7 +486,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
* accordingly. See comments for _bt_preprocess_keys(), above, about how
* this is done.
*
- * scan: index scan descriptor
+ * scan: index scan descriptor (containing a search-type scankey)
* page: buffer page containing index tuple
* offnum: offset number of index tuple (must be a valid item!)
* dir: direction we are scanning in