Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/common/indexvalid.c |  37
-rw-r--r-- | src/backend/access/gist/gist.c         | 408
-rw-r--r-- | src/backend/access/gist/gistscan.c     |  30
-rw-r--r-- | src/backend/access/hash/hash.c         | 317
-rw-r--r-- | src/backend/access/hash/hashovfl.c     |  46
-rw-r--r-- | src/backend/access/hash/hashpage.c     |  49
-rw-r--r-- | src/backend/access/hash/hashscan.c     |  27
-rw-r--r-- | src/backend/access/index/indexam.c     |  60
-rw-r--r-- | src/backend/access/nbtree/Makefile     |   4
-rw-r--r-- | src/backend/access/nbtree/README       |  16
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c  |  63
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c    |  41
-rw-r--r-- | src/backend/access/nbtree/nbtree.c     | 504
-rw-r--r-- | src/backend/access/nbtree/nbtscan.c    | 224
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c  |  26
-rw-r--r-- | src/backend/access/rtree/rtree.c       | 343
-rw-r--r-- | src/backend/access/rtree/rtscan.c      |  30
-rw-r--r-- | src/backend/access/transam/xact.c      |  11
18 files changed, 1036 insertions(+), 1200 deletions(-)
diff --git a/src/backend/access/common/indexvalid.c b/src/backend/access/common/indexvalid.c index 6a7c08b4506..94e7efd522e 100644 --- a/src/backend/access/common/indexvalid.c +++ b/src/backend/access/common/indexvalid.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.26 2001/01/24 19:42:47 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/common/Attic/indexvalid.c,v 1.27 2001/07/15 22:48:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -24,12 +24,9 @@ */ int NIndexTupleProcessed; + /* ---------------- - * index_keytest - * - * old comments - * May eventually combine with other tests (like timeranges)? - * Should have Buffer buffer; as an argument and pass it to amgetattr. + * index_keytest - does this index tuple satisfy the scan key(s)? * ---------------- */ bool @@ -38,16 +35,16 @@ index_keytest(IndexTuple tuple, int scanKeySize, ScanKey key) { - bool isNull; - Datum datum; - Datum test; - IncrIndexProcessed(); while (scanKeySize > 0) { + Datum datum; + bool isNull; + Datum test; + datum = index_getattr(tuple, - key[0].sk_attno, + key->sk_attno, tupdesc, &isNull); @@ -57,25 +54,19 @@ index_keytest(IndexTuple tuple, return false; } - if (key[0].sk_flags & SK_ISNULL) + if (key->sk_flags & SK_ISNULL) return false; - if (key[0].sk_flags & SK_COMMUTE) - { - test = FunctionCall2(&key[0].sk_func, - key[0].sk_argument, datum); - } + if (key->sk_flags & SK_COMMUTE) + test = FunctionCall2(&key->sk_func, key->sk_argument, datum); else - { - test = FunctionCall2(&key[0].sk_func, - datum, key[0].sk_argument); - } + test = FunctionCall2(&key->sk_func, datum, key->sk_argument); - if (DatumGetBool(test) == !!(key[0].sk_flags & SK_NEGATE)) + if (DatumGetBool(test) == !!(key->sk_flags & SK_NEGATE)) return false; - scanKeySize -= 1; key++; + scanKeySize--; } return true; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 9d6e2040f6c..c99c4a7e6e3 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.79 2001/06/11 05:00:56 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.80 2001/07/15 22:48:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,7 +43,23 @@ #define RIGHT_ADDED 0x02 #define BOTH_ADDED ( LEFT_ADDED | RIGHT_ADDED ) + +/* Working state for gistbuild and its callback */ +typedef struct +{ + GISTSTATE giststate; + int numindexattrs; + double indtuples; +} GISTBuildState; + + /* non-export function prototypes */ +static void gistbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); static void gistdoinsert(Relation r, IndexTuple itup, InsertIndexResult *res, @@ -89,6 +105,7 @@ static void GISTInitBuffer(Buffer b, uint32 f); static OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate); +static void gistdelete(Relation r, ItemPointer tid); #ifdef GIST_PAGEADDITEM static IndexTuple gist_tuple_replacekey(Relation r, GISTENTRY entry, IndexTuple t); @@ -116,184 +133,36 @@ gistbuild(PG_FUNCTION_ARGS) Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); 
- -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); - -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - Node *pred = indexInfo->ii_Predicate; - -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; - -#endif - ExprContext *econtext; - GISTSTATE giststate; - GISTENTRY tmpcentry; - Buffer buffer = InvalidBuffer; - bool *compvec; - int i; + double reltuples; + GISTBuildState buildstate; + Buffer buffer; /* no locking is needed */ - initGISTstate(&giststate, index); + initGISTstate(&buildstate.giststate, index); /* * We expect to be called exactly once for any index relation. If * that's not the case, big trouble's what we have. */ - if (oldPred == NULL && RelationGetNumberOfBlocks(index) != 0) - elog(ERROR, "%s already contains data", RelationGetRelationName(index)); - - /* initialize the root page (if this is a new index) */ - if (oldPred == NULL) - { - buffer = ReadBuffer(index, P_NEW); - GISTInitBuffer(buffer, F_LEAF); - WriteBuffer(buffer); - } + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 - */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ + /* initialize the root page */ + buffer = ReadBuffer(index, P_NEW); + GISTInitBuffer(buffer, F_LEAF); + WriteBuffer(buffer); /* build the index */ - nhtups = nitups = 0.0; - - compvec = (bool *) palloc(sizeof(bool) * indexInfo->ii_NumIndexAttrs); - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL); - - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. 
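[Editor's note: the indexvalid.c hunk above tightens index_keytest into a pointer-walking loop over the scan keys, with the SK_COMMUTE and SK_NEGATE flags controlling argument order and result inversion. A minimal standalone model of that control flow; ScanKeyData, the SK_* flags, and the function-pointer comparison here are simplified stand-ins, not the real PostgreSQL definitions.]

#include <stdbool.h>
#include <stdio.h>

#define SK_ISNULL  0x01
#define SK_COMMUTE 0x02
#define SK_NEGATE  0x04

typedef struct
{
    int     sk_flags;
    bool  (*sk_func) (int a, int b);   /* stand-in for FunctionCall2 */
    int     sk_argument;
} ScanKeyData, *ScanKey;

static bool int_lt(int a, int b) { return a < b; }

/* Toy analogue of index_keytest: advance a key pointer instead of
 * indexing key[0], as the patch does. */
static bool
keytest(int datum, bool isNull, int scanKeySize, ScanKey key)
{
    while (scanKeySize > 0)
    {
        bool    test;

        if (isNull || (key->sk_flags & SK_ISNULL))
            return false;           /* a null never satisfies a key */

        if (key->sk_flags & SK_COMMUTE)
            test = key->sk_func(key->sk_argument, datum);
        else
            test = key->sk_func(datum, key->sk_argument);

        /* fail if the result disagrees with the NEGATE flag */
        if (test == !!(key->sk_flags & SK_NEGATE))
            return false;

        key++;
        scanKeySize--;
    }
    return true;
}

int
main(void)
{
    ScanKeyData keys[1] = {{0, int_lt, 10}};     /* "attribute < 10" */

    printf("%d\n", keytest(5, false, 1, keys));  /* 1: 5 < 10 */
    printf("%d\n", keytest(42, false, 1, keys)); /* 0 */
    return 0;
}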
- */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* immediately compress keys to normalize */ - for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) - { - gistcentryinit(&giststate, i, &tmpcentry, attdata[i], - (Relation) NULL, (Page) NULL, (OffsetNumber) 0, - -1 /* size is currently bogus */ , TRUE); - if (attdata[i] != tmpcentry.key && - !(giststate.keytypbyval)) - compvec[i] = TRUE; - else - compvec[i] = FALSE; - attdata[i] = tmpcentry.key; - } - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - itup->t_tid = htup->t_self; - - /* - * Since we already have the index relation locked, we call - * gistdoinsert directly. Normal access method calls dispatch - * through gistinsert, which locks the relation for write. This - * is the right thing to do if you're inserting single tups, but - * not when you're initializing the whole index at once. - */ - gistdoinsert(index, itup, NULL, &giststate); + buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs; + buildstate.indtuples = 0; - for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) - if (compvec[i]) - pfree(DatumGetPointer(attdata[i])); - - pfree(itup); - } + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + gistbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - - pfree(compvec); - -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); /* * Since we just counted the tuples in the heap, we update its stats @@ -313,14 +182,8 @@ gistbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) - { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); } #ifdef GISTDEBUG @@ -331,6 +194,63 @@ gistbuild(PG_FUNCTION_ARGS) } /* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +gistbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + GISTBuildState *buildstate = (GISTBuildState *) state; + IndexTuple itup; + bool compvec[INDEX_MAX_KEYS]; + GISTENTRY tmpcentry; + int i; + + /* immediately compress keys to normalize */ + for (i = 0; i < buildstate->numindexattrs; i++) + { + gistcentryinit(&buildstate->giststate, i, &tmpcentry, attdata[i], + (Relation) NULL, (Page) NULL, (OffsetNumber) 0, + -1 /* size is currently bogus */ , TRUE); + if (attdata[i] != tmpcentry.key && + !(buildstate->giststate.keytypbyval)) + compvec[i] = TRUE; + else + compvec[i] = FALSE; + attdata[i] = tmpcentry.key; + } + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + /* GIST indexes don't index nulls, see notes in gistinsert */ + if (! IndexTupleHasNulls(itup)) + { + /* + * Since we already have the index relation locked, we call + * gistdoinsert directly. Normal access method calls dispatch + * through gistinsert, which locks the relation for write. This + * is the right thing to do if you're inserting single tups, but + * not when you're initializing the whole index at once. 
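[Editor's note: the structural change in gistbuild is the one this commit applies to every access method: the AM no longer runs its own heap scan and partial-index machinery, it hands IndexBuildHeapScan a per-tuple callback plus a private state struct. A compilable toy of that inversion of control; the scan driver, tuple type, and callback signature are simplified stand-ins for the real catalog machinery.]

#include <stdio.h>

typedef struct { int id; int live; } Tuple;    /* stand-in heap tuple */

typedef void (*BuildCallback) (const Tuple *tup, void *state);

/* Stand-in for IndexBuildHeapScan: visit every heap tuple, invoke
 * the AM's callback, and return the heap tuple count for stats. */
static double
build_heap_scan(const Tuple *heap, int n, BuildCallback cb, void *state)
{
    for (int i = 0; i < n; i++)
        cb(&heap[i], state);
    return (double) n;
}

/* Per-AM working state, threaded through the void pointer the same
 * way GISTBuildState/HashBuildState/BTBuildState are. */
typedef struct { double indtuples; } BuildState;

static void
build_callback(const Tuple *tup, void *state)
{
    BuildState *bs = (BuildState *) state;

    if (!tup->live)
        return;                 /* e.g. skip entries the AM rejects */
    /* ... form an index tuple and insert it here ... */
    bs->indtuples += 1;
}

int
main(void)
{
    Tuple      heap[] = {{1, 1}, {2, 0}, {3, 1}};
    BuildState bs = {0};
    double     reltuples = build_heap_scan(heap, 3, build_callback, &bs);

    /* both counts feed UpdateStats() in the real code */
    printf("heap=%.0f indexed=%.0f\n", reltuples, bs.indtuples);
    return 0;
}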
+ */ + gistdoinsert(index, itup, NULL, &buildstate->giststate); + + buildstate->indtuples += 1; + } + + for (i = 0; i < buildstate->numindexattrs; i++) + if (compvec[i]) + pfree(DatumGetPointer(attdata[i])); + + pfree(itup); +} + +/* * gistinsert -- wrapper for GiST tuple insertion. * * This is the public interface routine for tuple insertion in GiSTs. @@ -343,25 +263,28 @@ gistinsert(PG_FUNCTION_ARGS) Datum *datum = (Datum *) PG_GETARG_POINTER(1); char *nulls = (char *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); - #ifdef NOT_USED Relation heapRel = (Relation) PG_GETARG_POINTER(4); - #endif InsertIndexResult res; IndexTuple itup; GISTSTATE giststate; GISTENTRY tmpentry; int i; - bool *compvec; + bool compvec[INDEX_MAX_KEYS]; + + /* + * Since GIST is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. + */ initGISTstate(&giststate, r); /* immediately compress keys to normalize */ - compvec = (bool *) palloc(sizeof(bool) * r->rd_att->natts); for (i = 0; i < r->rd_att->natts; i++) { - gistcentryinit(&giststate, i,&tmpentry, datum[i], + gistcentryinit(&giststate, i, &tmpentry, datum[i], (Relation) NULL, (Page) NULL, (OffsetNumber) 0, -1 /* size is currently bogus */ , TRUE); if (datum[i] != tmpentry.key && !(giststate.keytypbyval)) @@ -374,18 +297,24 @@ gistinsert(PG_FUNCTION_ARGS) itup->t_tid = *ht_ctid; /* - * Notes in ExecUtils:ExecOpenIndices() - * - * RelationSetLockForWrite(r); + * Currently, GIST indexes do not support indexing NULLs; considerable + * infrastructure work would have to be done to do anything reasonable + * with a NULL. */ + if (IndexTupleHasNulls(itup)) + { + res = NULL; + } + else + { + res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); + gistdoinsert(r, itup, &res, &giststate); + } - res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData)); - gistdoinsert(r, itup, &res, &giststate); for (i = 0; i < r->rd_att->natts; i++) if (compvec[i] == TRUE) pfree(DatumGetPointer(datum[i])); pfree(itup); - pfree(compvec); PG_RETURN_POINTER(res); } @@ -527,9 +456,7 @@ gistlayerinsert(Relation r, BlockNumber blkno, /* key is modified, so old version must be deleted */ ItemPointerSet(&oldtid, blkno, child); - DirectFunctionCall2(gistdelete, - PointerGetDatum(r), - PointerGetDatum(&oldtid)); + gistdelete(r, &oldtid); } ret = INSERTED; @@ -1416,29 +1343,31 @@ gistfreestack(GISTSTACK *s) /* -** remove an entry from a page -*/ -Datum -gistdelete(PG_FUNCTION_ARGS) + * Retail deletion of a single tuple. + * + * NB: this is no longer called externally, but is still needed by + * gistlayerinsert(). That dependency will have to be fixed if GIST + * is ever going to allow concurrent insertions. + */ +static void +gistdelete(Relation r, ItemPointer tid) { - Relation r = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); BlockNumber blkno; OffsetNumber offnum; Buffer buf; Page page; /* - * Notes in ExecUtils:ExecOpenIndices() Also note that only vacuum - * deletes index tuples now... - * - * RelationSetLockForWrite(r); + * Since GIST is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. */ blkno = ItemPointerGetBlockNumber(tid); offnum = ItemPointerGetOffsetNumber(tid); /* adjust any scans that will be affected by this deletion */ + /* NB: this works only for scans in *this* backend! 
*/ gistadjscans(r, GISTOP_DEL, blkno, offnum); /* delete the index tuple */ @@ -1448,10 +1377,93 @@ gistdelete(PG_FUNCTION_ARGS) PageIndexTupleDelete(page, offnum); WriteBuffer(buf); +} - PG_RETURN_VOID(); +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +Datum +gistbulkdelete(PG_FUNCTION_ARGS) +{ + Relation rel = (Relation) PG_GETARG_POINTER(0); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc iscan; + + tuples_removed = 0; + num_index_tuples = 0; + + /* + * Since GIST is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. + */ + + /* + * XXX generic implementation --- should be improved! + */ + + /* walk through the entire index */ + iscan = index_beginscan(rel, false, 0, (ScanKey) NULL); + + while ((res = index_getnext(iscan, ForwardScanDirection)) + != (RetrieveIndexResult) NULL) + { + ItemPointer heapptr = &res->heap_iptr; + + if (callback(heapptr, callback_state)) + { + ItemPointer indexptr = &res->index_iptr; + BlockNumber blkno; + OffsetNumber offnum; + Buffer buf; + Page page; + + blkno = ItemPointerGetBlockNumber(indexptr); + offnum = ItemPointerGetOffsetNumber(indexptr); + + /* adjust any scans that will be affected by this deletion */ + gistadjscans(rel, GISTOP_DEL, blkno, offnum); + + /* delete the index tuple */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offnum); + + WriteBuffer(buf); + + tuples_removed += 1; + } + else + num_index_tuples += 1; + + pfree(res); + } + + index_endscan(iscan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } + void initGISTstate(GISTSTATE *giststate, Relation index) { diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 672b121693a..9358692a53c 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/gist/gistscan.c,v 1.37 2001/06/28 16:00:07 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/gist/gistscan.c,v 1.38 2001/07/15 22:48:15 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -59,13 +59,8 @@ gistbeginscan(PG_FUNCTION_ARGS) ScanKey key = (ScanKey) PG_GETARG_POINTER(3); IndexScanDesc s; - /* - * Let index_beginscan does its work... - * - * RelationSetLockForRead(r); - */ - s = RelationGetIndexScan(r, fromEnd, nkeys, key); + gistregscan(s); PG_RETURN_POINTER(s); @@ -283,6 +278,27 @@ gistdropscan(IndexScanDesc s) pfree(l); } +/* + * AtEOXact_gist() --- clean up gist subsystem at xact abort or commit. + * + * This is here because it needs to touch this module's static var GISTScans. 
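[Editor's note: gistdelete above must call gistadjscans before removing the item because a scan belonging to this backend may be stopped on the page. The invariant such *adjscans routines maintain is simple: if an item at or before a scan's saved offset on the same page is deleted, the saved offset shifts left by one. A standalone sketch of that rule; the scan list here is a toy, not the real GISTScanList.]

#include <stdio.h>

typedef struct Scan
{
    int          blkno;     /* page the scan is stopped on */
    int          offnum;    /* current item offset, 1-based */
    struct Scan *next;
} Scan;

/* Toy analogue of gistadjscans/_hash_adjscans: after deleting
 * (blkno, offnum), pull back any scan of THIS backend whose saved
 * position on that page would otherwise point one item too far. */
static void
adjust_scans(Scan *scans, int blkno, int offnum)
{
    for (Scan *s = scans; s != NULL; s = s->next)
    {
        if (s->blkno == blkno && s->offnum >= offnum)
            s->offnum--;
    }
}

int
main(void)
{
    Scan s2 = {7, 2, NULL};        /* stopped before the victim */
    Scan s1 = {7, 5, &s2};         /* stopped after the victim */

    adjust_scans(&s1, 7, 3);       /* delete item 3 on page 7 */
    printf("s1=%d s2=%d\n", s1.offnum, s2.offnum);   /* 4 and 2 */
    return 0;
}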
+ */ +void +AtEOXact_gist(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* + * Reset the active-scans list to empty. We do not need to free the + * list elements, because they're all palloc()'d, so they'll go away + * at end of transaction anyway. + */ + GISTScans = NULL; +} + void gistadjscans(Relation rel, int op, BlockNumber blkno, OffsetNumber offnum) { diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 9617fcc33a6..9b0e6cf28ee 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.51 2001/05/07 00:43:15 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.52 2001/07/15 22:48:15 tgl Exp $ * * NOTES * This file contains only the public interface routines. @@ -21,13 +21,27 @@ #include "access/genam.h" #include "access/hash.h" #include "access/heapam.h" +#include "access/xlogutils.h" #include "catalog/index.h" #include "executor/executor.h" #include "miscadmin.h" + bool BuildingHash = false; -#include "access/xlogutils.h" + +/* Working state for hashbuild and its callback */ +typedef struct +{ + double indtuples; +} HashBuildState; + +static void hashbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); /* @@ -44,161 +58,32 @@ hashbuild(PG_FUNCTION_ARGS) Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); - -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); - -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - HashItem hitem; - Node *pred = indexInfo->ii_Predicate; - -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; + double reltuples; + HashBuildState buildstate; -#endif - ExprContext *econtext; - InsertIndexResult res = NULL; - - /* note that this is a new hash */ + /* set flag to disable locking */ BuildingHash = true; - /* initialize the hash index metadata page (if this is a new index) */ - if (oldPred == NULL) - _hash_metapinit(index); - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 + * We expect to be called exactly once for any index relation. If + * that's not the case, big trouble's what we have. 
*/ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ - - /* build the index */ - nhtups = nitups = 0.0; - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL); + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); + /* initialize the hash index metadata page */ + _hash_metapinit(index); - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. - */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - - /* - * If the single index key is null, we don't insert it into the - * index. Hash tables support scans on '='. Relational algebra - * says that A = B returns null if either A or B is null. This - * means that no qualification used in an index scan could ever - * return true on a null attribute. It also means that indices - * can't be used by ISNULL or NOTNULL scans, but that's an - * artifact of the strategy map architecture chosen in 1986, not - * of the way nulls are handled here. 
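[Editor's note: the comment block relocated here encodes SQL's three-valued logic: A = B is null, hence not true, whenever either operand is null, so a null key can never satisfy a hash '=' qual. A tiny model of that rule, using an invented TriValue type for illustration.]

#include <stdio.h>

typedef enum { TV_FALSE, TV_TRUE, TV_NULL } TriValue;

/* SQL equality under three-valued logic: a null poisons the result. */
static TriValue
sql_eq(int a, int a_isnull, int b, int b_isnull)
{
    if (a_isnull || b_isnull)
        return TV_NULL;
    return (a == b) ? TV_TRUE : TV_FALSE;
}

int
main(void)
{
    /* an index qual only accepts rows where the test is TV_TRUE, so a
     * row with a null key can never be returned by an '=' scan --
     * which is why hashbuild/hashinsert skip null keys entirely */
    printf("%d\n", sql_eq(1, 0, 1, 0) == TV_TRUE);   /* 1 */
    printf("%d\n", sql_eq(1, 1, 1, 0) == TV_TRUE);   /* 0: null = 1 */
    return 0;
}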
- */ - - if (IndexTupleHasNulls(itup)) - { - pfree(itup); - continue; - } - - itup->t_tid = htup->t_self; - hitem = _hash_formitem(itup); - - res = _hash_doinsert(index, hitem); - - pfree(hitem); - pfree(itup); - pfree(res); - } + /* build the index */ + buildstate.indtuples = 0; - /* okay, all heap tuples are indexed */ - heap_endscan(hscan); + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + hashbuildCallback, (void *) &buildstate); -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); + /* all done */ + BuildingHash = false; /* * Since we just counted the tuples in the heap, we update its stats @@ -218,23 +103,54 @@ hashbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) - { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); } - /* all done */ - BuildingHash = false; - PG_RETURN_VOID(); } /* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +hashbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + HashBuildState *buildstate = (HashBuildState *) state; + IndexTuple itup; + HashItem hitem; + InsertIndexResult res; + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + /* Hash indexes don't index nulls, see notes in hashinsert */ + if (IndexTupleHasNulls(itup)) + { + pfree(itup); + return; + } + + hitem = _hash_formitem(itup); + + res = _hash_doinsert(index, hitem); + + if (res) + pfree(res); + + buildstate->indtuples += 1; + + pfree(hitem); + pfree(itup); +} + +/* * hashinsert() -- insert an index tuple into a hash table. * * Hash on the index tuple's key, find the appropriate location @@ -248,10 +164,8 @@ hashinsert(PG_FUNCTION_ARGS) Datum *datum = (Datum *) PG_GETARG_POINTER(1); char *nulls = (char *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); - #ifdef NOT_USED Relation heapRel = (Relation) PG_GETARG_POINTER(4); - #endif InsertIndexResult res; HashItem hitem; @@ -261,8 +175,21 @@ hashinsert(PG_FUNCTION_ARGS) itup = index_formtuple(RelationGetDescr(rel), datum, nulls); itup->t_tid = *ht_ctid; + /* + * If the single index key is null, we don't insert it into the + * index. Hash tables support scans on '='. Relational algebra + * says that A = B returns null if either A or B is null. This + * means that no qualification used in an index scan could ever + * return true on a null attribute. It also means that indices + * can't be used by ISNULL or NOTNULL scans, but that's an + * artifact of the strategy map architecture chosen in 1986, not + * of the way nulls are handled here. + */ if (IndexTupleHasNulls(itup)) + { + pfree(itup); PG_RETURN_POINTER((InsertIndexResult) NULL); + } hitem = _hash_formitem(itup); @@ -471,22 +398,74 @@ hashrestrpos(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } -/* stubs */ +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. 
+ */ Datum -hashdelete(PG_FUNCTION_ARGS) +hashbulkdelete(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc iscan; - /* adjust any active scans that will be affected by this deletion */ - _hash_adjscans(rel, tid); + tuples_removed = 0; + num_index_tuples = 0; - /* delete the data from the page */ - _hash_pagedel(rel, tid); + /* + * XXX generic implementation --- should be improved! + */ - PG_RETURN_VOID(); + /* walk through the entire index */ + iscan = index_beginscan(rel, false, 0, (ScanKey) NULL); + + while ((res = index_getnext(iscan, ForwardScanDirection)) + != (RetrieveIndexResult) NULL) + { + ItemPointer heapptr = &res->heap_iptr; + + if (callback(heapptr, callback_state)) + { + ItemPointer indexptr = &res->index_iptr; + + /* adjust any active scans that will be affected by deletion */ + /* (namely, my own scan) */ + _hash_adjscans(rel, indexptr); + + /* delete the data from the page */ + _hash_pagedel(rel, indexptr); + + tuples_removed += 1; + } + else + num_index_tuples += 1; + + pfree(res); + } + + index_endscan(iscan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } + void hash_redo(XLogRecPtr lsn, XLogRecord *record) { diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 8e2ed1bb8af..c9fb065dbd2 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.29 2001/03/07 21:20:26 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.30 2001/07/15 22:48:15 tgl Exp $ * * NOTES * Overflow pages look like ordinary relation pages. @@ -112,14 +112,14 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE); - splitnum = metap->OVFL_POINT; - max_free = metap->SPARES[splitnum]; + splitnum = metap->hashm_ovflpoint; + max_free = metap->hashm_spares[splitnum]; free_page = (max_free - 1) >> (metap->hashm_bshift + BYTE_TO_BIT); free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1); /* Look through all the free maps to find the first free block */ - first_page = metap->LAST_FREED >> (metap->hashm_bshift + BYTE_TO_BIT); + first_page = metap->hashm_lastfreed >> (metap->hashm_bshift + BYTE_TO_BIT); for (i = first_page; i <= free_page; i++) { Page mappage; @@ -138,7 +138,7 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) if (i == first_page) { - bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1); + bit = metap->hashm_lastfreed & (BMPGSZ_BIT(metap) - 1); j = bit / BITS_PER_MAP; bit = bit & ~(BITS_PER_MAP - 1); } @@ -153,10 +153,10 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) } /* No Free Page Found - have to allocate a new page */ - metap->LAST_FREED = metap->SPARES[splitnum]; - metap->SPARES[splitnum]++; - offset = metap->SPARES[splitnum] - - (splitnum ? 
metap->SPARES[splitnum - 1] : 0); + metap->hashm_lastfreed = metap->hashm_spares[splitnum]; + metap->hashm_spares[splitnum]++; + offset = metap->hashm_spares[splitnum] - + (splitnum ? metap->hashm_spares[splitnum - 1] : 0); #define OVMSG "HASH: Out of overflow pages. Out of luck.\n" @@ -164,9 +164,9 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) { if (++splitnum >= NCACHED) elog(ERROR, OVMSG); - metap->OVFL_POINT = splitnum; - metap->SPARES[splitnum] = metap->SPARES[splitnum - 1]; - metap->SPARES[splitnum - 1]--; + metap->hashm_ovflpoint = splitnum; + metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1]; + metap->hashm_spares[splitnum - 1]--; offset = 0; } @@ -194,15 +194,15 @@ _hash_getovfladdr(Relation rel, Buffer *metabufp) if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset), 1, free_page)) elog(ERROR, "overflow_page: problem with _hash_initbitmap."); - metap->SPARES[splitnum]++; + metap->hashm_spares[splitnum]++; offset++; if (offset > SPLITMASK) { if (++splitnum >= NCACHED) elog(ERROR, OVMSG); - metap->OVFL_POINT = splitnum; - metap->SPARES[splitnum] = metap->SPARES[splitnum - 1]; - metap->SPARES[splitnum - 1]--; + metap->hashm_ovflpoint = splitnum; + metap->hashm_spares[splitnum] = metap->hashm_spares[splitnum - 1]; + metap->hashm_spares[splitnum - 1]--; offset = 0; } } @@ -235,13 +235,13 @@ found: */ bit = 1 + bit + (i * BMPGSZ_BIT(metap)); - if (bit >= metap->LAST_FREED) - metap->LAST_FREED = bit - 1; + if (bit >= metap->hashm_lastfreed) + metap->hashm_lastfreed = bit - 1; /* Calculate the split number for this page */ - for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++) + for (i = 0; (i < splitnum) && (bit > metap->hashm_spares[i]); i++) ; - offset = (i ? bit - metap->SPARES[i - 1] : bit); + offset = (i ? bit - metap->hashm_spares[i - 1] : bit); if (offset >= SPLITMASK) elog(ERROR, OVMSG); @@ -355,10 +355,10 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf) * element hashm_mapp[bitmappage]. */ splitnum = (addr >> SPLITSHIFT); - ovflpgno = (splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1; + ovflpgno = (splitnum ? metap->hashm_spares[splitnum - 1] : 0) + (addr & SPLITMASK) - 1; - if (ovflpgno < metap->LAST_FREED) - metap->LAST_FREED = ovflpgno; + if (ovflpgno < metap->hashm_lastfreed) + metap->hashm_lastfreed = ovflpgno; bitmappage = (ovflpgno >> (metap->hashm_bshift + BYTE_TO_BIT)); bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index d1b3aaa2325..b8c520e3c0d 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.31 2001/06/27 23:31:37 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.32 2001/07/15 22:48:15 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -18,7 +18,7 @@ * address of the page if it is an overflow page. * * The first page in a hash relation, page zero, is special -- it stores - * information describing the hash table; it is referred to as teh + * information describing the hash table; it is referred to as the * "meta page." Pages one and higher store the actual data. * *------------------------------------------------------------------------- @@ -48,6 +48,19 @@ static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket * before the lock table is fully initialized, so we can't use it. 
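[Editor's note: the _hash_getovfladdr hunks above, hashm_spares bookkeeping aside, revolve around a first-free-bit search across bitmap words. A standalone sketch of that inner search, assuming 32-bit map words; the constants are illustrative, not the real BMPGSZ/BITS_PER_MAP values.]

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_MAP 32

/* Return the index of the first 0 bit in the map, or -1 if all
 * tracked overflow pages are in use. */
static int
first_free_bit(const uint32_t *map, int nwords)
{
    for (int i = 0; i < nwords; i++)
    {
        if (map[i] != UINT32_MAX)       /* some bit is free in here */
        {
            uint32_t w = map[i];

            for (int j = 0; j < BITS_PER_MAP; j++)
                if (!(w & ((uint32_t) 1 << j)))
                    return i * BITS_PER_MAP + j;
        }
    }
    return -1;                          /* must allocate a new page */
}

int
main(void)
{
    uint32_t map[2] = {UINT32_MAX, 0x0000000F};  /* bits 32..35 used */

    printf("%d\n", first_free_bit(map, 2));      /* 36 */
    return 0;
}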
* Strictly speaking, this violates 2pl, but we don't do 2pl on the * system catalogs anyway. + * + * Note that our page locks are actual lockmanager locks, not buffer + * locks (as are used by btree, for example). This is a good idea because + * the algorithms are not deadlock-free, and we'd better be able to detect + * and recover from deadlocks. + * + * Another important difference from btree is that a hash indexscan + * retains both a lock and a buffer pin on the current index page + * between hashgettuple() calls (btree keeps only a buffer pin). + * Because of this, it's safe to do item deletions with only a regular + * write lock on a hash page --- there cannot be an indexscan stopped on + * the page being deleted, other than an indexscan of our own backend, + * which will be taken care of by _hash_adjscans. */ @@ -350,6 +363,16 @@ _hash_unsetpagelock(Relation rel, } } +/* + * Delete a hash index item. + * + * It is safe to delete an item after acquiring a regular WRITE lock on + * the page, because no other backend can hold a READ lock on the page, + * and that means no other backend currently has an indexscan stopped on + * any item of the item being deleted. Our own backend might have such + * an indexscan (in fact *will*, since that's how VACUUM found the item + * in the first place), but _hash_adjscans will fix the scan position. + */ void _hash_pagedel(Relation rel, ItemPointer tid) { @@ -384,7 +407,7 @@ _hash_pagedel(Relation rel, ItemPointer tid) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage((Page) metap, LH_META_PAGE); - ++metap->hashm_nkeys; + metap->hashm_nkeys--; _hash_wrtbuf(rel, metabuf); } @@ -402,32 +425,32 @@ _hash_expandtable(Relation rel, Buffer metabuf) _hash_checkpage((Page) metap, LH_META_PAGE); metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - new_bucket = ++metap->MAX_BUCKET; + new_bucket = ++metap->hashm_maxbucket; metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); - old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK); + old_bucket = (metap->hashm_maxbucket & metap->hashm_lowmask); /* - * If the split point is increasing (MAX_BUCKET's log base 2 * + * If the split point is increasing (hashm_maxbucket's log base 2 * * increases), we need to copy the current contents of the spare split * bucket to the next bucket. 
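[Editor's note: the new hashpage.c comments argue that a plain write lock suffices for item deletion, because it excludes every reader in other backends; the only scan that can still be parked on the page is our own, which _hash_adjscans repairs. The reader/writer exclusion itself is ordinary rwlock semantics; a minimal pthread sketch of the claim (compile with -pthread); the "page" here is just a counter.]

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;
static int items = 3;

static void *
reader(void *arg)
{
    (void) arg;
    pthread_rwlock_rdlock(&page_lock);   /* an indexscan on the page */
    printf("reader sees %d items\n", items);
    pthread_rwlock_unlock(&page_lock);
    return NULL;
}

int
main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, reader, NULL);

    /* The deleter's write lock cannot be granted while any reader
     * holds the page, so no other backend can be mid-read on it
     * at the moment the item disappears. */
    pthread_rwlock_wrlock(&page_lock);
    items--;                             /* PageIndexTupleDelete analogue */
    pthread_rwlock_unlock(&page_lock);

    pthread_join(t, NULL);
    printf("after delete: %d items\n", items);
    return 0;
}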
*/ - spare_ndx = _hash_log2(metap->MAX_BUCKET + 1); - if (spare_ndx > metap->OVFL_POINT) + spare_ndx = _hash_log2(metap->hashm_maxbucket + 1); + if (spare_ndx > metap->hashm_ovflpoint) { metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT]; - metap->OVFL_POINT = spare_ndx; + metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; + metap->hashm_ovflpoint = spare_ndx; metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); } - if (new_bucket > metap->HIGH_MASK) + if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE); - metap->LOW_MASK = metap->HIGH_MASK; - metap->HIGH_MASK = new_bucket | metap->LOW_MASK; + metap->hashm_lowmask = metap->hashm_highmask; + metap->hashm_highmask = new_bucket | metap->hashm_lowmask; metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ); } diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c index 649e42fbeb0..f4a91b5710f 100644 --- a/src/backend/access/hash/hashscan.c +++ b/src/backend/access/hash/hashscan.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.24 2001/01/24 19:42:47 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.25 2001/07/15 22:48:15 tgl Exp $ * * NOTES * Because we can be doing an index scan on a relation while we @@ -45,6 +45,31 @@ typedef HashScanListData *HashScanList; static HashScanList HashScans = (HashScanList) NULL; + +/* + * AtEOXact_hash() --- clean up hash subsystem at xact abort or commit. + * + * This is here because it needs to touch this module's static var HashScans. + */ +void +AtEOXact_hash(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* + * Reset the active-scans list to empty. We do not need to free the + * list elements, because they're all palloc()'d, so they'll go away + * at end of transaction anyway. + */ + HashScans = NULL; + + /* If we were building a hash, we ain't anymore. */ + BuildingHash = false; +} + /* * _Hash_regscan() -- register a new scan. 
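[Editor's note: the mask manipulation in _hash_expandtable (hashm_maxbucket, hashm_highmask, hashm_lowmask) implements classic linear hashing: addresses are formed with the larger mask, and any address beyond the current maximum bucket folds back with the smaller mask. A self-contained sketch of that addressing rule; the field names are borrowed from the patch, the rest is illustrative.]

#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint32_t maxbucket;   /* highest bucket allocated so far */
    uint32_t highmask;    /* mask for the doubling in progress */
    uint32_t lowmask;     /* mask for the previous doubling */
} HashMeta;

/* Map a hash value to a bucket, linear-hashing style. */
static uint32_t
hash_to_bucket(const HashMeta *m, uint32_t hash)
{
    uint32_t bucket = hash & m->highmask;

    if (bucket > m->maxbucket)
        bucket &= m->lowmask;      /* not split yet: use smaller table */
    return bucket;
}

/* One step of table growth, as in _hash_expandtable. */
static void
expand(HashMeta *m)
{
    uint32_t new_bucket = ++m->maxbucket;

    if (new_bucket > m->highmask)       /* starting a new doubling */
    {
        m->lowmask = m->highmask;
        m->highmask = new_bucket | m->lowmask;
    }
}

int
main(void)
{
    HashMeta m = {3, 3, 1};             /* 4 buckets: masks 0b11/0b1 */

    expand(&m);                         /* bucket 4 exists; masks 7/3 */
    printf("hash 6 -> bucket %u\n", hash_to_bucket(&m, 6)); /* 6>4 -> 2 */
    printf("hash 4 -> bucket %u\n", hash_to_bucket(&m, 4)); /* 4 */
    return 0;
}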
*/ diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index adeccf5cc84..2b6be06168f 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.51 2001/06/22 19:16:21 wieck Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.52 2001/07/15 22:48:15 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relationId @@ -18,23 +18,17 @@ * index_rescan - restart a scan of an index * index_endscan - end a scan * index_insert - insert an index tuple into a relation - * index_delete - delete an item from an index relation * index_markpos - mark a scan position * index_restrpos - restore a scan position * index_getnext - get the next tuple from a scan - * ** index_fetch - retrieve tuple with tid - * ** index_replace - replace a tuple - * ** index_getattr - get an attribute from an index tuple - * index_getprocid - get a support procedure id from the rel tuple - * - * IndexScanIsValid - check index scan + * index_bulk_delete - bulk deletion of index tuples + * index_cost_estimator - fetch amcostestimate procedure OID + * index_getprocid - get a support procedure OID * * NOTES * This file contains the index_ routines which used * to be a scattered collection of stuff in access/genam. * - * The ** routines: index_fetch, index_replace, and index_getattr - * have not yet been implemented. They may not be needed. * * old comments * Scans are implemented as follows: @@ -211,23 +205,6 @@ index_insert(Relation relation, } /* ---------------- - * index_delete - delete an item from an index relation - * ---------------- - */ -void -index_delete(Relation relation, ItemPointer indexItem) -{ - RegProcedure procedure; - - RELATION_CHECKS; - GET_REL_PROCEDURE(delete, amdelete); - - OidFunctionCall2(procedure, - PointerGetDatum(relation), - PointerGetDatum(indexItem)); -} - -/* ---------------- * index_beginscan - start a scan of an index * ---------------- */ @@ -379,6 +356,35 @@ index_getnext(IndexScanDesc scan, } /* ---------------- + * index_bulk_delete - do mass deletion of index entries + * + * callback routine tells whether a given main-heap tuple is + * to be deleted + * + * return value is an optional palloc'd struct of statistics + * ---------------- + */ +IndexBulkDeleteResult * +index_bulk_delete(Relation relation, + IndexBulkDeleteCallback callback, + void *callback_state) +{ + RegProcedure procedure; + IndexBulkDeleteResult *result; + + RELATION_CHECKS; + GET_REL_PROCEDURE(bulk_delete, ambulkdelete); + + result = (IndexBulkDeleteResult *) + DatumGetPointer(OidFunctionCall3(procedure, + PointerGetDatum(relation), + PointerGetDatum((Pointer) callback), + PointerGetDatum(callback_state))); + + return result; +} + +/* ---------------- * index_cost_estimator * * Fetch the amcostestimate procedure OID for an index. 
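[Editor's note: index_bulk_delete above gives every AM a single entry point for VACUUM: walk your own index, ask the callback about each heap TID, delete on "yes", and report statistics in a palloc'd struct. A compilable toy of that contract; ItemPointer, the result struct, and the "index" are simplified stand-ins.]

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef int ItemPointer;                        /* stand-in heap TID */
typedef bool (*BulkDeleteCallback) (ItemPointer tid, void *state);

typedef struct
{
    double tuples_removed;
    double num_index_tuples;
} BulkDeleteResult;

/* Generic bulk delete: scan the whole index and let VACUUM's callback
 * decide per heap pointer, as gist/hash/rtree now do. */
static BulkDeleteResult *
bulk_delete(ItemPointer *index, int *nitems,
            BulkDeleteCallback callback, void *state)
{
    BulkDeleteResult *r = malloc(sizeof *r);
    int keep = 0;

    r->tuples_removed = r->num_index_tuples = 0;

    for (int i = 0; i < *nitems; i++)
    {
        if (callback(index[i], state))
            r->tuples_removed += 1;             /* drop the entry */
        else
        {
            index[keep++] = index[i];
            r->num_index_tuples += 1;
        }
    }
    *nitems = keep;
    return r;
}

/* VACUUM-side callback: is this TID in the dead-tuple list? */
static bool
is_dead(ItemPointer tid, void *state)
{
    return tid == *(ItemPointer *) state;
}

int
main(void)
{
    ItemPointer index[] = {10, 11, 12};
    int         n = 3;
    ItemPointer dead = 11;
    BulkDeleteResult *r = bulk_delete(index, &n, is_dead, &dead);

    printf("removed=%.0f kept=%.0f\n",
           r->tuples_removed, r->num_index_tuples);
    free(r);
    return 0;
}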
diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index eba9bd4eefe..bdc366dd0ad 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -4,7 +4,7 @@ # Makefile for access/nbtree # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.10 2000/08/31 16:09:41 petere Exp $ +# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Makefile,v 1.11 2001/07/15 22:48:16 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/access/nbtree top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtscan.o nbtsearch.o \ +OBJS = nbtcompare.o nbtinsert.o nbtpage.o nbtree.o nbtsearch.o \ nbtstrat.o nbtutils.o nbtsort.o all: SUBSYS.o diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index cff7ff0d655..d8ec739b2a8 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1,4 +1,4 @@ -$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.4 2000/07/25 05:26:40 tgl Exp $ +$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.5 2001/07/15 22:48:16 tgl Exp $ This directory contains a correct implementation of Lehman and Yao's high-concurrency B-tree management algorithm (P. Lehman and S. Yao, @@ -109,15 +109,11 @@ In addition, the following things are handy to know: is too high a price). Rebuilding corrupted indexes during restart seems more attractive. -+ On deletions, we need to adjust the position of active scans on - the index. The code in nbtscan.c handles this. We don't need to - do this for insertions or splits because _bt_restscan can find the - new position of the previously-found item. NOTE that nbtscan.c - only copes with deletions issued by the current backend. This - essentially means that concurrent deletions are not supported, but - that's true already in the Lehman and Yao algorithm. nbtscan.c - exists only to support VACUUM and allow it to delete items while - it's scanning the index. ++ Deletions are handled by getting a super-exclusive lock on the target + page, so that no other backend has a pin on the page when the deletion + starts. This means no scan is pointing at the page. This is OK for + deleting leaf items, probably not OK for deleting internal nodes; + will need to think harder when it's time to support index compaction. + "ScanKey" data structures are used in two fundamentally different ways in this code. Searches for the initial position for a scan, as well as diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 8ffb9b9043c..c91c568ed2f 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.83 2001/06/22 19:16:21 wieck Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.84 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -126,7 +126,7 @@ top: if (TransactionIdIsValid(xwait)) { /* Have to wait for the other guy ... */ - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); XactLockTableWait(xwait); /* start over... 
*/ _bt_freestack(stack); @@ -234,7 +234,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, if (TransactionIdIsValid(xwait)) { if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf, BT_READ); + _bt_relbuf(rel, nbuf); /* Tell _bt_doinsert to wait... */ return xwait; } @@ -263,7 +263,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, break; nblkno = opaque->btpo_next; if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf, BT_READ); + _bt_relbuf(rel, nbuf); nbuf = _bt_getbuf(rel, nblkno, BT_READ); page = BufferGetPage(nbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -273,7 +273,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, } if (nbuf != InvalidBuffer) - _bt_relbuf(rel, nbuf, BT_READ); + _bt_relbuf(rel, nbuf); return NullTransactionId; } @@ -397,7 +397,7 @@ _bt_insertonpg(Relation rel, /* step right one page */ BlockNumber rblkno = lpageop->btpo_next; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, rblkno, BT_WRITE); page = BufferGetPage(buf); lpageop = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1175,12 +1175,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) */ if (P_RIGHTMOST(opaque)) { - _bt_relbuf(rel, buf, access); + _bt_relbuf(rel, buf); return (InvalidBuffer); } blkno = opaque->btpo_next; - _bt_relbuf(rel, buf, access); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1449,7 +1449,7 @@ _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release) &itup_off, &itup_blkno); /* Keep lock on new "root" buffer ! */ if (buf != rootbuf) - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = newbuf; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1525,7 +1525,7 @@ _bt_fixtree(Relation rel, BlockNumber blkno) if (P_ISROOT(opaque)) { /* Tree is Ok now */ - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); return; } /* Call _bt_fixroot() if there is no upper level */ @@ -1533,12 +1533,12 @@ _bt_fixtree(Relation rel, BlockNumber blkno) { elog(NOTICE, "bt_fixtree[%s]: fixing root page", RelationGetRelationName(rel)); buf = _bt_fixroot(rel, buf, true); - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); return; } /* Have to go up one level */ pblkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); } blkno = pblkno; } @@ -1571,7 +1571,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) page = BufferGetPage(buf); /* copy page to temp storage */ memmove(tbuf, page, PageGetPageSize(page)); - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); page = (Page) tbuf; opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1682,7 +1682,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) { if (coff[i] != P_FIRSTDATAKEY(newopaque)) elog(ERROR, "bt_fixlevel[%s]: invalid item order(3) (need to recreate index)", RelationGetRelationName(rel)); - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = newbuf; page = newpage; opaque = newopaque; @@ -1691,7 +1691,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) continue; } /* unfound - need to insert on current page */ - _bt_relbuf(rel, newbuf, BT_WRITE); + _bt_relbuf(rel, newbuf); } /* insert pointer */ ritem = (BTItem) PageGetItem(cpage[i - 1], @@ -1718,10 +1718,10 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) &itup_off, &itup_blkno); /* what buffer we need in ? 
*/ if (newitemonleft) - _bt_relbuf(rel, newbuf, BT_WRITE); + _bt_relbuf(rel, newbuf); else { - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); buf = newbuf; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1741,7 +1741,7 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) /* copy page with pointer to cblkno[cidx] to temp storage */ memmove(tbuf, page, PageGetPageSize(page)); - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); page = (Page) tbuf; opaque = (BTPageOpaque) PageGetSpecialPointer(page); } @@ -1751,13 +1751,13 @@ _bt_fixlevel(Relation rel, Buffer buf, BlockNumber limit) goodbye = false; /* Pointers to child pages are Ok - right end of child level ? */ - _bt_relbuf(rel, cbuf[0], BT_READ); - _bt_relbuf(rel, cbuf[1], BT_READ); + _bt_relbuf(rel, cbuf[0]); + _bt_relbuf(rel, cbuf[1]); if (cidx == 1 || (cidx == 2 && (P_RIGHTMOST(copaque[2]) || goodbye))) { if (cidx == 2) - _bt_relbuf(rel, cbuf[2], BT_READ); + _bt_relbuf(rel, cbuf[2]); return; } if (cblkno[0] == limit || cblkno[1] == limit) @@ -1819,7 +1819,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, { if (offnum <= stack.bts_offset) elog(ERROR, "bt_fixbranch[%s]: invalid item order (need to recreate index)", RelationGetRelationName(rel)); - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); return; } @@ -1837,7 +1837,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, if (rbuf == InvalidBuffer) elog(ERROR, "bt_fixbranch[%s]: right pointer unfound(2) (need to recreate index)", RelationGetRelationName(rel)); rblkno = BufferGetBlockNumber(rbuf); - _bt_relbuf(rel, rbuf, BT_READ); + _bt_relbuf(rel, rbuf); /* * If we have parent item in true_stack then go up one level and @@ -1845,7 +1845,7 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, */ if (true_stack) { - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); blkno = true_stack->bts_blkno; true_stack = true_stack->bts_parent; continue; @@ -1860,19 +1860,19 @@ _bt_fixbranch(Relation rel, BlockNumber lblkno, if (!BTreeInvalidParent(opaque)) { blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); continue; } /* Have to switch to excl buf lock and re-check btpo_parent */ - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!BTreeInvalidParent(opaque)) { blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); continue; } @@ -1913,7 +1913,7 @@ _bt_fixup(Relation rel, Buffer buf) if (!BTreeInvalidParent(opaque)) { blkno = opaque->btpo_parent; - _bt_relbuf(rel, buf, BT_WRITE); + _bt_relbuf(rel, buf); elog(NOTICE, "bt_fixup[%s]: checking/fixing upper levels", RelationGetRelationName(rel)); _bt_fixtree(rel, blkno); return; @@ -1921,8 +1921,7 @@ _bt_fixup(Relation rel, Buffer buf) if (P_LEFTMOST(opaque)) break; blkno = opaque->btpo_prev; - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buf); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_WRITE); } @@ -1932,9 +1931,7 @@ _bt_fixup(Relation rel, Buffer buf) */ elog(NOTICE, "bt_fixup[%s]: fixing root page", RelationGetRelationName(rel)); buf = _bt_fixroot(rel, buf, true); - _bt_relbuf(rel, buf, BT_WRITE); - - return; + _bt_relbuf(rel, buf); } static OffsetNumber diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 67e1407b22b..376274c5621 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 
+9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.52 2001/06/27 23:31:38 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.53 2001/07/15 22:48:16 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -138,7 +138,7 @@ _bt_getroot(Relation rel, int access) /* If access = BT_READ, caller doesn't want us to create root yet */ if (access == BT_READ) { - _bt_relbuf(rel, metabuf, BT_READ); + _bt_relbuf(rel, metabuf); return InvalidBuffer; } @@ -215,14 +215,14 @@ _bt_getroot(Relation rel, int access) * guarantee no deadlocks, we have to release the metadata * page and start all over again. */ - _bt_relbuf(rel, metabuf, BT_WRITE); + _bt_relbuf(rel, metabuf); return _bt_getroot(rel, access); } } else { rootblkno = metad->btm_root; - _bt_relbuf(rel, metabuf, BT_READ); /* done with the meta page */ + _bt_relbuf(rel, metabuf); /* done with the meta page */ rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); } @@ -270,8 +270,8 @@ _bt_getroot(Relation rel, int access) goto check_parent; } else -/* someone else already fixed root */ { + /* someone else already fixed root */ LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK); LockBuffer(rootbuf, BT_READ); } @@ -283,7 +283,7 @@ _bt_getroot(Relation rel, int access) * chance that parent is root page. */ newrootbuf = _bt_getbuf(rel, rootopaque->btpo_parent, BT_READ); - _bt_relbuf(rel, rootbuf, BT_READ); + _bt_relbuf(rel, rootbuf); rootbuf = newrootbuf; rootpage = BufferGetPage(rootbuf); rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); @@ -293,7 +293,7 @@ _bt_getroot(Relation rel, int access) } /* try again */ - _bt_relbuf(rel, rootbuf, BT_READ); + _bt_relbuf(rel, rootbuf); return _bt_getroot(rel, access); } @@ -350,10 +350,12 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) /* * _bt_relbuf() -- release a locked buffer. * - * Lock and pin (refcount) are both dropped. + * Lock and pin (refcount) are both dropped. Note that either read or + * write lock can be dropped this way, but if we modified the buffer, + * this is NOT the right way to release a write lock. */ void -_bt_relbuf(Relation rel, Buffer buf, int access) +_bt_relbuf(Relation rel, Buffer buf) { LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); @@ -449,24 +451,23 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, int level) } /* - * Delete an item from a btree. It had better be a leaf item... + * Delete an item from a btree page. + * + * This routine assumes that the caller has pinned and locked the buffer, + * and will write the buffer afterwards. 
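[Editor's note: the simplified _bt_relbuf leans on the rule spelled out in its new comment: unlock-then-unpin is fine for a buffer that was only read, but a modified buffer must instead go through _bt_wrtbuf so it is marked for writing first. A toy model of the two release paths; the buffer manager here is invented for illustration.]

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    int  refcount;    /* pin count */
    bool locked;
    bool dirty;
} Buffer;

static void relbuf(Buffer *buf)      /* _bt_relbuf analogue */
{
    buf->locked = false;             /* LockBuffer(BUFFER_LOCK_UNLOCK) */
    buf->refcount--;                 /* ReleaseBuffer() */
}

static void wrtbuf(Buffer *buf)      /* _bt_wrtbuf analogue */
{
    buf->dirty = true;               /* a modified page must be written */
    buf->locked = false;
    buf->refcount--;
}

int
main(void)
{
    Buffer b = {1, true, false};

    relbuf(&b);                      /* read-only use: plain release */
    printf("pins=%d dirty=%d\n", b.refcount, b.dirty);

    /* after modifying the page, relbuf() would lose the change;
     * the write path must run instead */
    b = (Buffer) {1, true, false};
    wrtbuf(&b);
    printf("pins=%d dirty=%d\n", b.refcount, b.dirty);
    return 0;
}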
*/ void -_bt_pagedel(Relation rel, ItemPointer tid) +_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) { - Buffer buf; - Page page; - BlockNumber blkno; + Page page = BufferGetPage(buf); OffsetNumber offno; - blkno = ItemPointerGetBlockNumber(tid); offno = ItemPointerGetOffsetNumber(tid); - buf = _bt_getbuf(rel, blkno, BT_WRITE); - page = BufferGetPage(buf); - START_CRIT_SECTION(); + PageIndexTupleDelete(page, offno); + /* XLOG stuff */ { xl_btree_delete xlrec; @@ -490,8 +491,6 @@ _bt_pagedel(Relation rel, ItemPointer tid) PageSetLSN(page, recptr); PageSetSUI(page, ThisStartUpID); } - END_CRIT_SECTION(); - /* write the buffer and release the lock */ - _bt_wrtbuf(rel, buf); + END_CRIT_SECTION(); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b714296c8f7..b1426456241 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.81 2001/05/18 21:24:17 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.82 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,11 +28,27 @@ #include "storage/sinval.h" #include "access/xlogutils.h" -bool BuildingBtree = false; /* see comment in btbuild() */ -bool FastBuild = true; /* use sort/build instead */ - /* of insertion build */ +/* Working state for btbuild and its callback */ +typedef struct +{ + bool usefast; + bool isUnique; + bool haveDead; + Relation heapRel; + BTSpool *spool; + /* + * spool2 is needed only when the index is an unique index. Dead + * tuples are put into spool2 instead of spool in order to avoid + * uniqueness check. + */ + BTSpool *spool2; + double indtuples; +} BTBuildState; + +bool BuildingBtree = false; /* see comment in btbuild() */ +bool FastBuild = true; /* use SORT instead of insertion build */ /* * TEMPORARY FLAG FOR TESTING NEW FIX TREE @@ -41,6 +57,29 @@ bool FastBuild = true; /* use sort/build instead */ bool FixBTree = true; static void _bt_restscan(IndexScanDesc scan); +static void btbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); + + +/* + * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit. + */ +void +AtEOXact_nbtree(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* If we were building a btree, we ain't anymore. */ + BuildingBtree = false; +} + /* * btbuild() -- build a new btree index. 
@@ -56,42 +95,10 @@ btbuild(PG_FUNCTION_ARGS) Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - Node *pred = indexInfo->ii_Predicate; -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; -#endif - ExprContext *econtext; - InsertIndexResult res = NULL; - BTSpool *spool = NULL; - BTItem btitem; - bool usefast; - Snapshot snapshot; - TransactionId XmaxRecent; + double reltuples; + BTBuildState buildstate; - /* - * spool2 is needed only when the index is an unique index. Dead - * tuples are put into spool2 instead of spool in order to avoid - * uniqueness check. - */ - BTSpool *spool2 = NULL; - bool tupleIsAlive; - int dead_count; - - /* note that this is a new btree */ + /* set flag to disable locking */ BuildingBtree = true; /* @@ -100,220 +107,63 @@ btbuild(PG_FUNCTION_ARGS) * look harder at this. (there is some kind of incremental processing * going on there.) -- pma 08/29/95 */ - usefast = (FastBuild && IsNormalProcessingMode()); + buildstate.usefast = (FastBuild && IsNormalProcessingMode()); + buildstate.isUnique = indexInfo->ii_Unique; + buildstate.haveDead = false; + buildstate.heapRel = heap; + buildstate.spool = NULL; + buildstate.spool2 = NULL; + buildstate.indtuples = 0; #ifdef BTREE_BUILD_STATS if (Show_btree_build_stats) ResetUsage(); #endif /* BTREE_BUILD_STATS */ - /* initialize the btree index metadata page (if this is a new index) */ - if (oldPred == NULL) - _bt_metapinit(index); - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 + * We expect to be called exactly once for any index relation. If + * that's not the case, big trouble's what we have. */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - - /* - * we never want to use sort/build if we are extending an existing - * partial index -- it works by inserting the newly-qualifying - * tuples into the existing index. (sort/build would overwrite the - * existing index with one consisting of the newly-qualifying - * tuples.) 
- */ - usefast = false; - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - /* build the index */ - nhtups = nitups = 0.0; + /* initialize the btree index metadata page */ + _bt_metapinit(index); - if (usefast) + if (buildstate.usefast) { - spool = _bt_spoolinit(index, indexInfo->ii_Unique); - + buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique); /* - * Different from spool,the uniqueness isn't checked for spool2. + * Different from spool, the uniqueness isn't checked for spool2. */ if (indexInfo->ii_Unique) - spool2 = _bt_spoolinit(index, false); + buildstate.spool2 = _bt_spoolinit(index, false); } - /* start a heap scan */ - dead_count = 0; - snapshot = (IsBootstrapProcessingMode() ? SnapshotNow : SnapshotAny); - hscan = heap_beginscan(heap, 0, snapshot, 0, (ScanKey) NULL); - XmaxRecent = 0; - if (snapshot == SnapshotAny) - GetXmaxRecent(&XmaxRecent); - - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - if (snapshot == SnapshotAny) - { - tupleIsAlive = HeapTupleSatisfiesNow(htup->t_data); - if (!tupleIsAlive) - { - if ((htup->t_data->t_infomask & HEAP_XMIN_INVALID) != 0) - continue; - if (htup->t_data->t_infomask & HEAP_XMAX_COMMITTED && - htup->t_data->t_xmax < XmaxRecent) - continue; - } - } - else - tupleIsAlive = true; - - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. - */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - - /* - * If the single index key is null, we don't insert it into the - * index. Btrees support scans on <, <=, =, >=, and >. Relational - * algebra says that A op B (where op is one of the operators - * above) returns null if either A or B is null. This means that - * no qualification used in an index scan could ever return true - * on a null attribute. It also means that indices can't be used - * by ISNULL or NOTNULL scans, but that's an artifact of the - * strategy map architecture chosen in 1986, not of the way nulls - * are handled here. - */ - - /* - * New comments: NULLs handling. While we can't do NULL - * comparison, we can follow simple rule for ordering items on - * btree pages - NULLs greater NOT_NULLs and NULL = NULL is TRUE. - * Sure, it's just rule for placing/finding items and no more - - * keytest'll return FALSE for a = 5 for items having 'a' isNULL. - * Look at _bt_compare for how it works. 
- vadim 03/23/97 - * - * if (itup->t_info & INDEX_NULL_MASK) { pfree(itup); continue; } - */ - - itup->t_tid = htup->t_self; - btitem = _bt_formitem(itup); - - /* - * if we are doing bottom-up btree build, we insert the index into - * a spool file for subsequent processing. otherwise, we insert - * into the btree. - */ - if (usefast) - { - if (tupleIsAlive || !spool2) - _bt_spool(btitem, spool); - else -/* dead tuples are put into spool2 */ - { - dead_count++; - _bt_spool(btitem, spool2); - } - } - else - res = _bt_doinsert(index, btitem, indexInfo->ii_Unique, heap); - - pfree(btitem); - pfree(itup); - if (res) - pfree(res); - } + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + btbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - if (spool2 && !dead_count) /* spool2 was found to be unnecessary */ + if (buildstate.spool2 && !buildstate.haveDead) { - _bt_spooldestroy(spool2); - spool2 = NULL; + /* spool2 turns out to be unnecessary */ + _bt_spooldestroy(buildstate.spool2); + buildstate.spool2 = NULL; } -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); - /* * if we are doing bottom-up btree build, finish the build by (1) * completing the sort of the spool file, (2) inserting the sorted * tuples into btree pages and (3) building the upper levels. */ - if (usefast) + if (buildstate.usefast) { - _bt_leafbuild(spool, spool2); - _bt_spooldestroy(spool); - if (spool2) - _bt_spooldestroy(spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_spooldestroy(buildstate.spool); + if (buildstate.spool2) + _bt_spooldestroy(buildstate.spool2); } #ifdef BTREE_BUILD_STATS @@ -325,6 +175,9 @@ btbuild(PG_FUNCTION_ARGS) } #endif /* BTREE_BUILD_STATS */ + /* all done */ + BuildingBtree = false; + /* * Since we just counted the tuples in the heap, we update its stats * in pg_class to guarantee that the planner takes advantage of the @@ -343,20 +196,63 @@ btbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); + } + + PG_RETURN_VOID(); +} + +/* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +btbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + BTBuildState *buildstate = (BTBuildState *) state; + IndexTuple itup; + BTItem btitem; + InsertIndexResult res; + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + btitem = _bt_formitem(itup); + + /* + * if we are doing bottom-up btree build, we insert the index into + * a spool file for subsequent processing. otherwise, we insert + * into the btree. 
+ */ + if (buildstate->usefast) + { + if (tupleIsAlive || buildstate->spool2 == NULL) + _bt_spool(btitem, buildstate->spool); + else { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); + /* dead tuples are put into spool2 */ + buildstate->haveDead = true; + _bt_spool(btitem, buildstate->spool2); } } + else + { + res = _bt_doinsert(index, btitem, + buildstate->isUnique, buildstate->heapRel); + if (res) + pfree(res); + } - /* all done */ - BuildingBtree = false; + buildstate->indtuples += 1; - PG_RETURN_VOID(); + pfree(btitem); + pfree(itup); } /* @@ -423,8 +319,10 @@ btgettuple(PG_FUNCTION_ARGS) /* * Save heap TID to use it in _bt_restscan. Then release the read - * lock on the buffer so that we aren't blocking other backends. NOTE: - * we do keep the pin on the buffer! + * lock on the buffer so that we aren't blocking other backends. + * + * NOTE: we do keep the pin on the buffer! This is essential to ensure + * that someone else doesn't delete the index entry we are stopped on. */ if (res) { @@ -451,9 +349,6 @@ btbeginscan(PG_FUNCTION_ARGS) /* get the scan */ scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey); - /* register scan in case we change pages it's using */ - _bt_regscan(scan); - PG_RETURN_POINTER(scan); } @@ -571,8 +466,6 @@ btendscan(PG_FUNCTION_ARGS) pfree(so->keyData); pfree(so); - _bt_dropscan(scan); - PG_RETURN_VOID(); } @@ -640,20 +533,127 @@ btrestrpos(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } -/* stubs */ +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ Datum -btdelete(PG_FUNCTION_ARGS) +btbulkdelete(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc scan; + BTScanOpaque so; + ItemPointer current; + + tuples_removed = 0; + num_index_tuples = 0; + + /* + * We use a standard IndexScanDesc scan object, but to speed up the loop, + * we skip most of the wrapper layers of index_getnext and instead call + * _bt_step directly. This implies holding buffer lock on a target page + * throughout the loop over the page's tuples. Initially, we have a read + * lock acquired by _bt_step when we stepped onto the page. If we find + * a tuple we need to delete, we trade in the read lock for an exclusive + * write lock; after that, we hold the write lock until we step off the + * page (fortunately, _bt_relbuf doesn't care which kind of lock it's + * releasing). This should minimize the amount of work needed per page. 
+ */ + scan = index_beginscan(rel, false, 0, (ScanKey) NULL); + so = (BTScanOpaque) scan->opaque; + current = &(scan->currentItemData); - /* adjust any active scans that will be affected by this deletion */ - _bt_adjscans(rel, tid); + /* Use _bt_first to get started, then _bt_step to remaining tuples */ + res = _bt_first(scan, ForwardScanDirection); - /* delete the data from the page */ - _bt_pagedel(rel, tid); + if (res != NULL) + { + Buffer buf; + BlockNumber lockedBlock = InvalidBlockNumber; - PG_RETURN_VOID(); + pfree(res); + /* we have the buffer pinned and locked */ + buf = so->btso_curbuf; + Assert(BufferIsValid(buf)); + + do + { + Page page; + BlockNumber blkno; + OffsetNumber offnum; + BTItem btitem; + IndexTuple itup; + ItemPointer htup; + + /* current is the next index tuple */ + blkno = ItemPointerGetBlockNumber(current); + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); + itup = &btitem->bti_itup; + htup = &(itup->t_tid); + + if (callback(htup, callback_state)) + { + /* + * If this is first deletion on this page, trade in read + * lock for a really-exclusive write lock. Then, step back + * one and re-examine the item, because someone else might + * have inserted an item while we weren't holding the lock! + */ + if (blkno != lockedBlock) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); + lockedBlock = blkno; + } + else + { + /* Delete the item from the page */ + _bt_itemdel(rel, buf, current); + + /* Mark buffer dirty, but keep the lock and pin */ + WriteNoReleaseBuffer(buf); + + tuples_removed += 1; + } + + /* + * We need to back up the scan one item so that the next + * cycle will re-examine the same offnum on this page. + * + * For now, just hack the current-item index. Will need + * to be smarter when deletion includes removal of empty + * index pages. + */ + current->ip_posid--; + } + else + num_index_tuples += 1; + } while (_bt_step(scan, &buf, ForwardScanDirection)); + } + + index_endscan(scan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } /* @@ -676,7 +676,7 @@ _bt_restscan(IndexScanDesc scan) /* * Get back the read lock we were holding on the buffer. (We still - * have a reference-count pin on it, though.) + * have a reference-count pin on it, so need not get that.) */ LockBuffer(buf, BT_READ); @@ -729,7 +729,7 @@ _bt_restscan(IndexScanDesc scan) "\n\tRecreate index %s.", RelationGetRelationName(rel)); blkno = opaque->btpo_next; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); maxoff = PageGetMaxOffsetNumber(page); diff --git a/src/backend/access/nbtree/nbtscan.c b/src/backend/access/nbtree/nbtscan.c deleted file mode 100644 index e07914b3440..00000000000 --- a/src/backend/access/nbtree/nbtscan.c +++ /dev/null @@ -1,224 +0,0 @@ -/*------------------------------------------------------------------------- - * - * btscan.c - * manage scans on btrees. 
- * - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.33 2001/01/24 19:42:48 momjian Exp $ - * - * - * NOTES - * Because we can be doing an index scan on a relation while we update - * it, we need to avoid missing data that moves around in the index. - * Insertions and page splits are no problem because _bt_restscan() - * can figure out where the current item moved to, but if a deletion - * happens at or before the current scan position, we'd better do - * something to stay in sync. - * - * The routines in this file handle the problem for deletions issued - * by the current backend. Currently, that's all we need, since - * deletions are only done by VACUUM and it gets an exclusive lock. - * - * The scheme is to manage a list of active scans in the current backend. - * Whenever we remove a record from an index, we check the list of active - * scans to see if any has been affected. A scan is affected only if it - * is on the same relation, and the same page, as the update. - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include "access/nbtree.h" - -typedef struct BTScanListData -{ - IndexScanDesc btsl_scan; - struct BTScanListData *btsl_next; -} BTScanListData; - -typedef BTScanListData *BTScanList; - -static BTScanList BTScans = (BTScanList) NULL; - -static void _bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno); - -/* - * AtEOXact_nbtree() --- clean up nbtree subsystem at xact abort or commit. - * - * This is here because it needs to touch this module's static var BTScans. - */ -void -AtEOXact_nbtree(void) -{ - - /* - * Note: these actions should only be necessary during xact abort; but - * they can't hurt during a commit. - */ - - /* - * Reset the active-scans list to empty. We do not need to free the - * list elements, because they're all palloc()'d, so they'll go away - * at end of transaction anyway. - */ - BTScans = NULL; - - /* If we were building a btree, we ain't anymore. */ - BuildingBtree = false; -} - -/* - * _bt_regscan() -- register a new scan. 
- */ -void -_bt_regscan(IndexScanDesc scan) -{ - BTScanList new_el; - - new_el = (BTScanList) palloc(sizeof(BTScanListData)); - new_el->btsl_scan = scan; - new_el->btsl_next = BTScans; - BTScans = new_el; -} - -/* - * _bt_dropscan() -- drop a scan from the scan list - */ -void -_bt_dropscan(IndexScanDesc scan) -{ - BTScanList chk, - last; - - last = (BTScanList) NULL; - for (chk = BTScans; - chk != (BTScanList) NULL && chk->btsl_scan != scan; - chk = chk->btsl_next) - last = chk; - - if (chk == (BTScanList) NULL) - elog(ERROR, "btree scan list trashed; can't find 0x%p", (void *) scan); - - if (last == (BTScanList) NULL) - BTScans = chk->btsl_next; - else - last->btsl_next = chk->btsl_next; - - pfree(chk); -} - -/* - * _bt_adjscans() -- adjust all scans in the scan list to compensate - * for a given deletion - */ -void -_bt_adjscans(Relation rel, ItemPointer tid) -{ - BTScanList l; - Oid relid; - - relid = RelationGetRelid(rel); - for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) - { - if (relid == RelationGetRelid(l->btsl_scan->relation)) - _bt_scandel(l->btsl_scan, - ItemPointerGetBlockNumber(tid), - ItemPointerGetOffsetNumber(tid)); - } -} - -/* - * _bt_scandel() -- adjust a single scan on deletion - * - */ -static void -_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno) -{ - ItemPointer current; - Buffer buf; - BTScanOpaque so; - OffsetNumber start; - Page page; - BTPageOpaque opaque; - - so = (BTScanOpaque) scan->opaque; - buf = so->btso_curbuf; - - current = &(scan->currentItemData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) - { - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - start = P_FIRSTDATAKEY(opaque); - if (ItemPointerGetOffsetNumber(current) == start) - ItemPointerSetInvalid(&(so->curHeapIptr)); - else - { - - /* - * We have to lock buffer before _bt_step and unlock it after - * that. 
- */ - LockBuffer(buf, BT_READ); - _bt_step(scan, &buf, BackwardScanDirection); - if (ItemPointerIsValid(current)) - { - Page pg = BufferGetPage(buf); - BTItem btitem = (BTItem) PageGetItem(pg, - PageGetItemId(pg, ItemPointerGetOffsetNumber(current))); - - so->curHeapIptr = btitem->bti_itup.t_tid; - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - } - } - } - - current = &(scan->currentMarkData); - if (ItemPointerIsValid(current) - && ItemPointerGetBlockNumber(current) == blkno - && ItemPointerGetOffsetNumber(current) >= offno) - { - page = BufferGetPage(so->btso_mrkbuf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - start = P_FIRSTDATAKEY(opaque); - - if (ItemPointerGetOffsetNumber(current) == start) - ItemPointerSetInvalid(&(so->mrkHeapIptr)); - else - { - ItemPointerData tmp; - - tmp = *current; - *current = scan->currentItemData; - scan->currentItemData = tmp; - so->btso_curbuf = so->btso_mrkbuf; - so->btso_mrkbuf = buf; - buf = so->btso_curbuf; - LockBuffer(buf, BT_READ); /* as above */ - - _bt_step(scan, &buf, BackwardScanDirection); - - so->btso_curbuf = so->btso_mrkbuf; - so->btso_mrkbuf = buf; - tmp = *current; - *current = scan->currentItemData; - scan->currentItemData = tmp; - if (ItemPointerIsValid(current)) - { - Page pg = BufferGetPage(buf); - BTItem btitem = (BTItem) PageGetItem(pg, - PageGetItemId(pg, ItemPointerGetOffsetNumber(current))); - - so->mrkHeapIptr = btitem->bti_itup.t_tid; - LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* as above */ - } - } - } -} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 59bf5358e4f..295387ed517 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.66 2001/03/23 04:49:51 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.67 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -94,7 +94,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, new_stack->bts_parent = stack_in; /* drop the read lock on the parent page, acquire one on the child */ - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); /* @@ -155,7 +155,7 @@ _bt_moveright(Relation rel, /* step right one page */ BlockNumber rblkno = opaque->btpo_next; - _bt_relbuf(rel, buf, access); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, rblkno, access); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -406,7 +406,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* No more items, so close down the current-item info */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); return (RetrieveIndexResult) NULL; } @@ -760,7 +760,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) nomatches: ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); res = (RetrieveIndexResult) NULL; } @@ -815,14 +815,14 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* if we're at end of scan, release the buffer and return */ if (P_RIGHTMOST(opaque)) { - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); ItemPointerSetInvalid(current); *bufP = so->btso_curbuf = InvalidBuffer; return false; } /* step right one page */ blkno = 
opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -846,7 +846,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* if we're at end of scan, release the buffer and return */ if (P_LEFTMOST(opaque)) { - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); ItemPointerSetInvalid(current); *bufP = so->btso_curbuf = InvalidBuffer; return false; @@ -854,7 +854,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* step left */ obknum = BufferGetBlockNumber(*bufP); blkno = opaque->btpo_prev; - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -868,7 +868,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) while (opaque->btpo_next != obknum) { blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); + _bt_relbuf(rel, *bufP); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -952,7 +952,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) itup = &(btitem->bti_itup); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); @@ -968,7 +968,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) do { blkno = opaque->btpo_next; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -1035,7 +1035,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* no tuples in the index match this scan key */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); + _bt_relbuf(rel, buf); res = (RetrieveIndexResult) NULL; } diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c index a8c6a13ea3c..21831ef5d61 100644 --- a/src/backend/access/rtree/rtree.c +++ b/src/backend/access/rtree/rtree.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.62 2001/05/07 00:43:16 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.63 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,20 @@ typedef struct RTSTATE FmgrInfo interFn; /* intersection function */ } RTSTATE; +/* Working state for rtbuild and its callback */ +typedef struct +{ + RTSTATE rtState; + double indtuples; +} RTBuildState; + /* non-export function prototypes */ +static void rtbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state); static InsertIndexResult rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate); static void rttighten(Relation r, RTSTACK *stk, Datum datum, int att_size, @@ -81,165 +94,44 @@ static int nospace(Page p, IndexTuple it); static void initRtstate(RTSTATE *rtstate, Relation index); +/* + * routine to build an index. 
Basically calls insert over and over + */ Datum rtbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); - Node *oldPred = (Node *) PG_GETARG_POINTER(3); - -#ifdef NOT_USED - IndexStrategy istrat = (IndexStrategy) PG_GETARG_POINTER(4); - -#endif - HeapScanDesc hscan; - HeapTuple htup; - IndexTuple itup; - TupleDesc htupdesc, - itupdesc; - Datum attdata[INDEX_MAX_KEYS]; - char nulls[INDEX_MAX_KEYS]; - double nhtups, - nitups; - Node *pred = indexInfo->ii_Predicate; - -#ifndef OMIT_PARTIAL_INDEX - TupleTable tupleTable; - TupleTableSlot *slot; + double reltuples; + RTBuildState buildstate; + Buffer buffer; -#endif - ExprContext *econtext; - InsertIndexResult res = NULL; - Buffer buffer = InvalidBuffer; - RTSTATE rtState; + /* no locking is needed */ - initRtstate(&rtState, index); + initRtstate(&buildstate.rtState, index); /* * We expect to be called exactly once for any index relation. If * that's not the case, big trouble's what we have. */ - if (oldPred == NULL && RelationGetNumberOfBlocks(index) != 0) - elog(ERROR, "%s already contains data", RelationGetRelationName(index)); - - /* initialize the root page (if this is a new index) */ - if (oldPred == NULL) - { - buffer = ReadBuffer(index, P_NEW); - RTInitBuffer(buffer, F_LEAF); - WriteBuffer(buffer); - } - - /* get tuple descriptors for heap and index relations */ - htupdesc = RelationGetDescr(heap); - itupdesc = RelationGetDescr(index); - - /* - * If this is a predicate (partial) index, we will need to evaluate - * the predicate using ExecQual, which requires the current tuple to - * be in a slot of a TupleTable. In addition, ExecQual must have an - * ExprContext referring to that slot. Here, we initialize dummy - * TupleTable and ExprContext objects for this purpose. --Nels, Feb 92 - * - * We construct the ExprContext anyway since we need a per-tuple - * temporary memory context for function evaluation -- tgl July 00 - */ -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - { - tupleTable = ExecCreateTupleTable(1); - slot = ExecAllocTableSlot(tupleTable); - ExecSetSlotDescriptor(slot, htupdesc, false); - } - else - { - tupleTable = NULL; - slot = NULL; - } - econtext = MakeExprContext(slot, TransactionCommandContext); -#else - econtext = MakeExprContext(NULL, TransactionCommandContext); -#endif /* OMIT_PARTIAL_INDEX */ - - /* count the tuples as we insert them */ - nhtups = nitups = 0.0; - - /* start a heap scan */ - hscan = heap_beginscan(heap, 0, SnapshotNow, 0, (ScanKey) NULL); - - while (HeapTupleIsValid(htup = heap_getnext(hscan, 0))) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "%s already contains data", + RelationGetRelationName(index)); - nhtups += 1.0; - -#ifndef OMIT_PARTIAL_INDEX - - /* - * If oldPred != NULL, this is an EXTEND INDEX command, so skip - * this tuple if it was already in the existing partial index - */ - if (oldPred != NULL) - { - slot->val = htup; - if (ExecQual((List *) oldPred, econtext, false)) - { - nitups += 1.0; - continue; - } - } - - /* - * Skip this tuple if it doesn't satisfy the partial-index - * predicate - */ - if (pred != NULL) - { - slot->val = htup; - if (!ExecQual((List *) pred, econtext, false)) - continue; - } -#endif /* OMIT_PARTIAL_INDEX */ - - nitups += 1.0; - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. 
- */ - FormIndexDatum(indexInfo, - htup, - htupdesc, - econtext->ecxt_per_tuple_memory, - attdata, - nulls); - - /* form an index tuple and point it at the heap tuple */ - itup = index_formtuple(itupdesc, attdata, nulls); - itup->t_tid = htup->t_self; + /* initialize the root page */ + buffer = ReadBuffer(index, P_NEW); + RTInitBuffer(buffer, F_LEAF); + WriteBuffer(buffer); - /* - * Since we already have the index relation locked, we call - * rtdoinsert directly. Normal access method calls dispatch - * through rtinsert, which locks the relation for write. This is - * the right thing to do if you're inserting single tups, but not - * when you're initializing the whole index at once. - */ + /* build the index */ + buildstate.indtuples = 0; - res = rtdoinsert(index, itup, &rtState); - pfree(itup); - pfree(res); - } + /* do the heap scan */ + reltuples = IndexBuildHeapScan(heap, index, indexInfo, + rtbuildCallback, (void *) &buildstate); /* okay, all heap tuples are indexed */ - heap_endscan(hscan); - -#ifndef OMIT_PARTIAL_INDEX - if (pred != NULL || oldPred != NULL) - ExecDropTupleTable(tupleTable, true); -#endif /* OMIT_PARTIAL_INDEX */ - FreeExprContext(econtext); /* * Since we just counted the tuples in the heap, we update its stats @@ -259,20 +151,57 @@ rtbuild(PG_FUNCTION_ARGS) heap_close(heap, NoLock); index_close(index); - UpdateStats(hrelid, nhtups); - UpdateStats(irelid, nitups); - if (oldPred != NULL) - { - if (nitups == nhtups) - pred = NULL; - UpdateIndexPredicate(irelid, oldPred, pred); - } + UpdateStats(hrelid, reltuples); + UpdateStats(irelid, buildstate.indtuples); } PG_RETURN_VOID(); } /* + * Per-tuple callback from IndexBuildHeapScan + */ +static void +rtbuildCallback(Relation index, + HeapTuple htup, + Datum *attdata, + char *nulls, + bool tupleIsAlive, + void *state) +{ + RTBuildState *buildstate = (RTBuildState *) state; + IndexTuple itup; + InsertIndexResult res; + + /* form an index tuple and point it at the heap tuple */ + itup = index_formtuple(RelationGetDescr(index), attdata, nulls); + itup->t_tid = htup->t_self; + + /* rtree indexes don't index nulls, see notes in rtinsert */ + if (IndexTupleHasNulls(itup)) + { + pfree(itup); + return; + } + + /* + * Since we already have the index relation locked, we call + * rtdoinsert directly. Normal access method calls dispatch + * through rtinsert, which locks the relation for write. This is + * the right thing to do if you're inserting single tups, but not + * when you're initializing the whole index at once. + */ + res = rtdoinsert(index, itup, &buildstate->rtState); + + if (res) + pfree(res); + + buildstate->indtuples += 1; + + pfree(itup); +} + +/* * rtinsert -- wrapper for rtree tuple insertion. * * This is the public interface routine for tuple insertion in rtrees. @@ -285,10 +214,8 @@ rtinsert(PG_FUNCTION_ARGS) Datum *datum = (Datum *) PG_GETARG_POINTER(1); char *nulls = (char *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); - #ifdef NOT_USED Relation heapRel = (Relation) PG_GETARG_POINTER(4); - #endif InsertIndexResult res; IndexTuple itup; @@ -297,12 +224,24 @@ rtinsert(PG_FUNCTION_ARGS) /* generate an index tuple */ itup = index_formtuple(RelationGetDescr(r), datum, nulls); itup->t_tid = *ht_ctid; + + /* + * Currently, rtrees do not support indexing NULLs; considerable + * infrastructure work would have to be done to do anything reasonable + * with a NULL. 
+ */ + if (IndexTupleHasNulls(itup)) + { + pfree(itup); + PG_RETURN_POINTER((InsertIndexResult) NULL); + } + initRtstate(&rtState, r); /* - * Notes in ExecUtils:ExecOpenIndices() - * - * RelationSetLockForWrite(r); + * Since rtree is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. */ res = rtdoinsert(r, itup, &rtState); @@ -1104,40 +1043,92 @@ freestack(RTSTACK *s) } } +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ Datum -rtdelete(PG_FUNCTION_ARGS) +rtbulkdelete(PG_FUNCTION_ARGS) { - Relation r = (Relation) PG_GETARG_POINTER(0); - ItemPointer tid = (ItemPointer) PG_GETARG_POINTER(1); - BlockNumber blkno; - OffsetNumber offnum; - Buffer buf; - Page page; + Relation rel = (Relation) PG_GETARG_POINTER(0); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); + void *callback_state = (void *) PG_GETARG_POINTER(2); + IndexBulkDeleteResult *result; + BlockNumber num_pages; + double tuples_removed; + double num_index_tuples; + RetrieveIndexResult res; + IndexScanDesc iscan; + + tuples_removed = 0; + num_index_tuples = 0; /* - * Notes in ExecUtils:ExecOpenIndices() Also note that only vacuum - * deletes index tuples now... - * - * RelationSetLockForWrite(r); + * Since rtree is not marked "amconcurrent" in pg_am, caller should + * have acquired exclusive lock on index relation. We need no locking + * here. */ - blkno = ItemPointerGetBlockNumber(tid); - offnum = ItemPointerGetOffsetNumber(tid); + /* + * XXX generic implementation --- should be improved! 
+ */ - /* adjust any scans that will be affected by this deletion */ - rtadjscans(r, RTOP_DEL, blkno, offnum); + /* walk through the entire index */ + iscan = index_beginscan(rel, false, 0, (ScanKey) NULL); - /* delete the index tuple */ - buf = ReadBuffer(r, blkno); - page = BufferGetPage(buf); + while ((res = index_getnext(iscan, ForwardScanDirection)) + != (RetrieveIndexResult) NULL) + { + ItemPointer heapptr = &res->heap_iptr; - PageIndexTupleDelete(page, offnum); + if (callback(heapptr, callback_state)) + { + ItemPointer indexptr = &res->index_iptr; + BlockNumber blkno; + OffsetNumber offnum; + Buffer buf; + Page page; - WriteBuffer(buf); + blkno = ItemPointerGetBlockNumber(indexptr); + offnum = ItemPointerGetOffsetNumber(indexptr); - PG_RETURN_VOID(); + /* adjust any scans that will be affected by this deletion */ + /* (namely, my own scan) */ + rtadjscans(rel, RTOP_DEL, blkno, offnum); + + /* delete the index tuple */ + buf = ReadBuffer(rel, blkno); + page = BufferGetPage(buf); + + PageIndexTupleDelete(page, offnum); + + WriteBuffer(buf); + + tuples_removed += 1; + } + else + num_index_tuples += 1; + + pfree(res); + } + + index_endscan(iscan); + + /* return statistics */ + num_pages = RelationGetNumberOfBlocks(rel); + + result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); + result->num_pages = num_pages; + result->tuples_removed = tuples_removed; + result->num_index_tuples = num_index_tuples; + + PG_RETURN_POINTER(result); } + static void initRtstate(RTSTATE *rtstate, Relation index) { diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c index c9f1ab7b893..1311cfdc29a 100644 --- a/src/backend/access/rtree/rtscan.c +++ b/src/backend/access/rtree/rtscan.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.37 2001/06/09 18:16:56 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtscan.c,v 1.38 2001/07/15 22:48:16 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -60,13 +60,8 @@ rtbeginscan(PG_FUNCTION_ARGS) ScanKey key = (ScanKey) PG_GETARG_POINTER(3); IndexScanDesc s; - /* - * Let index_beginscan does its work... - * - * RelationSetLockForRead(r); - */ - s = RelationGetIndexScan(r, fromEnd, nkeys, key); + rtregscan(s); PG_RETURN_POINTER(s); @@ -282,6 +277,27 @@ rtdropscan(IndexScanDesc s) pfree(l); } +/* + * AtEOXact_rtree() --- clean up rtree subsystem at xact abort or commit. + * + * This is here because it needs to touch this module's static var RTScans. + */ +void +AtEOXact_rtree(void) +{ + /* + * Note: these actions should only be necessary during xact abort; but + * they can't hurt during a commit. + */ + + /* + * Reset the active-scans list to empty. We do not need to free the + * list elements, because they're all palloc()'d, so they'll go away + * at end of transaction anyway. 
+ */ + RTScans = NULL; +} + void rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum) { diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 64671792315..d32a6dda978 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.106 2001/07/12 04:11:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.107 2001/07/15 22:48:16 tgl Exp $ * * NOTES * Transaction aborts can now occur two ways: @@ -156,7 +156,10 @@ #include <sys/time.h> +#include "access/gistscan.h" +#include "access/hash.h" #include "access/nbtree.h" +#include "access/rtree.h" #include "access/xact.h" #include "catalog/heap.h" #include "catalog/index.h" @@ -1040,7 +1043,10 @@ CommitTransaction(void) smgrDoPendingDeletes(true); AtEOXact_SPI(); + AtEOXact_gist(); + AtEOXact_hash(); AtEOXact_nbtree(); + AtEOXact_rtree(); AtCommit_Cache(); AtCommit_Locks(); AtEOXact_CatCache(true); @@ -1147,7 +1153,10 @@ AbortTransaction(void) smgrDoPendingDeletes(false); AtEOXact_SPI(); + AtEOXact_gist(); + AtEOXact_hash(); AtEOXact_nbtree(); + AtEOXact_rtree(); AtAbort_Cache(); AtEOXact_CatCache(false); AtAbort_Memory();
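Seen together, the btbulkdelete and rtbulkdelete routines above and the AtEOXact_* calls wired into xact.c replace the old one-tuple-at-a-time btdelete/rtdelete entry points: VACUUM now passes a predicate over heap item pointers, the access method makes a single pass over its own entries, and the result comes back as a palloc'd IndexBulkDeleteResult. A minimal standalone model of that contract, assuming invented stand-in types (ItemPtr, BulkDeleteResult) in place of the real ones:

#include <stdio.h>
#include <stdlib.h>

typedef int ItemPtr;            /* stand-in for ItemPointer */

/* returns nonzero if the heap tuple this entry points at is being deleted */
typedef int (*BulkDeleteCallback)(ItemPtr heapptr, void *state);

typedef struct
{
    double tuples_removed;
    double num_index_tuples;
} BulkDeleteResult;

/* plays the role of btbulkdelete/rtbulkdelete */
static BulkDeleteResult *
bulk_delete(ItemPtr *entries, int *nentries,
            BulkDeleteCallback callback, void *state)
{
    BulkDeleteResult *r = malloc(sizeof *r);
    int keep = 0;

    r->tuples_removed = 0;
    r->num_index_tuples = 0;

    for (int i = 0; i < *nentries; i++)
    {
        if (callback(entries[i], state))
            r->tuples_removed += 1;       /* drop this entry */
        else
        {
            entries[keep++] = entries[i]; /* keep this entry */
            r->num_index_tuples += 1;
        }
    }
    *nentries = keep;
    return r;
}

/* VACUUM's side: here, condemn even-numbered heap pointers */
static int
is_dead(ItemPtr heapptr, void *state)
{
    (void) state;
    return heapptr % 2 == 0;
}

int
main(void)
{
    ItemPtr index_entries[] = {1, 2, 3, 4, 5};
    int n = 5;
    BulkDeleteResult *r = bulk_delete(index_entries, &n, is_dead, NULL);

    printf("removed=%.0f kept=%.0f\n",
           r->tuples_removed, r->num_index_tuples);
    free(r);
    return 0;
}

The in-place compaction stands in for PageIndexTupleDelete; the real routines must also manage buffer locks (trading the read lock for LockBufferForCleanup in the btree case) and back the scan up over just-deleted slots, as the hunks above show.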