/*------------------------------------------------------------------------- * * ginget.c * fetch tuples from a GIN scan. * * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.23 2009/03/24 20:17:10 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/gin.h" #include "access/relscan.h" #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" #include "utils/datum.h" #include "utils/memutils.h" typedef struct pendingPosition { Buffer pendingBuffer; OffsetNumber firstOffset; OffsetNumber lastOffset; ItemPointerData item; } pendingPosition; /* * Tries to refind previously taken ItemPointer on page. */ static bool findItemInPage(Page page, ItemPointer item, OffsetNumber *off) { OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; int res; if (GinPageGetOpaque(page)->flags & GIN_DELETED) /* page was deleted by concurrent vacuum */ return false; /* * scan page to find equal or first greater value */ for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) { res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); if (res <= 0) return true; } return false; } /* * Goes to the next page if current offset is outside of bounds */ static bool moveRightIfItNeeded( GinBtreeData *btree, GinBtreeStack *stack ) { Page page = BufferGetPage(stack->buffer); if ( stack->off > PageGetMaxOffsetNumber(page) ) { /* * We scanned the whole page, so we should take right page */ stack->blkno = GinPageGetOpaque(page)->rightlink; if ( GinPageRightMost(page) ) return false; /* no more pages */ LockBuffer(stack->buffer, GIN_UNLOCK); stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); LockBuffer(stack->buffer, GIN_SHARE); stack->off = FirstOffsetNumber; } return true; } /* * Does fullscan of posting tree and saves ItemPointers * in scanEntry->partialMatch TIDBitmap */ static void scanForItems( Relation index, GinScanEntry scanEntry, BlockNumber rootPostingTree ) { GinPostingTreeScan *gdi; Buffer buffer; Page page; BlockNumber blkno; gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); buffer = scanBeginPostingTree(gdi); IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */ freeGinBtreeStack(gdi->stack); pfree(gdi); /* * Goes through all leaves */ for(;;) { page = BufferGetPage(buffer); if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber ) { tbm_add_tuples( scanEntry->partialMatch, (ItemPointer)GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff, false); scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff; } blkno = GinPageGetOpaque(page)->rightlink; if ( GinPageRightMost(page) ) { UnlockReleaseBuffer(buffer); return; /* no more pages */ } LockBuffer(buffer, GIN_UNLOCK); buffer = ReleaseAndReadBuffer(buffer, index, blkno); LockBuffer(buffer, GIN_SHARE); } } /* * Collects all ItemPointer into the TIDBitmap struct * for entries partially matched to search entry. * * Returns true if done, false if it's needed to restart scan from scratch */ static bool computePartialMatchList( GinBtreeData *btree, GinBtreeStack *stack, GinScanEntry scanEntry ) { Page page; IndexTuple itup; Datum idatum; int32 cmp; scanEntry->partialMatch = tbm_create( work_mem * 1024L ); for(;;) { /* * stack->off points to the interested entry, buffer is already locked */ if ( moveRightIfItNeeded(btree, stack) == false ) return true; page = BufferGetPage(stack->buffer); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); /* * If tuple stores another attribute then stop scan */ if ( gintuple_get_attrnum( btree->ginstate, itup ) != scanEntry->attnum ) return true; idatum = gin_index_getattr( btree->ginstate, itup ); /*---------- * Check of partial match. * case cmp == 0 => match * case cmp > 0 => not match and finish scan * case cmp < 0 => not match and continue scan *---------- */ cmp = DatumGetInt32(FunctionCall3(&btree->ginstate->comparePartialFn[scanEntry->attnum-1], scanEntry->entry, idatum, UInt16GetDatum(scanEntry->strategy))); if ( cmp > 0 ) return true; else if ( cmp < 0 ) { stack->off++; continue; } if ( GinIsPostingTree(itup) ) { BlockNumber rootPostingTree = GinGetPostingTree(itup); Datum newDatum, savedDatum = datumCopy ( idatum, btree->ginstate->origTupdesc->attrs[scanEntry->attnum-1]->attbyval, btree->ginstate->origTupdesc->attrs[scanEntry->attnum-1]->attlen ); /* * We should unlock current page (but not unpin) during * tree scan to prevent deadlock with vacuum processes. * * We save current entry value (savedDatum) to be able to refind * our tuple after re-locking */ LockBuffer(stack->buffer, GIN_UNLOCK); scanForItems( btree->index, scanEntry, rootPostingTree ); /* * We lock again the entry page and while it was unlocked * insert might occured, so we need to refind our position */ LockBuffer(stack->buffer, GIN_SHARE); page = BufferGetPage(stack->buffer); if ( !GinPageIsLeaf(page) ) { /* * Root page becomes non-leaf while we unlock it. We * will start again, this situation doesn't cause * often - root can became a non-leaf only one per * life of index. */ return false; } for(;;) { if ( moveRightIfItNeeded(btree, stack) == false ) elog(ERROR, "lost saved point in index"); /* must not happen !!! */ page = BufferGetPage(stack->buffer); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off)); newDatum = gin_index_getattr( btree->ginstate, itup ); if ( gintuple_get_attrnum( btree->ginstate, itup ) != scanEntry->attnum ) elog(ERROR, "lost saved point in index"); /* must not happen !!! */ if ( compareEntries(btree->ginstate, scanEntry->attnum, newDatum, savedDatum) == 0 ) { /* Found! */ if ( btree->ginstate->origTupdesc->attrs[scanEntry->attnum-1]->attbyval == false ) pfree( DatumGetPointer(savedDatum) ); break; } stack->off++; } } else { tbm_add_tuples( scanEntry->partialMatch, GinGetPosting(itup), GinGetNPosting(itup), false); scanEntry->predictNumberResult += GinGetNPosting(itup); } /* * Ok, we save ItemPointers, go to the next entry */ stack->off++; } return true; } /* * Start* functions setup beginning state of searches: finds correct buffer and pins it. */ static void startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) { GinBtreeData btreeEntry; GinBtreeStack *stackEntry; Page page; bool needUnlock = TRUE; entry->buffer = InvalidBuffer; entry->offset = InvalidOffsetNumber; entry->list = NULL; entry->nlist = 0; entry->partialMatch = NULL; entry->partialMatchResult = NULL; entry->reduceResult = FALSE; entry->predictNumberResult = 0; if (entry->master != NULL) { entry->isFinished = entry->master->isFinished; return; } /* * we should find entry, and begin scan of posting tree * or just store posting list in memory */ prepareEntryScan(&btreeEntry, index, entry->attnum, entry->entry, ginstate); btreeEntry.searchMode = TRUE; stackEntry = ginFindLeafPage(&btreeEntry, NULL); page = BufferGetPage(stackEntry->buffer); entry->isFinished = TRUE; if ( entry->isPartialMatch ) { /* * btreeEntry.findItem points to the first equal or greater value * than needed. So we will scan further and collect all * ItemPointers */ btreeEntry.findItem(&btreeEntry, stackEntry); if ( computePartialMatchList( &btreeEntry, stackEntry, entry ) == false ) { /* * GIN tree was seriously restructured, so we will * cleanup all found data and rescan. See comments near * 'return false' in computePartialMatchList() */ if ( entry->partialMatch ) { if (entry->partialMatchIterator) tbm_end_iterate(entry->partialMatchIterator); entry->partialMatchIterator = NULL; tbm_free( entry->partialMatch ); entry->partialMatch = NULL; } LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); startScanEntry(index, ginstate, entry); return; } if ( entry->partialMatch && !tbm_is_empty(entry->partialMatch) ) { entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch); entry->isFinished = FALSE; } } else if (btreeEntry.findItem(&btreeEntry, stackEntry)) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); GinPostingTreeScan *gdi; Page page; /* * We should unlock entry page before make deal with * posting tree to prevent deadlocks with vacuum processes. * Because entry is never deleted from page and posting tree is * never reduced to the posting list, we can unlock page after * getting BlockNumber of root of posting tree. */ LockBuffer(stackEntry->buffer, GIN_UNLOCK); needUnlock = FALSE; gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); entry->buffer = scanBeginPostingTree(gdi); /* * We keep buffer pinned because we need to prevent deletion of * page during scan. See GIN's vacuum implementation. RefCount * is increased to keep buffer pinned after freeGinBtreeStack() * call. */ IncrBufferRefCount(entry->buffer); page = BufferGetPage(entry->buffer); entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff; /* * Keep page content in memory to prevent durable page locking */ entry->list = (ItemPointerData *) palloc( BLCKSZ ); entry->nlist = GinPageGetOpaque(page)->maxoff; memcpy( entry->list, GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData) ); LockBuffer(entry->buffer, GIN_UNLOCK); freeGinBtreeStack(gdi->stack); pfree(gdi); entry->isFinished = FALSE; } else if (GinGetNPosting(itup) > 0) { entry->nlist = GinGetNPosting(itup); entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist); memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist); entry->isFinished = FALSE; } } if (needUnlock) LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); } static void startScanKey(Relation index, GinState *ginstate, GinScanKey key) { uint32 i; if (!key->firstCall) return; for (i = 0; i < key->nentries; i++) startScanEntry(index, ginstate, key->scanEntry + i); memset(key->entryRes, TRUE, sizeof(bool) * key->nentries); key->isFinished = FALSE; key->firstCall = FALSE; if (GinFuzzySearchLimit > 0) { /* * If all of keys more than threshold we will try to reduce * result, we hope (and only hope, for intersection operation of * array our supposition isn't true), that total result will not * more than minimal predictNumberResult. */ for (i = 0; i < key->nentries; i++) if (key->scanEntry[i].predictNumberResult <= key->nentries * GinFuzzySearchLimit) return; for (i = 0; i < key->nentries; i++) if (key->scanEntry[i].predictNumberResult > key->nentries * GinFuzzySearchLimit) { key->scanEntry[i].predictNumberResult /= key->nentries; key->scanEntry[i].reduceResult = TRUE; } } } static void startScan(IndexScanDesc scan) { uint32 i; GinScanOpaque so = (GinScanOpaque) scan->opaque; for (i = 0; i < so->nkeys; i++) startScanKey(scan->indexRelation, &so->ginstate, so->keys + i); } /* * Gets next ItemPointer from PostingTree. Note, that we copy * page into GinScanEntry->list array and unlock page, but keep it pinned * to prevent interference with vacuum */ static void entryGetNextItem(Relation index, GinScanEntry entry) { Page page; BlockNumber blkno; for(;;) { entry->offset++; if (entry->offset <= entry->nlist) { entry->curItem = entry->list[entry->offset - 1]; return; } LockBuffer(entry->buffer, GIN_SHARE); page = BufferGetPage(entry->buffer); for(;;) { /* * It's needed to go by right link. During that we should refind * first ItemPointer greater that stored */ blkno = GinPageGetOpaque(page)->rightlink; LockBuffer(entry->buffer, GIN_UNLOCK); if (blkno == InvalidBlockNumber) { ReleaseBuffer(entry->buffer); ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber); entry->buffer = InvalidBuffer; entry->isFinished = TRUE; return; } entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno); LockBuffer(entry->buffer, GIN_SHARE); page = BufferGetPage(entry->buffer); entry->offset = InvalidOffsetNumber; if (!ItemPointerIsValid(&entry->curItem) || findItemInPage(page, &entry->curItem, &entry->offset)) { /* * Found position equal to or greater than stored */ entry->nlist = GinPageGetOpaque(page)->maxoff; memcpy( entry->list, GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData) ); LockBuffer(entry->buffer, GIN_UNLOCK); if ( !ItemPointerIsValid(&entry->curItem) || compareItemPointers( &entry->curItem, entry->list + entry->offset - 1 ) == 0 ) { /* * First pages are deleted or empty, or we found exact position, * so break inner loop and continue outer one. */ break; } /* * Find greater than entry->curItem position, store it. */ entry->curItem = entry->list[entry->offset - 1]; return; } } } } #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* * Sets entry->curItem to new found heap item pointer for one * entry of one scan key */ static bool entryGetItem(Relation index, GinScanEntry entry) { if (entry->master) { entry->isFinished = entry->master->isFinished; entry->curItem = entry->master->curItem; } else if ( entry->partialMatch ) { do { if (entry->partialMatchResult == NULL || entry->offset >= entry->partialMatchResult->ntuples) { entry->partialMatchResult = tbm_iterate(entry->partialMatchIterator); if ( entry->partialMatchResult == NULL ) { ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber); tbm_end_iterate(entry->partialMatchIterator); entry->partialMatchIterator = NULL; entry->isFinished = TRUE; break; } /* * reset counter to the beginning of entry->partialMatchResult. * Note: entry->offset is still greater than * partialMatchResult->ntuples if partialMatchResult is * lossy. So, on next call we will get next result from * TIDBitmap. */ entry->offset = 0; } if ( entry->partialMatchResult->ntuples < 0 ) { /* * lossy result, so we need to check the whole page */ ItemPointerSetLossyPage(&entry->curItem, entry->partialMatchResult->blockno); /* * We might as well fall out of the loop; we could not * estimate number of results on this page to support correct * reducing of result even if it's enabled */ break; } ItemPointerSet(&entry->curItem, entry->partialMatchResult->blockno, entry->partialMatchResult->offsets[entry->offset]); entry->offset++; } while (entry->reduceResult == TRUE && dropItem(entry)); } else if (!BufferIsValid(entry->buffer)) { entry->offset++; if (entry->offset <= entry->nlist) entry->curItem = entry->list[entry->offset - 1]; else { ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber); entry->isFinished = TRUE; } } else { do { entryGetNextItem(index, entry); } while (entry->isFinished == FALSE && entry->reduceResult == TRUE && dropItem(entry)); } return entry->isFinished; } /* * Sets key->curItem to new found heap item pointer for one scan key * Returns isFinished, ie TRUE means we did NOT get a new item pointer! * Also, *keyrecheck is set true if recheck is needed for this scan key. */ static bool keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, GinScanKey key, bool *keyrecheck) { uint32 i; GinScanEntry entry; bool res; MemoryContext oldCtx; if (key->isFinished) return TRUE; do { /* * move forward from previously value and set new curItem, which is * minimal from entries->curItems */ ItemPointerSetMax(&key->curItem); for (i = 0; i < key->nentries; i++) { entry = key->scanEntry + i; if (key->entryRes[i]) { /* * Move forward only entries which was the least * on previous call */ if (entry->isFinished == FALSE && entryGetItem(index, entry) == FALSE) { if (compareItemPointers(&entry->curItem, &key->curItem) < 0) key->curItem = entry->curItem; } else key->entryRes[i] = FALSE; } else if (entry->isFinished == FALSE) { if (compareItemPointers(&entry->curItem, &key->curItem) < 0) key->curItem = entry->curItem; } } if (ItemPointerIsMax(&key->curItem)) { /* all entries are finished */ key->isFinished = TRUE; return TRUE; } /* * if key->nentries == 1 then the consistentFn should always succeed, * but we must call it anyway to find out the recheck status. */ /* setting up array for consistentFn */ for (i = 0; i < key->nentries; i++) { entry = key->scanEntry + i; if (entry->isFinished == FALSE && compareItemPointers(&entry->curItem, &key->curItem) == 0) key->entryRes[i] = TRUE; else key->entryRes[i] = FALSE; } /* * Initialize *keyrecheck in case the consistentFn doesn't know it * should set it. The safe assumption in that case is to force * recheck. */ *keyrecheck = true; /* * If one of the entry's scans returns lossy result, return it without * checking - we can't suggest anything helpful to consistentFn. */ if (ItemPointerIsLossyPage(&key->curItem)) return FALSE; oldCtx = MemoryContextSwitchTo(tempCtx); res = DatumGetBool(FunctionCall4(&ginstate->consistentFn[key->attnum-1], PointerGetDatum(key->entryRes), UInt16GetDatum(key->strategy), key->query, PointerGetDatum(keyrecheck))); MemoryContextSwitchTo(oldCtx); MemoryContextReset(tempCtx); } while (!res); return FALSE; } /* * Get ItemPointer of next heap row to be checked from pending list. * Returns false if there are no more. * * The pendingBuffer is presumed pinned and share-locked on entry, and is * pinned and share-locked on success exit. On failure exit it's released. */ static bool scanGetCandidate(IndexScanDesc scan, pendingPosition *pos) { OffsetNumber maxoff; Page page; IndexTuple itup; ItemPointerSetInvalid( &pos->item ); for(;;) { page = BufferGetPage(pos->pendingBuffer); maxoff = PageGetMaxOffsetNumber(page); if ( pos->firstOffset > maxoff ) { BlockNumber blkno = GinPageGetOpaque(page)->rightlink; if ( blkno == InvalidBlockNumber ) { UnlockReleaseBuffer(pos->pendingBuffer); pos->pendingBuffer=InvalidBuffer; return false; } else { /* * Here we must prevent deletion of next page by * insertcleanup process, which may be trying to obtain * exclusive lock on current page. So, we lock next * page before releasing the current one */ Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); LockBuffer(tmpbuf, GIN_SHARE); UnlockReleaseBuffer(pos->pendingBuffer); pos->pendingBuffer = tmpbuf; pos->firstOffset = FirstOffsetNumber; } } else { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); pos->item = itup->t_tid; if ( GinPageHasFullRow(page) ) { /* * find itempointer to the next row */ for(pos->lastOffset = pos->firstOffset+1; pos->lastOffset<=maxoff; pos->lastOffset++) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); if (!ItemPointerEquals(&pos->item, &itup->t_tid)) break; } } else { /* * All itempointers are the same on this page */ pos->lastOffset = maxoff + 1; } break; } } return true; } static bool matchPartialInPendingList(GinState *ginstate, Page page, OffsetNumber off, OffsetNumber maxoff, Datum value, OffsetNumber attrnum, Datum *datum, bool *datumExtracted, StrategyNumber strategy) { IndexTuple itup; int res; while ( off < maxoff ) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); if ( attrnum != gintuple_get_attrnum(ginstate, itup) ) return false; if (datumExtracted[ off-1 ] == false) { datum[ off-1 ] = gin_index_getattr(ginstate, itup); datumExtracted[ off-1 ] = true; } res = DatumGetInt32(FunctionCall3(&ginstate->comparePartialFn[attrnum], value, datum[ off-1 ], UInt16GetDatum(strategy))); if ( res == 0 ) return true; else if (res>0) return false; } return false; } /* * Sets entryRes array for each key by looking at * every entry per indexed value (row) in pending list. * returns true if at least one of datum was matched by key's entry * * The pendingBuffer is presumed pinned and share-locked on entry. */ static bool collectDatumForItem(IndexScanDesc scan, pendingPosition *pos) { GinScanOpaque so = (GinScanOpaque) scan->opaque; OffsetNumber attrnum; Page page; IndexTuple itup; int i, j; bool hasMatch = false; /* * Resets entryRes */ for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; memset( key->entryRes, FALSE, key->nentries ); } for(;;) { Datum datum[ BLCKSZ/sizeof(IndexTupleData) ]; bool datumExtracted[ BLCKSZ/sizeof(IndexTupleData) ]; Assert( pos->lastOffset > pos->firstOffset ); memset(datumExtracted + pos->firstOffset - 1, 0, sizeof(bool) * (pos->lastOffset - pos->firstOffset )); page = BufferGetPage(pos->pendingBuffer); for(i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; for(j=0; jnentries; j++) { OffsetNumber StopLow = pos->firstOffset, StopHigh = pos->lastOffset, StopMiddle; GinScanEntry entry = key->scanEntry + j; if ( key->entryRes[j] ) continue; while (StopLow < StopHigh) { StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); attrnum = gintuple_get_attrnum(&so->ginstate, itup); if (key->attnum < attrnum) StopHigh = StopMiddle; else if (key->attnum > attrnum) StopLow = StopMiddle + 1; else { int res; if (datumExtracted[ StopMiddle-1 ] == false) { datum[ StopMiddle-1 ] = gin_index_getattr(&so->ginstate, itup); datumExtracted[ StopMiddle-1 ] = true; } res = compareEntries(&so->ginstate, entry->attnum, entry->entry, datum[ StopMiddle-1 ]); if ( res == 0 ) { if ( entry->isPartialMatch ) key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopMiddle, pos->lastOffset, entry->entry, entry->attnum, datum, datumExtracted, entry->strategy); else key->entryRes[j] = true; break; } else if ( res < 0 ) StopHigh = StopMiddle; else StopLow = StopMiddle + 1; } } if ( StopLow>=StopHigh && entry->isPartialMatch ) key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopHigh, pos->lastOffset, entry->entry, entry->attnum, datum, datumExtracted, entry->strategy); hasMatch |= key->entryRes[j]; } } pos->firstOffset = pos->lastOffset; if ( GinPageHasFullRow(page) ) { /* * We scan all values from one tuple, go to next one */ return hasMatch; } else { ItemPointerData item = pos->item; if ( scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item) ) elog(ERROR,"Could not process tuple"); /* XXX should not be here ! */ } } return hasMatch; } /* * Collect all matched rows from pending list in bitmap */ static void scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) { GinScanOpaque so = (GinScanOpaque) scan->opaque; MemoryContext oldCtx; bool recheck, keyrecheck, match; int i; pendingPosition pos; Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); BlockNumber blkno; *ntids = 0; LockBuffer(metabuffer, GIN_SHARE); blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head; /* * fetch head of list before unlocking metapage. * head page must be pinned to prevent deletion by vacuum process */ if ( blkno == InvalidBlockNumber ) { /* No pending list, so proceed with normal scan */ UnlockReleaseBuffer( metabuffer ); return; } pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno); LockBuffer(pos.pendingBuffer, GIN_SHARE); pos.firstOffset = FirstOffsetNumber; UnlockReleaseBuffer( metabuffer ); /* * loop for each heap row */ while( scanGetCandidate(scan, &pos) ) { /* * Check entries in rows and setup entryRes array */ if (!collectDatumForItem(scan, &pos)) continue; /* * check for consistent */ oldCtx = MemoryContextSwitchTo(so->tempCtx); recheck = false; match = true; for (i = 0; match && i < so->nkeys; i++) { GinScanKey key = so->keys + i; keyrecheck = true; if ( DatumGetBool(FunctionCall4(&so->ginstate.consistentFn[ key->attnum-1 ], PointerGetDatum(key->entryRes), UInt16GetDatum(key->strategy), key->query, PointerGetDatum(&keyrecheck))) == false ) { match = false; } recheck |= keyrecheck; } MemoryContextSwitchTo(oldCtx); MemoryContextReset(so->tempCtx); if ( match ) { tbm_add_tuples(tbm, &pos.item, 1, recheck); (*ntids)++; } } } /* * Get heap item pointer from scan * returns true if found */ static bool scanGetItem(IndexScanDesc scan, ItemPointerData *item, bool *recheck) { GinScanOpaque so = (GinScanOpaque) scan->opaque; uint32 i; bool keyrecheck; /* * We return recheck = true if any of the keyGetItem calls return * keyrecheck = true. Note that because the second loop might advance * some keys, this could theoretically be too conservative. In practice * though, we expect that a consistentFn's recheck result will depend * only on the operator and the query, so for any one key it should * stay the same regardless of advancing to new items. So it's not * worth working harder. */ *recheck = false; ItemPointerSetMin(item); for (i = 0; i < so->nkeys; i++) { GinScanKey key = so->keys + i; if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, key, &keyrecheck)) return FALSE; /* finished one of keys */ if (compareItemPointers(item, &key->curItem) < 0) *item = key->curItem; *recheck |= keyrecheck; } for (i = 1; i <= so->nkeys; i++) { GinScanKey key = so->keys + i - 1; for (;;) { int cmp = compareItemPointers(item, &key->curItem); if ( cmp != 0 && (ItemPointerIsLossyPage(item) || ItemPointerIsLossyPage(&key->curItem)) ) { /* * if one of ItemPointers points to the whole page then * compare only page's number */ if ( ItemPointerGetBlockNumber(item) == ItemPointerGetBlockNumber(&key->curItem) ) cmp = 0; else cmp = (ItemPointerGetBlockNumber(item) > ItemPointerGetBlockNumber(&key->curItem)) ? 1 : -1; } if (cmp == 0) break; else if (cmp > 0) { if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, key, &keyrecheck)) return FALSE; /* finished one of keys */ *recheck |= keyrecheck; } else { /* returns to begin */ *item = key->curItem; i = 0; break; } } } return TRUE; } #define GinIsNewKey(s) ( ((GinScanOpaque) scan->opaque)->keys == NULL ) #define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes == true ) Datum gingetbitmap(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); int64 ntids; if (GinIsNewKey(scan)) newScanKey(scan); if (GinIsVoidRes(scan)) PG_RETURN_INT64(0); ntids = 0; /* * First, scan the pending list and collect any matching entries into * the bitmap. After we scan a pending item, some other backend could * post it into the main index, and so we might visit it a second time * during the main scan. This is okay because we'll just re-set the * same bit in the bitmap. (The possibility of duplicate visits is a * major reason why GIN can't support the amgettuple API, however.) * Note that it would not do to scan the main index before the pending * list, since concurrent cleanup could then make us miss entries * entirely. */ scanPendingInsert(scan, tbm, &ntids); /* * Now scan the main index. */ startScan(scan); for (;;) { ItemPointerData iptr; bool recheck; CHECK_FOR_INTERRUPTS(); if (!scanGetItem(scan, &iptr, &recheck)) break; if ( ItemPointerIsLossyPage(&iptr) ) tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr)); else tbm_add_tuples(tbm, &iptr, 1, recheck); ntids++; } PG_RETURN_INT64(ntids); }