Diffstat (limited to 'src')
35 files changed, 9503 insertions, 49 deletions
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index a4c4ca7da94..0366d59624e 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,6 @@ subdir = src/backend/access top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = common gist hash heap index nbtree transam gin +SUBDIRS = common gist hash heap index nbtree transam gin spgist include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 240e178b3b4..100172fa4ac 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -19,6 +19,7 @@ #include "access/hash.h" #include "access/nbtree.h" #include "access/reloptions.h" +#include "access/spgist.h" #include "catalog/pg_type.h" #include "commands/defrem.h" #include "commands/tablespace.h" @@ -106,6 +107,14 @@ static relopt_int intRelOpts[] = }, { { + "fillfactor", + "Packs spgist index pages only to this percentage", + RELOPT_KIND_SPGIST + }, + SPGIST_DEFAULT_FILLFACTOR, SPGIST_MIN_FILLFACTOR, 100 + }, + { + { "autovacuum_vacuum_threshold", "Minimum number of tuple updates or deletes prior to vacuum", RELOPT_KIND_HEAP | RELOPT_KIND_TOAST diff --git a/src/backend/access/spgist/Makefile b/src/backend/access/spgist/Makefile new file mode 100644 index 00000000000..918da1fccaf --- /dev/null +++ b/src/backend/access/spgist/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/spgist +# +# IDENTIFICATION +# src/backend/access/spgist/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/spgist +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = spgutils.o spginsert.o spgscan.o spgvacuum.o \ + spgdoinsert.o spgxlog.o \ + spgtextproc.o spgquadtreeproc.o spgkdtreeproc.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README new file mode 100644 index 00000000000..4ff0e357cb4 --- /dev/null +++ b/src/backend/access/spgist/README @@ -0,0 +1,316 @@ +src/backend/access/spgist/README + +SP-GiST is an abbreviation of space-partitioned GiST. It provides a +generalized infrastructure for implementing space-partitioned data +structures, such as quadtrees, k-d trees, and suffix trees (tries). When +implemented in main memory, these structures are usually designed as a set of +dynamically-allocated nodes linked by pointers. This is not suitable for +direct storing on disk, since the chains of pointers can be rather long and +require too many disk accesses. In contrast, disk based data structures +should have a high fanout to minimize I/O. The challenge is to map tree +nodes to disk pages in such a way that the search algorithm accesses only a +few disk pages, even if it traverses many nodes. + +COMMON STRUCTURE DESCRIPTION + +Logically, an SP-GiST tree is a set of tuples, each of which can be either +an inner or leaf tuple. Each inner tuple contains "nodes", which are +(label,pointer) pairs, where the pointer (ItemPointerData) is a pointer to +another inner tuple or to the head of a list of leaf tuples. Inner tuples +can have different numbers of nodes (children). Branches can be of different +depth (actually, there is no control or code to support balancing), which +means that the tree is non-balanced. 
However, leaf and inner tuples cannot +be intermixed at the same level: a downlink from a node of an inner tuple +leads either to one inner tuple, or to a list of leaf tuples. + +The SP-GiST core requires that inner and leaf tuples fit on a single index +page, and even more stringently that the list of leaf tuples reached from a +single inner-tuple node all be stored on the same index page. (Restricting +such lists to not cross pages reduces seeks, and allows the list links to be +stored as simple 2-byte OffsetNumbers.) SP-GiST index opclasses should +therefore ensure that not too many nodes can be needed in one inner tuple, +and that inner-tuple prefixes and leaf-node datum values not be too large. + +Inner and leaf tuples are stored separately: the former are stored only on +"inner" pages, the latter only on "leaf" pages. Also, there are special +restrictions on the root page. Early in an index's life, when there is only +one page's worth of data, the root page contains an unorganized set of leaf +tuples. After the first page split has occurred, the root is required to +contain exactly one inner tuple. + +When the search traversal algorithm reaches an inner tuple, it chooses a set +of nodes to continue tree traverse in depth. If it reaches a leaf page it +scans a list of leaf tuples to find the ones that match the query. + +The insertion algorithm descends the tree similarly, except it must choose +just one node to descend to from each inner tuple. Insertion might also have +to modify the inner tuple before it can descend: it could add a new node, or +it could "split" the tuple to obtain a less-specific prefix that can match +the value to be inserted. If it's necessary to append a new leaf tuple to a +list and there is no free space on page, then SP-GiST creates a new inner +tuple and distributes leaf tuples into a set of lists on, perhaps, several +pages. + +Inner tuple consists of: + + optional prefix value - all successors must be consistent with it. + Example: + suffix tree - prefix value is a common prefix string + quad tree - centroid + k-d tree - one coordinate + + list of nodes, where node is a (label, pointer) pair. + Example of a label: a single character for suffix tree + +Leaf tuple consists of: + + a leaf value + Example: + suffix tree - the rest of string (postfix) + quad and k-d tree - the point itself + + ItemPointer to the heap + +INSERTION ALGORITHM + +Insertion algorithm is designed to keep the tree in a consistent state at +any moment. Here is a simplified insertion algorithm specification +(numbers refer to notes below): + + Start with the first tuple on the root page (1) + + loop: + if (page is leaf) then + if (enough space) + insert on page and exit (5) + else (7) + call PickSplitFn() (2) + end if + else + switch (chooseFn()) + case MatchNode - descend through selected node + case AddNode - add node and then retry chooseFn (3, 6) + case SplitTuple - split inner tuple to prefix and postfix, then + retry chooseFn with the prefix tuple (4, 6) + end if + +Notes: + +(1) Initially, we just dump leaf tuples into the root page until it is full; +then we split it. Once the root is not a leaf page, it can have only one +inner tuple, so as to keep the amount of free space on the root as large as +possible. Both of these rules are meant to postpone doing PickSplit on the +root for as long as possible, so that the topmost partitioning of the search +space is as good as we can easily make it. 
+ +(2) Current implementation allows to do picksplit and insert a new leaf tuple +in one operation, if the new list of leaf tuples fits on one page. It's +always possible for trees with small nodes like quad tree or k-d tree, but +suffix trees may require another picksplit. + +(3) Addition of node must keep size of inner tuple small enough to fit on a +page. After addition, inner tuple could become too large to be stored on +current page because of other tuples on page. In this case it will be moved +to another inner page (see notes about page management). When moving tuple to +another page, we can't change the numbers of other tuples on the page, else +we'd make downlink pointers to them invalid. To prevent that, SP-GiST leaves +a "placeholder" tuple, which can be reused later whenever another tuple is +added to the page. See also Concurrency and Vacuum sections below. Right now +only suffix trees could add a node to the tuple; quad trees and k-d trees +make all possible nodes at once in PickSplitFn() call. + +(4) Prefix value could only partially match a new value, so the SplitTuple +action allows breaking the current tree branch into upper and lower sections. +Another way to say it is that we can split the current inner tuple into +"prefix" and "postfix" parts, where the prefix part is able to match the +incoming new value. Consider example of insertion into a suffix tree. We use +the following notation, where tuple's id is just for discussion (no such id +is actually stored): + +inner tuple: {tuple id}(prefix string)[ comma separated list of node labels ] +leaf tuple: {tuple id}<value> + +Suppose we need to insert string 'www.gogo.com' into inner tuple + + {1}(www.google.com/)[a, i] + +The string does not match the prefix so we cannot descend. We must +split the inner tuple into two tuples: + + {2}(www.go)[o] - prefix tuple + | + {3}(gle.com/)[a,i] - postfix tuple + +On the next iteration of loop we find that 'www.gogo.com' matches the +prefix, but not any node label, so we add a node [g] to tuple {2}: + + NIL (no child exists yet) + | + {2}(www.go)[o, g] + | + {3}(gle.com/)[a,i] + +Now we can descend through the [g] node, which will cause us to update +the target string to just 'o.com'. Finally, we'll insert a leaf tuple +bearing that string: + + {4}<o.com> + | + {2}(www.go)[o, g] + | + {3}(gle.com/)[a,i] + +As we can see, the original tuple's node array moves to postfix tuple without +any changes. Note also that SP-GiST core assumes that prefix tuple is not +larger than old inner tuple. That allows us to store prefix tuple directly +in place of old inner tuple. SP-GiST core will try to store postfix tuple on +the same page if possible, but will use another page if there is not enough +free space (see notes 5 and 6). Currently, quad and k-d trees don't use this +feature, because they have no concept of a prefix being "inconsistent" with +any new value. They grow their depth only by PickSplitFn() call. + +(5) If pointer from node of parent is a NIL pointer, algorithm chooses a leaf +page to store on. At first, it tries to use the last-used leaf page with the +largest free space (which we track in each backend) to better utilize disk +space. If that's not large enough, then the algorithm allocates a new page. + +(6) Management of inner pages is very similar to management of leaf pages, +described in (5). + +(7) Actually, current implementation can move the whole list of leaf tuples +and a new tuple to another page, if the list is short enough. 
This improves +space utilization, but doesn't change the basis of the algorithm. + +CONCURRENCY + +While descending the tree, the insertion algorithm holds exclusive lock on +two tree levels at a time, ie both parent and child pages (parent and child +pages can be the same, see notes above). There is a possibility of deadlock +between two insertions if there are cross-referenced pages in different +branches. That is, if inner tuple on page M has a child on page N while +an inner tuple from another branch is on page N and has a child on page M, +then two insertions descending the two branches could deadlock. To prevent +deadlocks we introduce a concept of "triple parity" of pages: if inner tuple +is on page with BlockNumber N, then its child tuples should be placed on the +same page, or else on a page with BlockNumber M where (N+1) mod 3 == M mod 3. +This rule guarantees that tuples on page M will have no children on page N, +since (M+1) mod 3 != N mod 3. + +Insertion may also need to take locks on an additional inner and/or leaf page +to add tuples of the right type(s), when there's not enough room on the pages +it descended through. However, we don't care exactly which such page we add +to, so deadlocks can be avoided by conditionally locking the additional +buffers: if we fail to get lock on an additional page, just try another one. + +Search traversal algorithm is rather traditional. At each non-leaf level, it +share-locks the page, identifies which node(s) in the current inner tuple +need to be visited, and puts those addresses on a stack of pages to examine +later. It then releases lock on the current buffer before visiting the next +stack item. So only one page is locked at a time, and no deadlock is +possible. But instead, we have to worry about race conditions: by the time +we arrive at a pointed-to page, a concurrent insertion could have replaced +the target inner tuple (or leaf tuple chain) with data placed elsewhere. +To handle that, whenever the insertion algorithm changes a nonempty downlink +in an inner tuple, it places a "redirect tuple" in place of the lower-level +inner tuple or leaf-tuple chain head that the link formerly led to. Scans +(though not insertions) must be prepared to honor such redirects. Only a +scan that had already visited the parent level could possibly reach such a +redirect tuple, so we can remove redirects once all active transactions have +been flushed out of the system. + +DEAD TUPLES + +Tuples on leaf pages can be in one of four states: + +SPGIST_LIVE: normal, live pointer to a heap tuple. + +SPGIST_REDIRECT: placeholder that contains a link to another place in the +index. When a chain of leaf tuples has to be moved to another page, a +redirect tuple is inserted in place of the chain's head tuple. The parent +inner tuple's downlink is updated when this happens, but concurrent scans +might be "in flight" from the parent page to the child page (since they +release lock on the parent page before attempting to lock the child). +The redirect pointer serves to tell such a scan where to go. A redirect +pointer is only needed for as long as such concurrent scans could be in +progress. Eventually, it's converted into a PLACEHOLDER dead tuple by +VACUUM, and is then a candidate for replacement. Searches that find such +a tuple (which should never be part of a chain) should immediately proceed +to the other place, forgetting about the redirect tuple. 
Insertions that +reach such a tuple should raise error, since a valid downlink should never +point to such a tuple. + +SPGIST_DEAD: tuple is dead, but it cannot be removed or moved to a +different offset on the page because there is a link leading to it from +some inner tuple elsewhere in the index. (Such a tuple is never part of a +chain, since we don't need one unless there is nothing live left in its +chain.) Searches should ignore such entries. If an insertion action +arrives at such a tuple, it should either replace it in-place (if there's +room on the page to hold the desired new leaf tuple) or replace it with a +redirection pointer to wherever it puts the new leaf tuple. + +SPGIST_PLACEHOLDER: tuple is dead, and there are known to be no links to +it from elsewhere. When a live tuple is deleted or moved away, and not +replaced by a redirect pointer, it is replaced by a placeholder to keep +the offsets of later tuples on the same page from changing. Placeholders +can be freely replaced when adding a new tuple to the page, and also +VACUUM will delete any that are at the end of the range of valid tuple +offsets. Both searches and insertions should complain if a link from +elsewhere leads them to a placeholder tuple. + +When the root page is also a leaf, all its tuple should be in LIVE state; +there's no need for the others since there are no links and no need to +preserve offset numbers. + +Tuples on inner pages can be in LIVE, REDIRECT, or PLACEHOLDER states. +The REDIRECT state has the same function as on leaf pages, to send +concurrent searches to the place where they need to go after an inner +tuple is moved to another page. Expired REDIRECT pointers are converted +to PLACEHOLDER status by VACUUM, and are then candidates for replacement. +DEAD state is not currently possible, since VACUUM does not attempt to +remove unused inner tuples. + +VACUUM + +VACUUM (or more precisely, spgbulkdelete) performs a single sequential scan +over the entire index. On both leaf and inner pages, we can convert old +REDIRECT tuples into PLACEHOLDER status, and then remove any PLACEHOLDERs +that are at the end of the page (since they aren't needed to preserve the +offsets of any live tuples). On leaf pages, we scan for tuples that need +to be deleted because their heap TIDs match a vacuum target TID. + +If we find a deletable tuple that is not at the head of its chain, we +can simply replace it with a PLACEHOLDER, updating the chain links to +remove it from the chain. If it is at the head of its chain, but there's +at least one live tuple remaining in the chain, we move that live tuple +to the head tuple's offset, replacing it with a PLACEHOLDER to preserve +the offsets of other tuples. This keeps the parent inner tuple's downlink +valid. If we find ourselves deleting all live tuples in a chain, we +replace the head tuple with a DEAD tuple and the rest with PLACEHOLDERS. +The parent inner tuple's downlink thus points to the DEAD tuple, and the +rules explained in the previous section keep everything working. + +VACUUM doesn't know a-priori which tuples are heads of their chains, but +it can easily figure that out by constructing a predecessor array that's +the reverse map of the nextOffset links (ie, when we see tuple x links to +tuple y, we set predecessor[y] = x). Then head tuples are the ones with +no predecessor. + +spgbulkdelete also updates the index's free space map. 
+ +Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was +performed; otherwise, it does an spgbulkdelete scan with an empty target +list, so as to clean up redirections and placeholders, update the free +space map, and gather statistics. + +LAST USED PAGE MANAGEMENT + +List of last used pages contains four pages - a leaf page and three inner +pages, one from each "triple parity" group. This list is stored between +calls on the index meta page, but updates are never WAL-logged to decrease +WAL traffic. Incorrect data on meta page isn't critical, because we could +allocate a new page at any moment. + +AUTHORS + + Teodor Sigaev <teodor@sigaev.ru> + Oleg Bartunov <oleg@sai.msu.su> diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c new file mode 100644 index 00000000000..4bb8dfa1509 --- /dev/null +++ b/src/backend/access/spgist/spgdoinsert.c @@ -0,0 +1,2065 @@ +/*------------------------------------------------------------------------- + * + * spgdoinsert.c + * implementation of insert algorithm + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgdoinsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/spgist_private.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" + + +/* + * SPPageDesc tracks all info about a page we are inserting into. In some + * situations it actually identifies a tuple, or even a specific node within + * an inner tuple. But any of the fields can be invalid. If the buffer + * field is valid, it implies we hold pin and exclusive lock on that buffer. + * page pointer should be valid exactly when buffer is. + */ +typedef struct SPPageDesc +{ + BlockNumber blkno; /* block number, or InvalidBlockNumber */ + Buffer buffer; /* page's buffer number, or InvalidBuffer */ + Page page; /* pointer to page buffer, or NULL */ + OffsetNumber offnum; /* offset of tuple, or InvalidOffsetNumber */ + int node; /* node number within inner tuple, or -1 */ +} SPPageDesc; + + +/* + * Set the item pointer in the nodeN'th entry in inner tuple tup. This + * is used to update the parent inner tuple's downlink after a move or + * split operation. + */ +void +updateNodeLink(SpGistInnerTuple tup, int nodeN, + BlockNumber blkno, OffsetNumber offset) +{ + int i; + SpGistNodeTuple node; + + SGITITERATE(tup, i, node) + { + if (i == nodeN) + { + ItemPointerSet(&node->t_tid, blkno, offset); + return; + } + } + + elog(ERROR, "failed to find requested node %d in SPGiST inner tuple", + nodeN); +} + +/* + * Form a new inner tuple containing one more node than the given one, with + * the specified label datum, inserted at offset "offset" in the node array. + * The new tuple's prefix is the same as the old one's. + * + * Note that the new node initially has an invalid downlink. We'll find a + * page to point it to later. 
+ */ +static SpGistInnerTuple +addNode(SpGistState *state, SpGistInnerTuple tuple, Datum label, int offset) +{ + SpGistNodeTuple node, + *nodes; + int i; + + /* if offset is negative, insert at end */ + if (offset < 0) + offset = tuple->nNodes; + else if (offset > tuple->nNodes) + elog(ERROR, "invalid offset for adding node to SPGiST inner tuple"); + + nodes = palloc(sizeof(SpGistNodeTuple) * (tuple->nNodes + 1)); + SGITITERATE(tuple, i, node) + { + if (i < offset) + nodes[i] = node; + else + nodes[i + 1] = node; + } + + nodes[offset] = spgFormNodeTuple(state, label, false); + + return spgFormInnerTuple(state, + (tuple->prefixSize > 0), + SGITDATUM(tuple, state), + tuple->nNodes + 1, + nodes); +} + +/* qsort comparator for sorting OffsetNumbers */ +static int +cmpOffsetNumbers(const void *a, const void *b) +{ + if (*(const OffsetNumber *) a == *(const OffsetNumber *) b) + return 0; + return (*(const OffsetNumber *) a > *(const OffsetNumber *) b) ? 1 : -1; +} + +/* + * Delete multiple tuples from an index page, preserving tuple offset numbers. + * + * The first tuple in the given list is replaced with a dead tuple of type + * "firststate" (REDIRECT/DEAD/PLACEHOLDER); the remaining tuples are replaced + * with dead tuples of type "reststate". If either firststate or reststate + * is REDIRECT, blkno/offnum specify where to link to. + * + * NB: this is used during WAL replay, so beware of trying to make it too + * smart. In particular, it shouldn't use "state" except for calling + * spgFormDeadTuple(). + */ +void +spgPageIndexMultiDelete(SpGistState *state, Page page, + OffsetNumber *itemnos, int nitems, + int firststate, int reststate, + BlockNumber blkno, OffsetNumber offnum) +{ + OffsetNumber firstItem; + OffsetNumber *sortednos; + SpGistDeadTuple tuple = NULL; + int i; + + if (nitems == 0) + return; /* nothing to do */ + + /* + * For efficiency we want to use PageIndexMultiDelete, which requires the + * targets to be listed in sorted order, so we have to sort the itemnos + * array. (This also greatly simplifies the math for reinserting the + * replacement tuples.) However, we must not scribble on the caller's + * array, so we have to make a copy. + */ + sortednos = (OffsetNumber *) palloc(sizeof(OffsetNumber) * nitems); + memcpy(sortednos, itemnos, sizeof(OffsetNumber) * nitems); + if (nitems > 1) + qsort(sortednos, nitems, sizeof(OffsetNumber), cmpOffsetNumbers); + + PageIndexMultiDelete(page, sortednos, nitems); + + firstItem = itemnos[0]; + + for (i = 0; i < nitems; i++) + { + OffsetNumber itemno = sortednos[i]; + int tupstate; + + tupstate = (itemno == firstItem) ? firststate : reststate; + if (tuple == NULL || tuple->tupstate != tupstate) + tuple = spgFormDeadTuple(state, tupstate, blkno, offnum); + + if (PageAddItem(page, (Item) tuple, tuple->size, + itemno, false, false) != itemno) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + tuple->size); + + if (tupstate == SPGIST_REDIRECT) + SpGistPageGetOpaque(page)->nRedirection++; + else if (tupstate == SPGIST_PLACEHOLDER) + SpGistPageGetOpaque(page)->nPlaceholder++; + } + + pfree(sortednos); +} + +/* + * Update the parent inner tuple's downlink, and mark the parent buffer + * dirty (this must be the last change to the parent page in the current + * WAL action). 
+ */ +static void +saveNodeLink(Relation index, SPPageDesc *parent, + BlockNumber blkno, OffsetNumber offnum) +{ + SpGistInnerTuple innerTuple; + + innerTuple = (SpGistInnerTuple) PageGetItem(parent->page, + PageGetItemId(parent->page, parent->offnum)); + + updateNodeLink(innerTuple, parent->node, blkno, offnum); + + MarkBufferDirty(parent->buffer); +} + +/* + * Add a leaf tuple to a leaf page where there is known to be room for it + */ +static void +addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple, + SPPageDesc *current, SPPageDesc *parent, bool isNew) +{ + XLogRecData rdata[4]; + spgxlogAddLeaf xlrec; + + xlrec.node = index->rd_node; + xlrec.blknoLeaf = current->blkno; + xlrec.newPage = isNew; + + /* these will be filled below as needed */ + xlrec.offnumLeaf = InvalidOffsetNumber; + xlrec.offnumHeadLeaf = InvalidOffsetNumber; + xlrec.blknoParent = InvalidBlockNumber; + xlrec.offnumParent = InvalidOffsetNumber; + xlrec.nodeI = 0; + + ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); + /* we assume sizeof(xlrec) is at least int-aligned */ + ACCEPT_RDATA_DATA(leafTuple, leafTuple->size, 1); + ACCEPT_RDATA_BUFFER(current->buffer, 2); + + START_CRIT_SECTION(); + + if (current->offnum == InvalidOffsetNumber || + current->blkno == SPGIST_HEAD_BLKNO) + { + /* Tuple is not part of a chain */ + leafTuple->nextOffset = InvalidOffsetNumber; + current->offnum = SpGistPageAddNewItem(state, current->page, + (Item) leafTuple, leafTuple->size, + NULL, false); + + xlrec.offnumLeaf = current->offnum; + + /* Must update parent's downlink if any */ + if (parent->buffer != InvalidBuffer) + { + xlrec.blknoParent = parent->blkno; + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + saveNodeLink(index, parent, current->blkno, current->offnum); + + ACCEPT_RDATA_BUFFER(parent->buffer, 3); + } + } + else + { + /* + * Tuple must be inserted into existing chain. We mustn't change + * the chain's head address, but we don't need to chase the entire + * chain to put the tuple at the end; we can insert it second. + * + * Also, it's possible that the "chain" consists only of a DEAD tuple, + * in which case we should replace the DEAD tuple in-place. 
+ */ + SpGistLeafTuple head; + OffsetNumber offnum; + + head = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, current->offnum)); + if (head->tupstate == SPGIST_LIVE) + { + leafTuple->nextOffset = head->nextOffset; + offnum = SpGistPageAddNewItem(state, current->page, + (Item) leafTuple, leafTuple->size, + NULL, false); + + /* + * re-get head of list because it could have been moved on page, + * and set new second element + */ + head = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, current->offnum)); + head->nextOffset = offnum; + + xlrec.offnumLeaf = offnum; + xlrec.offnumHeadLeaf = current->offnum; + } + else if (head->tupstate == SPGIST_DEAD) + { + leafTuple->nextOffset = InvalidOffsetNumber; + PageIndexTupleDelete(current->page, current->offnum); + if (PageAddItem(current->page, + (Item) leafTuple, leafTuple->size, + current->offnum, false, false) != current->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + leafTuple->size); + + /* WAL replay distinguishes this case by equal offnums */ + xlrec.offnumLeaf = current->offnum; + xlrec.offnumHeadLeaf = current->offnum; + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", head->tupstate); + } + + MarkBufferDirty(current->buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF, rdata); + + PageSetLSN(current->page, recptr); + PageSetTLI(current->page, ThisTimeLineID); + + /* update parent only if we actually changed it */ + if (xlrec.blknoParent != InvalidBlockNumber) + { + PageSetLSN(parent->page, recptr); + PageSetTLI(parent->page, ThisTimeLineID); + } + } + + END_CRIT_SECTION(); +} + +/* + * Count the number and total size of leaf tuples in the chain starting at + * current->offnum. Return number into *nToSplit and total size as function + * result. + * + * Klugy special case when considering the root page (i.e., root is a leaf + * page, but we're about to split for the first time): return fake large + * values to force spgdoinsert() to take the doPickSplit rather than + * moveLeafs code path. moveLeafs is not prepared to deal with root page. + */ +static int +checkSplitConditions(Relation index, SpGistState *state, + SPPageDesc *current, int *nToSplit) +{ + int i, + n = 0, + totalSize = 0; + + if (current->blkno == SPGIST_HEAD_BLKNO) + { + /* return impossible values to force split */ + *nToSplit = BLCKSZ; + return BLCKSZ; + } + + i = current->offnum; + while (i != InvalidOffsetNumber) + { + SpGistLeafTuple it; + + Assert(i >= FirstOffsetNumber && + i <= PageGetMaxOffsetNumber(current->page)); + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + if (it->tupstate == SPGIST_LIVE) + { + n++; + totalSize += it->size + sizeof(ItemIdData); + } + else if (it->tupstate == SPGIST_DEAD) + { + /* We could see a DEAD tuple as first/only chain item */ + Assert(i == current->offnum); + Assert(it->nextOffset == InvalidOffsetNumber); + /* Don't count it in result, because it won't go to other page */ + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + + i = it->nextOffset; + } + + *nToSplit = n; + + return totalSize; +} + +/* + * current points to a leaf-tuple chain that we wanted to add newLeafTuple to, + * but the chain has to be moved because there's not enough room to add + * newLeafTuple to its page. We use this method when the chain contains + * very little data so a split would be inefficient. 
We are sure we can + * fit the chain plus newLeafTuple on one other page. + */ +static void +moveLeafs(Relation index, SpGistState *state, + SPPageDesc *current, SPPageDesc *parent, + SpGistLeafTuple newLeafTuple) +{ + int i, + nDelete, + nInsert, + size; + Buffer nbuf; + Page npage; + SpGistLeafTuple it; + OffsetNumber r = InvalidOffsetNumber, + startOffset = InvalidOffsetNumber; + bool replaceDead = false; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + BlockNumber nblkno; + XLogRecData rdata[7]; + spgxlogMoveLeafs xlrec; + char *leafdata, + *leafptr; + + /* This doesn't work on root page */ + Assert(parent->buffer != InvalidBuffer); + Assert(parent->buffer != current->buffer); + + /* Locate the tuples to be moved, and count up the space needed */ + i = PageGetMaxOffsetNumber(current->page); + toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * i); + toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (i + 1)); + + size = newLeafTuple->size + sizeof(ItemIdData); + + nDelete = 0; + i = current->offnum; + while (i != InvalidOffsetNumber) + { + SpGistLeafTuple it; + + Assert(i >= FirstOffsetNumber && + i <= PageGetMaxOffsetNumber(current->page)); + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + + if (it->tupstate == SPGIST_LIVE) + { + toDelete[nDelete] = i; + size += it->size + sizeof(ItemIdData); + nDelete++; + } + else if (it->tupstate == SPGIST_DEAD) + { + /* We could see a DEAD tuple as first/only chain item */ + Assert(i == current->offnum); + Assert(it->nextOffset == InvalidOffsetNumber); + /* We don't want to move it, so don't count it in size */ + toDelete[nDelete] = i; + nDelete++; + replaceDead = true; + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + + i = it->nextOffset; + } + + /* Find a leaf page that will hold them */ + nbuf = SpGistGetBuffer(index, GBUF_LEAF, size, &xlrec.newPage); + npage = BufferGetPage(nbuf); + nblkno = BufferGetBlockNumber(nbuf); + Assert(nblkno != current->blkno); + + /* prepare WAL info */ + xlrec.node = index->rd_node; + STORE_STATE(state, xlrec.stateSrc); + + xlrec.blknoSrc = current->blkno; + xlrec.blknoDst = nblkno; + xlrec.nMoves = nDelete; + xlrec.replaceDead = replaceDead; + + xlrec.blknoParent = parent->blkno; + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + leafdata = leafptr = palloc(size); + + START_CRIT_SECTION(); + + /* copy all the old tuples to new page, unless they're dead */ + nInsert = 0; + if (!replaceDead) + { + for (i = 0; i < nDelete; i++) + { + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, toDelete[i])); + Assert(it->tupstate == SPGIST_LIVE); + + /* + * Update chain link (notice the chain order gets reversed, but we + * don't care). We're modifying the tuple on the source page + * here, but it's okay since we're about to delete it. 
+ */ + it->nextOffset = r; + + r = SpGistPageAddNewItem(state, npage, (Item) it, it->size, + &startOffset, false); + + toInsert[nInsert] = r; + nInsert++; + + /* save modified tuple into leafdata as well */ + memcpy(leafptr, it, it->size); + leafptr += it->size; + } + } + + /* add the new tuple as well */ + newLeafTuple->nextOffset = r; + r = SpGistPageAddNewItem(state, npage, + (Item) newLeafTuple, newLeafTuple->size, + &startOffset, false); + toInsert[nInsert] = r; + nInsert++; + memcpy(leafptr, newLeafTuple, newLeafTuple->size); + leafptr += newLeafTuple->size; + + /* + * Now delete the old tuples, leaving a redirection pointer behind for + * the first one, unless we're doing an index build; in which case there + * can't be any concurrent scan so we need not provide a redirect. + */ + spgPageIndexMultiDelete(state, current->page, toDelete, nDelete, + state->isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + nblkno, r); + + /* Update parent's downlink and mark parent page dirty */ + saveNodeLink(index, parent, nblkno, r); + + /* Mark the leaf pages too */ + MarkBufferDirty(current->buffer); + MarkBufferDirty(nbuf); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + ACCEPT_RDATA_DATA(&xlrec, MAXALIGN(sizeof(xlrec)), 0); + ACCEPT_RDATA_DATA(toDelete, MAXALIGN(sizeof(OffsetNumber) * nDelete), 1); + ACCEPT_RDATA_DATA(toInsert, MAXALIGN(sizeof(OffsetNumber) * nInsert), 2); + ACCEPT_RDATA_DATA(leafdata, leafptr - leafdata, 3); + ACCEPT_RDATA_BUFFER(current->buffer, 4); + ACCEPT_RDATA_BUFFER(nbuf, 5); + ACCEPT_RDATA_BUFFER(parent->buffer, 6); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_MOVE_LEAFS, rdata); + + PageSetLSN(current->page, recptr); + PageSetTLI(current->page, ThisTimeLineID); + PageSetLSN(npage, recptr); + PageSetTLI(npage, ThisTimeLineID); + PageSetLSN(parent->page, recptr); + PageSetTLI(parent->page, ThisTimeLineID); + } + + END_CRIT_SECTION(); + + /* Update local free-space cache and release new buffer */ + SpGistSetLastUsedPage(index, nbuf); + UnlockReleaseBuffer(nbuf); +} + +/* + * Update previously-created redirection tuple with appropriate destination + * + * We use this when it's not convenient to know the destination first. + * The tuple should have been made with the "impossible" destination of + * the metapage. + */ +static void +setRedirectionTuple(SPPageDesc *current, OffsetNumber position, + BlockNumber blkno, OffsetNumber offnum) +{ + SpGistDeadTuple dt; + + dt = (SpGistDeadTuple) PageGetItem(current->page, + PageGetItemId(current->page, position)); + Assert(dt->tupstate == SPGIST_REDIRECT); + Assert(ItemPointerGetBlockNumber(&dt->pointer) == SPGIST_METAPAGE_BLKNO); + ItemPointerSet(&dt->pointer, blkno, offnum); +} + +/* + * Test to see if the user-defined picksplit function failed to do its job, + * ie, it put all the leaf tuples into the same node. + * If so, randomly divide the tuples into several nodes (all with the same + * label) and return TRUE to select allTheSame mode for this inner tuple. + * + * If we know that the leaf tuples wouldn't all fit on one page, then we + * exclude the last tuple (which is the incoming new tuple that forced a split) + * from the check to see if more than one node is used. The reason for this + * is that if the existing tuples are put into only one chain, then even if + * we move them all to an empty page, there would still not be room for the + * new tuple, so we'd get into an infinite loop of picksplit attempts. 
+ * Forcing allTheSame mode dodges this problem by ensuring the old tuples will + * be split across pages. (Exercise for the reader: figure out why this + * fixes the problem even when there is only one old tuple.) + */ +static bool +checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig, + bool *includeNew) +{ + int theNode; + int limit; + int i; + + /* For the moment, assume we can include the new leaf tuple */ + *includeNew = true; + + /* If there's only the new leaf tuple, don't select allTheSame mode */ + if (in->nTuples <= 1) + return false; + + /* If tuple set doesn't fit on one page, ignore the new tuple in test */ + limit = tooBig ? in->nTuples - 1 : in->nTuples; + + /* Check to see if more than one node is populated */ + theNode = out->mapTuplesToNodes[0]; + for (i = 1; i < limit; i++) + { + if (out->mapTuplesToNodes[i] != theNode) + return false; + } + + /* Nope, so override the picksplit function's decisions */ + + /* If the new tuple is in its own node, it can't be included in split */ + if (tooBig && out->mapTuplesToNodes[in->nTuples - 1] != theNode) + *includeNew = false; + + out->nNodes = 8; /* arbitrary number of child nodes */ + + /* Random assignment of tuples to nodes (note we include new tuple) */ + for (i = 0; i < in->nTuples; i++) + out->mapTuplesToNodes[i] = i % out->nNodes; + + /* The opclass may not use node labels, but if it does, duplicate 'em */ + if (out->nodeLabels) + { + Datum theLabel = out->nodeLabels[theNode]; + + out->nodeLabels = (Datum *) palloc(sizeof(Datum) * out->nNodes); + for (i = 0; i < out->nNodes; i++) + out->nodeLabels[i] = theLabel; + } + + /* We don't touch the prefix or the leaf tuple datum assignments */ + + return true; +} + +/* + * current points to a leaf-tuple chain that we wanted to add newLeafTuple to, + * but the chain has to be split because there's not enough room to add + * newLeafTuple to its page. + * + * This function splits the leaf tuple set according to picksplit's rules, + * creating one or more new chains that are spread across the current page + * and an additional leaf page (we assume that two leaf pages will be + * sufficient). A new inner tuple is created, and the parent downlink + * pointer is updated to point to that inner tuple instead of the leaf chain. + * + * On exit, current contains the address of the new inner tuple. + * + * Returns true if we successfully inserted newLeafTuple during this function, + * false if caller still has to do it (meaning another picksplit operation is + * probably needed). Failure could occur if the picksplit result is fairly + * unbalanced, or if newLeafTuple is just plain too big to fit on a page. + * Because we force the picksplit result to be at least two chains, each + * cycle will get rid of at least one leaf tuple from the chain, so the loop + * will eventually terminate if lack of balance is the issue. If the tuple + * is too big, we assume that repeated picksplit operations will eventually + * make it small enough by repeated prefix-stripping. A broken opclass could + * make this an infinite loop, though. 
+ */ +static bool +doPickSplit(Relation index, SpGistState *state, + SPPageDesc *current, SPPageDesc *parent, + SpGistLeafTuple newLeafTuple, int level, bool isNew) +{ + bool insertedNew = false; + spgPickSplitIn in; + spgPickSplitOut out; + bool includeNew; + int i, + max, + n; + SpGistInnerTuple innerTuple; + SpGistNodeTuple node, + *nodes; + Buffer newInnerBuffer, + newLeafBuffer; + ItemPointerData *heapPtrs; + uint8 *leafPageSelect; + int *leafSizes; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + OffsetNumber redirectTuplePos = InvalidOffsetNumber; + OffsetNumber startOffsets[2]; + SpGistLeafTuple *newLeafs; + int spaceToDelete; + int currentFreeSpace; + int totalLeafSizes; + bool allTheSame; + XLogRecData rdata[10]; + int nRdata; + spgxlogPickSplit xlrec; + char *leafdata, + *leafptr; + SPPageDesc saveCurrent; + int nToDelete, + nToInsert, + maxToInclude; + + in.level = level; + + /* + * Allocate per-leaf-tuple work arrays with max possible size + */ + max = PageGetMaxOffsetNumber(current->page); + n = max + 1; + in.datums = (Datum *) palloc(sizeof(Datum) * n); + heapPtrs = (ItemPointerData *) palloc(sizeof(ItemPointerData) * n); + toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n); + toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n); + newLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n); + leafPageSelect = (uint8 *) palloc(sizeof(uint8) * n); + + xlrec.node = index->rd_node; + STORE_STATE(state, xlrec.stateSrc); + + /* + * Form list of leaf tuples which will be distributed as split result; + * also, count up the amount of space that will be freed from current. + * (Note that in the non-root case, we won't actually delete the old + * tuples, only replace them with redirects or placeholders.) + */ + nToInsert = 0; + nToDelete = 0; + spaceToDelete = 0; + if (current->blkno == SPGIST_HEAD_BLKNO) + { + /* + * We are splitting the root (which up to now is also a leaf page). + * Its tuples are not linked, so scan sequentially to get them all. + * We ignore the original value of current->offnum. 
+ */ + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple it; + + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + if (it->tupstate == SPGIST_LIVE) + { + in.datums[nToInsert] = SGLTDATUM(it, state); + heapPtrs[nToInsert] = it->heapPtr; + nToInsert++; + toDelete[nToDelete] = i; + nToDelete++; + /* we will delete the tuple altogether, so count full space */ + spaceToDelete += it->size + sizeof(ItemIdData); + } + else /* tuples on root should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + } + } + else + { + /* Normal case, just collect the leaf tuples in the chain */ + i = current->offnum; + while (i != InvalidOffsetNumber) + { + SpGistLeafTuple it; + + Assert(i >= FirstOffsetNumber && i <= max); + it = (SpGistLeafTuple) PageGetItem(current->page, + PageGetItemId(current->page, i)); + if (it->tupstate == SPGIST_LIVE) + { + in.datums[nToInsert] = SGLTDATUM(it, state); + heapPtrs[nToInsert] = it->heapPtr; + nToInsert++; + toDelete[nToDelete] = i; + nToDelete++; + /* we will not delete the tuple, only replace with dead */ + Assert(it->size >= SGDTSIZE); + spaceToDelete += it->size - SGDTSIZE; + } + else if (it->tupstate == SPGIST_DEAD) + { + /* We could see a DEAD tuple as first/only chain item */ + Assert(i == current->offnum); + Assert(it->nextOffset == InvalidOffsetNumber); + toDelete[nToDelete] = i; + nToDelete++; + /* replacing it with redirect will save no space */ + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate); + + i = it->nextOffset; + } + } + in.nTuples = nToInsert; + + /* + * We may not actually insert new tuple because another picksplit may be + * necessary due to too large value, but we will try to to allocate enough + * space to include it; and in any case it has to be included in the input + * for the picksplit function. So don't increment nToInsert yet. + */ + in.datums[in.nTuples] = SGLTDATUM(newLeafTuple, state); + heapPtrs[in.nTuples] = newLeafTuple->heapPtr; + in.nTuples++; + + /* + * Perform split using user-defined method. + */ + memset(&out, 0, sizeof(out)); + + FunctionCall2Coll(&state->picksplitFn, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); + + /* + * Form new leaf tuples and count up the total space needed. + */ + totalLeafSizes = 0; + for (i = 0; i < in.nTuples; i++) + { + newLeafs[i] = spgFormLeafTuple(state, heapPtrs + i, + out.leafTupleDatums[i]); + totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData); + } + + /* + * Check to see if the picksplit function failed to separate the values, + * ie, it put them all into the same child node. If so, select allTheSame + * mode and create a random split instead. See comments for + * checkAllTheSame as to why we need to know if the new leaf tuples could + * fit on one page. + */ + allTheSame = checkAllTheSame(&in, &out, + totalLeafSizes > SPGIST_PAGE_CAPACITY, + &includeNew); + + /* + * If checkAllTheSame decided we must exclude the new tuple, don't + * consider it any further. + */ + if (includeNew) + maxToInclude = in.nTuples; + else + { + maxToInclude = in.nTuples - 1; + totalLeafSizes -= newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData); + } + + /* + * Allocate per-node work arrays. Since checkAllTheSame could replace + * out.nNodes with a value larger than the number of tuples on the input + * page, we can't allocate these arrays before here. 
+ */ + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * out.nNodes); + leafSizes = (int *) palloc0(sizeof(int) * out.nNodes); + + /* + * Form nodes of inner tuple and inner tuple itself + */ + for (i = 0; i < out.nNodes; i++) + { + Datum label = (Datum) 0; + bool isnull = (out.nodeLabels == NULL); + + if (!isnull) + label = out.nodeLabels[i]; + nodes[i] = spgFormNodeTuple(state, label, isnull); + } + innerTuple = spgFormInnerTuple(state, + out.hasPrefix, out.prefixDatum, + out.nNodes, nodes); + innerTuple->allTheSame = allTheSame; + + /* + * Update nodes[] array to point into the newly formed innerTuple, so + * that we can adjust their downlinks below. + */ + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + /* + * Re-scan new leaf tuples and count up the space needed under each node. + */ + for (i = 0; i < maxToInclude; i++) + { + n = out.mapTuplesToNodes[i]; + if (n < 0 || n >= out.nNodes) + elog(ERROR, "inconsistent result of SPGiST picksplit function"); + leafSizes[n] += newLeafs[i]->size + sizeof(ItemIdData); + } + + /* + * To perform the split, we must insert a new inner tuple, which can't + * go on a leaf page; and unless we are splitting the root page, we + * must then update the parent tuple's downlink to point to the inner + * tuple. If there is room, we'll put the new inner tuple on the same + * page as the parent tuple, otherwise we need another non-leaf buffer. + * But if the parent page is the root, we can't add the new inner tuple + * there, because the root page must have only one inner tuple. + */ + xlrec.initInner = false; + if (parent->buffer != InvalidBuffer && + parent->blkno != SPGIST_HEAD_BLKNO && + (SpGistPageGetFreeSpace(parent->page, 1) >= + innerTuple->size + sizeof(ItemIdData))) + { + /* New inner tuple will fit on parent page */ + newInnerBuffer = parent->buffer; + } + else if (parent->buffer != InvalidBuffer) + { + /* Send tuple to page with next triple parity (see README) */ + newInnerBuffer = SpGistGetBuffer(index, + GBUF_INNER_PARITY(parent->blkno + 1), + innerTuple->size + sizeof(ItemIdData), + &xlrec.initInner); + } + else + { + /* Root page split ... inner tuple will go to root page */ + newInnerBuffer = InvalidBuffer; + } + + /*---------- + * Because a WAL record can't involve more than four buffers, we can + * only afford to deal with two leaf pages in each picksplit action, + * ie the current page and at most one other. + * + * The new leaf tuples converted from the existing ones should require + * the same or less space, and therefore should all fit onto one page + * (although that's not necessarily the current page, since we can't + * delete the old tuples but only replace them with placeholders). + * However, the incoming new tuple might not also fit, in which case + * we might need another picksplit cycle to reduce it some more. + * + * If there's not room to put everything back onto the current page, + * then we decide on a per-node basis which tuples go to the new page. + * (We do it like that because leaf tuple chains can't cross pages, + * so we must place all leaf tuples belonging to the same parent node + * on the same page.) + * + * If we are splitting the root page (turning it from a leaf page into an + * inner page), then no leaf tuples can go back to the current page; they + * must all go somewhere else. 
+ *---------- + */ + if (current->blkno != SPGIST_HEAD_BLKNO) + currentFreeSpace = PageGetExactFreeSpace(current->page) + spaceToDelete; + else + currentFreeSpace = 0; /* prevent assigning any tuples to current */ + + xlrec.initDest = false; + + if (totalLeafSizes <= currentFreeSpace) + { + /* All the leaf tuples will fit on current page */ + newLeafBuffer = InvalidBuffer; + /* mark new leaf tuple as included in insertions, if allowed */ + if (includeNew) + { + nToInsert++; + insertedNew = true; + } + for (i = 0; i < nToInsert; i++) + leafPageSelect[i] = 0; /* signifies current page */ + } + else if (in.nTuples == 1 && totalLeafSizes > SPGIST_PAGE_CAPACITY) + { + /* + * We're trying to split up a long value by repeated suffixing, but + * it's not going to fit yet. Don't bother allocating a second leaf + * buffer that we won't be able to use. + */ + newLeafBuffer = InvalidBuffer; + Assert(includeNew); + Assert(nToInsert == 0); + } + else + { + /* We will need another leaf page */ + uint8 *nodePageSelect; + int curspace; + int newspace; + + newLeafBuffer = SpGistGetBuffer(index, GBUF_LEAF, + Min(totalLeafSizes, + SPGIST_PAGE_CAPACITY), + &xlrec.initDest); + /* + * Attempt to assign node groups to the two pages. We might fail to + * do so, even if totalLeafSizes is less than the available space, + * because we can't split a group across pages. + */ + nodePageSelect = (uint8 *) palloc(sizeof(uint8) * out.nNodes); + + curspace = currentFreeSpace; + newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer)); + for (i = 0; i < out.nNodes; i++) + { + if (leafSizes[i] <= curspace) + { + nodePageSelect[i] = 0; /* signifies current page */ + curspace -= leafSizes[i]; + } + else + { + nodePageSelect[i] = 1; /* signifies new leaf page */ + newspace -= leafSizes[i]; + } + } + if (curspace >= 0 && newspace >= 0) + { + /* Successful assignment, so we can include the new leaf tuple */ + if (includeNew) + { + nToInsert++; + insertedNew = true; + } + } + else if (includeNew) + { + /* We must exclude the new leaf tuple from the split */ + int nodeOfNewTuple = out.mapTuplesToNodes[in.nTuples - 1]; + + leafSizes[nodeOfNewTuple] -= + newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData); + + /* Repeat the node assignment process --- should succeed now */ + curspace = currentFreeSpace; + newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer)); + for (i = 0; i < out.nNodes; i++) + { + if (leafSizes[i] <= curspace) + { + nodePageSelect[i] = 0; /* signifies current page */ + curspace -= leafSizes[i]; + } + else + { + nodePageSelect[i] = 1; /* signifies new leaf page */ + newspace -= leafSizes[i]; + } + } + if (curspace < 0 || newspace < 0) + elog(ERROR, "failed to divide leaf tuple groups across pages"); + } + else + { + /* oops, we already excluded new tuple ... 
should not get here */ + elog(ERROR, "failed to divide leaf tuple groups across pages"); + } + /* Expand the per-node assignments to be shown per leaf tuple */ + for (i = 0; i < nToInsert; i++) + { + n = out.mapTuplesToNodes[i]; + leafPageSelect[i] = nodePageSelect[n]; + } + } + + /* Start preparing WAL record */ + xlrec.blknoSrc = current->blkno; + xlrec.blknoDest = InvalidBlockNumber; + xlrec.nDelete = 0; + xlrec.initSrc = isNew; + + leafdata = leafptr = (char *) palloc(totalLeafSizes); + + ACCEPT_RDATA_DATA(&xlrec, MAXALIGN(sizeof(xlrec)), 0); + ACCEPT_RDATA_DATA(innerTuple, innerTuple->size, 1); + nRdata = 2; + + /* Here we begin making the changes to the target pages */ + START_CRIT_SECTION(); + + /* + * Delete old leaf tuples from current buffer, except when we're splitting + * the root; in that case there's no need because we'll re-init the page + * below. We do this first to make room for reinserting new leaf tuples. + */ + if (current->blkno != SPGIST_HEAD_BLKNO) + { + /* + * Init buffer instead of deleting individual tuples, but only if + * there aren't any other live tuples and only during build; otherwise + * we need to set a redirection tuple for concurrent scans. + */ + if (state->isBuild && + nToDelete + SpGistPageGetOpaque(current->page)->nPlaceholder == + PageGetMaxOffsetNumber(current->page)) + { + SpGistInitBuffer(current->buffer, SPGIST_LEAF); + xlrec.initSrc = true; + } + else if (isNew) + { + /* don't expose the freshly init'd buffer as a backup block */ + Assert(nToDelete == 0); + } + else + { + xlrec.nDelete = nToDelete; + ACCEPT_RDATA_DATA(toDelete, + MAXALIGN(sizeof(OffsetNumber) * nToDelete), + nRdata); + nRdata++; + ACCEPT_RDATA_BUFFER(current->buffer, nRdata); + nRdata++; + + if (!state->isBuild) + { + /* + * Need to create redirect tuple (it will point to new inner + * tuple) but right now the new tuple's location is not known + * yet. So, set the redirection pointer to "impossible" value + * and remember its position to update tuple later. + */ + if (nToDelete > 0) + redirectTuplePos = toDelete[0]; + spgPageIndexMultiDelete(state, current->page, + toDelete, nToDelete, + SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + SPGIST_METAPAGE_BLKNO, + FirstOffsetNumber); + } + else + { + /* + * During index build there is not concurrent searches, so we + * don't need to create redirection tuple. + */ + spgPageIndexMultiDelete(state, current->page, + toDelete, nToDelete, + SPGIST_PLACEHOLDER, + SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + } + } + } + + /* + * Put leaf tuples on proper pages, and update downlinks in innerTuple's + * nodes. + */ + startOffsets[0] = startOffsets[1] = InvalidOffsetNumber; + for (i = 0; i < nToInsert; i++) + { + SpGistLeafTuple it = newLeafs[i]; + Buffer leafBuffer; + BlockNumber leafBlock; + OffsetNumber newoffset; + + /* Which page is it going to? */ + leafBuffer = leafPageSelect[i] ? newLeafBuffer : current->buffer; + leafBlock = BufferGetBlockNumber(leafBuffer); + + /* Link tuple into correct chain for its node */ + n = out.mapTuplesToNodes[i]; + + if (ItemPointerIsValid(&nodes[n]->t_tid)) + { + Assert(ItemPointerGetBlockNumber(&nodes[n]->t_tid) == leafBlock); + it->nextOffset = ItemPointerGetOffsetNumber(&nodes[n]->t_tid); + } + else + it->nextOffset = InvalidOffsetNumber; + + /* Insert it on page */ + newoffset = SpGistPageAddNewItem(state, BufferGetPage(leafBuffer), + (Item) it, it->size, + &startOffsets[leafPageSelect[i]], + false); + toInsert[i] = newoffset; + + /* ... 
and complete the chain linking */ + ItemPointerSet(&nodes[n]->t_tid, leafBlock, newoffset); + + /* Also copy leaf tuple into WAL data */ + memcpy(leafptr, newLeafs[i], newLeafs[i]->size); + leafptr += newLeafs[i]->size; + } + + /* + * We're done modifying the other leaf buffer (if any), so mark it dirty. + * current->buffer will be marked below, after we're entirely done + * modifying it. + */ + if (newLeafBuffer != InvalidBuffer) + { + MarkBufferDirty(newLeafBuffer); + /* also save block number for WAL */ + xlrec.blknoDest = BufferGetBlockNumber(newLeafBuffer); + if (!xlrec.initDest) + { + ACCEPT_RDATA_BUFFER(newLeafBuffer, nRdata); + nRdata++; + } + } + + xlrec.nInsert = nToInsert; + ACCEPT_RDATA_DATA(toInsert, + MAXALIGN(sizeof(OffsetNumber) * nToInsert), + nRdata); + nRdata++; + ACCEPT_RDATA_DATA(leafPageSelect, + MAXALIGN(sizeof(uint8) * nToInsert), + nRdata); + nRdata++; + ACCEPT_RDATA_DATA(leafdata, leafptr - leafdata, nRdata); + nRdata++; + + /* Remember current buffer, since we're about to change "current" */ + saveCurrent = *current; + + /* + * Store the new innerTuple + */ + if (newInnerBuffer == parent->buffer && newInnerBuffer != InvalidBuffer) + { + /* + * new inner tuple goes to parent page + */ + Assert(current->buffer != parent->buffer); + + /* Repoint "current" at the new inner tuple */ + current->blkno = parent->blkno; + current->buffer = parent->buffer; + current->page = parent->page; + xlrec.blknoInner = current->blkno; + xlrec.offnumInner = current->offnum = + SpGistPageAddNewItem(state, current->page, + (Item) innerTuple, innerTuple->size, + NULL, false); + + /* + * Update parent node link and mark parent page dirty + */ + xlrec.blknoParent = parent->blkno; + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + saveNodeLink(index, parent, current->blkno, current->offnum); + + ACCEPT_RDATA_BUFFER(parent->buffer, nRdata); + nRdata++; + + /* + * Update redirection link (in old current buffer) + */ + if (redirectTuplePos != InvalidOffsetNumber) + setRedirectionTuple(&saveCurrent, redirectTuplePos, + current->blkno, current->offnum); + + /* Done modifying old current buffer, mark it dirty */ + MarkBufferDirty(saveCurrent.buffer); + } + else if (parent->buffer != InvalidBuffer) + { + /* + * new inner tuple will be stored on a new page + */ + Assert(newInnerBuffer != InvalidBuffer); + + /* Repoint "current" at the new inner tuple */ + current->buffer = newInnerBuffer; + current->blkno = BufferGetBlockNumber(current->buffer); + current->page = BufferGetPage(current->buffer); + xlrec.blknoInner = current->blkno; + xlrec.offnumInner = current->offnum = + SpGistPageAddNewItem(state, current->page, + (Item) innerTuple, innerTuple->size, + NULL, false); + + /* Done modifying new current buffer, mark it dirty */ + MarkBufferDirty(current->buffer); + + /* + * Update parent node link and mark parent page dirty + */ + xlrec.blknoParent = parent->blkno; + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + saveNodeLink(index, parent, current->blkno, current->offnum); + + ACCEPT_RDATA_BUFFER(current->buffer, nRdata); + nRdata++; + ACCEPT_RDATA_BUFFER(parent->buffer, nRdata); + nRdata++; + + /* + * Update redirection link (in old current buffer) + */ + if (redirectTuplePos != InvalidOffsetNumber) + setRedirectionTuple(&saveCurrent, redirectTuplePos, + current->blkno, current->offnum); + + /* Done modifying old current buffer, mark it dirty */ + MarkBufferDirty(saveCurrent.buffer); + } + else + { + /* + * Splitting root page, which was a leaf but now 
becomes inner page + * (and so "current" continues to point at it) + */ + Assert(current->blkno == SPGIST_HEAD_BLKNO); + Assert(redirectTuplePos == InvalidOffsetNumber); + + SpGistInitBuffer(current->buffer, 0); + xlrec.initInner = true; + + xlrec.blknoInner = current->blkno; + xlrec.offnumInner = current->offnum = + PageAddItem(current->page, (Item) innerTuple, innerTuple->size, + InvalidOffsetNumber, false, false); + if (current->offnum != FirstOffsetNumber) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + innerTuple->size); + + /* No parent link to update, nor redirection to do */ + xlrec.blknoParent = InvalidBlockNumber; + xlrec.offnumParent = InvalidOffsetNumber; + xlrec.nodeI = 0; + + /* Done modifying new current buffer, mark it dirty */ + MarkBufferDirty(current->buffer); + + /* saveCurrent doesn't represent a different buffer */ + saveCurrent.buffer = InvalidBuffer; + } + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + /* Issue the WAL record */ + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_PICKSPLIT, rdata); + + /* Update page LSNs on all affected pages */ + if (newLeafBuffer != InvalidBuffer) + { + Page page = BufferGetPage(newLeafBuffer); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + if (saveCurrent.buffer != InvalidBuffer) + { + Page page = BufferGetPage(saveCurrent.buffer); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + PageSetLSN(current->page, recptr); + PageSetTLI(current->page, ThisTimeLineID); + + if (parent->buffer != InvalidBuffer) + { + PageSetLSN(parent->page, recptr); + PageSetTLI(parent->page, ThisTimeLineID); + } + } + + END_CRIT_SECTION(); + + /* Update local free-space cache and unlock buffers */ + if (newLeafBuffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, newLeafBuffer); + UnlockReleaseBuffer(newLeafBuffer); + } + if (saveCurrent.buffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, saveCurrent.buffer); + UnlockReleaseBuffer(saveCurrent.buffer); + } + + return insertedNew; +} + +/* + * spgMatchNode action: descend to N'th child node of current inner tuple + */ +static void +spgMatchNodeAction(Relation index, SpGistState *state, + SpGistInnerTuple innerTuple, + SPPageDesc *current, SPPageDesc *parent, int nodeN) +{ + int i; + SpGistNodeTuple node; + + /* Release previous parent buffer if any */ + if (parent->buffer != InvalidBuffer && + parent->buffer != current->buffer) + { + SpGistSetLastUsedPage(index, parent->buffer); + UnlockReleaseBuffer(parent->buffer); + } + + /* Repoint parent to specified node of current inner tuple */ + parent->blkno = current->blkno; + parent->buffer = current->buffer; + parent->page = current->page; + parent->offnum = current->offnum; + parent->node = nodeN; + + /* Locate that node */ + SGITITERATE(innerTuple, i, node) + { + if (i == nodeN) + break; + } + + if (i != nodeN) + elog(ERROR, "failed to find requested node %d in SPGiST inner tuple", + nodeN); + + /* Point current to the downlink location, if any */ + if (ItemPointerIsValid(&node->t_tid)) + { + current->blkno = ItemPointerGetBlockNumber(&node->t_tid); + current->offnum = ItemPointerGetOffsetNumber(&node->t_tid); + } + else + { + /* Downlink is empty, so we'll need to find a new page */ + current->blkno = InvalidBlockNumber; + current->offnum = InvalidOffsetNumber; + } + + current->buffer = InvalidBuffer; + current->page = NULL; +} + +/* + * spgAddNode action: add a node to the inner tuple at current + */ +static void +spgAddNodeAction(Relation index, SpGistState 
*state, + SpGistInnerTuple innerTuple, + SPPageDesc *current, SPPageDesc *parent, + int nodeN, Datum nodeLabel) +{ + SpGistInnerTuple newInnerTuple; + XLogRecData rdata[5]; + spgxlogAddNode xlrec; + + /* Construct new inner tuple with additional node */ + newInnerTuple = addNode(state, innerTuple, nodeLabel, nodeN); + + /* Prepare WAL record */ + xlrec.node = index->rd_node; + STORE_STATE(state, xlrec.stateSrc); + xlrec.blkno = current->blkno; + xlrec.offnum = current->offnum; + + /* we don't fill these unless we need to change the parent downlink */ + xlrec.blknoParent = InvalidBlockNumber; + xlrec.offnumParent = InvalidOffsetNumber; + xlrec.nodeI = 0; + + /* we don't fill these unless tuple has to be moved */ + xlrec.blknoNew = InvalidBlockNumber; + xlrec.offnumNew = InvalidOffsetNumber; + xlrec.newPage = false; + + ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); + /* we assume sizeof(xlrec) is at least int-aligned */ + ACCEPT_RDATA_DATA(newInnerTuple, newInnerTuple->size, 1); + ACCEPT_RDATA_BUFFER(current->buffer, 2); + + if (PageGetExactFreeSpace(current->page) >= + newInnerTuple->size - innerTuple->size) + { + /* + * We can replace the inner tuple by new version in-place + */ + START_CRIT_SECTION(); + + PageIndexTupleDelete(current->page, current->offnum); + if (PageAddItem(current->page, + (Item) newInnerTuple, newInnerTuple->size, + current->offnum, false, false) != current->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + newInnerTuple->size); + + MarkBufferDirty(current->buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE, rdata); + + PageSetLSN(current->page, recptr); + PageSetTLI(current->page, ThisTimeLineID); + } + + END_CRIT_SECTION(); + } + else + { + /* + * move inner tuple to another page, and update parent + */ + SpGistDeadTuple dt; + SPPageDesc saveCurrent; + + /* + * It should not be possible to get here for the root page, since we + * allow only one inner tuple on the root page, and spgFormInnerTuple + * always checks that inner tuples don't exceed the size of a page. + */ + if (current->blkno == SPGIST_HEAD_BLKNO) + elog(ERROR, "cannot enlarge root tuple any more"); + Assert(parent->buffer != InvalidBuffer); + + saveCurrent = *current; + + xlrec.blknoParent = parent->blkno; + xlrec.offnumParent = parent->offnum; + xlrec.nodeI = parent->node; + + /* + * obtain new buffer with the same parity as current, since it will + * be a child of same parent tuple + */ + current->buffer = SpGistGetBuffer(index, + GBUF_INNER_PARITY(current->blkno), + newInnerTuple->size + sizeof(ItemIdData), + &xlrec.newPage); + current->blkno = BufferGetBlockNumber(current->buffer); + current->page = BufferGetPage(current->buffer); + + xlrec.blknoNew = current->blkno; + + /* + * Let's just make real sure new current isn't same as old. Right + * now that's impossible, but if SpGistGetBuffer ever got smart enough + * to delete placeholder tuples before checking space, maybe it + * wouldn't be impossible. The case would appear to work except that + * WAL replay would be subtly wrong, so I think a mere assert isn't + * enough here. + */ + if (xlrec.blknoNew == xlrec.blkno) + elog(ERROR, "SPGiST new buffer shouldn't be same as old buffer"); + + /* + * New current and parent buffer will both be modified; but note that + * parent buffer could be same as either new or old current. 
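+	 * (The old "current" buffer was already attached to the WAL record as rdata[2] above, before we repoint current, so only the new current page and, if distinct, the parent page need to be registered here.)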
+ */ + ACCEPT_RDATA_BUFFER(current->buffer, 3); + if (parent->buffer != current->buffer && + parent->buffer != saveCurrent.buffer) + ACCEPT_RDATA_BUFFER(parent->buffer, 4); + + START_CRIT_SECTION(); + + /* insert new ... */ + xlrec.offnumNew = current->offnum = + SpGistPageAddNewItem(state, current->page, + (Item) newInnerTuple, newInnerTuple->size, + NULL, false); + + MarkBufferDirty(current->buffer); + + /* update parent's downlink and mark parent page dirty */ + saveNodeLink(index, parent, current->blkno, current->offnum); + + /* + * Replace old tuple with a placeholder or redirection tuple. Unless + * doing an index build, we have to insert a redirection tuple for + * possible concurrent scans. We can't just delete it in any case, + * because that could change the offsets of other tuples on the page, + * breaking downlinks from their parents. + */ + if (state->isBuild) + dt = spgFormDeadTuple(state, SPGIST_PLACEHOLDER, + InvalidBlockNumber, InvalidOffsetNumber); + else + dt = spgFormDeadTuple(state, SPGIST_REDIRECT, + current->blkno, current->offnum); + + PageIndexTupleDelete(saveCurrent.page, saveCurrent.offnum); + if (PageAddItem(saveCurrent.page, (Item) dt, dt->size, + saveCurrent.offnum, + false, false) != saveCurrent.offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + dt->size); + + if (state->isBuild) + SpGistPageGetOpaque(saveCurrent.page)->nPlaceholder++; + else + SpGistPageGetOpaque(saveCurrent.page)->nRedirection++; + + MarkBufferDirty(saveCurrent.buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE, rdata); + + /* we don't bother to check if any of these are redundant */ + PageSetLSN(current->page, recptr); + PageSetTLI(current->page, ThisTimeLineID); + PageSetLSN(parent->page, recptr); + PageSetTLI(parent->page, ThisTimeLineID); + PageSetLSN(saveCurrent.page, recptr); + PageSetTLI(saveCurrent.page, ThisTimeLineID); + } + + END_CRIT_SECTION(); + + /* Release saveCurrent if it's not same as current or parent */ + if (saveCurrent.buffer != current->buffer && + saveCurrent.buffer != parent->buffer) + { + SpGistSetLastUsedPage(index, saveCurrent.buffer); + UnlockReleaseBuffer(saveCurrent.buffer); + } + } +} + +/* + * spgSplitNode action: split inner tuple at current into prefix and postfix + */ +static void +spgSplitNodeAction(Relation index, SpGistState *state, + SpGistInnerTuple innerTuple, + SPPageDesc *current, spgChooseOut *out) +{ + SpGistInnerTuple prefixTuple, + postfixTuple; + SpGistNodeTuple node, + *nodes; + BlockNumber postfixBlkno; + OffsetNumber postfixOffset; + int i; + XLogRecData rdata[5]; + spgxlogSplitTuple xlrec; + Buffer newBuffer = InvalidBuffer; + + /* + * Construct new prefix tuple, containing a single node with the + * specified label. (We'll update the node's downlink to point to the + * new postfix tuple, below.) + */ + node = spgFormNodeTuple(state, out->result.splitTuple.nodeLabel, false); + + prefixTuple = spgFormInnerTuple(state, + out->result.splitTuple.prefixHasPrefix, + out->result.splitTuple.prefixPrefixDatum, + 1, &node); + + /* it must fit in the space that innerTuple now occupies */ + if (prefixTuple->size > innerTuple->size) + elog(ERROR, "SPGiST inner-tuple split must not produce longer prefix"); + + /* + * Construct new postfix tuple, containing all nodes of innerTuple with + * same node datums, but with the prefix specified by the picksplit + * function. 
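+	 * (SGITITERATE below merely collects pointers to innerTuple's existing node tuples; spgFormInnerTuple then copies those nodes into the newly built postfix tuple.)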
+ */ + nodes = palloc(sizeof(SpGistNodeTuple) * innerTuple->nNodes); + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + postfixTuple = spgFormInnerTuple(state, + out->result.splitTuple.postfixHasPrefix, + out->result.splitTuple.postfixPrefixDatum, + innerTuple->nNodes, nodes); + + /* Postfix tuple is allTheSame if original tuple was */ + postfixTuple->allTheSame = innerTuple->allTheSame; + + /* prep data for WAL record */ + xlrec.node = index->rd_node; + xlrec.newPage = false; + + ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); + /* we assume sizeof(xlrec) is at least int-aligned */ + ACCEPT_RDATA_DATA(prefixTuple, prefixTuple->size, 1); + ACCEPT_RDATA_DATA(postfixTuple, postfixTuple->size, 2); + ACCEPT_RDATA_BUFFER(current->buffer, 3); + + /* + * If we can't fit both tuples on the current page, get a new page for the + * postfix tuple. In particular, can't split to the root page. + * + * For the space calculation, note that prefixTuple replaces innerTuple + * but postfixTuple will be a new entry. + */ + if (current->blkno == SPGIST_HEAD_BLKNO || + SpGistPageGetFreeSpace(current->page, 1) + innerTuple->size < + prefixTuple->size + postfixTuple->size + sizeof(ItemIdData)) + { + /* + * Choose page with next triple parity, because postfix tuple is a + * child of prefix one + */ + newBuffer = SpGistGetBuffer(index, + GBUF_INNER_PARITY(current->blkno + 1), + postfixTuple->size + sizeof(ItemIdData), + &xlrec.newPage); + ACCEPT_RDATA_BUFFER(newBuffer, 4); + } + + START_CRIT_SECTION(); + + /* + * Replace old tuple by prefix tuple + */ + PageIndexTupleDelete(current->page, current->offnum); + xlrec.offnumPrefix = PageAddItem(current->page, + (Item) prefixTuple, prefixTuple->size, + current->offnum, false, false); + if (xlrec.offnumPrefix != current->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + prefixTuple->size); + xlrec.blknoPrefix = current->blkno; + + /* + * put postfix tuple into appropriate page + */ + if (newBuffer == InvalidBuffer) + { + xlrec.blknoPostfix = postfixBlkno = current->blkno; + xlrec.offnumPostfix = postfixOffset = + SpGistPageAddNewItem(state, current->page, + (Item) postfixTuple, postfixTuple->size, + NULL, false); + } + else + { + xlrec.blknoPostfix = postfixBlkno = BufferGetBlockNumber(newBuffer); + xlrec.offnumPostfix = postfixOffset = + SpGistPageAddNewItem(state, BufferGetPage(newBuffer), + (Item) postfixTuple, postfixTuple->size, + NULL, false); + MarkBufferDirty(newBuffer); + } + + /* + * And set downlink pointer in the prefix tuple to point to postfix tuple. + * (We can't avoid this step by doing the above two steps in opposite + * order, because there might not be enough space on the page to insert + * the postfix tuple first.) We have to update the local copy of the + * prefixTuple too, because that's what will be written to WAL. 
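+	 * (Concretely: update the palloc'd prefixTuple first, then re-fetch the copy that now resides on the page and set its downlink as well, as done just below.)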
+ */ + updateNodeLink(prefixTuple, 0, postfixBlkno, postfixOffset); + prefixTuple = (SpGistInnerTuple) PageGetItem(current->page, + PageGetItemId(current->page, current->offnum)); + updateNodeLink(prefixTuple, 0, postfixBlkno, postfixOffset); + + MarkBufferDirty(current->buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_SPLIT_TUPLE, rdata); + + PageSetLSN(current->page, recptr); + PageSetTLI(current->page, ThisTimeLineID); + + if (newBuffer != InvalidBuffer) + { + PageSetLSN(BufferGetPage(newBuffer), recptr); + PageSetTLI(BufferGetPage(newBuffer), ThisTimeLineID); + } + } + + END_CRIT_SECTION(); + + /* Update local free-space cache and release buffer */ + if (newBuffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, newBuffer); + UnlockReleaseBuffer(newBuffer); + } +} + +/* + * Insert one item into the index + */ +void +spgdoinsert(Relation index, SpGistState *state, + ItemPointer heapPtr, Datum datum) +{ + int level = 0; + Datum leafDatum; + int leafSize; + SPPageDesc current, + parent; + + /* + * Since we don't use index_form_tuple in this AM, we have to make sure + * value to be inserted is not toasted; FormIndexDatum doesn't guarantee + * that. + */ + if (state->attType.attlen == -1) + datum = PointerGetDatum(PG_DETOAST_DATUM(datum)); + + leafDatum = datum; + + /* + * Compute space needed for a leaf tuple containing the given datum. + * + * If it isn't gonna fit, and the opclass can't reduce the datum size by + * suffixing, bail out now rather than getting into an endless loop. + */ + leafSize = SGLTHDRSZ + sizeof(ItemIdData) + + SpGistGetTypeSize(&state->attType, leafDatum); + + if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", + (unsigned long) (leafSize - sizeof(ItemIdData)), + (unsigned long) (SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)), + RelationGetRelationName(index)), + errhint("Values larger than a buffer page cannot be indexed."))); + + /* Initialize "current" to the root page */ + current.blkno = SPGIST_HEAD_BLKNO; + current.buffer = InvalidBuffer; + current.page = NULL; + current.offnum = FirstOffsetNumber; + current.node = -1; + + /* "parent" is invalid for the moment */ + parent.blkno = InvalidBlockNumber; + parent.buffer = InvalidBuffer; + parent.page = NULL; + parent.offnum = InvalidOffsetNumber; + parent.node = -1; + + for (;;) + { + bool isNew = false; + + /* + * Bail out if query cancel is pending. We must have this somewhere + * in the loop since a broken opclass could produce an infinite + * picksplit loop. + */ + CHECK_FOR_INTERRUPTS(); + + if (current.blkno == InvalidBlockNumber) + { + /* + * Create a leaf page. If leafSize is too large to fit on a page, + * we won't actually use the page yet, but it simplifies the API + * for doPickSplit to always have a leaf page at hand; so just + * quietly limit our request to a page size. 
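+	 * (leafSize can still exceed SPGIST_PAGE_CAPACITY here only if the opclass sets longValuesOK, since the oversize-without-longValuesOK case was rejected before entering the loop; in that case the choose function is expected to keep shortening leafDatum until it fits.)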
+ */ + current.buffer = SpGistGetBuffer(index, GBUF_LEAF, + Min(leafSize, + SPGIST_PAGE_CAPACITY), + &isNew); + current.blkno = BufferGetBlockNumber(current.buffer); + } + else if (parent.buffer == InvalidBuffer || + current.blkno != parent.blkno) + { + current.buffer = ReadBuffer(index, current.blkno); + LockBuffer(current.buffer, BUFFER_LOCK_EXCLUSIVE); + } + else + { + /* inner tuple can be stored on the same page as parent one */ + current.buffer = parent.buffer; + } + current.page = BufferGetPage(current.buffer); + + if (SpGistPageIsLeaf(current.page)) + { + SpGistLeafTuple leafTuple; + int nToSplit, + sizeToSplit; + + leafTuple = spgFormLeafTuple(state, heapPtr, leafDatum); + if (leafTuple->size + sizeof(ItemIdData) <= + SpGistPageGetFreeSpace(current.page, 1)) + { + /* it fits on page, so insert it and we're done */ + addLeafTuple(index, state, leafTuple, + ¤t, &parent, isNew); + break; + } + else if ((sizeToSplit = + checkSplitConditions(index, state, ¤t, + &nToSplit)) < SPGIST_PAGE_CAPACITY / 2 && + nToSplit < 64 && + leafTuple->size + sizeof(ItemIdData) + sizeToSplit <= SPGIST_PAGE_CAPACITY) + { + /* + * the amount of data is pretty small, so just move the whole + * chain to another leaf page rather than splitting it. + */ + Assert(!isNew); + moveLeafs(index, state, ¤t, &parent, leafTuple); + break; /* we're done */ + } + else + { + /* picksplit */ + if (doPickSplit(index, state, ¤t, &parent, + leafTuple, level, isNew)) + break; /* doPickSplit installed new tuples */ + + /* leaf tuple will not be inserted yet */ + pfree(leafTuple); + + /* + * current now describes new inner tuple, go insert into it + */ + Assert(!SpGistPageIsLeaf(current.page)); + goto process_inner_tuple; + } + } + else /* non-leaf page */ + { + /* + * Apply the opclass choose function to figure out how to insert + * the given datum into the current inner tuple. + */ + SpGistInnerTuple innerTuple; + spgChooseIn in; + spgChooseOut out; + + /* + * spgAddNode and spgSplitTuple cases will loop back to here to + * complete the insertion operation. Just in case the choose + * function is broken and produces add or split requests + * repeatedly, check for query cancel. + */ + process_inner_tuple: + CHECK_FOR_INTERRUPTS(); + + innerTuple = (SpGistInnerTuple) PageGetItem(current.page, + PageGetItemId(current.page, current.offnum)); + + in.datum = datum; + in.leafDatum = leafDatum; + in.level = level; + in.allTheSame = innerTuple->allTheSame; + in.hasPrefix = (innerTuple->prefixSize > 0); + in.prefixDatum = SGITDATUM(innerTuple, state); + in.nNodes = innerTuple->nNodes; + in.nodeLabels = spgExtractNodeLabels(state, innerTuple); + + memset(&out, 0, sizeof(out)); + + FunctionCall2Coll(&state->chooseFn, + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&out)); + + if (innerTuple->allTheSame) + { + /* + * It's not allowed to do an AddNode at an allTheSame tuple. + * Opclass must say "match", in which case we choose a random + * one of the nodes to descend into, or "split". 
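+	 * (All nodes of an allTheSame tuple are equivalent by construction, so any of them is an acceptable target; choosing one at random spreads insertions evenly across the equivalent downlinks.)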
+ */ + if (out.resultType == spgAddNode) + elog(ERROR, "cannot add a node to an allTheSame inner tuple"); + else if (out.resultType == spgMatchNode) + out.result.matchNode.nodeN = random() % innerTuple->nNodes; + } + + switch (out.resultType) + { + case spgMatchNode: + /* Descend to N'th child node */ + spgMatchNodeAction(index, state, innerTuple, + ¤t, &parent, + out.result.matchNode.nodeN); + /* Adjust level as per opclass request */ + level += out.result.matchNode.levelAdd; + /* Replace leafDatum and recompute leafSize */ + leafDatum = out.result.matchNode.restDatum; + leafSize = SGLTHDRSZ + sizeof(ItemIdData) + + SpGistGetTypeSize(&state->attType, leafDatum); + + /* + * Loop around and attempt to insert the new leafDatum + * at "current" (which might reference an existing child + * tuple, or might be invalid to force us to find a new + * page for the tuple). + * + * Note: if the opclass sets longValuesOK, we rely on the + * choose function to eventually shorten the leafDatum + * enough to fit on a page. We could add a test here to + * complain if the datum doesn't get visibly shorter each + * time, but that could get in the way of opclasses that + * "simplify" datums in a way that doesn't necessarily + * lead to physical shortening on every cycle. + */ + break; + case spgAddNode: + /* AddNode is not sensible if nodes don't have labels */ + if (in.nodeLabels == NULL) + elog(ERROR, "cannot add a node to an inner tuple without node labels"); + /* Add node to inner tuple, per request */ + spgAddNodeAction(index, state, innerTuple, + ¤t, &parent, + out.result.addNode.nodeN, + out.result.addNode.nodeLabel); + + /* + * Retry insertion into the enlarged node. We assume + * that we'll get a MatchNode result this time. + */ + goto process_inner_tuple; + break; + case spgSplitTuple: + /* Split inner tuple, per request */ + spgSplitNodeAction(index, state, innerTuple, + ¤t, &out); + + /* Retry insertion into the split node */ + goto process_inner_tuple; + break; + default: + elog(ERROR, "unrecognized SPGiST choose result: %d", + (int) out.resultType); + break; + } + } + } /* end loop */ + + /* + * Release any buffers we're still holding. Beware of possibility that + * current and parent reference same buffer. + */ + if (current.buffer != InvalidBuffer) + { + SpGistSetLastUsedPage(index, current.buffer); + UnlockReleaseBuffer(current.buffer); + } + if (parent.buffer != InvalidBuffer && + parent.buffer != current.buffer) + { + SpGistSetLastUsedPage(index, parent.buffer); + UnlockReleaseBuffer(parent.buffer); + } +} diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c new file mode 100644 index 00000000000..4a059bdfedc --- /dev/null +++ b/src/backend/access/spgist/spginsert.c @@ -0,0 +1,219 @@ +/*------------------------------------------------------------------------- + * + * spginsert.c + * Externally visible index creation/insertion routines + * + * All the actual insertion logic is in spgdoinsert.c. 
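+ * This file provides the spgbuild(), spgbuildempty(), and spginsert() entry points.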
+ * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spginsert.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/spgist_private.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/smgr.h" +#include "utils/memutils.h" + + +typedef struct +{ + SpGistState spgstate; /* SPGiST's working state */ + MemoryContext tmpCtx; /* per-tuple temporary context */ +} SpGistBuildState; + + +/* Callback to process one heap tuple during IndexBuildHeapScan */ +static void +spgistBuildCallback(Relation index, HeapTuple htup, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + SpGistBuildState *buildstate = (SpGistBuildState *) state; + + /* SPGiST doesn't index nulls */ + if (*isnull == false) + { + /* Work in temp context, and reset it after each tuple */ + MemoryContext oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); + + spgdoinsert(index, &buildstate->spgstate, &htup->t_self, *values); + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(buildstate->tmpCtx); + } +} + +/* + * Build an SP-GiST index. + */ +Datum +spgbuild(PG_FUNCTION_ARGS) +{ + Relation heap = (Relation) PG_GETARG_POINTER(0); + Relation index = (Relation) PG_GETARG_POINTER(1); + IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); + IndexBuildResult *result; + double reltuples; + SpGistBuildState buildstate; + Buffer metabuffer, + rootbuffer; + + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* + * Initialize the meta page and root page + */ + metabuffer = SpGistNewBuffer(index); + rootbuffer = SpGistNewBuffer(index); + + Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO); + Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_HEAD_BLKNO); + + START_CRIT_SECTION(); + + SpGistInitMetapage(BufferGetPage(metabuffer)); + MarkBufferDirty(metabuffer); + SpGistInitBuffer(rootbuffer, SPGIST_LEAF); + MarkBufferDirty(rootbuffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + XLogRecData rdata; + + /* WAL data is just the relfilenode */ + rdata.data = (char *) &(index->rd_node); + rdata.len = sizeof(RelFileNode); + rdata.buffer = InvalidBuffer; + rdata.next = NULL; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX, &rdata); + + PageSetLSN(BufferGetPage(metabuffer), recptr); + PageSetTLI(BufferGetPage(metabuffer), ThisTimeLineID); + PageSetLSN(BufferGetPage(rootbuffer), recptr); + PageSetTLI(BufferGetPage(rootbuffer), ThisTimeLineID); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(metabuffer); + UnlockReleaseBuffer(rootbuffer); + + /* + * Now insert all the heap data into the index + */ + initSpGistState(&buildstate.spgstate, index); + buildstate.spgstate.isBuild = true; + + buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST build temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, + spgistBuildCallback, (void *) &buildstate); + + MemoryContextDelete(buildstate.tmpCtx); + + SpGistUpdateMetaPage(index); + + result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); + result->heap_tuples = result->index_tuples = reltuples; + + 
PG_RETURN_POINTER(result); +} + +/* + * Build an empty SPGiST index in the initialization fork + */ +Datum +spgbuildempty(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + Page page; + + /* Construct metapage. */ + page = (Page) palloc(BLCKSZ); + SpGistInitMetapage(page); + + /* Write the page. If archiving/streaming, XLOG it. */ + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, + (char *) page, true); + if (XLogIsNeeded()) + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_METAPAGE_BLKNO, page); + + /* Likewise for the root page. */ + SpGistInitPage(page, SPGIST_LEAF); + + smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_HEAD_BLKNO, + (char *) page, true); + if (XLogIsNeeded()) + log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM, + SPGIST_HEAD_BLKNO, page); + + /* + * An immediate sync is required even if we xlog'd the pages, because the + * writes did not go through shared buffers and therefore a concurrent + * checkpoint may have moved the redo pointer past our xlog record. + */ + smgrimmedsync(index->rd_smgr, INIT_FORKNUM); + + PG_RETURN_VOID(); +} + +/* + * Insert one new tuple into an SPGiST index. + */ +Datum +spginsert(PG_FUNCTION_ARGS) +{ + Relation index = (Relation) PG_GETARG_POINTER(0); + Datum *values = (Datum *) PG_GETARG_POINTER(1); + bool *isnull = (bool *) PG_GETARG_POINTER(2); + ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); + +#ifdef NOT_USED + Relation heapRel = (Relation) PG_GETARG_POINTER(4); + IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5); +#endif + SpGistState spgstate; + MemoryContext oldCtx; + MemoryContext insertCtx; + + /* SPGiST doesn't index nulls */ + if (*isnull) + PG_RETURN_BOOL(false); + + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST insert temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldCtx = MemoryContextSwitchTo(insertCtx); + + initSpGistState(&spgstate, index); + + spgdoinsert(index, &spgstate, ht_ctid, *values); + + SpGistUpdateMetaPage(index); + + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(insertCtx); + + /* return false since we've not done any unique check */ + PG_RETURN_BOOL(false); +} diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c new file mode 100644 index 00000000000..e11d1a35e3a --- /dev/null +++ b/src/backend/access/spgist/spgkdtreeproc.c @@ -0,0 +1,298 @@ +/*------------------------------------------------------------------------- + * + * spgkdtreeproc.c + * implementation of k-d tree over points for SP-GiST + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgkdtreeproc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gist.h" /* for RTree strategy numbers */ +#include "access/spgist.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/geo_decls.h" + + +Datum +spg_kd_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = FLOAT8OID; + cfg->labelType = VOIDOID; /* we don't need node labels */ + cfg->longValuesOK = false; + PG_RETURN_VOID(); +} + +static int +getSide(double coord, bool isX, Point *tst) +{ + double 
tstcoord = (isX) ? tst->x : tst->y; + + if (coord == tstcoord) + return 0; + else if (coord > tstcoord) + return 1; + else + return -1; +} + +Datum +spg_kd_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + Point *inPoint = DatumGetPointP(in->datum); + double coord; + + if (in->allTheSame) + elog(ERROR, "allTheSame should not occur for k-d trees"); + + Assert(in->hasPrefix); + coord = DatumGetFloat8(in->prefixDatum); + + Assert(in->nNodes == 2); + + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = + (getSide(coord, in->level % 2, inPoint) > 0) ? 0 : 1; + out->result.matchNode.levelAdd = 1; + out->result.matchNode.restDatum = PointPGetDatum(inPoint); + + PG_RETURN_VOID(); +} + +typedef struct SortedPoint +{ + Point *p; + int i; +} SortedPoint; + +static int +x_cmp(const void *a, const void *b) +{ + SortedPoint *pa = (SortedPoint *) a; + SortedPoint *pb = (SortedPoint *) b; + + if (pa->p->x == pb->p->x) + return 0; + return (pa->p->x > pb->p->x) ? 1 : -1; +} + +static int +y_cmp(const void *a, const void *b) +{ + SortedPoint *pa = (SortedPoint *) a; + SortedPoint *pb = (SortedPoint *) b; + + if (pa->p->y == pb->p->y) + return 0; + return (pa->p->y > pb->p->y) ? 1 : -1; +} + + +Datum +spg_kd_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + int i; + int middle; + SortedPoint *sorted; + double coord; + + sorted = palloc(sizeof(*sorted) * in->nTuples); + for (i = 0; i < in->nTuples; i++) + { + sorted[i].p = DatumGetPointP(in->datums[i]); + sorted[i].i = i; + } + + qsort(sorted, in->nTuples, sizeof(*sorted), + (in->level % 2) ? x_cmp : y_cmp); + middle = in->nTuples >> 1; + coord = (in->level % 2) ? sorted[middle].p->x : sorted[middle].p->y; + + out->hasPrefix = true; + out->prefixDatum = Float8GetDatum(coord); + + out->nNodes = 2; + out->nodeLabels = NULL; /* we don't need node labels */ + + out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + + /* + * Note: points that have coordinates exactly equal to coord may get + * classified into either node, depending on where they happen to fall + * in the sorted list. This is okay as long as the inner_consistent + * function descends into both sides for such cases. This is better + * than the alternative of trying to have an exact boundary, because + * it keeps the tree balanced even when we have many instances of the + * same point value. So we should never trigger the allTheSame logic. + */ + for (i = 0; i < in->nTuples; i++) + { + Point *p = sorted[i].p; + int n = sorted[i].i; + + out->mapTuplesToNodes[n] = (i < middle) ? 
0 : 1; + out->leafTupleDatums[n] = PointPGetDatum(p); + } + + PG_RETURN_VOID(); +} + +Datum +spg_kd_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + Point *query; + BOX *boxQuery; + double coord; + + query = DatumGetPointP(in->query); + Assert(in->hasPrefix); + coord = DatumGetFloat8(in->prefixDatum); + + if (in->allTheSame) + elog(ERROR, "allTheSame should not occur for k-d trees"); + + Assert(in->nNodes == 2); + out->nodeNumbers = (int *) palloc(sizeof(int) * 2); + out->levelAdds = (int *) palloc(sizeof(int) * 2); + out->levelAdds[0] = 1; + out->levelAdds[1] = 1; + out->nNodes = 0; + + switch (in->strategy) + { + case RTLeftStrategyNumber: + out->nNodes = 1; + out->nodeNumbers[0] = 0; + + if ((in->level % 2) == 0 || FPge(query->x, coord)) + { + out->nodeNumbers[1] = 1; + out->nNodes++; + } + break; + case RTRightStrategyNumber: + out->nNodes = 1; + out->nodeNumbers[0] = 1; + + if ((in->level % 2) == 0 || FPle(query->x, coord)) + { + out->nodeNumbers[1] = 0; + out->nNodes++; + } + break; + case RTSameStrategyNumber: + if (in->level % 2) + { + if (FPle(query->x, coord)) + { + out->nodeNumbers[out->nNodes] = 0; + out->nNodes++; + } + if (FPge(query->x, coord)) + { + out->nodeNumbers[out->nNodes] = 1; + out->nNodes++; + } + } + else + { + if (FPle(query->y, coord)) + { + out->nodeNumbers[out->nNodes] = 0; + out->nNodes++; + } + if (FPge(query->y, coord)) + { + out->nodeNumbers[out->nNodes] = 1; + out->nNodes++; + } + } + break; + case RTBelowStrategyNumber: + out->nNodes = 1; + out->nodeNumbers[0] = 0; + + if ((in->level % 2) == 1 || FPge(query->y, coord)) + { + out->nodeNumbers[1] = 1; + out->nNodes++; + } + break; + case RTAboveStrategyNumber: + out->nNodes = 1; + out->nodeNumbers[0] = 1; + + if ((in->level % 2) == 1 || FPle(query->y, coord)) + { + out->nodeNumbers[1] = 0; + out->nNodes++; + } + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We cheat to + * the extent of assuming that DatumGetPointP won't do anything + * that would be bad for a pointer-to-box. + */ + boxQuery = DatumGetBoxP(in->query); + + out->nNodes = 1; + if (in->level % 2) + { + if (FPlt(boxQuery->high.x, coord)) + out->nodeNumbers[0] = 0; + else if (FPgt(boxQuery->low.x, coord)) + out->nodeNumbers[0] = 1; + else + { + out->nodeNumbers[0] = 0; + out->nodeNumbers[1] = 1; + out->nNodes = 2; + } + } + else + { + if (FPlt(boxQuery->high.y, coord)) + out->nodeNumbers[0] = 0; + else if (FPgt(boxQuery->low.y, coord)) + out->nodeNumbers[0] = 1; + else + { + out->nodeNumbers[0] = 0; + out->nodeNumbers[1] = 1; + out->nNodes = 2; + } + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", in->strategy); + break; + } + + PG_RETURN_VOID(); +} + +/* + * spg_kd_leaf_consistent() is the same as spg_quad_leaf_consistent(), + * since we support the same operators and the same leaf data type. + * So we just borrow that function. 
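+ * (That is, the k-d tree operator class simply lists spg_quad_leaf_consistent as its leaf_consistent support procedure.)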
+ */ diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c new file mode 100644 index 00000000000..0be6e55ad30 --- /dev/null +++ b/src/backend/access/spgist/spgquadtreeproc.c @@ -0,0 +1,360 @@ +/*------------------------------------------------------------------------- + * + * spgquadtreeproc.c + * implementation of quad tree over points for SP-GiST + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgquadtreeproc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gist.h" /* for RTree strategy numbers */ +#include "access/spgist.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/geo_decls.h" + + +Datum +spg_quad_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = POINTOID; + cfg->labelType = VOIDOID; /* we don't need node labels */ + cfg->longValuesOK = false; + PG_RETURN_VOID(); +} + +#define SPTEST(f, x, y) \ + DatumGetBool(DirectFunctionCall2(f, PointPGetDatum(x), PointPGetDatum(y))) + +/* + * Determine which quadrant a point falls into, relative to the centroid. + * + * Quadrants are identified like this: + * + * 4 | 1 + * ----+----- + * 3 | 2 + * + * Points on one of the axes are taken to lie in the lowest-numbered + * adjacent quadrant. + */ +static int2 +getQuadrant(Point *centroid, Point *tst) +{ + if ((SPTEST(point_above, tst, centroid) || + SPTEST(point_horiz, tst, centroid)) && + (SPTEST(point_right, tst, centroid) || + SPTEST(point_vert, tst, centroid))) + return 1; + + if (SPTEST(point_below, tst, centroid) && + (SPTEST(point_right, tst, centroid) || + SPTEST(point_vert, tst, centroid))) + return 2; + + if ((SPTEST(point_below, tst, centroid) || + SPTEST(point_horiz, tst, centroid)) && + SPTEST(point_left, tst, centroid)) + return 3; + + if (SPTEST(point_above, tst, centroid) && + SPTEST(point_left, tst, centroid)) + return 4; + + elog(ERROR, "getQuadrant: impossible case"); + return 0; +} + + +Datum +spg_quad_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + Point *inPoint = DatumGetPointP(in->datum), + *centroid; + + if (in->allTheSame) + { + out->resultType = spgMatchNode; + /* nodeN will be set by core */ + out->result.matchNode.levelAdd = 0; + out->result.matchNode.restDatum = PointPGetDatum(inPoint); + PG_RETURN_VOID(); + } + + Assert(in->hasPrefix); + centroid = DatumGetPointP(in->prefixDatum); + + Assert(in->nNodes == 4); + + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = getQuadrant(centroid, inPoint) - 1; + out->result.matchNode.levelAdd = 0; + out->result.matchNode.restDatum = PointPGetDatum(inPoint); + + PG_RETURN_VOID(); +} + +#ifdef USE_MEDIAN +static int +x_cmp(const void *a, const void *b, void *arg) +{ + Point *pa = *(Point **) a; + Point *pb = *(Point **) b; + + if (pa->x == pb->x) + return 0; + return (pa->x > pb->x) ? 1 : -1; +} + +static int +y_cmp(const void *a, const void *b, void *arg) +{ + Point *pa = *(Point **) a; + Point *pb = *(Point **) b; + + if (pa->y == pb->y) + return 0; + return (pa->y > pb->y) ? 
1 : -1; +} +#endif + +Datum +spg_quad_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + int i; + Point *centroid; + +#ifdef USE_MEDIAN + /* Use the median values of x and y as the centroid point */ + Point **sorted; + + sorted = palloc(sizeof(*sorted) * in->nTuples); + for (i = 0; i < in->nTuples; i++) + sorted[i] = DatumGetPointP(in->datums[i]); + + centroid = palloc(sizeof(*centroid)); + + qsort(sorted, in->nTuples, sizeof(*sorted), x_cmp); + centroid->x = sorted[in->nTuples >> 1]->x; + qsort(sorted, in->nTuples, sizeof(*sorted), y_cmp); + centroid->y = sorted[in->nTuples >> 1]->y; +#else + /* Use the average values of x and y as the centroid point */ + centroid = palloc0(sizeof(*centroid)); + + for (i = 0; i < in->nTuples; i++) + { + centroid->x += DatumGetPointP(in->datums[i])->x; + centroid->y += DatumGetPointP(in->datums[i])->y; + } + + centroid->x /= in->nTuples; + centroid->y /= in->nTuples; +#endif + + out->hasPrefix = true; + out->prefixDatum = PointPGetDatum(centroid); + + out->nNodes = 4; + out->nodeLabels = NULL; /* we don't need node labels */ + + out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + Point *p = DatumGetPointP(in->datums[i]); + int quadrant = getQuadrant(centroid, p) - 1; + + out->leafTupleDatums[i] = PointPGetDatum(p); + out->mapTuplesToNodes[i] = quadrant; + } + + PG_RETURN_VOID(); +} + + +/* Subroutine to fill out->nodeNumbers[] for spg_quad_inner_consistent */ +static void +setNodes(spgInnerConsistentOut *out, bool isAll, int first, int second) +{ + if (isAll) + { + out->nNodes = 4; + out->nodeNumbers[0] = 0; + out->nodeNumbers[1] = 1; + out->nodeNumbers[2] = 2; + out->nodeNumbers[3] = 3; + } + else + { + out->nNodes = 2; + out->nodeNumbers[0] = first - 1; + out->nodeNumbers[1] = second - 1; + } +} + + +Datum +spg_quad_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + Point *query, + *centroid; + BOX *boxQuery; + + query = DatumGetPointP(in->query); + Assert(in->hasPrefix); + centroid = DatumGetPointP(in->prefixDatum); + + if (in->allTheSame) + { + /* Report that all nodes should be visited */ + int i; + + out->nNodes = in->nNodes; + out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + for (i = 0; i < in->nNodes; i++) + out->nodeNumbers[i] = i; + PG_RETURN_VOID(); + } + + Assert(in->nNodes == 4); + out->nodeNumbers = (int *) palloc(sizeof(int) * 4); + + switch (in->strategy) + { + case RTLeftStrategyNumber: + setNodes(out, SPTEST(point_left, centroid, query), 3, 4); + break; + case RTRightStrategyNumber: + setNodes(out, SPTEST(point_right, centroid, query), 1, 2); + break; + case RTSameStrategyNumber: + out->nNodes = 1; + out->nodeNumbers[0] = getQuadrant(centroid, query) - 1; + break; + case RTBelowStrategyNumber: + setNodes(out, SPTEST(point_below, centroid, query), 2, 3); + break; + case RTAboveStrategyNumber: + setNodes(out, SPTEST(point_above, centroid, query), 1, 4); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We cheat to + * the extent of assuming that DatumGetPointP won't do anything + * that would be bad for a pointer-to-box. 
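+	 * (Example: if the whole query box lies strictly above and to the right of the centroid, all four corners fall in quadrant 1, so only node 0 is visited.)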
+ */ + boxQuery = DatumGetBoxP(in->query); + + if (DatumGetBool(DirectFunctionCall2(box_contain_pt, + PointerGetDatum(boxQuery), + PointerGetDatum(centroid)))) + { + /* centroid is in box, so descend to all quadrants */ + setNodes(out, true, 0, 0); + } + else + { + /* identify quadrant(s) containing all corners of box */ + Point p; + int i, + r = 0; + + p = boxQuery->low; + r |= 1 << (getQuadrant(centroid, &p) - 1); + + p.y = boxQuery->high.y; + r |= 1 << (getQuadrant(centroid, &p) - 1); + + p = boxQuery->high; + r |= 1 << (getQuadrant(centroid, &p) - 1); + + p.x = boxQuery->low.x; + r |= 1 << (getQuadrant(centroid, &p) - 1); + + /* we must descend into those quadrant(s) */ + out->nNodes = 0; + for (i = 0; i < 4; i++) + { + if (r & (1 << i)) + { + out->nodeNumbers[out->nNodes] = i; + out->nNodes++; + } + } + } + break; + default: + elog(ERROR, "unrecognized strategy number: %d", in->strategy); + break; + } + + PG_RETURN_VOID(); +} + + +Datum +spg_quad_leaf_consistent(PG_FUNCTION_ARGS) +{ + spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); + spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); + Point *query = DatumGetPointP(in->query); + Point *datum = DatumGetPointP(in->leafDatum); + bool res; + + /* all tests are exact */ + out->recheck = false; + + switch (in->strategy) + { + case RTLeftStrategyNumber: + res = SPTEST(point_left, datum, query); + break; + case RTRightStrategyNumber: + res = SPTEST(point_right, datum, query); + break; + case RTSameStrategyNumber: + res = SPTEST(point_eq, datum, query); + break; + case RTBelowStrategyNumber: + res = SPTEST(point_below, datum, query); + break; + case RTAboveStrategyNumber: + res = SPTEST(point_above, datum, query); + break; + case RTContainedByStrategyNumber: + + /* + * For this operator, the query is a box not a point. We cheat to + * the extent of assuming that DatumGetPointP won't do anything + * that would be bad for a pointer-to-box. 
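+	 * (Note the argument order: box_contain_pt(box, point), which is why the query is passed as the first SPTEST argument here.)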
+ */ + res = SPTEST(box_contain_pt, query, datum); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", in->strategy); + res = false; + break; + } + + PG_RETURN_BOOL(res); +} diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c new file mode 100644 index 00000000000..1c6180b2d24 --- /dev/null +++ b/src/backend/access/spgist/spgscan.c @@ -0,0 +1,543 @@ +/*------------------------------------------------------------------------- + * + * spgscan.c + * routines for scanning SP-GiST indexes + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgscan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/spgist_private.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/memutils.h" + + +typedef struct ScanStackEntry +{ + Datum reconstructedValue; /* value reconstructed from parent */ + int level; /* level of items on this page */ + ItemPointerData ptr; /* block and offset to scan from */ +} ScanStackEntry; + + +/* Free a ScanStackEntry */ +static void +freeScanStackEntry(SpGistScanOpaque so, ScanStackEntry *stackEntry) +{ + if (!so->state.attType.attbyval && + DatumGetPointer(stackEntry->reconstructedValue) != NULL) + pfree(DatumGetPointer(stackEntry->reconstructedValue)); + pfree(stackEntry); +} + +/* Free the entire stack */ +static void +freeScanStack(SpGistScanOpaque so) +{ + ListCell *lc; + + foreach(lc, so->scanStack) + { + freeScanStackEntry(so, (ScanStackEntry *) lfirst(lc)); + } + list_free(so->scanStack); + so->scanStack = NIL; +} + +/* Initialize scanStack with a single entry for the root page */ +static void +resetSpGistScanOpaque(SpGistScanOpaque so) +{ + ScanStackEntry *startEntry = palloc0(sizeof(ScanStackEntry)); + + ItemPointerSet(&startEntry->ptr, SPGIST_HEAD_BLKNO, FirstOffsetNumber); + + freeScanStack(so); + so->scanStack = list_make1(startEntry); + so->nPtrs = so->iPtr = 0; +} + +Datum +spgbeginscan(PG_FUNCTION_ARGS) +{ + Relation rel = (Relation) PG_GETARG_POINTER(0); + int keysz = PG_GETARG_INT32(1); + /* ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); */ + IndexScanDesc scan; + SpGistScanOpaque so; + + scan = RelationGetIndexScan(rel, keysz, 0); + + so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData)); + initSpGistState(&so->state, scan->indexRelation); + so->tempCxt = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST search temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + resetSpGistScanOpaque(so); + scan->opaque = so; + + PG_RETURN_POINTER(scan); +} + +Datum +spgrescan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); + + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } + + resetSpGistScanOpaque(so); + + PG_RETURN_VOID(); +} + +Datum +spgendscan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + MemoryContextDelete(so->tempCxt); + + PG_RETURN_VOID(); +} + +Datum +spgmarkpos(PG_FUNCTION_ARGS) +{ + elog(ERROR, "SPGiST does not support 
mark/restore"); + PG_RETURN_VOID(); +} + +Datum +spgrestrpos(PG_FUNCTION_ARGS) +{ + elog(ERROR, "SPGiST does not support mark/restore"); + PG_RETURN_VOID(); +} + +/* + * Test whether a leaf datum satisfies all the scan keys + * + * *recheck is set true if any of the operators are lossy + */ +static bool +spgLeafTest(SpGistScanOpaque so, Datum leafDatum, + int level, Datum reconstructedValue, + bool *recheck) +{ + bool result = true; + spgLeafConsistentIn in; + spgLeafConsistentOut out; + MemoryContext oldCtx; + int i; + + *recheck = false; + + /* set up values that are the same for all quals */ + in.reconstructedValue = reconstructedValue; + in.level = level; + in.leafDatum = leafDatum; + + /* Apply each leaf consistent function, working in the temp context */ + oldCtx = MemoryContextSwitchTo(so->tempCxt); + for (i = 0; i < so->numberOfKeys; i++) + { + in.strategy = so->keyData[i].sk_strategy; + in.query = so->keyData[i].sk_argument; + + out.recheck = false; + + result = DatumGetBool(FunctionCall2Coll(&so->state.leafConsistentFn, + so->keyData[i].sk_collation, + PointerGetDatum(&in), + PointerGetDatum(&out))); + *recheck |= out.recheck; + if (!result) + break; + } + MemoryContextSwitchTo(oldCtx); + + return result; +} + +/* + * Walk the tree and report all tuples passing the scan quals to the storeRes + * subroutine. + * + * If scanWholeIndex is true, we'll do just that. If not, we'll stop at the + * next page boundary once we have reported at least one tuple. + */ +static void +spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, + void (*storeRes) (SpGistScanOpaque, ItemPointer, bool)) +{ + Buffer buffer = InvalidBuffer; + bool reportedSome = false; + + while (scanWholeIndex || !reportedSome) + { + ScanStackEntry *stackEntry; + BlockNumber blkno; + OffsetNumber offset; + Page page; + + /* Pull next to-do item from the list */ + if (so->scanStack == NIL) + break; /* there are no more pages to scan */ + + stackEntry = (ScanStackEntry *) linitial(so->scanStack); + so->scanStack = list_delete_first(so->scanStack); + +redirect: + /* Check for interrupts, just in case of infinite loop */ + CHECK_FOR_INTERRUPTS(); + + blkno = ItemPointerGetBlockNumber(&stackEntry->ptr); + offset = ItemPointerGetOffsetNumber(&stackEntry->ptr); + + if (buffer == InvalidBuffer) + { + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + else if (blkno != BufferGetBlockNumber(buffer)) + { + UnlockReleaseBuffer(buffer); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + /* else new pointer points to the same page, no work needed */ + + page = BufferGetPage(buffer); + + if (SpGistPageIsLeaf(page)) + { + SpGistLeafTuple leafTuple; + OffsetNumber max = PageGetMaxOffsetNumber(page); + bool recheck = false; + + if (blkno == SPGIST_HEAD_BLKNO) + { + /* When root is a leaf, examine all its tuples */ + for (offset = FirstOffsetNumber; offset <= max; offset++) + { + leafTuple = (SpGistLeafTuple) + PageGetItem(page, PageGetItemId(page, offset)); + if (leafTuple->tupstate != SPGIST_LIVE) + { + /* all tuples on root should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + leafTuple->tupstate); + } + + Assert(ItemPointerIsValid(&leafTuple->heapPtr)); + if (spgLeafTest(so, + SGLTDATUM(leafTuple, &so->state), + stackEntry->level, + stackEntry->reconstructedValue, + &recheck)) + { + storeRes(so, &leafTuple->heapPtr, recheck); + reportedSome = true; + } + } + } + else + { + /* Normal case: just examine the chain we arrived at */ + while (offset 
!= InvalidOffsetNumber) + { + Assert(offset >= FirstOffsetNumber && offset <= max); + leafTuple = (SpGistLeafTuple) + PageGetItem(page, PageGetItemId(page, offset)); + if (leafTuple->tupstate != SPGIST_LIVE) + { + if (leafTuple->tupstate == SPGIST_REDIRECT) + { + /* redirection tuple should be first in chain */ + Assert(offset == ItemPointerGetOffsetNumber(&stackEntry->ptr)); + /* transfer attention to redirect point */ + stackEntry->ptr = ((SpGistDeadTuple) leafTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&stackEntry->ptr) != SPGIST_METAPAGE_BLKNO); + goto redirect; + } + if (leafTuple->tupstate == SPGIST_DEAD) + { + /* dead tuple should be first in chain */ + Assert(offset == ItemPointerGetOffsetNumber(&stackEntry->ptr)); + /* No live entries on this page */ + Assert(leafTuple->nextOffset == InvalidOffsetNumber); + break; + } + /* We should not arrive at a placeholder */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + leafTuple->tupstate); + } + + Assert(ItemPointerIsValid(&leafTuple->heapPtr)); + if (spgLeafTest(so, + SGLTDATUM(leafTuple, &so->state), + stackEntry->level, + stackEntry->reconstructedValue, + &recheck)) + { + storeRes(so, &leafTuple->heapPtr, recheck); + reportedSome = true; + } + + offset = leafTuple->nextOffset; + } + } + } + else /* page is inner */ + { + SpGistInnerTuple innerTuple; + SpGistNodeTuple node; + int i; + + innerTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, offset)); + + if (innerTuple->tupstate != SPGIST_LIVE) + { + if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + stackEntry->ptr = ((SpGistDeadTuple) innerTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&stackEntry->ptr) != SPGIST_METAPAGE_BLKNO); + goto redirect; + } + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + } + + if (so->numberOfKeys == 0) + { + /* + * This case cannot happen at the moment, because we don't + * set pg_am.amoptionalkey for SP-GiST. In order for full + * index scans to produce correct answers, we'd need to + * index nulls, which we don't. + */ + Assert(false); + +#ifdef NOT_USED + /* + * A full index scan could be done approximately like this, + * but note that reconstruction of indexed values would be + * impossible unless the API for inner_consistent is changed. 
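+	 * (That is why the sketch below pushes entries with a dummy level of -1 and a zero reconstructedValue: neither can be maintained without the inner_consistent output.)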
+ */ + SGITITERATE(innerTuple, i, node) + { + if (ItemPointerIsValid(&node->t_tid)) + { + ScanStackEntry *newEntry = palloc(sizeof(ScanStackEntry)); + + newEntry->ptr = node->t_tid; + newEntry->level = -1; + newEntry->reconstructedValue = (Datum) 0; + so->scanStack = lcons(newEntry, so->scanStack); + } + } +#endif + } + else + { + spgInnerConsistentIn in; + spgInnerConsistentOut out; + SpGistNodeTuple *nodes; + int *andMap; + int *levelAdds; + Datum *reconstructedValues; + int j, + nMatches = 0; + MemoryContext oldCtx; + + /* use temp context for calling inner_consistent */ + oldCtx = MemoryContextSwitchTo(so->tempCxt); + + /* set up values that are the same for all scankeys */ + in.reconstructedValue = stackEntry->reconstructedValue; + in.level = stackEntry->level; + in.allTheSame = innerTuple->allTheSame; + in.hasPrefix = (innerTuple->prefixSize > 0); + in.prefixDatum = SGITDATUM(innerTuple, &so->state); + in.nNodes = innerTuple->nNodes; + in.nodeLabels = spgExtractNodeLabels(&so->state, innerTuple); + + /* collect node pointers */ + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * in.nNodes); + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + andMap = (int *) palloc0(sizeof(int) * in.nNodes); + levelAdds = (int *) palloc0(sizeof(int) * in.nNodes); + reconstructedValues = (Datum *) palloc0(sizeof(Datum) * in.nNodes); + + for (j = 0; j < so->numberOfKeys; j++) + { + in.strategy = so->keyData[j].sk_strategy; + in.query = so->keyData[j].sk_argument; + + memset(&out, 0, sizeof(out)); + + FunctionCall2Coll(&so->state.innerConsistentFn, + so->keyData[j].sk_collation, + PointerGetDatum(&in), + PointerGetDatum(&out)); + + /* If allTheSame, they should all or none of 'em match */ + if (innerTuple->allTheSame) + if (out.nNodes != 0 && out.nNodes != in.nNodes) + elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple"); + + nMatches = 0; + for (i = 0; i < out.nNodes; i++) + { + int nodeN = out.nodeNumbers[i]; + + andMap[nodeN]++; + if (andMap[nodeN] == j + 1) + nMatches++; + if (out.levelAdds) + levelAdds[nodeN] = out.levelAdds[i]; + if (out.reconstructedValues) + reconstructedValues[nodeN] = out.reconstructedValues[i]; + } + + /* quit as soon as all nodes have failed some qual */ + if (nMatches == 0) + break; + } + + MemoryContextSwitchTo(oldCtx); + + if (nMatches > 0) + { + for (i = 0; i < in.nNodes; i++) + { + if (andMap[i] == so->numberOfKeys && + ItemPointerIsValid(&nodes[i]->t_tid)) + { + ScanStackEntry *newEntry; + + /* Create new work item for this node */ + newEntry = palloc(sizeof(ScanStackEntry)); + newEntry->ptr = nodes[i]->t_tid; + newEntry->level = stackEntry->level + levelAdds[i]; + /* Must copy value out of temp context */ + newEntry->reconstructedValue = + datumCopy(reconstructedValues[i], + so->state.attType.attbyval, + so->state.attType.attlen); + + so->scanStack = lcons(newEntry, so->scanStack); + } + } + } + } + } + + /* done with this scan stack entry */ + freeScanStackEntry(so, stackEntry); + /* clear temp context before proceeding to the next one */ + MemoryContextReset(so->tempCxt); + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); +} + +/* storeRes subroutine for getbitmap case */ +static void +storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, bool recheck) +{ + tbm_add_tuples(so->tbm, heapPtr, 1, recheck); + so->ntids++; +} + +Datum +spggetbitmap(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); + SpGistScanOpaque so 
= (SpGistScanOpaque) scan->opaque; + + /* Copy scankey to *so so we don't need to pass it around separately */ + so->numberOfKeys = scan->numberOfKeys; + so->keyData = scan->keyData; + + so->tbm = tbm; + so->ntids = 0; + + spgWalk(scan->indexRelation, so, true, storeBitmap); + + PG_RETURN_INT64(so->ntids); +} + +/* storeRes subroutine for gettuple case */ +static void +storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, bool recheck) +{ + Assert(so->nPtrs < MaxIndexTuplesPerPage); + so->heapPtrs[so->nPtrs] = *heapPtr; + so->recheck[so->nPtrs] = recheck; + so->nPtrs++; +} + +Datum +spggettuple(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + if (dir != ForwardScanDirection) + elog(ERROR, "SP-GiST only supports forward scan direction"); + + /* Copy scankey to *so so we don't need to pass it around separately */ + so->numberOfKeys = scan->numberOfKeys; + so->keyData = scan->keyData; + + for (;;) + { + if (so->iPtr < so->nPtrs) + { + /* continuing to return tuples from a leaf page */ + scan->xs_ctup.t_self = so->heapPtrs[so->iPtr]; + scan->xs_recheck = so->recheck[so->iPtr]; + so->iPtr++; + PG_RETURN_BOOL(true); + } + + so->iPtr = so->nPtrs = 0; + spgWalk(scan->indexRelation, so, false, storeGettuple); + + if (so->nPtrs == 0) + break; /* must have completed scan */ + } + + PG_RETURN_BOOL(false); +} diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c new file mode 100644 index 00000000000..b6037978425 --- /dev/null +++ b/src/backend/access/spgist/spgtextproc.c @@ -0,0 +1,594 @@ +/*------------------------------------------------------------------------- + * + * spgtextproc.c + * implementation of compressed-suffix tree over text + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgtextproc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/spgist.h" +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/pg_locale.h" + + +/* + * In the worst case, a inner tuple in a text suffix tree could have as many + * as 256 nodes (one for each possible byte value). Each node can take 16 + * bytes on MAXALIGN=8 machines. The inner tuple must fit on an index page + * of size BLCKSZ. Rather than assuming we know the exact amount of overhead + * imposed by page headers, tuple headers, etc, we leave 100 bytes for that + * (the actual overhead should be no more than 56 bytes at this writing, so + * there is slop in this number). 
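+ * (With the default BLCKSZ of 8192, for example, this leaves
+ * 8192 - 4096 - 100 = 3996 bytes available for the prefix.)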
The upshot is that the maximum safe prefix + * length is this: + */ +#define SPGIST_MAX_PREFIX_LENGTH (BLCKSZ - 256 * 16 - 100) + +/* Struct for sorting values in picksplit */ +typedef struct spgNodePtr +{ + Datum d; + int i; + uint8 c; +} spgNodePtr; + + +Datum +spg_text_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = TEXTOID; + cfg->labelType = CHAROID; + cfg->longValuesOK = true; /* suffixing will shorten long values */ + PG_RETURN_VOID(); +} + +/* + * Form a text datum from the given not-necessarily-null-terminated string, + * using short varlena header format if possible + */ +static Datum +formTextDatum(const char *data, int datalen) +{ + char *p; + + p = (char *) palloc(datalen + VARHDRSZ); + + if (datalen + VARHDRSZ_SHORT <= VARATT_SHORT_MAX) + { + SET_VARSIZE_SHORT(p, datalen + VARHDRSZ_SHORT); + if (datalen) + memcpy(p + VARHDRSZ_SHORT, data, datalen); + } + else + { + SET_VARSIZE(p, datalen + VARHDRSZ); + memcpy(p + VARHDRSZ, data, datalen); + } + + return PointerGetDatum(p); +} + +/* + * Find the length of the common prefix of a and b + */ +static int +commonPrefix(const char *a, const char *b, int lena, int lenb) +{ + int i = 0; + + while (i < lena && i < lenb && *a == *b) + { + a++; + b++; + i++; + } + + return i; +} + +/* + * Binary search an array of uint8 datums for a match to c + * + * On success, *i gets the match location; on failure, it gets where to insert + */ +static bool +searchChar(Datum *nodeLabels, int nNodes, uint8 c, int *i) +{ + int StopLow = 0, + StopHigh = nNodes; + + while (StopLow < StopHigh) + { + int StopMiddle = (StopLow + StopHigh) >> 1; + uint8 middle = DatumGetUInt8(nodeLabels[StopMiddle]); + + if (c < middle) + StopHigh = StopMiddle; + else if (c > middle) + StopLow = StopMiddle + 1; + else + { + *i = StopMiddle; + return true; + } + } + + *i = StopHigh; + return false; +} + +Datum +spg_text_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + text *inText = DatumGetTextPP(in->datum); + char *inStr = VARDATA_ANY(inText); + int inSize = VARSIZE_ANY_EXHDR(inText); + uint8 nodeChar = '\0'; + int i = 0; + int commonLen = 0; + + /* Check for prefix match, set nodeChar to first byte after prefix */ + if (in->hasPrefix) + { + text *prefixText = DatumGetTextPP(in->prefixDatum); + char *prefixStr = VARDATA_ANY(prefixText); + int prefixSize = VARSIZE_ANY_EXHDR(prefixText); + + commonLen = commonPrefix(inStr + in->level, + prefixStr, + inSize - in->level, + prefixSize); + + if (commonLen == prefixSize) + { + if (inSize - in->level > commonLen) + nodeChar = *(uint8 *) (inStr + in->level + commonLen); + else + nodeChar = '\0'; + } + else + { + /* Must split tuple because incoming value doesn't match prefix */ + out->resultType = spgSplitTuple; + + if (commonLen == 0) + { + out->result.splitTuple.prefixHasPrefix = false; + } + else + { + out->result.splitTuple.prefixHasPrefix = true; + out->result.splitTuple.prefixPrefixDatum = + formTextDatum(prefixStr, commonLen); + } + out->result.splitTuple.nodeLabel = + UInt8GetDatum(*(prefixStr + commonLen)); + + if (prefixSize - commonLen == 1) + { + out->result.splitTuple.postfixHasPrefix = false; + } + else + { + out->result.splitTuple.postfixHasPrefix = true; + out->result.splitTuple.postfixPrefixDatum = + formTextDatum(prefixStr + commonLen + 1, + prefixSize - commonLen - 1); + } + + 
PG_RETURN_VOID(); + } + } + else if (inSize > in->level) + { + nodeChar = *(uint8 *) (inStr + in->level); + } + else + { + nodeChar = '\0'; + } + + /* Look up nodeChar in the node label array */ + if (searchChar(in->nodeLabels, in->nNodes, nodeChar, &i)) + { + /* + * Descend to existing node. (If in->allTheSame, the core code will + * ignore our nodeN specification here, but that's OK. We still + * have to provide the correct levelAdd and restDatum values, and + * those are the same regardless of which node gets chosen by core.) + */ + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = i; + out->result.matchNode.levelAdd = commonLen + 1; + if (inSize - in->level - commonLen - 1 > 0) + out->result.matchNode.restDatum = + formTextDatum(inStr + in->level + commonLen + 1, + inSize - in->level - commonLen - 1); + else + out->result.matchNode.restDatum = + formTextDatum(NULL, 0); + } + else if (in->allTheSame) + { + /* + * Can't use AddNode action, so split the tuple. The upper tuple + * has the same prefix as before and uses an empty node label for + * the lower tuple. The lower tuple has no prefix and the same + * node labels as the original tuple. + */ + out->resultType = spgSplitTuple; + out->result.splitTuple.prefixHasPrefix = in->hasPrefix; + out->result.splitTuple.prefixPrefixDatum = in->prefixDatum; + out->result.splitTuple.nodeLabel = UInt8GetDatum('\0'); + out->result.splitTuple.postfixHasPrefix = false; + } + else + { + /* Add a node for the not-previously-seen nodeChar value */ + out->resultType = spgAddNode; + out->result.addNode.nodeLabel = UInt8GetDatum(nodeChar); + out->result.addNode.nodeN = i; + } + + PG_RETURN_VOID(); +} + +/* qsort comparator to sort spgNodePtr structs by "c" */ +static int +cmpNodePtr(const void *a, const void *b) +{ + const spgNodePtr *aa = (const spgNodePtr *) a; + const spgNodePtr *bb = (const spgNodePtr *) b; + + if (aa->c == bb->c) + return 0; + else if (aa->c > bb->c) + return 1; + else + return -1; +} + +Datum +spg_text_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + text *text0 = DatumGetTextPP(in->datums[0]); + int i, + commonLen; + spgNodePtr *nodes; + + /* Identify longest common prefix, if any */ + commonLen = VARSIZE_ANY_EXHDR(text0); + for (i = 1; i < in->nTuples && commonLen > 0; i++) + { + text *texti = DatumGetTextPP(in->datums[i]); + int tmp = commonPrefix(VARDATA_ANY(text0), + VARDATA_ANY(texti), + VARSIZE_ANY_EXHDR(text0), + VARSIZE_ANY_EXHDR(texti)); + + if (tmp < commonLen) + commonLen = tmp; + } + + /* + * Limit the prefix length, if necessary, to ensure that the resulting + * inner tuple will fit on a page. + */ + commonLen = Min(commonLen, SPGIST_MAX_PREFIX_LENGTH); + + /* Set node prefix to be that string, if it's not empty */ + if (commonLen == 0) + { + out->hasPrefix = false; + } + else + { + out->hasPrefix = true; + out->prefixDatum = formTextDatum(VARDATA_ANY(text0), commonLen); + } + + /* Extract the node label (first non-common byte) from each value */ + nodes = (spgNodePtr *) palloc(sizeof(spgNodePtr) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + text *texti = DatumGetTextPP(in->datums[i]); + + if (commonLen < VARSIZE_ANY_EXHDR(texti)) + nodes[i].c = *(uint8 *) (VARDATA_ANY(texti) + commonLen); + else + nodes[i].c = '\0'; /* use \0 if string is all common */ + nodes[i].i = i; + nodes[i].d = in->datums[i]; + } + + /* + * Sort by label bytes so that we can group the values into nodes. 
This + * also ensures that the nodes are ordered by label value, allowing the + * use of binary search in searchChar. + */ + qsort(nodes, in->nTuples, sizeof(*nodes), cmpNodePtr); + + /* And emit results */ + out->nNodes = 0; + out->nodeLabels = (Datum *) palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + text *texti = DatumGetTextPP(nodes[i].d); + Datum leafD; + + if (i == 0 || nodes[i].c != nodes[i - 1].c) + { + out->nodeLabels[out->nNodes] = UInt8GetDatum(nodes[i].c); + out->nNodes++; + } + + if (commonLen < VARSIZE_ANY_EXHDR(texti)) + leafD = formTextDatum(VARDATA_ANY(texti) + commonLen + 1, + VARSIZE_ANY_EXHDR(texti) - commonLen - 1); + else + leafD = formTextDatum(NULL, 0); + + out->leafTupleDatums[nodes[i].i] = leafD; + out->mapTuplesToNodes[nodes[i].i] = out->nNodes - 1; + } + + PG_RETURN_VOID(); +} + +Datum +spg_text_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + StrategyNumber strategy = in->strategy; + text *inText; + int inSize; + int i; + text *reconstrText = NULL; + int maxReconstrLen = 0; + text *prefixText = NULL; + int prefixSize = 0; + + /* + * If it's a collation-aware operator, but the collation is C, we can + * treat it as non-collation-aware. + */ + if (strategy > 10 && + lc_collate_is_c(PG_GET_COLLATION())) + strategy -= 10; + + inText = DatumGetTextPP(in->query); + inSize = VARSIZE_ANY_EXHDR(inText); + + /* + * Reconstruct values represented at this tuple, including parent data, + * prefix of this tuple if any, and the node label if any. in->level + * should be the length of the previously reconstructed value, and the + * number of bytes added here is prefixSize or prefixSize + 1. + * + * Note: we assume that in->reconstructedValue isn't toasted and doesn't + * have a short varlena header. This is okay because it must have been + * created by a previous invocation of this routine, and we always emit + * long-format reconstructed values. + */ + Assert(in->level == 0 ? DatumGetPointer(in->reconstructedValue) == NULL : + VARSIZE_ANY_EXHDR(DatumGetPointer(in->reconstructedValue)) == in->level); + + maxReconstrLen = in->level + 1; + if (in->hasPrefix) + { + prefixText = DatumGetTextPP(in->prefixDatum); + prefixSize = VARSIZE_ANY_EXHDR(prefixText); + maxReconstrLen += prefixSize; + } + + reconstrText = palloc(VARHDRSZ + maxReconstrLen); + SET_VARSIZE(reconstrText, VARHDRSZ + maxReconstrLen); + + if (in->level) + memcpy(VARDATA(reconstrText), + VARDATA(DatumGetPointer(in->reconstructedValue)), + in->level); + if (prefixSize) + memcpy(((char *) VARDATA(reconstrText)) + in->level, + VARDATA_ANY(prefixText), + prefixSize); + /* last byte of reconstrText will be filled in below */ + + /* + * Scan the child nodes. For each one, complete the reconstructed value + * and see if it's consistent with the query. If so, emit an entry into + * the output arrays. 
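+ * (The output arrays below are sized for the worst case of all in->nNodes
+ * children matching; out->nNodes counts how many entries are actually
+ * emitted.)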
+ */ + out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->levelAdds = (int *) palloc(sizeof(int) * in->nNodes); + out->reconstructedValues = (Datum *) palloc(sizeof(Datum) * in->nNodes); + out->nNodes = 0; + + for (i = 0; i < in->nNodes; i++) + { + uint8 nodeChar = DatumGetUInt8(in->nodeLabels[i]); + int thisLen; + int r; + bool res = false; + + /* If nodeChar is zero, don't include it in data */ + if (nodeChar == '\0') + thisLen = maxReconstrLen - 1; + else + { + ((char *) VARDATA(reconstrText))[maxReconstrLen - 1] = nodeChar; + thisLen = maxReconstrLen; + } + + r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText), + Min(inSize, thisLen)); + + switch (strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (r <= 0) + res = true; + break; + case BTEqualStrategyNumber: + if (r == 0 && inSize >= thisLen) + res = true; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (r >= 0) + res = true; + break; + case BTLessStrategyNumber + 10: + case BTLessEqualStrategyNumber + 10: + case BTGreaterEqualStrategyNumber + 10: + case BTGreaterStrategyNumber + 10: + /* + * with non-C collation we need to traverse whole tree :-( + */ + res = true; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->strategy); + break; + } + + if (res) + { + out->nodeNumbers[out->nNodes] = i; + out->levelAdds[out->nNodes] = thisLen - in->level; + SET_VARSIZE(reconstrText, VARHDRSZ + thisLen); + out->reconstructedValues[out->nNodes] = + datumCopy(PointerGetDatum(reconstrText), false, -1); + out->nNodes++; + } + } + + PG_RETURN_VOID(); +} + +Datum +spg_text_leaf_consistent(PG_FUNCTION_ARGS) +{ + spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); + spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); + StrategyNumber strategy = in->strategy; + text *query = DatumGetTextPP(in->query); + int level = in->level; + text *leafValue, + *reconstrValue = NULL; + char *fullValue; + int fullLen; + int queryLen; + int r; + bool res; + + /* all tests are exact */ + out->recheck = false; + + leafValue = DatumGetTextPP(in->leafDatum); + + if (DatumGetPointer(in->reconstructedValue)) + reconstrValue = DatumGetTextP(in->reconstructedValue); + + Assert(level == 0 ? 
reconstrValue == NULL : + VARSIZE_ANY_EXHDR(reconstrValue) == level); + + fullLen = level + VARSIZE_ANY_EXHDR(leafValue); + + queryLen = VARSIZE_ANY_EXHDR(query); + + /* For equality, we needn't reconstruct fullValue if not same length */ + if (strategy == BTEqualStrategyNumber && queryLen != fullLen) + PG_RETURN_BOOL(false); + + /* Else, reconstruct the full string represented by this leaf tuple */ + if (VARSIZE_ANY_EXHDR(leafValue) == 0 && level > 0) + { + fullValue = VARDATA(reconstrValue); + } + else + { + fullValue = palloc(fullLen); + if (level) + memcpy(fullValue, VARDATA(reconstrValue), level); + if (VARSIZE_ANY_EXHDR(leafValue) > 0) + memcpy(fullValue + level, VARDATA_ANY(leafValue), + VARSIZE_ANY_EXHDR(leafValue)); + } + + /* Run the appropriate type of comparison */ + if (strategy > 10) + { + /* Collation-aware comparison */ + strategy -= 10; + + /* If asserts are enabled, verify encoding of reconstructed string */ + Assert(pg_verifymbstr(fullValue, fullLen, false)); + + r = varstr_cmp(fullValue, Min(queryLen, fullLen), + VARDATA_ANY(query), Min(queryLen, fullLen), + PG_GET_COLLATION()); + } + else + { + /* Non-collation-aware comparison */ + r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen)); + } + + if (r == 0) + { + if (queryLen > fullLen) + r = -1; + else if (queryLen < fullLen) + r = 1; + } + + switch (strategy) + { + case BTLessStrategyNumber: + res = (r < 0); + break; + case BTLessEqualStrategyNumber: + res = (r <= 0); + break; + case BTEqualStrategyNumber: + res = (r == 0); + break; + case BTGreaterEqualStrategyNumber: + res = (r >= 0); + break; + case BTGreaterStrategyNumber: + res = (r > 0); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", in->strategy); + res = false; + break; + } + + PG_RETURN_BOOL(res); +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c new file mode 100644 index 00000000000..d6c01a5f842 --- /dev/null +++ b/src/backend/access/spgist/spgutils.c @@ -0,0 +1,850 @@ +/*------------------------------------------------------------------------- + * + * spgutils.c + * various support functions for SP-GiST + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgutils.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/reloptions.h" +#include "access/spgist_private.h" +#include "access/transam.h" +#include "access/xact.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "utils/lsyscache.h" + + +/* Fill in a SpGistTypeDesc struct with info about the specified data type */ +static void +fillTypeDesc(SpGistTypeDesc *desc, Oid type) +{ + desc->type = type; + get_typlenbyval(type, &desc->attlen, &desc->attbyval); +} + +/* Initialize SpGistState for working with the given index */ +void +initSpGistState(SpGistState *state, Relation index) +{ + Oid atttype; + spgConfigIn in; + + /* SPGiST doesn't support multi-column indexes */ + Assert(index->rd_att->natts == 1); + + /* + * Get the actual data type of the indexed column from the index tupdesc. + * We pass this to the opclass config function so that polymorphic + * opclasses are possible. 
+ */ + atttype = index->rd_att->attrs[0]->atttypid; + + /* Get the config info for the opclass */ + in.attType = atttype; + + memset(&state->config, 0, sizeof(state->config)); + + FunctionCall2Coll(index_getprocinfo(index, 1, SPGIST_CONFIG_PROC), + index->rd_indcollation[0], + PointerGetDatum(&in), + PointerGetDatum(&state->config)); + + /* Get the information we need about each relevant datatype */ + fillTypeDesc(&state->attType, atttype); + fillTypeDesc(&state->attPrefixType, state->config.prefixType); + fillTypeDesc(&state->attLabelType, state->config.labelType); + + /* Get lookup info for opclass support procs */ + fmgr_info_copy(&(state->chooseFn), + index_getprocinfo(index, 1, SPGIST_CHOOSE_PROC), + CurrentMemoryContext); + fmgr_info_copy(&(state->picksplitFn), + index_getprocinfo(index, 1, SPGIST_PICKSPLIT_PROC), + CurrentMemoryContext); + fmgr_info_copy(&(state->innerConsistentFn), + index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC), + CurrentMemoryContext); + fmgr_info_copy(&(state->leafConsistentFn), + index_getprocinfo(index, 1, SPGIST_LEAF_CONSISTENT_PROC), + CurrentMemoryContext); + + /* Make workspace for constructing dead tuples */ + state->deadTupleStorage = palloc0(SGDTSIZE); + + /* Set XID to use in redirection tuples */ + state->myXid = GetTopTransactionIdIfAny(); + + state->isBuild = false; +} + +/* + * Allocate a new page (either by recycling, or by extending the index file). + * + * The returned buffer is already pinned and exclusive-locked. + * Caller is responsible for initializing the page by calling SpGistInitBuffer. + */ +Buffer +SpGistNewBuffer(Relation index) +{ + Buffer buffer; + bool needLock; + + /* First, try to get a page from FSM */ + for (;;) + { + BlockNumber blkno = GetFreeIndexPage(index); + + if (blkno == InvalidBlockNumber) + break; /* nothing known to FSM */ + + /* + * The root page shouldn't ever be listed in FSM, but just in case it + * is, ignore it. + */ + if (blkno == SPGIST_HEAD_BLKNO) + continue; + + buffer = ReadBuffer(index, blkno); + + /* + * We have to guard against the possibility that someone else already + * recycled this page; the buffer may be locked if so. 
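+ * (Hence the ConditionalLockBuffer call below: if we cannot get the lock
+ * immediately, we release the buffer and ask the FSM for another page.)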
+ */ + if (ConditionalLockBuffer(buffer)) + { + Page page = BufferGetPage(buffer); + + if (PageIsNew(page)) + return buffer; /* OK to use, if never initialized */ + + if (SpGistPageIsDeleted(page) || PageIsEmpty(page)) + return buffer; /* OK to use */ + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + + /* Can't use it, so release buffer and try again */ + ReleaseBuffer(buffer); + } + + /* Must extend the file */ + needLock = !RELATION_IS_LOCAL(index); + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + + buffer = ReadBuffer(index, P_NEW); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + return buffer; +} + +/* + * Fetch local cache of lastUsedPages info, initializing it from the metapage + * if necessary + */ +static SpGistCache * +spgGetCache(Relation index) +{ + SpGistCache *cache; + + if (index->rd_amcache == NULL) + { + Buffer metabuffer; + SpGistMetaPageData *metadata; + + cache = MemoryContextAlloc(index->rd_indexcxt, + sizeof(SpGistCache)); + + metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + + metadata = SpGistPageGetMeta(BufferGetPage(metabuffer)); + + if (metadata->magicNumber != SPGIST_MAGIC_NUMBER) + elog(ERROR, "index \"%s\" is not an SP-GiST index", + RelationGetRelationName(index)); + + *cache = metadata->lastUsedPages; + + UnlockReleaseBuffer(metabuffer); + + index->rd_amcache = cache; + } + else + { + cache = (SpGistCache *) index->rd_amcache; + } + + return cache; +} + +/* + * Update index metapage's lastUsedPages info from local cache, if possible + * + * Updating meta page isn't critical for index working, so + * 1 use ConditionalLockBuffer to improve concurrency + * 2 don't WAL-log metabuffer changes to decrease WAL traffic + */ +void +SpGistUpdateMetaPage(Relation index) +{ + SpGistCache *cache = (SpGistCache *) index->rd_amcache; + + if (cache != NULL) + { + Buffer metabuffer; + SpGistMetaPageData *metadata; + + metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); + + if (ConditionalLockBuffer(metabuffer)) + { + metadata = SpGistPageGetMeta(BufferGetPage(metabuffer)); + metadata->lastUsedPages = *cache; + + MarkBufferDirty(metabuffer); + UnlockReleaseBuffer(metabuffer); + } + else + { + ReleaseBuffer(metabuffer); + } + } +} + +/* Macro to select proper element of lastUsedPages cache depending on flags */ +#define GET_LUP(c, f) (((f) & GBUF_LEAF) ? \ + &(c)->leafPage : \ + &(c)->innerPage[(f) & GBUF_PARITY_MASK]) + +/* + * Allocate and initialize a new buffer of the type and parity specified by + * flags. The returned buffer is already pinned and exclusive-locked. + * + * When requesting an inner page, if we get one with the wrong parity, + * we just release the buffer and try again. We will get a different page + * because GetFreeIndexPage will have marked the page used in FSM. The page + * is entered in our local lastUsedPages cache, so there's some hope of + * making use of it later in this session, but otherwise we rely on VACUUM + * to eventually re-enter the page in FSM, making it available for recycling. + * Note that such a page does not get marked dirty here, so unless it's used + * fairly soon, the buffer will just get discarded and the page will remain + * as it was on disk. + * + * When we return a buffer to the caller, the page is *not* entered into + * the lastUsedPages cache; we expect the caller will do so after it's taken + * whatever space it will use. 
This is because after the caller has used up + * some space, the page might have less space than whatever was cached already + * so we'd rather not trash the old cache entry. + */ +static Buffer +allocNewBuffer(Relation index, int flags) +{ + SpGistCache *cache = spgGetCache(index); + + for (;;) + { + Buffer buffer; + + buffer = SpGistNewBuffer(index); + SpGistInitBuffer(buffer, (flags & GBUF_LEAF) ? SPGIST_LEAF : 0); + + if (flags & GBUF_LEAF) + { + /* Leaf pages have no parity concerns, so just use it */ + return buffer; + } + else + { + BlockNumber blkno = BufferGetBlockNumber(buffer); + int blkParity = blkno % 3; + + if ((flags & GBUF_PARITY_MASK) == blkParity) + { + /* Page has right parity, use it */ + return buffer; + } + else + { + /* Page has wrong parity, record it in cache and try again */ + cache->innerPage[blkParity].blkno = blkno; + cache->innerPage[blkParity].freeSpace = + PageGetExactFreeSpace(BufferGetPage(buffer)); + UnlockReleaseBuffer(buffer); + } + } + } +} + +/* + * Get a buffer of the type and parity specified by flags, having at least + * as much free space as indicated by needSpace. We use the lastUsedPages + * cache to assign the same buffer previously requested when possible. + * The returned buffer is already pinned and exclusive-locked. + * + * *isNew is set true if the page was initialized here, false if it was + * already valid. + */ +Buffer +SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) +{ + SpGistCache *cache = spgGetCache(index); + SpGistLastUsedPage *lup; + + /* Bail out if even an empty page wouldn't meet the demand */ + if (needSpace > SPGIST_PAGE_CAPACITY) + elog(ERROR, "desired SPGiST tuple size is too big"); + + /* + * If possible, increase the space request to include relation's + * fillfactor. This ensures that when we add unrelated tuples to a page, + * we try to keep 100-fillfactor% available for adding tuples that are + * related to the ones already on it. But fillfactor mustn't cause an + * error for requests that would otherwise be legal. + */ + needSpace += RelationGetTargetPageFreeSpace(index, + SPGIST_DEFAULT_FILLFACTOR); + needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY); + + /* Get the cache entry for this flags setting */ + lup = GET_LUP(cache, flags); + + /* If we have nothing cached, just turn it over to allocNewBuffer */ + if (lup->blkno == InvalidBlockNumber) + { + *isNew = true; + return allocNewBuffer(index, flags); + } + + /* root page should never be in cache */ + Assert(lup->blkno != SPGIST_HEAD_BLKNO); + + /* If cached freeSpace isn't enough, don't bother looking at the page */ + if (lup->freeSpace >= needSpace) + { + Buffer buffer; + Page page; + + buffer = ReadBuffer(index, lup->blkno); + + if (!ConditionalLockBuffer(buffer)) + { + /* + * buffer is locked by another process, so return a new buffer + */ + ReleaseBuffer(buffer); + *isNew = true; + return allocNewBuffer(index, flags); + } + + page = BufferGetPage(buffer); + + if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page)) + { + /* OK to initialize the page */ + SpGistInitBuffer(buffer, (flags & GBUF_LEAF) ? SPGIST_LEAF : 0); + lup->freeSpace = PageGetExactFreeSpace(page) - needSpace; + *isNew = true; + return buffer; + } + + /* + * Check that page is of right type and has enough space. We must + * recheck this since our cache isn't necessarily up to date. + */ + if ((flags & GBUF_LEAF) ? 
SpGistPageIsLeaf(page) : + !SpGistPageIsLeaf(page)) + { + int freeSpace = PageGetExactFreeSpace(page); + + if (freeSpace >= needSpace) + { + /* Success, update freespace info and return the buffer */ + lup->freeSpace = freeSpace - needSpace; + *isNew = false; + return buffer; + } + } + + /* + * fallback to allocation of new buffer + */ + UnlockReleaseBuffer(buffer); + } + + /* No success with cache, so return a new buffer */ + *isNew = true; + return allocNewBuffer(index, flags); +} + +/* + * Update lastUsedPages cache when done modifying a page. + * + * We update the appropriate cache entry if it already contained this page + * (its freeSpace is likely obsolete), or if this page has more space than + * whatever we had cached. + */ +void +SpGistSetLastUsedPage(Relation index, Buffer buffer) +{ + SpGistCache *cache = spgGetCache(index); + SpGistLastUsedPage *lup; + int freeSpace; + Page page = BufferGetPage(buffer); + BlockNumber blkno = BufferGetBlockNumber(buffer); + int flags; + + /* Never enter the root page in cache, though */ + if (blkno == SPGIST_HEAD_BLKNO) + return; + + if (SpGistPageIsLeaf(page)) + flags = GBUF_LEAF; + else + flags = GBUF_INNER_PARITY(blkno); + + lup = GET_LUP(cache, flags); + + freeSpace = PageGetExactFreeSpace(page); + if (lup->blkno == InvalidBlockNumber || lup->blkno == blkno || + lup->freeSpace < freeSpace) + { + lup->blkno = blkno; + lup->freeSpace = freeSpace; + } +} + +/* + * Initialize an SPGiST page to empty, with specified flags + */ +void +SpGistInitPage(Page page, uint16 f) +{ + SpGistPageOpaque opaque; + + PageInit(page, BLCKSZ, MAXALIGN(sizeof(SpGistPageOpaqueData))); + opaque = SpGistPageGetOpaque(page); + memset(opaque, 0, sizeof(SpGistPageOpaqueData)); + opaque->flags = f; + opaque->spgist_page_id = SPGIST_PAGE_ID; +} + +/* + * Initialize a buffer's page to empty, with specified flags + */ +void +SpGistInitBuffer(Buffer b, uint16 f) +{ + Assert(BufferGetPageSize(b) == BLCKSZ); + SpGistInitPage(BufferGetPage(b), f); +} + +/* + * Initialize metadata page + */ +void +SpGistInitMetapage(Page page) +{ + SpGistMetaPageData *metadata; + + SpGistInitPage(page, SPGIST_META); + metadata = SpGistPageGetMeta(page); + memset(metadata, 0, sizeof(SpGistMetaPageData)); + metadata->magicNumber = SPGIST_MAGIC_NUMBER; + + /* initialize last-used-page cache to empty */ + metadata->lastUsedPages.innerPage[0].blkno = InvalidBlockNumber; + metadata->lastUsedPages.innerPage[1].blkno = InvalidBlockNumber; + metadata->lastUsedPages.innerPage[2].blkno = InvalidBlockNumber; + metadata->lastUsedPages.leafPage.blkno = InvalidBlockNumber; +} + +/* + * reloptions processing for SPGiST + */ +Datum +spgoptions(PG_FUNCTION_ARGS) +{ + Datum reloptions = PG_GETARG_DATUM(0); + bool validate = PG_GETARG_BOOL(1); + bytea *result; + + result = default_reloptions(reloptions, validate, RELOPT_KIND_SPGIST); + + if (result) + PG_RETURN_BYTEA_P(result); + PG_RETURN_NULL(); +} + +/* + * Get the space needed to store a datum of the indicated type. + * Note the result is already rounded up to a MAXALIGN boundary. + * Also, we follow the SPGiST convention that pass-by-val types are + * just stored in their Datum representation (compare memcpyDatum). 
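+ * (For example, a pass-by-value node label such as the "char" labels used
+ * by the text opclass occupies MAXALIGN(sizeof(Datum)) bytes here, not just
+ * one byte.)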
+ */ +unsigned int +SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum) +{ + unsigned int size; + + if (att->attbyval) + size = sizeof(Datum); + else if (att->attlen > 0) + size = att->attlen; + else + size = VARSIZE_ANY(datum); + + return MAXALIGN(size); +} + +/* + * Copy the given datum to *target + */ +static void +memcpyDatum(void *target, SpGistTypeDesc *att, Datum datum) +{ + unsigned int size; + + if (att->attbyval) + { + memcpy(target, &datum, sizeof(Datum)); + } + else + { + size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum); + memcpy(target, DatumGetPointer(datum), size); + } +} + +/* + * Construct a leaf tuple containing the given heap TID and datum value + */ +SpGistLeafTuple +spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, Datum datum) +{ + SpGistLeafTuple tup; + unsigned int size; + + /* compute space needed (note result is already maxaligned) */ + size = SGLTHDRSZ + SpGistGetTypeSize(&state->attType, datum); + + /* + * Ensure that we can replace the tuple with a dead tuple later. This + * test is unnecessary given current tuple layouts, but let's be safe. + */ + if (size < SGDTSIZE) + size = SGDTSIZE; + + /* OK, form the tuple */ + tup = (SpGistLeafTuple) palloc0(size); + + tup->size = size; + tup->nextOffset = InvalidOffsetNumber; + tup->heapPtr = *heapPtr; + memcpyDatum(SGLTDATAPTR(tup), &state->attType, datum); + + return tup; +} + +/* + * Construct a node (to go into an inner tuple) containing the given label + * + * Note that the node's downlink is just set invalid here. Caller will fill + * it in later. + */ +SpGistNodeTuple +spgFormNodeTuple(SpGistState *state, Datum label, bool isnull) +{ + SpGistNodeTuple tup; + unsigned int size; + unsigned short infomask = 0; + + /* compute space needed (note result is already maxaligned) */ + size = SGNTHDRSZ; + if (!isnull) + size += SpGistGetTypeSize(&state->attLabelType, label); + + /* + * Here we make sure that the size will fit in the field reserved for it + * in t_info. + */ + if ((size & INDEX_SIZE_MASK) != size) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("index row requires %lu bytes, maximum size is %lu", + (unsigned long) size, + (unsigned long) INDEX_SIZE_MASK))); + + tup = (SpGistNodeTuple) palloc0(size); + + if (isnull) + infomask |= INDEX_NULL_MASK; + /* we don't bother setting the INDEX_VAR_MASK bit */ + infomask |= size; + tup->t_info = infomask; + + /* The TID field will be filled in later */ + ItemPointerSetInvalid(&tup->t_tid); + + if (!isnull) + memcpyDatum(SGNTDATAPTR(tup), &state->attLabelType, label); + + return tup; +} + +/* + * Construct an inner tuple containing the given prefix and node array + */ +SpGistInnerTuple +spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix, + int nNodes, SpGistNodeTuple *nodes) +{ + SpGistInnerTuple tup; + unsigned int size; + unsigned int prefixSize; + int i; + char *ptr; + + /* Compute size needed */ + if (hasPrefix) + prefixSize = SpGistGetTypeSize(&state->attPrefixType, prefix); + else + prefixSize = 0; + + size = SGITHDRSZ + prefixSize; + + /* Note: we rely on node tuple sizes to be maxaligned already */ + for (i = 0; i < nNodes; i++) + size += IndexTupleSize(nodes[i]); + + /* + * Ensure that we can replace the tuple with a dead tuple later. This + * test is unnecessary given current tuple layouts, but let's be safe. 
+ */ + if (size < SGDTSIZE) + size = SGDTSIZE; + + /* + * Inner tuple should be small enough to fit on a page + */ + if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("SPGiST inner tuple size %lu exceeds maximum %lu", + (unsigned long) size, + (unsigned long) (SPGIST_PAGE_CAPACITY - sizeof(ItemIdData))), + errhint("Values larger than a buffer page cannot be indexed."))); + + /* + * Check for overflow of header fields --- probably can't fail if the + * above succeeded, but let's be paranoid + */ + if (size > SGITMAXSIZE || + prefixSize > SGITMAXPREFIXSIZE || + nNodes > SGITMAXNNODES) + elog(ERROR, "SPGiST inner tuple header field is too small"); + + /* OK, form the tuple */ + tup = (SpGistInnerTuple) palloc0(size); + + tup->nNodes = nNodes; + tup->prefixSize = prefixSize; + tup->size = size; + + if (hasPrefix) + memcpyDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix); + + ptr = (char *) SGITNODEPTR(tup); + + for (i = 0; i < nNodes; i++) + { + SpGistNodeTuple node = nodes[i]; + + memcpy(ptr, node, IndexTupleSize(node)); + ptr += IndexTupleSize(node); + } + + return tup; +} + +/* + * Construct a "dead" tuple to replace a tuple being deleted. + * + * The state can be SPGIST_REDIRECT, SPGIST_DEAD, or SPGIST_PLACEHOLDER. + * For a REDIRECT tuple, a pointer (blkno+offset) must be supplied, and + * the xid field is filled in automatically. + * + * This is called in critical sections, so we don't use palloc; the tuple + * is built in preallocated storage. It should be copied before another + * call with different parameters can occur. + */ +SpGistDeadTuple +spgFormDeadTuple(SpGistState *state, int tupstate, + BlockNumber blkno, OffsetNumber offnum) +{ + SpGistDeadTuple tuple = (SpGistDeadTuple) state->deadTupleStorage; + + tuple->tupstate = tupstate; + tuple->size = SGDTSIZE; + tuple->nextOffset = InvalidOffsetNumber; + + if (tupstate == SPGIST_REDIRECT) + { + ItemPointerSet(&tuple->pointer, blkno, offnum); + tuple->xid = state->myXid; + } + else + { + ItemPointerSetInvalid(&tuple->pointer); + tuple->xid = InvalidTransactionId; + } + + return tuple; +} + +/* + * Extract the label datums of the nodes within innerTuple + * + * Returns NULL if label datums are NULLs + */ +Datum * +spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple) +{ + Datum *nodeLabels; + int nullcount = 0; + int i; + SpGistNodeTuple node; + + nodeLabels = (Datum *) palloc(sizeof(Datum) * innerTuple->nNodes); + SGITITERATE(innerTuple, i, node) + { + if (IndexTupleHasNulls(node)) + nullcount++; + else + nodeLabels[i] = SGNTDATUM(node, state); + } + if (nullcount == innerTuple->nNodes) + { + /* They're all null, so just return NULL */ + pfree(nodeLabels); + return NULL; + } + if (nullcount != 0) + elog(ERROR, "some but not all node labels are null in SPGiST inner tuple"); + return nodeLabels; +} + +/* + * Add a new item to the page, replacing a PLACEHOLDER item if possible. + * Return the location it's inserted at, or InvalidOffsetNumber on failure. + * + * If startOffset isn't NULL, we start searching for placeholders at + * *startOffset, and update that to the next place to search. This is just + * an optimization for repeated insertions. + * + * If errorOK is false, we throw error when there's not enough room, + * rather than returning InvalidOffsetNumber. 
+ */ +OffsetNumber +SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size, + OffsetNumber *startOffset, bool errorOK) +{ + SpGistPageOpaque opaque = SpGistPageGetOpaque(page); + OffsetNumber i, + maxoff, + offnum; + + if (opaque->nPlaceholder > 0 && + PageGetExactFreeSpace(page) + SGDTSIZE >= MAXALIGN(size)) + { + /* Try to replace a placeholder */ + maxoff = PageGetMaxOffsetNumber(page); + offnum = InvalidOffsetNumber; + + for (;;) + { + if (startOffset && *startOffset != InvalidOffsetNumber) + i = *startOffset; + else + i = FirstOffsetNumber; + for (; i <= maxoff; i++) + { + SpGistDeadTuple it = (SpGistDeadTuple) PageGetItem(page, + PageGetItemId(page, i)); + + if (it->tupstate == SPGIST_PLACEHOLDER) + { + offnum = i; + break; + } + } + + /* Done if we found a placeholder */ + if (offnum != InvalidOffsetNumber) + break; + + if (startOffset && *startOffset != InvalidOffsetNumber) + { + /* Hint was no good, re-search from beginning */ + *startOffset = InvalidOffsetNumber; + continue; + } + + /* Hmm, no placeholder found? */ + opaque->nPlaceholder = 0; + break; + } + + if (offnum != InvalidOffsetNumber) + { + /* Replace the placeholder tuple */ + PageIndexTupleDelete(page, offnum); + + offnum = PageAddItem(page, item, size, offnum, false, false); + + /* + * We should not have failed given the size check at the top of + * the function, but test anyway. If we did fail, we must PANIC + * because we've already deleted the placeholder tuple, and + * there's no other way to keep the damage from getting to disk. + */ + if (offnum != InvalidOffsetNumber) + { + Assert(opaque->nPlaceholder > 0); + opaque->nPlaceholder--; + if (startOffset) + *startOffset = offnum + 1; + } + else + elog(PANIC, "failed to add item of size %u to SPGiST index page", + size); + + return offnum; + } + } + + /* No luck in replacing a placeholder, so just add it to the page */ + offnum = PageAddItem(page, item, size, + InvalidOffsetNumber, false, false); + + if (offnum == InvalidOffsetNumber && !errorOK) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + size); + + return offnum; +} diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c new file mode 100644 index 00000000000..90d59920eb6 --- /dev/null +++ b/src/backend/access/spgist/spgvacuum.c @@ -0,0 +1,755 @@ +/*------------------------------------------------------------------------- + * + * spgvacuum.c + * vacuum for SP-GiST + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgvacuum.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/spgist_private.h" +#include "access/transam.h" +#include "catalog/storage.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/indexfsm.h" +#include "storage/lmgr.h" +#include "storage/procarray.h" + + +/* local state for vacuum operations */ +typedef struct spgBulkDeleteState +{ + /* Parameters passed in to spgvacuumscan */ + IndexVacuumInfo *info; + IndexBulkDeleteResult *stats; + IndexBulkDeleteCallback callback; + void *callback_state; + /* Additional working state */ + SpGistState spgstate; + TransactionId OldestXmin; + BlockNumber lastFilledBlock; +} spgBulkDeleteState; + + +/* + * Vacuum a regular (non-root) leaf page + * + * We must delete tuples 
that are targeted for deletion by the VACUUM, + * but not move any tuples that are referenced by outside links; we assume + * those are the ones that are heads of chains. + */ +static void +vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + spgxlogVacuumLeaf xlrec; + XLogRecData rdata[8]; + OffsetNumber toDead[MaxIndexTuplesPerPage]; + OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; + OffsetNumber moveSrc[MaxIndexTuplesPerPage]; + OffsetNumber moveDest[MaxIndexTuplesPerPage]; + OffsetNumber chainSrc[MaxIndexTuplesPerPage]; + OffsetNumber chainDest[MaxIndexTuplesPerPage]; + OffsetNumber predecessor[MaxIndexTuplesPerPage + 1]; + bool deletable[MaxIndexTuplesPerPage + 1]; + int nDeletable; + OffsetNumber i, + max = PageGetMaxOffsetNumber(page); + + memset(predecessor, 0, sizeof(predecessor)); + memset(deletable, 0, sizeof(deletable)); + nDeletable = 0; + + /* Scan page, identify tuples to delete, accumulate stats */ + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, i)); + if (lt->tupstate == SPGIST_LIVE) + { + Assert(ItemPointerIsValid(<->heapPtr)); + + if (bds->callback(<->heapPtr, bds->callback_state)) + { + bds->stats->tuples_removed += 1; + deletable[i] = true; + nDeletable++; + } + else + { + bds->stats->num_index_tuples += 1; + } + + /* Form predecessor map, too */ + if (lt->nextOffset != InvalidOffsetNumber) + { + /* paranoia about corrupted chain links */ + if (lt->nextOffset < FirstOffsetNumber || + lt->nextOffset > max || + predecessor[lt->nextOffset] != InvalidOffsetNumber) + elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"", + BufferGetBlockNumber(buffer), + RelationGetRelationName(index)); + predecessor[lt->nextOffset] = i; + } + } + else + { + Assert(lt->nextOffset == InvalidOffsetNumber); + } + } + + if (nDeletable == 0) + return; /* nothing more to do */ + + /*---------- + * Figure out exactly what we have to do. We do this separately from + * actually modifying the page, mainly so that we have a representation + * that can be dumped into WAL and then the replay code can do exactly + * the same thing. The output of this step consists of six arrays + * describing four kinds of operations, to be performed in this order: + * + * toDead[]: tuple numbers to be replaced with DEAD tuples + * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples + * moveSrc[]: tuple numbers that need to be relocated to another offset + * (replacing the tuple there) and then replaced with PLACEHOLDER tuples + * moveDest[]: new locations for moveSrc tuples + * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates + * chainDest[]: new values of nextOffset for chainSrc members + * + * It's easiest to figure out what we have to do by processing tuple + * chains, so we iterate over all the tuples (not just the deletable + * ones!) to identify chain heads, then chase down each chain and make + * work item entries for deletable tuples within the chain. 
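+ * For example, if a chain runs 2 -> 5 -> 9 and only tuple 5 is deletable,
+ * then 5 is entered in toPlaceholder[] and a single chainSrc/chainDest pair
+ * (2,9) re-links tuple 2 directly to tuple 9.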
+ *---------- + */ + xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0; + + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple head; + bool interveningDeletable; + OffsetNumber prevLive; + OffsetNumber j; + + head = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, i)); + if (head->tupstate != SPGIST_LIVE) + continue; /* can't be a chain member */ + if (predecessor[i] != 0) + continue; /* not a chain head */ + + /* initialize ... */ + interveningDeletable = false; + prevLive = deletable[i] ? InvalidOffsetNumber : i; + + /* scan down the chain ... */ + j = head->nextOffset; + while (j != InvalidOffsetNumber) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, j)); + if (lt->tupstate != SPGIST_LIVE) + { + /* all tuples in chain should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + lt->tupstate); + } + + if (deletable[j]) + { + /* This tuple should be replaced by a placeholder */ + toPlaceholder[xlrec.nPlaceholder] = j; + xlrec.nPlaceholder++; + /* previous live tuple's chain link will need an update */ + interveningDeletable = true; + } + else if (prevLive == InvalidOffsetNumber) + { + /* + * This is the first live tuple in the chain. It has + * to move to the head position. + */ + moveSrc[xlrec.nMove] = j; + moveDest[xlrec.nMove] = i; + xlrec.nMove++; + /* Chain updates will be applied after the move */ + prevLive = i; + interveningDeletable = false; + } + else + { + /* + * Second or later live tuple. Arrange to re-chain it to the + * previous live one, if there was a gap. + */ + if (interveningDeletable) + { + chainSrc[xlrec.nChain] = prevLive; + chainDest[xlrec.nChain] = j; + xlrec.nChain++; + } + prevLive = j; + interveningDeletable = false; + } + + j = lt->nextOffset; + } + + if (prevLive == InvalidOffsetNumber) + { + /* The chain is entirely removable, so we need a DEAD tuple */ + toDead[xlrec.nDead] = i; + xlrec.nDead++; + } + else if (interveningDeletable) + { + /* One or more deletions at end of chain, so close it off */ + chainSrc[xlrec.nChain] = prevLive; + chainDest[xlrec.nChain] = InvalidOffsetNumber; + xlrec.nChain++; + } + } + + /* sanity check ... 
*/ + if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove) + elog(ERROR, "inconsistent counts of deletable tuples"); + + /* Prepare WAL record */ + xlrec.node = index->rd_node; + xlrec.blkno = BufferGetBlockNumber(buffer); + STORE_STATE(&bds->spgstate, xlrec.stateSrc); + + ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); + /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ + ACCEPT_RDATA_DATA(toDead, sizeof(OffsetNumber) * xlrec.nDead, 1); + ACCEPT_RDATA_DATA(toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder, 2); + ACCEPT_RDATA_DATA(moveSrc, sizeof(OffsetNumber) * xlrec.nMove, 3); + ACCEPT_RDATA_DATA(moveDest, sizeof(OffsetNumber) * xlrec.nMove, 4); + ACCEPT_RDATA_DATA(chainSrc, sizeof(OffsetNumber) * xlrec.nChain, 5); + ACCEPT_RDATA_DATA(chainDest, sizeof(OffsetNumber) * xlrec.nChain, 6); + ACCEPT_RDATA_BUFFER(buffer, 7); + + /* Do the updates */ + START_CRIT_SECTION(); + + spgPageIndexMultiDelete(&bds->spgstate, page, + toDead, xlrec.nDead, + SPGIST_DEAD, SPGIST_DEAD, + InvalidBlockNumber, InvalidOffsetNumber); + + spgPageIndexMultiDelete(&bds->spgstate, page, + toPlaceholder, xlrec.nPlaceholder, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, InvalidOffsetNumber); + + /* + * We implement the move step by swapping the item pointers of the + * source and target tuples, then replacing the newly-source tuples + * with placeholders. This is perhaps unduly friendly with the page + * data representation, but it's fast and doesn't risk page overflow + * when a tuple to be relocated is large. + */ + for (i = 0; i < xlrec.nMove; i++) + { + ItemId idSrc = PageGetItemId(page, moveSrc[i]); + ItemId idDest = PageGetItemId(page, moveDest[i]); + ItemIdData tmp; + + tmp = *idSrc; + *idSrc = *idDest; + *idDest = tmp; + } + + spgPageIndexMultiDelete(&bds->spgstate, page, + moveSrc, xlrec.nMove, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, InvalidOffsetNumber); + + for (i = 0; i < xlrec.nChain; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, chainSrc[i])); + Assert(lt->tupstate == SPGIST_LIVE); + lt->nextOffset = chainDest[i]; + } + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + END_CRIT_SECTION(); +} + +/* + * Vacuum the root page when it is a leaf + * + * On the root, we just delete any dead leaf tuples; no fancy business + */ +static void +vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) +{ + Page page = BufferGetPage(buffer); + spgxlogVacuumRoot xlrec; + XLogRecData rdata[3]; + OffsetNumber toDelete[MaxIndexTuplesPerPage]; + OffsetNumber i, + max = PageGetMaxOffsetNumber(page); + + xlrec.nDelete = 0; + + /* Scan page, identify tuples to delete, accumulate stats */ + for (i = FirstOffsetNumber; i <= max; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, i)); + if (lt->tupstate == SPGIST_LIVE) + { + Assert(ItemPointerIsValid(<->heapPtr)); + + if (bds->callback(<->heapPtr, bds->callback_state)) + { + bds->stats->tuples_removed += 1; + toDelete[xlrec.nDelete] = i; + xlrec.nDelete++; + } + else + { + bds->stats->num_index_tuples += 1; + } + } + else + { + /* all tuples on root should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + lt->tupstate); + } + } + + if (xlrec.nDelete == 0) + return; /* nothing more to do */ + + /* Prepare 
WAL record */ + xlrec.node = index->rd_node; + STORE_STATE(&bds->spgstate, xlrec.stateSrc); + + ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); + /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */ + ACCEPT_RDATA_DATA(toDelete, sizeof(OffsetNumber) * xlrec.nDelete, 1); + ACCEPT_RDATA_BUFFER(buffer, 2); + + /* Do the update */ + START_CRIT_SECTION(); + + /* The tuple numbers are in order, so we can use PageIndexMultiDelete */ + PageIndexMultiDelete(page, toDelete, xlrec.nDelete); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + END_CRIT_SECTION(); +} + +/* + * Clean up redirect and placeholder tuples on the given page + * + * Redirect tuples can be marked placeholder once they're old enough. + * Placeholder tuples can be removed if it won't change the offsets of + * non-placeholder ones. + * + * Unlike the routines above, this works on both leaf and inner pages. + */ +static void +vacuumRedirectAndPlaceholder(Relation index, Buffer buffer, + TransactionId OldestXmin) +{ + Page page = BufferGetPage(buffer); + SpGistPageOpaque opaque = SpGistPageGetOpaque(page); + OffsetNumber i, + max = PageGetMaxOffsetNumber(page), + firstPlaceholder = InvalidOffsetNumber; + bool hasNonPlaceholder = false; + bool hasUpdate = false; + OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; + OffsetNumber itemnos[MaxIndexTuplesPerPage]; + spgxlogVacuumRedirect xlrec; + XLogRecData rdata[3]; + + xlrec.node = index->rd_node; + xlrec.blkno = BufferGetBlockNumber(buffer); + xlrec.nToPlaceholder = 0; + + START_CRIT_SECTION(); + + /* + * Scan backwards to convert old redirection tuples to placeholder tuples, + * and identify location of last non-placeholder tuple while at it. + */ + for (i = max; + i >= FirstOffsetNumber && + (opaque->nRedirection > 0 || !hasNonPlaceholder); + i--) + { + SpGistDeadTuple dt; + + dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); + + if (dt->tupstate == SPGIST_REDIRECT && + TransactionIdPrecedes(dt->xid, OldestXmin)) + { + dt->tupstate = SPGIST_PLACEHOLDER; + Assert(opaque->nRedirection > 0); + opaque->nRedirection--; + opaque->nPlaceholder++; + + ItemPointerSetInvalid(&dt->pointer); + + itemToPlaceholder[xlrec.nToPlaceholder] = i; + xlrec.nToPlaceholder++; + + hasUpdate = true; + } + + if (dt->tupstate == SPGIST_PLACEHOLDER) + { + if (!hasNonPlaceholder) + firstPlaceholder = i; + } + else + { + hasNonPlaceholder = true; + } + } + + /* + * Any placeholder tuples at the end of page can safely be removed. We + * can't remove ones before the last non-placeholder, though, because we + * can't alter the offset numbers of non-placeholder tuples. + */ + if (firstPlaceholder != InvalidOffsetNumber) + { + /* + * We do not store this array to rdata because it's easy to recreate. 
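+ * (It is just the consecutive offset numbers from firstPlaceholder up to
+ * the last offset on the page, exactly as the loop below builds it.)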
+ */ + for (i = firstPlaceholder; i <= max; i++) + itemnos[i - firstPlaceholder] = i; + + i = max - firstPlaceholder + 1; + Assert(opaque->nPlaceholder >= i); + opaque->nPlaceholder -= i; + + /* The array is surely sorted, so can use PageIndexMultiDelete */ + PageIndexMultiDelete(page, itemnos, i); + + hasUpdate = true; + } + + xlrec.firstPlaceholder = firstPlaceholder; + + if (hasUpdate) + MarkBufferDirty(buffer); + + if (hasUpdate && RelationNeedsWAL(index)) + { + XLogRecPtr recptr; + + ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0); + ACCEPT_RDATA_DATA(itemToPlaceholder, sizeof(OffsetNumber) * xlrec.nToPlaceholder, 1); + ACCEPT_RDATA_BUFFER(buffer, 2); + + recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + END_CRIT_SECTION(); +} + +/* + * Process one page during a bulkdelete scan + */ +static void +spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) +{ + Relation index = bds->info->index; + Buffer buffer; + Page page; + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, bds->info->strategy); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page)) + { + /* + * We found an all-zero page, which could happen if the database + * crashed just after extending the file. Initialize and recycle it. + */ + SpGistInitBuffer(buffer, 0); + SpGistPageSetDeleted(page); + /* We don't bother to WAL-log this action; easy to redo */ + MarkBufferDirty(buffer); + } + else if (SpGistPageIsDeleted(page)) + { + /* nothing to do */ + } + else if (SpGistPageIsLeaf(page)) + { + if (blkno == SPGIST_HEAD_BLKNO) + { + vacuumLeafRoot(bds, index, buffer); + /* no need for vacuumRedirectAndPlaceholder */ + } + else + { + vacuumLeafPage(bds, index, buffer); + vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin); + } + } + else + { + /* inner page */ + vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin); + } + + /* + * The root page must never be deleted, nor marked as available in FSM, + * because we don't want it ever returned by a search for a place to + * put a new tuple. Otherwise, check for empty/deletable page, and + * make sure FSM knows about it. 
+ */ + if (blkno != SPGIST_HEAD_BLKNO) + { + /* If page is now empty, mark it deleted */ + if (PageIsEmpty(page) && !SpGistPageIsDeleted(page)) + { + SpGistPageSetDeleted(page); + /* We don't bother to WAL-log this action; easy to redo */ + MarkBufferDirty(buffer); + } + + if (SpGistPageIsDeleted(page)) + { + RecordFreeIndexPage(index, blkno); + bds->stats->pages_deleted++; + } + else + bds->lastFilledBlock = blkno; + } + + SpGistSetLastUsedPage(index, buffer); + + UnlockReleaseBuffer(buffer); +} + +/* + * Perform a bulkdelete scan + */ +static void +spgvacuumscan(spgBulkDeleteState *bds) +{ + Relation index = bds->info->index; + bool needLock; + BlockNumber num_pages, + blkno; + + /* Finish setting up spgBulkDeleteState */ + initSpGistState(&bds->spgstate, index); + bds->OldestXmin = GetOldestXmin(true, false); + bds->lastFilledBlock = SPGIST_HEAD_BLKNO; + + /* + * Reset counts that will be incremented during the scan; needed in case + * of multiple scans during a single VACUUM command + */ + bds->stats->estimated_count = false; + bds->stats->num_index_tuples = 0; + bds->stats->pages_deleted = 0; + + /* We can skip locking for new or temp relations */ + needLock = !RELATION_IS_LOCAL(index); + + /* + * The outer loop iterates over all index pages except the metapage, in + * physical order (we hope the kernel will cooperate in providing + * read-ahead for speed). It is critical that we visit all leaf pages, + * including ones added after we start the scan, else we might fail to + * delete some deletable tuples. See more extensive comments about + * this in btvacuumscan(). + */ + blkno = SPGIST_HEAD_BLKNO; + for (;;) + { + /* Get the current relation length */ + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(index); + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); + + /* Quit if we've scanned the whole relation */ + if (blkno >= num_pages) + break; + /* Iterate over pages, then loop back to recheck length */ + for (; blkno < num_pages; blkno++) + { + spgvacuumpage(bds, blkno); + } + } + + /* Propagate local lastUsedPage cache to metablock */ + SpGistUpdateMetaPage(index); + + /* + * Truncate index if possible + * + * XXX disabled because it's unsafe due to possible concurrent inserts. + * We'd have to rescan the pages to make sure they're still empty, and it + * doesn't seem worth it. Note that btree doesn't do this either. + */ +#ifdef NOT_USED + if (num_pages > bds->lastFilledBlock + 1) + { + BlockNumber lastBlock = num_pages - 1; + + num_pages = bds->lastFilledBlock + 1; + RelationTruncate(index, num_pages); + bds->stats->pages_removed += lastBlock - bds->lastFilledBlock; + bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock; + } +#endif + + /* Report final stats */ + bds->stats->num_pages = num_pages; + bds->stats->pages_free = bds->stats->pages_deleted; +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. 
+ */ +Datum +spgbulkdelete(PG_FUNCTION_ARGS) +{ + IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); + IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); + void *callback_state = (void *) PG_GETARG_POINTER(3); + spgBulkDeleteState bds; + + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + bds.info = info; + bds.stats = stats; + bds.callback = callback; + bds.callback_state = callback_state; + + spgvacuumscan(&bds); + + PG_RETURN_POINTER(stats); +} + +/* Dummy callback to delete no tuples during spgvacuumcleanup */ +static bool +dummy_callback(ItemPointer itemptr, void *state) +{ + return false; +} + +/* + * Post-VACUUM cleanup. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +Datum +spgvacuumcleanup(PG_FUNCTION_ARGS) +{ + IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); + Relation index = info->index; + spgBulkDeleteState bds; + + /* No-op in ANALYZE ONLY mode */ + if (info->analyze_only) + PG_RETURN_POINTER(stats); + + /* + * We don't need to scan the index if there was a preceding bulkdelete + * pass. Otherwise, make a pass that won't delete any live tuples, but + * might still accomplish useful stuff with redirect/placeholder cleanup, + * and in any case will provide stats. + */ + if (stats == NULL) + { + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + bds.info = info; + bds.stats = stats; + bds.callback = dummy_callback; + bds.callback_state = NULL; + + spgvacuumscan(&bds); + } + + /* Finally, vacuum the FSM */ + IndexFreeSpaceMapVacuum(index); + + /* + * It's quite possible for us to be fooled by concurrent page splits into + * double-counting some index tuples, so disbelieve any total that exceeds + * the underlying heap's count ... if we know that accurately. Otherwise + * this might just make matters worse. + */ + if (!info->estimated_count) + { + if (stats->num_index_tuples > info->num_heap_tuples) + stats->num_index_tuples = info->num_heap_tuples; + } + + PG_RETURN_POINTER(stats); +} diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c new file mode 100644 index 00000000000..e508f09703d --- /dev/null +++ b/src/backend/access/spgist/spgxlog.c @@ -0,0 +1,1070 @@ +/*------------------------------------------------------------------------- + * + * spgxlog.c + * WAL replay logic for SP-GiST + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgxlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/spgist_private.h" +#include "access/xlogutils.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" + + +static MemoryContext opCtx; /* working memory for operations */ + + +/* + * Prepare a dummy SpGistState, with just the minimum info needed for replay. + * + * At present, all we need is enough info to support spgFormDeadTuple(), + * plus the isBuild flag. 
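 *
 * The insert side is expected to capture that info in each WAL record's
 * stateSrc field via the STORE_STATE macro in spgist_private.h; roughly,
 * for a record type that carries a stateSrc (sketch only, "state" being
 * the active SpGistState pointer):
 *
 *      spgxlogMoveLeafs xlrec;
 *
 *      STORE_STATE(state, xlrec.stateSrc);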
+ */ +static void +fillFakeState(SpGistState *state, spgxlogState stateSrc) +{ + memset(state, 0, sizeof(*state)); + + state->myXid = stateSrc.myXid; + state->isBuild = stateSrc.isBuild; + state->deadTupleStorage = palloc0(SGDTSIZE); +} + +/* + * Add a leaf tuple, or replace an existing placeholder tuple. This is used + * to replay SpGistPageAddNewItem() operations. If the offset points at an + * existing tuple, it had better be a placeholder tuple. + */ +static void +addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset) +{ + if (offset <= PageGetMaxOffsetNumber(page)) + { + SpGistDeadTuple dt = (SpGistDeadTuple) PageGetItem(page, + PageGetItemId(page, offset)); + + if (dt->tupstate != SPGIST_PLACEHOLDER) + elog(ERROR, "SPGiST tuple to be replaced is not a placeholder"); + + Assert(SpGistPageGetOpaque(page)->nPlaceholder > 0); + SpGistPageGetOpaque(page)->nPlaceholder--; + + PageIndexTupleDelete(page, offset); + } + + Assert(offset <= PageGetMaxOffsetNumber(page) + 1); + + if (PageAddItem(page, tuple, size, offset, false, false) != offset) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + size); +} + +static void +spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) +{ + RelFileNode *node = (RelFileNode *) XLogRecGetData(record); + Buffer buffer; + Page page; + + buffer = XLogReadBuffer(*node, SPGIST_METAPAGE_BLKNO, true); + Assert(BufferIsValid(buffer)); + page = (Page) BufferGetPage(buffer); + SpGistInitMetapage(page); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + + buffer = XLogReadBuffer(*node, SPGIST_HEAD_BLKNO, true); + Assert(BufferIsValid(buffer)); + SpGistInitBuffer(buffer, SPGIST_LEAF); + page = (Page) BufferGetPage(buffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} + +static void +spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr; + SpGistLeafTuple leafTuple; + Buffer buffer; + Page page; + + /* we assume this is adequately aligned */ + ptr += sizeof(spgxlogAddLeaf); + leafTuple = (SpGistLeafTuple) ptr; + + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoLeaf, + xldata->newPage); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + + if (xldata->newPage) + SpGistInitBuffer(buffer, SPGIST_LEAF); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + /* insert new tuple */ + if (xldata->offnumLeaf != xldata->offnumHeadLeaf) + { + /* normal cases, tuple was added by SpGistPageAddNewItem */ + addOrReplaceTuple(page, (Item) leafTuple, leafTuple->size, + xldata->offnumLeaf); + + /* update head tuple's chain link if needed */ + if (xldata->offnumHeadLeaf != InvalidOffsetNumber) + { + SpGistLeafTuple head; + + head = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumHeadLeaf)); + Assert(head->nextOffset == leafTuple->nextOffset); + head->nextOffset = xldata->offnumLeaf; + } + } + else + { + /* replacing a DEAD tuple */ + PageIndexTupleDelete(page, xldata->offnumLeaf); + if (PageAddItem(page, + (Item) leafTuple, leafTuple->size, + xldata->offnumLeaf, false, false) != xldata->offnumLeaf) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + leafTuple->size); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + + /* update parent 
downlink if necessary */ + if (xldata->blknoParent != InvalidBlockNumber && + !(record->xl_info & XLR_BKP_BLOCK_2)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + SpGistInnerTuple tuple; + + tuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + updateNodeLink(tuple, xldata->nodeI, + xldata->blknoLeaf, xldata->offnumLeaf); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } +} + +static void +spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr; + SpGistState state; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + int nInsert; + Buffer buffer; + Page page; + + fillFakeState(&state, xldata->stateSrc); + + nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1; + + ptr += MAXALIGN(sizeof(spgxlogMoveLeafs)); + toDelete = (OffsetNumber *) ptr; + ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nMoves); + toInsert = (OffsetNumber *) ptr; + ptr += MAXALIGN(sizeof(OffsetNumber) * nInsert); + + /* now ptr points to the list of leaf tuples */ + + /* Insert tuples on the dest page (do first, so redirect is valid) */ + if (!(record->xl_info & XLR_BKP_BLOCK_2)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoDst, + xldata->newPage); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + + if (xldata->newPage) + SpGistInitBuffer(buffer, SPGIST_LEAF); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + int i; + + for (i = 0; i < nInsert; i++) + { + SpGistLeafTuple lt = (SpGistLeafTuple) ptr; + + addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]); + ptr += lt->size; + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + + /* Delete tuples from the source page, inserting a redirection pointer */ + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves, + state.isBuild ? 
SPGIST_PLACEHOLDER : SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + xldata->blknoDst, + toInsert[nInsert - 1]); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + + /* And update the parent downlink */ + if (!(record->xl_info & XLR_BKP_BLOCK_3)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + SpGistInnerTuple tuple; + + tuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + updateNodeLink(tuple, xldata->nodeI, + xldata->blknoDst, toInsert[nInsert - 1]); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } +} + +static void +spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogAddNode *xldata = (spgxlogAddNode *) ptr; + SpGistInnerTuple innerTuple; + SpGistState state; + Buffer buffer; + Page page; + int bbi; + + /* we assume this is adequately aligned */ + ptr += sizeof(spgxlogAddNode); + innerTuple = (SpGistInnerTuple) ptr; + + fillFakeState(&state, xldata->stateSrc); + + if (xldata->blknoNew == InvalidBlockNumber) + { + /* update in place */ + Assert(xldata->blknoParent == InvalidBlockNumber); + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + PageIndexTupleDelete(page, xldata->offnum); + if (PageAddItem(page, (Item) innerTuple, innerTuple->size, + xldata->offnum, + false, false) != xldata->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + innerTuple->size); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + } + else + { + /* Install new tuple first so redirect is valid */ + if (!(record->xl_info & XLR_BKP_BLOCK_2)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoNew, + xldata->newPage); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + + if (xldata->newPage) + SpGistInitBuffer(buffer, 0); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + addOrReplaceTuple(page, (Item) innerTuple, + innerTuple->size, xldata->offnumNew); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + + /* Delete old tuple, replacing it with redirect or placeholder tuple */ + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + SpGistDeadTuple dt; + + if (state.isBuild) + dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + else + dt = spgFormDeadTuple(&state, SPGIST_REDIRECT, + xldata->blknoNew, + xldata->offnumNew); + + PageIndexTupleDelete(page, xldata->offnum); + if (PageAddItem(page, (Item) dt, dt->size, + xldata->offnum, + false, false) != xldata->offnum) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + dt->size); + + if (state.isBuild) + SpGistPageGetOpaque(page)->nPlaceholder++; + else + SpGistPageGetOpaque(page)->nRedirection++; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + 
UnlockReleaseBuffer(buffer); + } + } + + /* + * Update parent downlink. Since parent could be in either of the + * previous two buffers, it's a bit tricky to determine which BKP bit + * applies. + */ + if (xldata->blknoParent == xldata->blkno) + bbi = 0; + else if (xldata->blknoParent == xldata->blknoNew) + bbi = 1; + else + bbi = 2; + + if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + SpGistInnerTuple innerTuple; + + innerTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + + updateNodeLink(innerTuple, xldata->nodeI, + xldata->blknoNew, xldata->offnumNew); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + } +} + +static void +spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr; + SpGistInnerTuple prefixTuple; + SpGistInnerTuple postfixTuple; + Buffer buffer; + Page page; + + /* we assume this is adequately aligned */ + ptr += sizeof(spgxlogSplitTuple); + prefixTuple = (SpGistInnerTuple) ptr; + ptr += prefixTuple->size; + postfixTuple = (SpGistInnerTuple) ptr; + + /* insert postfix tuple first to avoid dangling link */ + if (xldata->blknoPostfix != xldata->blknoPrefix && + !(record->xl_info & XLR_BKP_BLOCK_2)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix, + xldata->newPage); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + + if (xldata->newPage) + SpGistInitBuffer(buffer, 0); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + addOrReplaceTuple(page, (Item) postfixTuple, + postfixTuple->size, xldata->offnumPostfix); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + + /* now handle the original page */ + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blknoPrefix, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + PageIndexTupleDelete(page, xldata->offnumPrefix); + if (PageAddItem(page, (Item) prefixTuple, prefixTuple->size, + xldata->offnumPrefix, false, false) != xldata->offnumPrefix) + elog(ERROR, "failed to add item of size %u to SPGiST index page", + prefixTuple->size); + + if (xldata->blknoPostfix == xldata->blknoPrefix) + addOrReplaceTuple(page, (Item) postfixTuple, + postfixTuple->size, + xldata->offnumPostfix); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } +} + +static void +spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr; + SpGistInnerTuple innerTuple; + SpGistState state; + OffsetNumber *toDelete; + OffsetNumber *toInsert; + uint8 *leafPageSelect; + Buffer srcBuffer; + Buffer destBuffer; + Page page; + int bbi; + int i; + + fillFakeState(&state, xldata->stateSrc); + + ptr += MAXALIGN(sizeof(spgxlogPickSplit)); + innerTuple = (SpGistInnerTuple) ptr; + ptr += innerTuple->size; + toDelete = (OffsetNumber *) ptr; + ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nDelete); + toInsert = (OffsetNumber *) ptr; + ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nInsert); + 
leafPageSelect = (uint8 *) ptr; + ptr += MAXALIGN(sizeof(uint8) * xldata->nInsert); + + /* now ptr points to the list of leaf tuples */ + + /* + * It's a bit tricky to identify which pages have been handled as + * full-page images, so we explicitly count each referenced buffer. + */ + bbi = 0; + + if (xldata->blknoSrc == SPGIST_HEAD_BLKNO) + { + /* when splitting root, we touch it only in the guise of new inner */ + srcBuffer = InvalidBuffer; + } + else if (xldata->initSrc) + { + /* just re-init the source page */ + srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true); + Assert(BufferIsValid(srcBuffer)); + page = (Page) BufferGetPage(srcBuffer); + + SpGistInitBuffer(srcBuffer, SPGIST_LEAF); + /* don't update LSN etc till we're done with it */ + } + else + { + /* delete the specified tuples from source page */ + if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + { + srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false); + if (BufferIsValid(srcBuffer)) + { + page = BufferGetPage(srcBuffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + /* + * We have it a bit easier here than in doPickSplit(), + * because we know the inner tuple's location already, + * so we can inject the correct redirection tuple now. + */ + if (!state.isBuild) + spgPageIndexMultiDelete(&state, page, + toDelete, xldata->nDelete, + SPGIST_REDIRECT, + SPGIST_PLACEHOLDER, + xldata->blknoInner, + xldata->offnumInner); + else + spgPageIndexMultiDelete(&state, page, + toDelete, xldata->nDelete, + SPGIST_PLACEHOLDER, + SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + + /* don't update LSN etc till we're done with it */ + } + } + } + else + srcBuffer = InvalidBuffer; + bbi++; + } + + /* try to access dest page if any */ + if (xldata->blknoDest == InvalidBlockNumber) + { + destBuffer = InvalidBuffer; + } + else if (xldata->initDest) + { + /* just re-init the dest page */ + destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true); + Assert(BufferIsValid(destBuffer)); + page = (Page) BufferGetPage(destBuffer); + + SpGistInitBuffer(destBuffer, SPGIST_LEAF); + /* don't update LSN etc till we're done with it */ + } + else + { + if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false); + else + destBuffer = InvalidBuffer; + bbi++; + } + + /* restore leaf tuples to src and/or dest page */ + for (i = 0; i < xldata->nInsert; i++) + { + SpGistLeafTuple lt = (SpGistLeafTuple) ptr; + Buffer leafBuffer; + + ptr += lt->size; + + leafBuffer = leafPageSelect[i] ? 
destBuffer : srcBuffer; + if (!BufferIsValid(leafBuffer)) + continue; /* no need to touch this page */ + page = BufferGetPage(leafBuffer); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]); + } + } + + /* Now update src and dest page LSNs */ + if (BufferIsValid(srcBuffer)) + { + page = BufferGetPage(srcBuffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(srcBuffer); + } + UnlockReleaseBuffer(srcBuffer); + } + if (BufferIsValid(destBuffer)) + { + page = BufferGetPage(destBuffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(destBuffer); + } + UnlockReleaseBuffer(destBuffer); + } + + /* restore new inner tuple */ + if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + { + Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoInner, + xldata->initInner); + + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + + if (xldata->initInner) + SpGistInitBuffer(buffer, 0); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size, + xldata->offnumInner); + + /* if inner is also parent, update link while we're here */ + if (xldata->blknoInner == xldata->blknoParent) + { + SpGistInnerTuple parent; + + parent = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + updateNodeLink(parent, xldata->nodeI, + xldata->blknoInner, xldata->offnumInner); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + bbi++; + + /* update parent downlink, unless we did it above */ + if (xldata->blknoParent == InvalidBlockNumber) + { + /* no parent cause we split the root */ + Assert(xldata->blknoInner == SPGIST_HEAD_BLKNO); + } + else if (xldata->blknoInner != xldata->blknoParent) + { + if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi))) + { + Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false); + + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + + if (!XLByteLE(lsn, PageGetLSN(page))) + { + SpGistInnerTuple parent; + + parent = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, xldata->offnumParent)); + updateNodeLink(parent, xldata->nodeI, + xldata->blknoInner, xldata->offnumInner); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + } +} + +static void +spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr; + OffsetNumber *toDead; + OffsetNumber *toPlaceholder; + OffsetNumber *moveSrc; + OffsetNumber *moveDest; + OffsetNumber *chainSrc; + OffsetNumber *chainDest; + SpGistState state; + Buffer buffer; + Page page; + int i; + + fillFakeState(&state, xldata->stateSrc); + + ptr += sizeof(spgxlogVacuumLeaf); + toDead = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nDead; + toPlaceholder = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nPlaceholder; + moveSrc = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nMove; + moveDest = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nMove; + chainSrc = (OffsetNumber *) ptr; + ptr += sizeof(OffsetNumber) * xldata->nChain; + chainDest = (OffsetNumber *) ptr; + + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = 
XLogReadBuffer(xldata->node, xldata->blkno, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + spgPageIndexMultiDelete(&state, page, + toDead, xldata->nDead, + SPGIST_DEAD, SPGIST_DEAD, + InvalidBlockNumber, + InvalidOffsetNumber); + + spgPageIndexMultiDelete(&state, page, + toPlaceholder, xldata->nPlaceholder, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + + /* see comments in vacuumLeafPage() */ + for (i = 0; i < xldata->nMove; i++) + { + ItemId idSrc = PageGetItemId(page, moveSrc[i]); + ItemId idDest = PageGetItemId(page, moveDest[i]); + ItemIdData tmp; + + tmp = *idSrc; + *idSrc = *idDest; + *idDest = tmp; + } + + spgPageIndexMultiDelete(&state, page, + moveSrc, xldata->nMove, + SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, + InvalidBlockNumber, + InvalidOffsetNumber); + + for (i = 0; i < xldata->nChain; i++) + { + SpGistLeafTuple lt; + + lt = (SpGistLeafTuple) PageGetItem(page, + PageGetItemId(page, chainSrc[i])); + Assert(lt->tupstate == SPGIST_LIVE); + lt->nextOffset = chainDest[i]; + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } +} + +static void +spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) ptr; + OffsetNumber *toDelete; + Buffer buffer; + Page page; + + ptr += sizeof(spgxlogVacuumRoot); + toDelete = (OffsetNumber *) ptr; + + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, SPGIST_HEAD_BLKNO, false); + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + /* The tuple numbers are in order */ + PageIndexMultiDelete(page, toDelete, xldata->nDelete); + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } +} + +static void +spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record) +{ + char *ptr = XLogRecGetData(record); + spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr; + OffsetNumber *itemToPlaceholder; + Buffer buffer; + Page page; + + ptr += sizeof(spgxlogVacuumRedirect); + itemToPlaceholder = (OffsetNumber *) ptr; + + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); + + if (BufferIsValid(buffer)) + { + page = BufferGetPage(buffer); + if (!XLByteLE(lsn, PageGetLSN(page))) + { + SpGistPageOpaque opaque = SpGistPageGetOpaque(page); + int i; + + /* Convert redirect pointers to plain placeholders */ + for (i = 0; i < xldata->nToPlaceholder; i++) + { + SpGistDeadTuple dt; + + dt = (SpGistDeadTuple) PageGetItem(page, + PageGetItemId(page, itemToPlaceholder[i])); + Assert(dt->tupstate == SPGIST_REDIRECT); + dt->tupstate = SPGIST_PLACEHOLDER; + ItemPointerSetInvalid(&dt->pointer); + } + + Assert(opaque->nRedirection >= xldata->nToPlaceholder); + opaque->nRedirection -= xldata->nToPlaceholder; + opaque->nPlaceholder += xldata->nToPlaceholder; + + /* Remove placeholder tuples at end of page */ + if (xldata->firstPlaceholder != InvalidOffsetNumber) + { + int max = PageGetMaxOffsetNumber(page); + OffsetNumber *toDelete; + + toDelete = palloc(sizeof(OffsetNumber) * max); + + for (i = xldata->firstPlaceholder; i <= max; i++) + toDelete[i - xldata->firstPlaceholder] = i; + + i = max - xldata->firstPlaceholder + 1; + Assert(opaque->nPlaceholder >= i); + 
opaque->nPlaceholder -= i; + + /* The array is sorted, so can use PageIndexMultiDelete */ + PageIndexMultiDelete(page, toDelete, i); + + pfree(toDelete); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + + UnlockReleaseBuffer(buffer); + } + } +} + +void +spg_redo(XLogRecPtr lsn, XLogRecord *record) +{ + uint8 info = record->xl_info & ~XLR_INFO_MASK; + MemoryContext oldCxt; + + /* + * SP-GiST indexes do not require any conflict processing. NB: If we ever + * implement a similar optimization as we have in b-tree, and remove + * killed tuples outside VACUUM, we'll need to handle that here. + */ + RestoreBkpBlocks(lsn, record, false); + + oldCxt = MemoryContextSwitchTo(opCtx); + switch (info) + { + case XLOG_SPGIST_CREATE_INDEX: + spgRedoCreateIndex(lsn, record); + break; + case XLOG_SPGIST_ADD_LEAF: + spgRedoAddLeaf(lsn, record); + break; + case XLOG_SPGIST_MOVE_LEAFS: + spgRedoMoveLeafs(lsn, record); + break; + case XLOG_SPGIST_ADD_NODE: + spgRedoAddNode(lsn, record); + break; + case XLOG_SPGIST_SPLIT_TUPLE: + spgRedoSplitTuple(lsn, record); + break; + case XLOG_SPGIST_PICKSPLIT: + spgRedoPickSplit(lsn, record); + break; + case XLOG_SPGIST_VACUUM_LEAF: + spgRedoVacuumLeaf(lsn, record); + break; + case XLOG_SPGIST_VACUUM_ROOT: + spgRedoVacuumRoot(lsn, record); + break; + case XLOG_SPGIST_VACUUM_REDIRECT: + spgRedoVacuumRedirect(lsn, record); + break; + default: + elog(PANIC, "spg_redo: unknown op code %u", info); + } + + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(opCtx); +} + +static void +out_target(StringInfo buf, RelFileNode node) +{ + appendStringInfo(buf, "rel %u/%u/%u ", + node.spcNode, node.dbNode, node.relNode); +} + +void +spg_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + uint8 info = xl_info & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_SPGIST_CREATE_INDEX: + appendStringInfo(buf, "create_index: rel %u/%u/%u", + ((RelFileNode *) rec)->spcNode, + ((RelFileNode *) rec)->dbNode, + ((RelFileNode *) rec)->relNode); + break; + case XLOG_SPGIST_ADD_LEAF: + out_target(buf, ((spgxlogAddLeaf *) rec)->node); + appendStringInfo(buf, "add leaf to page: %u", + ((spgxlogAddLeaf *) rec)->blknoLeaf); + break; + case XLOG_SPGIST_MOVE_LEAFS: + out_target(buf, ((spgxlogMoveLeafs *) rec)->node); + appendStringInfo(buf, "move %u leafs from page %u to page %u", + ((spgxlogMoveLeafs *) rec)->nMoves, + ((spgxlogMoveLeafs *) rec)->blknoSrc, + ((spgxlogMoveLeafs *) rec)->blknoDst); + break; + case XLOG_SPGIST_ADD_NODE: + out_target(buf, ((spgxlogAddNode *) rec)->node); + appendStringInfo(buf, "add node to %u:%u", + ((spgxlogAddNode *) rec)->blkno, + ((spgxlogAddNode *) rec)->offnum); + break; + case XLOG_SPGIST_SPLIT_TUPLE: + out_target(buf, ((spgxlogSplitTuple *) rec)->node); + appendStringInfo(buf, "split node %u:%u to %u:%u", + ((spgxlogSplitTuple *) rec)->blknoPrefix, + ((spgxlogSplitTuple *) rec)->offnumPrefix, + ((spgxlogSplitTuple *) rec)->blknoPostfix, + ((spgxlogSplitTuple *) rec)->offnumPostfix); + break; + case XLOG_SPGIST_PICKSPLIT: + out_target(buf, ((spgxlogPickSplit *) rec)->node); + appendStringInfo(buf, "split leaf page"); + break; + case XLOG_SPGIST_VACUUM_LEAF: + out_target(buf, ((spgxlogVacuumLeaf *) rec)->node); + appendStringInfo(buf, "vacuum leaf tuples on page %u", + ((spgxlogVacuumLeaf *) rec)->blkno); + break; + case XLOG_SPGIST_VACUUM_ROOT: + out_target(buf, ((spgxlogVacuumRoot *) rec)->node); + appendStringInfo(buf, "vacuum leaf tuples on root page"); + break; + case XLOG_SPGIST_VACUUM_REDIRECT: + 
out_target(buf, ((spgxlogVacuumRedirect *) rec)->node); + appendStringInfo(buf, "vacuum redirect tuples on page %u", + ((spgxlogVacuumRedirect *) rec)->blkno); + break; + default: + appendStringInfo(buf, "unknown spgist op code %u", info); + break; + } +} + +void +spg_xlog_startup(void) +{ + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); +} + +void +spg_xlog_cleanup(void) +{ + MemoryContextDelete(opCtx); + opCtx = NULL; +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 6a0a2d9b477..ed8754e6f22 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -14,6 +14,7 @@ #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" +#include "access/spgist.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage.h" @@ -40,5 +41,6 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Hash", hash_redo, hash_desc, NULL, NULL, NULL}, {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint}, {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL}, - {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL} + {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}, + {"SPGist", spg_redo, spg_desc, spg_xlog_startup, spg_xlog_cleanup, NULL} }; diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index f5660b2c3cd..d06809e7675 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6555,6 +6555,26 @@ gistcostestimate(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +Datum +spgcostestimate(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1); + List *indexQuals = (List *) PG_GETARG_POINTER(2); + List *indexOrderBys = (List *) PG_GETARG_POINTER(3); + RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(4); + Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(5); + Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(6); + Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(7); + double *indexCorrelation = (double *) PG_GETARG_POINTER(8); + + genericcostestimate(root, index, indexQuals, indexOrderBys, outer_rel, 0.0, + indexStartupCost, indexTotalCost, + indexSelectivity, indexCorrelation); + + PG_RETURN_VOID(); +} + /* Find the index column matching "op"; return its index, or -1 if no match */ static int find_index_column(Node *op, IndexOptInfo *index) diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 290f0edaefa..ee5d71e4d71 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -24,6 +24,10 @@ * Note: GIN does not include a page ID word as do the other index types. * This is OK because the opaque data is only 8 bytes and so can be reliably * distinguished by size. Revisit this if the size ever increases. + * Further note: as of 9.2, SP-GiST also uses 8-byte special space. This is + * still OK, as long as GIN isn't using all of the high-order bits in its + * flags word, because that way the flags word cannot match the page ID used + * by SP-GiST. 
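 *
 * For instance, an inspection tool that has to tell the two apart could
 * read the last two bytes of the block and compare them with
 * SPGIST_PAGE_ID (0xFF82, defined in spgist_private.h); a sketch, under
 * the assumption that "page" points at a full BLCKSZ-sized block:
 *
 *      static bool
 *      page_is_spgist(Page page)
 *      {
 *          uint16      page_id;
 *
 *          memcpy(&page_id, (char *) page + BLCKSZ - sizeof(uint16),
 *                 sizeof(uint16));
 *          return page_id == SPGIST_PAGE_ID;
 *      }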
*/ typedef struct GinPageOpaqueData { diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index 14f50345bbf..10b2f9ea4db 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -42,8 +42,9 @@ typedef enum relopt_kind RELOPT_KIND_GIST = (1 << 5), RELOPT_KIND_ATTRIBUTE = (1 << 6), RELOPT_KIND_TABLESPACE = (1 << 7), + RELOPT_KIND_SPGIST = (1 << 8), /* if you add a new kind, make sure you update "last_default" too */ - RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_TABLESPACE, + RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_SPGIST, /* some compilers treat enums as signed ints, so we can't use 1 << 31 */ RELOPT_KIND_MAX = (1 << 30) } relopt_kind; diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index 83abba359a5..e4844fe96c9 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -32,6 +32,8 @@ typedef uint8 RmgrId; #define RM_GIN_ID 13 #define RM_GIST_ID 14 #define RM_SEQ_ID 15 -#define RM_MAX_ID RM_SEQ_ID +#define RM_SPGIST_ID 16 + +#define RM_MAX_ID RM_SPGIST_ID #endif /* RMGR_H */ diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h new file mode 100644 index 00000000000..aa655a31402 --- /dev/null +++ b/src/include/access/spgist.h @@ -0,0 +1,199 @@ +/*------------------------------------------------------------------------- + * + * spgist.h + * Public header file for SP-GiST access method. + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spgist.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPGIST_H +#define SPGIST_H + +#include "access/skey.h" +#include "access/xlog.h" +#include "fmgr.h" + + +/* reloption parameters */ +#define SPGIST_MIN_FILLFACTOR 10 +#define SPGIST_DEFAULT_FILLFACTOR 80 + +/* SPGiST opclass support function numbers */ +#define SPGIST_CONFIG_PROC 1 +#define SPGIST_CHOOSE_PROC 2 +#define SPGIST_PICKSPLIT_PROC 3 +#define SPGIST_INNER_CONSISTENT_PROC 4 +#define SPGIST_LEAF_CONSISTENT_PROC 5 +#define SPGISTNProc 5 + +/* + * Argument structs for spg_config method + */ +typedef struct spgConfigIn +{ + Oid attType; /* Data type to be indexed */ +} spgConfigIn; + +typedef struct spgConfigOut +{ + Oid prefixType; /* Data type of inner-tuple prefixes */ + Oid labelType; /* Data type of inner-tuple node labels */ + bool longValuesOK; /* Opclass can cope with values > 1 page */ +} spgConfigOut; + +/* + * Argument structs for spg_choose method + */ +typedef struct spgChooseIn +{ + Datum datum; /* original datum to be indexed */ + Datum leafDatum; /* current datum to be stored at leaf */ + int level; /* current level (counting from zero) */ + + /* Data from current inner tuple */ + bool allTheSame; /* tuple is marked all-the-same? */ + bool hasPrefix; /* tuple has a prefix? 
*/ + Datum prefixDatum; /* if so, the prefix value */ + int nNodes; /* number of nodes in the inner tuple */ + Datum *nodeLabels; /* node label values (NULL if none) */ +} spgChooseIn; + +typedef enum spgChooseResultType +{ + spgMatchNode = 1, /* descend into existing node */ + spgAddNode, /* add a node to the inner tuple */ + spgSplitTuple /* split inner tuple (change its prefix) */ +} spgChooseResultType; + +typedef struct spgChooseOut +{ + spgChooseResultType resultType; /* action code, see above */ + union + { + struct /* results for spgMatchNode */ + { + int nodeN; /* descend to this node (index from 0) */ + int levelAdd; /* increment level by this much */ + Datum restDatum; /* new leaf datum */ + } matchNode; + struct /* results for spgAddNode */ + { + Datum nodeLabel; /* new node's label */ + int nodeN; /* where to insert it (index from 0) */ + } addNode; + struct /* results for spgSplitTuple */ + { + /* Info to form new inner tuple with one node */ + bool prefixHasPrefix; /* tuple should have a prefix? */ + Datum prefixPrefixDatum; /* if so, its value */ + Datum nodeLabel; /* node's label */ + + /* Info to form new lower-level inner tuple with all old nodes */ + bool postfixHasPrefix; /* tuple should have a prefix? */ + Datum postfixPrefixDatum; /* if so, its value */ + } splitTuple; + } result; +} spgChooseOut; + +/* + * Argument structs for spg_picksplit method + */ +typedef struct spgPickSplitIn +{ + int nTuples; /* number of leaf tuples */ + Datum *datums; /* their datums (array of length nTuples) */ + int level; /* current level (counting from zero) */ +} spgPickSplitIn; + +typedef struct spgPickSplitOut +{ + bool hasPrefix; /* new inner tuple should have a prefix? */ + Datum prefixDatum; /* if so, its value */ + + int nNodes; /* number of nodes for new inner tuple */ + Datum *nodeLabels; /* their labels (or NULL for no labels) */ + + int *mapTuplesToNodes; /* node index for each leaf tuple */ + Datum *leafTupleDatums; /* datum to store in each new leaf tuple */ +} spgPickSplitOut; + +/* + * Argument structs for spg_inner_consistent method + */ +typedef struct spgInnerConsistentIn +{ + StrategyNumber strategy; /* operator strategy number */ + Datum query; /* operator's RHS value */ + + Datum reconstructedValue; /* value reconstructed at parent */ + int level; /* current level (counting from zero) */ + + /* Data from current inner tuple */ + bool allTheSame; /* tuple is marked all-the-same? */ + bool hasPrefix; /* tuple has a prefix? 
*/ + Datum prefixDatum; /* if so, the prefix value */ + int nNodes; /* number of nodes in the inner tuple */ + Datum *nodeLabels; /* node label values (NULL if none) */ +} spgInnerConsistentIn; + +typedef struct spgInnerConsistentOut +{ + int nNodes; /* number of child nodes to be visited */ + int *nodeNumbers; /* their indexes in the node array */ + int *levelAdds; /* increment level by this much for each */ + Datum *reconstructedValues; /* associated reconstructed values */ +} spgInnerConsistentOut; + +/* + * Argument structs for spg_leaf_consistent method + */ +typedef struct spgLeafConsistentIn +{ + StrategyNumber strategy; /* operator strategy number */ + Datum query; /* operator's RHS value */ + + Datum reconstructedValue; /* value reconstructed at parent */ + int level; /* current level (counting from zero) */ + + Datum leafDatum; /* datum in leaf tuple */ +} spgLeafConsistentIn; + +typedef struct spgLeafConsistentOut +{ + bool recheck; /* set true if operator must be rechecked */ +} spgLeafConsistentOut; + + +/* spginsert.c */ +extern Datum spgbuild(PG_FUNCTION_ARGS); +extern Datum spgbuildempty(PG_FUNCTION_ARGS); +extern Datum spginsert(PG_FUNCTION_ARGS); + +/* spgscan.c */ +extern Datum spgbeginscan(PG_FUNCTION_ARGS); +extern Datum spgendscan(PG_FUNCTION_ARGS); +extern Datum spgrescan(PG_FUNCTION_ARGS); +extern Datum spgmarkpos(PG_FUNCTION_ARGS); +extern Datum spgrestrpos(PG_FUNCTION_ARGS); +extern Datum spggetbitmap(PG_FUNCTION_ARGS); +extern Datum spggettuple(PG_FUNCTION_ARGS); + +/* spgutils.c */ +extern Datum spgoptions(PG_FUNCTION_ARGS); + +/* spgvacuum.c */ +extern Datum spgbulkdelete(PG_FUNCTION_ARGS); +extern Datum spgvacuumcleanup(PG_FUNCTION_ARGS); + +/* spgxlog.c */ +extern void spg_redo(XLogRecPtr lsn, XLogRecord *record); +extern void spg_desc(StringInfo buf, uint8 xl_info, char *rec); +extern void spg_xlog_startup(void); +extern void spg_xlog_cleanup(void); + +#endif /* SPGIST_H */ diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h new file mode 100644 index 00000000000..5c57799f09c --- /dev/null +++ b/src/include/access/spgist_private.h @@ -0,0 +1,609 @@ +/*------------------------------------------------------------------------- + * + * spgist_private.h + * Private declarations for SP-GiST access method. + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/spgist_private.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPGIST_PRIVATE_H +#define SPGIST_PRIVATE_H + +#include "access/itup.h" +#include "access/spgist.h" +#include "nodes/tidbitmap.h" +#include "utils/rel.h" + + +/* Page numbers of fixed-location pages */ +#define SPGIST_METAPAGE_BLKNO (0) +#define SPGIST_HEAD_BLKNO (1) + +/* + * Contents of page special space on SPGiST index pages + */ +typedef struct SpGistPageOpaqueData +{ + uint16 flags; /* see bit definitions below */ + uint16 nRedirection; /* number of redirection tuples on page */ + uint16 nPlaceholder; /* number of placeholder tuples on page */ + /* note there's no count of either LIVE or DEAD tuples ... 
*/ + uint16 spgist_page_id; /* for identification of SP-GiST indexes */ +} SpGistPageOpaqueData; + +typedef SpGistPageOpaqueData *SpGistPageOpaque; + +/* Flag bits in page special space */ +#define SPGIST_META (1<<0) +#define SPGIST_DELETED (1<<1) +#define SPGIST_LEAF (1<<2) + +#define SpGistPageGetOpaque(page) ((SpGistPageOpaque) PageGetSpecialPointer(page)) +#define SpGistPageIsMeta(page) (SpGistPageGetOpaque(page)->flags & SPGIST_META) +#define SpGistPageIsDeleted(page) (SpGistPageGetOpaque(page)->flags & SPGIST_DELETED) +#define SpGistPageSetDeleted(page) (SpGistPageGetOpaque(page)->flags |= SPGIST_DELETED) +#define SpGistPageSetNonDeleted(page) (SpGistPageGetOpaque(page)->flags &= ~SPGIST_DELETED) +#define SpGistPageIsLeaf(page) (SpGistPageGetOpaque(page)->flags & SPGIST_LEAF) +#define SpGistPageSetLeaf(page) (SpGistPageGetOpaque(page)->flags |= SPGIST_LEAF) +#define SpGistPageSetInner(page) (SpGistPageGetOpaque(page)->flags &= ~SPGIST_LEAF) + +/* + * The page ID is for the convenience of pg_filedump and similar utilities, + * which otherwise would have a hard time telling pages of different index + * types apart. It should be the last 2 bytes on the page. This is more or + * less "free" due to alignment considerations. + */ +#define SPGIST_PAGE_ID 0xFF82 + +/* + * Each backend keeps a cache of last-used page info in its index->rd_amcache + * area. This is initialized from, and occasionally written back to, + * shared storage in the index metapage. + */ +typedef struct SpGistLastUsedPage +{ + BlockNumber blkno; /* block number of described page */ + int freeSpace; /* its free space (could be obsolete!) */ +} SpGistLastUsedPage; + +typedef struct SpGistCache +{ + SpGistLastUsedPage innerPage[3]; /* one per triple-parity group */ + SpGistLastUsedPage leafPage; +} SpGistCache; + +/* + * metapage + */ +typedef struct SpGistMetaPageData +{ + uint32 magicNumber; /* for identity cross-check */ + SpGistCache lastUsedPages; /* shared storage of last-used info */ +} SpGistMetaPageData; + +#define SPGIST_MAGIC_NUMBER (0xBA0BABED) + +#define SpGistPageGetMeta(p) \ + ((SpGistMetaPageData *) PageGetContents(p)) + +/* + * Private state of index AM. SpGistState is common to both insert and + * search code; SpGistScanOpaque is for searches only. 
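 *
 * As a rough usage sketch (heapPtr and datum stand for the heap TID and
 * the value being indexed), insert-side callers set up an SpGistState
 * once per operation and hand it to the work-horse routines declared at
 * the bottom of this file:
 *
 *      SpGistState state;
 *
 *      initSpGistState(&state, index);
 *      spgdoinsert(index, &state, heapPtr, datum);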
+ */ + +/* Per-datatype info needed in SpGistState */ +typedef struct SpGistTypeDesc +{ + Oid type; + bool attbyval; + int16 attlen; +} SpGistTypeDesc; + +typedef struct SpGistState +{ + spgConfigOut config; /* filled in by opclass config method */ + + SpGistTypeDesc attType; /* type of input data and leaf values */ + SpGistTypeDesc attPrefixType; /* type of inner-tuple prefix values */ + SpGistTypeDesc attLabelType; /* type of node label values */ + + /* lookup data for the opclass support functions, except config */ + FmgrInfo chooseFn; + FmgrInfo picksplitFn; + FmgrInfo innerConsistentFn; + FmgrInfo leafConsistentFn; + + char *deadTupleStorage; /* workspace for spgFormDeadTuple */ + + TransactionId myXid; /* XID to use when creating a redirect tuple */ + bool isBuild; /* true if doing index build */ +} SpGistState; + +/* + * Private state of an index scan + */ +typedef struct SpGistScanOpaqueData +{ + SpGistState state; /* see above */ + MemoryContext tempCxt; /* short-lived memory context */ + + /* Index quals for scan (copied from IndexScanDesc for convenience) */ + int numberOfKeys; /* number of index qualifier conditions */ + ScanKey keyData; /* array of index qualifier descriptors */ + + /* Stack of yet-to-be-visited pages */ + List *scanStack; /* List of ScanStackEntrys */ + + /* These fields are only used in amgetbitmap scans: */ + TIDBitmap *tbm; /* bitmap being filled */ + int64 ntids; /* number of TIDs passed to bitmap */ + + /* These fields are only used in amgettuple scans: */ + int nPtrs; /* number of TIDs found on current page */ + int iPtr; /* index for scanning through same */ + ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */ + bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */ + + /* + * Note: using MaxIndexTuplesPerPage above is a bit hokey since + * SpGistLeafTuples aren't exactly IndexTuples; however, they are + * larger, so this is safe. + */ +} SpGistScanOpaqueData; + +typedef SpGistScanOpaqueData *SpGistScanOpaque; + + +/* + * SPGiST tuple types. Note: inner, leaf, and dead tuple structs + * must have the same tupstate field in the same position! Real inner and + * leaf tuples always have tupstate = LIVE; if the state is something else, + * use the SpGistDeadTuple struct to inspect the tuple. 
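 *
 * In other words, code looping over a page's item offsets can always cast
 * through SpGistDeadTuple first to check the state, as the vacuum and WAL
 * replay code does; for example, inside such a loop:
 *
 *      SpGistDeadTuple dt = (SpGistDeadTuple) PageGetItem(page,
 *                                      PageGetItemId(page, offnum));
 *
 *      if (dt->tupstate != SPGIST_LIVE)
 *          continue;           /* REDIRECT, DEAD, or PLACEHOLDER */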
+ */ + +/* values of tupstate (see README for more info) */ +#define SPGIST_LIVE 0 /* normal live tuple (either inner or leaf) */ +#define SPGIST_REDIRECT 1 /* temporary redirection placeholder */ +#define SPGIST_DEAD 2 /* dead, cannot be removed because of links */ +#define SPGIST_PLACEHOLDER 3 /* placeholder, used to preserve offsets */ + +/* + * SPGiST inner tuple: list of "nodes" that subdivide a set of tuples + * + * Inner tuple layout: + * header/optional prefix/array of nodes, which are SpGistNodeTuples + * + * size and prefixSize must be multiples of MAXALIGN + */ +typedef struct SpGistInnerTupleData +{ + unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */ + allTheSame:1, /* all nodes in tuple are equivalent */ + nNodes:13, /* number of nodes within inner tuple */ + prefixSize:16; /* size of prefix, or 0 if none */ + uint16 size; /* total size of inner tuple */ + /* On most machines there will be a couple of wasted bytes here */ + /* prefix datum follows, then nodes */ +} SpGistInnerTupleData; + +typedef SpGistInnerTupleData *SpGistInnerTuple; + +/* these must match largest values that fit in bit fields declared above */ +#define SGITMAXNNODES 0x1FFF +#define SGITMAXPREFIXSIZE 0xFFFF +#define SGITMAXSIZE 0xFFFF + +#define SGITHDRSZ MAXALIGN(sizeof(SpGistInnerTupleData)) +#define _SGITDATA(x) (((char *) (x)) + SGITHDRSZ) +#define SGITDATAPTR(x) ((x)->prefixSize ? _SGITDATA(x) : NULL) +#define SGITDATUM(x, s) ((x)->prefixSize ? \ + ((s)->attPrefixType.attbyval ? \ + *(Datum *) _SGITDATA(x) : \ + PointerGetDatum(_SGITDATA(x))) \ + : (Datum) 0) +#define SGITNODEPTR(x) ((SpGistNodeTuple) (_SGITDATA(x) + (x)->prefixSize)) + +/* Macro for iterating through the nodes of an inner tuple */ +#define SGITITERATE(x, i, nt) \ + for ((i) = 0, (nt) = SGITNODEPTR(x); \ + (i) < (x)->nNodes; \ + (i)++, (nt) = (SpGistNodeTuple) (((char *) (nt)) + IndexTupleSize(nt))) + +/* + * SPGiST node tuple: one node within an inner tuple + * + * Node tuples use the same header as ordinary Postgres IndexTuples, but + * we do not use a null bitmap, because we know there is only one column + * so the INDEX_NULL_MASK bit suffices. Also, pass-by-value datums are + * stored as a full Datum, the same convention as for inner tuple prefixes + * and leaf tuple datums. + */ + +typedef IndexTupleData SpGistNodeTupleData; + +typedef SpGistNodeTupleData *SpGistNodeTuple; + +#define SGNTHDRSZ MAXALIGN(sizeof(SpGistNodeTupleData)) +#define SGNTDATAPTR(x) (((char *) (x)) + SGNTHDRSZ) +#define SGNTDATUM(x, s) ((s)->attLabelType.attbyval ? \ + *(Datum *) SGNTDATAPTR(x) : \ + PointerGetDatum(SGNTDATAPTR(x))) + +/* + * SPGiST leaf tuple: carries a datum and a heap tuple TID + * + * In the simplest case, the datum is the same as the indexed value; but + * it could also be a suffix or some other sort of delta that permits + * reconstruction given knowledge of the prefix path traversed to get here. + * + * The size field is wider than could possibly be needed for an on-disk leaf + * tuple, but this allows us to form leaf tuples even when the datum is too + * wide to be stored immediately, and it costs nothing because of alignment + * considerations. + * + * Normally, nextOffset links to the next tuple belonging to the same parent + * node (which must be on the same page). But when the root page is a leaf + * page, we don't chain its tuples, so nextOffset is always 0 on the root. 
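 *
 * A minimal sketch of walking one such chain on a non-root leaf page,
 * assuming "offnum" starts at the chain's head tuple as recorded in the
 * parent node's downlink (process_leaf is a hypothetical per-tuple step):
 *
 *      while (offnum != InvalidOffsetNumber)
 *      {
 *          SpGistLeafTuple lt = (SpGistLeafTuple) PageGetItem(page,
 *                                      PageGetItemId(page, offnum));
 *
 *          process_leaf(lt);
 *          offnum = lt->nextOffset;
 *      }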
+ * + * size must be a multiple of MAXALIGN + */ +typedef struct SpGistLeafTupleData +{ + unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */ + size:30; /* large enough for any palloc'able value */ + OffsetNumber nextOffset; /* next tuple in chain, or InvalidOffset */ + ItemPointerData heapPtr; /* TID of represented heap tuple */ + /* leaf datum follows */ +} SpGistLeafTupleData; + +typedef SpGistLeafTupleData *SpGistLeafTuple; + +#define SGLTHDRSZ MAXALIGN(sizeof(SpGistLeafTupleData)) +#define SGLTDATAPTR(x) (((char *) (x)) + SGLTHDRSZ) +#define SGLTDATUM(x, s) ((s)->attType.attbyval ? \ + *(Datum *) SGLTDATAPTR(x) : \ + PointerGetDatum(SGLTDATAPTR(x))) + +/* + * SPGiST dead tuple: declaration for examining non-live tuples + * + * The tupstate field of this struct must match those of regular inner and + * leaf tuples, and its size field must match a leaf tuple's. + * Also, the pointer field must be in the same place as a leaf tuple's heapPtr + * field, to satisfy some Asserts that we make when replacing a leaf tuple + * with a dead tuple. + * We don't use nextOffset, but it's needed to align the pointer field. + * pointer and xid are only valid when tupstate = REDIRECT. + */ +typedef struct SpGistDeadTupleData +{ + unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */ + size:30; + OffsetNumber nextOffset; /* not used in dead tuples */ + ItemPointerData pointer; /* redirection inside index */ + TransactionId xid; /* ID of xact that inserted this tuple */ +} SpGistDeadTupleData; + +typedef SpGistDeadTupleData *SpGistDeadTuple; + +#define SGDTSIZE MAXALIGN(sizeof(SpGistDeadTupleData)) + +/* + * Macros for doing free-space calculations. Note that when adding up the + * space needed for tuples, we always consider each tuple to need the tuple's + * size plus sizeof(ItemIdData) (for the line pointer). This works correctly + * so long as tuple sizes are always maxaligned. 
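 *
 * So the typical "will this tuple fit?" test is expected to look about
 * like this (sketch only; leafTuple is whatever tuple is being placed):
 *
 *      Size        needed = leafTuple->size + sizeof(ItemIdData);
 *      bool        fits = SpGistPageGetFreeSpace(page, 1) >= needed;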
+ */ + +/* Page capacity after allowing for fixed header and special space */ +#define SPGIST_PAGE_CAPACITY \ + MAXALIGN_DOWN(BLCKSZ - \ + SizeOfPageHeaderData - \ + MAXALIGN(sizeof(SpGistPageOpaqueData))) + +/* + * Compute free space on page, assuming that up to n placeholders can be + * recycled if present (n should be the number of tuples to be inserted) + */ +#define SpGistPageGetFreeSpace(p, n) \ + (PageGetExactFreeSpace(p) + \ + Min(SpGistPageGetOpaque(p)->nPlaceholder, n) * \ + (SGDTSIZE + sizeof(ItemIdData))) + +/* + * XLOG stuff + * + * ACCEPT_RDATA_* can only use fixed-length rdata arrays, because of lengthof + */ + +#define ACCEPT_RDATA_DATA(p, s, i) \ + do { \ + Assert((i) < lengthof(rdata)); \ + rdata[i].data = (char *) (p); \ + rdata[i].len = (s); \ + rdata[i].buffer = InvalidBuffer; \ + rdata[i].buffer_std = true; \ + rdata[i].next = NULL; \ + if ((i) > 0) \ + rdata[(i) - 1].next = rdata + (i); \ + } while(0) + +#define ACCEPT_RDATA_BUFFER(b, i) \ + do { \ + Assert((i) < lengthof(rdata)); \ + rdata[i].data = NULL; \ + rdata[i].len = 0; \ + rdata[i].buffer = (b); \ + rdata[i].buffer_std = true; \ + rdata[i].next = NULL; \ + if ((i) > 0) \ + rdata[(i) - 1].next = rdata + (i); \ + } while(0) + + +/* XLOG record types for SPGiST */ +#define XLOG_SPGIST_CREATE_INDEX 0x00 +#define XLOG_SPGIST_ADD_LEAF 0x10 +#define XLOG_SPGIST_MOVE_LEAFS 0x20 +#define XLOG_SPGIST_ADD_NODE 0x30 +#define XLOG_SPGIST_SPLIT_TUPLE 0x40 +#define XLOG_SPGIST_PICKSPLIT 0x50 +#define XLOG_SPGIST_VACUUM_LEAF 0x60 +#define XLOG_SPGIST_VACUUM_ROOT 0x70 +#define XLOG_SPGIST_VACUUM_REDIRECT 0x80 + +/* + * Some redo functions need an SpGistState, although only a few of its fields + * need to be valid. spgxlogState carries the required info in xlog records. + * (See fillFakeState in spgxlog.c for more comments.) + */ +typedef struct spgxlogState +{ + TransactionId myXid; + bool isBuild; +} spgxlogState; + +#define STORE_STATE(s, d) \ + do { \ + (d).myXid = (s)->myXid; \ + (d).isBuild = (s)->isBuild; \ + } while(0) + + +typedef struct spgxlogAddLeaf +{ + RelFileNode node; + + BlockNumber blknoLeaf; /* destination page for leaf tuple */ + bool newPage; /* init dest page? */ + OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ + OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ + + BlockNumber blknoParent; /* where the parent downlink is, if any */ + OffsetNumber offnumParent; + uint16 nodeI; + + /* + * new leaf tuple follows, on an intalign boundary (replay only needs to + * fetch its size field, so that should be enough alignment) + */ +} spgxlogAddLeaf; + +typedef struct spgxlogMoveLeafs +{ + RelFileNode node; + + BlockNumber blknoSrc; /* source leaf page */ + BlockNumber blknoDst; /* destination leaf page */ + uint16 nMoves; /* number of tuples moved from source page */ + bool newPage; /* init dest page? */ + bool replaceDead; /* are we replacing a DEAD source tuple? 
*/ + + BlockNumber blknoParent; /* where the parent downlink is */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nMoves + * array of inserted tuple numbers, length nMoves + 1 or 1 + * list of leaf tuples, length nMoves + 1 or 1 (must be maxaligned) + * the tuple number arrays are padded to maxalign boundaries so that the + * leaf tuples will be suitably aligned + * + * Note: if replaceDead is true then there is only one inserted tuple + * number and only one leaf tuple in the data, because we are not copying + * the dead tuple from the source + * + * Buffer references in the rdata array are: + * Src page + * Dest page + * Parent page + *---------- + */ +} spgxlogMoveLeafs; + +typedef struct spgxlogAddNode +{ + RelFileNode node; + + BlockNumber blkno; /* block number of original inner tuple */ + OffsetNumber offnum; /* offset of original inner tuple */ + + BlockNumber blknoParent; /* where parent downlink is, if updated */ + OffsetNumber offnumParent; + uint16 nodeI; + + BlockNumber blknoNew; /* where new tuple goes, if not same place */ + OffsetNumber offnumNew; + bool newPage; /* init new page? */ + + spgxlogState stateSrc; + + /* + * updated inner tuple follows, on an intalign boundary (replay only needs + * to fetch its size field, so that should be enough alignment) + */ +} spgxlogAddNode; + +typedef struct spgxlogSplitTuple +{ + RelFileNode node; + + BlockNumber blknoPrefix; /* where the prefix tuple goes */ + OffsetNumber offnumPrefix; + + BlockNumber blknoPostfix; /* where the postfix tuple goes */ + OffsetNumber offnumPostfix; + bool newPage; /* need to init that page? */ + + /* + * new prefix inner tuple follows, then new postfix inner tuple, on + * intalign boundaries (replay only needs to fetch size fields, so that + * should be enough alignment) + */ +} spgxlogSplitTuple; + +typedef struct spgxlogPickSplit +{ + RelFileNode node; + + BlockNumber blknoSrc; /* original leaf page */ + BlockNumber blknoDest; /* other leaf page, if any */ + uint16 nDelete; /* n to delete from Src */ + uint16 nInsert; /* n to insert on Src and/or Dest */ + bool initSrc; /* re-init the Src page? */ + bool initDest; /* re-init the Dest page? */ + + BlockNumber blknoInner; /* where to put new inner tuple */ + OffsetNumber offnumInner; + bool initInner; /* re-init the Inner page? 
*/ + + BlockNumber blknoParent; /* where the parent downlink is, if any */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * new inner tuple (assumed to have a maxaligned length) + * array of deleted tuple numbers, length nDelete + * array of inserted tuple numbers, length nInsert + * array of page selector bytes for inserted tuples, length nInsert + * list of leaf tuples, length nInsert (must be maxaligned) + * the tuple number and page selector arrays are padded to maxalign + * boundaries so that the leaf tuples will be suitably aligned + * + * Buffer references in the rdata array are: + * Src page (only if not root and not being init'd) + * Dest page (if used and not being init'd) + * Inner page (only if not being init'd) + * Parent page (if any; could be same as Inner) + *---------- + */ +} spgxlogPickSplit; + +typedef struct spgxlogVacuumLeaf +{ + RelFileNode node; + + BlockNumber blkno; /* block number to clean */ + uint16 nDead; /* number of tuples to become DEAD */ + uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */ + uint16 nMove; /* number of tuples to move */ + uint16 nChain; /* number of tuples to re-chain */ + + spgxlogState stateSrc; + + /*---------- + * data follows: + * tuple numbers to become DEAD + * tuple numbers to become PLACEHOLDER + * tuple numbers to move from (and replace with PLACEHOLDER) + * tuple numbers to move to (replacing what is there) + * tuple numbers to update nextOffset links of + * tuple numbers to insert in nextOffset links + *---------- + */ +} spgxlogVacuumLeaf; + +typedef struct spgxlogVacuumRoot +{ + /* vacuum root page when it is a leaf */ + RelFileNode node; + + uint16 nDelete; /* number of tuples to delete */ + + spgxlogState stateSrc; + + /* offsets of tuples to delete follow */ +} spgxlogVacuumRoot; + +typedef struct spgxlogVacuumRedirect +{ + RelFileNode node; + + BlockNumber blkno; /* block number to clean */ + uint16 nToPlaceholder; /* number of redirects to make placeholders */ + OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */ + + /* offsets of redirect tuples to make placeholders follow */ +} spgxlogVacuumRedirect; + +/* + * The "flags" argument for SpGistGetBuffer should be either GBUF_LEAF to + * get a leaf page, or GBUF_INNER_PARITY(blockNumber) to get an inner + * page in the same triple-parity group as the specified block number. + * (Typically, this should be GBUF_INNER_PARITY(parentBlockNumber + 1) + * to follow the rule described in spgist/README.) 
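 *
 * For example, a caller that needs room for one new inner tuple below a
 * parent located on block "parentBlkno" might do (sketch only; the
 * variable names are illustrative):
 *
 *      bool        isNew;
 *      Buffer      buf = SpGistGetBuffer(index,
 *                                        GBUF_INNER_PARITY(parentBlkno + 1),
 *                                        innerTuple->size + sizeof(ItemIdData),
 *                                        &isNew);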
+ */ +#define GBUF_PARITY_MASK 0x03 +#define GBUF_LEAF 0x04 +#define GBUF_INNER_PARITY(x) ((x) % 3) + +/* spgutils.c */ +extern void initSpGistState(SpGistState *state, Relation index); +extern Buffer SpGistNewBuffer(Relation index); +extern void SpGistUpdateMetaPage(Relation index); +extern Buffer SpGistGetBuffer(Relation index, int flags, + int needSpace, bool *isNew); +extern void SpGistSetLastUsedPage(Relation index, Buffer buffer); +extern void SpGistInitPage(Page page, uint16 f); +extern void SpGistInitBuffer(Buffer b, uint16 f); +extern void SpGistInitMetapage(Page page); +extern unsigned int SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum); +extern SpGistLeafTuple spgFormLeafTuple(SpGistState *state, + ItemPointer heapPtr, Datum datum); +extern SpGistNodeTuple spgFormNodeTuple(SpGistState *state, + Datum label, bool isnull); +extern SpGistInnerTuple spgFormInnerTuple(SpGistState *state, + bool hasPrefix, Datum prefix, + int nNodes, SpGistNodeTuple *nodes); +extern SpGistDeadTuple spgFormDeadTuple(SpGistState *state, int tupstate, + BlockNumber blkno, OffsetNumber offnum); +extern Datum *spgExtractNodeLabels(SpGistState *state, + SpGistInnerTuple innerTuple); +extern OffsetNumber SpGistPageAddNewItem(SpGistState *state, Page page, + Item item, Size size, + OffsetNumber *startOffset, + bool errorOK); + +/* spgdoinsert.c */ +extern void updateNodeLink(SpGistInnerTuple tup, int nodeN, + BlockNumber blkno, OffsetNumber offset); +extern void spgPageIndexMultiDelete(SpGistState *state, Page page, + OffsetNumber *itemnos, int nitems, + int firststate, int reststate, + BlockNumber blkno, OffsetNumber offnum); +extern void spgdoinsert(Relation index, SpGistState *state, + ItemPointer heapPtr, Datum datum); + +#endif /* SPGIST_PRIVATE_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 14e177dc482..eb343545915 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201112071 +#define CATALOG_VERSION_NO 201112171 #endif diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index ddacdf274c4..6fdd1d5b052 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -117,17 +117,20 @@ typedef FormData_pg_am *Form_pg_am; * ---------------- */ -DATA(insert OID = 403 ( btree 5 2 t f t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 2 t f t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 8 f t f f t f t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan 
gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 8 f t f f t f t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f f f t f t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f f f t f t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 +DATA(insert OID = 4000 ( spgist 0 5 f f f f f f f f f f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcostestimate spgoptions )); +DESCR("SP-GiST index access method"); +#define SPGIST_AM_OID 4000 #endif /* PG_AM_H */ diff --git a/src/include/catalog/pg_amop.h b/src/include/catalog/pg_amop.h index 1e8c9a289f9..cb394e03e40 100644 --- a/src/include/catalog/pg_amop.h +++ b/src/include/catalog/pg_amop.h @@ -737,4 +737,37 @@ DATA(insert ( 3919 3831 3831 8 s 3892 783 0 )); DATA(insert ( 3919 3831 2283 16 s 3889 783 0 )); DATA(insert ( 3919 3831 3831 18 s 3882 783 0 )); +/* + * SP-GiST quad_point_ops + */ +DATA(insert ( 4015 600 600 11 s 506 4000 0 )); +DATA(insert ( 4015 600 600 1 s 507 4000 0 )); +DATA(insert ( 4015 600 600 5 s 508 4000 0 )); +DATA(insert ( 4015 600 600 10 s 509 4000 0 )); +DATA(insert ( 4015 600 600 6 s 510 4000 0 )); +DATA(insert ( 4015 600 603 8 s 511 4000 0 )); + +/* + * SP-GiST kd_point_ops + */ +DATA(insert ( 4016 600 600 11 s 506 4000 0 )); +DATA(insert ( 4016 600 600 1 s 507 4000 0 )); +DATA(insert ( 4016 600 600 5 s 508 4000 0 )); +DATA(insert ( 4016 600 600 10 s 509 4000 0 )); +DATA(insert ( 4016 600 600 6 s 510 4000 0 )); +DATA(insert ( 4016 600 603 8 s 511 4000 0 )); + +/* + * SP-GiST text_ops + */ +DATA(insert ( 4017 25 25 1 s 2314 4000 0 )); +DATA(insert ( 4017 25 25 2 s 2315 4000 0 )); +DATA(insert ( 4017 25 25 3 s 98 4000 0 )); +DATA(insert ( 4017 25 25 4 s 2317 4000 0 )); +DATA(insert ( 4017 25 25 5 s 2318 4000 0 )); +DATA(insert ( 4017 25 25 11 s 664 4000 0 )); +DATA(insert ( 4017 25 25 12 s 665 4000 0 )); +DATA(insert ( 4017 25 25 14 s 667 4000 0 )); +DATA(insert ( 4017 25 25 15 s 666 4000 0 )); + #endif /* PG_AMOP_H */ diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index 8571dd08709..a4c49efed83 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -356,4 +356,22 @@ DATA(insert ( 3919 3831 3831 5 3879 )); DATA(insert ( 3919 3831 3831 6 3880 )); DATA(insert ( 3919 3831 3831 7 3881 )); + +/* sp-gist */ +DATA(insert ( 4015 600 600 1 4018 )); +DATA(insert ( 4015 600 600 2 4019 )); +DATA(insert ( 4015 600 600 3 4020 )); +DATA(insert ( 4015 600 600 4 4021 )); +DATA(insert ( 4015 600 600 5 4022 )); +DATA(insert ( 4016 600 600 1 4023 )); +DATA(insert ( 4016 600 600 2 4024 )); +DATA(insert ( 4016 600 600 3 4025 )); +DATA(insert ( 4016 600 600 4 4026 )); +DATA(insert ( 4016 600 600 5 4022 )); +DATA(insert ( 4017 25 25 1 4027 )); +DATA(insert ( 4017 25 25 2 4028 )); +DATA(insert 
( 4017 25 25 3 4029 )); +DATA(insert ( 4017 25 25 4 4030 )); +DATA(insert ( 4017 25 25 5 4031 )); + #endif /* PG_AMPROC_H */ diff --git a/src/include/catalog/pg_opclass.h b/src/include/catalog/pg_opclass.h index eecd3b63c50..c692ae4311b 100644 --- a/src/include/catalog/pg_opclass.h +++ b/src/include/catalog/pg_opclass.h @@ -223,5 +223,8 @@ DATA(insert ( 783 tsquery_ops PGNSP PGUID 3702 3615 t 20 )); DATA(insert ( 403 range_ops PGNSP PGUID 3901 3831 t 0 )); DATA(insert ( 405 range_ops PGNSP PGUID 3903 3831 t 0 )); DATA(insert ( 783 range_ops PGNSP PGUID 3919 3831 t 0 )); +DATA(insert ( 4000 quad_point_ops PGNSP PGUID 4015 600 t 0 )); +DATA(insert ( 4000 kd_point_ops PGNSP PGUID 4016 600 f 0 )); +DATA(insert ( 4000 text_ops PGNSP PGUID 4017 25 t 0 )); #endif /* PG_OPCLASS_H */ diff --git a/src/include/catalog/pg_opfamily.h b/src/include/catalog/pg_opfamily.h index 5ea949bec6b..009000ffcff 100644 --- a/src/include/catalog/pg_opfamily.h +++ b/src/include/catalog/pg_opfamily.h @@ -142,5 +142,8 @@ DATA(insert OID = 3702 ( 783 tsquery_ops PGNSP PGUID )); DATA(insert OID = 3901 ( 403 range_ops PGNSP PGUID )); DATA(insert OID = 3903 ( 405 range_ops PGNSP PGUID )); DATA(insert OID = 3919 ( 783 range_ops PGNSP PGUID )); +DATA(insert OID = 4015 ( 4000 quad_point_ops PGNSP PGUID )); +DATA(insert OID = 4016 ( 4000 kd_point_ops PGNSP PGUID )); +DATA(insert OID = 4017 ( 4000 text_ops PGNSP PGUID )); #endif /* PG_OPFAMILY_H */ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 924cb1f601c..6da3b421ae3 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4481,6 +4481,68 @@ DESCR("int8range constructor"); DATA(insert OID = 3946 ( int8range PGNSP PGUID 12 1 0 0 0 f f f f f i 3 0 3926 "20 20 25" _null_ _null_ _null_ _null_ range_constructor3 _null_ _null_ _null_ )); DESCR("int8range constructor"); +/* spgist support functions */ +DATA(insert OID = 4001 ( spggettuple PGNSP PGUID 12 1 0 0 0 f f f t f v 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ spggettuple _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4002 ( spggetbitmap PGNSP PGUID 12 1 0 0 0 f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_ spggetbitmap _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4003 ( spginsert PGNSP PGUID 12 1 0 0 0 f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ spginsert _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4004 ( spgbeginscan PGNSP PGUID 12 1 0 0 0 f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ spgbeginscan _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4005 ( spgrescan PGNSP PGUID 12 1 0 0 0 f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ spgrescan _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4006 ( spgendscan PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgendscan _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4007 ( spgmarkpos PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgmarkpos _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4008 ( spgrestrpos PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgrestrpos _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4009 ( spgbuild PGNSP PGUID 12 1 0 0 0 f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ spgbuild _null_ _null_ 
_null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4010 ( spgbuildempty PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgbuildempty _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4011 ( spgbulkdelete PGNSP PGUID 12 1 0 0 0 f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ spgbulkdelete _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4012 ( spgvacuumcleanup PGNSP PGUID 12 1 0 0 0 f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ spgvacuumcleanup _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4013 ( spgcostestimate PGNSP PGUID 12 1 0 0 0 f f f t f v 9 0 2278 "2281 2281 2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ spgcostestimate _null_ _null_ _null_ )); +DESCR("spgist(internal)"); +DATA(insert OID = 4014 ( spgoptions PGNSP PGUID 12 1 0 0 0 f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ spgoptions _null_ _null_ _null_ )); +DESCR("spgist(internal)"); + +/* spgist opclasses */ +DATA(insert OID = 4018 ( spg_quad_config PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_config _null_ _null_ _null_ )); +DESCR("SP-GiST support for quad tree over point"); +DATA(insert OID = 4019 ( spg_quad_choose PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_choose _null_ _null_ _null_ )); +DESCR("SP-GiST support for quad tree over point"); +DATA(insert OID = 4020 ( spg_quad_picksplit PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_picksplit _null_ _null_ _null_ )); +DESCR("SP-GiST support for quad tree over point"); +DATA(insert OID = 4021 ( spg_quad_inner_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_inner_consistent _null_ _null_ _null_ )); +DESCR("SP-GiST support for quad tree over point"); +DATA(insert OID = 4022 ( spg_quad_leaf_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_leaf_consistent _null_ _null_ _null_ )); +DESCR("SP-GiST support for quad tree and k-d tree over point"); + +DATA(insert OID = 4023 ( spg_kd_config PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_config _null_ _null_ _null_ )); +DESCR("SP-GiST support for k-d tree over point"); +DATA(insert OID = 4024 ( spg_kd_choose PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_choose _null_ _null_ _null_ )); +DESCR("SP-GiST support for k-d tree over point"); +DATA(insert OID = 4025 ( spg_kd_picksplit PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_picksplit _null_ _null_ _null_ )); +DESCR("SP-GiST support for k-d tree over point"); +DATA(insert OID = 4026 ( spg_kd_inner_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_inner_consistent _null_ _null_ _null_ )); +DESCR("SP-GiST support for k-d tree over point"); + +DATA(insert OID = 4027 ( spg_text_config PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_config _null_ _null_ _null_ )); +DESCR("SP-GiST support for suffix tree over text"); +DATA(insert OID = 4028 ( spg_text_choose PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_choose _null_ _null_ _null_ )); +DESCR("SP-GiST support for suffix tree over text"); +DATA(insert OID = 
4029 ( spg_text_picksplit PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_picksplit _null_ _null_ _null_ )); +DESCR("SP-GiST support for suffix tree over text"); +DATA(insert OID = 4030 ( spg_text_inner_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_inner_consistent _null_ _null_ _null_ )); +DESCR("SP-GiST support for suffix tree over text"); +DATA(insert OID = 4031 ( spg_text_leaf_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ spg_text_leaf_consistent _null_ _null_ _null_ )); +DESCR("SP-GiST support for suffix tree over text"); + /* * Symbolic values for provolatile column: these indicate whether the result diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 994dc5368b1..9c5af5960fd 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -1080,6 +1080,26 @@ extern Datum window_first_value(PG_FUNCTION_ARGS); extern Datum window_last_value(PG_FUNCTION_ARGS); extern Datum window_nth_value(PG_FUNCTION_ARGS); +/* access/spgist/spgquadtreeproc.c */ +extern Datum spg_quad_config(PG_FUNCTION_ARGS); +extern Datum spg_quad_choose(PG_FUNCTION_ARGS); +extern Datum spg_quad_picksplit(PG_FUNCTION_ARGS); +extern Datum spg_quad_inner_consistent(PG_FUNCTION_ARGS); +extern Datum spg_quad_leaf_consistent(PG_FUNCTION_ARGS); + +/* access/spgist/spgkdtreeproc.c */ +extern Datum spg_kd_config(PG_FUNCTION_ARGS); +extern Datum spg_kd_choose(PG_FUNCTION_ARGS); +extern Datum spg_kd_picksplit(PG_FUNCTION_ARGS); +extern Datum spg_kd_inner_consistent(PG_FUNCTION_ARGS); + +/* access/spgist/spgtextproc.c */ +extern Datum spg_text_config(PG_FUNCTION_ARGS); +extern Datum spg_text_choose(PG_FUNCTION_ARGS); +extern Datum spg_text_picksplit(PG_FUNCTION_ARGS); +extern Datum spg_text_inner_consistent(PG_FUNCTION_ARGS); +extern Datum spg_text_leaf_consistent(PG_FUNCTION_ARGS); + /* access/gin/ginarrayproc.c */ extern Datum ginarrayextract(PG_FUNCTION_ARGS); extern Datum ginarrayextract_2args(PG_FUNCTION_ARGS); diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 32d14b60290..6afcbf47537 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -194,6 +194,7 @@ extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, extern Datum btcostestimate(PG_FUNCTION_ARGS); extern Datum hashcostestimate(PG_FUNCTION_ARGS); extern Datum gistcostestimate(PG_FUNCTION_ARGS); +extern Datum spgcostestimate(PG_FUNCTION_ARGS); extern Datum gincostestimate(PG_FUNCTION_ARGS); #endif /* SELFUNCS_H */ diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index bdd1f4ec78e..86cee2de942 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -61,6 +61,26 @@ CREATE TEMP TABLE gcircle_tbl AS SELECT circle(home_base) AS f1 FROM slow_emp4000; CREATE INDEX ggpolygonind ON gpolygon_tbl USING gist (f1); CREATE INDEX ggcircleind ON gcircle_tbl USING gist (f1); +-- +-- SP-GiST +-- +CREATE TABLE quad_point_tbl AS + SELECT point(unique1,unique2) AS p FROM tenk1; +INSERT INTO quad_point_tbl + SELECT '(333.0,400.0)'::point FROM generate_series(1,1000); +CREATE INDEX sp_quad_ind ON quad_point_tbl USING spgist (p); +CREATE TABLE kd_point_tbl AS SELECT * FROM quad_point_tbl; +CREATE INDEX sp_kd_ind ON kd_point_tbl USING spgist (p kd_point_ops); +CREATE TABLE suffix_text_tbl AS + SELECT name AS t FROM road; 
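As an aside on the internal API declared in spgist_private.h above: the comment on SpGistGetBuffer spells out its calling convention, and spgdoinsert is declared as the per-tuple insertion routine. The following hypothetical C sketch is not code from this patch; the sketch_* names, parent_blkno, and the include list are invented for illustration, and only declarations actually shown in the hunk above (initSpGistState, spgdoinsert, SpGistGetBuffer, GBUF_INNER_PARITY, GBUF_LEAF) are relied on.

/*
 * Hypothetical sketch only, relying solely on the declarations shown in
 * the spgist_private.h hunk above.
 */
#include "postgres.h"
#include "access/spgist_private.h"
#include "utils/rel.h"

/* Insert one heap tuple's indexed datum into an SP-GiST index. */
static void
sketch_insert_one(Relation index, ItemPointer heapPtr, Datum leafDatum)
{
	SpGistState state;

	initSpGistState(&state, index);		/* look up opclass support info */
	spgdoinsert(index, &state, heapPtr, leafDatum);
}

/*
 * Pick a page for a new inner tuple with at least needSpace free bytes.
 * Per the comment above SpGistGetBuffer, an inner page is requested in
 * the triple-parity group of (parent block number + 1), following the
 * rule described in spgist/README; a leaf page would be requested with
 * GBUF_LEAF instead.
 */
static Buffer
sketch_get_inner_page(Relation index, BlockNumber parent_blkno,
					  int needSpace, bool *isNew)
{
	return SpGistGetBuffer(index,
						   GBUF_INNER_PARITY(parent_blkno + 1),
						   needSpace,
						   isNew);
}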
+INSERT INTO suffix_text_tbl + SELECT '0123456789abcdef' FROM generate_series(1,1000); +INSERT INTO suffix_text_tbl VALUES ('0123456789abcde'); +INSERT INTO suffix_text_tbl VALUES ('0123456789abcdefF'); +CREATE INDEX sp_suff_ind ON suffix_text_tbl USING spgist (t); +-- +-- Test GiST and SP-GiST indexes +-- -- get non-indexed results for comparison purposes SET enable_seqscan = ON; SET enable_indexscan = OFF; @@ -207,22 +227,141 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 (10,10) (4 rows) +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + count +------- + 1057 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + count +------- + 1057 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; + count +------- + 6000 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + count +------- + 4999 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + count +------- + 5000 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + count +------- + 5999 +(1 row) + +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + count +------- + 1 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + count +------- + 1000 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + count +------- + 1 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + count +------- + 1 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + count +------- + 1705 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + count +------- + 1705 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + count +------- + 1706 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + count +------- + 1706 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + count +------- + 1 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + count +------- + 2 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + count +------- + 50 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + count +------- + 50 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + count +------- + 48 +(1 row) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + count +------- + 48 +(1 row) + +-- Now check the results from plain indexscan SET enable_seqscan = OFF; SET enable_indexscan = ON; -SET enable_bitmapscan = ON; +SET enable_bitmapscan = OFF; EXPLAIN (COSTS OFF) SELECT * FROM fast_emp4000 WHERE home_base @ '(200,200),(2000,1000)'::box ORDER BY (home_base[0])[0]; - QUERY PLAN ----------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------- Sort Sort Key: ((home_base[0])[0]) - -> Bitmap Heap Scan on fast_emp4000 - Recheck Cond: (home_base @ '(2000,1000),(200,200)'::box) - -> Bitmap Index Scan on grect2ind - Index Cond: (home_base @ '(2000,1000),(200,200)'::box) -(6 rows) + -> Index Scan using grect2ind on fast_emp4000 + Index Cond: (home_base @ '(2000,1000),(200,200)'::box) +(4 rows) SELECT * FROM fast_emp4000 WHERE home_base @ '(200,200),(2000,1000)'::box @@ -235,14 +374,12 @@ SELECT * FROM fast_emp4000 EXPLAIN (COSTS OFF) SELECT count(*) FROM fast_emp4000 WHERE home_base && '(1000,1000,0,0)'::box; - QUERY PLAN 
-------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on fast_emp4000 - Recheck Cond: (home_base && '(1000,1000),(0,0)'::box) - -> Bitmap Index Scan on grect2ind - Index Cond: (home_base && '(1000,1000),(0,0)'::box) -(5 rows) + -> Index Scan using grect2ind on fast_emp4000 + Index Cond: (home_base && '(1000,1000),(0,0)'::box) +(3 rows) SELECT count(*) FROM fast_emp4000 WHERE home_base && '(1000,1000,0,0)'::box; count @@ -252,14 +389,12 @@ SELECT count(*) FROM fast_emp4000 WHERE home_base && '(1000,1000,0,0)'::box; EXPLAIN (COSTS OFF) SELECT count(*) FROM fast_emp4000 WHERE home_base IS NULL; - QUERY PLAN ------------------------------------------------ + QUERY PLAN +-------------------------------------------------- Aggregate - -> Bitmap Heap Scan on fast_emp4000 - Recheck Cond: (home_base IS NULL) - -> Bitmap Index Scan on grect2ind - Index Cond: (home_base IS NULL) -(5 rows) + -> Index Scan using grect2ind on fast_emp4000 + Index Cond: (home_base IS NULL) +(3 rows) SELECT count(*) FROM fast_emp4000 WHERE home_base IS NULL; count @@ -308,14 +443,12 @@ SELECT * FROM circle_tbl WHERE f1 && circle(point(1,-2), 1) EXPLAIN (COSTS OFF) SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Aggregate - -> Bitmap Heap Scan on gpolygon_tbl - Recheck Cond: (f1 && '((1000,1000),(0,0))'::polygon) - -> Bitmap Index Scan on ggpolygonind - Index Cond: (f1 && '((1000,1000),(0,0))'::polygon) -(5 rows) + -> Index Scan using ggpolygonind on gpolygon_tbl + Index Cond: (f1 && '((1000,1000),(0,0))'::polygon) +(3 rows) SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; count @@ -325,14 +458,12 @@ SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; EXPLAIN (COSTS OFF) SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Bitmap Heap Scan on gcircle_tbl - Recheck Cond: (f1 && '<(500,500),500>'::circle) - -> Bitmap Index Scan on ggcircleind - Index Cond: (f1 && '<(500,500),500>'::circle) -(5 rows) + -> Index Scan using ggcircleind on gcircle_tbl + Index Cond: (f1 && '<(500,500),500>'::circle) +(3 rows) SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle; count @@ -547,6 +678,412 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 (10,10) (4 rows) +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + count +------- + 1057 +(1 row) + +EXPLAIN 
(COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p << '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; + count +------- + 6000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p >> '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + count +------- + 4999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p <^ '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + count +------- + 5000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p >^ '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + count +------- + 5999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Index Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p ~= '(4585,365)'::point) +(3 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on kd_point_tbl + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)'; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p << '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)'; + count +------- + 6000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)'; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p >> '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)'; + count +------- + 4999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)'; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on 
kd_point_tbl + Index Cond: (p <^ '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)'; + count +------- + 5000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)'; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p >^ '(5000,4000)'::point) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)'; + count +------- + 5999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)'; + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Index Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p ~= '(4585,365)'::point) +(3 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t = '0123456789abcdef'::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + count +------- + 1000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t = '0123456789abcde'::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t = '0123456789abcdefF'::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + QUERY PLAN +---------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t < 'Aztec Ct '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + count +------- + 1705 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + QUERY PLAN +------------------------------------------------------------------------ + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t ~<~ 'Aztec Ct '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + count +------- + 1705 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + QUERY PLAN +----------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t <= 'Aztec Ct '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + count +------- + 1706 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + QUERY PLAN +------------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t ~<=~ 'Aztec Ct '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + count +------- + 1706 +(1 
row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + QUERY PLAN +---------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t = 'Aztec Ct '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + QUERY PLAN +---------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t = 'Worth St '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + count +------- + 2 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + QUERY PLAN +----------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t >= 'Worth St '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + count +------- + 50 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + QUERY PLAN +------------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t ~>=~ 'Worth St '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + count +------- + 50 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + QUERY PLAN +---------------------------------------------------------------------- + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t > 'Worth St '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + count +------- + 48 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + QUERY PLAN +------------------------------------------------------------------------ + Aggregate + -> Index Scan using sp_suff_ind on suffix_text_tbl + Index Cond: (t ~>~ 'Worth St '::text) +(3 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + count +------- + 48 +(1 row) + +-- Now check the results from bitmap indexscan SET enable_seqscan = OFF; SET enable_indexscan = OFF; SET enable_bitmapscan = ON; @@ -571,6 +1108,465 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0 (10,10) (4 rows) +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + QUERY PLAN +--------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + QUERY PLAN +--------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: ('(1000,1000),(200,200)'::box @> p) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM 
quad_point_tbl WHERE p << '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p << '(5000,4000)'::point) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p << '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; + count +------- + 6000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p >> '(5000,4000)'::point) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p >> '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + count +------- + 4999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p <^ '(5000,4000)'::point) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p <^ '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + count +------- + 5000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p >^ '(5000,4000)'::point) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p >^ '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + count +------- + 5999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p ~= '(4585,365)'::point) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p ~= '(4585,365)'::point) +(5 rows) + +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + QUERY PLAN +--------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; + QUERY PLAN +--------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: ('(1000,1000),(200,200)'::box @> p) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; + count +------- + 1057 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p << '(5000,4000)'::point) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p << '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)'; + count +------- + 6000 +(1 row) + 
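The point-operator cases above, such as p << '(5000, 4000)', rely on spg_quad_inner_consistent to prune whole quadrants during the descent: a quadrant has to be visited only if its region could still contain a point satisfying the operator (the k-d tree opclass applies the same idea one coordinate at a time). The standalone C sketch below illustrates that test for the "strictly left of" (<<) case; it is not the patch's spg_quad_inner_consistent, and the quadrant numbering and all names in it are invented for the example.

#include <stdbool.h>
#include <stdio.h>

typedef struct { double x, y; } Pt;

/*
 * Quadrants relative to a centroid c, using an arbitrary numbering:
 *   0: x <= c.x, y <= c.y    1: x > c.x, y <= c.y
 *   2: x <= c.x, y >  c.y    3: x > c.x, y >  c.y
 *
 * For "p << q" (p strictly left of q), a left-side quadrant is unbounded
 * toward -infinity in x and so can always hold a match; a right-side
 * quadrant can hold one only if the centroid itself lies left of q.
 */
static bool
quadrant_may_contain_left_of(int quadrant, const Pt *c, const Pt *q)
{
	bool	right_side = (quadrant == 1 || quadrant == 3);	/* x > c.x */

	return right_side ? (c->x < q->x) : true;
}

int
main(void)
{
	Pt		c = {5000, 4000};	/* centroid stored in an inner tuple */
	Pt		q = {5000, 4000};	/* query point of "p << q" */
	int		i;

	for (i = 0; i < 4; i++)
		printf("descend into quadrant %d: %s\n", i,
			   quadrant_may_contain_left_of(i, &c, &q) ? "yes" : "no");
	return 0;
}

The same region-versus-operator reasoning carries over to the >>, <^, >^, ~= and <@ box-containment tests exercised above.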
+EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p >> '(5000,4000)'::point) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p >> '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)'; + count +------- + 4999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p <^ '(5000,4000)'::point) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p <^ '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)'; + count +------- + 5000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)'; + QUERY PLAN +------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p >^ '(5000,4000)'::point) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p >^ '(5000,4000)'::point) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)'; + count +------- + 5999 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)'; + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p ~= '(4585,365)'::point) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p ~= '(4585,365)'::point) +(5 rows) + +SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + QUERY PLAN +---------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t = '0123456789abcdef'::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t = '0123456789abcdef'::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + count +------- + 1000 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + QUERY PLAN +--------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t = '0123456789abcde'::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t = '0123456789abcde'::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + QUERY PLAN +----------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t = '0123456789abcdefF'::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t = '0123456789abcdefF'::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + QUERY PLAN +---------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t < 'Aztec Ct '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t < 'Aztec Ct '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + count +------- + 1705 +(1 row) + 
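The suffix-tree tests around this point probe each boundary value with both the collation-aware text comparisons (<, <=, >=, >) and their byte-wise "pattern" counterparts (~<~, ~<=~, ~>=~, ~>~). Roughly speaking, the difference is that of strcoll() versus strcmp() in C; the hypothetical snippet below (not PostgreSQL code) shows where the two can disagree. In the expected output the two operator families return identical counts, as they would under a byte-order collation such as C.

#include <locale.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *a = "Aztec Ct";
	const char *b = "aztec ct";

	/* Byte-wise order, roughly what the ~<~ operator family uses:
	 * 'A' (0x41) sorts before 'a' (0x61). */
	printf("strcmp:  %d\n", strcmp(a, b));

	/* Collation-aware order, roughly what plain < / <= / > / >= use.
	 * setlocale() fails (returns NULL) if the named locale is not
	 * installed, in which case strcoll() keeps behaving like strcmp(). */
	if (setlocale(LC_COLLATE, "en_US.UTF-8") != NULL)
		printf("strcoll: %d\n", strcoll(a, b));
	return 0;
}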
+EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t ~<~ 'Aztec Ct '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t ~<~ 'Aztec Ct '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + count +------- + 1705 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + QUERY PLAN +----------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t <= 'Aztec Ct '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t <= 'Aztec Ct '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + count +------- + 1706 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + QUERY PLAN +------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t ~<=~ 'Aztec Ct '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t ~<=~ 'Aztec Ct '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + count +------- + 1706 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + QUERY PLAN +---------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t = 'Aztec Ct '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t = 'Aztec Ct '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + count +------- + 1 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + QUERY PLAN +---------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t = 'Worth St '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t = 'Worth St '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + count +------- + 2 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + QUERY PLAN +----------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t >= 'Worth St '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t >= 'Worth St '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + count +------- + 50 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + QUERY PLAN +------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t ~>=~ 'Worth St '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t ~>=~ 'Worth St '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + count +------- + 50 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + QUERY PLAN +---------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t > 'Worth St '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t > 'Worth St '::text) +(5 rows) + +SELECT 
count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + count +------- + 48 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + QUERY PLAN +------------------------------------------------------------------------------ + Aggregate + -> Bitmap Heap Scan on suffix_text_tbl + Recheck Cond: (t ~>~ 'Worth St '::text) + -> Bitmap Index Scan on sp_suff_ind + Index Cond: (t ~>~ 'Worth St '::text) +(5 rows) + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + count +------- + 48 +(1 row) + RESET enable_seqscan; RESET enable_indexscan; RESET enable_bitmapscan; diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index a0ffd77e0ed..8e4004ed311 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -1053,7 +1053,22 @@ ORDER BY 1, 2, 3; 2742 | 2 | @@@ 2742 | 3 | <@ 2742 | 4 | = -(43 rows) + 4000 | 1 | << + 4000 | 1 | ~<~ + 4000 | 2 | ~<=~ + 4000 | 3 | = + 4000 | 4 | ~>=~ + 4000 | 5 | >> + 4000 | 5 | ~>~ + 4000 | 6 | ~= + 4000 | 8 | <@ + 4000 | 10 | <^ + 4000 | 11 | < + 4000 | 11 | >^ + 4000 | 12 | <= + 4000 | 14 | >= + 4000 | 15 | > +(58 rows) -- Check that all opclass search operators have selectivity estimators. -- This is not absolutely required, but it seems a reasonable thing @@ -1077,6 +1092,24 @@ WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS p2 ---------+----------- (0 rows) +-- Check that each operator listed in pg_amop has an associated opclass, +-- that is one whose opcintype matches oprleft (possibly by coercion). +-- Otherwise the operator is useless because it cannot be matched to an index. +-- (In principle it could be useful to list such operators in multiple-datatype +-- btree opfamilies, but in practice you'd expect there to be an opclass for +-- every datatype the family knows about.) +SELECT p1.amopfamily, p1.amopstrategy, p1.amopopr +FROM pg_amop AS p1 +WHERE NOT EXISTS(SELECT 1 FROM pg_opclass AS p2 + WHERE p2.opcfamily = p1.amopfamily + AND binary_coercible(p2.opcintype, p1.amoplefttype)); + amopfamily | amopstrategy | amopopr +------------+--------------+--------- + 1029 | 27 | 433 + 1029 | 47 | 757 + 1029 | 67 | 759 +(3 rows) + -- Operators that are primary members of opclasses must be immutable (else -- it suggests that the index ordering isn't fixed). Operators that are -- cross-type members need only be stable, since they are just shorthands @@ -1297,6 +1330,27 @@ ORDER BY 1; 2226 | 1 | hashint4 | cid_ops (6 rows) +-- We can also check SP-GiST carefully, since the support routine signatures +-- are independent of the datatype being indexed. 
+SELECT p1.amprocfamily, p1.amprocnum, + p2.oid, p2.proname, + p3.opfname +FROM pg_amproc AS p1, pg_proc AS p2, pg_opfamily AS p3 +WHERE p3.opfmethod = (SELECT oid FROM pg_am WHERE amname = 'spgist') + AND p1.amprocfamily = p3.oid AND p1.amproc = p2.oid AND + (CASE WHEN amprocnum = 1 OR amprocnum = 2 OR amprocnum = 3 OR amprocnum = 4 + THEN prorettype != 'void'::regtype OR proretset OR pronargs != 2 + OR proargtypes[0] != 'internal'::regtype + OR proargtypes[1] != 'internal'::regtype + WHEN amprocnum = 5 + THEN prorettype != 'bool'::regtype OR proretset OR pronargs != 2 + OR proargtypes[0] != 'internal'::regtype + OR proargtypes[1] != 'internal'::regtype + ELSE true END); + amprocfamily | amprocnum | oid | proname | opfname +--------------+-----------+-----+---------+--------- +(0 rows) + -- Support routines that are primary members of opfamilies must be immutable -- (else it suggests that the index ordering isn't fixed). But cross-type -- members need only be stable, since they are just shorthands diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index cb468e58b91..9cae9d8bf10 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -63,6 +63,7 @@ SELECT relname, relhasindex int8_tbl | f interval_tbl | f iportaltest | f + kd_point_tbl | t log_table | f lseg_tbl | f main_table | f @@ -134,6 +135,7 @@ SELECT relname, relhasindex pg_user_mapping | t point_tbl | t polygon_tbl | t + quad_point_tbl | t ramp | f real_city | f reltime_tbl | f @@ -149,6 +151,7 @@ SELECT relname, relhasindex sql_sizing_profiles | f stud_emp | f student | f + suffix_text_tbl | t tenk1 | t tenk2 | t test_range_excl | t @@ -161,7 +164,7 @@ SELECT relname, relhasindex timetz_tbl | f tinterval_tbl | f varchar_tbl | f -(150 rows) +(153 rows) -- -- another sanity check: every system catalog that has OIDs should have diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source index 45bc926407d..b57c5546ded 100644 --- a/src/test/regress/output/misc.source +++ b/src/test/regress/output/misc.source @@ -636,6 +636,7 @@ SELECT user_relns() AS user_relns int8_tbl interval_tbl iportaltest + kd_point_tbl log_table lseg_tbl main_table @@ -657,6 +658,7 @@ SELECT user_relns() AS user_relns person point_tbl polygon_tbl + quad_point_tbl ramp random_tbl real_city @@ -668,6 +670,7 @@ SELECT user_relns() AS user_relns stud_emp student subselect_tbl + suffix_text_tbl tenk1 tenk2 test_range_excl @@ -682,7 +685,7 @@ SELECT user_relns() AS user_relns toyemp varchar_tbl xacttest -(104 rows) +(107 rows) SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer'))); name diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 85cf23ccb8f..babde51d2c3 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -92,6 +92,36 @@ CREATE INDEX ggpolygonind ON gpolygon_tbl USING gist (f1); CREATE INDEX ggcircleind ON gcircle_tbl USING gist (f1); +-- +-- SP-GiST +-- + +CREATE TABLE quad_point_tbl AS + SELECT point(unique1,unique2) AS p FROM tenk1; + +INSERT INTO quad_point_tbl + SELECT '(333.0,400.0)'::point FROM generate_series(1,1000); + +CREATE INDEX sp_quad_ind ON quad_point_tbl USING spgist (p); + +CREATE TABLE kd_point_tbl AS SELECT * FROM quad_point_tbl; + +CREATE INDEX sp_kd_ind ON kd_point_tbl USING spgist (p kd_point_ops); + +CREATE TABLE suffix_text_tbl AS + SELECT name AS t FROM road; + +INSERT INTO suffix_text_tbl + SELECT 
'0123456789abcdef' FROM generate_series(1,1000); +INSERT INTO suffix_text_tbl VALUES ('0123456789abcde'); +INSERT INTO suffix_text_tbl VALUES ('0123456789abcdefF'); + +CREATE INDEX sp_suff_ind ON suffix_text_tbl USING spgist (t); + +-- +-- Test GiST and SP-GiST indexes +-- + -- get non-indexed results for comparison purposes SET enable_seqscan = ON; @@ -142,9 +172,50 @@ SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; + +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + +-- Now check the results from plain indexscan SET enable_seqscan = OFF; SET enable_indexscan = ON; -SET enable_bitmapscan = ON; +SET enable_bitmapscan = OFF; EXPLAIN (COSTS OFF) SELECT * FROM fast_emp4000 @@ -234,6 +305,115 @@ EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; +SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; +SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; +SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; +SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; +SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; +SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + 
+EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; +SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)'; +SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)'; +SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)'; +SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)'; +SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)'; +SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; +SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; +SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; +SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; +SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; +SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; +SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; +SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; +SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; +SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St '; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; +SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St '; + +-- Now check the results from bitmap indexscan SET enable_seqscan = OFF; SET enable_indexscan = OFF; SET enable_bitmapscan = ON; @@ -242,6 +422,114 @@ EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; +SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; + +EXPLAIN (COSTS OFF) +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; +SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; + +EXPLAIN 
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+
 RESET enable_seqscan;
 RESET enable_indexscan;
 RESET enable_bitmapscan;
diff --git a/src/test/regress/sql/opr_sanity.sql b/src/test/regress/sql/opr_sanity.sql
index 6a79ea180c1..e29148fd5bd 100644
--- a/src/test/regress/sql/opr_sanity.sql
+++ b/src/test/regress/sql/opr_sanity.sql
@@ -831,6 +831,19 @@ WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS p2
                  WHERE p2.amopfamily = p1.opcfamily
                    AND binary_coercible(p1.opcintype, p2.amoplefttype));
+-- Check that each operator listed in pg_amop has an associated opclass,
+-- that is one whose opcintype matches oprleft (possibly by coercion).
+-- Otherwise the operator is useless because it cannot be matched to an index.
+-- (In principle it could be useful to list such operators in multiple-datatype
+-- btree opfamilies, but in practice you'd expect there to be an opclass for
+-- every datatype the family knows about.)
+
+SELECT p1.amopfamily, p1.amopstrategy, p1.amopopr
+FROM pg_amop AS p1
+WHERE NOT EXISTS(SELECT 1 FROM pg_opclass AS p2
+                 WHERE p2.opcfamily = p1.amopfamily
+                   AND binary_coercible(p2.opcintype, p1.amoplefttype));
+
 -- Operators that are primary members of opclasses must be immutable (else
 -- it suggests that the index ordering isn't fixed). Operators that are
 -- cross-type members need only be stable, since they are just shorthands
@@ -1018,6 +1031,25 @@ WHERE p3.opfmethod = (SELECT oid FROM pg_am WHERE amname = 'hash')
     OR amproclefttype != amprocrighttype)
 ORDER BY 1;
+-- We can also check SP-GiST carefully, since the support routine signatures
+-- are independent of the datatype being indexed.
+
+SELECT p1.amprocfamily, p1.amprocnum,
+    p2.oid, p2.proname,
+    p3.opfname
+FROM pg_amproc AS p1, pg_proc AS p2, pg_opfamily AS p3
+WHERE p3.opfmethod = (SELECT oid FROM pg_am WHERE amname = 'spgist')
+    AND p1.amprocfamily = p3.oid AND p1.amproc = p2.oid AND
+    (CASE WHEN amprocnum = 1 OR amprocnum = 2 OR amprocnum = 3 OR amprocnum = 4
+          THEN prorettype != 'void'::regtype OR proretset OR pronargs != 2
+               OR proargtypes[0] != 'internal'::regtype
+               OR proargtypes[1] != 'internal'::regtype
+          WHEN amprocnum = 5
+          THEN prorettype != 'bool'::regtype OR proretset OR pronargs != 2
+               OR proargtypes[0] != 'internal'::regtype
+               OR proargtypes[1] != 'internal'::regtype
+          ELSE true END);
+
 -- Support routines that are primary members of opfamilies must be immutable
 -- (else it suggests that the index ordering isn't fixed). But cross-type
 -- members need only be stable, since they are just shorthands
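
The pg_amop/pg_opclass cross-check added in the first opr_sanity.sql hunk above reports any operator whose left input type has no matching opclass in the same family. As an illustrative sketch only (not part of the patch, and assuming a server built with SP-GiST installed), the pairing it expects can be browsed from the other direction, listing each SP-GiST operator class together with the operators its family provides:

-- Illustrative sketch only (not part of the patch): list SP-GiST opclasses
-- and the operators exposed by their families.
SELECT opc.opcname,
       opc.opcintype::regtype AS indexed_type,
       amop.amopstrategy,
       amop.amopopr::regoperator AS operator
FROM pg_opclass opc
JOIN pg_am am ON am.oid = opc.opcmethod
JOIN pg_amop amop ON amop.amopfamily = opc.opcfamily
WHERE am.amname = 'spgist'
ORDER BY opc.opcname, amop.amopstrategy;

Any pg_amop row whose amoplefttype cannot be matched (via binary coercion) to some opclass's opcintype in the same family would instead show up in the sanity query's output.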
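
The second opr_sanity.sql hunk encodes the SP-GiST support-routine contract: each opfamily is expected to supply support procedures numbered 1 through 5, all taking two internal arguments, with numbers 1-4 returning void and number 5 returning bool (by convention these are the config, choose, picksplit, inner_consistent, and leaf_consistent callbacks, though the check only verifies signatures). The query below is a hedged sketch for inspecting those catalog entries; it is not part of the patch and assumes SP-GiST is present.

-- Illustrative sketch only (not part of the patch): show each SP-GiST
-- opfamily's support procedures and their signatures.
SELECT opf.opfname,
       ap.amprocnum,
       ap.amproc::regprocedure AS support_proc
FROM pg_amproc ap
JOIN pg_opfamily opf ON opf.oid = ap.amprocfamily
JOIN pg_am am ON am.oid = opf.opfmethod
WHERE am.amname = 'spgist'
ORDER BY opf.opfname, ap.amprocnum;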