Diffstat (limited to 'src')
-rw-r--r--  src/backend/access/Makefile                  |    2
-rw-r--r--  src/backend/access/common/reloptions.c       |    9
-rw-r--r--  src/backend/access/spgist/Makefile           |   19
-rw-r--r--  src/backend/access/spgist/README             |  316
-rw-r--r--  src/backend/access/spgist/spgdoinsert.c      | 2065
-rw-r--r--  src/backend/access/spgist/spginsert.c        |  219
-rw-r--r--  src/backend/access/spgist/spgkdtreeproc.c    |  298
-rw-r--r--  src/backend/access/spgist/spgquadtreeproc.c  |  360
-rw-r--r--  src/backend/access/spgist/spgscan.c          |  543
-rw-r--r--  src/backend/access/spgist/spgtextproc.c      |  594
-rw-r--r--  src/backend/access/spgist/spgutils.c         |  850
-rw-r--r--  src/backend/access/spgist/spgvacuum.c        |  755
-rw-r--r--  src/backend/access/spgist/spgxlog.c          | 1070
-rw-r--r--  src/backend/access/transam/rmgr.c            |    4
-rw-r--r--  src/backend/utils/adt/selfuncs.c             |   20
-rw-r--r--  src/include/access/gin_private.h             |    4
-rw-r--r--  src/include/access/reloptions.h              |    3
-rw-r--r--  src/include/access/rmgr.h                    |    4
-rw-r--r--  src/include/access/spgist.h                  |  199
-rw-r--r--  src/include/access/spgist_private.h          |  609
-rw-r--r--  src/include/catalog/catversion.h             |    2
-rw-r--r--  src/include/catalog/pg_am.h                  |   11
-rw-r--r--  src/include/catalog/pg_amop.h                |   33
-rw-r--r--  src/include/catalog/pg_amproc.h              |   18
-rw-r--r--  src/include/catalog/pg_opclass.h             |    3
-rw-r--r--  src/include/catalog/pg_opfamily.h            |    3
-rw-r--r--  src/include/catalog/pg_proc.h                |   62
-rw-r--r--  src/include/utils/builtins.h                 |   20
-rw-r--r--  src/include/utils/selfuncs.h                 |    1
-rw-r--r--  src/test/regress/expected/create_index.out   | 1068
-rw-r--r--  src/test/regress/expected/opr_sanity.out     |   56
-rw-r--r--  src/test/regress/expected/sanity_check.out   |    5
-rw-r--r--  src/test/regress/output/misc.source          |    5
-rw-r--r--  src/test/regress/sql/create_index.sql        |  290
-rw-r--r--  src/test/regress/sql/opr_sanity.sql          |   32
35 files changed, 9503 insertions(+), 49 deletions(-)
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile
index a4c4ca7da94..0366d59624e 100644
--- a/src/backend/access/Makefile
+++ b/src/backend/access/Makefile
@@ -8,6 +8,6 @@ subdir = src/backend/access
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = common gist hash heap index nbtree transam gin
+SUBDIRS = common gist hash heap index nbtree transam gin spgist
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 240e178b3b4..100172fa4ac 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -19,6 +19,7 @@
#include "access/hash.h"
#include "access/nbtree.h"
#include "access/reloptions.h"
+#include "access/spgist.h"
#include "catalog/pg_type.h"
#include "commands/defrem.h"
#include "commands/tablespace.h"
@@ -106,6 +107,14 @@ static relopt_int intRelOpts[] =
},
{
{
+ "fillfactor",
+ "Packs spgist index pages only to this percentage",
+ RELOPT_KIND_SPGIST
+ },
+ SPGIST_DEFAULT_FILLFACTOR, SPGIST_MIN_FILLFACTOR, 100
+ },
+ {
+ {
"autovacuum_vacuum_threshold",
"Minimum number of tuple updates or deletes prior to vacuum",
RELOPT_KIND_HEAP | RELOPT_KIND_TOAST
diff --git a/src/backend/access/spgist/Makefile b/src/backend/access/spgist/Makefile
new file mode 100644
index 00000000000..918da1fccaf
--- /dev/null
+++ b/src/backend/access/spgist/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/spgist
+#
+# IDENTIFICATION
+# src/backend/access/spgist/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/spgist
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = spgutils.o spginsert.o spgscan.o spgvacuum.o \
+ spgdoinsert.o spgxlog.o \
+ spgtextproc.o spgquadtreeproc.o spgkdtreeproc.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README
new file mode 100644
index 00000000000..4ff0e357cb4
--- /dev/null
+++ b/src/backend/access/spgist/README
@@ -0,0 +1,316 @@
+src/backend/access/spgist/README
+
+SP-GiST is an abbreviation of space-partitioned GiST. It provides a
+generalized infrastructure for implementing space-partitioned data
+structures, such as quadtrees, k-d trees, and suffix trees (tries). When
+implemented in main memory, these structures are usually designed as a set of
+dynamically-allocated nodes linked by pointers. This is not suitable for
+direct storage on disk, since the chains of pointers can be rather long and
+require too many disk accesses. In contrast, disk-based data structures
+should have a high fanout to minimize I/O. The challenge is to map tree
+nodes to disk pages in such a way that the search algorithm accesses only a
+few disk pages, even if it traverses many nodes.
+
+COMMON STRUCTURE DESCRIPTION
+
+Logically, an SP-GiST tree is a set of tuples, each of which can be either
+an inner or leaf tuple. Each inner tuple contains "nodes", which are
+(label,pointer) pairs, where the pointer (ItemPointerData) is a pointer to
+another inner tuple or to the head of a list of leaf tuples. Inner tuples
+can have different numbers of nodes (children). Branches can be of different
+depths (in fact, there is no control or code to support balancing), which
+means that the tree is unbalanced. However, leaf and inner tuples cannot
+be intermixed at the same level: a downlink from a node of an inner tuple
+leads either to one inner tuple, or to a list of leaf tuples.
+
+The SP-GiST core requires that inner and leaf tuples fit on a single index
+page, and even more stringently that the list of leaf tuples reached from a
+single inner-tuple node all be stored on the same index page. (Restricting
+such lists to not cross pages reduces seeks, and allows the list links to be
+stored as simple 2-byte OffsetNumbers.) SP-GiST index opclasses should
+therefore ensure that not too many nodes can be needed in one inner tuple,
+and that inner-tuple prefixes and leaf-node datum values not be too large.
+
+Inner and leaf tuples are stored separately: the former are stored only on
+"inner" pages, the latter only on "leaf" pages. Also, there are special
+restrictions on the root page. Early in an index's life, when there is only
+one page's worth of data, the root page contains an unorganized set of leaf
+tuples. After the first page split has occurred, the root is required to
+contain exactly one inner tuple.
+
+When the search traversal algorithm reaches an inner tuple, it chooses a set
+of nodes through which to continue the traversal in depth. If it reaches a
+leaf page, it scans the list of leaf tuples to find the ones that match the
+query.
+
+The insertion algorithm descends the tree similarly, except it must choose
+just one node to descend to from each inner tuple. Insertion might also have
+to modify the inner tuple before it can descend: it could add a new node, or
+it could "split" the tuple to obtain a less-specific prefix that can match
+the value to be inserted. If it's necessary to append a new leaf tuple to a
+list and there is no free space on the page, then SP-GiST creates a new inner
+tuple and distributes the leaf tuples into a set of lists on, perhaps, several
+pages.
+
+Inner tuple consists of:
+
+ optional prefix value - all successors must be consistent with it.
+ Example:
+ suffix tree - prefix value is a common prefix string
+ quad tree - centroid
+ k-d tree - one coordinate
+
+ list of nodes, where node is a (label, pointer) pair.
+ Example of a label: a single character for suffix tree
+
+Leaf tuple consists of:
+
+ a leaf value
+ Example:
+ suffix tree - the rest of string (postfix)
+ quad and k-d tree - the point itself
+
+ ItemPointer to the heap
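+
+To make this concrete, here is a minimal sketch of what the two kinds of
+tuple logically contain. The struct and field names below are illustrative
+only; the actual on-disk layouts are defined in
+src/include/access/spgist_private.h, added elsewhere in this patch.
+
+    /* Illustrative sketch, not the real on-disk structures; assumes
+     * postgres.h and storage/itemptr.h for Datum and ItemPointerData. */
+    typedef struct SketchNode
+    {
+        Datum           label;      /* e.g. one character in a suffix tree */
+        ItemPointerData downlink;   /* child inner tuple, or head of a
+                                     * same-page list of leaf tuples */
+    } SketchNode;
+
+    typedef struct SketchInnerTuple
+    {
+        Datum      prefix;          /* optional, e.g. common prefix string */
+        int        nNodes;          /* number of (label, pointer) nodes */
+        SketchNode nodes[1];        /* actually variable-length */
+    } SketchInnerTuple;
+
+    typedef struct SketchLeafTuple
+    {
+        ItemPointerData heapPtr;    /* points to the indexed heap tuple */
+        Datum           value;      /* e.g. the remaining string (postfix) */
+    } SketchLeafTuple;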
+
+INSERTION ALGORITHM
+
+The insertion algorithm is designed to keep the tree in a consistent state
+at all times. Here is a simplified specification of the insertion algorithm
+(numbers refer to notes below):
+
+ Start with the first tuple on the root page (1)
+
+ loop:
+ if (page is leaf) then
+ if (enough space)
+ insert on page and exit (5)
+ else (7)
+ call PickSplitFn() (2)
+ end if
+ else
+ switch (chooseFn())
+ case MatchNode - descend through selected node
+ case AddNode - add node and then retry chooseFn (3, 6)
+ case SplitTuple - split inner tuple to prefix and postfix, then
+ retry chooseFn with the prefix tuple (4, 6)
+ end if
+
+Notes:
+
+(1) Initially, we just dump leaf tuples into the root page until it is full;
+then we split it. Once the root is not a leaf page, it can have only one
+inner tuple, so as to keep the amount of free space on the root as large as
+possible. Both of these rules are meant to postpone doing PickSplit on the
+root for as long as possible, so that the topmost partitioning of the search
+space is as good as we can easily make it.
+
+(2) The current implementation allows doing a picksplit and inserting the new
+leaf tuple in one operation, if the new list of leaf tuples fits on one page.
+That is always possible for trees with small nodes, such as quad trees or k-d
+trees, but suffix trees may require another picksplit.
+
+(3) Adding a node must keep the size of the inner tuple small enough to fit
+on a page. Even so, after the addition the inner tuple could become too large
+to be stored on the current page because of the other tuples already on that
+page. In that case it will be moved to another inner page (see notes about
+page management). When moving a tuple to another page, we can't change the
+offset numbers of the other tuples on the page, else we'd make downlink
+pointers to them invalid. To prevent that, SP-GiST leaves a "placeholder"
+tuple, which can be reused later whenever another tuple is added to the page.
+See also the Concurrency and Vacuum sections below. Right now only suffix
+trees can add a node to an existing tuple; quad trees and k-d trees create
+all possible nodes at once in the PickSplitFn() call.
+
+(4) The prefix value might only partially match a new value, so the SplitTuple
+action allows breaking the current tree branch into upper and lower sections.
+Another way to say it is that we can split the current inner tuple into
+"prefix" and "postfix" parts, where the prefix part is able to match the
+incoming new value. Consider an example of insertion into a suffix tree. We
+use the following notation, where a tuple's id is just for discussion (no such
+id is actually stored):
+
+inner tuple: {tuple id}(prefix string)[ comma separated list of node labels ]
+leaf tuple: {tuple id}<value>
+
+Suppose we need to insert string 'www.gogo.com' into inner tuple
+
+ {1}(www.google.com/)[a, i]
+
+The string does not match the prefix so we cannot descend. We must
+split the inner tuple into two tuples:
+
+ {2}(www.go)[o] - prefix tuple
+ |
+ {3}(gle.com/)[a,i] - postfix tuple
+
+On the next iteration of the loop, we find that 'www.gogo.com' matches the
+prefix, but not any node label, so we add a node [g] to tuple {2}:
+
+ NIL (no child exists yet)
+ |
+ {2}(www.go)[o, g]
+ |
+ {3}(gle.com/)[a,i]
+
+Now we can descend through the [g] node, which will cause us to update
+the target string to just 'o.com'. Finally, we'll insert a leaf tuple
+bearing that string:
+
+ {4}<o.com>
+ |
+ {2}(www.go)[o, g]
+ |
+ {3}(gle.com/)[a,i]
+
+As we can see, the original tuple's node array moves to the postfix tuple
+without any changes. Note also that the SP-GiST core assumes that the prefix
+tuple is not larger than the old inner tuple. That allows us to store the
+prefix tuple directly in place of the old inner tuple. The SP-GiST core will
+try to store the postfix tuple on the same page if possible, but will use
+another page if there is not enough free space (see notes 5 and 6). Currently,
+quad and k-d trees don't use this feature, because they have no concept of a
+prefix being "inconsistent" with any new value. They grow their depth only
+through PickSplitFn() calls.
+
+(5) If the pointer in the parent's node is a NIL pointer, the algorithm
+chooses a leaf page to store the new tuple on. At first, it tries to use the
+last-used leaf page with the largest free space (which we track in each
+backend) to better utilize disk space. If that's not large enough, the
+algorithm allocates a new page.
+
+(6) Management of inner pages is very similar to management of leaf pages,
+described in (5).
+
+(7) Actually, the current implementation can move the whole list of leaf
+tuples plus the new tuple to another page, if the list is short enough. This
+improves space utilization, but doesn't change the basis of the algorithm.
+
+CONCURRENCY
+
+While descending the tree, the insertion algorithm holds exclusive lock on
+two tree levels at a time, ie both parent and child pages (parent and child
+pages can be the same, see notes above). There is a possibility of deadlock
+between two insertions if there are cross-referenced pages in different
+branches. That is, if inner tuple on page M has a child on page N while
+an inner tuple from another branch is on page N and has a child on page M,
+then two insertions descending the two branches could deadlock. To prevent
+deadlocks we introduce a concept of "triple parity" of pages: if an inner
+tuple is on a page with BlockNumber N, then its child tuples should be placed
+on the same page, or else on a page with BlockNumber M where
+(N+1) mod 3 == M mod 3.
+This rule guarantees that tuples on page M will have no children on page N,
+since (M+1) mod 3 != N mod 3.
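+
+As an illustration, the parity rule amounts to a check like the following
+(a sketch only; the actual code uses the GBUF_INNER_PARITY() macro seen in
+spgdoinsert.c below, not this hypothetical helper):
+
+    /* May a child of an inner tuple on page parentBlkno be placed on page
+     * childBlkno without creating the deadlock hazard described above? */
+    static bool
+    child_parity_ok(BlockNumber parentBlkno, BlockNumber childBlkno)
+    {
+        return childBlkno == parentBlkno ||
+               (parentBlkno + 1) % 3 == childBlkno % 3;
+    }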
+
+Insertion may also need to take locks on an additional inner and/or leaf page
+to add tuples of the right type(s), when there's not enough room on the pages
+it descended through. However, we don't care exactly which such page we add
+to, so deadlocks can be avoided by conditionally locking the additional
+buffers: if we fail to get lock on an additional page, just try another one.
+
+The search traversal algorithm is rather traditional. At each non-leaf level,
+it share-locks the page, identifies which node(s) in the current inner tuple
+need to be visited, and puts those addresses on a stack of pages to examine
+later. It then releases lock on the current buffer before visiting the next
+stack item. So only one page is locked at a time, and no deadlock is
+possible. But instead, we have to worry about race conditions: by the time
+we arrive at a pointed-to page, a concurrent insertion could have replaced
+the target inner tuple (or leaf tuple chain) with data placed elsewhere.
+To handle that, whenever the insertion algorithm changes a nonempty downlink
+in an inner tuple, it places a "redirect tuple" in place of the lower-level
+inner tuple or leaf-tuple chain head that the link formerly led to. Scans
+(though not insertions) must be prepared to honor such redirects. Only a
+scan that had already visited the parent level could possibly reach such a
+redirect tuple, so we can remove redirects once all active transactions have
+been flushed out of the system.
+
+DEAD TUPLES
+
+Tuples on leaf pages can be in one of four states:
+
+SPGIST_LIVE: normal, live pointer to a heap tuple.
+
+SPGIST_REDIRECT: placeholder that contains a link to another place in the
+index. When a chain of leaf tuples has to be moved to another page, a
+redirect tuple is inserted in place of the chain's head tuple. The parent
+inner tuple's downlink is updated when this happens, but concurrent scans
+might be "in flight" from the parent page to the child page (since they
+release lock on the parent page before attempting to lock the child).
+The redirect pointer serves to tell such a scan where to go. A redirect
+pointer is only needed for as long as such concurrent scans could be in
+progress. Eventually, it's converted into a PLACEHOLDER dead tuple by
+VACUUM, and is then a candidate for replacement. Searches that find such
+a tuple (which should never be part of a chain) should immediately proceed
+to the other place, forgetting about the redirect tuple. Insertions that
+reach such a tuple should raise error, since a valid downlink should never
+point to such a tuple.
+
+SPGIST_DEAD: tuple is dead, but it cannot be removed or moved to a
+different offset on the page because there is a link leading to it from
+some inner tuple elsewhere in the index. (Such a tuple is never part of a
+chain, since we don't need one unless there is nothing live left in its
+chain.) Searches should ignore such entries. If an insertion action
+arrives at such a tuple, it should either replace it in-place (if there's
+room on the page to hold the desired new leaf tuple) or replace it with a
+redirection pointer to wherever it puts the new leaf tuple.
+
+SPGIST_PLACEHOLDER: tuple is dead, and there are known to be no links to
+it from elsewhere. When a live tuple is deleted or moved away, and not
+replaced by a redirect pointer, it is replaced by a placeholder to keep
+the offsets of later tuples on the same page from changing. Placeholders
+can be freely replaced when adding a new tuple to the page, and also
+VACUUM will delete any that are at the end of the range of valid tuple
+offsets. Both searches and insertions should complain if a link from
+elsewhere leads them to a placeholder tuple.
+
+When the root page is also a leaf, all its tuples should be in LIVE state;
+there's no need for the other states, since there are no links leading to the
+root's tuples and no need to preserve their offset numbers.
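+
+As a rough sketch of how a scan reacts to each state when it examines a leaf
+tuple (illustrative only; the real logic lives in spgscan.c, which is not
+shown in this excerpt):
+
+    switch (leafTuple->tupstate)
+    {
+        case SPGIST_LIVE:
+            /* test against the query; on a match, report the heap TID */
+            break;
+        case SPGIST_REDIRECT:
+            /* the chain was moved; follow the stored pointer instead */
+            break;
+        case SPGIST_DEAD:
+            /* nothing live is left in this chain; ignore it */
+            break;
+        case SPGIST_PLACEHOLDER:
+            /* a valid link should never lead here */
+            elog(ERROR, "unexpected SPGiST tuple state: %d",
+                 leafTuple->tupstate);
+            break;
+    }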
+
+Tuples on inner pages can be in LIVE, REDIRECT, or PLACEHOLDER states.
+The REDIRECT state has the same function as on leaf pages, to send
+concurrent searches to the place where they need to go after an inner
+tuple is moved to another page. Expired REDIRECT pointers are converted
+to PLACEHOLDER status by VACUUM, and are then candidates for replacement.
+DEAD state is not currently possible, since VACUUM does not attempt to
+remove unused inner tuples.
+
+VACUUM
+
+VACUUM (or more precisely, spgbulkdelete) performs a single sequential scan
+over the entire index. On both leaf and inner pages, we can convert old
+REDIRECT tuples into PLACEHOLDER status, and then remove any PLACEHOLDERs
+that are at the end of the page (since they aren't needed to preserve the
+offsets of any live tuples). On leaf pages, we scan for tuples that need
+to be deleted because their heap TIDs match a vacuum target TID.
+
+If we find a deletable tuple that is not at the head of its chain, we
+can simply replace it with a PLACEHOLDER, updating the chain links to
+remove it from the chain. If it is at the head of its chain, but there's
+at least one live tuple remaining in the chain, we move that live tuple to
+the head tuple's offset, putting a PLACEHOLDER at its old position to preserve
+the offsets of the other tuples. This keeps the parent inner tuple's downlink
+valid. If we find ourselves deleting all live tuples in a chain, we
+replace the head tuple with a DEAD tuple and the rest with PLACEHOLDERS.
+The parent inner tuple's downlink thus points to the DEAD tuple, and the
+rules explained in the previous section keep everything working.
+
+VACUUM doesn't know a priori which tuples are heads of their chains, but
+it can easily figure that out by constructing a predecessor array that's
+the reverse map of the nextOffset links (ie, when we see that tuple x links to
+tuple y, we set predecessor[y] = x). Then head tuples are the ones with
+no predecessor.
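+
+A sketch of that computation over one leaf page (illustrative; the variable
+names are hypothetical and the real implementation is in spgvacuum.c, which
+is not shown in this excerpt):
+
+    OffsetNumber max = PageGetMaxOffsetNumber(page);
+    OffsetNumber predecessor[MaxOffsetNumber + 1] = {0};   /* 0 == none */
+    OffsetNumber i;
+
+    for (i = FirstOffsetNumber; i <= max; i++)
+    {
+        SpGistLeafTuple lt = (SpGistLeafTuple)
+            PageGetItem(page, PageGetItemId(page, i));
+
+        if (lt->tupstate == SPGIST_LIVE &&
+            lt->nextOffset != InvalidOffsetNumber)
+            predecessor[lt->nextOffset] = i;
+    }
+    /* now offset j heads a chain iff predecessor[j] == InvalidOffsetNumber */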
+
+spgbulkdelete also updates the index's free space map.
+
+Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was
+performed; otherwise, it does an spgbulkdelete scan with an empty target
+list, so as to clean up redirections and placeholders, update the free
+space map, and gather statistics.
+
+LAST USED PAGE MANAGEMENT
+
+The list of last-used pages contains four pages: a leaf page and three inner
+pages, one from each "triple parity" group. The list is stored between calls
+on the index metapage, but updates are never WAL-logged, to decrease WAL
+traffic. Incorrect data on the metapage isn't critical, because we can always
+just allocate a new page instead.
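+
+A minimal sketch of such a cache structure (the type and field names here
+are hypothetical; the real declarations live with the other SP-GiST private
+definitions and are not shown in this excerpt):
+
+    typedef struct SketchLastUsedPage
+    {
+        BlockNumber blkno;      /* page number, or InvalidBlockNumber */
+        int         freeSpace;  /* free space as last observed, in bytes */
+    } SketchLastUsedPage;
+
+    typedef struct SketchLastUsedPages
+    {
+        SketchLastUsedPage leafPage;       /* last-used leaf page */
+        SketchLastUsedPage innerPage[3];   /* one per triple-parity group */
+    } SketchLastUsedPages;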
+
+AUTHORS
+
+ Teodor Sigaev <teodor@sigaev.ru>
+ Oleg Bartunov <oleg@sai.msu.su>
diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c
new file mode 100644
index 00000000000..4bb8dfa1509
--- /dev/null
+++ b/src/backend/access/spgist/spgdoinsert.c
@@ -0,0 +1,2065 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgdoinsert.c
+ * implementation of insert algorithm
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgdoinsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/spgist_private.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+
+
+/*
+ * SPPageDesc tracks all info about a page we are inserting into. In some
+ * situations it actually identifies a tuple, or even a specific node within
+ * an inner tuple. But any of the fields can be invalid. If the buffer
+ * field is valid, it implies we hold pin and exclusive lock on that buffer.
+ * page pointer should be valid exactly when buffer is.
+ */
+typedef struct SPPageDesc
+{
+ BlockNumber blkno; /* block number, or InvalidBlockNumber */
+ Buffer buffer; /* page's buffer number, or InvalidBuffer */
+ Page page; /* pointer to page buffer, or NULL */
+ OffsetNumber offnum; /* offset of tuple, or InvalidOffsetNumber */
+ int node; /* node number within inner tuple, or -1 */
+} SPPageDesc;
+
+
+/*
+ * Set the item pointer in the nodeN'th entry in inner tuple tup. This
+ * is used to update the parent inner tuple's downlink after a move or
+ * split operation.
+ */
+void
+updateNodeLink(SpGistInnerTuple tup, int nodeN,
+ BlockNumber blkno, OffsetNumber offset)
+{
+ int i;
+ SpGistNodeTuple node;
+
+ SGITITERATE(tup, i, node)
+ {
+ if (i == nodeN)
+ {
+ ItemPointerSet(&node->t_tid, blkno, offset);
+ return;
+ }
+ }
+
+ elog(ERROR, "failed to find requested node %d in SPGiST inner tuple",
+ nodeN);
+}
+
+/*
+ * Form a new inner tuple containing one more node than the given one, with
+ * the specified label datum, inserted at offset "offset" in the node array.
+ * The new tuple's prefix is the same as the old one's.
+ *
+ * Note that the new node initially has an invalid downlink. We'll find a
+ * page to point it to later.
+ */
+static SpGistInnerTuple
+addNode(SpGistState *state, SpGistInnerTuple tuple, Datum label, int offset)
+{
+ SpGistNodeTuple node,
+ *nodes;
+ int i;
+
+ /* if offset is negative, insert at end */
+ if (offset < 0)
+ offset = tuple->nNodes;
+ else if (offset > tuple->nNodes)
+ elog(ERROR, "invalid offset for adding node to SPGiST inner tuple");
+
+ nodes = palloc(sizeof(SpGistNodeTuple) * (tuple->nNodes + 1));
+ SGITITERATE(tuple, i, node)
+ {
+ if (i < offset)
+ nodes[i] = node;
+ else
+ nodes[i + 1] = node;
+ }
+
+ nodes[offset] = spgFormNodeTuple(state, label, false);
+
+ return spgFormInnerTuple(state,
+ (tuple->prefixSize > 0),
+ SGITDATUM(tuple, state),
+ tuple->nNodes + 1,
+ nodes);
+}
+
+/* qsort comparator for sorting OffsetNumbers */
+static int
+cmpOffsetNumbers(const void *a, const void *b)
+{
+ if (*(const OffsetNumber *) a == *(const OffsetNumber *) b)
+ return 0;
+ return (*(const OffsetNumber *) a > *(const OffsetNumber *) b) ? 1 : -1;
+}
+
+/*
+ * Delete multiple tuples from an index page, preserving tuple offset numbers.
+ *
+ * The first tuple in the given list is replaced with a dead tuple of type
+ * "firststate" (REDIRECT/DEAD/PLACEHOLDER); the remaining tuples are replaced
+ * with dead tuples of type "reststate". If either firststate or reststate
+ * is REDIRECT, blkno/offnum specify where to link to.
+ *
+ * NB: this is used during WAL replay, so beware of trying to make it too
+ * smart. In particular, it shouldn't use "state" except for calling
+ * spgFormDeadTuple().
+ */
+void
+spgPageIndexMultiDelete(SpGistState *state, Page page,
+ OffsetNumber *itemnos, int nitems,
+ int firststate, int reststate,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ OffsetNumber firstItem;
+ OffsetNumber *sortednos;
+ SpGistDeadTuple tuple = NULL;
+ int i;
+
+ if (nitems == 0)
+ return; /* nothing to do */
+
+ /*
+ * For efficiency we want to use PageIndexMultiDelete, which requires the
+ * targets to be listed in sorted order, so we have to sort the itemnos
+ * array. (This also greatly simplifies the math for reinserting the
+ * replacement tuples.) However, we must not scribble on the caller's
+ * array, so we have to make a copy.
+ */
+ sortednos = (OffsetNumber *) palloc(sizeof(OffsetNumber) * nitems);
+ memcpy(sortednos, itemnos, sizeof(OffsetNumber) * nitems);
+ if (nitems > 1)
+ qsort(sortednos, nitems, sizeof(OffsetNumber), cmpOffsetNumbers);
+
+ PageIndexMultiDelete(page, sortednos, nitems);
+
+ firstItem = itemnos[0];
+
+ for (i = 0; i < nitems; i++)
+ {
+ OffsetNumber itemno = sortednos[i];
+ int tupstate;
+
+ tupstate = (itemno == firstItem) ? firststate : reststate;
+ if (tuple == NULL || tuple->tupstate != tupstate)
+ tuple = spgFormDeadTuple(state, tupstate, blkno, offnum);
+
+ if (PageAddItem(page, (Item) tuple, tuple->size,
+ itemno, false, false) != itemno)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ tuple->size);
+
+ if (tupstate == SPGIST_REDIRECT)
+ SpGistPageGetOpaque(page)->nRedirection++;
+ else if (tupstate == SPGIST_PLACEHOLDER)
+ SpGistPageGetOpaque(page)->nPlaceholder++;
+ }
+
+ pfree(sortednos);
+}
+
+/*
+ * Update the parent inner tuple's downlink, and mark the parent buffer
+ * dirty (this must be the last change to the parent page in the current
+ * WAL action).
+ */
+static void
+saveNodeLink(Relation index, SPPageDesc *parent,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ SpGistInnerTuple innerTuple;
+
+ innerTuple = (SpGistInnerTuple) PageGetItem(parent->page,
+ PageGetItemId(parent->page, parent->offnum));
+
+ updateNodeLink(innerTuple, parent->node, blkno, offnum);
+
+ MarkBufferDirty(parent->buffer);
+}
+
+/*
+ * Add a leaf tuple to a leaf page where there is known to be room for it
+ */
+static void
+addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple,
+ SPPageDesc *current, SPPageDesc *parent, bool isNew)
+{
+ XLogRecData rdata[4];
+ spgxlogAddLeaf xlrec;
+
+ xlrec.node = index->rd_node;
+ xlrec.blknoLeaf = current->blkno;
+ xlrec.newPage = isNew;
+
+ /* these will be filled below as needed */
+ xlrec.offnumLeaf = InvalidOffsetNumber;
+ xlrec.offnumHeadLeaf = InvalidOffsetNumber;
+ xlrec.blknoParent = InvalidBlockNumber;
+ xlrec.offnumParent = InvalidOffsetNumber;
+ xlrec.nodeI = 0;
+
+ ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0);
+ /* we assume sizeof(xlrec) is at least int-aligned */
+ ACCEPT_RDATA_DATA(leafTuple, leafTuple->size, 1);
+ ACCEPT_RDATA_BUFFER(current->buffer, 2);
+
+ START_CRIT_SECTION();
+
+ if (current->offnum == InvalidOffsetNumber ||
+ current->blkno == SPGIST_HEAD_BLKNO)
+ {
+ /* Tuple is not part of a chain */
+ leafTuple->nextOffset = InvalidOffsetNumber;
+ current->offnum = SpGistPageAddNewItem(state, current->page,
+ (Item) leafTuple, leafTuple->size,
+ NULL, false);
+
+ xlrec.offnumLeaf = current->offnum;
+
+ /* Must update parent's downlink if any */
+ if (parent->buffer != InvalidBuffer)
+ {
+ xlrec.blknoParent = parent->blkno;
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ ACCEPT_RDATA_BUFFER(parent->buffer, 3);
+ }
+ }
+ else
+ {
+ /*
+ * Tuple must be inserted into existing chain. We mustn't change
+ * the chain's head address, but we don't need to chase the entire
+ * chain to put the tuple at the end; we can insert it second.
+ *
+ * Also, it's possible that the "chain" consists only of a DEAD tuple,
+ * in which case we should replace the DEAD tuple in-place.
+ */
+ SpGistLeafTuple head;
+ OffsetNumber offnum;
+
+ head = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, current->offnum));
+ if (head->tupstate == SPGIST_LIVE)
+ {
+ leafTuple->nextOffset = head->nextOffset;
+ offnum = SpGistPageAddNewItem(state, current->page,
+ (Item) leafTuple, leafTuple->size,
+ NULL, false);
+
+ /*
+ * re-get head of list because it could have been moved on page,
+ * and set new second element
+ */
+ head = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, current->offnum));
+ head->nextOffset = offnum;
+
+ xlrec.offnumLeaf = offnum;
+ xlrec.offnumHeadLeaf = current->offnum;
+ }
+ else if (head->tupstate == SPGIST_DEAD)
+ {
+ leafTuple->nextOffset = InvalidOffsetNumber;
+ PageIndexTupleDelete(current->page, current->offnum);
+ if (PageAddItem(current->page,
+ (Item) leafTuple, leafTuple->size,
+ current->offnum, false, false) != current->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ leafTuple->size);
+
+ /* WAL replay distinguishes this case by equal offnums */
+ xlrec.offnumLeaf = current->offnum;
+ xlrec.offnumHeadLeaf = current->offnum;
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", head->tupstate);
+ }
+
+ MarkBufferDirty(current->buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF, rdata);
+
+ PageSetLSN(current->page, recptr);
+ PageSetTLI(current->page, ThisTimeLineID);
+
+ /* update parent only if we actually changed it */
+ if (xlrec.blknoParent != InvalidBlockNumber)
+ {
+ PageSetLSN(parent->page, recptr);
+ PageSetTLI(parent->page, ThisTimeLineID);
+ }
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Count the number and total size of leaf tuples in the chain starting at
+ * current->offnum. Return number into *nToSplit and total size as function
+ * result.
+ *
+ * Klugy special case when considering the root page (i.e., root is a leaf
+ * page, but we're about to split for the first time): return fake large
+ * values to force spgdoinsert() to take the doPickSplit rather than
+ * moveLeafs code path. moveLeafs is not prepared to deal with root page.
+ */
+static int
+checkSplitConditions(Relation index, SpGistState *state,
+ SPPageDesc *current, int *nToSplit)
+{
+ int i,
+ n = 0,
+ totalSize = 0;
+
+ if (current->blkno == SPGIST_HEAD_BLKNO)
+ {
+ /* return impossible values to force split */
+ *nToSplit = BLCKSZ;
+ return BLCKSZ;
+ }
+
+ i = current->offnum;
+ while (i != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple it;
+
+ Assert(i >= FirstOffsetNumber &&
+ i <= PageGetMaxOffsetNumber(current->page));
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ n++;
+ totalSize += it->size + sizeof(ItemIdData);
+ }
+ else if (it->tupstate == SPGIST_DEAD)
+ {
+ /* We could see a DEAD tuple as first/only chain item */
+ Assert(i == current->offnum);
+ Assert(it->nextOffset == InvalidOffsetNumber);
+ /* Don't count it in result, because it won't go to other page */
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+
+ i = it->nextOffset;
+ }
+
+ *nToSplit = n;
+
+ return totalSize;
+}
+
+/*
+ * current points to a leaf-tuple chain that we wanted to add newLeafTuple to,
+ * but the chain has to be moved because there's not enough room to add
+ * newLeafTuple to its page. We use this method when the chain contains
+ * very little data so a split would be inefficient. We are sure we can
+ * fit the chain plus newLeafTuple on one other page.
+ */
+static void
+moveLeafs(Relation index, SpGistState *state,
+ SPPageDesc *current, SPPageDesc *parent,
+ SpGistLeafTuple newLeafTuple)
+{
+ int i,
+ nDelete,
+ nInsert,
+ size;
+ Buffer nbuf;
+ Page npage;
+ SpGistLeafTuple it;
+ OffsetNumber r = InvalidOffsetNumber,
+ startOffset = InvalidOffsetNumber;
+ bool replaceDead = false;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ BlockNumber nblkno;
+ XLogRecData rdata[7];
+ spgxlogMoveLeafs xlrec;
+ char *leafdata,
+ *leafptr;
+
+ /* This doesn't work on root page */
+ Assert(parent->buffer != InvalidBuffer);
+ Assert(parent->buffer != current->buffer);
+
+ /* Locate the tuples to be moved, and count up the space needed */
+ i = PageGetMaxOffsetNumber(current->page);
+ toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * i);
+ toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * (i + 1));
+
+ size = newLeafTuple->size + sizeof(ItemIdData);
+
+ nDelete = 0;
+ i = current->offnum;
+ while (i != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple it;
+
+ Assert(i >= FirstOffsetNumber &&
+ i <= PageGetMaxOffsetNumber(current->page));
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ toDelete[nDelete] = i;
+ size += it->size + sizeof(ItemIdData);
+ nDelete++;
+ }
+ else if (it->tupstate == SPGIST_DEAD)
+ {
+ /* We could see a DEAD tuple as first/only chain item */
+ Assert(i == current->offnum);
+ Assert(it->nextOffset == InvalidOffsetNumber);
+ /* We don't want to move it, so don't count it in size */
+ toDelete[nDelete] = i;
+ nDelete++;
+ replaceDead = true;
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+
+ i = it->nextOffset;
+ }
+
+ /* Find a leaf page that will hold them */
+ nbuf = SpGistGetBuffer(index, GBUF_LEAF, size, &xlrec.newPage);
+ npage = BufferGetPage(nbuf);
+ nblkno = BufferGetBlockNumber(nbuf);
+ Assert(nblkno != current->blkno);
+
+ /* prepare WAL info */
+ xlrec.node = index->rd_node;
+ STORE_STATE(state, xlrec.stateSrc);
+
+ xlrec.blknoSrc = current->blkno;
+ xlrec.blknoDst = nblkno;
+ xlrec.nMoves = nDelete;
+ xlrec.replaceDead = replaceDead;
+
+ xlrec.blknoParent = parent->blkno;
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+
+ leafdata = leafptr = palloc(size);
+
+ START_CRIT_SECTION();
+
+ /* copy all the old tuples to new page, unless they're dead */
+ nInsert = 0;
+ if (!replaceDead)
+ {
+ for (i = 0; i < nDelete; i++)
+ {
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, toDelete[i]));
+ Assert(it->tupstate == SPGIST_LIVE);
+
+ /*
+ * Update chain link (notice the chain order gets reversed, but we
+ * don't care). We're modifying the tuple on the source page
+ * here, but it's okay since we're about to delete it.
+ */
+ it->nextOffset = r;
+
+ r = SpGistPageAddNewItem(state, npage, (Item) it, it->size,
+ &startOffset, false);
+
+ toInsert[nInsert] = r;
+ nInsert++;
+
+ /* save modified tuple into leafdata as well */
+ memcpy(leafptr, it, it->size);
+ leafptr += it->size;
+ }
+ }
+
+ /* add the new tuple as well */
+ newLeafTuple->nextOffset = r;
+ r = SpGistPageAddNewItem(state, npage,
+ (Item) newLeafTuple, newLeafTuple->size,
+ &startOffset, false);
+ toInsert[nInsert] = r;
+ nInsert++;
+ memcpy(leafptr, newLeafTuple, newLeafTuple->size);
+ leafptr += newLeafTuple->size;
+
+ /*
+ * Now delete the old tuples, leaving a redirection pointer behind for
+ * the first one, unless we're doing an index build, in which case there
+ * can't be any concurrent scan and so we need not provide a redirect.
+ */
+ spgPageIndexMultiDelete(state, current->page, toDelete, nDelete,
+ state->isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ nblkno, r);
+
+ /* Update parent's downlink and mark parent page dirty */
+ saveNodeLink(index, parent, nblkno, r);
+
+ /* Mark the leaf pages too */
+ MarkBufferDirty(current->buffer);
+ MarkBufferDirty(nbuf);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ ACCEPT_RDATA_DATA(&xlrec, MAXALIGN(sizeof(xlrec)), 0);
+ ACCEPT_RDATA_DATA(toDelete, MAXALIGN(sizeof(OffsetNumber) * nDelete), 1);
+ ACCEPT_RDATA_DATA(toInsert, MAXALIGN(sizeof(OffsetNumber) * nInsert), 2);
+ ACCEPT_RDATA_DATA(leafdata, leafptr - leafdata, 3);
+ ACCEPT_RDATA_BUFFER(current->buffer, 4);
+ ACCEPT_RDATA_BUFFER(nbuf, 5);
+ ACCEPT_RDATA_BUFFER(parent->buffer, 6);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_MOVE_LEAFS, rdata);
+
+ PageSetLSN(current->page, recptr);
+ PageSetTLI(current->page, ThisTimeLineID);
+ PageSetLSN(npage, recptr);
+ PageSetTLI(npage, ThisTimeLineID);
+ PageSetLSN(parent->page, recptr);
+ PageSetTLI(parent->page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Update local free-space cache and release new buffer */
+ SpGistSetLastUsedPage(index, nbuf);
+ UnlockReleaseBuffer(nbuf);
+}
+
+/*
+ * Update previously-created redirection tuple with appropriate destination
+ *
+ * We use this when it's not convenient to know the destination first.
+ * The tuple should have been made with the "impossible" destination of
+ * the metapage.
+ */
+static void
+setRedirectionTuple(SPPageDesc *current, OffsetNumber position,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ SpGistDeadTuple dt;
+
+ dt = (SpGistDeadTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, position));
+ Assert(dt->tupstate == SPGIST_REDIRECT);
+ Assert(ItemPointerGetBlockNumber(&dt->pointer) == SPGIST_METAPAGE_BLKNO);
+ ItemPointerSet(&dt->pointer, blkno, offnum);
+}
+
+/*
+ * Test to see if the user-defined picksplit function failed to do its job,
+ * ie, it put all the leaf tuples into the same node.
+ * If so, randomly divide the tuples into several nodes (all with the same
+ * label) and return TRUE to select allTheSame mode for this inner tuple.
+ *
+ * If we know that the leaf tuples wouldn't all fit on one page, then we
+ * exclude the last tuple (which is the incoming new tuple that forced a split)
+ * from the check to see if more than one node is used. The reason for this
+ * is that if the existing tuples are put into only one chain, then even if
+ * we move them all to an empty page, there would still not be room for the
+ * new tuple, so we'd get into an infinite loop of picksplit attempts.
+ * Forcing allTheSame mode dodges this problem by ensuring the old tuples will
+ * be split across pages. (Exercise for the reader: figure out why this
+ * fixes the problem even when there is only one old tuple.)
+ */
+static bool
+checkAllTheSame(spgPickSplitIn *in, spgPickSplitOut *out, bool tooBig,
+ bool *includeNew)
+{
+ int theNode;
+ int limit;
+ int i;
+
+ /* For the moment, assume we can include the new leaf tuple */
+ *includeNew = true;
+
+ /* If there's only the new leaf tuple, don't select allTheSame mode */
+ if (in->nTuples <= 1)
+ return false;
+
+ /* If tuple set doesn't fit on one page, ignore the new tuple in test */
+ limit = tooBig ? in->nTuples - 1 : in->nTuples;
+
+ /* Check to see if more than one node is populated */
+ theNode = out->mapTuplesToNodes[0];
+ for (i = 1; i < limit; i++)
+ {
+ if (out->mapTuplesToNodes[i] != theNode)
+ return false;
+ }
+
+ /* Nope, so override the picksplit function's decisions */
+
+ /* If the new tuple is in its own node, it can't be included in split */
+ if (tooBig && out->mapTuplesToNodes[in->nTuples - 1] != theNode)
+ *includeNew = false;
+
+ out->nNodes = 8; /* arbitrary number of child nodes */
+
+ /* Random assignment of tuples to nodes (note we include new tuple) */
+ for (i = 0; i < in->nTuples; i++)
+ out->mapTuplesToNodes[i] = i % out->nNodes;
+
+ /* The opclass may not use node labels, but if it does, duplicate 'em */
+ if (out->nodeLabels)
+ {
+ Datum theLabel = out->nodeLabels[theNode];
+
+ out->nodeLabels = (Datum *) palloc(sizeof(Datum) * out->nNodes);
+ for (i = 0; i < out->nNodes; i++)
+ out->nodeLabels[i] = theLabel;
+ }
+
+ /* We don't touch the prefix or the leaf tuple datum assignments */
+
+ return true;
+}
+
+/*
+ * current points to a leaf-tuple chain that we wanted to add newLeafTuple to,
+ * but the chain has to be split because there's not enough room to add
+ * newLeafTuple to its page.
+ *
+ * This function splits the leaf tuple set according to picksplit's rules,
+ * creating one or more new chains that are spread across the current page
+ * and an additional leaf page (we assume that two leaf pages will be
+ * sufficient). A new inner tuple is created, and the parent downlink
+ * pointer is updated to point to that inner tuple instead of the leaf chain.
+ *
+ * On exit, current contains the address of the new inner tuple.
+ *
+ * Returns true if we successfully inserted newLeafTuple during this function,
+ * false if caller still has to do it (meaning another picksplit operation is
+ * probably needed). Failure could occur if the picksplit result is fairly
+ * unbalanced, or if newLeafTuple is just plain too big to fit on a page.
+ * Because we force the picksplit result to be at least two chains, each
+ * cycle will get rid of at least one leaf tuple from the chain, so the loop
+ * will eventually terminate if lack of balance is the issue. If the tuple
+ * is too big, we assume that repeated picksplit operations will eventually
+ * make it small enough by repeated prefix-stripping. A broken opclass could
+ * make this an infinite loop, though.
+ */
+static bool
+doPickSplit(Relation index, SpGistState *state,
+ SPPageDesc *current, SPPageDesc *parent,
+ SpGistLeafTuple newLeafTuple, int level, bool isNew)
+{
+ bool insertedNew = false;
+ spgPickSplitIn in;
+ spgPickSplitOut out;
+ bool includeNew;
+ int i,
+ max,
+ n;
+ SpGistInnerTuple innerTuple;
+ SpGistNodeTuple node,
+ *nodes;
+ Buffer newInnerBuffer,
+ newLeafBuffer;
+ ItemPointerData *heapPtrs;
+ uint8 *leafPageSelect;
+ int *leafSizes;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ OffsetNumber redirectTuplePos = InvalidOffsetNumber;
+ OffsetNumber startOffsets[2];
+ SpGistLeafTuple *newLeafs;
+ int spaceToDelete;
+ int currentFreeSpace;
+ int totalLeafSizes;
+ bool allTheSame;
+ XLogRecData rdata[10];
+ int nRdata;
+ spgxlogPickSplit xlrec;
+ char *leafdata,
+ *leafptr;
+ SPPageDesc saveCurrent;
+ int nToDelete,
+ nToInsert,
+ maxToInclude;
+
+ in.level = level;
+
+ /*
+ * Allocate per-leaf-tuple work arrays with max possible size
+ */
+ max = PageGetMaxOffsetNumber(current->page);
+ n = max + 1;
+ in.datums = (Datum *) palloc(sizeof(Datum) * n);
+ heapPtrs = (ItemPointerData *) palloc(sizeof(ItemPointerData) * n);
+ toDelete = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n);
+ toInsert = (OffsetNumber *) palloc(sizeof(OffsetNumber) * n);
+ newLeafs = (SpGistLeafTuple *) palloc(sizeof(SpGistLeafTuple) * n);
+ leafPageSelect = (uint8 *) palloc(sizeof(uint8) * n);
+
+ xlrec.node = index->rd_node;
+ STORE_STATE(state, xlrec.stateSrc);
+
+ /*
+ * Form list of leaf tuples which will be distributed as split result;
+ * also, count up the amount of space that will be freed from current.
+ * (Note that in the non-root case, we won't actually delete the old
+ * tuples, only replace them with redirects or placeholders.)
+ */
+ nToInsert = 0;
+ nToDelete = 0;
+ spaceToDelete = 0;
+ if (current->blkno == SPGIST_HEAD_BLKNO)
+ {
+ /*
+ * We are splitting the root (which up to now is also a leaf page).
+ * Its tuples are not linked, so scan sequentially to get them all.
+ * We ignore the original value of current->offnum.
+ */
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple it;
+
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ in.datums[nToInsert] = SGLTDATUM(it, state);
+ heapPtrs[nToInsert] = it->heapPtr;
+ nToInsert++;
+ toDelete[nToDelete] = i;
+ nToDelete++;
+ /* we will delete the tuple altogether, so count full space */
+ spaceToDelete += it->size + sizeof(ItemIdData);
+ }
+ else /* tuples on root should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+ }
+ }
+ else
+ {
+ /* Normal case, just collect the leaf tuples in the chain */
+ i = current->offnum;
+ while (i != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple it;
+
+ Assert(i >= FirstOffsetNumber && i <= max);
+ it = (SpGistLeafTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, i));
+ if (it->tupstate == SPGIST_LIVE)
+ {
+ in.datums[nToInsert] = SGLTDATUM(it, state);
+ heapPtrs[nToInsert] = it->heapPtr;
+ nToInsert++;
+ toDelete[nToDelete] = i;
+ nToDelete++;
+ /* we will not delete the tuple, only replace with dead */
+ Assert(it->size >= SGDTSIZE);
+ spaceToDelete += it->size - SGDTSIZE;
+ }
+ else if (it->tupstate == SPGIST_DEAD)
+ {
+ /* We could see a DEAD tuple as first/only chain item */
+ Assert(i == current->offnum);
+ Assert(it->nextOffset == InvalidOffsetNumber);
+ toDelete[nToDelete] = i;
+ nToDelete++;
+ /* replacing it with redirect will save no space */
+ }
+ else
+ elog(ERROR, "unexpected SPGiST tuple state: %d", it->tupstate);
+
+ i = it->nextOffset;
+ }
+ }
+ in.nTuples = nToInsert;
+
+ /*
+ * We may not actually insert the new tuple, because another picksplit may be
+ * necessary if the value is too large, but we will try to allocate enough
+ * space to include it; and in any case it has to be included in the input
+ * for the picksplit function. So don't increment nToInsert yet.
+ */
+ in.datums[in.nTuples] = SGLTDATUM(newLeafTuple, state);
+ heapPtrs[in.nTuples] = newLeafTuple->heapPtr;
+ in.nTuples++;
+
+ /*
+ * Perform split using user-defined method.
+ */
+ memset(&out, 0, sizeof(out));
+
+ FunctionCall2Coll(&state->picksplitFn,
+ index->rd_indcollation[0],
+ PointerGetDatum(&in),
+ PointerGetDatum(&out));
+
+ /*
+ * Form new leaf tuples and count up the total space needed.
+ */
+ totalLeafSizes = 0;
+ for (i = 0; i < in.nTuples; i++)
+ {
+ newLeafs[i] = spgFormLeafTuple(state, heapPtrs + i,
+ out.leafTupleDatums[i]);
+ totalLeafSizes += newLeafs[i]->size + sizeof(ItemIdData);
+ }
+
+ /*
+ * Check to see if the picksplit function failed to separate the values,
+ * ie, it put them all into the same child node. If so, select allTheSame
+ * mode and create a random split instead. See comments for
+ * checkAllTheSame as to why we need to know if the new leaf tuples could
+ * fit on one page.
+ */
+ allTheSame = checkAllTheSame(&in, &out,
+ totalLeafSizes > SPGIST_PAGE_CAPACITY,
+ &includeNew);
+
+ /*
+ * If checkAllTheSame decided we must exclude the new tuple, don't
+ * consider it any further.
+ */
+ if (includeNew)
+ maxToInclude = in.nTuples;
+ else
+ {
+ maxToInclude = in.nTuples - 1;
+ totalLeafSizes -= newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData);
+ }
+
+ /*
+ * Allocate per-node work arrays. Since checkAllTheSame could replace
+ * out.nNodes with a value larger than the number of tuples on the input
+ * page, we can't allocate these arrays before here.
+ */
+ nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * out.nNodes);
+ leafSizes = (int *) palloc0(sizeof(int) * out.nNodes);
+
+ /*
+ * Form nodes of inner tuple and inner tuple itself
+ */
+ for (i = 0; i < out.nNodes; i++)
+ {
+ Datum label = (Datum) 0;
+ bool isnull = (out.nodeLabels == NULL);
+
+ if (!isnull)
+ label = out.nodeLabels[i];
+ nodes[i] = spgFormNodeTuple(state, label, isnull);
+ }
+ innerTuple = spgFormInnerTuple(state,
+ out.hasPrefix, out.prefixDatum,
+ out.nNodes, nodes);
+ innerTuple->allTheSame = allTheSame;
+
+ /*
+ * Update nodes[] array to point into the newly formed innerTuple, so
+ * that we can adjust their downlinks below.
+ */
+ SGITITERATE(innerTuple, i, node)
+ {
+ nodes[i] = node;
+ }
+
+ /*
+ * Re-scan new leaf tuples and count up the space needed under each node.
+ */
+ for (i = 0; i < maxToInclude; i++)
+ {
+ n = out.mapTuplesToNodes[i];
+ if (n < 0 || n >= out.nNodes)
+ elog(ERROR, "inconsistent result of SPGiST picksplit function");
+ leafSizes[n] += newLeafs[i]->size + sizeof(ItemIdData);
+ }
+
+ /*
+ * To perform the split, we must insert a new inner tuple, which can't
+ * go on a leaf page; and unless we are splitting the root page, we
+ * must then update the parent tuple's downlink to point to the inner
+ * tuple. If there is room, we'll put the new inner tuple on the same
+ * page as the parent tuple, otherwise we need another non-leaf buffer.
+ * But if the parent page is the root, we can't add the new inner tuple
+ * there, because the root page must have only one inner tuple.
+ */
+ xlrec.initInner = false;
+ if (parent->buffer != InvalidBuffer &&
+ parent->blkno != SPGIST_HEAD_BLKNO &&
+ (SpGistPageGetFreeSpace(parent->page, 1) >=
+ innerTuple->size + sizeof(ItemIdData)))
+ {
+ /* New inner tuple will fit on parent page */
+ newInnerBuffer = parent->buffer;
+ }
+ else if (parent->buffer != InvalidBuffer)
+ {
+ /* Send tuple to page with next triple parity (see README) */
+ newInnerBuffer = SpGistGetBuffer(index,
+ GBUF_INNER_PARITY(parent->blkno + 1),
+ innerTuple->size + sizeof(ItemIdData),
+ &xlrec.initInner);
+ }
+ else
+ {
+ /* Root page split ... inner tuple will go to root page */
+ newInnerBuffer = InvalidBuffer;
+ }
+
+ /*----------
+ * Because a WAL record can't involve more than four buffers, we can
+ * only afford to deal with two leaf pages in each picksplit action,
+ * ie the current page and at most one other.
+ *
+ * The new leaf tuples converted from the existing ones should require
+ * the same or less space, and therefore should all fit onto one page
+ * (although that's not necessarily the current page, since we can't
+ * delete the old tuples but only replace them with placeholders).
+ * However, the incoming new tuple might not also fit, in which case
+ * we might need another picksplit cycle to reduce it some more.
+ *
+ * If there's not room to put everything back onto the current page,
+ * then we decide on a per-node basis which tuples go to the new page.
+ * (We do it like that because leaf tuple chains can't cross pages,
+ * so we must place all leaf tuples belonging to the same parent node
+ * on the same page.)
+ *
+ * If we are splitting the root page (turning it from a leaf page into an
+ * inner page), then no leaf tuples can go back to the current page; they
+ * must all go somewhere else.
+ *----------
+ */
+ if (current->blkno != SPGIST_HEAD_BLKNO)
+ currentFreeSpace = PageGetExactFreeSpace(current->page) + spaceToDelete;
+ else
+ currentFreeSpace = 0; /* prevent assigning any tuples to current */
+
+ xlrec.initDest = false;
+
+ if (totalLeafSizes <= currentFreeSpace)
+ {
+ /* All the leaf tuples will fit on current page */
+ newLeafBuffer = InvalidBuffer;
+ /* mark new leaf tuple as included in insertions, if allowed */
+ if (includeNew)
+ {
+ nToInsert++;
+ insertedNew = true;
+ }
+ for (i = 0; i < nToInsert; i++)
+ leafPageSelect[i] = 0; /* signifies current page */
+ }
+ else if (in.nTuples == 1 && totalLeafSizes > SPGIST_PAGE_CAPACITY)
+ {
+ /*
+ * We're trying to split up a long value by repeated suffixing, but
+ * it's not going to fit yet. Don't bother allocating a second leaf
+ * buffer that we won't be able to use.
+ */
+ newLeafBuffer = InvalidBuffer;
+ Assert(includeNew);
+ Assert(nToInsert == 0);
+ }
+ else
+ {
+ /* We will need another leaf page */
+ uint8 *nodePageSelect;
+ int curspace;
+ int newspace;
+
+ newLeafBuffer = SpGistGetBuffer(index, GBUF_LEAF,
+ Min(totalLeafSizes,
+ SPGIST_PAGE_CAPACITY),
+ &xlrec.initDest);
+ /*
+ * Attempt to assign node groups to the two pages. We might fail to
+ * do so, even if totalLeafSizes is less than the available space,
+ * because we can't split a group across pages.
+ */
+ nodePageSelect = (uint8 *) palloc(sizeof(uint8) * out.nNodes);
+
+ curspace = currentFreeSpace;
+ newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer));
+ for (i = 0; i < out.nNodes; i++)
+ {
+ if (leafSizes[i] <= curspace)
+ {
+ nodePageSelect[i] = 0; /* signifies current page */
+ curspace -= leafSizes[i];
+ }
+ else
+ {
+ nodePageSelect[i] = 1; /* signifies new leaf page */
+ newspace -= leafSizes[i];
+ }
+ }
+ if (curspace >= 0 && newspace >= 0)
+ {
+ /* Successful assignment, so we can include the new leaf tuple */
+ if (includeNew)
+ {
+ nToInsert++;
+ insertedNew = true;
+ }
+ }
+ else if (includeNew)
+ {
+ /* We must exclude the new leaf tuple from the split */
+ int nodeOfNewTuple = out.mapTuplesToNodes[in.nTuples - 1];
+
+ leafSizes[nodeOfNewTuple] -=
+ newLeafs[in.nTuples - 1]->size + sizeof(ItemIdData);
+
+ /* Repeat the node assignment process --- should succeed now */
+ curspace = currentFreeSpace;
+ newspace = PageGetExactFreeSpace(BufferGetPage(newLeafBuffer));
+ for (i = 0; i < out.nNodes; i++)
+ {
+ if (leafSizes[i] <= curspace)
+ {
+ nodePageSelect[i] = 0; /* signifies current page */
+ curspace -= leafSizes[i];
+ }
+ else
+ {
+ nodePageSelect[i] = 1; /* signifies new leaf page */
+ newspace -= leafSizes[i];
+ }
+ }
+ if (curspace < 0 || newspace < 0)
+ elog(ERROR, "failed to divide leaf tuple groups across pages");
+ }
+ else
+ {
+ /* oops, we already excluded new tuple ... should not get here */
+ elog(ERROR, "failed to divide leaf tuple groups across pages");
+ }
+ /* Expand the per-node assignments to be shown per leaf tuple */
+ for (i = 0; i < nToInsert; i++)
+ {
+ n = out.mapTuplesToNodes[i];
+ leafPageSelect[i] = nodePageSelect[n];
+ }
+ }
+
+ /* Start preparing WAL record */
+ xlrec.blknoSrc = current->blkno;
+ xlrec.blknoDest = InvalidBlockNumber;
+ xlrec.nDelete = 0;
+ xlrec.initSrc = isNew;
+
+ leafdata = leafptr = (char *) palloc(totalLeafSizes);
+
+ ACCEPT_RDATA_DATA(&xlrec, MAXALIGN(sizeof(xlrec)), 0);
+ ACCEPT_RDATA_DATA(innerTuple, innerTuple->size, 1);
+ nRdata = 2;
+
+ /* Here we begin making the changes to the target pages */
+ START_CRIT_SECTION();
+
+ /*
+ * Delete old leaf tuples from current buffer, except when we're splitting
+ * the root; in that case there's no need because we'll re-init the page
+ * below. We do this first to make room for reinserting new leaf tuples.
+ */
+ if (current->blkno != SPGIST_HEAD_BLKNO)
+ {
+ /*
+ * Init buffer instead of deleting individual tuples, but only if
+ * there aren't any other live tuples and only during build; otherwise
+ * we need to set a redirection tuple for concurrent scans.
+ */
+ if (state->isBuild &&
+ nToDelete + SpGistPageGetOpaque(current->page)->nPlaceholder ==
+ PageGetMaxOffsetNumber(current->page))
+ {
+ SpGistInitBuffer(current->buffer, SPGIST_LEAF);
+ xlrec.initSrc = true;
+ }
+ else if (isNew)
+ {
+ /* don't expose the freshly init'd buffer as a backup block */
+ Assert(nToDelete == 0);
+ }
+ else
+ {
+ xlrec.nDelete = nToDelete;
+ ACCEPT_RDATA_DATA(toDelete,
+ MAXALIGN(sizeof(OffsetNumber) * nToDelete),
+ nRdata);
+ nRdata++;
+ ACCEPT_RDATA_BUFFER(current->buffer, nRdata);
+ nRdata++;
+
+ if (!state->isBuild)
+ {
+ /*
+ * Need to create redirect tuple (it will point to new inner
+ * tuple) but right now the new tuple's location is not known
+ * yet. So, set the redirection pointer to "impossible" value
+ * and remember its position to update tuple later.
+ */
+ if (nToDelete > 0)
+ redirectTuplePos = toDelete[0];
+ spgPageIndexMultiDelete(state, current->page,
+ toDelete, nToDelete,
+ SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ SPGIST_METAPAGE_BLKNO,
+ FirstOffsetNumber);
+ }
+ else
+ {
+ /*
+ * During index build there are no concurrent searches, so we
+ * don't need to create a redirection tuple.
+ */
+ spgPageIndexMultiDelete(state, current->page,
+ toDelete, nToDelete,
+ SPGIST_PLACEHOLDER,
+ SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+ }
+ }
+ }
+
+ /*
+ * Put leaf tuples on proper pages, and update downlinks in innerTuple's
+ * nodes.
+ */
+ startOffsets[0] = startOffsets[1] = InvalidOffsetNumber;
+ for (i = 0; i < nToInsert; i++)
+ {
+ SpGistLeafTuple it = newLeafs[i];
+ Buffer leafBuffer;
+ BlockNumber leafBlock;
+ OffsetNumber newoffset;
+
+ /* Which page is it going to? */
+ leafBuffer = leafPageSelect[i] ? newLeafBuffer : current->buffer;
+ leafBlock = BufferGetBlockNumber(leafBuffer);
+
+ /* Link tuple into correct chain for its node */
+ n = out.mapTuplesToNodes[i];
+
+ if (ItemPointerIsValid(&nodes[n]->t_tid))
+ {
+ Assert(ItemPointerGetBlockNumber(&nodes[n]->t_tid) == leafBlock);
+ it->nextOffset = ItemPointerGetOffsetNumber(&nodes[n]->t_tid);
+ }
+ else
+ it->nextOffset = InvalidOffsetNumber;
+
+ /* Insert it on page */
+ newoffset = SpGistPageAddNewItem(state, BufferGetPage(leafBuffer),
+ (Item) it, it->size,
+ &startOffsets[leafPageSelect[i]],
+ false);
+ toInsert[i] = newoffset;
+
+ /* ... and complete the chain linking */
+ ItemPointerSet(&nodes[n]->t_tid, leafBlock, newoffset);
+
+ /* Also copy leaf tuple into WAL data */
+ memcpy(leafptr, newLeafs[i], newLeafs[i]->size);
+ leafptr += newLeafs[i]->size;
+ }
+
+ /*
+ * We're done modifying the other leaf buffer (if any), so mark it dirty.
+ * current->buffer will be marked below, after we're entirely done
+ * modifying it.
+ */
+ if (newLeafBuffer != InvalidBuffer)
+ {
+ MarkBufferDirty(newLeafBuffer);
+ /* also save block number for WAL */
+ xlrec.blknoDest = BufferGetBlockNumber(newLeafBuffer);
+ if (!xlrec.initDest)
+ {
+ ACCEPT_RDATA_BUFFER(newLeafBuffer, nRdata);
+ nRdata++;
+ }
+ }
+
+ xlrec.nInsert = nToInsert;
+ ACCEPT_RDATA_DATA(toInsert,
+ MAXALIGN(sizeof(OffsetNumber) * nToInsert),
+ nRdata);
+ nRdata++;
+ ACCEPT_RDATA_DATA(leafPageSelect,
+ MAXALIGN(sizeof(uint8) * nToInsert),
+ nRdata);
+ nRdata++;
+ ACCEPT_RDATA_DATA(leafdata, leafptr - leafdata, nRdata);
+ nRdata++;
+
+ /* Remember current buffer, since we're about to change "current" */
+ saveCurrent = *current;
+
+ /*
+ * Store the new innerTuple
+ */
+ if (newInnerBuffer == parent->buffer && newInnerBuffer != InvalidBuffer)
+ {
+ /*
+ * new inner tuple goes to parent page
+ */
+ Assert(current->buffer != parent->buffer);
+
+ /* Repoint "current" at the new inner tuple */
+ current->blkno = parent->blkno;
+ current->buffer = parent->buffer;
+ current->page = parent->page;
+ xlrec.blknoInner = current->blkno;
+ xlrec.offnumInner = current->offnum =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) innerTuple, innerTuple->size,
+ NULL, false);
+
+ /*
+ * Update parent node link and mark parent page dirty
+ */
+ xlrec.blknoParent = parent->blkno;
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ ACCEPT_RDATA_BUFFER(parent->buffer, nRdata);
+ nRdata++;
+
+ /*
+ * Update redirection link (in old current buffer)
+ */
+ if (redirectTuplePos != InvalidOffsetNumber)
+ setRedirectionTuple(&saveCurrent, redirectTuplePos,
+ current->blkno, current->offnum);
+
+ /* Done modifying old current buffer, mark it dirty */
+ MarkBufferDirty(saveCurrent.buffer);
+ }
+ else if (parent->buffer != InvalidBuffer)
+ {
+ /*
+ * new inner tuple will be stored on a new page
+ */
+ Assert(newInnerBuffer != InvalidBuffer);
+
+ /* Repoint "current" at the new inner tuple */
+ current->buffer = newInnerBuffer;
+ current->blkno = BufferGetBlockNumber(current->buffer);
+ current->page = BufferGetPage(current->buffer);
+ xlrec.blknoInner = current->blkno;
+ xlrec.offnumInner = current->offnum =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) innerTuple, innerTuple->size,
+ NULL, false);
+
+ /* Done modifying new current buffer, mark it dirty */
+ MarkBufferDirty(current->buffer);
+
+ /*
+ * Update parent node link and mark parent page dirty
+ */
+ xlrec.blknoParent = parent->blkno;
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ ACCEPT_RDATA_BUFFER(current->buffer, nRdata);
+ nRdata++;
+ ACCEPT_RDATA_BUFFER(parent->buffer, nRdata);
+ nRdata++;
+
+ /*
+ * Update redirection link (in old current buffer)
+ */
+ if (redirectTuplePos != InvalidOffsetNumber)
+ setRedirectionTuple(&saveCurrent, redirectTuplePos,
+ current->blkno, current->offnum);
+
+ /* Done modifying old current buffer, mark it dirty */
+ MarkBufferDirty(saveCurrent.buffer);
+ }
+ else
+ {
+ /*
+ * Splitting the root page, which was a leaf but now becomes an inner
+ * page (and so "current" continues to point at it)
+ */
+ Assert(current->blkno == SPGIST_HEAD_BLKNO);
+ Assert(redirectTuplePos == InvalidOffsetNumber);
+
+ SpGistInitBuffer(current->buffer, 0);
+ xlrec.initInner = true;
+
+ xlrec.blknoInner = current->blkno;
+ xlrec.offnumInner = current->offnum =
+ PageAddItem(current->page, (Item) innerTuple, innerTuple->size,
+ InvalidOffsetNumber, false, false);
+ if (current->offnum != FirstOffsetNumber)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ innerTuple->size);
+
+ /* No parent link to update, nor redirection to do */
+ xlrec.blknoParent = InvalidBlockNumber;
+ xlrec.offnumParent = InvalidOffsetNumber;
+ xlrec.nodeI = 0;
+
+ /* Done modifying new current buffer, mark it dirty */
+ MarkBufferDirty(current->buffer);
+
+ /* saveCurrent doesn't represent a different buffer */
+ saveCurrent.buffer = InvalidBuffer;
+ }
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ /* Issue the WAL record */
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_PICKSPLIT, rdata);
+
+ /* Update page LSNs on all affected pages */
+ if (newLeafBuffer != InvalidBuffer)
+ {
+ Page page = BufferGetPage(newLeafBuffer);
+
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ if (saveCurrent.buffer != InvalidBuffer)
+ {
+ Page page = BufferGetPage(saveCurrent.buffer);
+
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ PageSetLSN(current->page, recptr);
+ PageSetTLI(current->page, ThisTimeLineID);
+
+ if (parent->buffer != InvalidBuffer)
+ {
+ PageSetLSN(parent->page, recptr);
+ PageSetTLI(parent->page, ThisTimeLineID);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /* Update local free-space cache and unlock buffers */
+ if (newLeafBuffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, newLeafBuffer);
+ UnlockReleaseBuffer(newLeafBuffer);
+ }
+ if (saveCurrent.buffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, saveCurrent.buffer);
+ UnlockReleaseBuffer(saveCurrent.buffer);
+ }
+
+ return insertedNew;
+}
+
+/*
+ * spgMatchNode action: descend to N'th child node of current inner tuple
+ */
+static void
+spgMatchNodeAction(Relation index, SpGistState *state,
+ SpGistInnerTuple innerTuple,
+ SPPageDesc *current, SPPageDesc *parent, int nodeN)
+{
+ int i;
+ SpGistNodeTuple node;
+
+ /* Release previous parent buffer if any */
+ if (parent->buffer != InvalidBuffer &&
+ parent->buffer != current->buffer)
+ {
+ SpGistSetLastUsedPage(index, parent->buffer);
+ UnlockReleaseBuffer(parent->buffer);
+ }
+
+ /* Repoint parent to specified node of current inner tuple */
+ parent->blkno = current->blkno;
+ parent->buffer = current->buffer;
+ parent->page = current->page;
+ parent->offnum = current->offnum;
+ parent->node = nodeN;
+
+ /* Locate that node */
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (i == nodeN)
+ break;
+ }
+
+ if (i != nodeN)
+ elog(ERROR, "failed to find requested node %d in SPGiST inner tuple",
+ nodeN);
+
+ /* Point current to the downlink location, if any */
+ if (ItemPointerIsValid(&node->t_tid))
+ {
+ current->blkno = ItemPointerGetBlockNumber(&node->t_tid);
+ current->offnum = ItemPointerGetOffsetNumber(&node->t_tid);
+ }
+ else
+ {
+ /* Downlink is empty, so we'll need to find a new page */
+ current->blkno = InvalidBlockNumber;
+ current->offnum = InvalidOffsetNumber;
+ }
+
+ current->buffer = InvalidBuffer;
+ current->page = NULL;
+}
+
+/*
+ * spgAddNode action: add a node to the inner tuple at current
+ */
+static void
+spgAddNodeAction(Relation index, SpGistState *state,
+ SpGistInnerTuple innerTuple,
+ SPPageDesc *current, SPPageDesc *parent,
+ int nodeN, Datum nodeLabel)
+{
+ SpGistInnerTuple newInnerTuple;
+ XLogRecData rdata[5];
+ spgxlogAddNode xlrec;
+
+ /* Construct new inner tuple with additional node */
+ newInnerTuple = addNode(state, innerTuple, nodeLabel, nodeN);
+
+ /* Prepare WAL record */
+ xlrec.node = index->rd_node;
+ STORE_STATE(state, xlrec.stateSrc);
+ xlrec.blkno = current->blkno;
+ xlrec.offnum = current->offnum;
+
+ /* we don't fill these unless we need to change the parent downlink */
+ xlrec.blknoParent = InvalidBlockNumber;
+ xlrec.offnumParent = InvalidOffsetNumber;
+ xlrec.nodeI = 0;
+
+ /* we don't fill these unless tuple has to be moved */
+ xlrec.blknoNew = InvalidBlockNumber;
+ xlrec.offnumNew = InvalidOffsetNumber;
+ xlrec.newPage = false;
+
+ ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0);
+ /* we assume sizeof(xlrec) is at least int-aligned */
+ ACCEPT_RDATA_DATA(newInnerTuple, newInnerTuple->size, 1);
+ ACCEPT_RDATA_BUFFER(current->buffer, 2);
+
+ if (PageGetExactFreeSpace(current->page) >=
+ newInnerTuple->size - innerTuple->size)
+ {
+ /*
+ * We can replace the inner tuple with the new version in place
+ */
+ START_CRIT_SECTION();
+
+ PageIndexTupleDelete(current->page, current->offnum);
+ if (PageAddItem(current->page,
+ (Item) newInnerTuple, newInnerTuple->size,
+ current->offnum, false, false) != current->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ newInnerTuple->size);
+
+ MarkBufferDirty(current->buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE, rdata);
+
+ PageSetLSN(current->page, recptr);
+ PageSetTLI(current->page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ /*
+ * move inner tuple to another page, and update parent
+ */
+ SpGistDeadTuple dt;
+ SPPageDesc saveCurrent;
+
+ /*
+ * It should not be possible to get here for the root page, since we
+ * allow only one inner tuple on the root page, and spgFormInnerTuple
+ * always checks that inner tuples don't exceed the size of a page.
+ */
+ if (current->blkno == SPGIST_HEAD_BLKNO)
+ elog(ERROR, "cannot enlarge root tuple any more");
+ Assert(parent->buffer != InvalidBuffer);
+
+ saveCurrent = *current;
+
+ xlrec.blknoParent = parent->blkno;
+ xlrec.offnumParent = parent->offnum;
+ xlrec.nodeI = parent->node;
+
+ /*
+ * obtain a new buffer with the same parity as current, since it will
+ * be a child of the same parent tuple
+ */
+ current->buffer = SpGistGetBuffer(index,
+ GBUF_INNER_PARITY(current->blkno),
+ newInnerTuple->size + sizeof(ItemIdData),
+ &xlrec.newPage);
+ current->blkno = BufferGetBlockNumber(current->buffer);
+ current->page = BufferGetPage(current->buffer);
+
+ xlrec.blknoNew = current->blkno;
+
+ /*
+ * Let's just make real sure the new current isn't the same as the old.
+ * Right now that's impossible, but if SpGistGetBuffer ever got smart enough
+ * to delete placeholder tuples before checking space, maybe it
+ * wouldn't be impossible. The case would appear to work except that
+ * WAL replay would be subtly wrong, so I think a mere assert isn't
+ * enough here.
+ */
+ if (xlrec.blknoNew == xlrec.blkno)
+ elog(ERROR, "SPGiST new buffer shouldn't be same as old buffer");
+
+ /*
+ * New current and parent buffer will both be modified; but note that
+ * the parent buffer could be the same as either the new or old current.
+ */
+ ACCEPT_RDATA_BUFFER(current->buffer, 3);
+ if (parent->buffer != current->buffer &&
+ parent->buffer != saveCurrent.buffer)
+ ACCEPT_RDATA_BUFFER(parent->buffer, 4);
+
+ START_CRIT_SECTION();
+
+ /* insert new ... */
+ xlrec.offnumNew = current->offnum =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) newInnerTuple, newInnerTuple->size,
+ NULL, false);
+
+ MarkBufferDirty(current->buffer);
+
+ /* update parent's downlink and mark parent page dirty */
+ saveNodeLink(index, parent, current->blkno, current->offnum);
+
+ /*
+ * Replace old tuple with a placeholder or redirection tuple. Unless
+ * doing an index build, we have to insert a redirection tuple for
+ * possible concurrent scans. We can't just delete it in any case,
+ * because that could change the offsets of other tuples on the page,
+ * breaking downlinks from their parents.
+ */
+ if (state->isBuild)
+ dt = spgFormDeadTuple(state, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber, InvalidOffsetNumber);
+ else
+ dt = spgFormDeadTuple(state, SPGIST_REDIRECT,
+ current->blkno, current->offnum);
+
+ PageIndexTupleDelete(saveCurrent.page, saveCurrent.offnum);
+ if (PageAddItem(saveCurrent.page, (Item) dt, dt->size,
+ saveCurrent.offnum,
+ false, false) != saveCurrent.offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ dt->size);
+
+ if (state->isBuild)
+ SpGistPageGetOpaque(saveCurrent.page)->nPlaceholder++;
+ else
+ SpGistPageGetOpaque(saveCurrent.page)->nRedirection++;
+
+ MarkBufferDirty(saveCurrent.buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_NODE, rdata);
+
+ /* we don't bother to check if any of these are redundant */
+ PageSetLSN(current->page, recptr);
+ PageSetTLI(current->page, ThisTimeLineID);
+ PageSetLSN(parent->page, recptr);
+ PageSetTLI(parent->page, ThisTimeLineID);
+ PageSetLSN(saveCurrent.page, recptr);
+ PageSetTLI(saveCurrent.page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Release saveCurrent if it's not same as current or parent */
+ if (saveCurrent.buffer != current->buffer &&
+ saveCurrent.buffer != parent->buffer)
+ {
+ SpGistSetLastUsedPage(index, saveCurrent.buffer);
+ UnlockReleaseBuffer(saveCurrent.buffer);
+ }
+ }
+}
+
+/*
+ * spgSplitNode action: split inner tuple at current into prefix and postfix
+ */
+static void
+spgSplitNodeAction(Relation index, SpGistState *state,
+ SpGistInnerTuple innerTuple,
+ SPPageDesc *current, spgChooseOut *out)
+{
+ SpGistInnerTuple prefixTuple,
+ postfixTuple;
+ SpGistNodeTuple node,
+ *nodes;
+ BlockNumber postfixBlkno;
+ OffsetNumber postfixOffset;
+ int i;
+ XLogRecData rdata[5];
+ spgxlogSplitTuple xlrec;
+ Buffer newBuffer = InvalidBuffer;
+
+ /*
+ * Construct new prefix tuple, containing a single node with the
+ * specified label. (We'll update the node's downlink to point to the
+ * new postfix tuple, below.)
+ */
+ node = spgFormNodeTuple(state, out->result.splitTuple.nodeLabel, false);
+
+ prefixTuple = spgFormInnerTuple(state,
+ out->result.splitTuple.prefixHasPrefix,
+ out->result.splitTuple.prefixPrefixDatum,
+ 1, &node);
+
+ /* it must fit in the space that innerTuple now occupies */
+ if (prefixTuple->size > innerTuple->size)
+ elog(ERROR, "SPGiST inner-tuple split must not produce longer prefix");
+
+ /*
+ * Construct new postfix tuple, containing all nodes of innerTuple with
+ * same node datums, but with the prefix specified by the choose
+ * function's splitTuple result.
+ */
+ nodes = palloc(sizeof(SpGistNodeTuple) * innerTuple->nNodes);
+ SGITITERATE(innerTuple, i, node)
+ {
+ nodes[i] = node;
+ }
+
+ postfixTuple = spgFormInnerTuple(state,
+ out->result.splitTuple.postfixHasPrefix,
+ out->result.splitTuple.postfixPrefixDatum,
+ innerTuple->nNodes, nodes);
+
+ /* Postfix tuple is allTheSame if original tuple was */
+ postfixTuple->allTheSame = innerTuple->allTheSame;
+
+ /* prep data for WAL record */
+ xlrec.node = index->rd_node;
+ xlrec.newPage = false;
+
+ ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0);
+ /* we assume sizeof(xlrec) is at least int-aligned */
+ ACCEPT_RDATA_DATA(prefixTuple, prefixTuple->size, 1);
+ ACCEPT_RDATA_DATA(postfixTuple, postfixTuple->size, 2);
+ ACCEPT_RDATA_BUFFER(current->buffer, 3);
+
+ /*
+ * If we can't fit both tuples on the current page, get a new page for the
+ * postfix tuple. In particular, since only one inner tuple is allowed on
+ * the root page, the postfix tuple can never go there.
+ *
+ * For the space calculation, note that prefixTuple replaces innerTuple
+ * but postfixTuple will be a new entry.
+ */
+ if (current->blkno == SPGIST_HEAD_BLKNO ||
+ SpGistPageGetFreeSpace(current->page, 1) + innerTuple->size <
+ prefixTuple->size + postfixTuple->size + sizeof(ItemIdData))
+ {
+ /*
+ * Choose a page with the next triple parity, because the postfix
+ * tuple is a child of the prefix tuple
+ */
+ newBuffer = SpGistGetBuffer(index,
+ GBUF_INNER_PARITY(current->blkno + 1),
+ postfixTuple->size + sizeof(ItemIdData),
+ &xlrec.newPage);
+ ACCEPT_RDATA_BUFFER(newBuffer, 4);
+ }
+
+ START_CRIT_SECTION();
+
+ /*
+ * Replace old tuple by prefix tuple
+ */
+ PageIndexTupleDelete(current->page, current->offnum);
+ xlrec.offnumPrefix = PageAddItem(current->page,
+ (Item) prefixTuple, prefixTuple->size,
+ current->offnum, false, false);
+ if (xlrec.offnumPrefix != current->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ prefixTuple->size);
+ xlrec.blknoPrefix = current->blkno;
+
+ /*
+ * put postfix tuple into appropriate page
+ */
+ if (newBuffer == InvalidBuffer)
+ {
+ xlrec.blknoPostfix = postfixBlkno = current->blkno;
+ xlrec.offnumPostfix = postfixOffset =
+ SpGistPageAddNewItem(state, current->page,
+ (Item) postfixTuple, postfixTuple->size,
+ NULL, false);
+ }
+ else
+ {
+ xlrec.blknoPostfix = postfixBlkno = BufferGetBlockNumber(newBuffer);
+ xlrec.offnumPostfix = postfixOffset =
+ SpGistPageAddNewItem(state, BufferGetPage(newBuffer),
+ (Item) postfixTuple, postfixTuple->size,
+ NULL, false);
+ MarkBufferDirty(newBuffer);
+ }
+
+ /*
+ * And set downlink pointer in the prefix tuple to point to postfix tuple.
+ * (We can't avoid this step by doing the above two steps in opposite
+ * order, because there might not be enough space on the page to insert
+ * the postfix tuple first.) We have to update the local copy of the
+ * prefixTuple too, because that's what will be written to WAL.
+ */
+ updateNodeLink(prefixTuple, 0, postfixBlkno, postfixOffset);
+ prefixTuple = (SpGistInnerTuple) PageGetItem(current->page,
+ PageGetItemId(current->page, current->offnum));
+ updateNodeLink(prefixTuple, 0, postfixBlkno, postfixOffset);
+
+ MarkBufferDirty(current->buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_SPLIT_TUPLE, rdata);
+
+ PageSetLSN(current->page, recptr);
+ PageSetTLI(current->page, ThisTimeLineID);
+
+ if (newBuffer != InvalidBuffer)
+ {
+ PageSetLSN(BufferGetPage(newBuffer), recptr);
+ PageSetTLI(BufferGetPage(newBuffer), ThisTimeLineID);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /* Update local free-space cache and release buffer */
+ if (newBuffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, newBuffer);
+ UnlockReleaseBuffer(newBuffer);
+ }
+}
+
+/*
+ * Insert one item into the index
+ */
+void
+spgdoinsert(Relation index, SpGistState *state,
+ ItemPointer heapPtr, Datum datum)
+{
+ int level = 0;
+ Datum leafDatum;
+ int leafSize;
+ SPPageDesc current,
+ parent;
+
+ /*
+ * Since we don't use index_form_tuple in this AM, we have to make sure
+ * the value to be inserted is not toasted; FormIndexDatum doesn't guarantee
+ * that.
+ */
+ if (state->attType.attlen == -1)
+ datum = PointerGetDatum(PG_DETOAST_DATUM(datum));
+
+ leafDatum = datum;
+
+ /*
+ * Compute space needed for a leaf tuple containing the given datum.
+ *
+ * If it won't fit, and the opclass can't reduce the datum size by
+ * suffixing, bail out now rather than getting into an endless loop.
+ */
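+ /* the size request includes the line pointer the leaf tuple will need */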
+ leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
+ SpGistGetTypeSize(&state->attType, leafDatum);
+
+ if (leafSize > SPGIST_PAGE_CAPACITY && !state->config.longValuesOK)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
+ (unsigned long) (leafSize - sizeof(ItemIdData)),
+ (unsigned long) (SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)),
+ RelationGetRelationName(index)),
+ errhint("Values larger than a buffer page cannot be indexed.")));
+
+ /* Initialize "current" to the root page */
+ current.blkno = SPGIST_HEAD_BLKNO;
+ current.buffer = InvalidBuffer;
+ current.page = NULL;
+ current.offnum = FirstOffsetNumber;
+ current.node = -1;
+
+ /* "parent" is invalid for the moment */
+ parent.blkno = InvalidBlockNumber;
+ parent.buffer = InvalidBuffer;
+ parent.page = NULL;
+ parent.offnum = InvalidOffsetNumber;
+ parent.node = -1;
+
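+ /*
+ * Loop around, descending the tree (splitting pages or adjusting inner
+ * tuples as needed) until the leaf datum has been inserted.
+ */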
+ for (;;)
+ {
+ bool isNew = false;
+
+ /*
+ * Bail out if query cancel is pending. We must have this somewhere
+ * in the loop since a broken opclass could produce an infinite
+ * picksplit loop.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ if (current.blkno == InvalidBlockNumber)
+ {
+ /*
+ * Create a leaf page. If leafSize is too large to fit on a page,
+ * we won't actually use the page yet, but it simplifies the API
+ * for doPickSplit to always have a leaf page at hand; so just
+ * quietly limit our request to a page size.
+ */
+ current.buffer = SpGistGetBuffer(index, GBUF_LEAF,
+ Min(leafSize,
+ SPGIST_PAGE_CAPACITY),
+ &isNew);
+ current.blkno = BufferGetBlockNumber(current.buffer);
+ }
+ else if (parent.buffer == InvalidBuffer ||
+ current.blkno != parent.blkno)
+ {
+ current.buffer = ReadBuffer(index, current.blkno);
+ LockBuffer(current.buffer, BUFFER_LOCK_EXCLUSIVE);
+ }
+ else
+ {
+ /* inner tuple can be stored on the same page as the parent one */
+ current.buffer = parent.buffer;
+ }
+ current.page = BufferGetPage(current.buffer);
+
+ if (SpGistPageIsLeaf(current.page))
+ {
+ SpGistLeafTuple leafTuple;
+ int nToSplit,
+ sizeToSplit;
+
+ leafTuple = spgFormLeafTuple(state, heapPtr, leafDatum);
+ if (leafTuple->size + sizeof(ItemIdData) <=
+ SpGistPageGetFreeSpace(current.page, 1))
+ {
+ /* it fits on page, so insert it and we're done */
+ addLeafTuple(index, state, leafTuple,
+ &current, &parent, isNew);
+ break;
+ }
+ else if ((sizeToSplit =
+ checkSplitConditions(index, state, &current,
+ &nToSplit)) < SPGIST_PAGE_CAPACITY / 2 &&
+ nToSplit < 64 &&
+ leafTuple->size + sizeof(ItemIdData) + sizeToSplit <= SPGIST_PAGE_CAPACITY)
+ {
+ /*
+ * the amount of data is pretty small (the chain is short and occupies
+ * less than half a page), so just move the whole chain to another
+ * leaf page rather than splitting it.
+ */
+ Assert(!isNew);
+ moveLeafs(index, state, &current, &parent, leafTuple);
+ break; /* we're done */
+ }
+ else
+ {
+ /* picksplit */
+ if (doPickSplit(index, state, &current, &parent,
+ leafTuple, level, isNew))
+ break; /* doPickSplit installed new tuples */
+
+ /* leaf tuple will not be inserted yet */
+ pfree(leafTuple);
+
+ /*
+ * current now describes the new inner tuple; go insert into it
+ */
+ Assert(!SpGistPageIsLeaf(current.page));
+ goto process_inner_tuple;
+ }
+ }
+ else /* non-leaf page */
+ {
+ /*
+ * Apply the opclass choose function to figure out how to insert
+ * the given datum into the current inner tuple.
+ */
+ SpGistInnerTuple innerTuple;
+ spgChooseIn in;
+ spgChooseOut out;
+
+ /*
+ * spgAddNode and spgSplitTuple cases will loop back to here to
+ * complete the insertion operation. Just in case the choose
+ * function is broken and produces add or split requests
+ * repeatedly, check for query cancel.
+ */
+ process_inner_tuple:
+ CHECK_FOR_INTERRUPTS();
+
+ innerTuple = (SpGistInnerTuple) PageGetItem(current.page,
+ PageGetItemId(current.page, current.offnum));
+
+ in.datum = datum;
+ in.leafDatum = leafDatum;
+ in.level = level;
+ in.allTheSame = innerTuple->allTheSame;
+ in.hasPrefix = (innerTuple->prefixSize > 0);
+ in.prefixDatum = SGITDATUM(innerTuple, state);
+ in.nNodes = innerTuple->nNodes;
+ in.nodeLabels = spgExtractNodeLabels(state, innerTuple);
+
+ memset(&out, 0, sizeof(out));
+
+ FunctionCall2Coll(&state->chooseFn,
+ index->rd_indcollation[0],
+ PointerGetDatum(&in),
+ PointerGetDatum(&out));
+
+ if (innerTuple->allTheSame)
+ {
+ /*
+ * An AddNode is not allowed at an allTheSame tuple. The opclass
+ * must say either "match" (in which case we choose a random node
+ * to descend into) or "split".
+ */
+ if (out.resultType == spgAddNode)
+ elog(ERROR, "cannot add a node to an allTheSame inner tuple");
+ else if (out.resultType == spgMatchNode)
+ out.result.matchNode.nodeN = random() % innerTuple->nNodes;
+ }
+
+ switch (out.resultType)
+ {
+ case spgMatchNode:
+ /* Descend to N'th child node */
+ spgMatchNodeAction(index, state, innerTuple,
+ &current, &parent,
+ out.result.matchNode.nodeN);
+ /* Adjust level as per opclass request */
+ level += out.result.matchNode.levelAdd;
+ /* Replace leafDatum and recompute leafSize */
+ leafDatum = out.result.matchNode.restDatum;
+ leafSize = SGLTHDRSZ + sizeof(ItemIdData) +
+ SpGistGetTypeSize(&state->attType, leafDatum);
+
+ /*
+ * Loop around and attempt to insert the new leafDatum
+ * at "current" (which might reference an existing child
+ * tuple, or might be invalid to force us to find a new
+ * page for the tuple).
+ *
+ * Note: if the opclass sets longValuesOK, we rely on the
+ * choose function to eventually shorten the leafDatum
+ * enough to fit on a page. We could add a test here to
+ * complain if the datum doesn't get visibly shorter each
+ * time, but that could get in the way of opclasses that
+ * "simplify" datums in a way that doesn't necessarily
+ * lead to physical shortening on every cycle.
+ */
+ break;
+ case spgAddNode:
+ /* AddNode is not sensible if nodes don't have labels */
+ if (in.nodeLabels == NULL)
+ elog(ERROR, "cannot add a node to an inner tuple without node labels");
+ /* Add node to inner tuple, per request */
+ spgAddNodeAction(index, state, innerTuple,
+ &current, &parent,
+ out.result.addNode.nodeN,
+ out.result.addNode.nodeLabel);
+
+ /*
+ * Retry insertion into the enlarged node. We assume
+ * that we'll get a MatchNode result this time.
+ */
+ goto process_inner_tuple;
+ break;
+ case spgSplitTuple:
+ /* Split inner tuple, per request */
+ spgSplitNodeAction(index, state, innerTuple,
+ &current, &out);
+
+ /* Retry insertion into the split node */
+ goto process_inner_tuple;
+ break;
+ default:
+ elog(ERROR, "unrecognized SPGiST choose result: %d",
+ (int) out.resultType);
+ break;
+ }
+ }
+ } /* end loop */
+
+ /*
+ * Release any buffers we're still holding. Beware of possibility that
+ * current and parent reference same buffer.
+ */
+ if (current.buffer != InvalidBuffer)
+ {
+ SpGistSetLastUsedPage(index, current.buffer);
+ UnlockReleaseBuffer(current.buffer);
+ }
+ if (parent.buffer != InvalidBuffer &&
+ parent.buffer != current.buffer)
+ {
+ SpGistSetLastUsedPage(index, parent.buffer);
+ UnlockReleaseBuffer(parent.buffer);
+ }
+}
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
new file mode 100644
index 00000000000..4a059bdfedc
--- /dev/null
+++ b/src/backend/access/spgist/spginsert.c
@@ -0,0 +1,219 @@
+/*-------------------------------------------------------------------------
+ *
+ * spginsert.c
+ * Externally visible index creation/insertion routines
+ *
+ * All the actual insertion logic is in spgdoinsert.c.
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spginsert.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/spgist_private.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+
+
+typedef struct
+{
+ SpGistState spgstate; /* SPGiST's working state */
+ MemoryContext tmpCtx; /* per-tuple temporary context */
+} SpGistBuildState;
+
+
+/* Callback to process one heap tuple during IndexBuildHeapScan */
+static void
+spgistBuildCallback(Relation index, HeapTuple htup, Datum *values,
+ bool *isnull, bool tupleIsAlive, void *state)
+{
+ SpGistBuildState *buildstate = (SpGistBuildState *) state;
+
+ /* SPGiST doesn't index nulls */
+ if (*isnull == false)
+ {
+ /* Work in temp context, and reset it after each tuple */
+ MemoryContext oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+ spgdoinsert(index, &buildstate->spgstate, &htup->t_self, *values);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextReset(buildstate->tmpCtx);
+ }
+}
+
+/*
+ * Build an SP-GiST index.
+ */
+Datum
+spgbuild(PG_FUNCTION_ARGS)
+{
+ Relation heap = (Relation) PG_GETARG_POINTER(0);
+ Relation index = (Relation) PG_GETARG_POINTER(1);
+ IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+ IndexBuildResult *result;
+ double reltuples;
+ SpGistBuildState buildstate;
+ Buffer metabuffer,
+ rootbuffer;
+
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /*
+ * Initialize the meta page and root page
+ */
+ metabuffer = SpGistNewBuffer(index);
+ rootbuffer = SpGistNewBuffer(index);
+
+ Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
+ Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_HEAD_BLKNO);
+
+ START_CRIT_SECTION();
+
+ SpGistInitMetapage(BufferGetPage(metabuffer));
+ MarkBufferDirty(metabuffer);
+ SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
+ MarkBufferDirty(rootbuffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+
+ /* WAL data is just the relfilenode */
+ rdata.data = (char *) &(index->rd_node);
+ rdata.len = sizeof(RelFileNode);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX, &rdata);
+
+ PageSetLSN(BufferGetPage(metabuffer), recptr);
+ PageSetTLI(BufferGetPage(metabuffer), ThisTimeLineID);
+ PageSetLSN(BufferGetPage(rootbuffer), recptr);
+ PageSetTLI(BufferGetPage(rootbuffer), ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(metabuffer);
+ UnlockReleaseBuffer(rootbuffer);
+
+ /*
+ * Now insert all the heap data into the index
+ */
+ initSpGistState(&buildstate.spgstate, index);
+ buildstate.spgstate.isBuild = true;
+
+ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST build temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+ spgistBuildCallback, (void *) &buildstate);
+
+ MemoryContextDelete(buildstate.tmpCtx);
+
+ SpGistUpdateMetaPage(index);
+
+ result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult));
+ result->heap_tuples = result->index_tuples = reltuples;
+
+ PG_RETURN_POINTER(result);
+}
+
+/*
+ * Build an empty SPGiST index in the initialization fork
+ */
+Datum
+spgbuildempty(PG_FUNCTION_ARGS)
+{
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+ Page page;
+
+ /* Construct metapage. */
+ page = (Page) palloc(BLCKSZ);
+ SpGistInitMetapage(page);
+
+ /* Write the page. If archiving/streaming, XLOG it. */
+ smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
+ (char *) page, true);
+ if (XLogIsNeeded())
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_METAPAGE_BLKNO, page);
+
+ /* Likewise for the root page. */
+ SpGistInitPage(page, SPGIST_LEAF);
+
+ smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_HEAD_BLKNO,
+ (char *) page, true);
+ if (XLogIsNeeded())
+ log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
+ SPGIST_HEAD_BLKNO, page);
+
+ /*
+ * An immediate sync is required even if we xlog'd the pages, because the
+ * writes did not go through shared buffers and therefore a concurrent
+ * checkpoint may have moved the redo pointer past our xlog record.
+ */
+ smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Insert one new tuple into an SPGiST index.
+ */
+Datum
+spginsert(PG_FUNCTION_ARGS)
+{
+ Relation index = (Relation) PG_GETARG_POINTER(0);
+ Datum *values = (Datum *) PG_GETARG_POINTER(1);
+ bool *isnull = (bool *) PG_GETARG_POINTER(2);
+ ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
+
+#ifdef NOT_USED
+ Relation heapRel = (Relation) PG_GETARG_POINTER(4);
+ IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
+#endif
+ SpGistState spgstate;
+ MemoryContext oldCtx;
+ MemoryContext insertCtx;
+
+ /* SPGiST doesn't index nulls */
+ if (*isnull)
+ PG_RETURN_BOOL(false);
+
+ insertCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST insert temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldCtx = MemoryContextSwitchTo(insertCtx);
+
+ initSpGistState(&spgstate, index);
+
+ spgdoinsert(index, &spgstate, ht_ctid, *values);
+
+ SpGistUpdateMetaPage(index);
+
+ MemoryContextSwitchTo(oldCtx);
+ MemoryContextDelete(insertCtx);
+
+ /* return false since we've not done any unique check */
+ PG_RETURN_BOOL(false);
+}
diff --git a/src/backend/access/spgist/spgkdtreeproc.c b/src/backend/access/spgist/spgkdtreeproc.c
new file mode 100644
index 00000000000..e11d1a35e3a
--- /dev/null
+++ b/src/backend/access/spgist/spgkdtreeproc.c
@@ -0,0 +1,298 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgkdtreeproc.c
+ * implementation of k-d tree over points for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgkdtreeproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gist.h" /* for RTree strategy numbers */
+#include "access/spgist.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/geo_decls.h"
+
+
+Datum
+spg_kd_config(PG_FUNCTION_ARGS)
+{
+ /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
+ spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);
+
+ cfg->prefixType = FLOAT8OID;
+ cfg->labelType = VOIDOID; /* we don't need node labels */
+ cfg->longValuesOK = false;
+ PG_RETURN_VOID();
+}
+
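+/*
+ * Compare the split coordinate against the point's x (if isX) or y
+ * coordinate: returns 1 if the split value is greater, -1 if less, 0 if equal.
+ */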
+static int
+getSide(double coord, bool isX, Point *tst)
+{
+ double tstcoord = (isX) ? tst->x : tst->y;
+
+ if (coord == tstcoord)
+ return 0;
+ else if (coord > tstcoord)
+ return 1;
+ else
+ return -1;
+}
+
+Datum
+spg_kd_choose(PG_FUNCTION_ARGS)
+{
+ spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0);
+ spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1);
+ Point *inPoint = DatumGetPointP(in->datum);
+ double coord;
+
+ if (in->allTheSame)
+ elog(ERROR, "allTheSame should not occur for k-d trees");
+
+ Assert(in->hasPrefix);
+ coord = DatumGetFloat8(in->prefixDatum);
+
+ Assert(in->nNodes == 2);
+
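+ /* descend to node 0 if the point lies below the split value, else node 1 */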
+ out->resultType = spgMatchNode;
+ out->result.matchNode.nodeN =
+ (getSide(coord, in->level % 2, inPoint) > 0) ? 0 : 1;
+ out->result.matchNode.levelAdd = 1;
+ out->result.matchNode.restDatum = PointPGetDatum(inPoint);
+
+ PG_RETURN_VOID();
+}
+
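+/* a point paired with its original tuple index, so sorting keeps the mapping */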
+typedef struct SortedPoint
+{
+ Point *p;
+ int i;
+} SortedPoint;
+
+static int
+x_cmp(const void *a, const void *b)
+{
+ SortedPoint *pa = (SortedPoint *) a;
+ SortedPoint *pb = (SortedPoint *) b;
+
+ if (pa->p->x == pb->p->x)
+ return 0;
+ return (pa->p->x > pb->p->x) ? 1 : -1;
+}
+
+static int
+y_cmp(const void *a, const void *b)
+{
+ SortedPoint *pa = (SortedPoint *) a;
+ SortedPoint *pb = (SortedPoint *) b;
+
+ if (pa->p->y == pb->p->y)
+ return 0;
+ return (pa->p->y > pb->p->y) ? 1 : -1;
+}
+
+
+Datum
+spg_kd_picksplit(PG_FUNCTION_ARGS)
+{
+ spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0);
+ spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1);
+ int i;
+ int middle;
+ SortedPoint *sorted;
+ double coord;
+
+ sorted = palloc(sizeof(*sorted) * in->nTuples);
+ for (i = 0; i < in->nTuples; i++)
+ {
+ sorted[i].p = DatumGetPointP(in->datums[i]);
+ sorted[i].i = i;
+ }
+
+ qsort(sorted, in->nTuples, sizeof(*sorted),
+ (in->level % 2) ? x_cmp : y_cmp);
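+ /* split at the median element along this level's axis */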
+ middle = in->nTuples >> 1;
+ coord = (in->level % 2) ? sorted[middle].p->x : sorted[middle].p->y;
+
+ out->hasPrefix = true;
+ out->prefixDatum = Float8GetDatum(coord);
+
+ out->nNodes = 2;
+ out->nodeLabels = NULL; /* we don't need node labels */
+
+ out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples);
+ out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples);
+
+ /*
+ * Note: points that have coordinates exactly equal to coord may get
+ * classified into either node, depending on where they happen to fall
+ * in the sorted list. This is okay as long as the inner_consistent
+ * function descends into both sides for such cases. This is better
+ * than the alternative of trying to have an exact boundary, because
+ * it keeps the tree balanced even when we have many instances of the
+ * same point value. So we should never trigger the allTheSame logic.
+ */
+ for (i = 0; i < in->nTuples; i++)
+ {
+ Point *p = sorted[i].p;
+ int n = sorted[i].i;
+
+ out->mapTuplesToNodes[n] = (i < middle) ? 0 : 1;
+ out->leafTupleDatums[n] = PointPGetDatum(p);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+spg_kd_inner_consistent(PG_FUNCTION_ARGS)
+{
+ spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
+ spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
+ Point *query;
+ BOX *boxQuery;
+ double coord;
+
+ query = DatumGetPointP(in->query);
+ Assert(in->hasPrefix);
+ coord = DatumGetFloat8(in->prefixDatum);
+
+ if (in->allTheSame)
+ elog(ERROR, "allTheSame should not occur for k-d trees");
+
+ Assert(in->nNodes == 2);
+ out->nodeNumbers = (int *) palloc(sizeof(int) * 2);
+ out->levelAdds = (int *) palloc(sizeof(int) * 2);
+ out->levelAdds[0] = 1;
+ out->levelAdds[1] = 1;
+ out->nNodes = 0;
+
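+ /*
+ * For each strategy, visit the side(s) that could contain matches. When
+ * this level's split axis is orthogonal to the axis the operator
+ * constrains, both sides must be visited.
+ */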
+ switch (in->strategy)
+ {
+ case RTLeftStrategyNumber:
+ out->nNodes = 1;
+ out->nodeNumbers[0] = 0;
+
+ if ((in->level % 2) == 0 || FPge(query->x, coord))
+ {
+ out->nodeNumbers[1] = 1;
+ out->nNodes++;
+ }
+ break;
+ case RTRightStrategyNumber:
+ out->nNodes = 1;
+ out->nodeNumbers[0] = 1;
+
+ if ((in->level % 2) == 0 || FPle(query->x, coord))
+ {
+ out->nodeNumbers[1] = 0;
+ out->nNodes++;
+ }
+ break;
+ case RTSameStrategyNumber:
+ if (in->level % 2)
+ {
+ if (FPle(query->x, coord))
+ {
+ out->nodeNumbers[out->nNodes] = 0;
+ out->nNodes++;
+ }
+ if (FPge(query->x, coord))
+ {
+ out->nodeNumbers[out->nNodes] = 1;
+ out->nNodes++;
+ }
+ }
+ else
+ {
+ if (FPle(query->y, coord))
+ {
+ out->nodeNumbers[out->nNodes] = 0;
+ out->nNodes++;
+ }
+ if (FPge(query->y, coord))
+ {
+ out->nodeNumbers[out->nNodes] = 1;
+ out->nNodes++;
+ }
+ }
+ break;
+ case RTBelowStrategyNumber:
+ out->nNodes = 1;
+ out->nodeNumbers[0] = 0;
+
+ if ((in->level % 2) == 1 || FPge(query->y, coord))
+ {
+ out->nodeNumbers[1] = 1;
+ out->nNodes++;
+ }
+ break;
+ case RTAboveStrategyNumber:
+ out->nNodes = 1;
+ out->nodeNumbers[0] = 1;
+
+ if ((in->level % 2) == 1 || FPle(query->y, coord))
+ {
+ out->nodeNumbers[1] = 0;
+ out->nNodes++;
+ }
+ break;
+ case RTContainedByStrategyNumber:
+
+ /*
+ * For this operator, the query is a box not a point. We cheat to
+ * the extent of assuming that DatumGetPointP won't do anything
+ * that would be bad for a pointer-to-box.
+ */
+ boxQuery = DatumGetBoxP(in->query);
+
+ out->nNodes = 1;
+ if (in->level % 2)
+ {
+ if (FPlt(boxQuery->high.x, coord))
+ out->nodeNumbers[0] = 0;
+ else if (FPgt(boxQuery->low.x, coord))
+ out->nodeNumbers[0] = 1;
+ else
+ {
+ out->nodeNumbers[0] = 0;
+ out->nodeNumbers[1] = 1;
+ out->nNodes = 2;
+ }
+ }
+ else
+ {
+ if (FPlt(boxQuery->high.y, coord))
+ out->nodeNumbers[0] = 0;
+ else if (FPgt(boxQuery->low.y, coord))
+ out->nodeNumbers[0] = 1;
+ else
+ {
+ out->nodeNumbers[0] = 0;
+ out->nodeNumbers[1] = 1;
+ out->nNodes = 2;
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", in->strategy);
+ break;
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * spg_kd_leaf_consistent() is the same as spg_quad_leaf_consistent(),
+ * since we support the same operators and the same leaf data type.
+ * So we just borrow that function.
+ */
diff --git a/src/backend/access/spgist/spgquadtreeproc.c b/src/backend/access/spgist/spgquadtreeproc.c
new file mode 100644
index 00000000000..0be6e55ad30
--- /dev/null
+++ b/src/backend/access/spgist/spgquadtreeproc.c
@@ -0,0 +1,360 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgquadtreeproc.c
+ * implementation of quad tree over points for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgquadtreeproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gist.h" /* for RTree strategy numbers */
+#include "access/spgist.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/geo_decls.h"
+
+
+Datum
+spg_quad_config(PG_FUNCTION_ARGS)
+{
+ /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
+ spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);
+
+ cfg->prefixType = POINTOID;
+ cfg->labelType = VOIDOID; /* we don't need node labels */
+ cfg->longValuesOK = false;
+ PG_RETURN_VOID();
+}
+
+#define SPTEST(f, x, y) \
+ DatumGetBool(DirectFunctionCall2(f, PointPGetDatum(x), PointPGetDatum(y)))
+
+/*
+ * Determine which quadrant a point falls into, relative to the centroid.
+ *
+ * Quadrants are identified like this:
+ *
+ * 4 | 1
+ * ----+-----
+ * 3 | 2
+ *
+ * Points on one of the axes are taken to lie in the lowest-numbered
+ * adjacent quadrant.
+ */
+static int2
+getQuadrant(Point *centroid, Point *tst)
+{
+ if ((SPTEST(point_above, tst, centroid) ||
+ SPTEST(point_horiz, tst, centroid)) &&
+ (SPTEST(point_right, tst, centroid) ||
+ SPTEST(point_vert, tst, centroid)))
+ return 1;
+
+ if (SPTEST(point_below, tst, centroid) &&
+ (SPTEST(point_right, tst, centroid) ||
+ SPTEST(point_vert, tst, centroid)))
+ return 2;
+
+ if ((SPTEST(point_below, tst, centroid) ||
+ SPTEST(point_horiz, tst, centroid)) &&
+ SPTEST(point_left, tst, centroid))
+ return 3;
+
+ if (SPTEST(point_above, tst, centroid) &&
+ SPTEST(point_left, tst, centroid))
+ return 4;
+
+ elog(ERROR, "getQuadrant: impossible case");
+ return 0;
+}
+
+
+Datum
+spg_quad_choose(PG_FUNCTION_ARGS)
+{
+ spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0);
+ spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1);
+ Point *inPoint = DatumGetPointP(in->datum),
+ *centroid;
+
+ if (in->allTheSame)
+ {
+ out->resultType = spgMatchNode;
+ /* nodeN will be set by core */
+ out->result.matchNode.levelAdd = 0;
+ out->result.matchNode.restDatum = PointPGetDatum(inPoint);
+ PG_RETURN_VOID();
+ }
+
+ Assert(in->hasPrefix);
+ centroid = DatumGetPointP(in->prefixDatum);
+
+ Assert(in->nNodes == 4);
+
+ out->resultType = spgMatchNode;
+ out->result.matchNode.nodeN = getQuadrant(centroid, inPoint) - 1;
+ out->result.matchNode.levelAdd = 0;
+ out->result.matchNode.restDatum = PointPGetDatum(inPoint);
+
+ PG_RETURN_VOID();
+}
+
+#ifdef USE_MEDIAN
+static int
+x_cmp(const void *a, const void *b, void *arg)
+{
+ Point *pa = *(Point **) a;
+ Point *pb = *(Point **) b;
+
+ if (pa->x == pb->x)
+ return 0;
+ return (pa->x > pb->x) ? 1 : -1;
+}
+
+static int
+y_cmp(const void *a, const void *b, void *arg)
+{
+ Point *pa = *(Point **) a;
+ Point *pb = *(Point **) b;
+
+ if (pa->y == pb->y)
+ return 0;
+ return (pa->y > pb->y) ? 1 : -1;
+}
+#endif
+
+Datum
+spg_quad_picksplit(PG_FUNCTION_ARGS)
+{
+ spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0);
+ spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1);
+ int i;
+ Point *centroid;
+
+#ifdef USE_MEDIAN
+ /* Use the median values of x and y as the centroid point */
+ Point **sorted;
+
+ sorted = palloc(sizeof(*sorted) * in->nTuples);
+ for (i = 0; i < in->nTuples; i++)
+ sorted[i] = DatumGetPointP(in->datums[i]);
+
+ centroid = palloc(sizeof(*centroid));
+
+ qsort(sorted, in->nTuples, sizeof(*sorted), x_cmp);
+ centroid->x = sorted[in->nTuples >> 1]->x;
+ qsort(sorted, in->nTuples, sizeof(*sorted), y_cmp);
+ centroid->y = sorted[in->nTuples >> 1]->y;
+#else
+ /* Use the average values of x and y as the centroid point */
+ centroid = palloc0(sizeof(*centroid));
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ centroid->x += DatumGetPointP(in->datums[i])->x;
+ centroid->y += DatumGetPointP(in->datums[i])->y;
+ }
+
+ centroid->x /= in->nTuples;
+ centroid->y /= in->nTuples;
+#endif
+
+ out->hasPrefix = true;
+ out->prefixDatum = PointPGetDatum(centroid);
+
+ out->nNodes = 4;
+ out->nodeLabels = NULL; /* we don't need node labels */
+
+ out->mapTuplesToNodes = palloc(sizeof(int) * in->nTuples);
+ out->leafTupleDatums = palloc(sizeof(Datum) * in->nTuples);
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ Point *p = DatumGetPointP(in->datums[i]);
+ int quadrant = getQuadrant(centroid, p) - 1;
+
+ out->leafTupleDatums[i] = PointPGetDatum(p);
+ out->mapTuplesToNodes[i] = quadrant;
+ }
+
+ PG_RETURN_VOID();
+}
+
+
+/* Subroutine to fill out->nodeNumbers[] for spg_quad_inner_consistent */
+static void
+setNodes(spgInnerConsistentOut *out, bool isAll, int first, int second)
+{
+ if (isAll)
+ {
+ out->nNodes = 4;
+ out->nodeNumbers[0] = 0;
+ out->nodeNumbers[1] = 1;
+ out->nodeNumbers[2] = 2;
+ out->nodeNumbers[3] = 3;
+ }
+ else
+ {
+ out->nNodes = 2;
+ out->nodeNumbers[0] = first - 1;
+ out->nodeNumbers[1] = second - 1;
+ }
+}
+
+
+Datum
+spg_quad_inner_consistent(PG_FUNCTION_ARGS)
+{
+ spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
+ spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
+ Point *query,
+ *centroid;
+ BOX *boxQuery;
+
+ query = DatumGetPointP(in->query);
+ Assert(in->hasPrefix);
+ centroid = DatumGetPointP(in->prefixDatum);
+
+ if (in->allTheSame)
+ {
+ /* Report that all nodes should be visited */
+ int i;
+
+ out->nNodes = in->nNodes;
+ out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
+ for (i = 0; i < in->nNodes; i++)
+ out->nodeNumbers[i] = i;
+ PG_RETURN_VOID();
+ }
+
+ Assert(in->nNodes == 4);
+ out->nodeNumbers = (int *) palloc(sizeof(int) * 4);
+
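+ /* for each operator, select the quadrant(s) that could contain matches */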
+ switch (in->strategy)
+ {
+ case RTLeftStrategyNumber:
+ setNodes(out, SPTEST(point_left, centroid, query), 3, 4);
+ break;
+ case RTRightStrategyNumber:
+ setNodes(out, SPTEST(point_right, centroid, query), 1, 2);
+ break;
+ case RTSameStrategyNumber:
+ out->nNodes = 1;
+ out->nodeNumbers[0] = getQuadrant(centroid, query) - 1;
+ break;
+ case RTBelowStrategyNumber:
+ setNodes(out, SPTEST(point_below, centroid, query), 2, 3);
+ break;
+ case RTAboveStrategyNumber:
+ setNodes(out, SPTEST(point_above, centroid, query), 1, 4);
+ break;
+ case RTContainedByStrategyNumber:
+
+ /*
+ * For this operator, the query is a box not a point. We cheat to
+ * the extent of assuming that DatumGetPointP won't do anything
+ * that would be bad for a pointer-to-box.
+ */
+ boxQuery = DatumGetBoxP(in->query);
+
+ if (DatumGetBool(DirectFunctionCall2(box_contain_pt,
+ PointerGetDatum(boxQuery),
+ PointerGetDatum(centroid))))
+ {
+ /* centroid is in box, so descend to all quadrants */
+ setNodes(out, true, 0, 0);
+ }
+ else
+ {
+ /*
+ * Identify the quadrant(s) containing the corners of the box.
+ * Since the box does not contain the centroid, these are exactly
+ * the quadrants the box overlaps.
+ */
+ Point p;
+ int i,
+ r = 0;
+
+ p = boxQuery->low;
+ r |= 1 << (getQuadrant(centroid, &p) - 1);
+
+ p.y = boxQuery->high.y;
+ r |= 1 << (getQuadrant(centroid, &p) - 1);
+
+ p = boxQuery->high;
+ r |= 1 << (getQuadrant(centroid, &p) - 1);
+
+ p.x = boxQuery->low.x;
+ r |= 1 << (getQuadrant(centroid, &p) - 1);
+
+ /* we must descend into those quadrant(s) */
+ out->nNodes = 0;
+ for (i = 0; i < 4; i++)
+ {
+ if (r & (1 << i))
+ {
+ out->nodeNumbers[out->nNodes] = i;
+ out->nNodes++;
+ }
+ }
+ }
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", in->strategy);
+ break;
+ }
+
+ PG_RETURN_VOID();
+}
+
+
+Datum
+spg_quad_leaf_consistent(PG_FUNCTION_ARGS)
+{
+ spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0);
+ spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1);
+ Point *query = DatumGetPointP(in->query);
+ Point *datum = DatumGetPointP(in->leafDatum);
+ bool res;
+
+ /* all tests are exact */
+ out->recheck = false;
+
+ switch (in->strategy)
+ {
+ case RTLeftStrategyNumber:
+ res = SPTEST(point_left, datum, query);
+ break;
+ case RTRightStrategyNumber:
+ res = SPTEST(point_right, datum, query);
+ break;
+ case RTSameStrategyNumber:
+ res = SPTEST(point_eq, datum, query);
+ break;
+ case RTBelowStrategyNumber:
+ res = SPTEST(point_below, datum, query);
+ break;
+ case RTAboveStrategyNumber:
+ res = SPTEST(point_above, datum, query);
+ break;
+ case RTContainedByStrategyNumber:
+
+ /*
+ * For this operator, the query is a box not a point. We cheat to
+ * the extent of assuming that DatumGetPointP won't do anything
+ * that would be bad for a pointer-to-box.
+ */
+ res = SPTEST(box_contain_pt, query, datum);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", in->strategy);
+ res = false;
+ break;
+ }
+
+ PG_RETURN_BOOL(res);
+}
diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c
new file mode 100644
index 00000000000..1c6180b2d24
--- /dev/null
+++ b/src/backend/access/spgist/spgscan.c
@@ -0,0 +1,543 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgscan.c
+ * routines for scanning SP-GiST indexes
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgscan.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/relscan.h"
+#include "access/spgist_private.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+
+
+typedef struct ScanStackEntry
+{
+ Datum reconstructedValue; /* value reconstructed from parent */
+ int level; /* level of items on this page */
+ ItemPointerData ptr; /* block and offset to scan from */
+} ScanStackEntry;
+
+
+/* Free a ScanStackEntry */
+static void
+freeScanStackEntry(SpGistScanOpaque so, ScanStackEntry *stackEntry)
+{
+ if (!so->state.attType.attbyval &&
+ DatumGetPointer(stackEntry->reconstructedValue) != NULL)
+ pfree(DatumGetPointer(stackEntry->reconstructedValue));
+ pfree(stackEntry);
+}
+
+/* Free the entire stack */
+static void
+freeScanStack(SpGistScanOpaque so)
+{
+ ListCell *lc;
+
+ foreach(lc, so->scanStack)
+ {
+ freeScanStackEntry(so, (ScanStackEntry *) lfirst(lc));
+ }
+ list_free(so->scanStack);
+ so->scanStack = NIL;
+}
+
+/* Initialize scanStack with a single entry for the root page */
+static void
+resetSpGistScanOpaque(SpGistScanOpaque so)
+{
+ ScanStackEntry *startEntry = palloc0(sizeof(ScanStackEntry));
+
+ ItemPointerSet(&startEntry->ptr, SPGIST_HEAD_BLKNO, FirstOffsetNumber);
+
+ freeScanStack(so);
+ so->scanStack = list_make1(startEntry);
+ so->nPtrs = so->iPtr = 0;
+}
+
+Datum
+spgbeginscan(PG_FUNCTION_ARGS)
+{
+ Relation rel = (Relation) PG_GETARG_POINTER(0);
+ int keysz = PG_GETARG_INT32(1);
+ /* ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); */
+ IndexScanDesc scan;
+ SpGistScanOpaque so;
+
+ scan = RelationGetIndexScan(rel, keysz, 0);
+
+ so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData));
+ initSpGistState(&so->state, scan->indexRelation);
+ so->tempCxt = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST search temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ resetSpGistScanOpaque(so);
+ scan->opaque = so;
+
+ PG_RETURN_POINTER(scan);
+}
+
+Datum
+spgrescan(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+ ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
+
+ if (scankey && scan->numberOfKeys > 0)
+ {
+ memmove(scan->keyData, scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+
+ resetSpGistScanOpaque(so);
+
+ PG_RETURN_VOID();
+}
+
+Datum
+spgendscan(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ MemoryContextDelete(so->tempCxt);
+
+ PG_RETURN_VOID();
+}
+
+Datum
+spgmarkpos(PG_FUNCTION_ARGS)
+{
+ elog(ERROR, "SPGiST does not support mark/restore");
+ PG_RETURN_VOID();
+}
+
+Datum
+spgrestrpos(PG_FUNCTION_ARGS)
+{
+ elog(ERROR, "SPGiST does not support mark/restore");
+ PG_RETURN_VOID();
+}
+
+/*
+ * Test whether a leaf datum satisfies all the scan keys
+ *
+ * *recheck is set true if any of the operators are lossy
+ */
+static bool
+spgLeafTest(SpGistScanOpaque so, Datum leafDatum,
+ int level, Datum reconstructedValue,
+ bool *recheck)
+{
+ bool result = true;
+ spgLeafConsistentIn in;
+ spgLeafConsistentOut out;
+ MemoryContext oldCtx;
+ int i;
+
+ *recheck = false;
+
+ /* set up values that are the same for all quals */
+ in.reconstructedValue = reconstructedValue;
+ in.level = level;
+ in.leafDatum = leafDatum;
+
+ /* Apply each leaf consistent function, working in the temp context */
+ oldCtx = MemoryContextSwitchTo(so->tempCxt);
+ for (i = 0; i < so->numberOfKeys; i++)
+ {
+ in.strategy = so->keyData[i].sk_strategy;
+ in.query = so->keyData[i].sk_argument;
+
+ out.recheck = false;
+
+ result = DatumGetBool(FunctionCall2Coll(&so->state.leafConsistentFn,
+ so->keyData[i].sk_collation,
+ PointerGetDatum(&in),
+ PointerGetDatum(&out)));
+ *recheck |= out.recheck;
+ if (!result)
+ break;
+ }
+ MemoryContextSwitchTo(oldCtx);
+
+ return result;
+}
+
+/*
+ * Walk the tree and report all tuples passing the scan quals to the storeRes
+ * subroutine.
+ *
+ * If scanWholeIndex is true, we'll do just that. If not, we'll stop at the
+ * next page boundary once we have reported at least one tuple.
+ */
+static void
+spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex,
+ void (*storeRes) (SpGistScanOpaque, ItemPointer, bool))
+{
+ Buffer buffer = InvalidBuffer;
+ bool reportedSome = false;
+
+ while (scanWholeIndex || !reportedSome)
+ {
+ ScanStackEntry *stackEntry;
+ BlockNumber blkno;
+ OffsetNumber offset;
+ Page page;
+
+ /* Pull next to-do item from the list */
+ if (so->scanStack == NIL)
+ break; /* there are no more pages to scan */
+
+ stackEntry = (ScanStackEntry *) linitial(so->scanStack);
+ so->scanStack = list_delete_first(so->scanStack);
+
+redirect:
+ /* Check for interrupts, just in case of infinite loop */
+ CHECK_FOR_INTERRUPTS();
+
+ blkno = ItemPointerGetBlockNumber(&stackEntry->ptr);
+ offset = ItemPointerGetOffsetNumber(&stackEntry->ptr);
+
+ if (buffer == InvalidBuffer)
+ {
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ }
+ else if (blkno != BufferGetBlockNumber(buffer))
+ {
+ UnlockReleaseBuffer(buffer);
+ buffer = ReadBuffer(index, blkno);
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ }
+ /* else new pointer points to the same page, no work needed */
+
+ page = BufferGetPage(buffer);
+
+ if (SpGistPageIsLeaf(page))
+ {
+ SpGistLeafTuple leafTuple;
+ OffsetNumber max = PageGetMaxOffsetNumber(page);
+ bool recheck = false;
+
+ if (blkno == SPGIST_HEAD_BLKNO)
+ {
+ /* When root is a leaf, examine all its tuples */
+ for (offset = FirstOffsetNumber; offset <= max; offset++)
+ {
+ leafTuple = (SpGistLeafTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+ if (leafTuple->tupstate != SPGIST_LIVE)
+ {
+ /* all tuples on root should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ leafTuple->tupstate);
+ }
+
+ Assert(ItemPointerIsValid(&leafTuple->heapPtr));
+ if (spgLeafTest(so,
+ SGLTDATUM(leafTuple, &so->state),
+ stackEntry->level,
+ stackEntry->reconstructedValue,
+ &recheck))
+ {
+ storeRes(so, &leafTuple->heapPtr, recheck);
+ reportedSome = true;
+ }
+ }
+ }
+ else
+ {
+ /* Normal case: just examine the chain we arrived at */
+ while (offset != InvalidOffsetNumber)
+ {
+ Assert(offset >= FirstOffsetNumber && offset <= max);
+ leafTuple = (SpGistLeafTuple)
+ PageGetItem(page, PageGetItemId(page, offset));
+ if (leafTuple->tupstate != SPGIST_LIVE)
+ {
+ if (leafTuple->tupstate == SPGIST_REDIRECT)
+ {
+ /* redirection tuple should be first in chain */
+ Assert(offset == ItemPointerGetOffsetNumber(&stackEntry->ptr));
+ /* transfer attention to redirect point */
+ stackEntry->ptr = ((SpGistDeadTuple) leafTuple)->pointer;
+ Assert(ItemPointerGetBlockNumber(&stackEntry->ptr) != SPGIST_METAPAGE_BLKNO);
+ goto redirect;
+ }
+ if (leafTuple->tupstate == SPGIST_DEAD)
+ {
+ /* dead tuple should be first in chain */
+ Assert(offset == ItemPointerGetOffsetNumber(&stackEntry->ptr));
+ /* No live entries on this page */
+ Assert(leafTuple->nextOffset == InvalidOffsetNumber);
+ break;
+ }
+ /* We should not arrive at a placeholder */
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ leafTuple->tupstate);
+ }
+
+ Assert(ItemPointerIsValid(&leafTuple->heapPtr));
+ if (spgLeafTest(so,
+ SGLTDATUM(leafTuple, &so->state),
+ stackEntry->level,
+ stackEntry->reconstructedValue,
+ &recheck))
+ {
+ storeRes(so, &leafTuple->heapPtr, recheck);
+ reportedSome = true;
+ }
+
+ offset = leafTuple->nextOffset;
+ }
+ }
+ }
+ else /* page is inner */
+ {
+ SpGistInnerTuple innerTuple;
+ SpGistNodeTuple node;
+ int i;
+
+ innerTuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, offset));
+
+ if (innerTuple->tupstate != SPGIST_LIVE)
+ {
+ if (innerTuple->tupstate == SPGIST_REDIRECT)
+ {
+ /* transfer attention to redirect point */
+ stackEntry->ptr = ((SpGistDeadTuple) innerTuple)->pointer;
+ Assert(ItemPointerGetBlockNumber(&stackEntry->ptr) != SPGIST_METAPAGE_BLKNO);
+ goto redirect;
+ }
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ innerTuple->tupstate);
+ }
+
+ if (so->numberOfKeys == 0)
+ {
+ /*
+ * This case cannot happen at the moment, because we don't
+ * set pg_am.amoptionalkey for SP-GiST. In order for full
+ * index scans to produce correct answers, we'd need to
+ * index nulls, which we don't.
+ */
+ Assert(false);
+
+#ifdef NOT_USED
+ /*
+ * A full index scan could be done approximately like this,
+ * but note that reconstruction of indexed values would be
+ * impossible unless the API for inner_consistent is changed.
+ */
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (ItemPointerIsValid(&node->t_tid))
+ {
+ ScanStackEntry *newEntry = palloc(sizeof(ScanStackEntry));
+
+ newEntry->ptr = node->t_tid;
+ newEntry->level = -1;
+ newEntry->reconstructedValue = (Datum) 0;
+ so->scanStack = lcons(newEntry, so->scanStack);
+ }
+ }
+#endif
+ }
+ else
+ {
+ spgInnerConsistentIn in;
+ spgInnerConsistentOut out;
+ SpGistNodeTuple *nodes;
+ int *andMap;
+ int *levelAdds;
+ Datum *reconstructedValues;
+ int j,
+ nMatches = 0;
+ MemoryContext oldCtx;
+
+ /* use temp context for calling inner_consistent */
+ oldCtx = MemoryContextSwitchTo(so->tempCxt);
+
+ /* set up values that are the same for all scankeys */
+ in.reconstructedValue = stackEntry->reconstructedValue;
+ in.level = stackEntry->level;
+ in.allTheSame = innerTuple->allTheSame;
+ in.hasPrefix = (innerTuple->prefixSize > 0);
+ in.prefixDatum = SGITDATUM(innerTuple, &so->state);
+ in.nNodes = innerTuple->nNodes;
+ in.nodeLabels = spgExtractNodeLabels(&so->state, innerTuple);
+
+ /* collect node pointers */
+ nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * in.nNodes);
+ SGITITERATE(innerTuple, i, node)
+ {
+ nodes[i] = node;
+ }
+
+ andMap = (int *) palloc0(sizeof(int) * in.nNodes);
+ levelAdds = (int *) palloc0(sizeof(int) * in.nNodes);
+ reconstructedValues = (Datum *) palloc0(sizeof(Datum) * in.nNodes);
+
+ for (j = 0; j < so->numberOfKeys; j++)
+ {
+ in.strategy = so->keyData[j].sk_strategy;
+ in.query = so->keyData[j].sk_argument;
+
+ memset(&out, 0, sizeof(out));
+
+ FunctionCall2Coll(&so->state.innerConsistentFn,
+ so->keyData[j].sk_collation,
+ PointerGetDatum(&in),
+ PointerGetDatum(&out));
+
+ /* If allTheSame, they should all or none of 'em match */
+ if (innerTuple->allTheSame)
+ if (out.nNodes != 0 && out.nNodes != in.nNodes)
+ elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple");
+
+ nMatches = 0;
+ for (i = 0; i < out.nNodes; i++)
+ {
+ int nodeN = out.nodeNumbers[i];
+
+ andMap[nodeN]++;
+ if (andMap[nodeN] == j + 1)
+ nMatches++;
+ if (out.levelAdds)
+ levelAdds[nodeN] = out.levelAdds[i];
+ if (out.reconstructedValues)
+ reconstructedValues[nodeN] = out.reconstructedValues[i];
+ }
+
+ /* quit as soon as all nodes have failed some qual */
+ if (nMatches == 0)
+ break;
+ }
+
+ MemoryContextSwitchTo(oldCtx);
+
+ if (nMatches > 0)
+ {
+ for (i = 0; i < in.nNodes; i++)
+ {
+ if (andMap[i] == so->numberOfKeys &&
+ ItemPointerIsValid(&nodes[i]->t_tid))
+ {
+ ScanStackEntry *newEntry;
+
+ /* Create new work item for this node */
+ newEntry = palloc(sizeof(ScanStackEntry));
+ newEntry->ptr = nodes[i]->t_tid;
+ newEntry->level = stackEntry->level + levelAdds[i];
+ /* Must copy value out of temp context */
+ newEntry->reconstructedValue =
+ datumCopy(reconstructedValues[i],
+ so->state.attType.attbyval,
+ so->state.attType.attlen);
+
+ so->scanStack = lcons(newEntry, so->scanStack);
+ }
+ }
+ }
+ }
+ }
+
+ /* done with this scan stack entry */
+ freeScanStackEntry(so, stackEntry);
+ /* clear temp context before proceeding to the next one */
+ MemoryContextReset(so->tempCxt);
+ }
+
+ if (buffer != InvalidBuffer)
+ UnlockReleaseBuffer(buffer);
+}
+
+/* storeRes subroutine for getbitmap case */
+static void
+storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, bool recheck)
+{
+ tbm_add_tuples(so->tbm, heapPtr, 1, recheck);
+ so->ntids++;
+}
+
+Datum
+spggetbitmap(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ /* Copy scankey to *so so we don't need to pass it around separately */
+ so->numberOfKeys = scan->numberOfKeys;
+ so->keyData = scan->keyData;
+
+ so->tbm = tbm;
+ so->ntids = 0;
+
+ spgWalk(scan->indexRelation, so, true, storeBitmap);
+
+ PG_RETURN_INT64(so->ntids);
+}
+
+/* storeRes subroutine for gettuple case */
+static void
+storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, bool recheck)
+{
+ Assert(so->nPtrs < MaxIndexTuplesPerPage);
+ so->heapPtrs[so->nPtrs] = *heapPtr;
+ so->recheck[so->nPtrs] = recheck;
+ so->nPtrs++;
+}
+
+Datum
+spggettuple(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+ SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+
+ if (dir != ForwardScanDirection)
+ elog(ERROR, "SP-GiST only supports forward scan direction");
+
+ /* Copy scankey to *so so we don't need to pass it around separately */
+ so->numberOfKeys = scan->numberOfKeys;
+ so->keyData = scan->keyData;
+
+ for (;;)
+ {
+ if (so->iPtr < so->nPtrs)
+ {
+ /* continuing to return tuples from a leaf page */
+ scan->xs_ctup.t_self = so->heapPtrs[so->iPtr];
+ scan->xs_recheck = so->recheck[so->iPtr];
+ so->iPtr++;
+ PG_RETURN_BOOL(true);
+ }
+
+ so->iPtr = so->nPtrs = 0;
+ spgWalk(scan->indexRelation, so, false, storeGettuple);
+
+ if (so->nPtrs == 0)
+ break; /* must have completed scan */
+ }
+
+ PG_RETURN_BOOL(false);
+}
diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c
new file mode 100644
index 00000000000..b6037978425
--- /dev/null
+++ b/src/backend/access/spgist/spgtextproc.c
@@ -0,0 +1,594 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgtextproc.c
+ * implementation of compressed-suffix tree over text
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgtextproc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/spgist.h"
+#include "catalog/pg_type.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/pg_locale.h"
+
+
+/*
+ * In the worst case, an inner tuple in a text suffix tree could have as many
+ * as 256 nodes (one for each possible byte value). Each node can take 16
+ * bytes on MAXALIGN=8 machines. The inner tuple must fit on an index page
+ * of size BLCKSZ. Rather than assuming we know the exact amount of overhead
+ * imposed by page headers, tuple headers, etc, we leave 100 bytes for that
+ * (the actual overhead should be no more than 56 bytes at this writing, so
+ * there is slop in this number). The upshot is that the maximum safe prefix
+ * length is this:
+ */
+#define SPGIST_MAX_PREFIX_LENGTH (BLCKSZ - 256 * 16 - 100)
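+/* For example, with the default BLCKSZ of 8192 this works out to 8192 - 4096 - 100 = 3996 bytes. */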
+
+/* Struct for sorting values in picksplit */
+typedef struct spgNodePtr
+{
+ Datum d;
+ int i;
+ uint8 c;
+} spgNodePtr;
+
+
+Datum
+spg_text_config(PG_FUNCTION_ARGS)
+{
+ /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
+ spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);
+
+ cfg->prefixType = TEXTOID;
+ cfg->labelType = CHAROID;
+ cfg->longValuesOK = true; /* suffixing will shorten long values */
+ PG_RETURN_VOID();
+}
+
+/*
+ * Form a text datum from the given not-necessarily-null-terminated string,
+ * using short varlena header format if possible
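+ * (with the standard 1-byte short varlena header, this covers data of up to
+ * 126 bytes)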
+ */
+static Datum
+formTextDatum(const char *data, int datalen)
+{
+ char *p;
+
+ p = (char *) palloc(datalen + VARHDRSZ);
+
+ if (datalen + VARHDRSZ_SHORT <= VARATT_SHORT_MAX)
+ {
+ SET_VARSIZE_SHORT(p, datalen + VARHDRSZ_SHORT);
+ if (datalen)
+ memcpy(p + VARHDRSZ_SHORT, data, datalen);
+ }
+ else
+ {
+ SET_VARSIZE(p, datalen + VARHDRSZ);
+ memcpy(p + VARHDRSZ, data, datalen);
+ }
+
+ return PointerGetDatum(p);
+}
+
+/*
+ * Find the length of the common prefix of a and b
+ */
+static int
+commonPrefix(const char *a, const char *b, int lena, int lenb)
+{
+ int i = 0;
+
+ while (i < lena && i < lenb && *a == *b)
+ {
+ a++;
+ b++;
+ i++;
+ }
+
+ return i;
+}
+
+/*
+ * Binary search an array of uint8 datums for a match to c
+ *
+ * On success, *i gets the match location; on failure, it gets where to insert
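+ * (for example, searching for 'd' among the labels 'a', 'c', 'f' fails and
+ * sets *i = 2, the position where 'd' would be inserted)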
+ */
+static bool
+searchChar(Datum *nodeLabels, int nNodes, uint8 c, int *i)
+{
+ int StopLow = 0,
+ StopHigh = nNodes;
+
+ while (StopLow < StopHigh)
+ {
+ int StopMiddle = (StopLow + StopHigh) >> 1;
+ uint8 middle = DatumGetUInt8(nodeLabels[StopMiddle]);
+
+ if (c < middle)
+ StopHigh = StopMiddle;
+ else if (c > middle)
+ StopLow = StopMiddle + 1;
+ else
+ {
+ *i = StopMiddle;
+ return true;
+ }
+ }
+
+ *i = StopHigh;
+ return false;
+}
+
+Datum
+spg_text_choose(PG_FUNCTION_ARGS)
+{
+ spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0);
+ spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1);
+ text *inText = DatumGetTextPP(in->datum);
+ char *inStr = VARDATA_ANY(inText);
+ int inSize = VARSIZE_ANY_EXHDR(inText);
+ uint8 nodeChar = '\0';
+ int i = 0;
+ int commonLen = 0;
+
+ /* Check for prefix match, set nodeChar to first byte after prefix */
+ if (in->hasPrefix)
+ {
+ text *prefixText = DatumGetTextPP(in->prefixDatum);
+ char *prefixStr = VARDATA_ANY(prefixText);
+ int prefixSize = VARSIZE_ANY_EXHDR(prefixText);
+
+ commonLen = commonPrefix(inStr + in->level,
+ prefixStr,
+ inSize - in->level,
+ prefixSize);
+
+ if (commonLen == prefixSize)
+ {
+ if (inSize - in->level > commonLen)
+ nodeChar = *(uint8 *) (inStr + in->level + commonLen);
+ else
+ nodeChar = '\0';
+ }
+ else
+ {
+ /* Must split tuple because incoming value doesn't match prefix */
+ out->resultType = spgSplitTuple;
+
+ if (commonLen == 0)
+ {
+ out->result.splitTuple.prefixHasPrefix = false;
+ }
+ else
+ {
+ out->result.splitTuple.prefixHasPrefix = true;
+ out->result.splitTuple.prefixPrefixDatum =
+ formTextDatum(prefixStr, commonLen);
+ }
+ out->result.splitTuple.nodeLabel =
+ UInt8GetDatum(*(prefixStr + commonLen));
+
+ if (prefixSize - commonLen == 1)
+ {
+ out->result.splitTuple.postfixHasPrefix = false;
+ }
+ else
+ {
+ out->result.splitTuple.postfixHasPrefix = true;
+ out->result.splitTuple.postfixPrefixDatum =
+ formTextDatum(prefixStr + commonLen + 1,
+ prefixSize - commonLen - 1);
+ }
+
+ PG_RETURN_VOID();
+ }
+ }
+ else if (inSize > in->level)
+ {
+ nodeChar = *(uint8 *) (inStr + in->level);
+ }
+ else
+ {
+ nodeChar = '\0';
+ }
+
+ /* Look up nodeChar in the node label array */
+ if (searchChar(in->nodeLabels, in->nNodes, nodeChar, &i))
+ {
+ /*
+ * Descend to existing node. (If in->allTheSame, the core code will
+ * ignore our nodeN specification here, but that's OK. We still
+ * have to provide the correct levelAdd and restDatum values, and
+ * those are the same regardless of which node gets chosen by core.)
+ */
+ out->resultType = spgMatchNode;
+ out->result.matchNode.nodeN = i;
+ out->result.matchNode.levelAdd = commonLen + 1;
+ if (inSize - in->level - commonLen - 1 > 0)
+ out->result.matchNode.restDatum =
+ formTextDatum(inStr + in->level + commonLen + 1,
+ inSize - in->level - commonLen - 1);
+ else
+ out->result.matchNode.restDatum =
+ formTextDatum(NULL, 0);
+ }
+ else if (in->allTheSame)
+ {
+ /*
+ * Can't use AddNode action, so split the tuple. The upper tuple
+ * has the same prefix as before and uses an empty node label for
+ * the lower tuple. The lower tuple has no prefix and the same
+ * node labels as the original tuple.
+ */
+ out->resultType = spgSplitTuple;
+ out->result.splitTuple.prefixHasPrefix = in->hasPrefix;
+ out->result.splitTuple.prefixPrefixDatum = in->prefixDatum;
+ out->result.splitTuple.nodeLabel = UInt8GetDatum('\0');
+ out->result.splitTuple.postfixHasPrefix = false;
+ }
+ else
+ {
+ /* Add a node for the not-previously-seen nodeChar value */
+ out->resultType = spgAddNode;
+ out->result.addNode.nodeLabel = UInt8GetDatum(nodeChar);
+ out->result.addNode.nodeN = i;
+ }
+
+ PG_RETURN_VOID();
+}
+
+/* qsort comparator to sort spgNodePtr structs by "c" */
+static int
+cmpNodePtr(const void *a, const void *b)
+{
+ const spgNodePtr *aa = (const spgNodePtr *) a;
+ const spgNodePtr *bb = (const spgNodePtr *) b;
+
+ if (aa->c == bb->c)
+ return 0;
+ else if (aa->c > bb->c)
+ return 1;
+ else
+ return -1;
+}
+
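+/*
+ * Illustrative example (assumed input, not from the source): given the values
+ * "apple", "apply", and "april", the common prefix is "ap", the node labels
+ * are the next bytes 'p' and 'r', and the leaf datums stored under them are
+ * "le", "ly", and "il" respectively.
+ */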
+Datum
+spg_text_picksplit(PG_FUNCTION_ARGS)
+{
+ spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0);
+ spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1);
+ text *text0 = DatumGetTextPP(in->datums[0]);
+ int i,
+ commonLen;
+ spgNodePtr *nodes;
+
+ /* Identify longest common prefix, if any */
+ commonLen = VARSIZE_ANY_EXHDR(text0);
+ for (i = 1; i < in->nTuples && commonLen > 0; i++)
+ {
+ text *texti = DatumGetTextPP(in->datums[i]);
+ int tmp = commonPrefix(VARDATA_ANY(text0),
+ VARDATA_ANY(texti),
+ VARSIZE_ANY_EXHDR(text0),
+ VARSIZE_ANY_EXHDR(texti));
+
+ if (tmp < commonLen)
+ commonLen = tmp;
+ }
+
+ /*
+ * Limit the prefix length, if necessary, to ensure that the resulting
+ * inner tuple will fit on a page.
+ */
+ commonLen = Min(commonLen, SPGIST_MAX_PREFIX_LENGTH);
+
+ /* Set node prefix to be that string, if it's not empty */
+ if (commonLen == 0)
+ {
+ out->hasPrefix = false;
+ }
+ else
+ {
+ out->hasPrefix = true;
+ out->prefixDatum = formTextDatum(VARDATA_ANY(text0), commonLen);
+ }
+
+ /* Extract the node label (first non-common byte) from each value */
+ nodes = (spgNodePtr *) palloc(sizeof(spgNodePtr) * in->nTuples);
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ text *texti = DatumGetTextPP(in->datums[i]);
+
+ if (commonLen < VARSIZE_ANY_EXHDR(texti))
+ nodes[i].c = *(uint8 *) (VARDATA_ANY(texti) + commonLen);
+ else
+ nodes[i].c = '\0'; /* use \0 if string is all common */
+ nodes[i].i = i;
+ nodes[i].d = in->datums[i];
+ }
+
+ /*
+ * Sort by label bytes so that we can group the values into nodes. This
+ * also ensures that the nodes are ordered by label value, allowing the
+ * use of binary search in searchChar.
+ */
+ qsort(nodes, in->nTuples, sizeof(*nodes), cmpNodePtr);
+
+ /* And emit results */
+ out->nNodes = 0;
+ out->nodeLabels = (Datum *) palloc(sizeof(Datum) * in->nTuples);
+ out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples);
+ out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples);
+
+ for (i = 0; i < in->nTuples; i++)
+ {
+ text *texti = DatumGetTextPP(nodes[i].d);
+ Datum leafD;
+
+ if (i == 0 || nodes[i].c != nodes[i - 1].c)
+ {
+ out->nodeLabels[out->nNodes] = UInt8GetDatum(nodes[i].c);
+ out->nNodes++;
+ }
+
+ if (commonLen < VARSIZE_ANY_EXHDR(texti))
+ leafD = formTextDatum(VARDATA_ANY(texti) + commonLen + 1,
+ VARSIZE_ANY_EXHDR(texti) - commonLen - 1);
+ else
+ leafD = formTextDatum(NULL, 0);
+
+ out->leafTupleDatums[nodes[i].i] = leafD;
+ out->mapTuplesToNodes[nodes[i].i] = out->nNodes - 1;
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+spg_text_inner_consistent(PG_FUNCTION_ARGS)
+{
+ spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0);
+ spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = in->strategy;
+ text *inText;
+ int inSize;
+ int i;
+ text *reconstrText = NULL;
+ int maxReconstrLen = 0;
+ text *prefixText = NULL;
+ int prefixSize = 0;
+
+ /*
+ * If it's a collation-aware operator, but the collation is C, we can
+ * treat it as non-collation-aware.
+ */
+ if (strategy > 10 &&
+ lc_collate_is_c(PG_GET_COLLATION()))
+ strategy -= 10;
+
+ inText = DatumGetTextPP(in->query);
+ inSize = VARSIZE_ANY_EXHDR(inText);
+
+ /*
+ * Reconstruct values represented at this tuple, including parent data,
+ * prefix of this tuple if any, and the node label if any. in->level
+ * should be the length of the previously reconstructed value, and the
+ * number of bytes added here is prefixSize or prefixSize + 1.
+ *
+ * Note: we assume that in->reconstructedValue isn't toasted and doesn't
+ * have a short varlena header. This is okay because it must have been
+ * created by a previous invocation of this routine, and we always emit
+ * long-format reconstructed values.
+ */
+ Assert(in->level == 0 ? DatumGetPointer(in->reconstructedValue) == NULL :
+ VARSIZE_ANY_EXHDR(DatumGetPointer(in->reconstructedValue)) == in->level);
+
+ maxReconstrLen = in->level + 1;
+ if (in->hasPrefix)
+ {
+ prefixText = DatumGetTextPP(in->prefixDatum);
+ prefixSize = VARSIZE_ANY_EXHDR(prefixText);
+ maxReconstrLen += prefixSize;
+ }
+
+ reconstrText = palloc(VARHDRSZ + maxReconstrLen);
+ SET_VARSIZE(reconstrText, VARHDRSZ + maxReconstrLen);
+
+ if (in->level)
+ memcpy(VARDATA(reconstrText),
+ VARDATA(DatumGetPointer(in->reconstructedValue)),
+ in->level);
+ if (prefixSize)
+ memcpy(((char *) VARDATA(reconstrText)) + in->level,
+ VARDATA_ANY(prefixText),
+ prefixSize);
+ /* last byte of reconstrText will be filled in below */
+
+ /*
+ * Scan the child nodes. For each one, complete the reconstructed value
+ * and see if it's consistent with the query. If so, emit an entry into
+ * the output arrays.
+ */
+ out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes);
+ out->levelAdds = (int *) palloc(sizeof(int) * in->nNodes);
+ out->reconstructedValues = (Datum *) palloc(sizeof(Datum) * in->nNodes);
+ out->nNodes = 0;
+
+ for (i = 0; i < in->nNodes; i++)
+ {
+ uint8 nodeChar = DatumGetUInt8(in->nodeLabels[i]);
+ int thisLen;
+ int r;
+ bool res = false;
+
+ /* If nodeChar is zero, don't include it in data */
+ if (nodeChar == '\0')
+ thisLen = maxReconstrLen - 1;
+ else
+ {
+ ((char *) VARDATA(reconstrText))[maxReconstrLen - 1] = nodeChar;
+ thisLen = maxReconstrLen;
+ }
+
+ r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText),
+ Min(inSize, thisLen));
+
+ switch (strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ if (r <= 0)
+ res = true;
+ break;
+ case BTEqualStrategyNumber:
+ if (r == 0 && inSize >= thisLen)
+ res = true;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ if (r >= 0)
+ res = true;
+ break;
+ case BTLessStrategyNumber + 10:
+ case BTLessEqualStrategyNumber + 10:
+ case BTGreaterEqualStrategyNumber + 10:
+ case BTGreaterStrategyNumber + 10:
+ /*
+				 * with a non-C collation we need to traverse the whole tree :-(
+ */
+ res = true;
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d",
+ in->strategy);
+ break;
+ }
+
+ if (res)
+ {
+ out->nodeNumbers[out->nNodes] = i;
+ out->levelAdds[out->nNodes] = thisLen - in->level;
+ SET_VARSIZE(reconstrText, VARHDRSZ + thisLen);
+ out->reconstructedValues[out->nNodes] =
+ datumCopy(PointerGetDatum(reconstrText), false, -1);
+ out->nNodes++;
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+spg_text_leaf_consistent(PG_FUNCTION_ARGS)
+{
+ spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0);
+ spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1);
+ StrategyNumber strategy = in->strategy;
+ text *query = DatumGetTextPP(in->query);
+ int level = in->level;
+ text *leafValue,
+ *reconstrValue = NULL;
+ char *fullValue;
+ int fullLen;
+ int queryLen;
+ int r;
+ bool res;
+
+ /* all tests are exact */
+ out->recheck = false;
+
+ leafValue = DatumGetTextPP(in->leafDatum);
+
+ if (DatumGetPointer(in->reconstructedValue))
+ reconstrValue = DatumGetTextP(in->reconstructedValue);
+
+ Assert(level == 0 ? reconstrValue == NULL :
+ VARSIZE_ANY_EXHDR(reconstrValue) == level);
+
+ fullLen = level + VARSIZE_ANY_EXHDR(leafValue);
+
+ queryLen = VARSIZE_ANY_EXHDR(query);
+
+ /* For equality, we needn't reconstruct fullValue if not same length */
+ if (strategy == BTEqualStrategyNumber && queryLen != fullLen)
+ PG_RETURN_BOOL(false);
+
+ /* Else, reconstruct the full string represented by this leaf tuple */
+ if (VARSIZE_ANY_EXHDR(leafValue) == 0 && level > 0)
+ {
+ fullValue = VARDATA(reconstrValue);
+ }
+ else
+ {
+ fullValue = palloc(fullLen);
+ if (level)
+ memcpy(fullValue, VARDATA(reconstrValue), level);
+ if (VARSIZE_ANY_EXHDR(leafValue) > 0)
+ memcpy(fullValue + level, VARDATA_ANY(leafValue),
+ VARSIZE_ANY_EXHDR(leafValue));
+ }
+
+ /* Run the appropriate type of comparison */
+ if (strategy > 10)
+ {
+ /* Collation-aware comparison */
+ strategy -= 10;
+
+ /* If asserts are enabled, verify encoding of reconstructed string */
+ Assert(pg_verifymbstr(fullValue, fullLen, false));
+
+ r = varstr_cmp(fullValue, Min(queryLen, fullLen),
+ VARDATA_ANY(query), Min(queryLen, fullLen),
+ PG_GET_COLLATION());
+ }
+ else
+ {
+ /* Non-collation-aware comparison */
+ r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen));
+ }
+
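+	/*
+	 * If the strings are equal up to the shorter length, the longer one sorts
+	 * last: e.g., a reconstructed "abc" compares less than the query "abcd".
+	 */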
+ if (r == 0)
+ {
+ if (queryLen > fullLen)
+ r = -1;
+ else if (queryLen < fullLen)
+ r = 1;
+ }
+
+ switch (strategy)
+ {
+ case BTLessStrategyNumber:
+ res = (r < 0);
+ break;
+ case BTLessEqualStrategyNumber:
+ res = (r <= 0);
+ break;
+ case BTEqualStrategyNumber:
+ res = (r == 0);
+ break;
+ case BTGreaterEqualStrategyNumber:
+ res = (r >= 0);
+ break;
+ case BTGreaterStrategyNumber:
+ res = (r > 0);
+ break;
+ default:
+ elog(ERROR, "unrecognized strategy number: %d", in->strategy);
+ res = false;
+ break;
+ }
+
+ PG_RETURN_BOOL(res);
+}
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
new file mode 100644
index 00000000000..d6c01a5f842
--- /dev/null
+++ b/src/backend/access/spgist/spgutils.c
@@ -0,0 +1,850 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgutils.c
+ * various support functions for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgutils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/reloptions.h"
+#include "access/spgist_private.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "utils/lsyscache.h"
+
+
+/* Fill in a SpGistTypeDesc struct with info about the specified data type */
+static void
+fillTypeDesc(SpGistTypeDesc *desc, Oid type)
+{
+ desc->type = type;
+ get_typlenbyval(type, &desc->attlen, &desc->attbyval);
+}
+
+/* Initialize SpGistState for working with the given index */
+void
+initSpGistState(SpGistState *state, Relation index)
+{
+ Oid atttype;
+ spgConfigIn in;
+
+ /* SPGiST doesn't support multi-column indexes */
+ Assert(index->rd_att->natts == 1);
+
+ /*
+ * Get the actual data type of the indexed column from the index tupdesc.
+ * We pass this to the opclass config function so that polymorphic
+ * opclasses are possible.
+ */
+ atttype = index->rd_att->attrs[0]->atttypid;
+
+ /* Get the config info for the opclass */
+ in.attType = atttype;
+
+ memset(&state->config, 0, sizeof(state->config));
+
+ FunctionCall2Coll(index_getprocinfo(index, 1, SPGIST_CONFIG_PROC),
+ index->rd_indcollation[0],
+ PointerGetDatum(&in),
+ PointerGetDatum(&state->config));
+
+ /* Get the information we need about each relevant datatype */
+ fillTypeDesc(&state->attType, atttype);
+ fillTypeDesc(&state->attPrefixType, state->config.prefixType);
+ fillTypeDesc(&state->attLabelType, state->config.labelType);
+
+ /* Get lookup info for opclass support procs */
+ fmgr_info_copy(&(state->chooseFn),
+ index_getprocinfo(index, 1, SPGIST_CHOOSE_PROC),
+ CurrentMemoryContext);
+ fmgr_info_copy(&(state->picksplitFn),
+ index_getprocinfo(index, 1, SPGIST_PICKSPLIT_PROC),
+ CurrentMemoryContext);
+ fmgr_info_copy(&(state->innerConsistentFn),
+ index_getprocinfo(index, 1, SPGIST_INNER_CONSISTENT_PROC),
+ CurrentMemoryContext);
+ fmgr_info_copy(&(state->leafConsistentFn),
+ index_getprocinfo(index, 1, SPGIST_LEAF_CONSISTENT_PROC),
+ CurrentMemoryContext);
+
+ /* Make workspace for constructing dead tuples */
+ state->deadTupleStorage = palloc0(SGDTSIZE);
+
+ /* Set XID to use in redirection tuples */
+ state->myXid = GetTopTransactionIdIfAny();
+
+ state->isBuild = false;
+}
+
+/*
+ * Allocate a new page (either by recycling, or by extending the index file).
+ *
+ * The returned buffer is already pinned and exclusive-locked.
+ * Caller is responsible for initializing the page by calling SpGistInitBuffer.
+ */
+Buffer
+SpGistNewBuffer(Relation index)
+{
+ Buffer buffer;
+ bool needLock;
+
+ /* First, try to get a page from FSM */
+ for (;;)
+ {
+ BlockNumber blkno = GetFreeIndexPage(index);
+
+ if (blkno == InvalidBlockNumber)
+ break; /* nothing known to FSM */
+
+ /*
+ * The root page shouldn't ever be listed in FSM, but just in case it
+ * is, ignore it.
+ */
+ if (blkno == SPGIST_HEAD_BLKNO)
+ continue;
+
+ buffer = ReadBuffer(index, blkno);
+
+ /*
+ * We have to guard against the possibility that someone else already
+ * recycled this page; the buffer may be locked if so.
+ */
+ if (ConditionalLockBuffer(buffer))
+ {
+ Page page = BufferGetPage(buffer);
+
+ if (PageIsNew(page))
+ return buffer; /* OK to use, if never initialized */
+
+ if (SpGistPageIsDeleted(page) || PageIsEmpty(page))
+ return buffer; /* OK to use */
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ }
+
+ /* Can't use it, so release buffer and try again */
+ ReleaseBuffer(buffer);
+ }
+
+ /* Must extend the file */
+ needLock = !RELATION_IS_LOCAL(index);
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+
+ buffer = ReadBuffer(index, P_NEW);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ return buffer;
+}
+
+/*
+ * Fetch local cache of lastUsedPages info, initializing it from the metapage
+ * if necessary
+ */
+static SpGistCache *
+spgGetCache(Relation index)
+{
+ SpGistCache *cache;
+
+ if (index->rd_amcache == NULL)
+ {
+ Buffer metabuffer;
+ SpGistMetaPageData *metadata;
+
+ cache = MemoryContextAlloc(index->rd_indexcxt,
+ sizeof(SpGistCache));
+
+ metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO);
+ LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
+
+ metadata = SpGistPageGetMeta(BufferGetPage(metabuffer));
+
+ if (metadata->magicNumber != SPGIST_MAGIC_NUMBER)
+ elog(ERROR, "index \"%s\" is not an SP-GiST index",
+ RelationGetRelationName(index));
+
+ *cache = metadata->lastUsedPages;
+
+ UnlockReleaseBuffer(metabuffer);
+
+ index->rd_amcache = cache;
+ }
+ else
+ {
+ cache = (SpGistCache *) index->rd_amcache;
+ }
+
+ return cache;
+}
+
+/*
+ * Update index metapage's lastUsedPages info from local cache, if possible
+ *
+ * Updating the metapage isn't critical for correct index operation, so we
+ * 1. use ConditionalLockBuffer to improve concurrency
+ * 2. don't WAL-log metapage changes, to decrease WAL traffic
+ */
+void
+SpGistUpdateMetaPage(Relation index)
+{
+ SpGistCache *cache = (SpGistCache *) index->rd_amcache;
+
+ if (cache != NULL)
+ {
+ Buffer metabuffer;
+ SpGistMetaPageData *metadata;
+
+ metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO);
+
+ if (ConditionalLockBuffer(metabuffer))
+ {
+ metadata = SpGistPageGetMeta(BufferGetPage(metabuffer));
+ metadata->lastUsedPages = *cache;
+
+ MarkBufferDirty(metabuffer);
+ UnlockReleaseBuffer(metabuffer);
+ }
+ else
+ {
+ ReleaseBuffer(metabuffer);
+ }
+ }
+}
+
+/* Macro to select proper element of lastUsedPages cache depending on flags */
+#define GET_LUP(c, f) (((f) & GBUF_LEAF) ? \
+ &(c)->leafPage : \
+ &(c)->innerPage[(f) & GBUF_PARITY_MASK])
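+
+/*
+ * For example, a GBUF_LEAF request always uses the single leafPage slot, while
+ * requests for inner pages are divided into three slots by block-number parity
+ * (blkno % 3), matching the parity logic in allocNewBuffer below.
+ */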
+
+/*
+ * Allocate and initialize a new buffer of the type and parity specified by
+ * flags. The returned buffer is already pinned and exclusive-locked.
+ *
+ * When requesting an inner page, if we get one with the wrong parity,
+ * we just release the buffer and try again. We will get a different page
+ * because GetFreeIndexPage will have marked the page used in FSM. The page
+ * is entered in our local lastUsedPages cache, so there's some hope of
+ * making use of it later in this session, but otherwise we rely on VACUUM
+ * to eventually re-enter the page in FSM, making it available for recycling.
+ * Note that such a page does not get marked dirty here, so unless it's used
+ * fairly soon, the buffer will just get discarded and the page will remain
+ * as it was on disk.
+ *
+ * When we return a buffer to the caller, the page is *not* entered into
+ * the lastUsedPages cache; we expect the caller will do so after it's taken
+ * whatever space it will use. This is because after the caller has used up
+ * some space, the page might have less free space than whatever was cached
+ * already, so we'd rather not trash the old cache entry.
+ */
+static Buffer
+allocNewBuffer(Relation index, int flags)
+{
+ SpGistCache *cache = spgGetCache(index);
+
+ for (;;)
+ {
+ Buffer buffer;
+
+ buffer = SpGistNewBuffer(index);
+ SpGistInitBuffer(buffer, (flags & GBUF_LEAF) ? SPGIST_LEAF : 0);
+
+ if (flags & GBUF_LEAF)
+ {
+ /* Leaf pages have no parity concerns, so just use it */
+ return buffer;
+ }
+ else
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ int blkParity = blkno % 3;
+
+ if ((flags & GBUF_PARITY_MASK) == blkParity)
+ {
+ /* Page has right parity, use it */
+ return buffer;
+ }
+ else
+ {
+ /* Page has wrong parity, record it in cache and try again */
+ cache->innerPage[blkParity].blkno = blkno;
+ cache->innerPage[blkParity].freeSpace =
+ PageGetExactFreeSpace(BufferGetPage(buffer));
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ }
+}
+
+/*
+ * Get a buffer of the type and parity specified by flags, having at least
+ * as much free space as indicated by needSpace. We use the lastUsedPages
+ * cache to assign the same buffer previously requested when possible.
+ * The returned buffer is already pinned and exclusive-locked.
+ *
+ * *isNew is set true if the page was initialized here, false if it was
+ * already valid.
+ */
+Buffer
+SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew)
+{
+ SpGistCache *cache = spgGetCache(index);
+ SpGistLastUsedPage *lup;
+
+ /* Bail out if even an empty page wouldn't meet the demand */
+ if (needSpace > SPGIST_PAGE_CAPACITY)
+ elog(ERROR, "desired SPGiST tuple size is too big");
+
+ /*
+ * If possible, increase the space request to include relation's
+ * fillfactor. This ensures that when we add unrelated tuples to a page,
+ * we try to keep 100-fillfactor% available for adding tuples that are
+ * related to the ones already on it. But fillfactor mustn't cause an
+ * error for requests that would otherwise be legal.
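+	 *
+	 * For example, assuming the default fillfactor of 80 and BLCKSZ = 8192,
+	 * about 1638 bytes are added to the request (and then clamped just below).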
+ */
+ needSpace += RelationGetTargetPageFreeSpace(index,
+ SPGIST_DEFAULT_FILLFACTOR);
+ needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY);
+
+ /* Get the cache entry for this flags setting */
+ lup = GET_LUP(cache, flags);
+
+ /* If we have nothing cached, just turn it over to allocNewBuffer */
+ if (lup->blkno == InvalidBlockNumber)
+ {
+ *isNew = true;
+ return allocNewBuffer(index, flags);
+ }
+
+ /* root page should never be in cache */
+ Assert(lup->blkno != SPGIST_HEAD_BLKNO);
+
+ /* If cached freeSpace isn't enough, don't bother looking at the page */
+ if (lup->freeSpace >= needSpace)
+ {
+ Buffer buffer;
+ Page page;
+
+ buffer = ReadBuffer(index, lup->blkno);
+
+ if (!ConditionalLockBuffer(buffer))
+ {
+ /*
+ * buffer is locked by another process, so return a new buffer
+ */
+ ReleaseBuffer(buffer);
+ *isNew = true;
+ return allocNewBuffer(index, flags);
+ }
+
+ page = BufferGetPage(buffer);
+
+ if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page))
+ {
+ /* OK to initialize the page */
+ SpGistInitBuffer(buffer, (flags & GBUF_LEAF) ? SPGIST_LEAF : 0);
+ lup->freeSpace = PageGetExactFreeSpace(page) - needSpace;
+ *isNew = true;
+ return buffer;
+ }
+
+ /*
+ * Check that page is of right type and has enough space. We must
+ * recheck this since our cache isn't necessarily up to date.
+ */
+ if ((flags & GBUF_LEAF) ? SpGistPageIsLeaf(page) :
+ !SpGistPageIsLeaf(page))
+ {
+ int freeSpace = PageGetExactFreeSpace(page);
+
+ if (freeSpace >= needSpace)
+ {
+ /* Success, update freespace info and return the buffer */
+ lup->freeSpace = freeSpace - needSpace;
+ *isNew = false;
+ return buffer;
+ }
+ }
+
+ /*
+		 * fall back to allocating a new buffer
+ */
+ UnlockReleaseBuffer(buffer);
+ }
+
+ /* No success with cache, so return a new buffer */
+ *isNew = true;
+ return allocNewBuffer(index, flags);
+}
+
+/*
+ * Update lastUsedPages cache when done modifying a page.
+ *
+ * We update the appropriate cache entry if it already contained this page
+ * (its freeSpace is likely obsolete), or if this page has more space than
+ * whatever we had cached.
+ */
+void
+SpGistSetLastUsedPage(Relation index, Buffer buffer)
+{
+ SpGistCache *cache = spgGetCache(index);
+ SpGistLastUsedPage *lup;
+ int freeSpace;
+ Page page = BufferGetPage(buffer);
+ BlockNumber blkno = BufferGetBlockNumber(buffer);
+ int flags;
+
+ /* Never enter the root page in cache, though */
+ if (blkno == SPGIST_HEAD_BLKNO)
+ return;
+
+ if (SpGistPageIsLeaf(page))
+ flags = GBUF_LEAF;
+ else
+ flags = GBUF_INNER_PARITY(blkno);
+
+ lup = GET_LUP(cache, flags);
+
+ freeSpace = PageGetExactFreeSpace(page);
+ if (lup->blkno == InvalidBlockNumber || lup->blkno == blkno ||
+ lup->freeSpace < freeSpace)
+ {
+ lup->blkno = blkno;
+ lup->freeSpace = freeSpace;
+ }
+}
+
+/*
+ * Initialize an SPGiST page to empty, with specified flags
+ */
+void
+SpGistInitPage(Page page, uint16 f)
+{
+ SpGistPageOpaque opaque;
+
+ PageInit(page, BLCKSZ, MAXALIGN(sizeof(SpGistPageOpaqueData)));
+ opaque = SpGistPageGetOpaque(page);
+ memset(opaque, 0, sizeof(SpGistPageOpaqueData));
+ opaque->flags = f;
+ opaque->spgist_page_id = SPGIST_PAGE_ID;
+}
+
+/*
+ * Initialize a buffer's page to empty, with specified flags
+ */
+void
+SpGistInitBuffer(Buffer b, uint16 f)
+{
+ Assert(BufferGetPageSize(b) == BLCKSZ);
+ SpGistInitPage(BufferGetPage(b), f);
+}
+
+/*
+ * Initialize metadata page
+ */
+void
+SpGistInitMetapage(Page page)
+{
+ SpGistMetaPageData *metadata;
+
+ SpGistInitPage(page, SPGIST_META);
+ metadata = SpGistPageGetMeta(page);
+ memset(metadata, 0, sizeof(SpGistMetaPageData));
+ metadata->magicNumber = SPGIST_MAGIC_NUMBER;
+
+ /* initialize last-used-page cache to empty */
+ metadata->lastUsedPages.innerPage[0].blkno = InvalidBlockNumber;
+ metadata->lastUsedPages.innerPage[1].blkno = InvalidBlockNumber;
+ metadata->lastUsedPages.innerPage[2].blkno = InvalidBlockNumber;
+ metadata->lastUsedPages.leafPage.blkno = InvalidBlockNumber;
+}
+
+/*
+ * reloptions processing for SPGiST
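+ *
+ * (For example, this is the routine consulted for an option such as
+ * "fillfactor = 90" given in CREATE INDEX ... USING spgist ... WITH (...).)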
+ */
+Datum
+spgoptions(PG_FUNCTION_ARGS)
+{
+ Datum reloptions = PG_GETARG_DATUM(0);
+ bool validate = PG_GETARG_BOOL(1);
+ bytea *result;
+
+ result = default_reloptions(reloptions, validate, RELOPT_KIND_SPGIST);
+
+ if (result)
+ PG_RETURN_BYTEA_P(result);
+ PG_RETURN_NULL();
+}
+
+/*
+ * Get the space needed to store a datum of the indicated type.
+ * Note the result is already rounded up to a MAXALIGN boundary.
+ * Also, we follow the SPGiST convention that pass-by-val types are
+ * just stored in their Datum representation (compare memcpyDatum).
+ */
+unsigned int
+SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum)
+{
+ unsigned int size;
+
+ if (att->attbyval)
+ size = sizeof(Datum);
+ else if (att->attlen > 0)
+ size = att->attlen;
+ else
+ size = VARSIZE_ANY(datum);
+
+ return MAXALIGN(size);
+}
+
+/*
+ * Copy the given datum to *target
+ */
+static void
+memcpyDatum(void *target, SpGistTypeDesc *att, Datum datum)
+{
+ unsigned int size;
+
+ if (att->attbyval)
+ {
+ memcpy(target, &datum, sizeof(Datum));
+ }
+ else
+ {
+ size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum);
+ memcpy(target, DatumGetPointer(datum), size);
+ }
+}
+
+/*
+ * Construct a leaf tuple containing the given heap TID and datum value
+ */
+SpGistLeafTuple
+spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, Datum datum)
+{
+ SpGistLeafTuple tup;
+ unsigned int size;
+
+ /* compute space needed (note result is already maxaligned) */
+ size = SGLTHDRSZ + SpGistGetTypeSize(&state->attType, datum);
+
+ /*
+ * Ensure that we can replace the tuple with a dead tuple later. This
+ * test is unnecessary given current tuple layouts, but let's be safe.
+ */
+ if (size < SGDTSIZE)
+ size = SGDTSIZE;
+
+ /* OK, form the tuple */
+ tup = (SpGistLeafTuple) palloc0(size);
+
+ tup->size = size;
+ tup->nextOffset = InvalidOffsetNumber;
+ tup->heapPtr = *heapPtr;
+ memcpyDatum(SGLTDATAPTR(tup), &state->attType, datum);
+
+ return tup;
+}
+
+/*
+ * Construct a node (to go into an inner tuple) containing the given label
+ *
+ * Note that the node's downlink is just set invalid here. Caller will fill
+ * it in later.
+ */
+SpGistNodeTuple
+spgFormNodeTuple(SpGistState *state, Datum label, bool isnull)
+{
+ SpGistNodeTuple tup;
+ unsigned int size;
+ unsigned short infomask = 0;
+
+ /* compute space needed (note result is already maxaligned) */
+ size = SGNTHDRSZ;
+ if (!isnull)
+ size += SpGistGetTypeSize(&state->attLabelType, label);
+
+ /*
+ * Here we make sure that the size will fit in the field reserved for it
+ * in t_info.
+ */
+ if ((size & INDEX_SIZE_MASK) != size)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row requires %lu bytes, maximum size is %lu",
+ (unsigned long) size,
+ (unsigned long) INDEX_SIZE_MASK)));
+
+ tup = (SpGistNodeTuple) palloc0(size);
+
+ if (isnull)
+ infomask |= INDEX_NULL_MASK;
+ /* we don't bother setting the INDEX_VAR_MASK bit */
+ infomask |= size;
+ tup->t_info = infomask;
+
+ /* The TID field will be filled in later */
+ ItemPointerSetInvalid(&tup->t_tid);
+
+ if (!isnull)
+ memcpyDatum(SGNTDATAPTR(tup), &state->attLabelType, label);
+
+ return tup;
+}
+
+/*
+ * Construct an inner tuple containing the given prefix and node array
+ */
+SpGistInnerTuple
+spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix,
+ int nNodes, SpGistNodeTuple *nodes)
+{
+ SpGistInnerTuple tup;
+ unsigned int size;
+ unsigned int prefixSize;
+ int i;
+ char *ptr;
+
+ /* Compute size needed */
+ if (hasPrefix)
+ prefixSize = SpGistGetTypeSize(&state->attPrefixType, prefix);
+ else
+ prefixSize = 0;
+
+ size = SGITHDRSZ + prefixSize;
+
+ /* Note: we rely on node tuple sizes to be maxaligned already */
+ for (i = 0; i < nNodes; i++)
+ size += IndexTupleSize(nodes[i]);
+
+ /*
+ * Ensure that we can replace the tuple with a dead tuple later. This
+ * test is unnecessary given current tuple layouts, but let's be safe.
+ */
+ if (size < SGDTSIZE)
+ size = SGDTSIZE;
+
+ /*
+ * Inner tuple should be small enough to fit on a page
+ */
+ if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData))
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("SPGiST inner tuple size %lu exceeds maximum %lu",
+ (unsigned long) size,
+ (unsigned long) (SPGIST_PAGE_CAPACITY - sizeof(ItemIdData))),
+ errhint("Values larger than a buffer page cannot be indexed.")));
+
+ /*
+ * Check for overflow of header fields --- probably can't fail if the
+ * above succeeded, but let's be paranoid
+ */
+ if (size > SGITMAXSIZE ||
+ prefixSize > SGITMAXPREFIXSIZE ||
+ nNodes > SGITMAXNNODES)
+ elog(ERROR, "SPGiST inner tuple header field is too small");
+
+ /* OK, form the tuple */
+ tup = (SpGistInnerTuple) palloc0(size);
+
+ tup->nNodes = nNodes;
+ tup->prefixSize = prefixSize;
+ tup->size = size;
+
+ if (hasPrefix)
+ memcpyDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix);
+
+ ptr = (char *) SGITNODEPTR(tup);
+
+ for (i = 0; i < nNodes; i++)
+ {
+ SpGistNodeTuple node = nodes[i];
+
+ memcpy(ptr, node, IndexTupleSize(node));
+ ptr += IndexTupleSize(node);
+ }
+
+ return tup;
+}
+
+/*
+ * Construct a "dead" tuple to replace a tuple being deleted.
+ *
+ * The state can be SPGIST_REDIRECT, SPGIST_DEAD, or SPGIST_PLACEHOLDER.
+ * For a REDIRECT tuple, a pointer (blkno+offset) must be supplied, and
+ * the xid field is filled in automatically.
+ *
+ * This is called in critical sections, so we don't use palloc; the tuple
+ * is built in preallocated storage. It should be copied before another
+ * call with different parameters can occur.
+ */
+SpGistDeadTuple
+spgFormDeadTuple(SpGistState *state, int tupstate,
+ BlockNumber blkno, OffsetNumber offnum)
+{
+ SpGistDeadTuple tuple = (SpGistDeadTuple) state->deadTupleStorage;
+
+ tuple->tupstate = tupstate;
+ tuple->size = SGDTSIZE;
+ tuple->nextOffset = InvalidOffsetNumber;
+
+ if (tupstate == SPGIST_REDIRECT)
+ {
+ ItemPointerSet(&tuple->pointer, blkno, offnum);
+ tuple->xid = state->myXid;
+ }
+ else
+ {
+ ItemPointerSetInvalid(&tuple->pointer);
+ tuple->xid = InvalidTransactionId;
+ }
+
+ return tuple;
+}
+
+/*
+ * Extract the label datums of the nodes within innerTuple
+ *
+ * Returns NULL if label datums are NULLs
+ */
+Datum *
+spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple)
+{
+ Datum *nodeLabels;
+ int nullcount = 0;
+ int i;
+ SpGistNodeTuple node;
+
+ nodeLabels = (Datum *) palloc(sizeof(Datum) * innerTuple->nNodes);
+ SGITITERATE(innerTuple, i, node)
+ {
+ if (IndexTupleHasNulls(node))
+ nullcount++;
+ else
+ nodeLabels[i] = SGNTDATUM(node, state);
+ }
+ if (nullcount == innerTuple->nNodes)
+ {
+ /* They're all null, so just return NULL */
+ pfree(nodeLabels);
+ return NULL;
+ }
+ if (nullcount != 0)
+ elog(ERROR, "some but not all node labels are null in SPGiST inner tuple");
+ return nodeLabels;
+}
+
+/*
+ * Add a new item to the page, replacing a PLACEHOLDER item if possible.
+ * Return the location it's inserted at, or InvalidOffsetNumber on failure.
+ *
+ * If startOffset isn't NULL, we start searching for placeholders at
+ * *startOffset, and update that to the next place to search. This is just
+ * an optimization for repeated insertions.
+ *
+ * If errorOK is false, we throw error when there's not enough room,
+ * rather than returning InvalidOffsetNumber.
+ */
+OffsetNumber
+SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size,
+ OffsetNumber *startOffset, bool errorOK)
+{
+ SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
+ OffsetNumber i,
+ maxoff,
+ offnum;
+
+ if (opaque->nPlaceholder > 0 &&
+ PageGetExactFreeSpace(page) + SGDTSIZE >= MAXALIGN(size))
+ {
+ /* Try to replace a placeholder */
+ maxoff = PageGetMaxOffsetNumber(page);
+ offnum = InvalidOffsetNumber;
+
+ for (;;)
+ {
+ if (startOffset && *startOffset != InvalidOffsetNumber)
+ i = *startOffset;
+ else
+ i = FirstOffsetNumber;
+ for (; i <= maxoff; i++)
+ {
+ SpGistDeadTuple it = (SpGistDeadTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+
+ if (it->tupstate == SPGIST_PLACEHOLDER)
+ {
+ offnum = i;
+ break;
+ }
+ }
+
+ /* Done if we found a placeholder */
+ if (offnum != InvalidOffsetNumber)
+ break;
+
+ if (startOffset && *startOffset != InvalidOffsetNumber)
+ {
+ /* Hint was no good, re-search from beginning */
+ *startOffset = InvalidOffsetNumber;
+ continue;
+ }
+
+ /* Hmm, no placeholder found? */
+ opaque->nPlaceholder = 0;
+ break;
+ }
+
+ if (offnum != InvalidOffsetNumber)
+ {
+ /* Replace the placeholder tuple */
+ PageIndexTupleDelete(page, offnum);
+
+ offnum = PageAddItem(page, item, size, offnum, false, false);
+
+ /*
+ * We should not have failed given the size check at the top of
+ * the function, but test anyway. If we did fail, we must PANIC
+ * because we've already deleted the placeholder tuple, and
+ * there's no other way to keep the damage from getting to disk.
+ */
+ if (offnum != InvalidOffsetNumber)
+ {
+ Assert(opaque->nPlaceholder > 0);
+ opaque->nPlaceholder--;
+ if (startOffset)
+ *startOffset = offnum + 1;
+ }
+ else
+ elog(PANIC, "failed to add item of size %u to SPGiST index page",
+ size);
+
+ return offnum;
+ }
+ }
+
+ /* No luck in replacing a placeholder, so just add it to the page */
+ offnum = PageAddItem(page, item, size,
+ InvalidOffsetNumber, false, false);
+
+ if (offnum == InvalidOffsetNumber && !errorOK)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ size);
+
+ return offnum;
+}
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
new file mode 100644
index 00000000000..90d59920eb6
--- /dev/null
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -0,0 +1,755 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgvacuum.c
+ * vacuum for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgvacuum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/spgist_private.h"
+#include "access/transam.h"
+#include "catalog/storage.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/indexfsm.h"
+#include "storage/lmgr.h"
+#include "storage/procarray.h"
+
+
+/* local state for vacuum operations */
+typedef struct spgBulkDeleteState
+{
+ /* Parameters passed in to spgvacuumscan */
+ IndexVacuumInfo *info;
+ IndexBulkDeleteResult *stats;
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+ /* Additional working state */
+ SpGistState spgstate;
+ TransactionId OldestXmin;
+ BlockNumber lastFilledBlock;
+} spgBulkDeleteState;
+
+
+/*
+ * Vacuum a regular (non-root) leaf page
+ *
+ * We must delete tuples that are targeted for deletion by the VACUUM,
+ * but not move any tuples that are referenced by outside links; we assume
+ * those are the ones that are heads of chains.
+ */
+static void
+vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
+{
+ Page page = BufferGetPage(buffer);
+ spgxlogVacuumLeaf xlrec;
+ XLogRecData rdata[8];
+ OffsetNumber toDead[MaxIndexTuplesPerPage];
+ OffsetNumber toPlaceholder[MaxIndexTuplesPerPage];
+ OffsetNumber moveSrc[MaxIndexTuplesPerPage];
+ OffsetNumber moveDest[MaxIndexTuplesPerPage];
+ OffsetNumber chainSrc[MaxIndexTuplesPerPage];
+ OffsetNumber chainDest[MaxIndexTuplesPerPage];
+ OffsetNumber predecessor[MaxIndexTuplesPerPage + 1];
+ bool deletable[MaxIndexTuplesPerPage + 1];
+ int nDeletable;
+ OffsetNumber i,
+ max = PageGetMaxOffsetNumber(page);
+
+ memset(predecessor, 0, sizeof(predecessor));
+ memset(deletable, 0, sizeof(deletable));
+ nDeletable = 0;
+
+ /* Scan page, identify tuples to delete, accumulate stats */
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+ if (lt->tupstate == SPGIST_LIVE)
+ {
+ Assert(ItemPointerIsValid(&lt->heapPtr));
+
+ if (bds->callback(&lt->heapPtr, bds->callback_state))
+ {
+ bds->stats->tuples_removed += 1;
+ deletable[i] = true;
+ nDeletable++;
+ }
+ else
+ {
+ bds->stats->num_index_tuples += 1;
+ }
+
+ /* Form predecessor map, too */
+ if (lt->nextOffset != InvalidOffsetNumber)
+ {
+ /* paranoia about corrupted chain links */
+ if (lt->nextOffset < FirstOffsetNumber ||
+ lt->nextOffset > max ||
+ predecessor[lt->nextOffset] != InvalidOffsetNumber)
+ elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
+ BufferGetBlockNumber(buffer),
+ RelationGetRelationName(index));
+ predecessor[lt->nextOffset] = i;
+ }
+ }
+ else
+ {
+ Assert(lt->nextOffset == InvalidOffsetNumber);
+ }
+ }
+
+ if (nDeletable == 0)
+ return; /* nothing more to do */
+
+ /*----------
+ * Figure out exactly what we have to do. We do this separately from
+ * actually modifying the page, mainly so that we have a representation
+ * that can be dumped into WAL and then the replay code can do exactly
+ * the same thing. The output of this step consists of six arrays
+ * describing four kinds of operations, to be performed in this order:
+ *
+ * toDead[]: tuple numbers to be replaced with DEAD tuples
+ * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
+ * moveSrc[]: tuple numbers that need to be relocated to another offset
+ * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
+ * moveDest[]: new locations for moveSrc tuples
+ * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
+ * chainDest[]: new values of nextOffset for chainSrc members
+ *
+ * It's easiest to figure out what we have to do by processing tuple
+ * chains, so we iterate over all the tuples (not just the deletable
+ * ones!) to identify chain heads, then chase down each chain and make
+ * work item entries for deletable tuples within the chain.
+ *----------
+ */
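+
+	/*
+	 * As a worked example (hypothetical offsets): for a chain 1 -> 4 -> 7 -> 9
+	 * where tuples 1 and 7 are deletable, tuple 4 is moved to the head slot
+	 * (moveSrc = 4, moveDest = 1), offsets 4 and 7 become placeholders, and a
+	 * chain-link update (chainSrc = 1, chainDest = 9) reconnects the new head
+	 * to tuple 9.
+	 */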
+ xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0;
+
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple head;
+ bool interveningDeletable;
+ OffsetNumber prevLive;
+ OffsetNumber j;
+
+ head = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+ if (head->tupstate != SPGIST_LIVE)
+ continue; /* can't be a chain member */
+ if (predecessor[i] != 0)
+ continue; /* not a chain head */
+
+ /* initialize ... */
+ interveningDeletable = false;
+ prevLive = deletable[i] ? InvalidOffsetNumber : i;
+
+ /* scan down the chain ... */
+ j = head->nextOffset;
+ while (j != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, j));
+ if (lt->tupstate != SPGIST_LIVE)
+ {
+ /* all tuples in chain should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ lt->tupstate);
+ }
+
+ if (deletable[j])
+ {
+ /* This tuple should be replaced by a placeholder */
+ toPlaceholder[xlrec.nPlaceholder] = j;
+ xlrec.nPlaceholder++;
+ /* previous live tuple's chain link will need an update */
+ interveningDeletable = true;
+ }
+ else if (prevLive == InvalidOffsetNumber)
+ {
+ /*
+ * This is the first live tuple in the chain. It has
+ * to move to the head position.
+ */
+ moveSrc[xlrec.nMove] = j;
+ moveDest[xlrec.nMove] = i;
+ xlrec.nMove++;
+ /* Chain updates will be applied after the move */
+ prevLive = i;
+ interveningDeletable = false;
+ }
+ else
+ {
+ /*
+ * Second or later live tuple. Arrange to re-chain it to the
+ * previous live one, if there was a gap.
+ */
+ if (interveningDeletable)
+ {
+ chainSrc[xlrec.nChain] = prevLive;
+ chainDest[xlrec.nChain] = j;
+ xlrec.nChain++;
+ }
+ prevLive = j;
+ interveningDeletable = false;
+ }
+
+ j = lt->nextOffset;
+ }
+
+ if (prevLive == InvalidOffsetNumber)
+ {
+ /* The chain is entirely removable, so we need a DEAD tuple */
+ toDead[xlrec.nDead] = i;
+ xlrec.nDead++;
+ }
+ else if (interveningDeletable)
+ {
+ /* One or more deletions at end of chain, so close it off */
+ chainSrc[xlrec.nChain] = prevLive;
+ chainDest[xlrec.nChain] = InvalidOffsetNumber;
+ xlrec.nChain++;
+ }
+ }
+
+ /* sanity check ... */
+ if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
+ elog(ERROR, "inconsistent counts of deletable tuples");
+
+ /* Prepare WAL record */
+ xlrec.node = index->rd_node;
+ xlrec.blkno = BufferGetBlockNumber(buffer);
+ STORE_STATE(&bds->spgstate, xlrec.stateSrc);
+
+ ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0);
+ /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
+ ACCEPT_RDATA_DATA(toDead, sizeof(OffsetNumber) * xlrec.nDead, 1);
+ ACCEPT_RDATA_DATA(toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder, 2);
+ ACCEPT_RDATA_DATA(moveSrc, sizeof(OffsetNumber) * xlrec.nMove, 3);
+ ACCEPT_RDATA_DATA(moveDest, sizeof(OffsetNumber) * xlrec.nMove, 4);
+ ACCEPT_RDATA_DATA(chainSrc, sizeof(OffsetNumber) * xlrec.nChain, 5);
+ ACCEPT_RDATA_DATA(chainDest, sizeof(OffsetNumber) * xlrec.nChain, 6);
+ ACCEPT_RDATA_BUFFER(buffer, 7);
+
+ /* Do the updates */
+ START_CRIT_SECTION();
+
+ spgPageIndexMultiDelete(&bds->spgstate, page,
+ toDead, xlrec.nDead,
+ SPGIST_DEAD, SPGIST_DEAD,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ spgPageIndexMultiDelete(&bds->spgstate, page,
+ toPlaceholder, xlrec.nPlaceholder,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ /*
+ * We implement the move step by swapping the item pointers of the
+ * source and target tuples, then replacing the newly-source tuples
+ * with placeholders. This is perhaps unduly friendly with the page
+ * data representation, but it's fast and doesn't risk page overflow
+ * when a tuple to be relocated is large.
+ */
+ for (i = 0; i < xlrec.nMove; i++)
+ {
+ ItemId idSrc = PageGetItemId(page, moveSrc[i]);
+ ItemId idDest = PageGetItemId(page, moveDest[i]);
+ ItemIdData tmp;
+
+ tmp = *idSrc;
+ *idSrc = *idDest;
+ *idDest = tmp;
+ }
+
+ spgPageIndexMultiDelete(&bds->spgstate, page,
+ moveSrc, xlrec.nMove,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber, InvalidOffsetNumber);
+
+ for (i = 0; i < xlrec.nChain; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, chainSrc[i]));
+ Assert(lt->tupstate == SPGIST_LIVE);
+ lt->nextOffset = chainDest[i];
+ }
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF, rdata);
+
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Vacuum the root page when it is a leaf
+ *
+ * On the root, we just delete any dead leaf tuples; no fancy business
+ */
+static void
+vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)
+{
+ Page page = BufferGetPage(buffer);
+ spgxlogVacuumRoot xlrec;
+ XLogRecData rdata[3];
+ OffsetNumber toDelete[MaxIndexTuplesPerPage];
+ OffsetNumber i,
+ max = PageGetMaxOffsetNumber(page);
+
+ xlrec.nDelete = 0;
+
+ /* Scan page, identify tuples to delete, accumulate stats */
+ for (i = FirstOffsetNumber; i <= max; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, i));
+ if (lt->tupstate == SPGIST_LIVE)
+ {
+ Assert(ItemPointerIsValid(&lt->heapPtr));
+
+ if (bds->callback(&lt->heapPtr, bds->callback_state))
+ {
+ bds->stats->tuples_removed += 1;
+ toDelete[xlrec.nDelete] = i;
+ xlrec.nDelete++;
+ }
+ else
+ {
+ bds->stats->num_index_tuples += 1;
+ }
+ }
+ else
+ {
+ /* all tuples on root should be live */
+ elog(ERROR, "unexpected SPGiST tuple state: %d",
+ lt->tupstate);
+ }
+ }
+
+ if (xlrec.nDelete == 0)
+ return; /* nothing more to do */
+
+ /* Prepare WAL record */
+ xlrec.node = index->rd_node;
+ STORE_STATE(&bds->spgstate, xlrec.stateSrc);
+
+ ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0);
+ /* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
+ ACCEPT_RDATA_DATA(toDelete, sizeof(OffsetNumber) * xlrec.nDelete, 1);
+ ACCEPT_RDATA_BUFFER(buffer, 2);
+
+ /* Do the update */
+ START_CRIT_SECTION();
+
+ /* The tuple numbers are in order, so we can use PageIndexMultiDelete */
+ PageIndexMultiDelete(page, toDelete, xlrec.nDelete);
+
+ MarkBufferDirty(buffer);
+
+ if (RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT, rdata);
+
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Clean up redirect and placeholder tuples on the given page
+ *
+ * Redirect tuples can be marked placeholder once they're old enough.
+ * Placeholder tuples can be removed if it won't change the offsets of
+ * non-placeholder ones.
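+ * (For example, if offsets 3, 5, and 6 of a six-tuple page are placeholders,
+ * only 5 and 6 can be removed; deleting 3 would shift the offset of tuple 4.)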
+ *
+ * Unlike the routines above, this works on both leaf and inner pages.
+ */
+static void
+vacuumRedirectAndPlaceholder(Relation index, Buffer buffer,
+ TransactionId OldestXmin)
+{
+ Page page = BufferGetPage(buffer);
+ SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
+ OffsetNumber i,
+ max = PageGetMaxOffsetNumber(page),
+ firstPlaceholder = InvalidOffsetNumber;
+ bool hasNonPlaceholder = false;
+ bool hasUpdate = false;
+ OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage];
+ OffsetNumber itemnos[MaxIndexTuplesPerPage];
+ spgxlogVacuumRedirect xlrec;
+ XLogRecData rdata[3];
+
+ xlrec.node = index->rd_node;
+ xlrec.blkno = BufferGetBlockNumber(buffer);
+ xlrec.nToPlaceholder = 0;
+
+ START_CRIT_SECTION();
+
+ /*
+ * Scan backwards to convert old redirection tuples to placeholder tuples,
+ * and identify location of last non-placeholder tuple while at it.
+ */
+ for (i = max;
+ i >= FirstOffsetNumber &&
+ (opaque->nRedirection > 0 || !hasNonPlaceholder);
+ i--)
+ {
+ SpGistDeadTuple dt;
+
+ dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i));
+
+ if (dt->tupstate == SPGIST_REDIRECT &&
+ TransactionIdPrecedes(dt->xid, OldestXmin))
+ {
+ dt->tupstate = SPGIST_PLACEHOLDER;
+ Assert(opaque->nRedirection > 0);
+ opaque->nRedirection--;
+ opaque->nPlaceholder++;
+
+ ItemPointerSetInvalid(&dt->pointer);
+
+ itemToPlaceholder[xlrec.nToPlaceholder] = i;
+ xlrec.nToPlaceholder++;
+
+ hasUpdate = true;
+ }
+
+ if (dt->tupstate == SPGIST_PLACEHOLDER)
+ {
+ if (!hasNonPlaceholder)
+ firstPlaceholder = i;
+ }
+ else
+ {
+ hasNonPlaceholder = true;
+ }
+ }
+
+ /*
+ * Any placeholder tuples at the end of page can safely be removed. We
+ * can't remove ones before the last non-placeholder, though, because we
+ * can't alter the offset numbers of non-placeholder tuples.
+ */
+ if (firstPlaceholder != InvalidOffsetNumber)
+ {
+ /*
+		 * We do not store this array in the rdata chain, because it's easy to
+		 * recreate at replay time.
+ */
+ for (i = firstPlaceholder; i <= max; i++)
+ itemnos[i - firstPlaceholder] = i;
+
+ i = max - firstPlaceholder + 1;
+ Assert(opaque->nPlaceholder >= i);
+ opaque->nPlaceholder -= i;
+
+ /* The array is surely sorted, so can use PageIndexMultiDelete */
+ PageIndexMultiDelete(page, itemnos, i);
+
+ hasUpdate = true;
+ }
+
+ xlrec.firstPlaceholder = firstPlaceholder;
+
+ if (hasUpdate)
+ MarkBufferDirty(buffer);
+
+ if (hasUpdate && RelationNeedsWAL(index))
+ {
+ XLogRecPtr recptr;
+
+ ACCEPT_RDATA_DATA(&xlrec, sizeof(xlrec), 0);
+ ACCEPT_RDATA_DATA(itemToPlaceholder, sizeof(OffsetNumber) * xlrec.nToPlaceholder, 1);
+ ACCEPT_RDATA_BUFFER(buffer, 2);
+
+ recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT, rdata);
+
+ PageSetLSN(page, recptr);
+ PageSetTLI(page, ThisTimeLineID);
+ }
+
+ END_CRIT_SECTION();
+}
+
+/*
+ * Process one page during a bulkdelete scan
+ */
+static void
+spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
+{
+ Relation index = bds->info->index;
+ Buffer buffer;
+ Page page;
+
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
+
+ buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, bds->info->strategy);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+
+ if (PageIsNew(page))
+ {
+ /*
+ * We found an all-zero page, which could happen if the database
+ * crashed just after extending the file. Initialize and recycle it.
+ */
+ SpGistInitBuffer(buffer, 0);
+ SpGistPageSetDeleted(page);
+ /* We don't bother to WAL-log this action; easy to redo */
+ MarkBufferDirty(buffer);
+ }
+ else if (SpGistPageIsDeleted(page))
+ {
+ /* nothing to do */
+ }
+ else if (SpGistPageIsLeaf(page))
+ {
+ if (blkno == SPGIST_HEAD_BLKNO)
+ {
+ vacuumLeafRoot(bds, index, buffer);
+ /* no need for vacuumRedirectAndPlaceholder */
+ }
+ else
+ {
+ vacuumLeafPage(bds, index, buffer);
+ vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
+ }
+ }
+ else
+ {
+ /* inner page */
+ vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
+ }
+
+ /*
+ * The root page must never be deleted, nor marked as available in FSM,
+ * because we don't want it ever returned by a search for a place to
+ * put a new tuple. Otherwise, check for empty/deletable page, and
+ * make sure FSM knows about it.
+ */
+ if (blkno != SPGIST_HEAD_BLKNO)
+ {
+ /* If page is now empty, mark it deleted */
+ if (PageIsEmpty(page) && !SpGistPageIsDeleted(page))
+ {
+ SpGistPageSetDeleted(page);
+ /* We don't bother to WAL-log this action; easy to redo */
+ MarkBufferDirty(buffer);
+ }
+
+ if (SpGistPageIsDeleted(page))
+ {
+ RecordFreeIndexPage(index, blkno);
+ bds->stats->pages_deleted++;
+ }
+ else
+ bds->lastFilledBlock = blkno;
+ }
+
+ SpGistSetLastUsedPage(index, buffer);
+
+ UnlockReleaseBuffer(buffer);
+}
+
+/*
+ * Perform a bulkdelete scan
+ */
+static void
+spgvacuumscan(spgBulkDeleteState *bds)
+{
+ Relation index = bds->info->index;
+ bool needLock;
+ BlockNumber num_pages,
+ blkno;
+
+ /* Finish setting up spgBulkDeleteState */
+ initSpGistState(&bds->spgstate, index);
+ bds->OldestXmin = GetOldestXmin(true, false);
+ bds->lastFilledBlock = SPGIST_HEAD_BLKNO;
+
+ /*
+ * Reset counts that will be incremented during the scan; needed in case
+ * of multiple scans during a single VACUUM command
+ */
+ bds->stats->estimated_count = false;
+ bds->stats->num_index_tuples = 0;
+ bds->stats->pages_deleted = 0;
+
+ /* We can skip locking for new or temp relations */
+ needLock = !RELATION_IS_LOCAL(index);
+
+ /*
+ * The outer loop iterates over all index pages except the metapage, in
+ * physical order (we hope the kernel will cooperate in providing
+ * read-ahead for speed). It is critical that we visit all leaf pages,
+ * including ones added after we start the scan, else we might fail to
+ * delete some deletable tuples. See more extensive comments about
+ * this in btvacuumscan().
+ */
+ blkno = SPGIST_HEAD_BLKNO;
+ for (;;)
+ {
+ /* Get the current relation length */
+ if (needLock)
+ LockRelationForExtension(index, ExclusiveLock);
+ num_pages = RelationGetNumberOfBlocks(index);
+ if (needLock)
+ UnlockRelationForExtension(index, ExclusiveLock);
+
+ /* Quit if we've scanned the whole relation */
+ if (blkno >= num_pages)
+ break;
+ /* Iterate over pages, then loop back to recheck length */
+ for (; blkno < num_pages; blkno++)
+ {
+ spgvacuumpage(bds, blkno);
+ }
+ }
+
+ /* Propagate local lastUsedPage cache to metablock */
+ SpGistUpdateMetaPage(index);
+
+ /*
+ * Truncate index if possible
+ *
+ * XXX disabled because it's unsafe due to possible concurrent inserts.
+ * We'd have to rescan the pages to make sure they're still empty, and it
+ * doesn't seem worth it. Note that btree doesn't do this either.
+ */
+#ifdef NOT_USED
+ if (num_pages > bds->lastFilledBlock + 1)
+ {
+ BlockNumber lastBlock = num_pages - 1;
+
+ num_pages = bds->lastFilledBlock + 1;
+ RelationTruncate(index, num_pages);
+ bds->stats->pages_removed += lastBlock - bds->lastFilledBlock;
+ bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock;
+ }
+#endif
+
+ /* Report final stats */
+ bds->stats->num_pages = num_pages;
+ bds->stats->pages_free = bds->stats->pages_deleted;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+Datum
+spgbulkdelete(PG_FUNCTION_ARGS)
+{
+ IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+ IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+ IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
+ void *callback_state = (void *) PG_GETARG_POINTER(3);
+ spgBulkDeleteState bds;
+
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ bds.info = info;
+ bds.stats = stats;
+ bds.callback = callback;
+ bds.callback_state = callback_state;
+
+ spgvacuumscan(&bds);
+
+ PG_RETURN_POINTER(stats);
+}
+
+/* Dummy callback to delete no tuples during spgvacuumcleanup */
+static bool
+dummy_callback(ItemPointer itemptr, void *state)
+{
+ return false;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+Datum
+spgvacuumcleanup(PG_FUNCTION_ARGS)
+{
+ IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+ IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+ Relation index = info->index;
+ spgBulkDeleteState bds;
+
+ /* No-op in ANALYZE ONLY mode */
+ if (info->analyze_only)
+ PG_RETURN_POINTER(stats);
+
+ /*
+ * We don't need to scan the index if there was a preceding bulkdelete
+ * pass. Otherwise, make a pass that won't delete any live tuples, but
+ * might still accomplish useful stuff with redirect/placeholder cleanup,
+ * and in any case will provide stats.
+ */
+ if (stats == NULL)
+ {
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ bds.info = info;
+ bds.stats = stats;
+ bds.callback = dummy_callback;
+ bds.callback_state = NULL;
+
+ spgvacuumscan(&bds);
+ }
+
+ /* Finally, vacuum the FSM */
+ IndexFreeSpaceMapVacuum(index);
+
+ /*
+ * It's quite possible for us to be fooled by concurrent page splits into
+ * double-counting some index tuples, so disbelieve any total that exceeds
+ * the underlying heap's count ... if we know that accurately. Otherwise
+ * this might just make matters worse.
+ */
+ if (!info->estimated_count)
+ {
+ if (stats->num_index_tuples > info->num_heap_tuples)
+ stats->num_index_tuples = info->num_heap_tuples;
+ }
+
+ PG_RETURN_POINTER(stats);
+}
diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c
new file mode 100644
index 00000000000..e508f09703d
--- /dev/null
+++ b/src/backend/access/spgist/spgxlog.c
@@ -0,0 +1,1070 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgxlog.c
+ * WAL replay logic for SP-GiST
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/spgist/spgxlog.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/spgist_private.h"
+#include "access/xlogutils.h"
+#include "storage/bufmgr.h"
+#include "utils/memutils.h"
+
+
+static MemoryContext opCtx; /* working memory for operations */
+
+
+/*
+ * Prepare a dummy SpGistState, with just the minimum info needed for replay.
+ *
+ * At present, all we need is enough info to support spgFormDeadTuple(),
+ * plus the isBuild flag.
+ */
+static void
+fillFakeState(SpGistState *state, spgxlogState stateSrc)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->myXid = stateSrc.myXid;
+ state->isBuild = stateSrc.isBuild;
+ state->deadTupleStorage = palloc0(SGDTSIZE);
+}
+
+/*
+ * Add a leaf tuple, or replace an existing placeholder tuple. This is used
+ * to replay SpGistPageAddNewItem() operations. If the offset points at an
+ * existing tuple, it had better be a placeholder tuple.
+ */
+static void
+addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset)
+{
+ if (offset <= PageGetMaxOffsetNumber(page))
+ {
+ SpGistDeadTuple dt = (SpGistDeadTuple) PageGetItem(page,
+ PageGetItemId(page, offset));
+
+ if (dt->tupstate != SPGIST_PLACEHOLDER)
+ elog(ERROR, "SPGiST tuple to be replaced is not a placeholder");
+
+ Assert(SpGistPageGetOpaque(page)->nPlaceholder > 0);
+ SpGistPageGetOpaque(page)->nPlaceholder--;
+
+ PageIndexTupleDelete(page, offset);
+ }
+
+ Assert(offset <= PageGetMaxOffsetNumber(page) + 1);
+
+ if (PageAddItem(page, tuple, size, offset, false, false) != offset)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ size);
+}
+
+static void
+spgRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record)
+{
+ RelFileNode *node = (RelFileNode *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ buffer = XLogReadBuffer(*node, SPGIST_METAPAGE_BLKNO, true);
+ Assert(BufferIsValid(buffer));
+ page = (Page) BufferGetPage(buffer);
+ SpGistInitMetapage(page);
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+
+ buffer = XLogReadBuffer(*node, SPGIST_HEAD_BLKNO, true);
+ Assert(BufferIsValid(buffer));
+ SpGistInitBuffer(buffer, SPGIST_LEAF);
+ page = (Page) BufferGetPage(buffer);
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
+spgRedoAddLeaf(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr;
+ SpGistLeafTuple leafTuple;
+ Buffer buffer;
+ Page page;
+
+ /* we assume this is adequately aligned */
+ ptr += sizeof(spgxlogAddLeaf);
+ leafTuple = (SpGistLeafTuple) ptr;
+
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoLeaf,
+ xldata->newPage);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+
+ if (xldata->newPage)
+ SpGistInitBuffer(buffer, SPGIST_LEAF);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ /* insert new tuple */
+ if (xldata->offnumLeaf != xldata->offnumHeadLeaf)
+ {
+ /* normal cases, tuple was added by SpGistPageAddNewItem */
+ addOrReplaceTuple(page, (Item) leafTuple, leafTuple->size,
+ xldata->offnumLeaf);
+
+ /* update head tuple's chain link if needed */
+ if (xldata->offnumHeadLeaf != InvalidOffsetNumber)
+ {
+ SpGistLeafTuple head;
+
+ head = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumHeadLeaf));
+ Assert(head->nextOffset == leafTuple->nextOffset);
+ head->nextOffset = xldata->offnumLeaf;
+ }
+ }
+ else
+ {
+ /* replacing a DEAD tuple */
+ PageIndexTupleDelete(page, xldata->offnumLeaf);
+ if (PageAddItem(page,
+ (Item) leafTuple, leafTuple->size,
+ xldata->offnumLeaf, false, false) != xldata->offnumLeaf)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ leafTuple->size);
+ }
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /* update parent downlink if necessary */
+ if (xldata->blknoParent != InvalidBlockNumber &&
+ !(record->xl_info & XLR_BKP_BLOCK_2))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ SpGistInnerTuple tuple;
+
+ tuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ updateNodeLink(tuple, xldata->nodeI,
+ xldata->blknoLeaf, xldata->offnumLeaf);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+static void
+spgRedoMoveLeafs(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr;
+ SpGistState state;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ int nInsert;
+ Buffer buffer;
+ Page page;
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1;
+
+ ptr += MAXALIGN(sizeof(spgxlogMoveLeafs));
+ toDelete = (OffsetNumber *) ptr;
+ ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nMoves);
+ toInsert = (OffsetNumber *) ptr;
+ ptr += MAXALIGN(sizeof(OffsetNumber) * nInsert);
+
+ /* now ptr points to the list of leaf tuples */
+
+ /* Insert tuples on the dest page (do first, so redirect is valid) */
+ if (!(record->xl_info & XLR_BKP_BLOCK_2))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoDst,
+ xldata->newPage);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+
+ if (xldata->newPage)
+ SpGistInitBuffer(buffer, SPGIST_LEAF);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ int i;
+
+ for (i = 0; i < nInsert; i++)
+ {
+ SpGistLeafTuple lt = (SpGistLeafTuple) ptr;
+
+ addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]);
+ ptr += lt->size;
+ }
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /* Delete tuples from the source page, inserting a redirection pointer */
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves,
+ state.isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ xldata->blknoDst,
+ toInsert[nInsert - 1]);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /* And update the parent downlink */
+ if (!(record->xl_info & XLR_BKP_BLOCK_3))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ SpGistInnerTuple tuple;
+
+ tuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ updateNodeLink(tuple, xldata->nodeI,
+ xldata->blknoDst, toInsert[nInsert - 1]);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+static void
+spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogAddNode *xldata = (spgxlogAddNode *) ptr;
+ SpGistInnerTuple innerTuple;
+ SpGistState state;
+ Buffer buffer;
+ Page page;
+ int bbi;
+
+ /* we assume this is adequately aligned */
+ ptr += sizeof(spgxlogAddNode);
+ innerTuple = (SpGistInnerTuple) ptr;
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ if (xldata->blknoNew == InvalidBlockNumber)
+ {
+ /* update in place */
+ Assert(xldata->blknoParent == InvalidBlockNumber);
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ PageIndexTupleDelete(page, xldata->offnum);
+ if (PageAddItem(page, (Item) innerTuple, innerTuple->size,
+ xldata->offnum,
+ false, false) != xldata->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ innerTuple->size);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ }
+ else
+ {
+ /* Install new tuple first so redirect is valid */
+ if (!(record->xl_info & XLR_BKP_BLOCK_2))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoNew,
+ xldata->newPage);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+
+ if (xldata->newPage)
+ SpGistInitBuffer(buffer, 0);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ addOrReplaceTuple(page, (Item) innerTuple,
+ innerTuple->size, xldata->offnumNew);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /* Delete old tuple, replacing it with redirect or placeholder tuple */
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ SpGistDeadTuple dt;
+
+ if (state.isBuild)
+ dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+ else
+ dt = spgFormDeadTuple(&state, SPGIST_REDIRECT,
+ xldata->blknoNew,
+ xldata->offnumNew);
+
+ PageIndexTupleDelete(page, xldata->offnum);
+ if (PageAddItem(page, (Item) dt, dt->size,
+ xldata->offnum,
+ false, false) != xldata->offnum)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ dt->size);
+
+ if (state.isBuild)
+ SpGistPageGetOpaque(page)->nPlaceholder++;
+ else
+ SpGistPageGetOpaque(page)->nRedirection++;
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /*
+ * Update parent downlink. Since parent could be in either of the
+ * previous two buffers, it's a bit tricky to determine which BKP bit
+ * applies.
+ */
+ if (xldata->blknoParent == xldata->blkno)
+ bbi = 0;
+ else if (xldata->blknoParent == xldata->blknoNew)
+ bbi = 1;
+ else
+ bbi = 2;
+
+ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ SpGistInnerTuple innerTuple;
+
+ innerTuple = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+
+ updateNodeLink(innerTuple, xldata->nodeI,
+ xldata->blknoNew, xldata->offnumNew);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ }
+}
+
+static void
+spgRedoSplitTuple(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogSplitTuple *xldata = (spgxlogSplitTuple *) ptr;
+ SpGistInnerTuple prefixTuple;
+ SpGistInnerTuple postfixTuple;
+ Buffer buffer;
+ Page page;
+
+ /* we assume this is adequately aligned */
+ ptr += sizeof(spgxlogSplitTuple);
+ prefixTuple = (SpGistInnerTuple) ptr;
+ ptr += prefixTuple->size;
+ postfixTuple = (SpGistInnerTuple) ptr;
+
+ /* insert postfix tuple first to avoid dangling link */
+ if (xldata->blknoPostfix != xldata->blknoPrefix &&
+ !(record->xl_info & XLR_BKP_BLOCK_2))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoPostfix,
+ xldata->newPage);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+
+ if (xldata->newPage)
+ SpGistInitBuffer(buffer, 0);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ addOrReplaceTuple(page, (Item) postfixTuple,
+ postfixTuple->size, xldata->offnumPostfix);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ /* now handle the original page */
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blknoPrefix, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ PageIndexTupleDelete(page, xldata->offnumPrefix);
+ if (PageAddItem(page, (Item) prefixTuple, prefixTuple->size,
+ xldata->offnumPrefix, false, false) != xldata->offnumPrefix)
+ elog(ERROR, "failed to add item of size %u to SPGiST index page",
+ prefixTuple->size);
+
+ if (xldata->blknoPostfix == xldata->blknoPrefix)
+ addOrReplaceTuple(page, (Item) postfixTuple,
+ postfixTuple->size,
+ xldata->offnumPostfix);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+static void
+spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr;
+ SpGistInnerTuple innerTuple;
+ SpGistState state;
+ OffsetNumber *toDelete;
+ OffsetNumber *toInsert;
+ uint8 *leafPageSelect;
+ Buffer srcBuffer;
+ Buffer destBuffer;
+ Page page;
+ int bbi;
+ int i;
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ ptr += MAXALIGN(sizeof(spgxlogPickSplit));
+ innerTuple = (SpGistInnerTuple) ptr;
+ ptr += innerTuple->size;
+ toDelete = (OffsetNumber *) ptr;
+ ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nDelete);
+ toInsert = (OffsetNumber *) ptr;
+ ptr += MAXALIGN(sizeof(OffsetNumber) * xldata->nInsert);
+ leafPageSelect = (uint8 *) ptr;
+ ptr += MAXALIGN(sizeof(uint8) * xldata->nInsert);
+
+ /* now ptr points to the list of leaf tuples */
+
+ /*
+ * It's a bit tricky to identify which pages have been handled as
+ * full-page images, so we explicitly count each referenced buffer.
+ */
+ bbi = 0;
+
+ if (xldata->blknoSrc == SPGIST_HEAD_BLKNO)
+ {
+ /* when splitting root, we touch it only in the guise of new inner */
+ srcBuffer = InvalidBuffer;
+ }
+ else if (xldata->initSrc)
+ {
+ /* just re-init the source page */
+ srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, true);
+ Assert(BufferIsValid(srcBuffer));
+ page = (Page) BufferGetPage(srcBuffer);
+
+ SpGistInitBuffer(srcBuffer, SPGIST_LEAF);
+ /* don't update LSN etc till we're done with it */
+ }
+ else
+ {
+ /* delete the specified tuples from source page */
+ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
+ {
+ srcBuffer = XLogReadBuffer(xldata->node, xldata->blknoSrc, false);
+ if (BufferIsValid(srcBuffer))
+ {
+ page = BufferGetPage(srcBuffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ /*
+ * We have it a bit easier here than in doPickSplit(),
+ * because we know the inner tuple's location already,
+ * so we can inject the correct redirection tuple now.
+ */
+ if (!state.isBuild)
+ spgPageIndexMultiDelete(&state, page,
+ toDelete, xldata->nDelete,
+ SPGIST_REDIRECT,
+ SPGIST_PLACEHOLDER,
+ xldata->blknoInner,
+ xldata->offnumInner);
+ else
+ spgPageIndexMultiDelete(&state, page,
+ toDelete, xldata->nDelete,
+ SPGIST_PLACEHOLDER,
+ SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ /* don't update LSN etc till we're done with it */
+ }
+ }
+ }
+ else
+ srcBuffer = InvalidBuffer;
+ bbi++;
+ }
+
+ /* try to access dest page if any */
+ if (xldata->blknoDest == InvalidBlockNumber)
+ {
+ destBuffer = InvalidBuffer;
+ }
+ else if (xldata->initDest)
+ {
+ /* just re-init the dest page */
+ destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, true);
+ Assert(BufferIsValid(destBuffer));
+ page = (Page) BufferGetPage(destBuffer);
+
+ SpGistInitBuffer(destBuffer, SPGIST_LEAF);
+ /* don't update LSN etc till we're done with it */
+ }
+ else
+ {
+ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
+ destBuffer = XLogReadBuffer(xldata->node, xldata->blknoDest, false);
+ else
+ destBuffer = InvalidBuffer;
+ bbi++;
+ }
+
+ /* restore leaf tuples to src and/or dest page */
+ for (i = 0; i < xldata->nInsert; i++)
+ {
+ SpGistLeafTuple lt = (SpGistLeafTuple) ptr;
+ Buffer leafBuffer;
+
+ ptr += lt->size;
+
+ leafBuffer = leafPageSelect[i] ? destBuffer : srcBuffer;
+ if (!BufferIsValid(leafBuffer))
+ continue; /* no need to touch this page */
+ page = BufferGetPage(leafBuffer);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ addOrReplaceTuple(page, (Item) lt, lt->size, toInsert[i]);
+ }
+ }
+
+ /* Now update src and dest page LSNs */
+ if (BufferIsValid(srcBuffer))
+ {
+ page = BufferGetPage(srcBuffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(srcBuffer);
+ }
+ UnlockReleaseBuffer(srcBuffer);
+ }
+ if (BufferIsValid(destBuffer))
+ {
+ page = BufferGetPage(destBuffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(destBuffer);
+ }
+ UnlockReleaseBuffer(destBuffer);
+ }
+
+ /* restore new inner tuple */
+ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
+ {
+ Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoInner,
+ xldata->initInner);
+
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+
+ if (xldata->initInner)
+ SpGistInitBuffer(buffer, 0);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ addOrReplaceTuple(page, (Item) innerTuple, innerTuple->size,
+ xldata->offnumInner);
+
+ /* if inner is also parent, update link while we're here */
+ if (xldata->blknoInner == xldata->blknoParent)
+ {
+ SpGistInnerTuple parent;
+
+ parent = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+ updateNodeLink(parent, xldata->nodeI,
+ xldata->blknoInner, xldata->offnumInner);
+ }
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ bbi++;
+
+ /* update parent downlink, unless we did it above */
+ if (xldata->blknoParent == InvalidBlockNumber)
+ {
+		/* no parent, because we split the root */
+ Assert(xldata->blknoInner == SPGIST_HEAD_BLKNO);
+ }
+ else if (xldata->blknoInner != xldata->blknoParent)
+ {
+ if (!(record->xl_info & XLR_SET_BKP_BLOCK(bbi)))
+ {
+ Buffer buffer = XLogReadBuffer(xldata->node, xldata->blknoParent, false);
+
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ SpGistInnerTuple parent;
+
+ parent = (SpGistInnerTuple) PageGetItem(page,
+ PageGetItemId(page, xldata->offnumParent));
+ updateNodeLink(parent, xldata->nodeI,
+ xldata->blknoInner, xldata->offnumInner);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ }
+}
+
+static void
+spgRedoVacuumLeaf(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogVacuumLeaf *xldata = (spgxlogVacuumLeaf *) ptr;
+ OffsetNumber *toDead;
+ OffsetNumber *toPlaceholder;
+ OffsetNumber *moveSrc;
+ OffsetNumber *moveDest;
+ OffsetNumber *chainSrc;
+ OffsetNumber *chainDest;
+ SpGistState state;
+ Buffer buffer;
+ Page page;
+ int i;
+
+ fillFakeState(&state, xldata->stateSrc);
+
+ ptr += sizeof(spgxlogVacuumLeaf);
+ toDead = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nDead;
+ toPlaceholder = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nPlaceholder;
+ moveSrc = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nMove;
+ moveDest = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nMove;
+ chainSrc = (OffsetNumber *) ptr;
+ ptr += sizeof(OffsetNumber) * xldata->nChain;
+ chainDest = (OffsetNumber *) ptr;
+
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ spgPageIndexMultiDelete(&state, page,
+ toDead, xldata->nDead,
+ SPGIST_DEAD, SPGIST_DEAD,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ spgPageIndexMultiDelete(&state, page,
+ toPlaceholder, xldata->nPlaceholder,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ /* see comments in vacuumLeafPage() */
+ for (i = 0; i < xldata->nMove; i++)
+ {
+ ItemId idSrc = PageGetItemId(page, moveSrc[i]);
+ ItemId idDest = PageGetItemId(page, moveDest[i]);
+ ItemIdData tmp;
+
+ tmp = *idSrc;
+ *idSrc = *idDest;
+ *idDest = tmp;
+ }
+
+ spgPageIndexMultiDelete(&state, page,
+ moveSrc, xldata->nMove,
+ SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
+ InvalidBlockNumber,
+ InvalidOffsetNumber);
+
+ for (i = 0; i < xldata->nChain; i++)
+ {
+ SpGistLeafTuple lt;
+
+ lt = (SpGistLeafTuple) PageGetItem(page,
+ PageGetItemId(page, chainSrc[i]));
+ Assert(lt->tupstate == SPGIST_LIVE);
+ lt->nextOffset = chainDest[i];
+ }
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+static void
+spgRedoVacuumRoot(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogVacuumRoot *xldata = (spgxlogVacuumRoot *) ptr;
+ OffsetNumber *toDelete;
+ Buffer buffer;
+ Page page;
+
+ ptr += sizeof(spgxlogVacuumRoot);
+ toDelete = (OffsetNumber *) ptr;
+
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, SPGIST_HEAD_BLKNO, false);
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ /* The tuple numbers are in order */
+ PageIndexMultiDelete(page, toDelete, xldata->nDelete);
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+static void
+spgRedoVacuumRedirect(XLogRecPtr lsn, XLogRecord *record)
+{
+ char *ptr = XLogRecGetData(record);
+ spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr;
+ OffsetNumber *itemToPlaceholder;
+ Buffer buffer;
+ Page page;
+
+ ptr += sizeof(spgxlogVacuumRedirect);
+ itemToPlaceholder = (OffsetNumber *) ptr;
+
+ if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ {
+ buffer = XLogReadBuffer(xldata->node, xldata->blkno, false);
+
+ if (BufferIsValid(buffer))
+ {
+ page = BufferGetPage(buffer);
+ if (!XLByteLE(lsn, PageGetLSN(page)))
+ {
+ SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
+ int i;
+
+ /* Convert redirect pointers to plain placeholders */
+ for (i = 0; i < xldata->nToPlaceholder; i++)
+ {
+ SpGistDeadTuple dt;
+
+ dt = (SpGistDeadTuple) PageGetItem(page,
+ PageGetItemId(page, itemToPlaceholder[i]));
+ Assert(dt->tupstate == SPGIST_REDIRECT);
+ dt->tupstate = SPGIST_PLACEHOLDER;
+ ItemPointerSetInvalid(&dt->pointer);
+ }
+
+ Assert(opaque->nRedirection >= xldata->nToPlaceholder);
+ opaque->nRedirection -= xldata->nToPlaceholder;
+ opaque->nPlaceholder += xldata->nToPlaceholder;
+
+ /* Remove placeholder tuples at end of page */
+ if (xldata->firstPlaceholder != InvalidOffsetNumber)
+ {
+ int max = PageGetMaxOffsetNumber(page);
+ OffsetNumber *toDelete;
+
+ toDelete = palloc(sizeof(OffsetNumber) * max);
+
+ for (i = xldata->firstPlaceholder; i <= max; i++)
+ toDelete[i - xldata->firstPlaceholder] = i;
+
+ i = max - xldata->firstPlaceholder + 1;
+ Assert(opaque->nPlaceholder >= i);
+ opaque->nPlaceholder -= i;
+
+					/* The array is sorted, so we can use PageIndexMultiDelete */
+ PageIndexMultiDelete(page, toDelete, i);
+
+ pfree(toDelete);
+ }
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+}
+
+void
+spg_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ MemoryContext oldCxt;
+
+ /*
+	 * SP-GiST indexes do not require any conflict processing. NB: If we ever
+	 * implement an optimization similar to the one b-tree has, and remove
+ * killed tuples outside VACUUM, we'll need to handle that here.
+ */
+ RestoreBkpBlocks(lsn, record, false);
+
+ oldCxt = MemoryContextSwitchTo(opCtx);
+ switch (info)
+ {
+ case XLOG_SPGIST_CREATE_INDEX:
+ spgRedoCreateIndex(lsn, record);
+ break;
+ case XLOG_SPGIST_ADD_LEAF:
+ spgRedoAddLeaf(lsn, record);
+ break;
+ case XLOG_SPGIST_MOVE_LEAFS:
+ spgRedoMoveLeafs(lsn, record);
+ break;
+ case XLOG_SPGIST_ADD_NODE:
+ spgRedoAddNode(lsn, record);
+ break;
+ case XLOG_SPGIST_SPLIT_TUPLE:
+ spgRedoSplitTuple(lsn, record);
+ break;
+ case XLOG_SPGIST_PICKSPLIT:
+ spgRedoPickSplit(lsn, record);
+ break;
+ case XLOG_SPGIST_VACUUM_LEAF:
+ spgRedoVacuumLeaf(lsn, record);
+ break;
+ case XLOG_SPGIST_VACUUM_ROOT:
+ spgRedoVacuumRoot(lsn, record);
+ break;
+ case XLOG_SPGIST_VACUUM_REDIRECT:
+ spgRedoVacuumRedirect(lsn, record);
+ break;
+ default:
+ elog(PANIC, "spg_redo: unknown op code %u", info);
+ }
+
+ MemoryContextSwitchTo(oldCxt);
+ MemoryContextReset(opCtx);
+}
+
+static void
+out_target(StringInfo buf, RelFileNode node)
+{
+ appendStringInfo(buf, "rel %u/%u/%u ",
+ node.spcNode, node.dbNode, node.relNode);
+}
+
+void
+spg_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+ uint8 info = xl_info & ~XLR_INFO_MASK;
+
+ switch (info)
+ {
+ case XLOG_SPGIST_CREATE_INDEX:
+ appendStringInfo(buf, "create_index: rel %u/%u/%u",
+ ((RelFileNode *) rec)->spcNode,
+ ((RelFileNode *) rec)->dbNode,
+ ((RelFileNode *) rec)->relNode);
+ break;
+ case XLOG_SPGIST_ADD_LEAF:
+ out_target(buf, ((spgxlogAddLeaf *) rec)->node);
+ appendStringInfo(buf, "add leaf to page: %u",
+ ((spgxlogAddLeaf *) rec)->blknoLeaf);
+ break;
+ case XLOG_SPGIST_MOVE_LEAFS:
+ out_target(buf, ((spgxlogMoveLeafs *) rec)->node);
+ appendStringInfo(buf, "move %u leafs from page %u to page %u",
+ ((spgxlogMoveLeafs *) rec)->nMoves,
+ ((spgxlogMoveLeafs *) rec)->blknoSrc,
+ ((spgxlogMoveLeafs *) rec)->blknoDst);
+ break;
+ case XLOG_SPGIST_ADD_NODE:
+ out_target(buf, ((spgxlogAddNode *) rec)->node);
+ appendStringInfo(buf, "add node to %u:%u",
+ ((spgxlogAddNode *) rec)->blkno,
+ ((spgxlogAddNode *) rec)->offnum);
+ break;
+ case XLOG_SPGIST_SPLIT_TUPLE:
+ out_target(buf, ((spgxlogSplitTuple *) rec)->node);
+ appendStringInfo(buf, "split node %u:%u to %u:%u",
+ ((spgxlogSplitTuple *) rec)->blknoPrefix,
+ ((spgxlogSplitTuple *) rec)->offnumPrefix,
+ ((spgxlogSplitTuple *) rec)->blknoPostfix,
+ ((spgxlogSplitTuple *) rec)->offnumPostfix);
+ break;
+ case XLOG_SPGIST_PICKSPLIT:
+ out_target(buf, ((spgxlogPickSplit *) rec)->node);
+ appendStringInfo(buf, "split leaf page");
+ break;
+ case XLOG_SPGIST_VACUUM_LEAF:
+ out_target(buf, ((spgxlogVacuumLeaf *) rec)->node);
+ appendStringInfo(buf, "vacuum leaf tuples on page %u",
+ ((spgxlogVacuumLeaf *) rec)->blkno);
+ break;
+ case XLOG_SPGIST_VACUUM_ROOT:
+ out_target(buf, ((spgxlogVacuumRoot *) rec)->node);
+ appendStringInfo(buf, "vacuum leaf tuples on root page");
+ break;
+ case XLOG_SPGIST_VACUUM_REDIRECT:
+ out_target(buf, ((spgxlogVacuumRedirect *) rec)->node);
+ appendStringInfo(buf, "vacuum redirect tuples on page %u",
+ ((spgxlogVacuumRedirect *) rec)->blkno);
+ break;
+ default:
+ appendStringInfo(buf, "unknown spgist op code %u", info);
+ break;
+ }
+}
+
+void
+spg_xlog_startup(void)
+{
+ opCtx = AllocSetContextCreate(CurrentMemoryContext,
+ "SP-GiST temporary context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+}
+
+void
+spg_xlog_cleanup(void)
+{
+ MemoryContextDelete(opCtx);
+ opCtx = NULL;
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 6a0a2d9b477..ed8754e6f22 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -14,6 +14,7 @@
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/nbtree.h"
+#include "access/spgist.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "catalog/storage.h"
@@ -40,5 +41,6 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
{"Hash", hash_redo, hash_desc, NULL, NULL, NULL},
{"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint},
{"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL},
- {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL}
+ {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL},
+ {"SPGist", spg_redo, spg_desc, spg_xlog_startup, spg_xlog_cleanup, NULL}
};
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index f5660b2c3cd..d06809e7675 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6555,6 +6555,26 @@ gistcostestimate(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+Datum
+spgcostestimate(PG_FUNCTION_ARGS)
+{
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ IndexOptInfo *index = (IndexOptInfo *) PG_GETARG_POINTER(1);
+ List *indexQuals = (List *) PG_GETARG_POINTER(2);
+ List *indexOrderBys = (List *) PG_GETARG_POINTER(3);
+ RelOptInfo *outer_rel = (RelOptInfo *) PG_GETARG_POINTER(4);
+ Cost *indexStartupCost = (Cost *) PG_GETARG_POINTER(5);
+ Cost *indexTotalCost = (Cost *) PG_GETARG_POINTER(6);
+ Selectivity *indexSelectivity = (Selectivity *) PG_GETARG_POINTER(7);
+ double *indexCorrelation = (double *) PG_GETARG_POINTER(8);
+
+ genericcostestimate(root, index, indexQuals, indexOrderBys, outer_rel, 0.0,
+ indexStartupCost, indexTotalCost,
+ indexSelectivity, indexCorrelation);
+
+ PG_RETURN_VOID();
+}
+
/* Find the index column matching "op"; return its index, or -1 if no match */
static int
find_index_column(Node *op, IndexOptInfo *index)
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index 290f0edaefa..ee5d71e4d71 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -24,6 +24,10 @@
* Note: GIN does not include a page ID word as do the other index types.
* This is OK because the opaque data is only 8 bytes and so can be reliably
* distinguished by size. Revisit this if the size ever increases.
+ * Further note: as of 9.2, SP-GiST also uses 8-byte special space. This is
+ * still OK, as long as GIN isn't using all of the high-order bits in its
+ * flags word, because that way the flags word cannot match the page ID used
+ * by SP-GiST.
*/
typedef struct GinPageOpaqueData
{
diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h
index 14f50345bbf..10b2f9ea4db 100644
--- a/src/include/access/reloptions.h
+++ b/src/include/access/reloptions.h
@@ -42,8 +42,9 @@ typedef enum relopt_kind
RELOPT_KIND_GIST = (1 << 5),
RELOPT_KIND_ATTRIBUTE = (1 << 6),
RELOPT_KIND_TABLESPACE = (1 << 7),
+ RELOPT_KIND_SPGIST = (1 << 8),
/* if you add a new kind, make sure you update "last_default" too */
- RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_TABLESPACE,
+ RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_SPGIST,
/* some compilers treat enums as signed ints, so we can't use 1 << 31 */
RELOPT_KIND_MAX = (1 << 30)
} relopt_kind;
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index 83abba359a5..e4844fe96c9 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -32,6 +32,8 @@ typedef uint8 RmgrId;
#define RM_GIN_ID 13
#define RM_GIST_ID 14
#define RM_SEQ_ID 15
-#define RM_MAX_ID RM_SEQ_ID
+#define RM_SPGIST_ID 16
+
+#define RM_MAX_ID RM_SPGIST_ID
#endif /* RMGR_H */
diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h
new file mode 100644
index 00000000000..aa655a31402
--- /dev/null
+++ b/src/include/access/spgist.h
@@ -0,0 +1,199 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgist.h
+ * Public header file for SP-GiST access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/spgist.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPGIST_H
+#define SPGIST_H
+
+#include "access/skey.h"
+#include "access/xlog.h"
+#include "fmgr.h"
+
+
+/* reloption parameters */
+#define SPGIST_MIN_FILLFACTOR 10
+#define SPGIST_DEFAULT_FILLFACTOR 80
+
+/* SPGiST opclass support function numbers */
+#define SPGIST_CONFIG_PROC 1
+#define SPGIST_CHOOSE_PROC 2
+#define SPGIST_PICKSPLIT_PROC 3
+#define SPGIST_INNER_CONSISTENT_PROC 4
+#define SPGIST_LEAF_CONSISTENT_PROC 5
+#define SPGISTNProc 5
+
+/*
+ * Argument structs for spg_config method
+ */
+typedef struct spgConfigIn
+{
+ Oid attType; /* Data type to be indexed */
+} spgConfigIn;
+
+typedef struct spgConfigOut
+{
+ Oid prefixType; /* Data type of inner-tuple prefixes */
+ Oid labelType; /* Data type of inner-tuple node labels */
+ bool longValuesOK; /* Opclass can cope with values > 1 page */
+} spgConfigOut;
+
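To give a feel for how an opclass fills in this struct, here is a rough sketch of a config support function for a quad-tree-style opclass over point. The function name is illustrative only; the patch's real opclasses live in spgquadtreeproc.c, spgkdtreeproc.c, and spgtextproc.c.

    Datum
    spg_example_config(PG_FUNCTION_ARGS)
    {
        /* spgConfigIn *in = (spgConfigIn *) PG_GETARG_POINTER(0); -- unused here */
        spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1);

        cfg->prefixType = POINTOID;   /* inner tuples carry a centroid point */
        cfg->labelType = VOIDOID;     /* quadrants are implicit, no node labels */
        cfg->longValuesOK = false;    /* a point always fits on a page */
        PG_RETURN_VOID();
    }
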
+/*
+ * Argument structs for spg_choose method
+ */
+typedef struct spgChooseIn
+{
+ Datum datum; /* original datum to be indexed */
+ Datum leafDatum; /* current datum to be stored at leaf */
+ int level; /* current level (counting from zero) */
+
+ /* Data from current inner tuple */
+ bool allTheSame; /* tuple is marked all-the-same? */
+ bool hasPrefix; /* tuple has a prefix? */
+ Datum prefixDatum; /* if so, the prefix value */
+ int nNodes; /* number of nodes in the inner tuple */
+ Datum *nodeLabels; /* node label values (NULL if none) */
+} spgChooseIn;
+
+typedef enum spgChooseResultType
+{
+ spgMatchNode = 1, /* descend into existing node */
+ spgAddNode, /* add a node to the inner tuple */
+ spgSplitTuple /* split inner tuple (change its prefix) */
+} spgChooseResultType;
+
+typedef struct spgChooseOut
+{
+ spgChooseResultType resultType; /* action code, see above */
+ union
+ {
+ struct /* results for spgMatchNode */
+ {
+ int nodeN; /* descend to this node (index from 0) */
+ int levelAdd; /* increment level by this much */
+ Datum restDatum; /* new leaf datum */
+ } matchNode;
+ struct /* results for spgAddNode */
+ {
+ Datum nodeLabel; /* new node's label */
+ int nodeN; /* where to insert it (index from 0) */
+ } addNode;
+ struct /* results for spgSplitTuple */
+ {
+ /* Info to form new inner tuple with one node */
+ bool prefixHasPrefix; /* tuple should have a prefix? */
+ Datum prefixPrefixDatum; /* if so, its value */
+ Datum nodeLabel; /* node's label */
+
+ /* Info to form new lower-level inner tuple with all old nodes */
+ bool postfixHasPrefix; /* tuple should have a prefix? */
+ Datum postfixPrefixDatum; /* if so, its value */
+ } splitTuple;
+ } result;
+} spgChooseOut;
+
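For orientation, the most common outcome, spgMatchNode, would be filled in by a choose function along these lines. This is a hypothetical snippet; whichNode stands in for whatever node-selection logic the opclass applies.

    out->resultType = spgMatchNode;
    out->result.matchNode.nodeN = whichNode;          /* descend into this node */
    out->result.matchNode.levelAdd = 1;               /* one level deeper */
    out->result.matchNode.restDatum = in->leafDatum;  /* store leaf datum unchanged */
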
+/*
+ * Argument structs for spg_picksplit method
+ */
+typedef struct spgPickSplitIn
+{
+ int nTuples; /* number of leaf tuples */
+ Datum *datums; /* their datums (array of length nTuples) */
+ int level; /* current level (counting from zero) */
+} spgPickSplitIn;
+
+typedef struct spgPickSplitOut
+{
+ bool hasPrefix; /* new inner tuple should have a prefix? */
+ Datum prefixDatum; /* if so, its value */
+
+ int nNodes; /* number of nodes for new inner tuple */
+ Datum *nodeLabels; /* their labels (or NULL for no labels) */
+
+ int *mapTuplesToNodes; /* node index for each leaf tuple */
+ Datum *leafTupleDatums; /* datum to store in each new leaf tuple */
+} spgPickSplitOut;
+
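Filling this struct for, say, a two-way split with no prefix and no node labels would look roughly like the following sketch; choose_side() is a hypothetical helper standing in for the opclass's partitioning rule.

    int     i;

    out->hasPrefix = false;
    out->nNodes = 2;
    out->nodeLabels = NULL;        /* no node labels */
    out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples);
    out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples);
    for (i = 0; i < in->nTuples; i++)
    {
        out->mapTuplesToNodes[i] = choose_side(in->datums[i]);
        out->leafTupleDatums[i] = in->datums[i];   /* store datum unchanged */
    }
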
+/*
+ * Argument structs for spg_inner_consistent method
+ */
+typedef struct spgInnerConsistentIn
+{
+ StrategyNumber strategy; /* operator strategy number */
+ Datum query; /* operator's RHS value */
+
+ Datum reconstructedValue; /* value reconstructed at parent */
+ int level; /* current level (counting from zero) */
+
+ /* Data from current inner tuple */
+ bool allTheSame; /* tuple is marked all-the-same? */
+ bool hasPrefix; /* tuple has a prefix? */
+ Datum prefixDatum; /* if so, the prefix value */
+ int nNodes; /* number of nodes in the inner tuple */
+ Datum *nodeLabels; /* node label values (NULL if none) */
+} spgInnerConsistentIn;
+
+typedef struct spgInnerConsistentOut
+{
+ int nNodes; /* number of child nodes to be visited */
+ int *nodeNumbers; /* their indexes in the node array */
+ int *levelAdds; /* increment level by this much for each */
+ Datum *reconstructedValues; /* associated reconstructed values */
+} spgInnerConsistentOut;
+
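A fragmentary sketch of filling this struct, for an inner_consistent function that decides to visit child nodes 0 and 2 of the current inner tuple without adjusting levels or reconstructing values (whether unused fields may be left NULL is an assumption here, not something this header guarantees):

    out->nNodes = 2;
    out->nodeNumbers = (int *) palloc(sizeof(int) * 2);
    out->nodeNumbers[0] = 0;
    out->nodeNumbers[1] = 2;
    out->levelAdds = (int *) palloc0(sizeof(int) * 2);   /* stay at same level */
    out->reconstructedValues = NULL;                     /* nothing to reconstruct */
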
+/*
+ * Argument structs for spg_leaf_consistent method
+ */
+typedef struct spgLeafConsistentIn
+{
+ StrategyNumber strategy; /* operator strategy number */
+ Datum query; /* operator's RHS value */
+
+ Datum reconstructedValue; /* value reconstructed at parent */
+ int level; /* current level (counting from zero) */
+
+ Datum leafDatum; /* datum in leaf tuple */
+} spgLeafConsistentIn;
+
+typedef struct spgLeafConsistentOut
+{
+ bool recheck; /* set true if operator must be rechecked */
+} spgLeafConsistentOut;
+
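A minimal leaf_consistent function for a hypothetical opclass that registers only an equality strategy on text might look like this; the name and the single-strategy assumption are illustrative, not part of the patch.

    Datum
    spg_example_leaf_consistent(PG_FUNCTION_ARGS)
    {
        spgLeafConsistentIn  *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0);
        spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1);
        bool        match;

        /* the leaf datum is the complete indexed value, so the test is exact */
        out->recheck = false;
        match = DatumGetBool(DirectFunctionCall2(texteq,
                                                 in->leafDatum, in->query));
        PG_RETURN_BOOL(match);
    }
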
+
+/* spginsert.c */
+extern Datum spgbuild(PG_FUNCTION_ARGS);
+extern Datum spgbuildempty(PG_FUNCTION_ARGS);
+extern Datum spginsert(PG_FUNCTION_ARGS);
+
+/* spgscan.c */
+extern Datum spgbeginscan(PG_FUNCTION_ARGS);
+extern Datum spgendscan(PG_FUNCTION_ARGS);
+extern Datum spgrescan(PG_FUNCTION_ARGS);
+extern Datum spgmarkpos(PG_FUNCTION_ARGS);
+extern Datum spgrestrpos(PG_FUNCTION_ARGS);
+extern Datum spggetbitmap(PG_FUNCTION_ARGS);
+extern Datum spggettuple(PG_FUNCTION_ARGS);
+
+/* spgutils.c */
+extern Datum spgoptions(PG_FUNCTION_ARGS);
+
+/* spgvacuum.c */
+extern Datum spgbulkdelete(PG_FUNCTION_ARGS);
+extern Datum spgvacuumcleanup(PG_FUNCTION_ARGS);
+
+/* spgxlog.c */
+extern void spg_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void spg_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern void spg_xlog_startup(void);
+extern void spg_xlog_cleanup(void);
+
+#endif /* SPGIST_H */
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h
new file mode 100644
index 00000000000..5c57799f09c
--- /dev/null
+++ b/src/include/access/spgist_private.h
@@ -0,0 +1,609 @@
+/*-------------------------------------------------------------------------
+ *
+ * spgist_private.h
+ * Private declarations for SP-GiST access method.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/spgist_private.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPGIST_PRIVATE_H
+#define SPGIST_PRIVATE_H
+
+#include "access/itup.h"
+#include "access/spgist.h"
+#include "nodes/tidbitmap.h"
+#include "utils/rel.h"
+
+
+/* Page numbers of fixed-location pages */
+#define SPGIST_METAPAGE_BLKNO (0)
+#define SPGIST_HEAD_BLKNO (1)
+
+/*
+ * Contents of page special space on SPGiST index pages
+ */
+typedef struct SpGistPageOpaqueData
+{
+ uint16 flags; /* see bit definitions below */
+ uint16 nRedirection; /* number of redirection tuples on page */
+ uint16 nPlaceholder; /* number of placeholder tuples on page */
+ /* note there's no count of either LIVE or DEAD tuples ... */
+ uint16 spgist_page_id; /* for identification of SP-GiST indexes */
+} SpGistPageOpaqueData;
+
+typedef SpGistPageOpaqueData *SpGistPageOpaque;
+
+/* Flag bits in page special space */
+#define SPGIST_META (1<<0)
+#define SPGIST_DELETED (1<<1)
+#define SPGIST_LEAF (1<<2)
+
+#define SpGistPageGetOpaque(page) ((SpGistPageOpaque) PageGetSpecialPointer(page))
+#define SpGistPageIsMeta(page) (SpGistPageGetOpaque(page)->flags & SPGIST_META)
+#define SpGistPageIsDeleted(page) (SpGistPageGetOpaque(page)->flags & SPGIST_DELETED)
+#define SpGistPageSetDeleted(page) (SpGistPageGetOpaque(page)->flags |= SPGIST_DELETED)
+#define SpGistPageSetNonDeleted(page) (SpGistPageGetOpaque(page)->flags &= ~SPGIST_DELETED)
+#define SpGistPageIsLeaf(page) (SpGistPageGetOpaque(page)->flags & SPGIST_LEAF)
+#define SpGistPageSetLeaf(page) (SpGistPageGetOpaque(page)->flags |= SPGIST_LEAF)
+#define SpGistPageSetInner(page) (SpGistPageGetOpaque(page)->flags &= ~SPGIST_LEAF)
+
+/*
+ * The page ID is for the convenience of pg_filedump and similar utilities,
+ * which otherwise would have a hard time telling pages of different index
+ * types apart. It should be the last 2 bytes on the page. This is more or
+ * less "free" due to alignment considerations.
+ */
+#define SPGIST_PAGE_ID 0xFF82
+
+/*
+ * Each backend keeps a cache of last-used page info in its index->rd_amcache
+ * area. This is initialized from, and occasionally written back to,
+ * shared storage in the index metapage.
+ */
+typedef struct SpGistLastUsedPage
+{
+ BlockNumber blkno; /* block number of described page */
+ int freeSpace; /* its free space (could be obsolete!) */
+} SpGistLastUsedPage;
+
+typedef struct SpGistCache
+{
+ SpGistLastUsedPage innerPage[3]; /* one per triple-parity group */
+ SpGistLastUsedPage leafPage;
+} SpGistCache;
+
+/*
+ * metapage
+ */
+typedef struct SpGistMetaPageData
+{
+ uint32 magicNumber; /* for identity cross-check */
+ SpGistCache lastUsedPages; /* shared storage of last-used info */
+} SpGistMetaPageData;
+
+#define SPGIST_MAGIC_NUMBER (0xBA0BABED)
+
+#define SpGistPageGetMeta(p) \
+ ((SpGistMetaPageData *) PageGetContents(p))
+
+/*
+ * Private state of index AM. SpGistState is common to both insert and
+ * search code; SpGistScanOpaque is for searches only.
+ */
+
+/* Per-datatype info needed in SpGistState */
+typedef struct SpGistTypeDesc
+{
+ Oid type;
+ bool attbyval;
+ int16 attlen;
+} SpGistTypeDesc;
+
+typedef struct SpGistState
+{
+ spgConfigOut config; /* filled in by opclass config method */
+
+ SpGistTypeDesc attType; /* type of input data and leaf values */
+ SpGistTypeDesc attPrefixType; /* type of inner-tuple prefix values */
+ SpGistTypeDesc attLabelType; /* type of node label values */
+
+ /* lookup data for the opclass support functions, except config */
+ FmgrInfo chooseFn;
+ FmgrInfo picksplitFn;
+ FmgrInfo innerConsistentFn;
+ FmgrInfo leafConsistentFn;
+
+ char *deadTupleStorage; /* workspace for spgFormDeadTuple */
+
+ TransactionId myXid; /* XID to use when creating a redirect tuple */
+ bool isBuild; /* true if doing index build */
+} SpGistState;
+
+/*
+ * Private state of an index scan
+ */
+typedef struct SpGistScanOpaqueData
+{
+ SpGistState state; /* see above */
+ MemoryContext tempCxt; /* short-lived memory context */
+
+ /* Index quals for scan (copied from IndexScanDesc for convenience) */
+ int numberOfKeys; /* number of index qualifier conditions */
+ ScanKey keyData; /* array of index qualifier descriptors */
+
+ /* Stack of yet-to-be-visited pages */
+ List *scanStack; /* List of ScanStackEntrys */
+
+ /* These fields are only used in amgetbitmap scans: */
+ TIDBitmap *tbm; /* bitmap being filled */
+ int64 ntids; /* number of TIDs passed to bitmap */
+
+ /* These fields are only used in amgettuple scans: */
+ int nPtrs; /* number of TIDs found on current page */
+ int iPtr; /* index for scanning through same */
+ ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */
+ bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */
+
+ /*
+ * Note: using MaxIndexTuplesPerPage above is a bit hokey since
+ * SpGistLeafTuples aren't exactly IndexTuples; however, they are
+ * larger, so this is safe.
+ */
+} SpGistScanOpaqueData;
+
+typedef SpGistScanOpaqueData *SpGistScanOpaque;
+
+
+/*
+ * SPGiST tuple types. Note: inner, leaf, and dead tuple structs
+ * must have the same tupstate field in the same position! Real inner and
+ * leaf tuples always have tupstate = LIVE; if the state is something else,
+ * use the SpGistDeadTuple struct to inspect the tuple.
+ */
+
+/* values of tupstate (see README for more info) */
+#define SPGIST_LIVE 0 /* normal live tuple (either inner or leaf) */
+#define SPGIST_REDIRECT 1 /* temporary redirection placeholder */
+#define SPGIST_DEAD 2 /* dead, cannot be removed because of links */
+#define SPGIST_PLACEHOLDER 3 /* placeholder, used to preserve offsets */
+
+/*
+ * SPGiST inner tuple: list of "nodes" that subdivide a set of tuples
+ *
+ * Inner tuple layout:
+ * header/optional prefix/array of nodes, which are SpGistNodeTuples
+ *
+ * size and prefixSize must be multiples of MAXALIGN
+ */
+typedef struct SpGistInnerTupleData
+{
+ unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */
+ allTheSame:1, /* all nodes in tuple are equivalent */
+ nNodes:13, /* number of nodes within inner tuple */
+ prefixSize:16; /* size of prefix, or 0 if none */
+ uint16 size; /* total size of inner tuple */
+ /* On most machines there will be a couple of wasted bytes here */
+ /* prefix datum follows, then nodes */
+} SpGistInnerTupleData;
+
+typedef SpGistInnerTupleData *SpGistInnerTuple;
+
+/* these must match largest values that fit in bit fields declared above */
+#define SGITMAXNNODES 0x1FFF
+#define SGITMAXPREFIXSIZE 0xFFFF
+#define SGITMAXSIZE 0xFFFF
+
+#define SGITHDRSZ MAXALIGN(sizeof(SpGistInnerTupleData))
+#define _SGITDATA(x) (((char *) (x)) + SGITHDRSZ)
+#define SGITDATAPTR(x) ((x)->prefixSize ? _SGITDATA(x) : NULL)
+#define SGITDATUM(x, s) ((x)->prefixSize ? \
+ ((s)->attPrefixType.attbyval ? \
+ *(Datum *) _SGITDATA(x) : \
+ PointerGetDatum(_SGITDATA(x))) \
+ : (Datum) 0)
+#define SGITNODEPTR(x) ((SpGistNodeTuple) (_SGITDATA(x) + (x)->prefixSize))
+
+/* Macro for iterating through the nodes of an inner tuple */
+#define SGITITERATE(x, i, nt) \
+ for ((i) = 0, (nt) = SGITNODEPTR(x); \
+ (i) < (x)->nNodes; \
+ (i)++, (nt) = (SpGistNodeTuple) (((char *) (nt)) + IndexTupleSize(nt)))
+
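The expected usage pattern for this iterator, assuming innerTuple is an SpGistInnerTuple already fetched from a page, is roughly:

    int             i;
    SpGistNodeTuple node;

    SGITITERATE(innerTuple, i, node)
    {
        /* each node carries its downlink in the standard IndexTuple TID field */
        if (ItemPointerIsValid(&node->t_tid))
            elog(DEBUG1, "node %d links to block %u",
                 i, ItemPointerGetBlockNumber(&node->t_tid));
    }
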
+/*
+ * SPGiST node tuple: one node within an inner tuple
+ *
+ * Node tuples use the same header as ordinary Postgres IndexTuples, but
+ * we do not use a null bitmap, because we know there is only one column
+ * so the INDEX_NULL_MASK bit suffices. Also, pass-by-value datums are
+ * stored as a full Datum, the same convention as for inner tuple prefixes
+ * and leaf tuple datums.
+ */
+
+typedef IndexTupleData SpGistNodeTupleData;
+
+typedef SpGistNodeTupleData *SpGistNodeTuple;
+
+#define SGNTHDRSZ MAXALIGN(sizeof(SpGistNodeTupleData))
+#define SGNTDATAPTR(x) (((char *) (x)) + SGNTHDRSZ)
+#define SGNTDATUM(x, s) ((s)->attLabelType.attbyval ? \
+ *(Datum *) SGNTDATAPTR(x) : \
+ PointerGetDatum(SGNTDATAPTR(x)))
+
+/*
+ * SPGiST leaf tuple: carries a datum and a heap tuple TID
+ *
+ * In the simplest case, the datum is the same as the indexed value; but
+ * it could also be a suffix or some other sort of delta that permits
+ * reconstruction given knowledge of the prefix path traversed to get here.
+ *
+ * The size field is wider than could possibly be needed for an on-disk leaf
+ * tuple, but this allows us to form leaf tuples even when the datum is too
+ * wide to be stored immediately, and it costs nothing because of alignment
+ * considerations.
+ *
+ * Normally, nextOffset links to the next tuple belonging to the same parent
+ * node (which must be on the same page). But when the root page is a leaf
+ * page, we don't chain its tuples, so nextOffset is always 0 on the root.
+ *
+ * size must be a multiple of MAXALIGN
+ */
+typedef struct SpGistLeafTupleData
+{
+ unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */
+ size:30; /* large enough for any palloc'able value */
+ OffsetNumber nextOffset; /* next tuple in chain, or InvalidOffset */
+ ItemPointerData heapPtr; /* TID of represented heap tuple */
+ /* leaf datum follows */
+} SpGistLeafTupleData;
+
+typedef SpGistLeafTupleData *SpGistLeafTuple;
+
+#define SGLTHDRSZ MAXALIGN(sizeof(SpGistLeafTupleData))
+#define SGLTDATAPTR(x) (((char *) (x)) + SGLTHDRSZ)
+#define SGLTDATUM(x, s) ((s)->attType.attbyval ? \
+ *(Datum *) SGLTDATAPTR(x) : \
+ PointerGetDatum(SGLTDATAPTR(x)))
+
+/*
+ * SPGiST dead tuple: declaration for examining non-live tuples
+ *
+ * The tupstate field of this struct must match those of regular inner and
+ * leaf tuples, and its size field must match a leaf tuple's.
+ * Also, the pointer field must be in the same place as a leaf tuple's heapPtr
+ * field, to satisfy some Asserts that we make when replacing a leaf tuple
+ * with a dead tuple.
+ * We don't use nextOffset, but it's needed to align the pointer field.
+ * pointer and xid are only valid when tupstate = REDIRECT.
+ */
+typedef struct SpGistDeadTupleData
+{
+ unsigned int tupstate:2, /* LIVE/REDIRECT/DEAD/PLACEHOLDER */
+ size:30;
+ OffsetNumber nextOffset; /* not used in dead tuples */
+ ItemPointerData pointer; /* redirection inside index */
+ TransactionId xid; /* ID of xact that inserted this tuple */
+} SpGistDeadTupleData;
+
+typedef SpGistDeadTupleData *SpGistDeadTuple;
+
+#define SGDTSIZE MAXALIGN(sizeof(SpGistDeadTupleData))
+
+/*
+ * Macros for doing free-space calculations. Note that when adding up the
+ * space needed for tuples, we always consider each tuple to need the tuple's
+ * size plus sizeof(ItemIdData) (for the line pointer). This works correctly
+ * so long as tuple sizes are always maxaligned.
+ */
+
+/* Page capacity after allowing for fixed header and special space */
+#define SPGIST_PAGE_CAPACITY \
+ MAXALIGN_DOWN(BLCKSZ - \
+ SizeOfPageHeaderData - \
+ MAXALIGN(sizeof(SpGistPageOpaqueData)))
+
+/*
+ * Compute free space on page, assuming that up to n placeholders can be
+ * recycled if present (n should be the number of tuples to be inserted)
+ */
+#define SpGistPageGetFreeSpace(p, n) \
+ (PageGetExactFreeSpace(p) + \
+ Min(SpGistPageGetOpaque(p)->nPlaceholder, n) * \
+ (SGDTSIZE + sizeof(ItemIdData)))
+
+/*
+ * XLOG stuff
+ *
+ * ACCEPT_RDATA_* can only use fixed-length rdata arrays, because of lengthof
+ */
+
+#define ACCEPT_RDATA_DATA(p, s, i) \
+ do { \
+ Assert((i) < lengthof(rdata)); \
+ rdata[i].data = (char *) (p); \
+ rdata[i].len = (s); \
+ rdata[i].buffer = InvalidBuffer; \
+ rdata[i].buffer_std = true; \
+ rdata[i].next = NULL; \
+ if ((i) > 0) \
+ rdata[(i) - 1].next = rdata + (i); \
+ } while(0)
+
+#define ACCEPT_RDATA_BUFFER(b, i) \
+ do { \
+ Assert((i) < lengthof(rdata)); \
+ rdata[i].data = NULL; \
+ rdata[i].len = 0; \
+ rdata[i].buffer = (b); \
+ rdata[i].buffer_std = true; \
+ rdata[i].next = NULL; \
+ if ((i) > 0) \
+ rdata[(i) - 1].next = rdata + (i); \
+ } while(0)
+
+
+/* XLOG record types for SPGiST */
+#define XLOG_SPGIST_CREATE_INDEX 0x00
+#define XLOG_SPGIST_ADD_LEAF 0x10
+#define XLOG_SPGIST_MOVE_LEAFS 0x20
+#define XLOG_SPGIST_ADD_NODE 0x30
+#define XLOG_SPGIST_SPLIT_TUPLE 0x40
+#define XLOG_SPGIST_PICKSPLIT 0x50
+#define XLOG_SPGIST_VACUUM_LEAF 0x60
+#define XLOG_SPGIST_VACUUM_ROOT 0x70
+#define XLOG_SPGIST_VACUUM_REDIRECT 0x80
+
+/*
+ * Some redo functions need an SpGistState, although only a few of its fields
+ * need to be valid. spgxlogState carries the required info in xlog records.
+ * (See fillFakeState in spgxlog.c for more comments.)
+ */
+typedef struct spgxlogState
+{
+ TransactionId myXid;
+ bool isBuild;
+} spgxlogState;
+
+#define STORE_STATE(s, d) \
+ do { \
+ (d).myXid = (s)->myXid; \
+ (d).isBuild = (s)->isBuild; \
+ } while(0)
+
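A minimal usage sketch (not part of the patch), assuming "state" was set up by initSpGistState(), declared later in this header:

	spgxlogState xlstate;

	STORE_STATE(state, xlstate);
	/* xlstate is then copied into the stateSrc field of an spgxlog* record */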
+
+typedef struct spgxlogAddLeaf
+{
+ RelFileNode node;
+
+ BlockNumber blknoLeaf; /* destination page for leaf tuple */
+ bool newPage; /* init dest page? */
+ OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */
+ OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */
+
+ BlockNumber blknoParent; /* where the parent downlink is, if any */
+ OffsetNumber offnumParent;
+ uint16 nodeI;
+
+ /*
+ * new leaf tuple follows, on an intalign boundary (replay only needs to
+ * fetch its size field, so that should be enough alignment)
+ */
+} spgxlogAddLeaf;
+
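For illustration, a simplified and hypothetical logging routine could combine this record with the ACCEPT_RDATA_* macros above. It is not the exact spgdoinsert.c code; it assumes the RM_SPGIST_ID resource-manager ID added elsewhere in this patch, the XLogRecData-based XLogInsert() signature of this era, and that the caller has already filled in "xlrec".

static void
log_add_leaf(spgxlogAddLeaf *xlrec, SpGistLeafTuple leafTuple, Buffer leafBuffer)
{
	XLogRecData rdata[3];

	/* fixed record header, then the new leaf tuple, then the buffer reference */
	ACCEPT_RDATA_DATA(xlrec, sizeof(spgxlogAddLeaf), 0);
	ACCEPT_RDATA_DATA(leafTuple, leafTuple->size, 1);
	ACCEPT_RDATA_BUFFER(leafBuffer, 2);

	(void) XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_ADD_LEAF, rdata);
}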
+typedef struct spgxlogMoveLeafs
+{
+ RelFileNode node;
+
+ BlockNumber blknoSrc; /* source leaf page */
+ BlockNumber blknoDst; /* destination leaf page */
+ uint16 nMoves; /* number of tuples moved from source page */
+ bool newPage; /* init dest page? */
+ bool replaceDead; /* are we replacing a DEAD source tuple? */
+
+ BlockNumber blknoParent; /* where the parent downlink is */
+ OffsetNumber offnumParent;
+ uint16 nodeI;
+
+ spgxlogState stateSrc;
+
+ /*----------
+ * data follows:
+ * array of deleted tuple numbers, length nMoves
+ * array of inserted tuple numbers, length nMoves + 1 or 1
+ * list of leaf tuples, length nMoves + 1 or 1 (must be maxaligned)
+ * the tuple number arrays are padded to maxalign boundaries so that the
+ * leaf tuples will be suitably aligned
+ *
+ * Note: if replaceDead is true then there is only one inserted tuple
+ * number and only one leaf tuple in the data, because we are not copying
+ * the dead tuple from the source
+ *
+ * Buffer references in the rdata array are:
+ * Src page
+ * Dest page
+ * Parent page
+ *----------
+ */
+} spgxlogMoveLeafs;
+
+typedef struct spgxlogAddNode
+{
+ RelFileNode node;
+
+ BlockNumber blkno; /* block number of original inner tuple */
+ OffsetNumber offnum; /* offset of original inner tuple */
+
+ BlockNumber blknoParent; /* where parent downlink is, if updated */
+ OffsetNumber offnumParent;
+ uint16 nodeI;
+
+ BlockNumber blknoNew; /* where new tuple goes, if not same place */
+ OffsetNumber offnumNew;
+ bool newPage; /* init new page? */
+
+ spgxlogState stateSrc;
+
+ /*
+ * updated inner tuple follows, on an intalign boundary (replay only needs
+ * to fetch its size field, so that should be enough alignment)
+ */
+} spgxlogAddNode;
+
+typedef struct spgxlogSplitTuple
+{
+ RelFileNode node;
+
+ BlockNumber blknoPrefix; /* where the prefix tuple goes */
+ OffsetNumber offnumPrefix;
+
+ BlockNumber blknoPostfix; /* where the postfix tuple goes */
+ OffsetNumber offnumPostfix;
+ bool newPage; /* need to init that page? */
+
+ /*
+ * new prefix inner tuple follows, then new postfix inner tuple, on
+ * intalign boundaries (replay only needs to fetch size fields, so that
+ * should be enough alignment)
+ */
+} spgxlogSplitTuple;
+
+typedef struct spgxlogPickSplit
+{
+ RelFileNode node;
+
+ BlockNumber blknoSrc; /* original leaf page */
+ BlockNumber blknoDest; /* other leaf page, if any */
+ uint16 nDelete; /* n to delete from Src */
+ uint16 nInsert; /* n to insert on Src and/or Dest */
+ bool initSrc; /* re-init the Src page? */
+ bool initDest; /* re-init the Dest page? */
+
+ BlockNumber blknoInner; /* where to put new inner tuple */
+ OffsetNumber offnumInner;
+ bool initInner; /* re-init the Inner page? */
+
+ BlockNumber blknoParent; /* where the parent downlink is, if any */
+ OffsetNumber offnumParent;
+ uint16 nodeI;
+
+ spgxlogState stateSrc;
+
+ /*----------
+ * data follows:
+ * new inner tuple (assumed to have a maxaligned length)
+ * array of deleted tuple numbers, length nDelete
+ * array of inserted tuple numbers, length nInsert
+ * array of page selector bytes for inserted tuples, length nInsert
+ * list of leaf tuples, length nInsert (must be maxaligned)
+ * the tuple number and page selector arrays are padded to maxalign
+ * boundaries so that the leaf tuples will be suitably aligned
+ *
+ * Buffer references in the rdata array are:
+ * Src page (only if not root and not being init'd)
+ * Dest page (if used and not being init'd)
+ * Inner page (only if not being init'd)
+ * Parent page (if any; could be same as Inner)
+ *----------
+ */
+} spgxlogPickSplit;
+
+typedef struct spgxlogVacuumLeaf
+{
+ RelFileNode node;
+
+ BlockNumber blkno; /* block number to clean */
+ uint16 nDead; /* number of tuples to become DEAD */
+ uint16 nPlaceholder; /* number of tuples to become PLACEHOLDER */
+ uint16 nMove; /* number of tuples to move */
+ uint16 nChain; /* number of tuples to re-chain */
+
+ spgxlogState stateSrc;
+
+ /*----------
+ * data follows:
+ * tuple numbers to become DEAD
+ * tuple numbers to become PLACEHOLDER
+ * tuple numbers to move from (and replace with PLACEHOLDER)
+ * tuple numbers to move to (replacing what is there)
+ * tuple numbers to update nextOffset links of
+ * tuple numbers to insert in nextOffset links
+ *----------
+ */
+} spgxlogVacuumLeaf;
+
+typedef struct spgxlogVacuumRoot
+{
+ /* vacuum root page when it is a leaf */
+ RelFileNode node;
+
+ uint16 nDelete; /* number of tuples to delete */
+
+ spgxlogState stateSrc;
+
+ /* offsets of tuples to delete follow */
+} spgxlogVacuumRoot;
+
+typedef struct spgxlogVacuumRedirect
+{
+ RelFileNode node;
+
+ BlockNumber blkno; /* block number to clean */
+ uint16 nToPlaceholder; /* number of redirects to make placeholders */
+ OffsetNumber firstPlaceholder; /* first placeholder tuple to remove */
+
+ /* offsets of redirect tuples to make placeholders follow */
+} spgxlogVacuumRedirect;
+
+/*
+ * The "flags" argument for SpGistGetBuffer should be either GBUF_LEAF to
+ * get a leaf page, or GBUF_INNER_PARITY(blockNumber) to get an inner
+ * page in the same triple-parity group as the specified block number.
+ * (Typically, this should be GBUF_INNER_PARITY(parentBlockNumber + 1)
+ * to follow the rule described in spgist/README.)
+ */
+#define GBUF_PARITY_MASK 0x03
+#define GBUF_LEAF 0x04
+#define GBUF_INNER_PARITY(x) ((x) % 3)
+
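A small sketch (not part of the patch) of the call pattern the comment recommends, using the SpGistGetBuffer declaration below:

static Buffer
get_inner_buffer_for_parent(Relation index, BlockNumber parentBlkno,
							int neededSpace, bool *isNew)
{
	/* the new inner tuple goes in the parity group after its parent's */
	return SpGistGetBuffer(index, GBUF_INNER_PARITY(parentBlkno + 1),
						   neededSpace, isNew);
}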
+/* spgutils.c */
+extern void initSpGistState(SpGistState *state, Relation index);
+extern Buffer SpGistNewBuffer(Relation index);
+extern void SpGistUpdateMetaPage(Relation index);
+extern Buffer SpGistGetBuffer(Relation index, int flags,
+ int needSpace, bool *isNew);
+extern void SpGistSetLastUsedPage(Relation index, Buffer buffer);
+extern void SpGistInitPage(Page page, uint16 f);
+extern void SpGistInitBuffer(Buffer b, uint16 f);
+extern void SpGistInitMetapage(Page page);
+extern unsigned int SpGistGetTypeSize(SpGistTypeDesc *att, Datum datum);
+extern SpGistLeafTuple spgFormLeafTuple(SpGistState *state,
+ ItemPointer heapPtr, Datum datum);
+extern SpGistNodeTuple spgFormNodeTuple(SpGistState *state,
+ Datum label, bool isnull);
+extern SpGistInnerTuple spgFormInnerTuple(SpGistState *state,
+ bool hasPrefix, Datum prefix,
+ int nNodes, SpGistNodeTuple *nodes);
+extern SpGistDeadTuple spgFormDeadTuple(SpGistState *state, int tupstate,
+ BlockNumber blkno, OffsetNumber offnum);
+extern Datum *spgExtractNodeLabels(SpGistState *state,
+ SpGistInnerTuple innerTuple);
+extern OffsetNumber SpGistPageAddNewItem(SpGistState *state, Page page,
+ Item item, Size size,
+ OffsetNumber *startOffset,
+ bool errorOK);
+
+/* spgdoinsert.c */
+extern void updateNodeLink(SpGistInnerTuple tup, int nodeN,
+ BlockNumber blkno, OffsetNumber offset);
+extern void spgPageIndexMultiDelete(SpGistState *state, Page page,
+ OffsetNumber *itemnos, int nitems,
+ int firststate, int reststate,
+ BlockNumber blkno, OffsetNumber offnum);
+extern void spgdoinsert(Relation index, SpGistState *state,
+ ItemPointer heapPtr, Datum datum);
+
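Taken together, the declarations above support an insertion path roughly like this hypothetical helper (illustration only, not code from the patch):

static void
spgist_insert_one(Relation index, ItemPointer heapPtr, Datum datum)
{
	SpGistState state;

	initSpGistState(&state, index);
	spgdoinsert(index, &state, heapPtr, datum);
}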
+#endif /* SPGIST_PRIVATE_H */
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 14e177dc482..eb343545915 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 201112071
+#define CATALOG_VERSION_NO 201112171
#endif
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h
index ddacdf274c4..6fdd1d5b052 100644
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -117,17 +117,20 @@ typedef FormData_pg_am *Form_pg_am;
* ----------------
*/
-DATA(insert OID = 403 ( btree 5 2 t f t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions ));
+DATA(insert OID = 403 ( btree 5 2 t f t t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
-DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
+DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
DESCR("hash index access method");
#define HASH_AM_OID 405
-DATA(insert OID = 783 ( gist 0 8 f t f f t f t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
+DATA(insert OID = 783 ( gist 0 8 f t f f t f t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
-DATA(insert OID = 2742 ( gin 0 5 f f f f t f t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
+DATA(insert OID = 2742 ( gin 0 5 f f f f t f t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
+DATA(insert OID = 4000 ( spgist 0 5 f f f f f f f f f f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcostestimate spgoptions ));
+DESCR("SP-GiST index access method");
+#define SPGIST_AM_OID 4000
#endif /* PG_AM_H */
diff --git a/src/include/catalog/pg_amop.h b/src/include/catalog/pg_amop.h
index 1e8c9a289f9..cb394e03e40 100644
--- a/src/include/catalog/pg_amop.h
+++ b/src/include/catalog/pg_amop.h
@@ -737,4 +737,37 @@ DATA(insert ( 3919 3831 3831 8 s 3892 783 0 ));
DATA(insert ( 3919 3831 2283 16 s 3889 783 0 ));
DATA(insert ( 3919 3831 3831 18 s 3882 783 0 ));
+/*
+ * SP-GiST quad_point_ops
+ */
+DATA(insert ( 4015 600 600 11 s 506 4000 0 ));
+DATA(insert ( 4015 600 600 1 s 507 4000 0 ));
+DATA(insert ( 4015 600 600 5 s 508 4000 0 ));
+DATA(insert ( 4015 600 600 10 s 509 4000 0 ));
+DATA(insert ( 4015 600 600 6 s 510 4000 0 ));
+DATA(insert ( 4015 600 603 8 s 511 4000 0 ));
+
+/*
+ * SP-GiST kd_point_ops
+ */
+DATA(insert ( 4016 600 600 11 s 506 4000 0 ));
+DATA(insert ( 4016 600 600 1 s 507 4000 0 ));
+DATA(insert ( 4016 600 600 5 s 508 4000 0 ));
+DATA(insert ( 4016 600 600 10 s 509 4000 0 ));
+DATA(insert ( 4016 600 600 6 s 510 4000 0 ));
+DATA(insert ( 4016 600 603 8 s 511 4000 0 ));
+
+/*
+ * SP-GiST text_ops
+ */
+DATA(insert ( 4017 25 25 1 s 2314 4000 0 ));
+DATA(insert ( 4017 25 25 2 s 2315 4000 0 ));
+DATA(insert ( 4017 25 25 3 s 98 4000 0 ));
+DATA(insert ( 4017 25 25 4 s 2317 4000 0 ));
+DATA(insert ( 4017 25 25 5 s 2318 4000 0 ));
+DATA(insert ( 4017 25 25 11 s 664 4000 0 ));
+DATA(insert ( 4017 25 25 12 s 665 4000 0 ));
+DATA(insert ( 4017 25 25 14 s 667 4000 0 ));
+DATA(insert ( 4017 25 25 15 s 666 4000 0 ));
+
#endif /* PG_AMOP_H */
diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h
index 8571dd08709..a4c49efed83 100644
--- a/src/include/catalog/pg_amproc.h
+++ b/src/include/catalog/pg_amproc.h
@@ -356,4 +356,22 @@ DATA(insert ( 3919 3831 3831 5 3879 ));
DATA(insert ( 3919 3831 3831 6 3880 ));
DATA(insert ( 3919 3831 3831 7 3881 ));
+
+/* sp-gist */
+DATA(insert ( 4015 600 600 1 4018 ));
+DATA(insert ( 4015 600 600 2 4019 ));
+DATA(insert ( 4015 600 600 3 4020 ));
+DATA(insert ( 4015 600 600 4 4021 ));
+DATA(insert ( 4015 600 600 5 4022 ));
+DATA(insert ( 4016 600 600 1 4023 ));
+DATA(insert ( 4016 600 600 2 4024 ));
+DATA(insert ( 4016 600 600 3 4025 ));
+DATA(insert ( 4016 600 600 4 4026 ));
+DATA(insert ( 4016 600 600 5 4022 ));
+DATA(insert ( 4017 25 25 1 4027 ));
+DATA(insert ( 4017 25 25 2 4028 ));
+DATA(insert ( 4017 25 25 3 4029 ));
+DATA(insert ( 4017 25 25 4 4030 ));
+DATA(insert ( 4017 25 25 5 4031 ));
+
#endif /* PG_AMPROC_H */
diff --git a/src/include/catalog/pg_opclass.h b/src/include/catalog/pg_opclass.h
index eecd3b63c50..c692ae4311b 100644
--- a/src/include/catalog/pg_opclass.h
+++ b/src/include/catalog/pg_opclass.h
@@ -223,5 +223,8 @@ DATA(insert ( 783 tsquery_ops PGNSP PGUID 3702 3615 t 20 ));
DATA(insert ( 403 range_ops PGNSP PGUID 3901 3831 t 0 ));
DATA(insert ( 405 range_ops PGNSP PGUID 3903 3831 t 0 ));
DATA(insert ( 783 range_ops PGNSP PGUID 3919 3831 t 0 ));
+DATA(insert ( 4000 quad_point_ops PGNSP PGUID 4015 600 t 0 ));
+DATA(insert ( 4000 kd_point_ops PGNSP PGUID 4016 600 f 0 ));
+DATA(insert ( 4000 text_ops PGNSP PGUID 4017 25 t 0 ));
#endif /* PG_OPCLASS_H */
diff --git a/src/include/catalog/pg_opfamily.h b/src/include/catalog/pg_opfamily.h
index 5ea949bec6b..009000ffcff 100644
--- a/src/include/catalog/pg_opfamily.h
+++ b/src/include/catalog/pg_opfamily.h
@@ -142,5 +142,8 @@ DATA(insert OID = 3702 ( 783 tsquery_ops PGNSP PGUID ));
DATA(insert OID = 3901 ( 403 range_ops PGNSP PGUID ));
DATA(insert OID = 3903 ( 405 range_ops PGNSP PGUID ));
DATA(insert OID = 3919 ( 783 range_ops PGNSP PGUID ));
+DATA(insert OID = 4015 ( 4000 quad_point_ops PGNSP PGUID ));
+DATA(insert OID = 4016 ( 4000 kd_point_ops PGNSP PGUID ));
+DATA(insert OID = 4017 ( 4000 text_ops PGNSP PGUID ));
#endif /* PG_OPFAMILY_H */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 924cb1f601c..6da3b421ae3 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4481,6 +4481,68 @@ DESCR("int8range constructor");
DATA(insert OID = 3946 ( int8range PGNSP PGUID 12 1 0 0 0 f f f f f i 3 0 3926 "20 20 25" _null_ _null_ _null_ _null_ range_constructor3 _null_ _null_ _null_ ));
DESCR("int8range constructor");
+/* spgist support functions */
+DATA(insert OID = 4001 ( spggettuple PGNSP PGUID 12 1 0 0 0 f f f t f v 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ spggettuple _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4002 ( spggetbitmap PGNSP PGUID 12 1 0 0 0 f f f t f v 2 0 20 "2281 2281" _null_ _null_ _null_ _null_ spggetbitmap _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4003 ( spginsert PGNSP PGUID 12 1 0 0 0 f f f t f v 6 0 16 "2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ spginsert _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4004 ( spgbeginscan PGNSP PGUID 12 1 0 0 0 f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ spgbeginscan _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4005 ( spgrescan PGNSP PGUID 12 1 0 0 0 f f f t f v 5 0 2278 "2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ spgrescan _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4006 ( spgendscan PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgendscan _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4007 ( spgmarkpos PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgmarkpos _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4008 ( spgrestrpos PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgrestrpos _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4009 ( spgbuild PGNSP PGUID 12 1 0 0 0 f f f t f v 3 0 2281 "2281 2281 2281" _null_ _null_ _null_ _null_ spgbuild _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4010 ( spgbuildempty PGNSP PGUID 12 1 0 0 0 f f f t f v 1 0 2278 "2281" _null_ _null_ _null_ _null_ spgbuildempty _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4011 ( spgbulkdelete PGNSP PGUID 12 1 0 0 0 f f f t f v 4 0 2281 "2281 2281 2281 2281" _null_ _null_ _null_ _null_ spgbulkdelete _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4012 ( spgvacuumcleanup PGNSP PGUID 12 1 0 0 0 f f f t f v 2 0 2281 "2281 2281" _null_ _null_ _null_ _null_ spgvacuumcleanup _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4013 ( spgcostestimate PGNSP PGUID 12 1 0 0 0 f f f t f v 9 0 2278 "2281 2281 2281 2281 2281 2281 2281 2281 2281" _null_ _null_ _null_ _null_ spgcostestimate _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+DATA(insert OID = 4014 ( spgoptions PGNSP PGUID 12 1 0 0 0 f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ spgoptions _null_ _null_ _null_ ));
+DESCR("spgist(internal)");
+
+/* spgist opclasses */
+DATA(insert OID = 4018 ( spg_quad_config PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_config _null_ _null_ _null_ ));
+DESCR("SP-GiST support for quad tree over point");
+DATA(insert OID = 4019 ( spg_quad_choose PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_choose _null_ _null_ _null_ ));
+DESCR("SP-GiST support for quad tree over point");
+DATA(insert OID = 4020 ( spg_quad_picksplit PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_picksplit _null_ _null_ _null_ ));
+DESCR("SP-GiST support for quad tree over point");
+DATA(insert OID = 4021 ( spg_quad_inner_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_inner_consistent _null_ _null_ _null_ ));
+DESCR("SP-GiST support for quad tree over point");
+DATA(insert OID = 4022 ( spg_quad_leaf_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ spg_quad_leaf_consistent _null_ _null_ _null_ ));
+DESCR("SP-GiST support for quad tree and k-d tree over point");
+
+DATA(insert OID = 4023 ( spg_kd_config PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_config _null_ _null_ _null_ ));
+DESCR("SP-GiST support for k-d tree over point");
+DATA(insert OID = 4024 ( spg_kd_choose PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_choose _null_ _null_ _null_ ));
+DESCR("SP-GiST support for k-d tree over point");
+DATA(insert OID = 4025 ( spg_kd_picksplit PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_picksplit _null_ _null_ _null_ ));
+DESCR("SP-GiST support for k-d tree over point");
+DATA(insert OID = 4026 ( spg_kd_inner_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_kd_inner_consistent _null_ _null_ _null_ ));
+DESCR("SP-GiST support for k-d tree over point");
+
+DATA(insert OID = 4027 ( spg_text_config PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_config _null_ _null_ _null_ ));
+DESCR("SP-GiST support for suffix tree over text");
+DATA(insert OID = 4028 ( spg_text_choose PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_choose _null_ _null_ _null_ ));
+DESCR("SP-GiST support for suffix tree over text");
+DATA(insert OID = 4029 ( spg_text_picksplit PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_picksplit _null_ _null_ _null_ ));
+DESCR("SP-GiST support for suffix tree over text");
+DATA(insert OID = 4030 ( spg_text_inner_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 2278 "2281 2281" _null_ _null_ _null_ _null_ spg_text_inner_consistent _null_ _null_ _null_ ));
+DESCR("SP-GiST support for suffix tree over text");
+DATA(insert OID = 4031 ( spg_text_leaf_consistent PGNSP PGUID 12 1 0 0 0 f f f t f i 2 0 16 "2281 2281" _null_ _null_ _null_ _null_ spg_text_leaf_consistent _null_ _null_ _null_ ));
+DESCR("SP-GiST support for suffix tree over text");
+
/*
* Symbolic values for provolatile column: these indicate whether the result
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 994dc5368b1..9c5af5960fd 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -1080,6 +1080,26 @@ extern Datum window_first_value(PG_FUNCTION_ARGS);
extern Datum window_last_value(PG_FUNCTION_ARGS);
extern Datum window_nth_value(PG_FUNCTION_ARGS);
+/* access/spgist/spgquadtreeproc.c */
+extern Datum spg_quad_config(PG_FUNCTION_ARGS);
+extern Datum spg_quad_choose(PG_FUNCTION_ARGS);
+extern Datum spg_quad_picksplit(PG_FUNCTION_ARGS);
+extern Datum spg_quad_inner_consistent(PG_FUNCTION_ARGS);
+extern Datum spg_quad_leaf_consistent(PG_FUNCTION_ARGS);
+
+/* access/spgist/spgkdtreeproc.c */
+extern Datum spg_kd_config(PG_FUNCTION_ARGS);
+extern Datum spg_kd_choose(PG_FUNCTION_ARGS);
+extern Datum spg_kd_picksplit(PG_FUNCTION_ARGS);
+extern Datum spg_kd_inner_consistent(PG_FUNCTION_ARGS);
+
+/* access/spgist/spgtextproc.c */
+extern Datum spg_text_config(PG_FUNCTION_ARGS);
+extern Datum spg_text_choose(PG_FUNCTION_ARGS);
+extern Datum spg_text_picksplit(PG_FUNCTION_ARGS);
+extern Datum spg_text_inner_consistent(PG_FUNCTION_ARGS);
+extern Datum spg_text_leaf_consistent(PG_FUNCTION_ARGS);
+
/* access/gin/ginarrayproc.c */
extern Datum ginarrayextract(PG_FUNCTION_ARGS);
extern Datum ginarrayextract_2args(PG_FUNCTION_ARGS);
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 32d14b60290..6afcbf47537 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -194,6 +194,7 @@ extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey,
extern Datum btcostestimate(PG_FUNCTION_ARGS);
extern Datum hashcostestimate(PG_FUNCTION_ARGS);
extern Datum gistcostestimate(PG_FUNCTION_ARGS);
+extern Datum spgcostestimate(PG_FUNCTION_ARGS);
extern Datum gincostestimate(PG_FUNCTION_ARGS);
#endif /* SELFUNCS_H */
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index bdd1f4ec78e..86cee2de942 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -61,6 +61,26 @@ CREATE TEMP TABLE gcircle_tbl AS
SELECT circle(home_base) AS f1 FROM slow_emp4000;
CREATE INDEX ggpolygonind ON gpolygon_tbl USING gist (f1);
CREATE INDEX ggcircleind ON gcircle_tbl USING gist (f1);
+--
+-- SP-GiST
+--
+CREATE TABLE quad_point_tbl AS
+ SELECT point(unique1,unique2) AS p FROM tenk1;
+INSERT INTO quad_point_tbl
+ SELECT '(333.0,400.0)'::point FROM generate_series(1,1000);
+CREATE INDEX sp_quad_ind ON quad_point_tbl USING spgist (p);
+CREATE TABLE kd_point_tbl AS SELECT * FROM quad_point_tbl;
+CREATE INDEX sp_kd_ind ON kd_point_tbl USING spgist (p kd_point_ops);
+CREATE TABLE suffix_text_tbl AS
+ SELECT name AS t FROM road;
+INSERT INTO suffix_text_tbl
+ SELECT '0123456789abcdef' FROM generate_series(1,1000);
+INSERT INTO suffix_text_tbl VALUES ('0123456789abcde');
+INSERT INTO suffix_text_tbl VALUES ('0123456789abcdefF');
+CREATE INDEX sp_suff_ind ON suffix_text_tbl USING spgist (t);
+--
+-- Test GiST and SP-GiST indexes
+--
-- get non-indexed results for comparison purposes
SET enable_seqscan = ON;
SET enable_indexscan = OFF;
@@ -207,22 +227,141 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0
(10,10)
(4 rows)
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ count
+-------
+ 1057
+(1 row)
+
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ count
+-------
+ 1057
+(1 row)
+
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+ count
+-------
+ 6000
+(1 row)
+
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+ count
+-------
+ 4999
+(1 row)
+
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+ count
+-------
+ 5000
+(1 row)
+
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+ count
+-------
+ 5999
+(1 row)
+
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+ count
+-------
+ 1
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+ count
+-------
+ 1000
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+ count
+-------
+ 1
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+ count
+-------
+ 1
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+ count
+-------
+ 1705
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+ count
+-------
+ 1705
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+ count
+-------
+ 1706
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+ count
+-------
+ 1706
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+ count
+-------
+ 1
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+ count
+-------
+ 2
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+ count
+-------
+ 50
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+ count
+-------
+ 50
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+ count
+-------
+ 48
+(1 row)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+ count
+-------
+ 48
+(1 row)
+
+-- Now check the results from plain indexscan
SET enable_seqscan = OFF;
SET enable_indexscan = ON;
-SET enable_bitmapscan = ON;
+SET enable_bitmapscan = OFF;
EXPLAIN (COSTS OFF)
SELECT * FROM fast_emp4000
WHERE home_base @ '(200,200),(2000,1000)'::box
ORDER BY (home_base[0])[0];
- QUERY PLAN
-----------------------------------------------------------------------
+ QUERY PLAN
+----------------------------------------------------------------
Sort
Sort Key: ((home_base[0])[0])
- -> Bitmap Heap Scan on fast_emp4000
- Recheck Cond: (home_base @ '(2000,1000),(200,200)'::box)
- -> Bitmap Index Scan on grect2ind
- Index Cond: (home_base @ '(2000,1000),(200,200)'::box)
-(6 rows)
+ -> Index Scan using grect2ind on fast_emp4000
+ Index Cond: (home_base @ '(2000,1000),(200,200)'::box)
+(4 rows)
SELECT * FROM fast_emp4000
WHERE home_base @ '(200,200),(2000,1000)'::box
@@ -235,14 +374,12 @@ SELECT * FROM fast_emp4000
EXPLAIN (COSTS OFF)
SELECT count(*) FROM fast_emp4000 WHERE home_base && '(1000,1000,0,0)'::box;
- QUERY PLAN
--------------------------------------------------------------------
+ QUERY PLAN
+-------------------------------------------------------------
Aggregate
- -> Bitmap Heap Scan on fast_emp4000
- Recheck Cond: (home_base && '(1000,1000),(0,0)'::box)
- -> Bitmap Index Scan on grect2ind
- Index Cond: (home_base && '(1000,1000),(0,0)'::box)
-(5 rows)
+ -> Index Scan using grect2ind on fast_emp4000
+ Index Cond: (home_base && '(1000,1000),(0,0)'::box)
+(3 rows)
SELECT count(*) FROM fast_emp4000 WHERE home_base && '(1000,1000,0,0)'::box;
count
@@ -252,14 +389,12 @@ SELECT count(*) FROM fast_emp4000 WHERE home_base && '(1000,1000,0,0)'::box;
EXPLAIN (COSTS OFF)
SELECT count(*) FROM fast_emp4000 WHERE home_base IS NULL;
- QUERY PLAN
------------------------------------------------
+ QUERY PLAN
+--------------------------------------------------
Aggregate
- -> Bitmap Heap Scan on fast_emp4000
- Recheck Cond: (home_base IS NULL)
- -> Bitmap Index Scan on grect2ind
- Index Cond: (home_base IS NULL)
-(5 rows)
+ -> Index Scan using grect2ind on fast_emp4000
+ Index Cond: (home_base IS NULL)
+(3 rows)
SELECT count(*) FROM fast_emp4000 WHERE home_base IS NULL;
count
@@ -308,14 +443,12 @@ SELECT * FROM circle_tbl WHERE f1 && circle(point(1,-2), 1)
EXPLAIN (COSTS OFF)
SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon;
- QUERY PLAN
-------------------------------------------------------------------
+ QUERY PLAN
+------------------------------------------------------------
Aggregate
- -> Bitmap Heap Scan on gpolygon_tbl
- Recheck Cond: (f1 && '((1000,1000),(0,0))'::polygon)
- -> Bitmap Index Scan on ggpolygonind
- Index Cond: (f1 && '((1000,1000),(0,0))'::polygon)
-(5 rows)
+ -> Index Scan using ggpolygonind on gpolygon_tbl
+ Index Cond: (f1 && '((1000,1000),(0,0))'::polygon)
+(3 rows)
SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon;
count
@@ -325,14 +458,12 @@ SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon;
EXPLAIN (COSTS OFF)
SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle;
- QUERY PLAN
--------------------------------------------------------------
+ QUERY PLAN
+-------------------------------------------------------
Aggregate
- -> Bitmap Heap Scan on gcircle_tbl
- Recheck Cond: (f1 && '<(500,500),500>'::circle)
- -> Bitmap Index Scan on ggcircleind
- Index Cond: (f1 && '<(500,500),500>'::circle)
-(5 rows)
+ -> Index Scan using ggcircleind on gcircle_tbl
+ Index Cond: (f1 && '<(500,500),500>'::circle)
+(3 rows)
SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle;
count
@@ -547,6 +678,412 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0
(10,10)
(4 rows)
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: (p <@ '(1000,1000),(200,200)'::box)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: ('(1000,1000),(200,200)'::box @> p)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: (p << '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+ count
+-------
+ 6000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: (p >> '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+ count
+-------
+ 4999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: (p <^ '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+ count
+-------
+ 5000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: (p >^ '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+ count
+-------
+ 5999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_quad_ind on quad_point_tbl
+ Index Cond: (p ~= '(4585,365)'::point)
+(3 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: (p <@ '(1000,1000),(200,200)'::box)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: ('(1000,1000),(200,200)'::box @> p)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+ QUERY PLAN
+--------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: (p << '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+ count
+-------
+ 6000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+ QUERY PLAN
+--------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: (p >> '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+ count
+-------
+ 4999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+ QUERY PLAN
+--------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: (p <^ '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+ count
+-------
+ 5000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+ QUERY PLAN
+--------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: (p >^ '(5000,4000)'::point)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+ count
+-------
+ 5999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+ QUERY PLAN
+--------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_kd_ind on kd_point_tbl
+ Index Cond: (p ~= '(4585,365)'::point)
+(3 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t = '0123456789abcdef'::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+ count
+-------
+ 1000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t = '0123456789abcde'::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t = '0123456789abcdefF'::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+ QUERY PLAN
+----------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t < 'Aztec Ct '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+ count
+-------
+ 1705
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+ QUERY PLAN
+------------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t ~<~ 'Aztec Ct '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+ count
+-------
+ 1705
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+ QUERY PLAN
+-----------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t <= 'Aztec Ct '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+ count
+-------
+ 1706
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t ~<=~ 'Aztec Ct '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+ count
+-------
+ 1706
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+ QUERY PLAN
+----------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t = 'Aztec Ct '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+ QUERY PLAN
+----------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t = 'Worth St '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+ count
+-------
+ 2
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+ QUERY PLAN
+-----------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t >= 'Worth St '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+ count
+-------
+ 50
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t ~>=~ 'Worth St '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+ count
+-------
+ 50
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+ QUERY PLAN
+----------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t > 'Worth St '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+ count
+-------
+ 48
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+ QUERY PLAN
+------------------------------------------------------------------------
+ Aggregate
+ -> Index Scan using sp_suff_ind on suffix_text_tbl
+ Index Cond: (t ~>~ 'Worth St '::text)
+(3 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+ count
+-------
+ 48
+(1 row)
+
+-- Now check the results from bitmap indexscan
SET enable_seqscan = OFF;
SET enable_indexscan = OFF;
SET enable_bitmapscan = ON;
@@ -571,6 +1108,465 @@ SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0
(10,10)
(4 rows)
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ QUERY PLAN
+---------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: (p <@ '(1000,1000),(200,200)'::box)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: (p <@ '(1000,1000),(200,200)'::box)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ QUERY PLAN
+---------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: ('(1000,1000),(200,200)'::box @> p)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: ('(1000,1000),(200,200)'::box @> p)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: (p << '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: (p << '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+ count
+-------
+ 6000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: (p >> '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: (p >> '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+ count
+-------
+ 4999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: (p <^ '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: (p <^ '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+ count
+-------
+ 5000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: (p >^ '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: (p >^ '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+ count
+-------
+ 5999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on quad_point_tbl
+ Recheck Cond: (p ~= '(4585,365)'::point)
+ -> Bitmap Index Scan on sp_quad_ind
+ Index Cond: (p ~= '(4585,365)'::point)
+(5 rows)
+
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ QUERY PLAN
+---------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: (p <@ '(1000,1000),(200,200)'::box)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: (p <@ '(1000,1000),(200,200)'::box)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ QUERY PLAN
+---------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: ('(1000,1000),(200,200)'::box @> p)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: ('(1000,1000),(200,200)'::box @> p)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+ count
+-------
+ 1057
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: (p << '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: (p << '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+ count
+-------
+ 6000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: (p >> '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: (p >> '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+ count
+-------
+ 4999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: (p <^ '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: (p <^ '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+ count
+-------
+ 5000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+ QUERY PLAN
+-------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: (p >^ '(5000,4000)'::point)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: (p >^ '(5000,4000)'::point)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+ count
+-------
+ 5999
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+ QUERY PLAN
+------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on kd_point_tbl
+ Recheck Cond: (p ~= '(4585,365)'::point)
+ -> Bitmap Index Scan on sp_kd_ind
+ Index Cond: (p ~= '(4585,365)'::point)
+(5 rows)
+
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+ QUERY PLAN
+----------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t = '0123456789abcdef'::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t = '0123456789abcdef'::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+ count
+-------
+ 1000
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+ QUERY PLAN
+---------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t = '0123456789abcde'::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t = '0123456789abcde'::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+ QUERY PLAN
+-----------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t = '0123456789abcdefF'::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t = '0123456789abcdefF'::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+ QUERY PLAN
+----------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t < 'Aztec Ct '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t < 'Aztec Ct '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+ count
+-------
+ 1705
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+ QUERY PLAN
+------------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t ~<~ 'Aztec Ct '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t ~<~ 'Aztec Ct '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+ count
+-------
+ 1705
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t <= 'Aztec Ct '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t <= 'Aztec Ct '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+ count
+-------
+ 1706
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+ QUERY PLAN
+-------------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t ~<=~ 'Aztec Ct '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t ~<=~ 'Aztec Ct '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+ count
+-------
+ 1706
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+ QUERY PLAN
+----------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t = 'Aztec Ct '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t = 'Aztec Ct '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+ count
+-------
+ 1
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+ QUERY PLAN
+----------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t = 'Worth St '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t = 'Worth St '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+ count
+-------
+ 2
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+ QUERY PLAN
+-----------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t >= 'Worth St '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t >= 'Worth St '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+ count
+-------
+ 50
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+ QUERY PLAN
+-------------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t ~>=~ 'Worth St '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t ~>=~ 'Worth St '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+ count
+-------
+ 50
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+ QUERY PLAN
+----------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t > 'Worth St '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t > 'Worth St '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+ count
+-------
+ 48
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+ QUERY PLAN
+------------------------------------------------------------------------------
+ Aggregate
+ -> Bitmap Heap Scan on suffix_text_tbl
+ Recheck Cond: (t ~>~ 'Worth St '::text)
+ -> Bitmap Index Scan on sp_suff_ind
+ Index Cond: (t ~>~ 'Worth St '::text)
+(5 rows)
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+ count
+-------
+ 48
+(1 row)
+
RESET enable_seqscan;
RESET enable_indexscan;
RESET enable_bitmapscan;
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index a0ffd77e0ed..8e4004ed311 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -1053,7 +1053,22 @@ ORDER BY 1, 2, 3;
2742 | 2 | @@@
2742 | 3 | <@
2742 | 4 | =
-(43 rows)
+ 4000 | 1 | <<
+ 4000 | 1 | ~<~
+ 4000 | 2 | ~<=~
+ 4000 | 3 | =
+ 4000 | 4 | ~>=~
+ 4000 | 5 | >>
+ 4000 | 5 | ~>~
+ 4000 | 6 | ~=
+ 4000 | 8 | <@
+ 4000 | 10 | <^
+ 4000 | 11 | <
+ 4000 | 11 | >^
+ 4000 | 12 | <=
+ 4000 | 14 | >=
+ 4000 | 15 | >
+(58 rows)
-- Check that all opclass search operators have selectivity estimators.
-- This is not absolutely required, but it seems a reasonable thing
@@ -1077,6 +1092,24 @@ WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS p2
---------+-----------
(0 rows)
+-- Check that each operator listed in pg_amop has an associated opclass,
+-- that is one whose opcintype matches oprleft (possibly by coercion).
+-- Otherwise the operator is useless because it cannot be matched to an index.
+-- (In principle it could be useful to list such operators in multiple-datatype
+-- btree opfamilies, but in practice you'd expect there to be an opclass for
+-- every datatype the family knows about.)
+SELECT p1.amopfamily, p1.amopstrategy, p1.amopopr
+FROM pg_amop AS p1
+WHERE NOT EXISTS(SELECT 1 FROM pg_opclass AS p2
+ WHERE p2.opcfamily = p1.amopfamily
+ AND binary_coercible(p2.opcintype, p1.amoplefttype));
+ amopfamily | amopstrategy | amopopr
+------------+--------------+---------
+ 1029 | 27 | 433
+ 1029 | 47 | 757
+ 1029 | 67 | 759
+(3 rows)
+
-- Operators that are primary members of opclasses must be immutable (else
-- it suggests that the index ordering isn't fixed). Operators that are
-- cross-type members need only be stable, since they are just shorthands
@@ -1297,6 +1330,27 @@ ORDER BY 1;
2226 | 1 | hashint4 | cid_ops
(6 rows)
+-- We can also check SP-GiST carefully, since the support routine signatures
+-- are independent of the datatype being indexed.
+SELECT p1.amprocfamily, p1.amprocnum,
+ p2.oid, p2.proname,
+ p3.opfname
+FROM pg_amproc AS p1, pg_proc AS p2, pg_opfamily AS p3
+WHERE p3.opfmethod = (SELECT oid FROM pg_am WHERE amname = 'spgist')
+ AND p1.amprocfamily = p3.oid AND p1.amproc = p2.oid AND
+ (CASE WHEN amprocnum = 1 OR amprocnum = 2 OR amprocnum = 3 OR amprocnum = 4
+ THEN prorettype != 'void'::regtype OR proretset OR pronargs != 2
+ OR proargtypes[0] != 'internal'::regtype
+ OR proargtypes[1] != 'internal'::regtype
+ WHEN amprocnum = 5
+ THEN prorettype != 'bool'::regtype OR proretset OR pronargs != 2
+ OR proargtypes[0] != 'internal'::regtype
+ OR proargtypes[1] != 'internal'::regtype
+ ELSE true END);
+ amprocfamily | amprocnum | oid | proname | opfname
+--------------+-----------+-----+---------+---------
+(0 rows)
+
-- Support routines that are primary members of opfamilies must be immutable
-- (else it suggests that the index ordering isn't fixed). But cross-type
-- members need only be stable, since they are just shorthands
diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out
index cb468e58b91..9cae9d8bf10 100644
--- a/src/test/regress/expected/sanity_check.out
+++ b/src/test/regress/expected/sanity_check.out
@@ -63,6 +63,7 @@ SELECT relname, relhasindex
int8_tbl | f
interval_tbl | f
iportaltest | f
+ kd_point_tbl | t
log_table | f
lseg_tbl | f
main_table | f
@@ -134,6 +135,7 @@ SELECT relname, relhasindex
pg_user_mapping | t
point_tbl | t
polygon_tbl | t
+ quad_point_tbl | t
ramp | f
real_city | f
reltime_tbl | f
@@ -149,6 +151,7 @@ SELECT relname, relhasindex
sql_sizing_profiles | f
stud_emp | f
student | f
+ suffix_text_tbl | t
tenk1 | t
tenk2 | t
test_range_excl | t
@@ -161,7 +164,7 @@ SELECT relname, relhasindex
timetz_tbl | f
tinterval_tbl | f
varchar_tbl | f
-(150 rows)
+(153 rows)
--
-- another sanity check: every system catalog that has OIDs should have
diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source
index 45bc926407d..b57c5546ded 100644
--- a/src/test/regress/output/misc.source
+++ b/src/test/regress/output/misc.source
@@ -636,6 +636,7 @@ SELECT user_relns() AS user_relns
int8_tbl
interval_tbl
iportaltest
+ kd_point_tbl
log_table
lseg_tbl
main_table
@@ -657,6 +658,7 @@ SELECT user_relns() AS user_relns
person
point_tbl
polygon_tbl
+ quad_point_tbl
ramp
random_tbl
real_city
@@ -668,6 +670,7 @@ SELECT user_relns() AS user_relns
stud_emp
student
subselect_tbl
+ suffix_text_tbl
tenk1
tenk2
test_range_excl
@@ -682,7 +685,7 @@ SELECT user_relns() AS user_relns
toyemp
varchar_tbl
xacttest
-(104 rows)
+(107 rows)
SELECT name(equipment(hobby_construct(text 'skywalking', text 'mer')));
name
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index 85cf23ccb8f..babde51d2c3 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -92,6 +92,36 @@ CREATE INDEX ggpolygonind ON gpolygon_tbl USING gist (f1);
CREATE INDEX ggcircleind ON gcircle_tbl USING gist (f1);
+--
+-- SP-GiST
+--
+
+CREATE TABLE quad_point_tbl AS
+ SELECT point(unique1,unique2) AS p FROM tenk1;
+
+INSERT INTO quad_point_tbl
+ SELECT '(333.0,400.0)'::point FROM generate_series(1,1000);
+
+CREATE INDEX sp_quad_ind ON quad_point_tbl USING spgist (p);
+
+CREATE TABLE kd_point_tbl AS SELECT * FROM quad_point_tbl;
+
+CREATE INDEX sp_kd_ind ON kd_point_tbl USING spgist (p kd_point_ops);
+
+CREATE TABLE suffix_text_tbl AS
+ SELECT name AS t FROM road;
+
+INSERT INTO suffix_text_tbl
+ SELECT '0123456789abcdef' FROM generate_series(1,1000);
+INSERT INTO suffix_text_tbl VALUES ('0123456789abcde');
+INSERT INTO suffix_text_tbl VALUES ('0123456789abcdefF');
+
+CREATE INDEX sp_suff_ind ON suffix_text_tbl USING spgist (t);
+
+--
+-- Test GiST and SP-GiST indexes
+--
+
-- get non-indexed results for comparison purposes
SET enable_seqscan = ON;
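
[Note] Of the three indexes created above, only sp_kd_ind names its operator class (kd_point_ops) explicitly; sp_quad_ind and sp_suff_ind rely on the default SP-GiST opclasses for point and text. A quick way to double-check what each index was built with, sketched against the standard pg_indexes view:

    SELECT indexname, indexdef
    FROM pg_indexes
    WHERE indexname IN ('sp_quad_ind', 'sp_kd_ind', 'sp_suff_ind');
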
@@ -142,9 +172,50 @@ SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1';
SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1';
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+
+-- Now check the results from plain indexscan
SET enable_seqscan = OFF;
SET enable_indexscan = ON;
-SET enable_bitmapscan = ON;
+SET enable_bitmapscan = OFF;
EXPLAIN (COSTS OFF)
SELECT * FROM fast_emp4000
@@ -234,6 +305,115 @@ EXPLAIN (COSTS OFF)
SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1';
SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1';
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+
+-- Now check the results from bitmap indexscan
SET enable_seqscan = OFF;
SET enable_indexscan = OFF;
SET enable_bitmapscan = ON;
@@ -242,6 +422,114 @@ EXPLAIN (COSTS OFF)
SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1';
SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1';
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p << '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p >> '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p <^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+SELECT count(*) FROM quad_point_tbl WHERE p >^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p;
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p << '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p >> '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p <^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+SELECT count(*) FROM kd_point_tbl WHERE p >^ '(5000, 4000)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+SELECT count(*) FROM kd_point_tbl WHERE p ~= '(4585, 365)';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdef';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcde';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+SELECT count(*) FROM suffix_text_tbl WHERE t = '0123456789abcdefF';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t < 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<~ 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t <= 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~<=~ 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Aztec Ct ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t = 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t >= 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>=~ 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t > 'Worth St ';
+
+EXPLAIN (COSTS OFF)
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+SELECT count(*) FROM suffix_text_tbl WHERE t ~>~ 'Worth St ';
+
RESET enable_seqscan;
RESET enable_indexscan;
RESET enable_bitmapscan;
diff --git a/src/test/regress/sql/opr_sanity.sql b/src/test/regress/sql/opr_sanity.sql
index 6a79ea180c1..e29148fd5bd 100644
--- a/src/test/regress/sql/opr_sanity.sql
+++ b/src/test/regress/sql/opr_sanity.sql
@@ -831,6 +831,19 @@ WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS p2
WHERE p2.amopfamily = p1.opcfamily
AND binary_coercible(p1.opcintype, p2.amoplefttype));
+-- Check that each operator listed in pg_amop has an associated opclass,
+-- that is one whose opcintype matches oprleft (possibly by coercion).
+-- Otherwise the operator is useless because it cannot be matched to an index.
+-- (In principle it could be useful to list such operators in multiple-datatype
+-- btree opfamilies, but in practice you'd expect there to be an opclass for
+-- every datatype the family knows about.)
+
+SELECT p1.amopfamily, p1.amopstrategy, p1.amopopr
+FROM pg_amop AS p1
+WHERE NOT EXISTS(SELECT 1 FROM pg_opclass AS p2
+ WHERE p2.opcfamily = p1.amopfamily
+ AND binary_coercible(p2.opcintype, p1.amoplefttype));
+
-- Operators that are primary members of opclasses must be immutable (else
-- it suggests that the index ordering isn't fixed). Operators that are
-- cross-type members need only be stable, since they are just shorthands
@@ -1018,6 +1031,25 @@ WHERE p3.opfmethod = (SELECT oid FROM pg_am WHERE amname = 'hash')
OR amproclefttype != amprocrighttype)
ORDER BY 1;
+-- We can also check SP-GiST carefully, since the support routine signatures
+-- are independent of the datatype being indexed.
+
+SELECT p1.amprocfamily, p1.amprocnum,
+ p2.oid, p2.proname,
+ p3.opfname
+FROM pg_amproc AS p1, pg_proc AS p2, pg_opfamily AS p3
+WHERE p3.opfmethod = (SELECT oid FROM pg_am WHERE amname = 'spgist')
+ AND p1.amprocfamily = p3.oid AND p1.amproc = p2.oid AND
+ (CASE WHEN amprocnum = 1 OR amprocnum = 2 OR amprocnum = 3 OR amprocnum = 4
+ THEN prorettype != 'void'::regtype OR proretset OR pronargs != 2
+ OR proargtypes[0] != 'internal'::regtype
+ OR proargtypes[1] != 'internal'::regtype
+ WHEN amprocnum = 5
+ THEN prorettype != 'bool'::regtype OR proretset OR pronargs != 2
+ OR proargtypes[0] != 'internal'::regtype
+ OR proargtypes[1] != 'internal'::regtype
+ ELSE true END);
+
-- Support routines that are primary members of opfamilies must be immutable
-- (else it suggests that the index ordering isn't fixed). But cross-type
-- members need only be stable, since they are just shorthands