aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/gist/gistbuild.c
diff options
context:
space:
mode:
authorAlexander Korotkov <akorotkov@postgresql.org>2022-02-07 23:20:42 +0300
committerAlexander Korotkov <akorotkov@postgresql.org>2022-02-07 23:20:42 +0300
commitf1ea98a7975e15cefdb446385880a2f55224ee7d (patch)
tree8efc57dc7b2480397fc838baa0dd328630f611b6 /src/backend/access/gist/gistbuild.c
parent42a9e88bf6a809e6023c9d50f58cc6b9446f229d (diff)
downloadpostgresql-f1ea98a7975e15cefdb446385880a2f55224ee7d.tar.gz
postgresql-f1ea98a7975e15cefdb446385880a2f55224ee7d.zip
Reduce non-leaf keys overlap in GiST indexes produced by a sorted build
The GiST sorted build currently chooses split points according to the only page space utilization. That may lead to higher non-leaf keys overlap and, in turn, slower search query answers. This commit makes the sorted build use the opclass's picksplit method. Once four pages at the level are accumulated, the picksplit method is applied until each split partition fits the page. Some of our split algorithms could show significant performance degradation while processing 4-times more data at once. But those opclasses haven't received the sorted build support and shouldn't receive it before their split algorithms are improved. Discussion: https://postgr.es/m/CAHqSB9jqtS94e9%3D0vxqQX5dxQA89N95UKyz-%3DA7Y%2B_YJt%2BVW5A%40mail.gmail.com Author: Aliaksandr Kalenik, Sergei Shoulbakov, Andrey Borodin Reviewed-by: Björn Harrtell, Darafei Praliaskouski, Andres Freund Reviewed-by: Alexander Korotkov
Diffstat (limited to 'src/backend/access/gist/gistbuild.c')
-rw-r--r--src/backend/access/gist/gistbuild.c271
1 files changed, 172 insertions, 99 deletions
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 9854116fca5..4db896a533d 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -113,16 +113,23 @@ typedef struct
Page ready_pages[XLR_MAX_BLOCK_ID];
} GISTBuildState;
+#define GIST_SORTED_BUILD_PAGE_NUM 4
+
/*
* In sorted build, we use a stack of these structs, one for each level,
- * to hold an in-memory buffer of the rightmost page at the level. When the
- * page fills up, it is written out and a new page is allocated.
+ * to hold an in-memory buffer of last pages at the level.
+ *
+ * Sorting GiST build requires good linearization of the sort opclass. This is
+ * not always the case in multidimensional data. To tackle the anomalies, we
+ * buffer index tuples and apply picksplit that can be multidimension-aware.
*/
-typedef struct GistSortedBuildPageState
+typedef struct GistSortedBuildLevelState
{
- Page page;
- struct GistSortedBuildPageState *parent; /* Upper level, if any */
-} GistSortedBuildPageState;
+ int current_page;
+ BlockNumber last_blkno;
+ struct GistSortedBuildLevelState *parent; /* Upper level, if any */
+ Page pages[GIST_SORTED_BUILD_PAGE_NUM];
+} GistSortedBuildLevelState;
/* prototypes for private functions */
@@ -130,11 +137,11 @@ static void gistSortedBuildCallback(Relation index, ItemPointer tid,
Datum *values, bool *isnull,
bool tupleIsAlive, void *state);
static void gist_indexsortbuild(GISTBuildState *state);
-static void gist_indexsortbuild_pagestate_add(GISTBuildState *state,
- GistSortedBuildPageState *pagestate,
- IndexTuple itup);
-static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
- GistSortedBuildPageState *pagestate);
+static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
+ GistSortedBuildLevelState *levelstate,
+ IndexTuple itup);
+static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
+ GistSortedBuildLevelState *levelstate);
static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
static void gistInitBuffering(GISTBuildState *buildstate);
@@ -396,8 +403,7 @@ static void
gist_indexsortbuild(GISTBuildState *state)
{
IndexTuple itup;
- GistSortedBuildPageState *leafstate;
- GistSortedBuildPageState *pagestate;
+ GistSortedBuildLevelState *levelstate;
Page page;
state->pages_allocated = 0;
@@ -414,10 +420,10 @@ gist_indexsortbuild(GISTBuildState *state)
state->pages_allocated++;
state->pages_written++;
- /* Allocate a temporary buffer for the first leaf page. */
- leafstate = palloc(sizeof(GistSortedBuildPageState));
- leafstate->page = page;
- leafstate->parent = NULL;
+ /* Allocate a temporary buffer for the first leaf page batch. */
+ levelstate = palloc0(sizeof(GistSortedBuildLevelState));
+ levelstate->pages[0] = page;
+ levelstate->parent = NULL;
gistinitpage(page, F_LEAF);
/*
@@ -425,133 +431,200 @@ gist_indexsortbuild(GISTBuildState *state)
*/
while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
{
- gist_indexsortbuild_pagestate_add(state, leafstate, itup);
+ gist_indexsortbuild_levelstate_add(state, levelstate, itup);
MemoryContextReset(state->giststate->tempCxt);
}
/*
* Write out the partially full non-root pages.
*
- * Keep in mind that flush can build a new root.
+ * Keep in mind that flush can build a new root. If number of pages is > 1
+ * then new root is required.
*/
- pagestate = leafstate;
- while (pagestate->parent != NULL)
+ while (levelstate->parent != NULL || levelstate->current_page != 0)
{
- GistSortedBuildPageState *parent;
-
- gist_indexsortbuild_pagestate_flush(state, pagestate);
- parent = pagestate->parent;
- pfree(pagestate->page);
- pfree(pagestate);
- pagestate = parent;
+ GistSortedBuildLevelState *parent;
+
+ gist_indexsortbuild_levelstate_flush(state, levelstate);
+ parent = levelstate->parent;
+ for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
+ if (levelstate->pages[i])
+ pfree(levelstate->pages[i]);
+ pfree(levelstate);
+ levelstate = parent;
}
gist_indexsortbuild_flush_ready_pages(state);
/* Write out the root */
- PageSetLSN(pagestate->page, GistBuildLSN);
- PageSetChecksumInplace(pagestate->page, GIST_ROOT_BLKNO);
+ PageSetLSN(levelstate->pages[0], GistBuildLSN);
+ PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
- pagestate->page, true);
+ levelstate->pages[0], true);
if (RelationNeedsWAL(state->indexrel))
log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO,
- pagestate->page, true);
+ levelstate->pages[0], true);
- pfree(pagestate->page);
- pfree(pagestate);
+ pfree(levelstate->pages[0]);
+ pfree(levelstate);
}
/*
- * Add tuple to a page. If the pages is full, write it out and re-initialize
+ * Add tuple to a page. If the pages are full, write them out and re-initialize
* a new page first.
*/
static void
-gist_indexsortbuild_pagestate_add(GISTBuildState *state,
- GistSortedBuildPageState *pagestate,
- IndexTuple itup)
+gist_indexsortbuild_levelstate_add(GISTBuildState *state,
+ GistSortedBuildLevelState *levelstate,
+ IndexTuple itup)
{
Size sizeNeeded;
- /* Does the tuple fit? If not, flush */
- sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace;
- if (PageGetFreeSpace(pagestate->page) < sizeNeeded)
- gist_indexsortbuild_pagestate_flush(state, pagestate);
+ /* Check if tuple can be added to the current page */
+ sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
+ if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
+ {
+ Page newPage;
+ Page old_page = levelstate->pages[levelstate->current_page];
+ uint16 old_page_flags = GistPageGetOpaque(old_page)->flags;
- gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber);
+ if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
+ {
+ gist_indexsortbuild_levelstate_flush(state, levelstate);
+ }
+ else
+ levelstate->current_page++;
+
+ if (levelstate->pages[levelstate->current_page] == NULL)
+ levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
+
+ newPage = levelstate->pages[levelstate->current_page];
+ gistinitpage(newPage, old_page_flags);
+ }
+
+ gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
}
static void
-gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
- GistSortedBuildPageState *pagestate)
+gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
+ GistSortedBuildLevelState *levelstate)
{
- GistSortedBuildPageState *parent;
- IndexTuple *itvec;
- IndexTuple union_tuple;
- int vect_len;
- bool isleaf;
+ GistSortedBuildLevelState *parent;
BlockNumber blkno;
MemoryContext oldCtx;
+ IndexTuple union_tuple;
+ SplitedPageLayout *dist;
+ IndexTuple *itvec;
+ int vect_len;
+ bool isleaf = GistPageIsLeaf(levelstate->pages[0]);
- /* check once per page */
CHECK_FOR_INTERRUPTS();
- if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
- gist_indexsortbuild_flush_ready_pages(state);
+ oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
- /*
- * The page is now complete. Assign a block number to it, and add it to
- * the list of finished pages. (We don't write it out immediately, because
- * we want to WAL-log the pages in batches.)
- */
- blkno = state->pages_allocated++;
- state->ready_blknos[state->ready_num_pages] = blkno;
- state->ready_pages[state->ready_num_pages] = pagestate->page;
- state->ready_num_pages++;
+ /* Get index tuples from first page */
+ itvec = gistextractpage(levelstate->pages[0], &vect_len);
+ if (levelstate->current_page > 0)
+ {
+ /* Append tuples from each page */
+ for (int i = 1; i < levelstate->current_page + 1; i++)
+ {
+ int len_local;
+ IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
- isleaf = GistPageIsLeaf(pagestate->page);
+ itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
+ pfree(itvec_local);
+ }
+
+ /* Apply picksplit to list of all collected tuples */
+ dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
+ }
+ else
+ {
+ /* Create splitted layout from single page */
+ dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
+ union_tuple = gistunion(state->indexrel, itvec, vect_len,
+ state->giststate);
+ dist->itup = union_tuple;
+ dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
+ dist->block.num = vect_len;
+ }
- /*
- * Form a downlink tuple to represent all the tuples on the page.
- */
- oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
- itvec = gistextractpage(pagestate->page, &vect_len);
- union_tuple = gistunion(state->indexrel, itvec, vect_len,
- state->giststate);
- ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
MemoryContextSwitchTo(oldCtx);
- /*
- * Insert the downlink to the parent page. If this was the root, create a
- * new page as the parent, which becomes the new root.
- */
- parent = pagestate->parent;
- if (parent == NULL)
+ /* Reset page counter */
+ levelstate->current_page = 0;
+
+ /* Create pages for all partitions in split result */
+ for (; dist != NULL; dist = dist->next)
{
- parent = palloc(sizeof(GistSortedBuildPageState));
- parent->page = (Page) palloc(BLCKSZ);
- parent->parent = NULL;
- gistinitpage(parent->page, 0);
+ char *data;
+ Page target;
- pagestate->parent = parent;
- }
- gist_indexsortbuild_pagestate_add(state, parent, union_tuple);
+ /* check once per page */
+ CHECK_FOR_INTERRUPTS();
- /* Re-initialize the page buffer for next page on this level. */
- pagestate->page = palloc(BLCKSZ);
- gistinitpage(pagestate->page, isleaf ? F_LEAF : 0);
+ /* Create page and copy data */
+ data = (char *) (dist->list);
+ target = palloc0(BLCKSZ);
+ gistinitpage(target, isleaf ? F_LEAF : 0);
+ for (int i = 0; i < dist->block.num; i++)
+ {
+ IndexTuple thistup = (IndexTuple) data;
- /*
- * Set the right link to point to the previous page. This is just for
- * debugging purposes: GiST only follows the right link if a page is split
- * concurrently to a scan, and that cannot happen during index build.
- *
- * It's a bit counterintuitive that we set the right link on the new page
- * to point to the previous page, and not the other way round. But GiST
- * pages are not ordered like B-tree pages are, so as long as the
- * right-links form a chain through all the pages in the same level, the
- * order doesn't matter.
- */
- GistPageGetOpaque(pagestate->page)->rightlink = blkno;
+ if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
+
+ data += IndexTupleSize(thistup);
+ }
+ union_tuple = dist->itup;
+
+ if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
+ gist_indexsortbuild_flush_ready_pages(state);
+
+ /*
+ * The page is now complete. Assign a block number to it, and add it
+ * to the list of finished pages. (We don't write it out immediately,
+ * because we want to WAL-log the pages in batches.)
+ */
+ blkno = state->pages_allocated++;
+ state->ready_blknos[state->ready_num_pages] = blkno;
+ state->ready_pages[state->ready_num_pages] = target;
+ state->ready_num_pages++;
+ ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
+
+ /*
+ * Set the right link to point to the previous page. This is just for
+ * debugging purposes: GiST only follows the right link if a page is
+ * split concurrently to a scan, and that cannot happen during index
+ * build.
+ *
+ * It's a bit counterintuitive that we set the right link on the new
+ * page to point to the previous page, not the other way around. But
+ * GiST pages are not ordered like B-tree pages are, so as long as the
+ * right-links form a chain through all the pages at the same level,
+ * the order doesn't matter.
+ */
+ if (levelstate->last_blkno)
+ GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
+ levelstate->last_blkno = blkno;
+
+ /*
+ * Insert the downlink to the parent page. If this was the root,
+ * create a new page as the parent, which becomes the new root.
+ */
+ parent = levelstate->parent;
+ if (parent == NULL)
+ {
+ parent = palloc0(sizeof(GistSortedBuildLevelState));
+ parent->pages[0] = (Page) palloc(BLCKSZ);
+ parent->parent = NULL;
+ gistinitpage(parent->pages[0], 0);
+
+ levelstate->parent = parent;
+ }
+ gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
+ }
}
static void