diff options
Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/gin/README | 123 | ||||
-rw-r--r-- | src/backend/access/gin/ginbtree.c | 73 | ||||
-rw-r--r-- | src/backend/access/gin/gindatapage.c | 1450 | ||||
-rw-r--r-- | src/backend/access/gin/ginentrypage.c | 134 | ||||
-rw-r--r-- | src/backend/access/gin/ginfast.c | 2 | ||||
-rw-r--r-- | src/backend/access/gin/ginget.c | 117 | ||||
-rw-r--r-- | src/backend/access/gin/gininsert.c | 67 | ||||
-rw-r--r-- | src/backend/access/gin/ginpostinglist.c | 386 | ||||
-rw-r--r-- | src/backend/access/gin/ginvacuum.c | 232 | ||||
-rw-r--r-- | src/backend/access/gin/ginxlog.c | 184 | ||||
-rw-r--r-- | src/backend/access/rmgrdesc/gindesc.c | 45 |
11 files changed, 2139 insertions, 674 deletions
diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index 1c1c7b64336..3f0c3e2de86 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -135,15 +135,15 @@ same category of null entry are merged into one index entry just as happens with ordinary key entries. * In a key entry at the btree leaf level, at the next SHORTALIGN boundary, -there is an array of zero or more ItemPointers, which store the heap tuple -TIDs for which the indexable items contain this key. This is called the -"posting list". The TIDs in a posting list must appear in sorted order. -If the list would be too big for the index tuple to fit on an index page, -the ItemPointers are pushed out to a separate posting page or pages, and -none appear in the key entry itself. The separate pages are called a -"posting tree"; they are organized as a btree of ItemPointer values. -Note that in either case, the ItemPointers associated with a key can -easily be read out in sorted order; this is relied on by the scan +there is a list of item pointers, in compressed format (see Posting List +Compression section), pointing to the heap tuples for which the indexable +items contain this key. This is called the "posting list". + +If the list would be too big for the index tuple to fit on an index page, the +ItemPointers are pushed out to a separate posting page or pages, and none +appear in the key entry itself. The separate pages are called a "posting +tree" (see below); Note that in either case, the ItemPointers associated with +a key can easily be read out in sorted order; this is relied on by the scan algorithms. * The index tuple header fields of a leaf key entry are abused as follows: @@ -163,6 +163,11 @@ algorithms. * The posting list can be accessed with GinGetPosting(itup) +* If GinITupIsCompressed(itup), the posting list is stored in compressed + format. Otherwise it is just an array of ItemPointers. New tuples are always + stored in compressed format, uncompressed items can be present if the + database was migrated from 9.3 or earlier version. + 2) Posting tree case: * ItemPointerGetBlockNumber(&itup->t_tid) contains the index block number @@ -210,6 +215,76 @@ fit on one pending-list page must have those pages to itself, even if this results in wasting much of the space on the preceding page and the last page for the tuple.) +Posting tree +------------ + +If a posting list is too large to store in-line in a key entry, a posting tree +is created. A posting tree is a B-tree structure, where the ItemPointer is +used as the key. + +Internal posting tree pages use the standard PageHeader and the same "opaque" +struct as other GIN page, but do not contain regular index tuples. Instead, +the contents of the page is an array of PostingItem structs. Each PostingItem +consists of the block number of the child page, and the right bound of that +child page, as an ItemPointer. The right bound of the page is stored right +after the page header, before the PostingItem array. + +Posting tree leaf pages also use the standard PageHeader and opaque struct, +and the right bound of the page is stored right after the page header, +but the page content comprises of 0-32 compressed posting lists, and an +additional array of regular uncompressed item pointers. The compressed posting +lists are stored one after each other, between page header and pd_lower. The +uncompressed array is stored between pd_upper and pd_special. The space +between pd_lower and pd_upper is unused, which allows full-page images of +posting tree leaf pages to skip the unused space in middle (buffer_std = true +in XLogRecData). For historical reasons, this does not apply to internal +pages, or uncompressed leaf pages migrated from earlier versions. + +The item pointers are stored in a number of independent compressed posting +lists (also called segments), instead of one big one, to make random access +to a given item pointer faster: to find an item in a compressed list, you +have to read the list from the beginning, but when the items are split into +multiple lists, you can first skip over to the list containing the item you're +looking for, and read only that segment. Also, an update only needs to +re-encode the affected segment. + +The uncompressed items array is used for insertions, to avoid re-encoding +a compressed list on every update. If there is room on a page, an insertion +simply inserts the new item to the right place in the uncompressed array. +When a page becomes full, it is rewritten, merging all the uncompressed items +are into the compressed lists. When reading, the uncompressed array and the +compressed lists are read in tandem, and merged into one stream of sorted +item pointers. + +Posting List Compression +------------------------ + +To fit as many item pointers on a page as possible, posting tree leaf pages +and posting lists stored inline in entry tree leaf tuples use a lightweight +form of compression. We take advantage of the fact that the item pointers +are stored in sorted order. Instead of storing the block and offset number of +each item pointer separately, we store the difference from the previous item. +That in itself doesn't do much, but it allows us to use so-called varbyte +encoding to compress them. + +Varbyte encoding is a method to encode integers, allowing smaller numbers to +take less space at the cost of larger numbers. Each integer is represented by +variable number of bytes. High bit of each byte in varbyte encoding determines +whether the next byte is still part of this number. Therefore, to read a single +varbyte encoded number, you have to read bytes until you find a byte with the +high bit not set. + +When encoding, the block and offset number forming the item pointer are +combined into a single integer. The offset number is stored in the 11 low +bits (see MaxHeapTuplesPerPageBits in ginpostinglist.c), and the block number +is stored in the higher bits. That requires 43 bits in total, which +conveniently fits in at most 6 bytes. + +A compressed posting list is passed around and stored on disk in a +PackedPostingList struct. The first item in the list is stored uncompressed +as a regular ItemPointerData, followed by the length of the list in bytes, +followed by the packed items. + Concurrency ----------- @@ -260,6 +335,36 @@ page-deletions safe; it stamps the deleted pages with an XID and keeps the deleted pages around with the right-link intact until all concurrent scans have finished.) +Compatibility +------------- + +Compression of TIDs was introduced in 9.4. Some GIN indexes could remain in +uncompressed format because of pg_upgrade from 9.3 or earlier versions. +For compatibility, old uncompressed format is also supported. Following +rules are used to handle it: + +* GIN_ITUP_COMPRESSED flag marks index tuples that contain a posting list. +This flag is stored in high bit of ItemPointerGetBlockNumber(&itup->t_tid). +Use GinItupIsCompressed(itup) to check the flag. + +* Posting tree pages in the new format are marked with the GIN_COMPRESSED flag. + Macros GinPageIsCompressed(page) and GinPageSetCompressed(page) are used to + check and set this flag. + +* All scan operations check format of posting list add use corresponding code +to read its content. + +* When updating an index tuple containing an uncompressed posting list, it +will be replaced with new index tuple containing a compressed list. + +* When updating an uncompressed posting tree leaf page, it's compressed. + +* If vacuum finds some dead TIDs in uncompressed posting lists, they are +converted into compressed posting lists. This assumes that the compressed +posting list fits in the space occupied by the uncompressed list. IOW, we +assume that the compressed version of the page, with the dead items removed, +takes less space than the old uncompressed version. + Limitations ----------- diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 813f96a521b..c17c90fbc2e 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -325,9 +325,10 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, { Page page = BufferGetPage(stack->buffer); XLogRecData *payloadrdata; - bool fit; + GinPlaceToPageRC rc; uint16 xlflags = 0; Page childpage = NULL; + Page newlpage = NULL, newrpage = NULL; if (GinPageIsData(page)) xlflags |= GIN_INSERT_ISDATA; @@ -345,16 +346,17 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } /* - * Try to put the incoming tuple on the page. If it doesn't fit, - * placeToPage method will return false and leave the page unmodified, and - * we'll have to split the page. + * Try to put the incoming tuple on the page. placeToPage will decide + * if the page needs to be split. */ - START_CRIT_SECTION(); - fit = btree->placeToPage(btree, stack->buffer, stack->off, - insertdata, updateblkno, - &payloadrdata); - if (fit) + rc = btree->placeToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + &payloadrdata, &newlpage, &newrpage); + if (rc == UNMODIFIED) + return true; + else if (rc == INSERTED) { + /* placeToPage did START_CRIT_SECTION() */ MarkBufferDirty(stack->buffer); /* An insert to an internal page finishes the split of the child. */ @@ -373,7 +375,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, xlrec.node = btree->index->rd_node; xlrec.blkno = BufferGetBlockNumber(stack->buffer); - xlrec.offset = stack->off; xlrec.flags = xlflags; rdata[0].buffer = InvalidBuffer; @@ -415,20 +416,16 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, return true; } - else + else if (rc == SPLIT) { /* Didn't fit, have to split */ Buffer rbuffer; - Page newlpage; BlockNumber savedRightLink; - Page rpage; XLogRecData rdata[2]; ginxlogSplit data; Buffer lbuffer = InvalidBuffer; Page newrootpg = NULL; - END_CRIT_SECTION(); - rbuffer = GinNewBuffer(btree->index); /* During index build, count the new page */ @@ -443,12 +440,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, savedRightLink = GinPageGetOpaque(page)->rightlink; /* - * newlpage is a pointer to memory page, it is not associated with a - * buffer. stack->buffer is not touched yet. + * newlpage and newrpage are pointers to memory pages, not associated + * with buffers. stack->buffer is not touched yet. */ - newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, - insertdata, updateblkno, - &payloadrdata); data.node = btree->index->rd_node; data.rblkno = BufferGetBlockNumber(rbuffer); @@ -481,8 +475,6 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, else rdata[0].next = payloadrdata; - rpage = BufferGetPage(rbuffer); - if (stack->parent == NULL) { /* @@ -508,7 +500,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, data.lblkno = BufferGetBlockNumber(lbuffer); data.flags |= GIN_SPLIT_ROOT; - GinPageGetOpaque(rpage)->rightlink = InvalidBlockNumber; + GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); /* @@ -517,12 +509,12 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, * than overwriting the original page directly, so that we can still * abort gracefully if this fails.) */ - newrootpg = PageGetTempPage(rpage); - GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~GIN_LEAF, BLCKSZ); + newrootpg = PageGetTempPage(newrpage); + GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); btree->fillRoot(btree, newrootpg, BufferGetBlockNumber(lbuffer), newlpage, - BufferGetBlockNumber(rbuffer), rpage); + BufferGetBlockNumber(rbuffer), newrpage); } else { @@ -530,7 +522,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, data.rrlink = savedRightLink; data.lblkno = BufferGetBlockNumber(stack->buffer); - GinPageGetOpaque(rpage)->rightlink = savedRightLink; + GinPageGetOpaque(newrpage)->rightlink = savedRightLink; GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); } @@ -550,16 +542,24 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, START_CRIT_SECTION(); MarkBufferDirty(rbuffer); + MarkBufferDirty(stack->buffer); + /* + * Restore the temporary copies over the real buffers. But don't free + * the temporary copies yet, WAL record data points to them. + */ if (stack->parent == NULL) { - PageRestoreTempPage(newlpage, BufferGetPage(lbuffer)); MarkBufferDirty(lbuffer); - newlpage = newrootpg; + memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ); + memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + } + else + { + memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ); + memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } - - PageRestoreTempPage(newlpage, BufferGetPage(stack->buffer)); - MarkBufferDirty(stack->buffer); /* write WAL record */ if (RelationNeedsWAL(btree->index)) @@ -568,7 +568,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); PageSetLSN(BufferGetPage(stack->buffer), recptr); - PageSetLSN(rpage, recptr); + PageSetLSN(BufferGetPage(rbuffer), recptr); if (stack->parent == NULL) PageSetLSN(BufferGetPage(lbuffer), recptr); } @@ -582,6 +582,11 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, if (stack->parent == NULL) UnlockReleaseBuffer(lbuffer); + pfree(newlpage); + pfree(newrpage); + if (newrootpg) + pfree(newrootpg); + /* * If we split the root, we're done. Otherwise the split is not * complete until the downlink for the new page has been inserted to @@ -592,6 +597,8 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, else return false; } + else + elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc); } /* diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 164d3ba00e6..8504f4c7afc 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * gindatapage.c - * page utilities routines for the postgres inverted index access method. + * routines for handling GIN posting tree pages. * * * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group @@ -15,12 +15,166 @@ #include "postgres.h" #include "access/gin_private.h" +#include "access/heapam_xlog.h" +#include "lib/ilist.h" #include "miscadmin.h" +#include "utils/memutils.h" #include "utils/rel.h" /* - * Checks, should we move to right link... - * Compares inserting itemp pointer with right bound of current page + * Size of the posting lists stored on leaf pages, in bytes. The code can + * deal with any size, but random access is more efficient when a number of + * smaller lists are stored, rather than one big list. + */ +#define GinPostingListSegmentMaxSize 256 + +/* + * Existing posting lists smaller than this are recompressed, when inserting + * new items to page. + */ +#define GinPostingListSegmentMinSize 192 + +/* + * At least this many items fit in a GinPostingListSegmentMaxSize-bytes + * long segment. This is used when estimating how much space is required + * for N items, at minimum. + */ +#define MinTuplesPerSegment ((GinPostingListSegmentMaxSize - 2) / 6) + +/* + * A working struct for manipulating a posting tree leaf page. + */ +typedef struct +{ + dlist_head segments; /* a list of leafSegmentInfos */ + + /* + * The following fields represent how the segments are split across + * pages, if a page split is required. Filled in by leafRepackItems. + */ + dlist_node *lastleft; /* last segment on left page */ + int lsize; /* total size on left page */ + int rsize; /* total size on right page */ +} disassembledLeaf; + +typedef struct +{ + dlist_node node; /* linked list pointers */ + + /* + * The following fields represent the items in this segment. + * If 'items' is not NULL, it contains a palloc'd array of the items + * in this segment. If 'seg' is not NULL, it contains the items in an + * already-compressed format. It can point to an on-disk page (!modified), + * or a palloc'd segment in memory. If both are set, they must represent + * the same items. + */ + GinPostingList *seg; + ItemPointer items; + int nitems; /* # of items in 'items', if items != NULL */ + + bool modified; /* is this segment on page already? */ +} leafSegmentInfo; + +static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); +static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage); + +static disassembledLeaf *disassembleLeaf(Page page); +static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); +static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); + + +static void dataPlaceToPageLeafRecompress(Buffer buf, + disassembledLeaf *leaf, + XLogRecData **prdata); +static void dataPlaceToPageLeafSplit(Buffer buf, + disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + XLogRecData **prdata, Page lpage, Page rpage); + +/* + * Read all TIDs from leaf data page to single uncompressed array. + */ +ItemPointer +GinDataLeafPageGetItems(Page page, int *nitems) +{ + ItemPointer result; + + if (GinPageIsCompressed(page)) + { + GinPostingList *ptr = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + + result = ginPostingListDecodeAllSegments(ptr, len, nitems); + } + else + { + ItemPointer tmp = dataLeafPageGetUncompressed(page, nitems); + + result = palloc((*nitems) * sizeof(ItemPointerData)); + memcpy(result, tmp, (*nitems) * sizeof(ItemPointerData)); + } + + return result; +} + +/* + * Places all TIDs from leaf data page to bitmap. + */ +int +GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm) +{ + ItemPointer uncompressed; + int nitems; + + if (GinPageIsCompressed(page)) + { + GinPostingList *segment = GinDataLeafPageGetPostingList(page); + Size len = GinDataLeafPageGetPostingListSize(page); + + nitems = ginPostingListDecodeAllSegmentsToTbm(segment, len, tbm); + } + else + { + uncompressed = dataLeafPageGetUncompressed(page, &nitems); + + if (nitems > 0) + tbm_add_tuples(tbm, uncompressed, nitems, false); + } + + return nitems; +} + +/* + * Get pointer to the uncompressed array of items on a pre-9.4 format + * uncompressed leaf page. The number of items in the array is returned in + * *nitems. + */ +static ItemPointer +dataLeafPageGetUncompressed(Page page, int *nitems) +{ + ItemPointer items; + + Assert(!GinPageIsCompressed(page)); + + /* + * In the old pre-9.4 page format, the whole page content is used for + * uncompressed items, and the number of items is stored in 'maxoff' + */ + items = (ItemPointer) GinDataPageGetData(page); + *nitems = GinPageGetOpaque(page)->maxoff; + + return items; +} + +/* + * Check if we should follow the right link to find the item we're searching + * for. + * + * Compares inserting item pointer with the right bound of the current page. */ static bool dataIsMoveRight(GinBtree btree, Page page) @@ -34,8 +188,8 @@ dataIsMoveRight(GinBtree btree, Page page) } /* - * Find correct PostingItem in non-leaf page. It supposed that page - * correctly chosen and searching value SHOULD be on page + * Find correct PostingItem in non-leaf page. It is assumed that this is + * the correct page, and the searched value SHOULD be on the page. */ static BlockNumber dataLocateItem(GinBtree btree, GinBtreeStack *stack) @@ -102,63 +256,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) } /* - * Searches correct position for value on leaf page. - * Page should be correctly chosen. - * Returns true if value found on page. - */ -static bool -dataLocateLeafItem(GinBtree btree, GinBtreeStack *stack) -{ - Page page = BufferGetPage(stack->buffer); - OffsetNumber low, - high; - int result; - - Assert(GinPageIsLeaf(page)); - Assert(GinPageIsData(page)); - - if (btree->fullScan) - { - stack->off = FirstOffsetNumber; - return TRUE; - } - - low = FirstOffsetNumber; - high = GinPageGetOpaque(page)->maxoff; - - if (high < low) - { - stack->off = FirstOffsetNumber; - return false; - } - - high++; - - while (high > low) - { - OffsetNumber mid = low + ((high - low) / 2); - - result = ginCompareItemPointers(&btree->itemptr, - GinDataPageGetItemPointer(page, mid)); - - if (result == 0) - { - stack->off = mid; - return true; - } - else if (result > 0) - low = mid + 1; - else - high = mid; - } - - stack->off = high; - return false; -} - -/* - * Finds links to blkno on non-leaf page, returns - * offset of PostingItem + * Find link to blkno on non-leaf page, returns offset of PostingItem */ static OffsetNumber dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) @@ -203,7 +301,7 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor } /* - * returns blkno of leftmost child + * Return blkno of leftmost child */ static BlockNumber dataGetLeftMostPage(GinBtree btree, Page page) @@ -219,36 +317,7 @@ dataGetLeftMostPage(GinBtree btree, Page page) } /* - * add ItemPointer to a leaf page. - */ -void -GinDataPageAddItemPointer(Page page, ItemPointer data, OffsetNumber offset) -{ - OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; - char *ptr; - - Assert(ItemPointerIsValid(data)); - Assert(GinPageIsLeaf(page)); - - if (offset == InvalidOffsetNumber) - { - ptr = (char *) GinDataPageGetItemPointer(page, maxoff + 1); - } - else - { - ptr = (char *) GinDataPageGetItemPointer(page, offset); - if (maxoff + 1 - offset != 0) - memmove(ptr + sizeof(ItemPointerData), - ptr, - (maxoff - offset + 1) * sizeof(ItemPointerData)); - } - memcpy(ptr, data, sizeof(ItemPointerData)); - - GinPageGetOpaque(page)->maxoff++; -} - -/* - * add PostingItem to a non-leaf page. + * Add PostingItem to a non-leaf page. */ void GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) @@ -266,7 +335,7 @@ GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) else { ptr = (char *) GinDataPageGetPostingItem(page, offset); - if (maxoff + 1 - offset != 0) + if (offset != maxoff + 1) memmove(ptr + sizeof(PostingItem), ptr, (maxoff - offset + 1) * sizeof(PostingItem)); @@ -277,7 +346,7 @@ GinDataPageAddPostingItem(Page page, PostingItem *data, OffsetNumber offset) } /* - * Deletes posting item from non-leaf page + * Delete posting item from non-leaf page */ void GinPageDeletePostingItem(Page page, OffsetNumber offset) @@ -296,269 +365,725 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset) } /* - * checks space to install new value, - * item pointer never deletes! + * Places keys to leaf data page and fills WAL record. */ -static bool -dataIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off, void *insertdata) +static GinPlaceToPageRC +dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, XLogRecData **prdata, + Page *newlpage, Page *newrpage) { + GinBtreeDataLeafInsertData *items = insertdata; + ItemPointer newItems = &items->items[items->curitem]; + int maxitems = items->nitem - items->curitem; Page page = BufferGetPage(buf); + int i; + ItemPointerData rbound; + ItemPointerData lbound; + bool needsplit; + bool append; + int segsize; + Size freespace; + MemoryContext tmpCxt; + MemoryContext oldCxt; + disassembledLeaf *leaf; + leafSegmentInfo *lastleftinfo; + ItemPointerData maxOldItem; + ItemPointerData remaining; Assert(GinPageIsData(page)); - if (GinPageIsLeaf(page)) - { - GinBtreeDataLeafInsertData *items = insertdata; + rbound = *GinDataPageGetRightBound(page); - if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff) + /* + * Count how many of the new items belong to this page. + */ + if (!GinPageRightMost(page)) + { + for (i = 0; i < maxitems; i++) { - if ((items->nitem - items->curitem) * sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page)) - return true; + if (ginCompareItemPointers(&newItems[i], &rbound) > 0) + { + /* + * This needs to go to some other location in the tree. (The + * caller should've chosen the insert location so that at least + * the first item goes here.) + */ + Assert(i > 0); + break; + } } - else if (sizeof(ItemPointerData) <= GinDataPageGetFreeSpace(page)) - return true; + maxitems = i; } - else if (sizeof(PostingItem) <= GinDataPageGetFreeSpace(page)) - return true; - return false; -} + /* + * The following operations do quite a lot of small memory allocations, + * create a temporary memory context so that we don't need to keep track + * of them individually. + */ + tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "Gin split temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldCxt = MemoryContextSwitchTo(tmpCxt); -/* - * Places keys to page and fills WAL record. In case leaf page and - * build mode puts all ItemPointers to page. - * - * If none of the keys fit, returns false without modifying the page. - * - * On insertion to an internal node, in addition to inserting the given item, - * the downlink of the existing item at 'off' is updated to point to - * 'updateblkno'. - */ -static bool -dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, - void *insertdata, BlockNumber updateblkno, - XLogRecData **prdata) -{ - Page page = BufferGetPage(buf); - /* these must be static so they can be returned to caller */ - static XLogRecData rdata[2]; + leaf = disassembleLeaf(page); - /* quick exit if it doesn't fit */ - if (!dataIsEnoughSpace(btree, buf, off, insertdata)) - return false; + /* + * Are we appending to the end of the page? IOW, are all the new items + * larger than any of the existing items. + */ + if (!dlist_is_empty(&leaf->segments)) + { + lastleftinfo = dlist_container(leafSegmentInfo, node, + dlist_tail_node(&leaf->segments)); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + maxOldItem = lastleftinfo->items[lastleftinfo->nitems - 1]; + if (ginCompareItemPointers(&newItems[0], &maxOldItem) >= 0) + append = true; + else + append = false; + } + else + { + ItemPointerSetMin(&maxOldItem); + append = true; + } - *prdata = rdata; - Assert(GinPageIsData(page)); + /* + * If we're appending to the end of the page, we will append as many items + * as we can fit (after splitting), and stop when the pages becomes full. + * Otherwise we have to limit the number of new items to insert, because + * once we start packing we can't just stop when we run out of space, + * because we must make sure that all the old items still fit. + */ + if (GinPageIsCompressed(page)) + freespace = GinDataLeafPageGetFreeSpace(page); + else + freespace = 0; + if (append) + { + /* + * Even when appending, trying to append more items than will fit is + * not completely free, because we will merge the new items and old + * items into an array below. In the best case, every new item fits in + * a single byte, and we can use all the free space on the old page as + * well as the new page. For simplicity, ignore segment overhead etc. + */ + maxitems = Min(maxitems, freespace + GinDataLeafMaxContentSize); + } + else + { + /* + * Calculate a conservative estimate of how many new items we can fit + * on the two pages after splitting. + * + * We can use any remaining free space on the old page to store full + * segments, as well as the new page. Each full-sized segment can hold + * at least MinTuplesPerSegment items + */ + int nnewsegments; - /* Update existing downlink to point to next page (on internal page) */ - if (!GinPageIsLeaf(page)) + nnewsegments = freespace / GinPostingListSegmentMaxSize; + nnewsegments += GinDataLeafMaxContentSize / GinPostingListSegmentMaxSize; + maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); + } + + /* Add the new items to the segments */ + if (!addItemsToLeaf(leaf, newItems, maxitems)) { - PostingItem *pitem = GinDataPageGetPostingItem(page, off); + /* all items were duplicates, we have nothing to do */ + items->curitem += maxitems; + + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); - PostingItemSetBlockNumber(pitem, updateblkno); + return UNMODIFIED; } - if (GinPageIsLeaf(page)) + /* + * Pack the items back to compressed segments, ready for writing to disk. + */ + needsplit = leafRepackItems(leaf, &remaining); + + /* + * Did all the new items fit? + * + * If we're appending, it's OK if they didn't. But as a sanity check, + * verify that all the old items fit. + */ + if (ItemPointerIsValid(&remaining)) + { + if (!append || ItemPointerCompare(&maxOldItem, &remaining) >= 0) + elog(ERROR, "could not split GIN page; all old items didn't fit"); + + /* Count how many of the new items did fit. */ + for (i = 0; i < maxitems; i++) + { + if (ginCompareItemPointers(&newItems[i], &remaining) >= 0) + break; + } + if (i == 0) + elog(ERROR, "could not split GIN page; no new items fit"); + maxitems = i; + } + + if (!needsplit) { - GinBtreeDataLeafInsertData *items = insertdata; - static ginxlogInsertDataLeaf data; - uint32 savedPos = items->curitem; + /* + * Great, all the items fit on a single page. Write the segments to + * the page, and WAL-log appropriately. + * + * Once we start modifying the page, there's no turning back. The + * caller is responsible for calling END_CRIT_SECTION() after writing + * the WAL record. + */ + START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buf, leaf, prdata); - if (GinPageRightMost(page) && off > GinPageGetOpaque(page)->maxoff) + if (append) + elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d new items to block %u; %d bytes (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, + items->nitem - items->curitem - maxitems); + } + else + { + /* + * Had to split. + * + * We already divided the segments between the left and the right + * page. The left page was filled as full as possible, and the rest + * overflowed to the right page. When building a new index, that's + * good, because the table is scanned from beginning to end and there + * won't be any more insertions to the left page during the build. + * This packs the index as tight as possible. But otherwise, split + * 50/50, by moving segments from the left page to the right page + * until they're balanced. + * + * As a further heuristic, when appending items to the end of the + * page, split 75/25, one the assumption that subsequent insertions + * will probably also go to the end. This packs the index somewhat + * tighter when appending to a table, which is very common. + */ + if (!btree->isBuild) { - /* usually, create index... */ - while (items->curitem < items->nitem) + while (dlist_has_prev(&leaf->segments, leaf->lastleft)) { - GinDataPageAddItemPointer(page, items->items + items->curitem, off); - off++; - items->curitem++; + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + + segsize = SizeOfGinPostingList(lastleftinfo->seg); + if (append) + { + if ((leaf->lsize - segsize) - (leaf->lsize - segsize) < BLCKSZ / 4) + break; + } + else + { + if ((leaf->lsize - segsize) - (leaf->rsize + segsize) < 0) + break; + } + + /* don't consider segments moved to right as unmodified */ + lastleftinfo->modified = true; + leaf->lsize -= segsize; + leaf->rsize += segsize; + leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft); } - data.nitem = items->curitem - savedPos; } + Assert(leaf->lsize <= GinDataLeafMaxContentSize); + Assert(leaf->rsize <= GinDataLeafMaxContentSize); + + /* + * Fetch the max item in the left page's last segment; it becomes the + * right bound of the page. + */ + lastleftinfo = dlist_container(leafSegmentInfo, node, leaf->lastleft); + if (!lastleftinfo->items) + lastleftinfo->items = ginPostingListDecode(lastleftinfo->seg, + &lastleftinfo->nitems); + lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; + + *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ); + *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ); + + dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound, + prdata, *newlpage, *newrpage); + + Assert(GinPageRightMost(page) || + ginCompareItemPointers(GinDataPageGetRightBound(*newlpage), + GinDataPageGetRightBound(*newrpage)) < 0); + + if (append) + elog(DEBUG2, "appended %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + else + elog(DEBUG2, "inserted %d items to block %u; split %d/%d (%d to go)", + maxitems, BufferGetBlockNumber(buf), (int) leaf->lsize, (int) leaf->rsize, + items->nitem - items->curitem - maxitems); + } + + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + items->curitem += maxitems; + + return needsplit ? SPLIT : INSERTED; +} + +/* + * Vacuum a posting tree leaf page. + */ +void +ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) +{ + Page page = BufferGetPage(buffer); + disassembledLeaf *leaf; + bool removedsomething = false; + dlist_iter iter; + + leaf = disassembleLeaf(page); + + /* Vacuum each segment. */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + int oldsegsize; + ItemPointer cleaned; + int ncleaned; + + if (!seginfo->items) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (seginfo->seg) + oldsegsize = SizeOfGinPostingList(seginfo->seg); else + oldsegsize = GinDataLeafMaxContentSize; + + cleaned = ginVacuumItemPointers(gvs, + seginfo->items, + seginfo->nitems, + &ncleaned); + pfree(seginfo->items); + seginfo->items = NULL; + seginfo->nitems = 0; + if (cleaned) { - GinDataPageAddItemPointer(page, items->items + items->curitem, off); - items->curitem++; - data.nitem = 1; + if (ncleaned > 0) + { + int npacked; + + seginfo->seg = ginCompressPostingList(cleaned, + ncleaned, + oldsegsize, + &npacked); + /* Removing an item never increases the size of the segment */ + if (npacked != ncleaned) + elog(ERROR, "could not fit vacuumed posting list"); + } + else + { + seginfo->seg = NULL; + seginfo->items = NULL; + } + seginfo->nitems = ncleaned; + seginfo->modified = true; + + removedsomething = true; } + } - rdata[0].buffer = buf; - rdata[0].buffer_std = false; - rdata[0].data = (char *) &data; - rdata[0].len = offsetof(ginxlogInsertDataLeaf, items); - rdata[0].next = &rdata[1]; + /* + * If we removed any items, reconstruct the page from the pieces. + * + * We don't try to re-encode the segments here, even though some of them + * might be really small, now that we've removed some items from them. + * It seems like a waste of effort, as there isn't really any benefit from + * larger segments per se; larger segments only help you to pack more + * items in the same space. We might as well delay doing that until the + * next insertion, which will need to re-encode at least part of the page + * anyway. + * + * Also note if the page was in uncompressed, pre-9.4 format before, it + * is now represented as one huge segment that contains all the items. + * It might make sense to split that, to speed up random access, but we + * don't bother. You'll have to REINDEX anyway if you want the full gain + * of the new tighter index format. + */ + if (removedsomething) + { + XLogRecData *payloadrdata; - rdata[1].buffer = buf; - rdata[1].buffer_std = false; - rdata[1].data = (char *) &items->items[savedPos]; - rdata[1].len = sizeof(ItemPointerData) * data.nitem; - rdata[1].next = NULL; + START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buffer, leaf, &payloadrdata); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(indexrel)) + { + XLogRecPtr recptr; + XLogRecData rdata; + ginxlogVacuumDataLeafPage xlrec; + + xlrec.node = indexrel->rd_node; + xlrec.blkno = BufferGetBlockNumber(buffer); + + rdata.buffer = InvalidBuffer; + rdata.data = (char *) &xlrec; + rdata.len = offsetof(ginxlogVacuumDataLeafPage, data); + rdata.next = payloadrdata; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE, &rdata); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); } - else +} + +/* + * Assemble a disassembled posting tree leaf page back to a buffer. + * + * *prdata is filled with WAL information about this operation. The caller + * is responsible for inserting to the WAL, along with any other information + * about the operation that triggered this recompression. + */ +static void +dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf, + XLogRecData **prdata) +{ + Page page = BufferGetPage(buf); + char *ptr; + int newsize; + int unmodifiedsize; + bool modified = false; + dlist_iter iter; + /* + * these must be static so they can be returned to caller (no pallocs + * since we're in a critical section!) + */ + static ginxlogRecompressDataLeaf recompress_xlog; + static XLogRecData rdata[2]; + + ptr = (char *) GinDataLeafPageGetPostingList(page); + newsize = 0; + unmodifiedsize = 0; + modified = false; + dlist_foreach(iter, &leaf->segments) { - PostingItem *pitem = insertdata; + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + int segsize; + + if (seginfo->modified) + modified = true; + + /* + * Nothing to do with empty segments, except keep track if they've been + * modified + */ + if (seginfo->seg == NULL) + { + Assert(seginfo->items == NULL); + continue; + } - GinDataPageAddPostingItem(page, pitem, off); + segsize = SizeOfGinPostingList(seginfo->seg); - rdata[0].buffer = buf; - rdata[0].buffer_std = false; - rdata[0].data = (char *) pitem; - rdata[0].len = sizeof(PostingItem); - rdata[0].next = NULL; + if (!modified) + unmodifiedsize += segsize; + else + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + newsize += segsize; } + Assert(newsize < GinDataLeafMaxContentSize); + GinDataLeafPageSetPostingListSize(page, newsize); + GinPageSetCompressed(page); /* in case it was in pre-9.4 format before */ + + /* Put WAL data */ + recompress_xlog.length = (uint16) newsize; + recompress_xlog.unmodifiedsize = unmodifiedsize; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &recompress_xlog; + rdata[0].len = offsetof(ginxlogRecompressDataLeaf, newdata); + rdata[0].next = &rdata[1]; + + rdata[1].buffer = buf; + rdata[1].buffer_std = TRUE; + rdata[1].data = ((char *) GinDataLeafPageGetPostingList(page)) + unmodifiedsize; + rdata[1].len = newsize - unmodifiedsize; + rdata[1].next = NULL; - return true; + *prdata = rdata; } /* - * split page and fills WAL record. original buffer(lbuf) leaves untouched, - * returns shadow page of lbuf filled new data. In leaf page and build mode puts all - * ItemPointers to pages. Also, in build mode splits data by way to full fulled - * left page + * Like dataPlaceToPageLeafRecompress, but writes the disassembled leaf + * segments to two pages instead of one. + * + * This is different from the non-split cases in that this does not modify + * the original page directly, but to temporary in-memory copies of the new + * left and right pages. */ -static Page -dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, - void *insertdata, BlockNumber updateblkno, XLogRecData **prdata) +static void +dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, + ItemPointerData lbound, ItemPointerData rbound, + XLogRecData **prdata, Page lpage, Page rpage) { char *ptr; - OffsetNumber separator; - ItemPointer bound; - Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); - bool isleaf = GinPageIsLeaf(lpage); - ItemPointerData oldbound = *GinDataPageGetRightBound(lpage); - int sizeofitem = GinSizeOfDataPageItem(lpage); - OffsetNumber maxoff = GinPageGetOpaque(lpage)->maxoff; - Page rpage = BufferGetPage(rbuf); - Size pageSize = PageGetPageSize(lpage); - Size freeSpace; - + int segsize; + int lsize; + int rsize; + dlist_node *node; + dlist_node *firstright; + leafSegmentInfo *seginfo; /* these must be static so they can be returned to caller */ - static ginxlogSplitData data; - static XLogRecData rdata[2]; - static char vector[2 * BLCKSZ]; - - GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); - freeSpace = GinDataPageGetFreeSpace(rpage); + static ginxlogSplitDataLeaf split_xlog; + static XLogRecData rdata[3]; - *prdata = rdata; + /* Initialize temporary pages to hold the new left and right pages */ + GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); - /* Update existing downlink to point to next page (on internal page) */ - if (!isleaf) + /* + * Copy the segments that go to the left page. + * + * XXX: We should skip copying the unmodified part of the left page, like + * we do when recompressing. + */ + lsize = 0; + ptr = (char *) GinDataLeafPageGetPostingList(lpage); + firstright = dlist_next_node(&leaf->segments, leaf->lastleft); + for (node = dlist_head_node(&leaf->segments); + node != firstright; + node = dlist_next_node(&leaf->segments, node)) { - PostingItem *pitem = GinDataPageGetPostingItem(lpage, off); + seginfo = dlist_container(leafSegmentInfo, node, node); + segsize = SizeOfGinPostingList(seginfo->seg); - PostingItemSetBlockNumber(pitem, updateblkno); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + lsize += segsize; } - - if (isleaf) + Assert(lsize == leaf->lsize); + GinDataLeafPageSetPostingListSize(lpage, lsize); + *GinDataPageGetRightBound(lpage) = lbound; + + /* Copy the segments that go to the right page */ + ptr = (char *) GinDataLeafPageGetPostingList(rpage); + rsize = 0; + for (node = firstright; + ; + node = dlist_next_node(&leaf->segments, node)) { - memcpy(vector, - GinDataPageGetItemPointer(lpage, FirstOffsetNumber), - maxoff * sizeof(ItemPointerData)); + seginfo = dlist_container(leafSegmentInfo, node, node); + segsize = SizeOfGinPostingList(seginfo->seg); + + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + rsize += segsize; + + if (!dlist_has_next(&leaf->segments, node)) + break; } - else + Assert(rsize == leaf->rsize); + GinDataLeafPageSetPostingListSize(rpage, rsize); + *GinDataPageGetRightBound(rpage) = rbound; + + /* Create WAL record */ + split_xlog.lsize = lsize; + split_xlog.rsize = rsize; + split_xlog.lrightbound = lbound; + split_xlog.rrightbound = rbound; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &split_xlog; + rdata[0].len = sizeof(ginxlogSplitDataLeaf); + rdata[0].next = &rdata[1]; + + rdata[1].buffer = InvalidBuffer; + rdata[1].data = (char *) GinDataLeafPageGetPostingList(lpage); + rdata[1].len = lsize; + rdata[1].next = &rdata[2]; + + rdata[2].buffer = InvalidBuffer; + rdata[2].data = (char *) GinDataLeafPageGetPostingList(rpage); + rdata[2].len = rsize; + rdata[2].next = NULL; + + *prdata = rdata; +} + +/* + * Place a PostingItem to page, and fill a WAL record. + * + * If the item doesn't fit, returns false without modifying the page. + * + * In addition to inserting the given item, the downlink of the existing item + * at 'off' is updated to point to 'updateblkno'. + */ +static GinPlaceToPageRC +dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + PostingItem *pitem; + /* these must be static so they can be returned to caller */ + static XLogRecData rdata; + static ginxlogInsertDataInternal data; + + /* split if we have to */ + if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) { - memcpy(vector, - GinDataPageGetPostingItem(lpage, FirstOffsetNumber), - maxoff * sizeof(PostingItem)); + dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, + prdata, newlpage, newrpage); + return SPLIT; } - if (isleaf && GinPageRightMost(lpage) && off > GinPageGetOpaque(lpage)->maxoff) - { - /* append new items to the end */ - GinBtreeDataLeafInsertData *items = insertdata; + *prdata = &rdata; + Assert(GinPageIsData(page)); - while (items->curitem < items->nitem && - maxoff * sizeof(ItemPointerData) < 2 * (freeSpace - sizeof(ItemPointerData))) - { - memcpy(vector + maxoff * sizeof(ItemPointerData), - items->items + items->curitem, - sizeof(ItemPointerData)); - maxoff++; - items->curitem++; - } - } + START_CRIT_SECTION(); + + /* Update existing downlink to point to next page (on internal page) */ + pitem = GinDataPageGetPostingItem(page, off); + PostingItemSetBlockNumber(pitem, updateblkno); + + /* Add new item */ + pitem = (PostingItem *) insertdata; + GinDataPageAddPostingItem(page, pitem, off); + + data.offset = off; + data.newitem = *pitem; + + rdata.buffer = buf; + rdata.buffer_std = false; + rdata.data = (char *) &data; + rdata.len = sizeof(ginxlogInsertDataInternal); + rdata.next = NULL; + + return INSERTED; +} + +/* + * Places an item (or items) to a posting tree. Calls relevant function of + * internal of leaf page because they are handled very differently. + */ +static GinPlaceToPageRC +dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, + Page *newlpage, Page *newrpage) +{ + Page page = BufferGetPage(buf); + + Assert(GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + return dataPlaceToPageLeaf(btree, buf, stack, insertdata, + prdata, newlpage, newrpage); else - { - ptr = vector + (off - 1) * sizeofitem; - if (maxoff + 1 - off != 0) - memmove(ptr + sizeofitem, ptr, (maxoff - off + 1) * sizeofitem); - if (isleaf) - { - GinBtreeDataLeafInsertData *items = insertdata; + return dataPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + prdata, newlpage, newrpage); +} - memcpy(ptr, items->items + items->curitem, sizeofitem); - items->curitem++; - } - else - { - PostingItem *pitem = insertdata; +/* + * Split page and fill WAL record. Returns a new temp buffer filled with data + * that should go to the left page. The original buffer is left untouched. + */ +static void +dataSplitPageInternal(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + XLogRecData **prdata, Page *newlpage, Page *newrpage) +{ + Page oldpage = BufferGetPage(origbuf); + OffsetNumber off = stack->off; + int nitems = GinPageGetOpaque(oldpage)->maxoff; + Size pageSize = PageGetPageSize(oldpage); + ItemPointerData oldbound = *GinDataPageGetRightBound(oldpage); + ItemPointer bound; + Page lpage; + Page rpage; + OffsetNumber separator; - memcpy(ptr, pitem, sizeofitem); - } + /* these must be static so they can be returned to caller */ + static ginxlogSplitDataInternal data; + static XLogRecData rdata[4]; + static PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; - maxoff++; - } + lpage = PageGetTempPage(oldpage); + rpage = PageGetTempPage(oldpage); + GinInitPage(lpage, GinPageGetOpaque(oldpage)->flags, pageSize); + GinInitPage(rpage, GinPageGetOpaque(oldpage)->flags, pageSize); + + *prdata = rdata; /* - * we assume that during index creation the table scanned from beginning - * to end, so ItemPointers are in monotonically increasing order. + * First construct a new list of PostingItems, which includes all the + * old items, and the new item. */ - if (btree->isBuild && GinPageRightMost(lpage)) - separator = freeSpace / sizeofitem; - else - separator = maxoff / 2; + memcpy(allitems, GinDataPageGetPostingItem(oldpage, FirstOffsetNumber), + (off - 1) * sizeof(PostingItem)); - GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); - GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); + allitems[off - 1] = *((PostingItem *) insertdata); + memcpy(&allitems[off], GinDataPageGetPostingItem(oldpage, off), + (nitems - (off - 1)) * sizeof(PostingItem)); + nitems++; - if (isleaf) - memcpy(GinDataPageGetItemPointer(lpage, FirstOffsetNumber), - vector, separator * sizeof(ItemPointerData)); - else - memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), - vector, separator * sizeof(PostingItem)); + /* Update existing downlink to point to next page */ + PostingItemSetBlockNumber(&allitems[off], updateblkno); - GinPageGetOpaque(lpage)->maxoff = separator; - if (isleaf) - memcpy(GinDataPageGetItemPointer(rpage, FirstOffsetNumber), - vector + separator * sizeof(ItemPointerData), - (maxoff - separator) * sizeof(ItemPointerData)); + /* + * When creating a new index, fit as many tuples as possible on the left + * page, on the assumption that the table is scanned from beginning to + * end. This packs the index as tight as possible. + */ + if (btree->isBuild && GinPageRightMost(oldpage)) + separator = GinNonLeafDataPageGetFreeSpace(rpage) / sizeof(PostingItem); else - memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber), - vector + separator * sizeof(PostingItem), - (maxoff - separator) * sizeof(PostingItem)); + separator = nitems / 2; - GinPageGetOpaque(rpage)->maxoff = maxoff - separator; + memcpy(GinDataPageGetPostingItem(lpage, FirstOffsetNumber), allitems, separator * sizeof(PostingItem)); + GinPageGetOpaque(lpage)->maxoff = separator; + memcpy(GinDataPageGetPostingItem(rpage, FirstOffsetNumber), + &allitems[separator], (nitems - separator) * sizeof(PostingItem)); + GinPageGetOpaque(rpage)->maxoff = nitems - separator; /* set up right bound for left page */ bound = GinDataPageGetRightBound(lpage); - if (GinPageIsLeaf(lpage)) - *bound = *GinDataPageGetItemPointer(lpage, - GinPageGetOpaque(lpage)->maxoff); - else - *bound = GinDataPageGetPostingItem(lpage, - GinPageGetOpaque(lpage)->maxoff)->key; + *bound = GinDataPageGetPostingItem(lpage, + GinPageGetOpaque(lpage)->maxoff)->key; /* set up right bound for right page */ - bound = GinDataPageGetRightBound(rpage); - *bound = oldbound; + *GinDataPageGetRightBound(rpage) = oldbound; data.separator = separator; - data.nitem = maxoff; + data.nitem = nitems; data.rightbound = oldbound; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogSplitData); + rdata[0].len = sizeof(ginxlogSplitDataInternal); rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; - rdata[1].data = vector; - rdata[1].len = maxoff * sizeofitem; + rdata[1].data = (char *) allitems; + rdata[1].len = nitems * sizeof(PostingItem); rdata[1].next = NULL; - return lpage; + *newlpage = lpage; + *newrpage = rpage; } /* @@ -595,6 +1120,318 @@ ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, Block GinDataPageAddPostingItem(root, &ri, InvalidOffsetNumber); } + +/*** Functions to work with disassembled leaf pages ***/ + +/* + * Disassemble page into a disassembledLeaf struct. + */ +static disassembledLeaf * +disassembleLeaf(Page page) +{ + disassembledLeaf *leaf; + GinPostingList *seg; + Pointer segbegin; + Pointer segend; + + leaf = palloc0(sizeof(disassembledLeaf)); + dlist_init(&leaf->segments); + + if (GinPageIsCompressed(page)) + { + /* + * Create a leafSegment entry for each segment. + */ + seg = GinDataLeafPageGetPostingList(page); + segbegin = (Pointer) seg; + segend = segbegin + GinDataLeafPageGetPostingListSize(page); + while ((Pointer) seg < segend) + { + leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->seg = seg; + seginfo->items = NULL; + seginfo->nitems = 0; + seginfo->modified = false; + dlist_push_tail(&leaf->segments, &seginfo->node); + + seg = GinNextPostingListSegment(seg); + } + } + else + { + /* + * A pre-9.4 format uncompressed page is represented by a single + * segment, with an array of items. + */ + ItemPointer uncompressed; + int nuncompressed; + leafSegmentInfo *seginfo; + + uncompressed = dataLeafPageGetUncompressed(page, &nuncompressed); + + seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->seg = NULL; + seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData)); + memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData)); + seginfo->nitems = nuncompressed; + /* make sure we rewrite this to disk */ + seginfo->modified = true; + + dlist_push_tail(&leaf->segments, &seginfo->node); + } + + return leaf; +} + +/* + * Distribute newItems to the segments. + * + * Any segments that acquire new items are decoded, and the new items are + * merged with the old items. + * + * Returns true if any new items were added. False means they were all + * duplicates of existing items on the page. + */ +static bool +addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) +{ + dlist_iter iter; + ItemPointer nextnew = newItems; + int newleft = nNewItems; + bool modified = false; + + /* + * If the page is completely empty, just construct one new segment to + * hold all the new items. + */ + if (dlist_is_empty(&leaf->segments)) + { + /* create a new segment for the new entries */ + leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + + seginfo->seg = NULL; + seginfo->items = newItems; + seginfo->nitems = nNewItems; + seginfo->modified = true; + dlist_push_tail(&leaf->segments, &seginfo->node); + return true; + } + + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *cur = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, iter.cur); + int nthis; + ItemPointer tmpitems; + int ntmpitems; + + /* + * How many of the new items fall into this segment? + */ + if (!dlist_has_next(&leaf->segments, iter.cur)) + nthis = newleft; + else + { + leafSegmentInfo *next; + ItemPointerData next_first; + + next = (leafSegmentInfo *) dlist_container(leafSegmentInfo, node, + dlist_next_node(&leaf->segments, iter.cur)); + if (next->items) + next_first = next->items[0]; + else + { + Assert(next->seg != NULL); + next_first = next->seg->first; + } + + nthis = 0; + while (nthis < newleft && ginCompareItemPointers(&nextnew[nthis], &next_first) < 0) + nthis++; + } + if (nthis == 0) + continue; + + /* Merge the new items with the existing items. */ + if (!cur->items) + cur->items = ginPostingListDecode(cur->seg, &cur->nitems); + + tmpitems = palloc((cur->nitems + nthis) * sizeof(ItemPointerData)); + ntmpitems = ginMergeItemPointers(tmpitems, + cur->items, cur->nitems, + nextnew, nthis); + if (ntmpitems != cur->nitems) + { + cur->items = tmpitems; + cur->nitems = ntmpitems; + cur->seg = NULL; + modified = cur->modified = true; + } + + nextnew += nthis; + newleft -= nthis; + if (newleft == 0) + break; + } + + return modified; +} + +/* + * Recompresses all segments that have been modified. + * + * If not all the items fit on two pages (ie. after split), we store as + * many items as fit, and set *remaining to the first item that didn't fit. + * If all items fit, *remaining is set to invalid. + * + * Returns true if the page has to be split. + * + * XXX: Actually, this re-encodes all segments after the first one that was + * modified, to make sure the new segments are all more or less of equal + * length. That's unnecessarily aggressive; if we've only added a single item + * to one segment, for example, we could re-encode just that single segment, + * as long as it's still smaller than, say, 2x the normal segment size. + */ +static bool +leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) +{ + dlist_iter iter; + ItemPointer allitems; + int nallitems; + int pgused = 0; + bool needsplit; + int totalpacked; + dlist_mutable_iter miter; + dlist_head recode_list; + int nrecode; + bool recoding; + + ItemPointerSetInvalid(remaining); + + /* + * Find the first segment that needs to be re-coded. Move all segments + * that need recoding to separate list, and count the total number of + * items in them. Also, add up the number of bytes in unmodified segments + * (pgused). + */ + dlist_init(&recode_list); + recoding = false; + nrecode = 0; + pgused = 0; + dlist_foreach_modify(miter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, miter.cur); + + /* If the segment was modified, re-encode it */ + if (seginfo->modified || seginfo->seg == NULL) + recoding = true; + /* + * Also re-encode abnormally small or large segments. (Vacuum can + * leave behind small segments, and conversion from pre-9.4 format + * can leave behind large segments). + */ + else if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize) + recoding = true; + else if (SizeOfGinPostingList(seginfo->seg) > GinPostingListSegmentMaxSize) + recoding = true; + + if (recoding) + { + if (!seginfo->items) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + nrecode += seginfo->nitems; + + dlist_delete(miter.cur); + dlist_push_tail(&recode_list, &seginfo->node); + } + else + pgused += SizeOfGinPostingList(seginfo->seg); + } + + if (nrecode == 0) + return false; /* nothing to do */ + + /* + * Construct one big array of the items that need to be re-encoded. + */ + allitems = palloc(nrecode * sizeof(ItemPointerData)); + nallitems = 0; + dlist_foreach(iter, &recode_list) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); + memcpy(&allitems[nallitems], seginfo->items, seginfo->nitems * sizeof(ItemPointerData)); + nallitems += seginfo->nitems; + } + Assert(nallitems == nrecode); + + /* + * Start packing the items into segments. Stop when we have consumed + * enough space to fill both pages, or we run out of items. + */ + totalpacked = 0; + needsplit = false; + while (totalpacked < nallitems) + { + leafSegmentInfo *seginfo; + int npacked; + GinPostingList *seg; + int segsize; + + seg = ginCompressPostingList(&allitems[totalpacked], + nallitems - totalpacked, + GinPostingListSegmentMaxSize, + &npacked); + segsize = SizeOfGinPostingList(seg); + if (pgused + segsize > GinDataLeafMaxContentSize) + { + if (!needsplit) + { + /* switch to right page */ + Assert(pgused > 0); + leaf->lastleft = dlist_tail_node(&leaf->segments); + needsplit = true; + leaf->lsize = pgused; + pgused = 0; + } + else + { + /* filled both pages */ + *remaining = allitems[totalpacked]; + break; + } + } + + seginfo = palloc(sizeof(leafSegmentInfo)); + seginfo->seg = seg; + seginfo->items = &allitems[totalpacked]; + seginfo->nitems = npacked; + seginfo->modified = true; + + dlist_push_tail(&leaf->segments, &seginfo->node); + + pgused += segsize; + totalpacked += npacked; + } + + if (!needsplit) + { + leaf->lsize = pgused; + leaf->rsize = 0; + } + else + leaf->rsize = pgused; + + Assert(leaf->lsize <= GinDataLeafMaxContentSize); + Assert(leaf->rsize <= GinDataLeafMaxContentSize); + + return needsplit; +} + + +/*** Functions that are exported to the rest of the GIN code ***/ + /* * Creates new posting tree containing the given TIDs. Returns the page * number of the root of the new posting tree. @@ -608,10 +1445,9 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, BlockNumber blkno; Buffer buffer; Page page; + Pointer ptr; int nrootitems; - - /* Calculate how many TIDs will fit on first page. */ - nrootitems = Min(nitems, GinMaxLeafDataItems); + int rootsize; /* * Create the root page. @@ -622,12 +1458,41 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, START_CRIT_SECTION(); - GinInitBuffer(buffer, GIN_DATA | GIN_LEAF); - memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * nrootitems); - GinPageGetOpaque(page)->maxoff = nrootitems; + GinInitPage(page, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinPageGetOpaque(page)->rightlink = InvalidBlockNumber; + /* + * Write as many of the items to the root page as fit. In segments + * of max GinPostingListSegmentMaxSize bytes each. + */ + nrootitems = 0; + rootsize = 0; + ptr = (Pointer) GinDataLeafPageGetPostingList(page); + while (nrootitems < nitems) + { + GinPostingList *segment; + int npacked; + int segsize; + + segment = ginCompressPostingList(&items[nrootitems], + nitems - nrootitems, + GinPostingListSegmentMaxSize, + &npacked); + segsize = SizeOfGinPostingList(segment); + if (rootsize + segsize > GinDataLeafMaxContentSize) + break; + + memcpy(ptr, segment, segsize); + ptr += segsize; + rootsize += segsize; + nrootitems += npacked; + pfree(segment); + } + GinDataLeafPageSetPostingListSize(page, rootsize); MarkBufferDirty(buffer); + elog(DEBUG2, "created GIN posting tree with %d items", nrootitems); + if (RelationNeedsWAL(index)) { XLogRecPtr recptr; @@ -636,7 +1501,7 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, data.node = index->rd_node; data.blkno = blkno; - data.nitem = nrootitems; + data.size = rootsize; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; @@ -644,8 +1509,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) items; - rdata[1].len = sizeof(ItemPointerData) * nrootitems; + rdata[1].data = (char *) GinDataLeafPageGetPostingList(page); + rdata[1].len = rootsize; rdata[1].next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); @@ -685,10 +1550,9 @@ ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) btree->findChildPage = dataLocateItem; btree->getLeftMostChild = dataGetLeftMostPage; btree->isMoveRight = dataIsMoveRight; - btree->findItem = dataLocateLeafItem; + btree->findItem = NULL; btree->findChildPtr = dataFindChildPtr; btree->placeToPage = dataPlaceToPage; - btree->splitPage = dataSplitPage; btree->fillRoot = ginDataFillRoot; btree->prepareDownlink = dataPrepareDownlink; @@ -721,17 +1585,7 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno, btree.itemptr = insertdata.items[insertdata.curitem]; stack = ginFindLeafPage(&btree, false); - if (btree.findItem(&btree, stack)) - { - /* - * Current item already exists in index. - */ - insertdata.curitem++; - LockBuffer(stack->buffer, GIN_UNLOCK); - freeGinBtreeStack(stack); - } - else - ginInsertValue(&btree, stack, &insertdata, buildStats); + ginInsertValue(&btree, stack, &insertdata, buildStats); } } diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index e60c708b01e..6f4e727b3e9 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * ginentrypage.c - * page utilities routines for the postgres inverted index access method. + * routines for handling GIN entry tree pages. * * * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group @@ -15,8 +15,15 @@ #include "postgres.h" #include "access/gin_private.h" +#include "miscadmin.h" #include "utils/rel.h" +static void entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, + void *insertPayload, + BlockNumber updateblkno, XLogRecData **prdata, + Page *newlpage, Page *newrpage); + /* * Form a tuple for entry tree. * @@ -27,15 +34,15 @@ * format that is being built here. We build on the assumption that we * are making a leaf-level key entry containing a posting list of nipd items. * If the caller is actually trying to make a posting-tree entry, non-leaf - * entry, or pending-list entry, it should pass nipd = 0 and then overwrite - * the t_tid fields as necessary. In any case, ipd can be NULL to skip - * copying any itempointers into the posting list; the caller is responsible - * for filling the posting list afterwards, if ipd = NULL and nipd > 0. + * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite + * the t_tid fields as necessary. In any case, 'data' can be NULL to skip + * filling in the posting list; the caller is responsible for filling it + * afterwards if data = NULL and nipd > 0. */ IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, GinNullCategory category, - ItemPointerData *ipd, uint32 nipd, + Pointer data, Size dataSize, int nipd, bool errorTooBig) { Datum datums[2]; @@ -80,27 +87,25 @@ GinFormTuple(GinState *ginstate, newsize = Max(newsize, minsize); } - newsize = SHORTALIGN(newsize); - GinSetPostingOffset(itup, newsize); - GinSetNPosting(itup, nipd); /* * Add space needed for posting list, if any. Then check that the tuple * won't be too big to store. */ - newsize += sizeof(ItemPointerData) * nipd; + newsize += dataSize; + newsize = MAXALIGN(newsize); - if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) + + if (newsize > GinMaxItemSize) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsize, - (unsigned long) Min(INDEX_SIZE_MASK, - GinMaxItemSize), + (unsigned long) GinMaxItemSize, RelationGetRelationName(ginstate->index)))); pfree(itup); return NULL; @@ -119,13 +124,21 @@ GinFormTuple(GinState *ginstate, */ memset((char *) itup + IndexTupleSize(itup), 0, newsize - IndexTupleSize(itup)); - /* set new size in tuple header */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; } /* + * Copy in the posting list, if provided + */ + if (data) + { + char *ptr = GinGetPosting(itup); + memcpy(ptr, data, dataSize); + } + + /* * Insert category byte, if needed */ if (category != GIN_CAT_NORM_KEY) @@ -133,37 +146,45 @@ GinFormTuple(GinState *ginstate, Assert(IndexTupleHasNulls(itup)); GinSetNullCategory(itup, ginstate, category); } - - /* - * Copy in the posting list, if provided - */ - if (ipd) - memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); - return itup; } /* - * Sometimes we reduce the number of posting list items in a tuple after - * having built it with GinFormTuple. This function adjusts the size - * fields to match. + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. */ -void -GinShortenTuple(IndexTuple itup, uint32 nipd) +ItemPointer +ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, + int *nitems) { - uint32 newsize; - - Assert(nipd <= GinGetNPosting(itup)); + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; - newsize = GinGetPostingOffset(itup) + sizeof(ItemPointerData) * nipd; - newsize = MAXALIGN(newsize); - - Assert(newsize <= (itup->t_info & INDEX_SIZE_MASK)); - - itup->t_info &= ~INDEX_SIZE_MASK; - itup->t_info |= newsize; - - GinSetNPosting(itup, nipd); + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; } /* @@ -492,13 +513,14 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off, * the downlink of the existing item at 'off' is updated to point to * 'updateblkno'. */ -static bool -entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, +static GinPlaceToPageRC +entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, void *insertPayload, BlockNumber updateblkno, - XLogRecData **prdata) + XLogRecData **prdata, Page *newlpage, Page *newrpage) { GinBtreeEntryInsertData *insertData = insertPayload; Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; OffsetNumber placed; int cnt = 0; @@ -508,7 +530,13 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, /* quick exit if it doesn't fit */ if (!entryIsEnoughSpace(btree, buf, off, insertData)) - return false; + { + entrySplitPage(btree, buf, stack, insertPayload, updateblkno, + prdata, newlpage, newrpage); + return SPLIT; + } + + START_CRIT_SECTION(); *prdata = rdata; entryPreparePage(btree, page, off, insertData, updateblkno); @@ -522,6 +550,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, RelationGetRelationName(btree->index)); data.isDelete = insertData->isDelete; + data.offset = off; rdata[cnt].buffer = buf; rdata[cnt].buffer_std = false; @@ -536,21 +565,24 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, rdata[cnt].len = IndexTupleSize(insertData->entry); rdata[cnt].next = NULL; - return true; + return INSERTED; } /* * Place tuple and split page, original buffer(lbuf) leaves untouched, - * returns shadow page of lbuf filled new data. + * returns shadow pages filled with new data. * Tuples are distributed between pages by equal size on its, not * an equal number! */ -static Page -entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, +static void +entrySplitPage(GinBtree btree, Buffer origbuf, + GinBtreeStack *stack, void *insertPayload, - BlockNumber updateblkno, XLogRecData **prdata) + BlockNumber updateblkno, XLogRecData **prdata, + Page *newlpage, Page *newrpage) { GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; OffsetNumber i, maxoff, separator = InvalidOffsetNumber; @@ -561,8 +593,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, char *ptr; IndexTuple itup; Page page; - Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); - Page rpage = BufferGetPage(rbuf); + Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf)); + Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf)); Size pageSize = PageGetPageSize(lpage); /* these must be static so they can be returned to caller */ @@ -651,7 +683,8 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, rdata[1].len = tupstoresize; rdata[1].next = NULL; - return lpage; + *newlpage = lpage; + *newrpage = rpage; } /* @@ -719,7 +752,6 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, btree->findItem = entryLocateLeafEntry; btree->findChildPtr = entryFindChildPtr; btree->placeToPage = entryPlaceToPage; - btree->splitPage = entrySplitPage; btree->fillRoot = ginEntryFillRoot; btree->prepareDownlink = entryPrepareDownlink; diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 35e1d7114b7..4a65046547e 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -487,7 +487,7 @@ ginHeapTupleFastCollect(GinState *ginstate, IndexTuple itup; itup = GinFormTuple(ginstate, attnum, entries[i], categories[i], - NULL, 0, true); + NULL, 0, 0, true); itup->t_tid = *ht_ctid; collector->tuples[collector->ntuples++] = itup; collector->sumsize += IndexTupleSize(itup); diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 3d99bef76a3..4bdbd4525d5 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -71,24 +71,20 @@ callConsistentFn(GinState *ginstate, GinScanKey key) * Tries to refind previously taken ItemPointer on a posting page. */ static bool -findItemInPostingPage(Page page, ItemPointer item, OffsetNumber *off) +needToStepRight(Page page, ItemPointer item) { - OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; - int res; - if (GinPageGetOpaque(page)->flags & GIN_DELETED) /* page was deleted by concurrent vacuum */ - return false; + return true; - /* - * scan page to find equal or first greater value - */ - for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) + if (ginCompareItemPointers(item, GinDataPageGetRightBound(page)) > 0 + && !GinPageRightMost(page)) { - res = ginCompareItemPointers(item, GinDataPageGetItemPointer(page, *off)); - - if (res <= 0) - return true; + /* + * the item we're looking is > the right bound of the page, so it + * can't be on this page. + */ + return true; } return false; @@ -143,14 +139,10 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, for (;;) { page = BufferGetPage(buffer); - - if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0 && - GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber) + if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0) { - tbm_add_tuples(scanEntry->matchBitmap, - GinDataPageGetItemPointer(page, FirstOffsetNumber), - GinPageGetOpaque(page)->maxoff, false); - scanEntry->predictNumberResult += GinPageGetOpaque(page)->maxoff; + int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap); + scanEntry->predictNumberResult += n; } if (GinPageRightMost(page)) @@ -335,8 +327,11 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack, } else { - tbm_add_tuples(scanEntry->matchBitmap, - GinGetPosting(itup), GinGetNPosting(itup), false); + ItemPointer ipd; + int nipd; + + ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd); + tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false); scanEntry->predictNumberResult += GinGetNPosting(itup); } @@ -450,16 +445,14 @@ restartScanEntry: IncrBufferRefCount(entry->buffer); page = BufferGetPage(entry->buffer); - entry->predictNumberResult = stack->predictNumber * GinPageGetOpaque(page)->maxoff; /* - * Keep page content in memory to prevent durable page locking + * Copy page content to memory to avoid keeping it locked for + * a long time. */ - entry->list = (ItemPointerData *) palloc(BLCKSZ); - entry->nlist = GinPageGetOpaque(page)->maxoff; - memcpy(entry->list, - GinDataPageGetItemPointer(page, FirstOffsetNumber), - GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData)); + entry->list = GinDataLeafPageGetItems(page, &entry->nlist); + + entry->predictNumberResult = stack->predictNumber * entry->nlist; LockBuffer(entry->buffer, GIN_UNLOCK); freeGinBtreeStack(stack); @@ -467,9 +460,10 @@ restartScanEntry: } else if (GinGetNPosting(itup) > 0) { - entry->nlist = GinGetNPosting(itup); - entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist); - memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist); + entry->list = ginReadTuple(ginstate, entry->attnum, itup, + &entry->nlist); + entry->predictNumberResult = entry->nlist; + entry->isFinished = FALSE; } } @@ -532,6 +526,7 @@ static void entryGetNextItem(GinState *ginstate, GinScanEntry entry) { Page page; + int i; for (;;) { @@ -564,35 +559,47 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry) page = BufferGetPage(entry->buffer); entry->offset = InvalidOffsetNumber; - if (!ItemPointerIsValid(&entry->curItem) || - findItemInPostingPage(page, &entry->curItem, &entry->offset)) + if (entry->list) { - /* - * Found position equal to or greater than stored - */ - entry->nlist = GinPageGetOpaque(page)->maxoff; - memcpy(entry->list, - GinDataPageGetItemPointer(page, FirstOffsetNumber), - GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData)); + pfree(entry->list); + entry->list = NULL; + } - LockBuffer(entry->buffer, GIN_UNLOCK); + /* + * If the page was concurrently split, we have to re-find the + * item we were stopped on. If the page was split more than once, + * the item might not be on this page, but somewhere to the right. + * Keep following the right-links until we re-find the correct + * page. + */ + if (ItemPointerIsValid(&entry->curItem) && + needToStepRight(page, &entry->curItem)) + { + continue; + } + + entry->list = GinDataLeafPageGetItems(page, &entry->nlist); - if (!ItemPointerIsValid(&entry->curItem) || - ginCompareItemPointers(&entry->curItem, - entry->list + entry->offset - 1) == 0) + /* re-find the item we were stopped on. */ + if (ItemPointerIsValid(&entry->curItem)) + { + for (i = 0; i < entry->nlist; i++) { - /* - * First pages are deleted or empty, or we found exact - * position, so break inner loop and continue outer one. - */ - break; + if (ginCompareItemPointers(&entry->curItem, + &entry->list[i]) < 0) + { + LockBuffer(entry->buffer, GIN_UNLOCK); + entry->offset = i + 1; + entry->curItem = entry->list[entry->offset - 1]; + return; + } } - - /* - * Find greater than entry->curItem position, store it. - */ + } + else + { + LockBuffer(entry->buffer, GIN_UNLOCK); + entry->offset = 1; /* scan all items on the page. */ entry->curItem = entry->list[entry->offset - 1]; - return; } } diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 5e6f627a76b..cd9c7c120b3 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -53,31 +53,42 @@ addItemPointersToLeafTuple(GinState *ginstate, Datum key; GinNullCategory category; IndexTuple res; + ItemPointerData *newItems, + *oldItems; + int oldNPosting, + newNPosting; + GinPostingList *compressedList; Assert(!GinIsPostingTree(old)); attnum = gintuple_get_attrnum(ginstate, old); key = gintuple_get_key(ginstate, old, &category); - /* try to build tuple with room for all the items */ - res = GinFormTuple(ginstate, attnum, key, category, - NULL, nitem + GinGetNPosting(old), - false); + /* merge the old and new posting lists */ + oldItems = ginReadTuple(ginstate, attnum, old, &oldNPosting); - if (res) + newNPosting = oldNPosting + nitem; + newItems = (ItemPointerData *) palloc(sizeof(ItemPointerData) * newNPosting); + + newNPosting = ginMergeItemPointers(newItems, + items, nitem, + oldItems, oldNPosting); + + /* Compress the posting list, and try to a build tuple with room for it */ + res = NULL; + compressedList = ginCompressPostingList(newItems, newNPosting, GinMaxItemSize, + NULL); + pfree(newItems); + if (compressedList) { - /* good, small enough */ - uint32 newnitem; - - /* fill in the posting list with union of old and new TIDs */ - newnitem = ginMergeItemPointers(GinGetPosting(res), - GinGetPosting(old), - GinGetNPosting(old), - items, nitem); - /* merge might have eliminated some duplicate items */ - GinShortenTuple(res, newnitem); + res = GinFormTuple(ginstate, attnum, key, category, + (char *) compressedList, + SizeOfGinPostingList(compressedList), + newNPosting, + false); + pfree(compressedList); } - else + if (!res) { /* posting list would be too big, convert to posting tree */ BlockNumber postingRoot; @@ -88,8 +99,8 @@ addItemPointersToLeafTuple(GinState *ginstate, * already be in order with no duplicates. */ postingRoot = createPostingTree(ginstate->index, - GinGetPosting(old), - GinGetNPosting(old), + oldItems, + oldNPosting, buildStats); /* Now insert the TIDs-to-be-added into the posting tree */ @@ -98,9 +109,10 @@ addItemPointersToLeafTuple(GinState *ginstate, buildStats); /* And build a new posting-tree-only result tuple */ - res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true); + res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true); GinSetPostingTree(res, postingRoot); } + pfree(oldItems); return res; } @@ -119,12 +131,19 @@ buildFreshLeafTuple(GinState *ginstate, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats) { - IndexTuple res; + IndexTuple res = NULL; + GinPostingList *compressedList; /* try to build a posting list tuple with all the items */ - res = GinFormTuple(ginstate, attnum, key, category, - items, nitem, false); - + compressedList = ginCompressPostingList(items, nitem, GinMaxItemSize, NULL); + if (compressedList) + { + res = GinFormTuple(ginstate, attnum, key, category, + (char *) compressedList, + SizeOfGinPostingList(compressedList), + nitem, false); + pfree(compressedList); + } if (!res) { /* posting list would be too big, build posting tree */ @@ -134,7 +153,7 @@ buildFreshLeafTuple(GinState *ginstate, * Build posting-tree-only result tuple. We do this first so as to * fail quickly if the key is too big. */ - res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true); + res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, 0, true); /* * Initialize a new posting tree with the TIDs. diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c index 840fa7724c9..f532a3dbf03 100644 --- a/src/backend/access/gin/ginpostinglist.c +++ b/src/backend/access/gin/ginpostinglist.c @@ -16,12 +16,342 @@ #include "access/gin_private.h" +#ifdef USE_ASSERT_CHECKING +#define CHECK_ENCODING_ROUNDTRIP +#endif + +/* + * For encoding purposes, item pointers are represented as 64-bit unsigned + * integers. The lowest 11 bits represent the offset number, and the next + * lowest 32 bits are the block number. That leaves 17 bits unused, ie. + * only 43 low bits are used. + * + * These 43-bit integers are encoded using varbyte encoding. In each byte, + * the 7 low bits contain data, while the highest bit is a continuation bit. + * When the continuation bit is set, the next byte is part of the same + * integer, otherwise this is the last byte of this integer. 43 bits fit + * conveniently in at most 6 bytes when varbyte encoded (the 6th byte does + * not need a continuation bit, because we know the max size to be 43 bits): + * + * 0XXXXXXX + * 1XXXXXXX 0XXXXYYY + * 1XXXXXXX 1XXXXYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY + * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY + * + * X = bits used for offset number + * Y = bits used for block number + * + * The bytes are in stored in little-endian order. + * + * An important property of this encoding is that removing an item from list + * never increases the size of the resulting compressed posting list. Proof: + * + * Removing number is actually replacement of two numbers with their sum. We + * have to prove that varbyte encoding of a sum can't be longer than varbyte + * encoding of its summands. Sum of two numbers is at most one bit wider than + * than the larger of the summands. Widening a number by one bit enlarges its + * length in varbyte encoding by at most one byte. Therefore, varbyte encoding + * of sum is at most one byte longer than varbyte encoding of larger summand. + * Lesser summand is at least one byte, so the sum cannot take more space than + * the summands, Q.E.D. + * + * This property greatly simplifies VACUUM, which can assume that posting + * lists always fit on the same page after vacuuming. Note that even though + * that holds for removing items from a posting list, you must also be + * careful to not cause expansion e.g when merging uncompressed items on the + * page into the compressed lists, when vacuuming. + */ + +/* + * How many bits do you need to encode offset number? OffsetNumber is a 16-bit + * integer, but you can't fit that many items on a page. 11 ought to be more + * than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and + * use the minimum number of bits, but that would require changing the on-disk + * format if MaxHeapTuplesPerPage changes. Better to leave some slack. + */ +#define MaxHeapTuplesPerPageBits 11 + +static inline uint64 +itemptr_to_uint64(const ItemPointer iptr) +{ + uint64 val; + + Assert(ItemPointerIsValid(iptr)); + Assert(iptr->ip_posid < (1 << MaxHeapTuplesPerPageBits)); + + val = iptr->ip_blkid.bi_hi; + val <<= 16; + val |= iptr->ip_blkid.bi_lo; + val <<= MaxHeapTuplesPerPageBits; + val |= iptr->ip_posid; + + return val; +} + +static inline void +uint64_to_itemptr(uint64 val, ItemPointer iptr) +{ + iptr->ip_posid = val & ((1 << MaxHeapTuplesPerPageBits) - 1); + val = val >> MaxHeapTuplesPerPageBits; + iptr->ip_blkid.bi_lo = val & 0xFFFF; + val = val >> 16; + iptr->ip_blkid.bi_hi = val & 0xFFFF; + + Assert(ItemPointerIsValid(iptr)); +} + +/* + * Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer. + */ +static void +encode_varbyte(uint64 val, unsigned char **ptr) +{ + unsigned char *p = *ptr; + + while (val > 0x7F) + { + *(p++) = 0x80 | (val & 0x7F); + val >>= 7; + } + *(p++) = (unsigned char) val; + + *ptr = p; +} + +/* + * Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer. + */ +static uint64 +decode_varbyte(unsigned char **ptr) +{ + uint64 val; + unsigned char *p = *ptr; + uint64 c; + + c = *(p++); + val = c & 0x7F; + if (c & 0x80) + { + c = *(p++); + val |= (c & 0x7F) << 7; + if (c & 0x80) + { + c = *(p++); + val |= (c & 0x7F) << 14; + if (c & 0x80) + { + c = *(p++); + val |= (c & 0x7F) << 21; + if (c & 0x80) + { + c = *(p++); + val |= (c & 0x7F) << 28; + if (c & 0x80) + { + c = *(p++); + val |= (c & 0x7F) << 35; + if (c & 0x80) + { + /* last byte, no continuation bit */ + c = *(p++); + val |= c << 42; + } + } + } + } + } + } + + *ptr = p; + + return val; +} + +/* + * Encode a posting list. + * + * The encoded list is returned in a palloc'd struct, which will be at most + * 'maxsize' bytes in size. The number items in the returned segment is + * returned in *nwritten. If it's not equal to nipd, not all the items fit + * in 'maxsize', and only the first *nwritten were encoded. + */ +GinPostingList * +ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize, + int *nwritten) +{ + uint64 prev; + int totalpacked = 0; + int maxbytes; + GinPostingList *result; + unsigned char *ptr; + unsigned char *endptr; + + result = palloc(maxsize); + + maxbytes = maxsize - offsetof(GinPostingList, bytes); + + /* Store the first special item */ + result->first = ipd[0]; + + prev = itemptr_to_uint64(&result->first); + + ptr = result->bytes; + endptr = result->bytes + maxbytes; + for (totalpacked = 1; totalpacked < nipd; totalpacked++) + { + uint64 val = itemptr_to_uint64(&ipd[totalpacked]); + uint64 delta = val - prev; + + Assert (val > prev); + + if (endptr - ptr >= 6) + encode_varbyte(delta, &ptr); + else + { + /* + * There are less than 6 bytes left. Have to check if the next + * item fits in that space before writing it out. + */ + unsigned char buf[6]; + unsigned char *p = buf; + + encode_varbyte(delta, &p); + if (p - buf > (endptr - ptr)) + break; /* output is full */ + + memcpy(ptr, buf, p - buf); + ptr += (p - buf); + } + prev = val; + } + result->nbytes = ptr - result->bytes; + + if (nwritten) + *nwritten = totalpacked; + + Assert(SizeOfGinPostingList(result) <= maxsize); + + /* + * Check that the encoded segment decodes back to the original items. + */ +#if defined (CHECK_ENCODING_ROUNDTRIP) + if (assert_enabled) + { + int ndecoded; + ItemPointer tmp = ginPostingListDecode(result, &ndecoded); + int i; + + Assert(ndecoded == totalpacked); + for (i = 0; i < ndecoded; i++) + Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0); + pfree(tmp); + } +#endif + + return result; +} + +/* + * Decode a compressed posting list into an array of item pointers. + * The number of items is returned in *ndecoded. + */ +ItemPointer +ginPostingListDecode(GinPostingList *plist, int *ndecoded) +{ + return ginPostingListDecodeAllSegments(plist, + SizeOfGinPostingList(plist), + ndecoded); +} + +/* + * Decode multiple posting list segments into an array of item pointers. + * The number of items is returned in *ndecoded_out. The segments are stored + * one after each other, with total size 'len' bytes. + */ +ItemPointer +ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out) +{ + ItemPointer result; + int nallocated; + uint64 val; + char *endseg = ((char *) segment) + len; + int ndecoded; + unsigned char *ptr; + unsigned char *endptr; + + /* + * Guess an initial size of the array. + */ + nallocated = segment->nbytes * 2 + 1; + result = palloc(nallocated * sizeof(ItemPointerData)); + + ndecoded = 0; + while ((char *) segment < endseg) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + /* copy the first item */ + result[ndecoded] = segment->first; + ndecoded++; + Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); + + val = itemptr_to_uint64(&segment->first); + ptr = segment->bytes; + endptr = segment->bytes + segment->nbytes; + while (ptr < endptr) + { + /* enlarge output array if needed */ + if (ndecoded >= nallocated) + { + nallocated *= 2; + result = repalloc(result, nallocated * sizeof(ItemPointerData)); + } + + val += decode_varbyte(&ptr); + + uint64_to_itemptr(val, &result[ndecoded]); + ndecoded++; + } + segment = GinNextPostingListSegment(segment); + } + + if (ndecoded_out) + *ndecoded_out = ndecoded; + return result; +} + +/* + * Add all item pointers from a bunch of posting lists to a TIDBitmap. + */ +int +ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len, + TIDBitmap *tbm) +{ + int ndecoded; + ItemPointer items; + + items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded); + tbm_add_tuples(tbm, items, ndecoded, false); + pfree(items); + + return ndecoded; +} + /* * Merge two ordered arrays of itempointers, eliminating any duplicates. * Returns the number of items in the result. * Caller is responsible that there is enough space at *dst. + * + * It's OK if 'dst' overlaps with the *beginning* of one of the arguments. */ -uint32 +int ginMergeItemPointers(ItemPointerData *dst, ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb) @@ -29,28 +359,50 @@ ginMergeItemPointers(ItemPointerData *dst, ItemPointerData *dptr = dst; ItemPointerData *aptr = a, *bptr = b; + int result; - while (aptr - a < na && bptr - b < nb) + /* + * If the argument arrays don't overlap, we can just append them to + * each other. + */ + if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0) { - int cmp = ginCompareItemPointers(aptr, bptr); - - if (cmp > 0) - *dptr++ = *bptr++; - else if (cmp == 0) + memmove(dst, a, na * sizeof(ItemPointerData)); + memmove(&dst[na], b, nb * sizeof(ItemPointerData)); + result = na + nb; + } + else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0) + { + memmove(dst, b, nb * sizeof(ItemPointerData)); + memmove(&dst[nb], a, na * sizeof(ItemPointerData)); + result = na + nb; + } + else + { + while (aptr - a < na && bptr - b < nb) { - /* we want only one copy of the identical items */ - *dptr++ = *bptr++; - aptr++; + int cmp = ginCompareItemPointers(aptr, bptr); + + if (cmp > 0) + *dptr++ = *bptr++; + else if (cmp == 0) + { + /* only keep one copy of the identical items */ + *dptr++ = *bptr++; + aptr++; + } + else + *dptr++ = *aptr++; } - else + + while (aptr - a < na) *dptr++ = *aptr++; - } - while (aptr - a < na) - *dptr++ = *aptr++; + while (bptr - b < nb) + *dptr++ = *bptr++; - while (bptr - b < nb) - *dptr++ = *bptr++; + result = dptr - dst; + } - return dptr - dst; + return result; } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 4212effbe46..6bf65c32935 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -20,8 +20,9 @@ #include "postmaster/autovacuum.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/memutils.h" -typedef struct +typedef struct GinVacuumState { Relation index; IndexBulkDeleteResult *result; @@ -29,56 +30,58 @@ typedef struct void *callback_state; GinState ginstate; BufferAccessStrategy strategy; + MemoryContext tmpCxt; } GinVacuumState; - /* - * Vacuums a list of item pointers. The original size of the list is 'nitem', - * returns the number of items remaining afterwards. + * Vacuums an uncompressed posting list. The size of the must can be specified + * in number of items (nitems). * - * If *cleaned == NULL on entry, the original array is left unmodified; if - * any items are removed, a palloc'd copy of the result is stored in *cleaned. - * Otherwise *cleaned should point to the original array, in which case it's - * modified directly. + * If none of the items need to be removed, returns NULL. Otherwise returns + * a new palloc'd array with the remaining items. The number of remaining + * items is returned in *nremaining. */ -static int -ginVacuumPostingList(GinVacuumState *gvs, ItemPointerData *items, int nitem, - ItemPointerData **cleaned) +ItemPointer +ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, + int nitem, int *nremaining) { int i, - j = 0; - - Assert(*cleaned == NULL || *cleaned == items); + remaining = 0; + ItemPointer tmpitems = NULL; /* - * just scan over ItemPointer array + * Iterate over TIDs array */ for (i = 0; i < nitem; i++) { if (gvs->callback(items + i, gvs->callback_state)) { gvs->result->tuples_removed += 1; - if (!*cleaned) + if (!tmpitems) { - *cleaned = (ItemPointerData *) palloc(sizeof(ItemPointerData) * nitem); - if (i != 0) - memcpy(*cleaned, items, sizeof(ItemPointerData) * i); + /* + * First TID to be deleted: allocate memory to hold the + * remaining items. + */ + tmpitems = palloc(sizeof(ItemPointerData) * nitem); + memcpy(tmpitems, items, sizeof(ItemPointerData) * i); } } else { gvs->result->num_index_tuples += 1; - if (i != j) - (*cleaned)[j] = items[i]; - j++; + if (tmpitems) + tmpitems[remaining] = items[i]; + remaining++; } } - return j; + *nremaining = remaining; + return tmpitems; } /* - * fills WAL record for vacuum leaf page + * Create a WAL record for vacuuming entry tree leaf page. */ static void xlogVacuumPage(Relation index, Buffer buffer) @@ -86,65 +89,64 @@ xlogVacuumPage(Relation index, Buffer buffer) Page page = BufferGetPage(buffer); XLogRecPtr recptr; XLogRecData rdata[3]; - ginxlogVacuumPage data; - char *backup; - char itups[BLCKSZ]; - uint32 len = 0; + ginxlogVacuumPage xlrec; + uint16 lower; + uint16 upper; + /* This is only used for entry tree leaf pages. */ + Assert(!GinPageIsData(page)); Assert(GinPageIsLeaf(page)); if (!RelationNeedsWAL(index)) return; - data.node = index->rd_node; - data.blkno = BufferGetBlockNumber(buffer); + xlrec.node = index->rd_node; + xlrec.blkno = BufferGetBlockNumber(buffer); + + /* Assume we can omit data between pd_lower and pd_upper */ + lower = ((PageHeader) page)->pd_lower; + upper = ((PageHeader) page)->pd_upper; + + Assert(lower < BLCKSZ); + Assert(upper < BLCKSZ); - if (GinPageIsData(page)) + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) { - backup = GinDataPageGetData(page); - data.nitem = GinPageGetOpaque(page)->maxoff; - if (data.nitem) - len = MAXALIGN(sizeof(ItemPointerData) * data.nitem); + xlrec.hole_offset = lower; + xlrec.hole_length = upper - lower; } else { - char *ptr; - OffsetNumber i; - - ptr = backup = itups; - for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++) - { - IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); - - memcpy(ptr, itup, IndexTupleSize(itup)); - ptr += MAXALIGN(IndexTupleSize(itup)); - } - - data.nitem = PageGetMaxOffsetNumber(page); - len = ptr - backup; + /* No "hole" to compress out */ + xlrec.hole_offset = 0; + xlrec.hole_length = 0; } - rdata[0].buffer = buffer; - rdata[0].buffer_std = (GinPageIsData(page)) ? FALSE : TRUE; - rdata[0].len = 0; - rdata[0].data = NULL; - rdata[0].next = rdata + 1; + rdata[0].data = (char *) &xlrec; + rdata[0].len = sizeof(ginxlogVacuumPage); + rdata[0].buffer = InvalidBuffer; + rdata[0].next = &rdata[1]; - rdata[1].buffer = InvalidBuffer; - rdata[1].len = sizeof(ginxlogVacuumPage); - rdata[1].data = (char *) &data; - - if (len == 0) + if (xlrec.hole_length == 0) { + rdata[1].data = (char *) page; + rdata[1].len = BLCKSZ; + rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; } else { - rdata[1].next = rdata + 2; - + /* must skip the hole */ + rdata[1].data = (char *) page; + rdata[1].len = xlrec.hole_offset; + rdata[1].buffer = InvalidBuffer; + rdata[1].next = &rdata[2]; + + rdata[2].data = (char *) page + (xlrec.hole_offset + xlrec.hole_length); + rdata[2].len = BLCKSZ - (xlrec.hole_offset + xlrec.hole_length); rdata[2].buffer = InvalidBuffer; - rdata[2].len = len; - rdata[2].data = backup; rdata[2].next = NULL; } @@ -158,6 +160,7 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer buffer; Page page; bool hasVoidPage = FALSE; + MemoryContext oldCxt; buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); @@ -169,7 +172,6 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, * again). New scan can't start but previously started ones work * concurrently. */ - if (isRoot) LockBufferForCleanup(buffer); else @@ -179,32 +181,14 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, if (GinPageIsLeaf(page)) { - OffsetNumber newMaxOff, - oldMaxOff = GinPageGetOpaque(page)->maxoff; - ItemPointerData *cleaned = NULL; - - newMaxOff = ginVacuumPostingList(gvs, - (ItemPointer) GinDataPageGetData(page), oldMaxOff, &cleaned); - - /* saves changes about deleted tuple ... */ - if (oldMaxOff != newMaxOff) - { - START_CRIT_SECTION(); - - if (newMaxOff > 0) - memcpy(GinDataPageGetData(page), cleaned, sizeof(ItemPointerData) * newMaxOff); - pfree(cleaned); - GinPageGetOpaque(page)->maxoff = newMaxOff; + oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); + ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(gvs->tmpCxt); - MarkBufferDirty(buffer); - xlogVacuumPage(gvs->index, buffer); - - END_CRIT_SECTION(); - - /* if root is a leaf page, we don't desire further processing */ - if (!isRoot && GinPageGetOpaque(page)->maxoff < FirstOffsetNumber) - hasVoidPage = TRUE; - } + /* if root is a leaf page, we don't desire further processing */ + if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page)) + hasVoidPage = TRUE; } else { @@ -224,7 +208,7 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, } /* - * if we have root and theres void pages in tree, then we don't release + * if we have root and there are empty pages in tree, then we don't release * lock to go further processing and guarantee that tree is unused */ if (!(isRoot && hasVoidPage)) @@ -391,6 +375,7 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer buffer; Page page; bool meDelete = FALSE; + bool isempty; if (isRoot) { @@ -429,7 +414,12 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, } } - if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber) + if (GinPageIsLeaf(page)) + isempty = GinDataLeafPageIsEmpty(page); + else + isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber; + + if (isempty) { /* we never delete the left- or rightmost branch */ if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page)) @@ -513,22 +503,47 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 } else if (GinGetNPosting(itup) > 0) { + int nitems; + ItemPointer uncompressed; + /* - * if we already created a temporary page, make changes in place + * Vacuum posting list with proper function for compressed and + * uncompressed format. */ - ItemPointerData *cleaned = (tmppage == origpage) ? NULL : GinGetPosting(itup); - int newN; - - newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned); + if (GinItupIsCompressed(itup)) + uncompressed = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems); + else + { + uncompressed = (ItemPointer) GinGetPosting(itup); + nitems = GinGetNPosting(itup); + } - if (GinGetNPosting(itup) != newN) + uncompressed = ginVacuumItemPointers(gvs, uncompressed, nitems, + &nitems); + if (uncompressed) { + /* + * Some ItemPointers were deleted, recreate tuple. + */ OffsetNumber attnum; Datum key; GinNullCategory category; + GinPostingList *plist; + int plistsize; + + if (nitems > 0) + { + plist = ginCompressPostingList(uncompressed, nitems, GinMaxItemSize, NULL); + plistsize = SizeOfGinPostingList(plist); + } + else + { + plist = NULL; + plistsize = 0; + } /* - * Some ItemPointers were deleted, recreate tuple. + * if we already created a temporary page, make changes in place */ if (tmppage == origpage) { @@ -538,15 +553,6 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 */ tmppage = PageGetTempPageCopy(origpage); - if (newN > 0) - { - Size pos = ((char *) GinGetPosting(itup)) - ((char *) origpage); - - memcpy(tmppage + pos, cleaned, sizeof(ItemPointerData) * newN); - } - - pfree(cleaned); - /* set itup pointer to new page */ itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); } @@ -554,7 +560,10 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 attnum = gintuple_get_attrnum(&gvs->ginstate, itup); key = gintuple_get_key(&gvs->ginstate, itup, &category); itup = GinFormTuple(&gvs->ginstate, attnum, key, category, - GinGetPosting(itup), newN, true); + (char *) plist, plistsize, + nitems, true); + if (plist) + pfree(plist); PageIndexTupleDelete(tmppage, i); if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) @@ -583,6 +592,11 @@ ginbulkdelete(PG_FUNCTION_ARGS) BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; + gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "Gin vacuum temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); gvs.index = index; gvs.callback = callback; gvs.callback_state = callback_state; @@ -683,6 +697,8 @@ ginbulkdelete(PG_FUNCTION_ARGS) LockBuffer(buffer, GIN_EXCLUSIVE); } + MemoryContextDelete(gvs.tmpCxt); + PG_RETURN_POINTER(gvs.result); } diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index c13e01a3c6a..9a978ce2bdb 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -78,7 +78,7 @@ static void ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record) { ginxlogCreatePostingTree *data = (ginxlogCreatePostingTree *) XLogRecGetData(record); - ItemPointerData *items = (ItemPointerData *) (XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree)); + char *ptr; Buffer buffer; Page page; @@ -89,9 +89,14 @@ ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record) Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); - GinInitBuffer(buffer, GIN_DATA | GIN_LEAF); - memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * data->nitem); - GinPageGetOpaque(page)->maxoff = data->nitem; + GinInitBuffer(buffer, GIN_DATA | GIN_LEAF | GIN_COMPRESSED); + + ptr = XLogRecGetData(record) + sizeof(ginxlogCreatePostingTree); + + /* Place page data */ + memcpy(GinDataLeafPageGetPostingList(page), ptr, data->size); + + GinDataLeafPageSetPostingListSize(page, data->size); PageSetLSN(page, lsn); @@ -100,11 +105,11 @@ ginRedoCreatePTree(XLogRecPtr lsn, XLogRecord *record) } static void -ginRedoInsertEntry(Buffer buffer, OffsetNumber offset, BlockNumber rightblkno, - void *rdata) +ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata) { Page page = BufferGetPage(buffer); ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata; + OffsetNumber offset = data->offset; IndexTuple itup; if (rightblkno != InvalidBlockNumber) @@ -138,30 +143,43 @@ ginRedoInsertEntry(Buffer buffer, OffsetNumber offset, BlockNumber rightblkno, } static void -ginRedoInsertData(Buffer buffer, OffsetNumber offset, BlockNumber rightblkno, - void *rdata) +ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) +{ + Pointer segment; + + /* Copy the new data to the right place */ + segment = ((Pointer) GinDataLeafPageGetPostingList(page)) + + data->unmodifiedsize; + memcpy(segment, data->newdata, data->length - data->unmodifiedsize); + GinDataLeafPageSetPostingListSize(page, data->length); + GinPageSetCompressed(page); +} + +static void +ginRedoInsertData(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rdata) { Page page = BufferGetPage(buffer); - if (GinPageIsLeaf(page)) + if (isLeaf) { - ginxlogInsertDataLeaf *data = (ginxlogInsertDataLeaf *) rdata; - ItemPointerData *items = data->items; - OffsetNumber i; + ginxlogRecompressDataLeaf *data = (ginxlogRecompressDataLeaf *) rdata; - for (i = 0; i < data->nitem; i++) - GinDataPageAddItemPointer(page, &items[i], offset + i); + Assert(GinPageIsLeaf(page)); + + ginRedoRecompress(page, data); } else { - PostingItem *pitem = (PostingItem *) rdata; + ginxlogInsertDataInternal *data = (ginxlogInsertDataInternal *) rdata; PostingItem *oldpitem; + Assert(!GinPageIsLeaf(page)); + /* update link to right page after split */ - oldpitem = GinDataPageGetPostingItem(page, offset); + oldpitem = GinDataPageGetPostingItem(page, data->offset); PostingItemSetBlockNumber(oldpitem, rightblkno); - GinDataPageAddPostingItem(page, pitem, offset); + GinDataPageAddPostingItem(page, &data->newitem, data->offset); } } @@ -213,12 +231,12 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) if (data->flags & GIN_INSERT_ISDATA) { Assert(GinPageIsData(page)); - ginRedoInsertData(buffer, data->offset, rightChildBlkno, payload); + ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload); } else { Assert(!GinPageIsData(page)); - ginRedoInsertEntry(buffer, data->offset, rightChildBlkno, payload); + ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload); } PageSetLSN(page, lsn); @@ -253,38 +271,42 @@ ginRedoSplitEntry(Page lpage, Page rpage, void *rdata) static void ginRedoSplitData(Page lpage, Page rpage, void *rdata) { - ginxlogSplitData *data = (ginxlogSplitData *) rdata; bool isleaf = GinPageIsLeaf(lpage); - char *ptr = (char *) rdata + sizeof(ginxlogSplitData); - OffsetNumber i; - ItemPointer bound; if (isleaf) { - ItemPointer items = (ItemPointer) ptr; - for (i = 0; i < data->separator; i++) - GinDataPageAddItemPointer(lpage, &items[i], InvalidOffsetNumber); - for (i = data->separator; i < data->nitem; i++) - GinDataPageAddItemPointer(rpage, &items[i], InvalidOffsetNumber); + ginxlogSplitDataLeaf *data = (ginxlogSplitDataLeaf *) rdata; + Pointer lptr = (Pointer) rdata + sizeof(ginxlogSplitDataLeaf); + Pointer rptr = lptr + data->lsize; + + Assert(data->lsize > 0 && data->lsize <= GinDataLeafMaxContentSize); + Assert(data->rsize > 0 && data->rsize <= GinDataLeafMaxContentSize); + + memcpy(GinDataLeafPageGetPostingList(lpage), lptr, data->lsize); + memcpy(GinDataLeafPageGetPostingList(rpage), rptr, data->rsize); + + GinDataLeafPageSetPostingListSize(lpage, data->lsize); + GinDataLeafPageSetPostingListSize(rpage, data->rsize); + *GinDataPageGetRightBound(lpage) = data->lrightbound; + *GinDataPageGetRightBound(rpage) = data->rrightbound; } else { - PostingItem *items = (PostingItem *) ptr; + ginxlogSplitDataInternal *data = (ginxlogSplitDataInternal *) rdata; + PostingItem *items = (PostingItem *) ((char *) rdata + sizeof(ginxlogSplitDataInternal)); + OffsetNumber i; + OffsetNumber maxoff; + for (i = 0; i < data->separator; i++) GinDataPageAddPostingItem(lpage, &items[i], InvalidOffsetNumber); for (i = data->separator; i < data->nitem; i++) GinDataPageAddPostingItem(rpage, &items[i], InvalidOffsetNumber); - } - - /* set up right key */ - bound = GinDataPageGetRightBound(lpage); - if (isleaf) - *bound = *GinDataPageGetItemPointer(lpage, GinPageGetOpaque(lpage)->maxoff); - else - *bound = GinDataPageGetPostingItem(lpage, GinPageGetOpaque(lpage)->maxoff)->key; - bound = GinDataPageGetRightBound(rpage); - *bound = data->rightbound; + /* set up right key */ + maxoff = GinPageGetOpaque(lpage)->maxoff; + *GinDataPageGetRightBound(lpage) = GinDataPageGetPostingItem(lpage, maxoff)->key; + *GinDataPageGetRightBound(rpage) = data->rightbound; + } } static void @@ -317,9 +339,10 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) if (isLeaf) flags |= GIN_LEAF; - if (isData) flags |= GIN_DATA; + if (isLeaf && isData) + flags |= GIN_COMPRESSED; lbuffer = XLogReadBuffer(data->node, data->lblkno, true); Assert(BufferIsValid(lbuffer)); @@ -352,7 +375,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) Buffer rootBuf = XLogReadBuffer(data->node, rootBlkno, true); Page rootPage = BufferGetPage(rootBuf); - GinInitBuffer(rootBuf, flags & ~GIN_LEAF); + GinInitBuffer(rootBuf, flags & ~GIN_LEAF & ~GIN_COMPRESSED); if (isData) { @@ -383,13 +406,20 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(lbuffer); } +/* + * This is functionally the same as heap_xlog_newpage. + */ static void ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) { - ginxlogVacuumPage *data = (ginxlogVacuumPage *) XLogRecGetData(record); + ginxlogVacuumPage *xlrec = (ginxlogVacuumPage *) XLogRecGetData(record); + char *blk = ((char *) xlrec) + sizeof(ginxlogVacuumPage); Buffer buffer; Page page; + Assert(xlrec->hole_offset < BLCKSZ); + Assert(xlrec->hole_length < BLCKSZ); + /* If we have a full-page image, restore it and we're done */ if (record->xl_info & XLR_BKP_BLOCK(0)) { @@ -397,41 +427,56 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) return; } - buffer = XLogReadBuffer(data->node, data->blkno, false); + buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, true); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); - if (lsn > PageGetLSN(page)) + if (xlrec->hole_length == 0) { - if (GinPageIsData(page)) - { - memcpy(GinDataPageGetData(page), - XLogRecGetData(record) + sizeof(ginxlogVacuumPage), - data->nitem * GinSizeOfDataPageItem(page)); - GinPageGetOpaque(page)->maxoff = data->nitem; - } - else - { - OffsetNumber i, - *tod; - IndexTuple itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogVacuumPage)); + memcpy((char *) page, blk, BLCKSZ); + } + else + { + memcpy((char *) page, blk, xlrec->hole_offset); + /* must zero-fill the hole */ + MemSet((char *) page + xlrec->hole_offset, 0, xlrec->hole_length); + memcpy((char *) page + (xlrec->hole_offset + xlrec->hole_length), + blk + xlrec->hole_offset, + BLCKSZ - (xlrec->hole_offset + xlrec->hole_length)); + } - tod = (OffsetNumber *) palloc(sizeof(OffsetNumber) * PageGetMaxOffsetNumber(page)); - for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i++) - tod[i - 1] = i; + PageSetLSN(page, lsn); - PageIndexMultiDelete(page, tod, PageGetMaxOffsetNumber(page)); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); +} - for (i = 0; i < data->nitem; i++) - { - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) - elog(ERROR, "failed to add item to index page in %u/%u/%u", - data->node.spcNode, data->node.dbNode, data->node.relNode); - itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); - } - } +static void +ginRedoVacuumDataLeafPage(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetData(record); + Buffer buffer; + Page page; + + /* If we have a full-page image, restore it and we're done */ + if (record->xl_info & XLR_BKP_BLOCK(0)) + { + (void) RestoreBackupBlock(lsn, record, 0, false, false); + return; + } + buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, false); + if (!BufferIsValid(buffer)) + return; + page = (Page) BufferGetPage(buffer); + + Assert(GinPageIsLeaf(page)); + Assert(GinPageIsData(page)); + + if (lsn > PageGetLSN(page)) + { + ginRedoRecompress(page, &xlrec->data); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -747,6 +792,9 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_GIN_VACUUM_PAGE: ginRedoVacuumPage(lsn, record); break; + case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: + ginRedoVacuumDataLeafPage(lsn, record); + break; case XLOG_GIN_DELETE_PAGE: ginRedoDeletePage(lsn, record); break; diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index f7b08ac1681..1983bcc94c7 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -47,8 +47,7 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfoString(buf, "Insert item, "); desc_node(buf, xlrec->node, xlrec->blkno); - appendStringInfo(buf, " offset: %u isdata: %c isleaf: %c", - xlrec->offset, + appendStringInfo(buf, " isdata: %c isleaf: %c", (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F'); if (!(xlrec->flags & GIN_INSERT_ISLEAF)) @@ -67,24 +66,50 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, " isdelete: %c", (((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F'); else if (xlrec->flags & GIN_INSERT_ISLEAF) - appendStringInfo(buf, " nitem: %u", - (((ginxlogInsertDataLeaf *) payload)->nitem)); + { + ginxlogRecompressDataLeaf *insertData = + (ginxlogRecompressDataLeaf *) payload; + + appendStringInfo(buf, " unmodified: %u length: %u (compressed)", + insertData->unmodifiedsize, + insertData->length); + } else + { + ginxlogInsertDataInternal *insertData = (ginxlogInsertDataInternal *) payload; appendStringInfo(buf, " pitem: %u-%u/%u", - PostingItemGetBlockNumber((PostingItem *) payload), - ItemPointerGetBlockNumber(&((PostingItem *) payload)->key), - ItemPointerGetOffsetNumber(&((PostingItem *) payload)->key)); + PostingItemGetBlockNumber(&insertData->newitem), + ItemPointerGetBlockNumber(&insertData->newitem.key), + ItemPointerGetOffsetNumber(&insertData->newitem.key)); + } } break; case XLOG_GIN_SPLIT: - appendStringInfoString(buf, "Page split, "); - desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno); - appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F'); + { + ginxlogSplit *xlrec = (ginxlogSplit *) rec; + + appendStringInfoString(buf, "Page split, "); + desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno); + appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F'); + appendStringInfo(buf, " isdata: %c isleaf: %c", + (xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F', + (xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F'); + } break; case XLOG_GIN_VACUUM_PAGE: appendStringInfoString(buf, "Vacuum page, "); desc_node(buf, ((ginxlogVacuumPage *) rec)->node, ((ginxlogVacuumPage *) rec)->blkno); break; + case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: + { + ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec; + appendStringInfoString(buf, "Vacuum data leaf page, "); + desc_node(buf, xlrec->node, xlrec->blkno); + appendStringInfo(buf, " unmodified: %u length: %u", + xlrec->data.unmodifiedsize, + xlrec->data.length); + } + break; case XLOG_GIN_DELETE_PAGE: appendStringInfoString(buf, "Delete page, "); desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno); |