diff options
-rw-r--r-- | src/backend/access/gin/gindatapage.c | 629 | ||||
-rw-r--r-- | src/backend/access/gin/ginpostinglist.c | 3 | ||||
-rw-r--r-- | src/backend/access/gin/ginxlog.c | 161 | ||||
-rw-r--r-- | src/backend/access/rmgrdesc/gindesc.c | 65 | ||||
-rw-r--r-- | src/include/access/gin_private.h | 35 |
5 files changed, 666 insertions, 227 deletions
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 313d53c354f..21ce79fc0b1 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -22,17 +22,17 @@ #include "utils/rel.h" /* - * Size of the posting lists stored on leaf pages, in bytes. The code can - * deal with any size, but random access is more efficient when a number of - * smaller lists are stored, rather than one big list. - */ -#define GinPostingListSegmentMaxSize 256 - -/* - * Existing posting lists smaller than this are recompressed, when inserting - * new items to page. + * Min, Max and Target size of posting lists stored on leaf pages, in bytes. + * + * The code can deal with any size, but random access is more efficient when + * a number of smaller lists are stored, rather than one big list. If a + * posting list would become larger than Max size as a result of insertions, + * it is split into two. If a posting list would be smaller than minimum + * size, it is merged with the next posting list. */ -#define GinPostingListSegmentMinSize 192 +#define GinPostingListSegmentMaxSize 384 +#define GinPostingListSegmentTargetSize 256 +#define GinPostingListSegmentMinSize 128 /* * At least this many items fit in a GinPostingListSegmentMaxSize-bytes @@ -55,25 +55,43 @@ typedef struct dlist_node *lastleft; /* last segment on left page */ int lsize; /* total size on left page */ int rsize; /* total size on right page */ + + bool oldformat; /* page is in pre-9.4 format on disk */ } disassembledLeaf; typedef struct { dlist_node node; /* linked list pointers */ + /*------------- + * 'action' indicates the status of this in-memory segment, compared to + * what's on disk. It is one of the GIN_SEGMENT_* action codes: + * + * UNMODIFIED no changes + * DELETE the segment is to be removed. 'seg' and 'items' are + * ignored + * INSERT this is a completely new segment + * REPLACE this replaces an existing segment with new content + * ADDITEMS like REPLACE, but no items have been removed, and we track + * in detail what items have been added to this segment, in + * 'modifieditems' + *------------- + */ + char action; + + ItemPointerData *modifieditems; + int nmodifieditems; + /* - * The following fields represent the items in this segment. - * If 'items' is not NULL, it contains a palloc'd array of the items - * in this segment. If 'seg' is not NULL, it contains the items in an - * already-compressed format. It can point to an on-disk page (!modified), - * or a palloc'd segment in memory. If both are set, they must represent - * the same items. + * The following fields represent the items in this segment. If 'items' + * is not NULL, it contains a palloc'd array of the itemsin this segment. + * If 'seg' is not NULL, it contains the items in an already-compressed + * format. It can point to an on-disk page (!modified), or a palloc'd + * segment in memory. If both are set, they must represent the same items. */ GinPostingList *seg; ItemPointer items; int nitems; /* # of items in 'items', if items != NULL */ - - bool modified; /* is this segment on page already? */ } leafSegmentInfo; static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems); @@ -84,12 +102,12 @@ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, static disassembledLeaf *disassembleLeaf(Page page); static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); -static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); - +static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, + int nNewItems); -static void dataPlaceToPageLeafRecompress(Buffer buf, - disassembledLeaf *leaf, - XLogRecData **prdata); +static XLogRecData *constructLeafRecompressWALData(Buffer buf, + disassembledLeaf *leaf); +static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); static void dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, @@ -563,15 +581,21 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (!needsplit) { /* - * Great, all the items fit on a single page. Write the segments to - * the page, and WAL-log appropriately. + * Great, all the items fit on a single page. Construct a WAL record + * describing the changes we made, and write the segments back to the + * page. * * Once we start modifying the page, there's no turning back. The * caller is responsible for calling END_CRIT_SECTION() after writing * the WAL record. */ + MemoryContextSwitchTo(oldCxt); + if (RelationNeedsWAL(btree->index)) + *prdata = constructLeafRecompressWALData(buf, leaf); + else + *prdata = NULL; START_CRIT_SECTION(); - dataPlaceToPageLeafRecompress(buf, leaf, prdata); + dataPlaceToPageLeafRecompress(buf, leaf); if (append) elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", @@ -619,8 +643,6 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, break; } - /* don't consider segments moved to right as unmodified */ - lastleftinfo->modified = true; leaf->lsize -= segsize; leaf->rsize += segsize; leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft); @@ -716,14 +738,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) /* Removing an item never increases the size of the segment */ if (npacked != ncleaned) elog(ERROR, "could not fit vacuumed posting list"); + seginfo->action = GIN_SEGMENT_REPLACE; } else { seginfo->seg = NULL; seginfo->items = NULL; + seginfo->action = GIN_SEGMENT_DELETE; } seginfo->nitems = ncleaned; - seginfo->modified = true; removedsomething = true; } @@ -733,11 +756,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) * If we removed any items, reconstruct the page from the pieces. * * We don't try to re-encode the segments here, even though some of them - * might be really small, now that we've removed some items from them. - * It seems like a waste of effort, as there isn't really any benefit from - * larger segments per se; larger segments only help you to pack more - * items in the same space. We might as well delay doing that until the - * next insertion, which will need to re-encode at least part of the page + * might be really small now that we've removed some items from them. It + * seems like a waste of effort, as there isn't really any benefit from + * larger segments per se; larger segments only help to pack more items + * in the same space. We might as well delay doing that until the next + * insertion, which will need to re-encode at least part of the page * anyway. * * Also note if the page was in uncompressed, pre-9.4 format before, it @@ -748,10 +771,35 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) */ if (removedsomething) { - XLogRecData *payloadrdata; + XLogRecData *payloadrdata = NULL; + bool modified; + + /* + * Make sure we have a palloc'd copy of all segments, after the first + * segment that is modified. (dataPlaceToPageLeafRecompress requires + * this). + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + modified = true; + if (modified && seginfo->action != GIN_SEGMENT_DELETE) + { + int segsize = SizeOfGinPostingList(seginfo->seg); + GinPostingList *tmp = (GinPostingList *) palloc(segsize); + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + + if (RelationNeedsWAL(indexrel)) + payloadrdata = constructLeafRecompressWALData(buffer, leaf); START_CRIT_SECTION(); - dataPlaceToPageLeafRecompress(buffer, leaf, &payloadrdata); + dataPlaceToPageLeafRecompress(buffer, leaf); MarkBufferDirty(buffer); @@ -778,96 +826,169 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) } /* + * Construct a ginxlogRecompressDataLeaf record representing the changes + * in *leaf. + */ +static XLogRecData * +constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) +{ + int nmodified = 0; + char *walbufbegin; + char *walbufend; + XLogRecData *rdata; + dlist_iter iter; + int segno; + ginxlogRecompressDataLeaf *recompress_xlog; + + /* Count the modified segments */ + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) + nmodified++; + } + + walbufbegin = palloc( + sizeof(ginxlogRecompressDataLeaf) + + BLCKSZ + /* max size needed to hold the segment data */ + nmodified * 2 + /* (segno + action) per action */ + sizeof(XLogRecData)); + walbufend = walbufbegin; + + recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend; + walbufend += sizeof(ginxlogRecompressDataLeaf); + + recompress_xlog->nactions = nmodified; + + segno = 0; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + int segsize = 0; + int datalen; + uint8 action = seginfo->action; + + if (action == GIN_SEGMENT_UNMODIFIED) + { + segno++; + continue; + } + + if (action != GIN_SEGMENT_DELETE) + segsize = SizeOfGinPostingList(seginfo->seg); + + /* + * If storing the uncompressed list of added item pointers would take + * more space than storing the compressed segment as is, do that + * instead. + */ + if (action == GIN_SEGMENT_ADDITEMS && + seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize) + { + action = GIN_SEGMENT_REPLACE; + } + + *((uint8 *) (walbufend++)) = segno; + *(walbufend++) = action; + + switch (action) + { + case GIN_SEGMENT_DELETE: + datalen = 0; + break; + + case GIN_SEGMENT_ADDITEMS: + datalen = seginfo->nmodifieditems * sizeof(ItemPointerData); + memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16)); + memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen); + datalen += sizeof(uint16); + break; + + case GIN_SEGMENT_INSERT: + case GIN_SEGMENT_REPLACE: + datalen = SHORTALIGN(segsize); + memcpy(walbufend, seginfo->seg, segsize); + break; + + default: + elog(ERROR, "unexpected GIN leaf action %d", action); + } + walbufend += datalen; + + if (action != GIN_SEGMENT_INSERT) + segno++; + } + + rdata = (XLogRecData *) MAXALIGN(walbufend); + rdata->buffer = buf; + rdata->buffer_std = TRUE; + rdata->data = walbufbegin; + rdata->len = walbufend - walbufbegin; + rdata->next = NULL; + + return rdata; +} + +/* * Assemble a disassembled posting tree leaf page back to a buffer. * * *prdata is filled with WAL information about this operation. The caller * is responsible for inserting to the WAL, along with any other information * about the operation that triggered this recompression. * - * NOTE: The segment pointers can point directly to the same buffer, with - * the limitation that any earlier segment must not overlap with an original, - * later segment. In other words, some segments may point the original buffer - * as long as you don't make any segments larger. Currently, leafRepackItems - * satisies this rule because it rewrites all segments after the first - * modified one, and vacuum can only make segments shorter. + * NOTE: The segment pointers must not point directly to the same buffer, + * except for segments that have not been modified and whose preceding + * segments have not been modified either. */ static void -dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf, - XLogRecData **prdata) +dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) { Page page = BufferGetPage(buf); char *ptr; int newsize; - int unmodifiedsize; bool modified = false; dlist_iter iter; + int segsize; + /* - * these must be static so they can be returned to caller (no pallocs - * since we're in a critical section!) + * If the page was in pre-9.4 format before, convert the header, and + * force all segments to be copied to the page whether they were modified + * or not. */ - static ginxlogRecompressDataLeaf recompress_xlog; - static XLogRecData rdata[2]; + if (!GinPageIsCompressed(page)) + { + Assert(leaf->oldformat); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + modified = true; + } ptr = (char *) GinDataLeafPageGetPostingList(page); newsize = 0; - unmodifiedsize = 0; - modified = false; dlist_foreach(iter, &leaf->segments) { leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); - int segsize; - if (seginfo->modified) + if (seginfo->action != GIN_SEGMENT_UNMODIFIED) modified = true; - /* - * Nothing to do with empty segments, except keep track if they've been - * modified - */ - if (seginfo->seg == NULL) + if (seginfo->action != GIN_SEGMENT_DELETE) { - Assert(seginfo->items == NULL); - continue; - } + segsize = SizeOfGinPostingList(seginfo->seg); - segsize = SizeOfGinPostingList(seginfo->seg); + if (modified) + memcpy(ptr, seginfo->seg, segsize); - if (!modified) - unmodifiedsize += segsize; - else - { - /* - * Use memmove rather than memcpy, in case the segment points - * to the same buffer - */ - memmove(ptr, seginfo->seg, segsize); + ptr += segsize; + newsize += segsize; } - ptr += segsize; - newsize += segsize; } + Assert(newsize <= GinDataLeafMaxContentSize); GinDataLeafPageSetPostingListSize(page, newsize); - - /* Reset these in case the page was in pre-9.4 format before */ - GinPageSetCompressed(page); - GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; - - /* Put WAL data */ - recompress_xlog.length = (uint16) newsize; - recompress_xlog.unmodifiedsize = unmodifiedsize; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &recompress_xlog; - rdata[0].len = offsetof(ginxlogRecompressDataLeaf, newdata); - rdata[0].next = &rdata[1]; - - rdata[1].buffer = buf; - rdata[1].buffer_std = TRUE; - rdata[1].data = ((char *) GinDataLeafPageGetPostingList(page)) + unmodifiedsize; - rdata[1].len = newsize - unmodifiedsize; - rdata[1].next = NULL; - - *prdata = rdata; } /* @@ -912,11 +1033,14 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, node = dlist_next_node(&leaf->segments, node)) { seginfo = dlist_container(leafSegmentInfo, node, node); - segsize = SizeOfGinPostingList(seginfo->seg); - memcpy(ptr, seginfo->seg, segsize); - ptr += segsize; - lsize += segsize; + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + lsize += segsize; + } } Assert(lsize == leaf->lsize); GinDataLeafPageSetPostingListSize(lpage, lsize); @@ -930,11 +1054,14 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, node = dlist_next_node(&leaf->segments, node)) { seginfo = dlist_container(leafSegmentInfo, node, node); - segsize = SizeOfGinPostingList(seginfo->seg); - memcpy(ptr, seginfo->seg, segsize); - ptr += segsize; - rsize += segsize; + if (seginfo->action != GIN_SEGMENT_DELETE) + { + segsize = SizeOfGinPostingList(seginfo->seg); + memcpy(ptr, seginfo->seg, segsize); + ptr += segsize; + rsize += segsize; + } if (!dlist_has_next(&leaf->segments, node)) break; @@ -1195,14 +1322,15 @@ disassembleLeaf(Page page) { leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); + seginfo->action = GIN_SEGMENT_UNMODIFIED; seginfo->seg = seg; seginfo->items = NULL; seginfo->nitems = 0; - seginfo->modified = false; dlist_push_tail(&leaf->segments, &seginfo->node); seg = GinNextPostingListSegment(seg); } + leaf->oldformat = false; } else { @@ -1218,14 +1346,15 @@ disassembleLeaf(Page page) seginfo = palloc(sizeof(leafSegmentInfo)); + seginfo->action = GIN_SEGMENT_REPLACE; seginfo->seg = NULL; seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData)); memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData)); seginfo->nitems = nuncompressed; - /* make sure we rewrite this to disk */ - seginfo->modified = true; dlist_push_tail(&leaf->segments, &seginfo->node); + + leaf->oldformat = true; } return leaf; @@ -1247,6 +1376,7 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) ItemPointer nextnew = newItems; int newleft = nNewItems; bool modified = false; + leafSegmentInfo *newseg; /* * If the page is completely empty, just construct one new segment to @@ -1254,14 +1384,12 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) */ if (dlist_is_empty(&leaf->segments)) { - /* create a new segment for the new entries */ - leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo)); - - seginfo->seg = NULL; - seginfo->items = newItems; - seginfo->nitems = nNewItems; - seginfo->modified = true; - dlist_push_tail(&leaf->segments, &seginfo->node); + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = newItems; + newseg->nitems = nNewItems; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); return true; } @@ -1303,15 +1431,51 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) if (!cur->items) cur->items = ginPostingListDecode(cur->seg, &cur->nitems); + /* + * Fast path for the important special case that we're appending to + * the end of the page: don't let the last segment on the page grow + * larger than the target, create a new segment before that happens. + */ + if (!dlist_has_next(&leaf->segments, iter.cur) && + ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 && + cur->seg != NULL && + SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize) + { + newseg = palloc(sizeof(leafSegmentInfo)); + newseg->seg = NULL; + newseg->items = nextnew; + newseg->nitems = nthis; + newseg->action = GIN_SEGMENT_INSERT; + dlist_push_tail(&leaf->segments, &newseg->node); + modified = true; + break; + } + tmpitems = ginMergeItemPointers(cur->items, cur->nitems, nextnew, nthis, &ntmpitems); if (ntmpitems != cur->nitems) { + /* + * If there are no duplicates, track the added items so that we + * can emit a compact ADDITEMS WAL record later on. (it doesn't + * seem worth re-checking which items were duplicates, if there + * were any) + */ + if (ntmpitems == nthis + cur->nitems && + cur->action == GIN_SEGMENT_UNMODIFIED) + { + cur->action = GIN_SEGMENT_ADDITEMS; + cur->modifieditems = nextnew; + cur->nmodifieditems = nthis; + } + else + cur->action = GIN_SEGMENT_REPLACE; + cur->items = tmpitems; cur->nitems = ntmpitems; cur->seg = NULL; - modified = cur->modified = true; + modified = true; } nextnew += nthis; @@ -1331,133 +1495,160 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems) * If all items fit, *remaining is set to invalid. * * Returns true if the page has to be split. - * - * XXX: Actually, this re-encodes all segments after the first one that was - * modified, to make sure the new segments are all more or less of equal - * length. That's unnecessarily aggressive; if we've only added a single item - * to one segment, for example, we could re-encode just that single segment, - * as long as it's still smaller than, say, 2x the normal segment size. */ static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) { - dlist_iter iter; - ItemPointer allitems; - int nallitems; int pgused = 0; - bool needsplit; - int totalpacked; - dlist_mutable_iter miter; - dlist_head recode_list; - int nrecode; - bool recoding; + bool needsplit = false; + dlist_iter iter; + int segsize; + leafSegmentInfo *nextseg; + int npacked; + bool modified; + dlist_node *cur_node; + dlist_node *next_node; ItemPointerSetInvalid(remaining); /* - * Find the first segment that needs to be re-coded. Move all segments - * that need recoding to separate list, and count the total number of - * items in them. Also, add up the number of bytes in unmodified segments - * (pgused). + * cannot use dlist_foreach_modify here because we insert adjacent items + * while iterating. */ - dlist_init(&recode_list); - recoding = false; - nrecode = 0; - pgused = 0; - dlist_foreach_modify(miter, &leaf->segments) + for (cur_node = dlist_head_node(&leaf->segments); + cur_node != NULL; + cur_node = next_node) { - leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, miter.cur); + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + cur_node); - /* If the segment was modified, re-encode it */ - if (seginfo->modified || seginfo->seg == NULL) - recoding = true; - /* - * Also re-encode abnormally small or large segments. (Vacuum can - * leave behind small segments, and conversion from pre-9.4 format - * can leave behind large segments). - */ - else if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize) - recoding = true; - else if (SizeOfGinPostingList(seginfo->seg) > GinPostingListSegmentMaxSize) - recoding = true; + if (dlist_has_next(&leaf->segments, cur_node)) + next_node = dlist_next_node(&leaf->segments, cur_node); + else + next_node = NULL; - if (recoding) + /* Compress the posting list, if necessary */ + if (seginfo->action != GIN_SEGMENT_DELETE) { - if (!seginfo->items) - seginfo->items = ginPostingListDecode(seginfo->seg, - &seginfo->nitems); - nrecode += seginfo->nitems; - - dlist_delete(miter.cur); - dlist_push_tail(&recode_list, &seginfo->node); - } - else - pgused += SizeOfGinPostingList(seginfo->seg); - } + if (seginfo->seg == NULL) + { + if (seginfo->nitems > GinPostingListSegmentMaxSize) + npacked = 0; /* no chance that it would fit. */ + else + { + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentMaxSize, + &npacked); + } + if (npacked != seginfo->nitems) + { + /* + * Too large. Compress again to the target size, and create + * a new segment to represent the remaining items. The new + * segment is inserted after this one, so it will be + * processed in the next iteration of this loop. + */ + if (seginfo->seg) + pfree(seginfo->seg); + seginfo->seg = ginCompressPostingList(seginfo->items, + seginfo->nitems, + GinPostingListSegmentTargetSize, + &npacked); + if (seginfo->action != GIN_SEGMENT_INSERT) + seginfo->action = GIN_SEGMENT_REPLACE; + + nextseg = palloc(sizeof(leafSegmentInfo)); + nextseg->action = GIN_SEGMENT_INSERT; + nextseg->seg = NULL; + nextseg->items = &seginfo->items[npacked]; + nextseg->nitems = seginfo->nitems - npacked; + next_node = &nextseg->node; + dlist_insert_after(cur_node, next_node); + } + } - if (nrecode == 0) - return false; /* nothing to do */ + /* + * If the segment is very small, merge it with the next segment. + */ + if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node) + { + int nmerged; + + nextseg = dlist_container(leafSegmentInfo, node, next_node); + + if (seginfo->items == NULL) + seginfo->items = ginPostingListDecode(seginfo->seg, + &seginfo->nitems); + if (nextseg->items == NULL) + nextseg->items = ginPostingListDecode(nextseg->seg, + &nextseg->nitems); + nextseg->items = + ginMergeItemPointers(seginfo->items, seginfo->nitems, + nextseg->items, nextseg->nitems, + &nmerged); + Assert(nmerged == seginfo->nitems + nextseg->nitems); + nextseg->nitems = nmerged; + nextseg->seg = NULL; + + nextseg->action = GIN_SEGMENT_REPLACE; + nextseg->modifieditems = NULL; + nextseg->nmodifieditems = 0; + + if (seginfo->action == GIN_SEGMENT_INSERT) + { + dlist_delete(cur_node); + continue; + } + else + { + seginfo->action = GIN_SEGMENT_DELETE; + seginfo->seg = NULL; + } + } - /* - * Construct one big array of the items that need to be re-encoded. - */ - allitems = palloc(nrecode * sizeof(ItemPointerData)); - nallitems = 0; - dlist_foreach(iter, &recode_list) - { - leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur); - memcpy(&allitems[nallitems], seginfo->items, seginfo->nitems * sizeof(ItemPointerData)); - nallitems += seginfo->nitems; - } - Assert(nallitems == nrecode); + seginfo->items = NULL; + seginfo->nitems = 0; + } - /* - * Start packing the items into segments. Stop when we have consumed - * enough space to fill both pages, or we run out of items. - */ - totalpacked = 0; - needsplit = false; - while (totalpacked < nallitems) - { - leafSegmentInfo *seginfo; - int npacked; - GinPostingList *seg; - int segsize; + if (seginfo->action == GIN_SEGMENT_DELETE) + continue; - seg = ginCompressPostingList(&allitems[totalpacked], - nallitems - totalpacked, - GinPostingListSegmentMaxSize, - &npacked); - segsize = SizeOfGinPostingList(seg); + /* + * OK, we now have a compressed version of this segment ready for + * copying to the page. Did we exceed the size that fits on one page? + */ + segsize = SizeOfGinPostingList(seginfo->seg); if (pgused + segsize > GinDataLeafMaxContentSize) { if (!needsplit) { /* switch to right page */ Assert(pgused > 0); - leaf->lastleft = dlist_tail_node(&leaf->segments); + leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node); needsplit = true; leaf->lsize = pgused; pgused = 0; } else { - /* filled both pages */ - *remaining = allitems[totalpacked]; + /* + * Filled both pages. The last segment we constructed did not + * fit. + */ + *remaining = seginfo->seg->first; + + /* + * remove all segments that did not fit from the list. + */ + while (dlist_has_next(&leaf->segments, cur_node)) + dlist_delete(dlist_next_node(&leaf->segments, cur_node)); + dlist_delete(cur_node); break; } } - seginfo = palloc(sizeof(leafSegmentInfo)); - seginfo->seg = seg; - seginfo->items = &allitems[totalpacked]; - seginfo->nitems = npacked; - seginfo->modified = true; - - dlist_push_tail(&leaf->segments, &seginfo->node); - pgused += segsize; - totalpacked += npacked; } if (!needsplit) @@ -1471,6 +1662,32 @@ leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining) Assert(leaf->lsize <= GinDataLeafMaxContentSize); Assert(leaf->rsize <= GinDataLeafMaxContentSize); + /* + * Make a palloc'd copy of every segment after the first modified one, + * because as we start copying items to the original page, we might + * overwrite an existing segment. + */ + modified = false; + dlist_foreach(iter, &leaf->segments) + { + leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, + iter.cur); + + if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED) + { + modified = true; + } + else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED) + { + GinPostingList *tmp; + + segsize = SizeOfGinPostingList(seginfo->seg); + tmp = palloc(segsize); + memcpy(tmp, seginfo->seg, segsize); + seginfo->seg = tmp; + } + } + return needsplit; } diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c index 4c4110d54b1..9d68a980722 100644 --- a/src/backend/access/gin/ginpostinglist.c +++ b/src/backend/access/gin/ginpostinglist.c @@ -298,9 +298,10 @@ ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_ } /* copy the first item */ + Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); + Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0); result[ndecoded] = segment->first; ndecoded++; - Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first))); val = itemptr_to_uint64(&segment->first); ptr = segment->bytes; diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index c46f7ac4f7e..02e566cc685 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -145,15 +145,158 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda static void ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) { - Pointer segment; - - /* Copy the new data to the right place */ - segment = ((Pointer) GinDataLeafPageGetPostingList(page)) - + data->unmodifiedsize; - memcpy(segment, data->newdata, data->length - data->unmodifiedsize); - GinDataLeafPageSetPostingListSize(page, data->length); - GinPageSetCompressed(page); - GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + int actionno; + int segno; + GinPostingList *oldseg; + Pointer segmentend; + char *walbuf; + int totalsize; + + /* + * If the page is in pre-9.4 format, convert to new format first. + */ + if (!GinPageIsCompressed(page)) + { + ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page); + int nuncompressed = GinPageGetOpaque(page)->maxoff; + int npacked; + GinPostingList *plist; + + plist = ginCompressPostingList(uncompressed, nuncompressed, + BLCKSZ, &npacked); + Assert(npacked == nuncompressed); + + totalsize = SizeOfGinPostingList(plist); + + memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize); + GinDataLeafPageSetPostingListSize(page, totalsize); + GinPageSetCompressed(page); + GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; + } + + oldseg = GinDataLeafPageGetPostingList(page); + segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page); + segno = 0; + + walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf); + for (actionno = 0; actionno < data->nactions; actionno++) + { + uint8 a_segno = *((uint8 *) (walbuf++)); + uint8 a_action = *((uint8 *) (walbuf++)); + GinPostingList *newseg = NULL; + int newsegsize = 0; + ItemPointerData *items = NULL; + uint16 nitems = 0; + ItemPointerData *olditems; + int nolditems; + ItemPointerData *newitems; + int nnewitems; + int segsize; + Pointer segptr; + int szleft; + + /* Extract all the information we need from the WAL record */ + if (a_action == GIN_SEGMENT_INSERT || + a_action == GIN_SEGMENT_REPLACE) + { + newseg = (GinPostingList *) walbuf; + newsegsize = SizeOfGinPostingList(newseg); + walbuf += SHORTALIGN(newsegsize); + } + + if (a_action == GIN_SEGMENT_ADDITEMS) + { + memcpy(&nitems, walbuf, sizeof(uint16)); + walbuf += sizeof(uint16); + items = (ItemPointerData *) walbuf; + walbuf += nitems * sizeof(ItemPointerData); + } + + /* Skip to the segment that this action concerns */ + Assert(segno <= a_segno); + while (segno < a_segno) + { + oldseg = GinNextPostingListSegment(oldseg); + segno++; + } + + /* + * ADDITEMS action is handled like REPLACE, but the new segment to + * replace the old one is reconstructed using the old segment from + * disk and the new items from the WAL record. + */ + if (a_action == GIN_SEGMENT_ADDITEMS) + { + int npacked; + + olditems = ginPostingListDecode(oldseg, &nolditems); + + newitems = ginMergeItemPointers(items, nitems, + olditems, nolditems, + &nnewitems); + Assert(nnewitems == nolditems + nitems); + + newseg = ginCompressPostingList(newitems, nnewitems, + BLCKSZ, &npacked); + Assert(npacked == nnewitems); + + newsegsize = SizeOfGinPostingList(newseg); + a_action = GIN_SEGMENT_REPLACE; + } + + segptr = (Pointer) oldseg; + if (segptr != segmentend) + segsize = SizeOfGinPostingList(oldseg); + else + { + /* + * Positioned after the last existing segment. Only INSERTs + * expected here. + */ + Assert(a_action == GIN_SEGMENT_INSERT); + segsize = 0; + } + szleft = segmentend - segptr; + + switch (a_action) + { + case GIN_SEGMENT_DELETE: + memmove(segptr, segptr + segsize, szleft - segsize); + segmentend -= segsize; + + segno++; + break; + + case GIN_SEGMENT_INSERT: + /* make room for the new segment */ + memmove(segptr + newsegsize, segptr, szleft); + /* copy the new segment in place */ + memcpy(segptr, newseg, newsegsize); + segmentend += newsegsize; + segptr += newsegsize; + break; + + case GIN_SEGMENT_REPLACE: + /* shift the segments that follow */ + memmove(segptr + newsegsize, + segptr + segsize, + szleft - segsize); + /* copy the replacement segment in place */ + memcpy(segptr, newseg, newsegsize); + segmentend -= segsize; + segmentend += newsegsize; + segptr += newsegsize; + segno++; + break; + + default: + elog(ERROR, "unexpected GIN leaf action: %u", a_action); + } + oldseg = (GinPostingList *) segptr; + } + + totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page); + GinDataLeafPageSetPostingListSize(page, totalsize); } static void diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index 1983bcc94c7..aa60c8db65c 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -25,6 +25,57 @@ desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno) node.spcNode, node.dbNode, node.relNode, blkno); } +static void +desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData) +{ + int i; + char *walbuf = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf); + + appendStringInfo(buf, " %d segments:", (int) insertData->nactions); + + for (i = 0; i < insertData->nactions; i++) + { + uint8 a_segno = *((uint8 *) (walbuf++)); + uint8 a_action = *((uint8 *) (walbuf++)); + uint16 nitems = 0; + int newsegsize = 0; + + if (a_action == GIN_SEGMENT_INSERT || + a_action == GIN_SEGMENT_REPLACE) + { + newsegsize = SizeOfGinPostingList((GinPostingList *) walbuf); + walbuf += SHORTALIGN(newsegsize); + } + + if (a_action == GIN_SEGMENT_ADDITEMS) + { + memcpy(&nitems, walbuf, sizeof(uint16)); + walbuf += sizeof(uint16); + walbuf += nitems * sizeof(ItemPointerData); + } + + switch(a_action) + { + case GIN_SEGMENT_ADDITEMS: + appendStringInfo(buf, " %d (add %d items)", a_segno, nitems); + break; + case GIN_SEGMENT_DELETE: + appendStringInfo(buf, " %d (delete)", a_segno); + break; + case GIN_SEGMENT_INSERT: + appendStringInfo(buf, " %d (insert)", a_segno); + break; + case GIN_SEGMENT_REPLACE: + appendStringInfo(buf, " %d (replace)", a_segno); + break; + default: + appendStringInfo(buf, " %d unknown action %d ???", a_segno, a_action); + /* cannot decode unrecognized actions further */ + return; + } + } +} + void gin_desc(StringInfo buf, uint8 xl_info, char *rec) { @@ -70,9 +121,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec) ginxlogRecompressDataLeaf *insertData = (ginxlogRecompressDataLeaf *) payload; - appendStringInfo(buf, " unmodified: %u length: %u (compressed)", - insertData->unmodifiedsize, - insertData->length); + if (xl_info & XLR_BKP_BLOCK(0)) + appendStringInfo(buf, " (full page image)"); + else + desc_recompress_leaf(buf, insertData); } else { @@ -105,9 +157,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec) ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec; appendStringInfoString(buf, "Vacuum data leaf page, "); desc_node(buf, xlrec->node, xlrec->blkno); - appendStringInfo(buf, " unmodified: %u length: %u", - xlrec->data.unmodifiedsize, - xlrec->data.length); + if (xl_info & XLR_BKP_BLOCK(0)) + appendStringInfo(buf, " (full page image)"); + else + desc_recompress_leaf(buf, &xlrec->data); } break; case XLOG_GIN_DELETE_PAGE: diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index d11811acb57..d33c216b757 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -406,7 +406,8 @@ typedef struct * whose split this insertion finishes. As BlockIdData[2] (beware of adding * fields before this that would make them not 16-bit aligned) * - * 2. one of the following structs, depending on tree type. + * 2. an ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending + * on tree type. * * NB: the below structs are only 16-bit aligned when appended to a * ginxlogInsert struct! Beware of adding fields to them that require @@ -421,15 +422,39 @@ typedef struct IndexTupleData tuple; /* variable length */ } ginxlogInsertEntry; + typedef struct { - uint16 length; - uint16 unmodifiedsize; + uint16 nactions; - /* compressed segments, variable length */ - char newdata[1]; + /* Variable number of 'actions' follow */ } ginxlogRecompressDataLeaf; +/* + * Note: this struct is currently not used in code, and only acts as + * documentation. The WAL record format is as specified here, but the code + * uses straight access through a Pointer and memcpy to read/write these. + */ +typedef struct +{ + uint8 segno; /* segment this action applies to */ + char type; /* action type (see below) */ + + /* + * Action-specific data follows. For INSERT and REPLACE actions that is a + * GinPostingList struct. For ADDITEMS, a uint16 for the number of items + * added, followed by the items themselves as ItemPointers. DELETE actions + * have no further data. + */ +} ginxlogSegmentAction; + +/* Action types */ +#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */ +#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */ +#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */ +#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */ +#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */ + typedef struct { OffsetNumber offset; |