aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/access/gin/gindatapage.c629
-rw-r--r--src/backend/access/gin/ginpostinglist.c3
-rw-r--r--src/backend/access/gin/ginxlog.c161
-rw-r--r--src/backend/access/rmgrdesc/gindesc.c65
-rw-r--r--src/include/access/gin_private.h35
5 files changed, 666 insertions, 227 deletions
diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c
index 313d53c354f..21ce79fc0b1 100644
--- a/src/backend/access/gin/gindatapage.c
+++ b/src/backend/access/gin/gindatapage.c
@@ -22,17 +22,17 @@
#include "utils/rel.h"
/*
- * Size of the posting lists stored on leaf pages, in bytes. The code can
- * deal with any size, but random access is more efficient when a number of
- * smaller lists are stored, rather than one big list.
- */
-#define GinPostingListSegmentMaxSize 256
-
-/*
- * Existing posting lists smaller than this are recompressed, when inserting
- * new items to page.
+ * Min, Max and Target size of posting lists stored on leaf pages, in bytes.
+ *
+ * The code can deal with any size, but random access is more efficient when
+ * a number of smaller lists are stored, rather than one big list. If a
+ * posting list would become larger than Max size as a result of insertions,
+ * it is split into two. If a posting list would be smaller than minimum
+ * size, it is merged with the next posting list.
*/
-#define GinPostingListSegmentMinSize 192
+#define GinPostingListSegmentMaxSize 384
+#define GinPostingListSegmentTargetSize 256
+#define GinPostingListSegmentMinSize 128
/*
* At least this many items fit in a GinPostingListSegmentMaxSize-bytes
@@ -55,25 +55,43 @@ typedef struct
dlist_node *lastleft; /* last segment on left page */
int lsize; /* total size on left page */
int rsize; /* total size on right page */
+
+ bool oldformat; /* page is in pre-9.4 format on disk */
} disassembledLeaf;
typedef struct
{
dlist_node node; /* linked list pointers */
+ /*-------------
+ * 'action' indicates the status of this in-memory segment, compared to
+ * what's on disk. It is one of the GIN_SEGMENT_* action codes:
+ *
+ * UNMODIFIED no changes
+ * DELETE the segment is to be removed. 'seg' and 'items' are
+ * ignored
+ * INSERT this is a completely new segment
+ * REPLACE this replaces an existing segment with new content
+ * ADDITEMS like REPLACE, but no items have been removed, and we track
+ * in detail what items have been added to this segment, in
+ * 'modifieditems'
+ *-------------
+ */
+ char action;
+
+ ItemPointerData *modifieditems;
+ int nmodifieditems;
+
/*
- * The following fields represent the items in this segment.
- * If 'items' is not NULL, it contains a palloc'd array of the items
- * in this segment. If 'seg' is not NULL, it contains the items in an
- * already-compressed format. It can point to an on-disk page (!modified),
- * or a palloc'd segment in memory. If both are set, they must represent
- * the same items.
+ * The following fields represent the items in this segment. If 'items'
+ * is not NULL, it contains a palloc'd array of the itemsin this segment.
+ * If 'seg' is not NULL, it contains the items in an already-compressed
+ * format. It can point to an on-disk page (!modified), or a palloc'd
+ * segment in memory. If both are set, they must represent the same items.
*/
GinPostingList *seg;
ItemPointer items;
int nitems; /* # of items in 'items', if items != NULL */
-
- bool modified; /* is this segment on page already? */
} leafSegmentInfo;
static ItemPointer dataLeafPageGetUncompressed(Page page, int *nitems);
@@ -84,12 +102,12 @@ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf,
static disassembledLeaf *disassembleLeaf(Page page);
static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining);
-static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems);
-
+static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems,
+ int nNewItems);
-static void dataPlaceToPageLeafRecompress(Buffer buf,
- disassembledLeaf *leaf,
- XLogRecData **prdata);
+static XLogRecData *constructLeafRecompressWALData(Buffer buf,
+ disassembledLeaf *leaf);
+static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf);
static void dataPlaceToPageLeafSplit(Buffer buf,
disassembledLeaf *leaf,
ItemPointerData lbound, ItemPointerData rbound,
@@ -563,15 +581,21 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
if (!needsplit)
{
/*
- * Great, all the items fit on a single page. Write the segments to
- * the page, and WAL-log appropriately.
+ * Great, all the items fit on a single page. Construct a WAL record
+ * describing the changes we made, and write the segments back to the
+ * page.
*
* Once we start modifying the page, there's no turning back. The
* caller is responsible for calling END_CRIT_SECTION() after writing
* the WAL record.
*/
+ MemoryContextSwitchTo(oldCxt);
+ if (RelationNeedsWAL(btree->index))
+ *prdata = constructLeafRecompressWALData(buf, leaf);
+ else
+ *prdata = NULL;
START_CRIT_SECTION();
- dataPlaceToPageLeafRecompress(buf, leaf, prdata);
+ dataPlaceToPageLeafRecompress(buf, leaf);
if (append)
elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)",
@@ -619,8 +643,6 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack,
break;
}
- /* don't consider segments moved to right as unmodified */
- lastleftinfo->modified = true;
leaf->lsize -= segsize;
leaf->rsize += segsize;
leaf->lastleft = dlist_prev_node(&leaf->segments, leaf->lastleft);
@@ -716,14 +738,15 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
/* Removing an item never increases the size of the segment */
if (npacked != ncleaned)
elog(ERROR, "could not fit vacuumed posting list");
+ seginfo->action = GIN_SEGMENT_REPLACE;
}
else
{
seginfo->seg = NULL;
seginfo->items = NULL;
+ seginfo->action = GIN_SEGMENT_DELETE;
}
seginfo->nitems = ncleaned;
- seginfo->modified = true;
removedsomething = true;
}
@@ -733,11 +756,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
* If we removed any items, reconstruct the page from the pieces.
*
* We don't try to re-encode the segments here, even though some of them
- * might be really small, now that we've removed some items from them.
- * It seems like a waste of effort, as there isn't really any benefit from
- * larger segments per se; larger segments only help you to pack more
- * items in the same space. We might as well delay doing that until the
- * next insertion, which will need to re-encode at least part of the page
+ * might be really small now that we've removed some items from them. It
+ * seems like a waste of effort, as there isn't really any benefit from
+ * larger segments per se; larger segments only help to pack more items
+ * in the same space. We might as well delay doing that until the next
+ * insertion, which will need to re-encode at least part of the page
* anyway.
*
* Also note if the page was in uncompressed, pre-9.4 format before, it
@@ -748,10 +771,35 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
*/
if (removedsomething)
{
- XLogRecData *payloadrdata;
+ XLogRecData *payloadrdata = NULL;
+ bool modified;
+
+ /*
+ * Make sure we have a palloc'd copy of all segments, after the first
+ * segment that is modified. (dataPlaceToPageLeafRecompress requires
+ * this).
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ modified = true;
+ if (modified && seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ int segsize = SizeOfGinPostingList(seginfo->seg);
+ GinPostingList *tmp = (GinPostingList *) palloc(segsize);
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
+ if (RelationNeedsWAL(indexrel))
+ payloadrdata = constructLeafRecompressWALData(buffer, leaf);
START_CRIT_SECTION();
- dataPlaceToPageLeafRecompress(buffer, leaf, &payloadrdata);
+ dataPlaceToPageLeafRecompress(buffer, leaf);
MarkBufferDirty(buffer);
@@ -778,96 +826,169 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs)
}
/*
+ * Construct a ginxlogRecompressDataLeaf record representing the changes
+ * in *leaf.
+ */
+static XLogRecData *
+constructLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf)
+{
+ int nmodified = 0;
+ char *walbufbegin;
+ char *walbufend;
+ XLogRecData *rdata;
+ dlist_iter iter;
+ int segno;
+ ginxlogRecompressDataLeaf *recompress_xlog;
+
+ /* Count the modified segments */
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ nmodified++;
+ }
+
+ walbufbegin = palloc(
+ sizeof(ginxlogRecompressDataLeaf) +
+ BLCKSZ + /* max size needed to hold the segment data */
+ nmodified * 2 + /* (segno + action) per action */
+ sizeof(XLogRecData));
+ walbufend = walbufbegin;
+
+ recompress_xlog = (ginxlogRecompressDataLeaf *) walbufend;
+ walbufend += sizeof(ginxlogRecompressDataLeaf);
+
+ recompress_xlog->nactions = nmodified;
+
+ segno = 0;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+ int segsize = 0;
+ int datalen;
+ uint8 action = seginfo->action;
+
+ if (action == GIN_SEGMENT_UNMODIFIED)
+ {
+ segno++;
+ continue;
+ }
+
+ if (action != GIN_SEGMENT_DELETE)
+ segsize = SizeOfGinPostingList(seginfo->seg);
+
+ /*
+ * If storing the uncompressed list of added item pointers would take
+ * more space than storing the compressed segment as is, do that
+ * instead.
+ */
+ if (action == GIN_SEGMENT_ADDITEMS &&
+ seginfo->nmodifieditems * sizeof(ItemPointerData) > segsize)
+ {
+ action = GIN_SEGMENT_REPLACE;
+ }
+
+ *((uint8 *) (walbufend++)) = segno;
+ *(walbufend++) = action;
+
+ switch (action)
+ {
+ case GIN_SEGMENT_DELETE:
+ datalen = 0;
+ break;
+
+ case GIN_SEGMENT_ADDITEMS:
+ datalen = seginfo->nmodifieditems * sizeof(ItemPointerData);
+ memcpy(walbufend, &seginfo->nmodifieditems, sizeof(uint16));
+ memcpy(walbufend + sizeof(uint16), seginfo->modifieditems, datalen);
+ datalen += sizeof(uint16);
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ case GIN_SEGMENT_REPLACE:
+ datalen = SHORTALIGN(segsize);
+ memcpy(walbufend, seginfo->seg, segsize);
+ break;
+
+ default:
+ elog(ERROR, "unexpected GIN leaf action %d", action);
+ }
+ walbufend += datalen;
+
+ if (action != GIN_SEGMENT_INSERT)
+ segno++;
+ }
+
+ rdata = (XLogRecData *) MAXALIGN(walbufend);
+ rdata->buffer = buf;
+ rdata->buffer_std = TRUE;
+ rdata->data = walbufbegin;
+ rdata->len = walbufend - walbufbegin;
+ rdata->next = NULL;
+
+ return rdata;
+}
+
+/*
* Assemble a disassembled posting tree leaf page back to a buffer.
*
* *prdata is filled with WAL information about this operation. The caller
* is responsible for inserting to the WAL, along with any other information
* about the operation that triggered this recompression.
*
- * NOTE: The segment pointers can point directly to the same buffer, with
- * the limitation that any earlier segment must not overlap with an original,
- * later segment. In other words, some segments may point the original buffer
- * as long as you don't make any segments larger. Currently, leafRepackItems
- * satisies this rule because it rewrites all segments after the first
- * modified one, and vacuum can only make segments shorter.
+ * NOTE: The segment pointers must not point directly to the same buffer,
+ * except for segments that have not been modified and whose preceding
+ * segments have not been modified either.
*/
static void
-dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf,
- XLogRecData **prdata)
+dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf)
{
Page page = BufferGetPage(buf);
char *ptr;
int newsize;
- int unmodifiedsize;
bool modified = false;
dlist_iter iter;
+ int segsize;
+
/*
- * these must be static so they can be returned to caller (no pallocs
- * since we're in a critical section!)
+ * If the page was in pre-9.4 format before, convert the header, and
+ * force all segments to be copied to the page whether they were modified
+ * or not.
*/
- static ginxlogRecompressDataLeaf recompress_xlog;
- static XLogRecData rdata[2];
+ if (!GinPageIsCompressed(page))
+ {
+ Assert(leaf->oldformat);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ modified = true;
+ }
ptr = (char *) GinDataLeafPageGetPostingList(page);
newsize = 0;
- unmodifiedsize = 0;
- modified = false;
dlist_foreach(iter, &leaf->segments)
{
leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
- int segsize;
- if (seginfo->modified)
+ if (seginfo->action != GIN_SEGMENT_UNMODIFIED)
modified = true;
- /*
- * Nothing to do with empty segments, except keep track if they've been
- * modified
- */
- if (seginfo->seg == NULL)
+ if (seginfo->action != GIN_SEGMENT_DELETE)
{
- Assert(seginfo->items == NULL);
- continue;
- }
+ segsize = SizeOfGinPostingList(seginfo->seg);
- segsize = SizeOfGinPostingList(seginfo->seg);
+ if (modified)
+ memcpy(ptr, seginfo->seg, segsize);
- if (!modified)
- unmodifiedsize += segsize;
- else
- {
- /*
- * Use memmove rather than memcpy, in case the segment points
- * to the same buffer
- */
- memmove(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ newsize += segsize;
}
- ptr += segsize;
- newsize += segsize;
}
+
Assert(newsize <= GinDataLeafMaxContentSize);
GinDataLeafPageSetPostingListSize(page, newsize);
-
- /* Reset these in case the page was in pre-9.4 format before */
- GinPageSetCompressed(page);
- GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
-
- /* Put WAL data */
- recompress_xlog.length = (uint16) newsize;
- recompress_xlog.unmodifiedsize = unmodifiedsize;
-
- rdata[0].buffer = InvalidBuffer;
- rdata[0].data = (char *) &recompress_xlog;
- rdata[0].len = offsetof(ginxlogRecompressDataLeaf, newdata);
- rdata[0].next = &rdata[1];
-
- rdata[1].buffer = buf;
- rdata[1].buffer_std = TRUE;
- rdata[1].data = ((char *) GinDataLeafPageGetPostingList(page)) + unmodifiedsize;
- rdata[1].len = newsize - unmodifiedsize;
- rdata[1].next = NULL;
-
- *prdata = rdata;
}
/*
@@ -912,11 +1033,14 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
node = dlist_next_node(&leaf->segments, node))
{
seginfo = dlist_container(leafSegmentInfo, node, node);
- segsize = SizeOfGinPostingList(seginfo->seg);
- memcpy(ptr, seginfo->seg, segsize);
- ptr += segsize;
- lsize += segsize;
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ lsize += segsize;
+ }
}
Assert(lsize == leaf->lsize);
GinDataLeafPageSetPostingListSize(lpage, lsize);
@@ -930,11 +1054,14 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf,
node = dlist_next_node(&leaf->segments, node))
{
seginfo = dlist_container(leafSegmentInfo, node, node);
- segsize = SizeOfGinPostingList(seginfo->seg);
- memcpy(ptr, seginfo->seg, segsize);
- ptr += segsize;
- rsize += segsize;
+ if (seginfo->action != GIN_SEGMENT_DELETE)
+ {
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ memcpy(ptr, seginfo->seg, segsize);
+ ptr += segsize;
+ rsize += segsize;
+ }
if (!dlist_has_next(&leaf->segments, node))
break;
@@ -1195,14 +1322,15 @@ disassembleLeaf(Page page)
{
leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
+ seginfo->action = GIN_SEGMENT_UNMODIFIED;
seginfo->seg = seg;
seginfo->items = NULL;
seginfo->nitems = 0;
- seginfo->modified = false;
dlist_push_tail(&leaf->segments, &seginfo->node);
seg = GinNextPostingListSegment(seg);
}
+ leaf->oldformat = false;
}
else
{
@@ -1218,14 +1346,15 @@ disassembleLeaf(Page page)
seginfo = palloc(sizeof(leafSegmentInfo));
+ seginfo->action = GIN_SEGMENT_REPLACE;
seginfo->seg = NULL;
seginfo->items = palloc(nuncompressed * sizeof(ItemPointerData));
memcpy(seginfo->items, uncompressed, nuncompressed * sizeof(ItemPointerData));
seginfo->nitems = nuncompressed;
- /* make sure we rewrite this to disk */
- seginfo->modified = true;
dlist_push_tail(&leaf->segments, &seginfo->node);
+
+ leaf->oldformat = true;
}
return leaf;
@@ -1247,6 +1376,7 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
ItemPointer nextnew = newItems;
int newleft = nNewItems;
bool modified = false;
+ leafSegmentInfo *newseg;
/*
* If the page is completely empty, just construct one new segment to
@@ -1254,14 +1384,12 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
*/
if (dlist_is_empty(&leaf->segments))
{
- /* create a new segment for the new entries */
- leafSegmentInfo *seginfo = palloc(sizeof(leafSegmentInfo));
-
- seginfo->seg = NULL;
- seginfo->items = newItems;
- seginfo->nitems = nNewItems;
- seginfo->modified = true;
- dlist_push_tail(&leaf->segments, &seginfo->node);
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = newItems;
+ newseg->nitems = nNewItems;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
return true;
}
@@ -1303,15 +1431,51 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
if (!cur->items)
cur->items = ginPostingListDecode(cur->seg, &cur->nitems);
+ /*
+ * Fast path for the important special case that we're appending to
+ * the end of the page: don't let the last segment on the page grow
+ * larger than the target, create a new segment before that happens.
+ */
+ if (!dlist_has_next(&leaf->segments, iter.cur) &&
+ ginCompareItemPointers(&cur->items[cur->nitems - 1], &nextnew[0]) < 0 &&
+ cur->seg != NULL &&
+ SizeOfGinPostingList(cur->seg) >= GinPostingListSegmentTargetSize)
+ {
+ newseg = palloc(sizeof(leafSegmentInfo));
+ newseg->seg = NULL;
+ newseg->items = nextnew;
+ newseg->nitems = nthis;
+ newseg->action = GIN_SEGMENT_INSERT;
+ dlist_push_tail(&leaf->segments, &newseg->node);
+ modified = true;
+ break;
+ }
+
tmpitems = ginMergeItemPointers(cur->items, cur->nitems,
nextnew, nthis,
&ntmpitems);
if (ntmpitems != cur->nitems)
{
+ /*
+ * If there are no duplicates, track the added items so that we
+ * can emit a compact ADDITEMS WAL record later on. (it doesn't
+ * seem worth re-checking which items were duplicates, if there
+ * were any)
+ */
+ if (ntmpitems == nthis + cur->nitems &&
+ cur->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ cur->action = GIN_SEGMENT_ADDITEMS;
+ cur->modifieditems = nextnew;
+ cur->nmodifieditems = nthis;
+ }
+ else
+ cur->action = GIN_SEGMENT_REPLACE;
+
cur->items = tmpitems;
cur->nitems = ntmpitems;
cur->seg = NULL;
- modified = cur->modified = true;
+ modified = true;
}
nextnew += nthis;
@@ -1331,133 +1495,160 @@ addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems)
* If all items fit, *remaining is set to invalid.
*
* Returns true if the page has to be split.
- *
- * XXX: Actually, this re-encodes all segments after the first one that was
- * modified, to make sure the new segments are all more or less of equal
- * length. That's unnecessarily aggressive; if we've only added a single item
- * to one segment, for example, we could re-encode just that single segment,
- * as long as it's still smaller than, say, 2x the normal segment size.
*/
static bool
leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
{
- dlist_iter iter;
- ItemPointer allitems;
- int nallitems;
int pgused = 0;
- bool needsplit;
- int totalpacked;
- dlist_mutable_iter miter;
- dlist_head recode_list;
- int nrecode;
- bool recoding;
+ bool needsplit = false;
+ dlist_iter iter;
+ int segsize;
+ leafSegmentInfo *nextseg;
+ int npacked;
+ bool modified;
+ dlist_node *cur_node;
+ dlist_node *next_node;
ItemPointerSetInvalid(remaining);
/*
- * Find the first segment that needs to be re-coded. Move all segments
- * that need recoding to separate list, and count the total number of
- * items in them. Also, add up the number of bytes in unmodified segments
- * (pgused).
+ * cannot use dlist_foreach_modify here because we insert adjacent items
+ * while iterating.
*/
- dlist_init(&recode_list);
- recoding = false;
- nrecode = 0;
- pgused = 0;
- dlist_foreach_modify(miter, &leaf->segments)
+ for (cur_node = dlist_head_node(&leaf->segments);
+ cur_node != NULL;
+ cur_node = next_node)
{
- leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, miter.cur);
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ cur_node);
- /* If the segment was modified, re-encode it */
- if (seginfo->modified || seginfo->seg == NULL)
- recoding = true;
- /*
- * Also re-encode abnormally small or large segments. (Vacuum can
- * leave behind small segments, and conversion from pre-9.4 format
- * can leave behind large segments).
- */
- else if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize)
- recoding = true;
- else if (SizeOfGinPostingList(seginfo->seg) > GinPostingListSegmentMaxSize)
- recoding = true;
+ if (dlist_has_next(&leaf->segments, cur_node))
+ next_node = dlist_next_node(&leaf->segments, cur_node);
+ else
+ next_node = NULL;
- if (recoding)
+ /* Compress the posting list, if necessary */
+ if (seginfo->action != GIN_SEGMENT_DELETE)
{
- if (!seginfo->items)
- seginfo->items = ginPostingListDecode(seginfo->seg,
- &seginfo->nitems);
- nrecode += seginfo->nitems;
-
- dlist_delete(miter.cur);
- dlist_push_tail(&recode_list, &seginfo->node);
- }
- else
- pgused += SizeOfGinPostingList(seginfo->seg);
- }
+ if (seginfo->seg == NULL)
+ {
+ if (seginfo->nitems > GinPostingListSegmentMaxSize)
+ npacked = 0; /* no chance that it would fit. */
+ else
+ {
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentMaxSize,
+ &npacked);
+ }
+ if (npacked != seginfo->nitems)
+ {
+ /*
+ * Too large. Compress again to the target size, and create
+ * a new segment to represent the remaining items. The new
+ * segment is inserted after this one, so it will be
+ * processed in the next iteration of this loop.
+ */
+ if (seginfo->seg)
+ pfree(seginfo->seg);
+ seginfo->seg = ginCompressPostingList(seginfo->items,
+ seginfo->nitems,
+ GinPostingListSegmentTargetSize,
+ &npacked);
+ if (seginfo->action != GIN_SEGMENT_INSERT)
+ seginfo->action = GIN_SEGMENT_REPLACE;
+
+ nextseg = palloc(sizeof(leafSegmentInfo));
+ nextseg->action = GIN_SEGMENT_INSERT;
+ nextseg->seg = NULL;
+ nextseg->items = &seginfo->items[npacked];
+ nextseg->nitems = seginfo->nitems - npacked;
+ next_node = &nextseg->node;
+ dlist_insert_after(cur_node, next_node);
+ }
+ }
- if (nrecode == 0)
- return false; /* nothing to do */
+ /*
+ * If the segment is very small, merge it with the next segment.
+ */
+ if (SizeOfGinPostingList(seginfo->seg) < GinPostingListSegmentMinSize && next_node)
+ {
+ int nmerged;
+
+ nextseg = dlist_container(leafSegmentInfo, node, next_node);
+
+ if (seginfo->items == NULL)
+ seginfo->items = ginPostingListDecode(seginfo->seg,
+ &seginfo->nitems);
+ if (nextseg->items == NULL)
+ nextseg->items = ginPostingListDecode(nextseg->seg,
+ &nextseg->nitems);
+ nextseg->items =
+ ginMergeItemPointers(seginfo->items, seginfo->nitems,
+ nextseg->items, nextseg->nitems,
+ &nmerged);
+ Assert(nmerged == seginfo->nitems + nextseg->nitems);
+ nextseg->nitems = nmerged;
+ nextseg->seg = NULL;
+
+ nextseg->action = GIN_SEGMENT_REPLACE;
+ nextseg->modifieditems = NULL;
+ nextseg->nmodifieditems = 0;
+
+ if (seginfo->action == GIN_SEGMENT_INSERT)
+ {
+ dlist_delete(cur_node);
+ continue;
+ }
+ else
+ {
+ seginfo->action = GIN_SEGMENT_DELETE;
+ seginfo->seg = NULL;
+ }
+ }
- /*
- * Construct one big array of the items that need to be re-encoded.
- */
- allitems = palloc(nrecode * sizeof(ItemPointerData));
- nallitems = 0;
- dlist_foreach(iter, &recode_list)
- {
- leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node, iter.cur);
- memcpy(&allitems[nallitems], seginfo->items, seginfo->nitems * sizeof(ItemPointerData));
- nallitems += seginfo->nitems;
- }
- Assert(nallitems == nrecode);
+ seginfo->items = NULL;
+ seginfo->nitems = 0;
+ }
- /*
- * Start packing the items into segments. Stop when we have consumed
- * enough space to fill both pages, or we run out of items.
- */
- totalpacked = 0;
- needsplit = false;
- while (totalpacked < nallitems)
- {
- leafSegmentInfo *seginfo;
- int npacked;
- GinPostingList *seg;
- int segsize;
+ if (seginfo->action == GIN_SEGMENT_DELETE)
+ continue;
- seg = ginCompressPostingList(&allitems[totalpacked],
- nallitems - totalpacked,
- GinPostingListSegmentMaxSize,
- &npacked);
- segsize = SizeOfGinPostingList(seg);
+ /*
+ * OK, we now have a compressed version of this segment ready for
+ * copying to the page. Did we exceed the size that fits on one page?
+ */
+ segsize = SizeOfGinPostingList(seginfo->seg);
if (pgused + segsize > GinDataLeafMaxContentSize)
{
if (!needsplit)
{
/* switch to right page */
Assert(pgused > 0);
- leaf->lastleft = dlist_tail_node(&leaf->segments);
+ leaf->lastleft = dlist_prev_node(&leaf->segments, cur_node);
needsplit = true;
leaf->lsize = pgused;
pgused = 0;
}
else
{
- /* filled both pages */
- *remaining = allitems[totalpacked];
+ /*
+ * Filled both pages. The last segment we constructed did not
+ * fit.
+ */
+ *remaining = seginfo->seg->first;
+
+ /*
+ * remove all segments that did not fit from the list.
+ */
+ while (dlist_has_next(&leaf->segments, cur_node))
+ dlist_delete(dlist_next_node(&leaf->segments, cur_node));
+ dlist_delete(cur_node);
break;
}
}
- seginfo = palloc(sizeof(leafSegmentInfo));
- seginfo->seg = seg;
- seginfo->items = &allitems[totalpacked];
- seginfo->nitems = npacked;
- seginfo->modified = true;
-
- dlist_push_tail(&leaf->segments, &seginfo->node);
-
pgused += segsize;
- totalpacked += npacked;
}
if (!needsplit)
@@ -1471,6 +1662,32 @@ leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining)
Assert(leaf->lsize <= GinDataLeafMaxContentSize);
Assert(leaf->rsize <= GinDataLeafMaxContentSize);
+ /*
+ * Make a palloc'd copy of every segment after the first modified one,
+ * because as we start copying items to the original page, we might
+ * overwrite an existing segment.
+ */
+ modified = false;
+ dlist_foreach(iter, &leaf->segments)
+ {
+ leafSegmentInfo *seginfo = dlist_container(leafSegmentInfo, node,
+ iter.cur);
+
+ if (!modified && seginfo->action != GIN_SEGMENT_UNMODIFIED)
+ {
+ modified = true;
+ }
+ else if (modified && seginfo->action == GIN_SEGMENT_UNMODIFIED)
+ {
+ GinPostingList *tmp;
+
+ segsize = SizeOfGinPostingList(seginfo->seg);
+ tmp = palloc(segsize);
+ memcpy(tmp, seginfo->seg, segsize);
+ seginfo->seg = tmp;
+ }
+ }
+
return needsplit;
}
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c
index 4c4110d54b1..9d68a980722 100644
--- a/src/backend/access/gin/ginpostinglist.c
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -298,9 +298,10 @@ ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_
}
/* copy the first item */
+ Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
+ Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
result[ndecoded] = segment->first;
ndecoded++;
- Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
val = itemptr_to_uint64(&segment->first);
ptr = segment->bytes;
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index c46f7ac4f7e..02e566cc685 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -145,15 +145,158 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda
static void
ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data)
{
- Pointer segment;
-
- /* Copy the new data to the right place */
- segment = ((Pointer) GinDataLeafPageGetPostingList(page))
- + data->unmodifiedsize;
- memcpy(segment, data->newdata, data->length - data->unmodifiedsize);
- GinDataLeafPageSetPostingListSize(page, data->length);
- GinPageSetCompressed(page);
- GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ int actionno;
+ int segno;
+ GinPostingList *oldseg;
+ Pointer segmentend;
+ char *walbuf;
+ int totalsize;
+
+ /*
+ * If the page is in pre-9.4 format, convert to new format first.
+ */
+ if (!GinPageIsCompressed(page))
+ {
+ ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page);
+ int nuncompressed = GinPageGetOpaque(page)->maxoff;
+ int npacked;
+ GinPostingList *plist;
+
+ plist = ginCompressPostingList(uncompressed, nuncompressed,
+ BLCKSZ, &npacked);
+ Assert(npacked == nuncompressed);
+
+ totalsize = SizeOfGinPostingList(plist);
+
+ memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize);
+ GinDataLeafPageSetPostingListSize(page, totalsize);
+ GinPageSetCompressed(page);
+ GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber;
+ }
+
+ oldseg = GinDataLeafPageGetPostingList(page);
+ segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page);
+ segno = 0;
+
+ walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf);
+ for (actionno = 0; actionno < data->nactions; actionno++)
+ {
+ uint8 a_segno = *((uint8 *) (walbuf++));
+ uint8 a_action = *((uint8 *) (walbuf++));
+ GinPostingList *newseg = NULL;
+ int newsegsize = 0;
+ ItemPointerData *items = NULL;
+ uint16 nitems = 0;
+ ItemPointerData *olditems;
+ int nolditems;
+ ItemPointerData *newitems;
+ int nnewitems;
+ int segsize;
+ Pointer segptr;
+ int szleft;
+
+ /* Extract all the information we need from the WAL record */
+ if (a_action == GIN_SEGMENT_INSERT ||
+ a_action == GIN_SEGMENT_REPLACE)
+ {
+ newseg = (GinPostingList *) walbuf;
+ newsegsize = SizeOfGinPostingList(newseg);
+ walbuf += SHORTALIGN(newsegsize);
+ }
+
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ memcpy(&nitems, walbuf, sizeof(uint16));
+ walbuf += sizeof(uint16);
+ items = (ItemPointerData *) walbuf;
+ walbuf += nitems * sizeof(ItemPointerData);
+ }
+
+ /* Skip to the segment that this action concerns */
+ Assert(segno <= a_segno);
+ while (segno < a_segno)
+ {
+ oldseg = GinNextPostingListSegment(oldseg);
+ segno++;
+ }
+
+ /*
+ * ADDITEMS action is handled like REPLACE, but the new segment to
+ * replace the old one is reconstructed using the old segment from
+ * disk and the new items from the WAL record.
+ */
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ int npacked;
+
+ olditems = ginPostingListDecode(oldseg, &nolditems);
+
+ newitems = ginMergeItemPointers(items, nitems,
+ olditems, nolditems,
+ &nnewitems);
+ Assert(nnewitems == nolditems + nitems);
+
+ newseg = ginCompressPostingList(newitems, nnewitems,
+ BLCKSZ, &npacked);
+ Assert(npacked == nnewitems);
+
+ newsegsize = SizeOfGinPostingList(newseg);
+ a_action = GIN_SEGMENT_REPLACE;
+ }
+
+ segptr = (Pointer) oldseg;
+ if (segptr != segmentend)
+ segsize = SizeOfGinPostingList(oldseg);
+ else
+ {
+ /*
+ * Positioned after the last existing segment. Only INSERTs
+ * expected here.
+ */
+ Assert(a_action == GIN_SEGMENT_INSERT);
+ segsize = 0;
+ }
+ szleft = segmentend - segptr;
+
+ switch (a_action)
+ {
+ case GIN_SEGMENT_DELETE:
+ memmove(segptr, segptr + segsize, szleft - segsize);
+ segmentend -= segsize;
+
+ segno++;
+ break;
+
+ case GIN_SEGMENT_INSERT:
+ /* make room for the new segment */
+ memmove(segptr + newsegsize, segptr, szleft);
+ /* copy the new segment in place */
+ memcpy(segptr, newseg, newsegsize);
+ segmentend += newsegsize;
+ segptr += newsegsize;
+ break;
+
+ case GIN_SEGMENT_REPLACE:
+ /* shift the segments that follow */
+ memmove(segptr + newsegsize,
+ segptr + segsize,
+ szleft - segsize);
+ /* copy the replacement segment in place */
+ memcpy(segptr, newseg, newsegsize);
+ segmentend -= segsize;
+ segmentend += newsegsize;
+ segptr += newsegsize;
+ segno++;
+ break;
+
+ default:
+ elog(ERROR, "unexpected GIN leaf action: %u", a_action);
+ }
+ oldseg = (GinPostingList *) segptr;
+ }
+
+ totalsize = segmentend - (Pointer) GinDataLeafPageGetPostingList(page);
+ GinDataLeafPageSetPostingListSize(page, totalsize);
}
static void
diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c
index 1983bcc94c7..aa60c8db65c 100644
--- a/src/backend/access/rmgrdesc/gindesc.c
+++ b/src/backend/access/rmgrdesc/gindesc.c
@@ -25,6 +25,57 @@ desc_node(StringInfo buf, RelFileNode node, BlockNumber blkno)
node.spcNode, node.dbNode, node.relNode, blkno);
}
+static void
+desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData)
+{
+ int i;
+ char *walbuf = ((char *) insertData) + sizeof(ginxlogRecompressDataLeaf);
+
+ appendStringInfo(buf, " %d segments:", (int) insertData->nactions);
+
+ for (i = 0; i < insertData->nactions; i++)
+ {
+ uint8 a_segno = *((uint8 *) (walbuf++));
+ uint8 a_action = *((uint8 *) (walbuf++));
+ uint16 nitems = 0;
+ int newsegsize = 0;
+
+ if (a_action == GIN_SEGMENT_INSERT ||
+ a_action == GIN_SEGMENT_REPLACE)
+ {
+ newsegsize = SizeOfGinPostingList((GinPostingList *) walbuf);
+ walbuf += SHORTALIGN(newsegsize);
+ }
+
+ if (a_action == GIN_SEGMENT_ADDITEMS)
+ {
+ memcpy(&nitems, walbuf, sizeof(uint16));
+ walbuf += sizeof(uint16);
+ walbuf += nitems * sizeof(ItemPointerData);
+ }
+
+ switch(a_action)
+ {
+ case GIN_SEGMENT_ADDITEMS:
+ appendStringInfo(buf, " %d (add %d items)", a_segno, nitems);
+ break;
+ case GIN_SEGMENT_DELETE:
+ appendStringInfo(buf, " %d (delete)", a_segno);
+ break;
+ case GIN_SEGMENT_INSERT:
+ appendStringInfo(buf, " %d (insert)", a_segno);
+ break;
+ case GIN_SEGMENT_REPLACE:
+ appendStringInfo(buf, " %d (replace)", a_segno);
+ break;
+ default:
+ appendStringInfo(buf, " %d unknown action %d ???", a_segno, a_action);
+ /* cannot decode unrecognized actions further */
+ return;
+ }
+ }
+}
+
void
gin_desc(StringInfo buf, uint8 xl_info, char *rec)
{
@@ -70,9 +121,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
ginxlogRecompressDataLeaf *insertData =
(ginxlogRecompressDataLeaf *) payload;
- appendStringInfo(buf, " unmodified: %u length: %u (compressed)",
- insertData->unmodifiedsize,
- insertData->length);
+ if (xl_info & XLR_BKP_BLOCK(0))
+ appendStringInfo(buf, " (full page image)");
+ else
+ desc_recompress_leaf(buf, insertData);
}
else
{
@@ -105,9 +157,10 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
appendStringInfoString(buf, "Vacuum data leaf page, ");
desc_node(buf, xlrec->node, xlrec->blkno);
- appendStringInfo(buf, " unmodified: %u length: %u",
- xlrec->data.unmodifiedsize,
- xlrec->data.length);
+ if (xl_info & XLR_BKP_BLOCK(0))
+ appendStringInfo(buf, " (full page image)");
+ else
+ desc_recompress_leaf(buf, &xlrec->data);
}
break;
case XLOG_GIN_DELETE_PAGE:
diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h
index d11811acb57..d33c216b757 100644
--- a/src/include/access/gin_private.h
+++ b/src/include/access/gin_private.h
@@ -406,7 +406,8 @@ typedef struct
* whose split this insertion finishes. As BlockIdData[2] (beware of adding
* fields before this that would make them not 16-bit aligned)
*
- * 2. one of the following structs, depending on tree type.
+ * 2. an ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending
+ * on tree type.
*
* NB: the below structs are only 16-bit aligned when appended to a
* ginxlogInsert struct! Beware of adding fields to them that require
@@ -421,15 +422,39 @@ typedef struct
IndexTupleData tuple; /* variable length */
} ginxlogInsertEntry;
+
typedef struct
{
- uint16 length;
- uint16 unmodifiedsize;
+ uint16 nactions;
- /* compressed segments, variable length */
- char newdata[1];
+ /* Variable number of 'actions' follow */
} ginxlogRecompressDataLeaf;
+/*
+ * Note: this struct is currently not used in code, and only acts as
+ * documentation. The WAL record format is as specified here, but the code
+ * uses straight access through a Pointer and memcpy to read/write these.
+ */
+typedef struct
+{
+ uint8 segno; /* segment this action applies to */
+ char type; /* action type (see below) */
+
+ /*
+ * Action-specific data follows. For INSERT and REPLACE actions that is a
+ * GinPostingList struct. For ADDITEMS, a uint16 for the number of items
+ * added, followed by the items themselves as ItemPointers. DELETE actions
+ * have no further data.
+ */
+} ginxlogSegmentAction;
+
+/* Action types */
+#define GIN_SEGMENT_UNMODIFIED 0 /* no action (not used in WAL records) */
+#define GIN_SEGMENT_DELETE 1 /* a whole segment is removed */
+#define GIN_SEGMENT_INSERT 2 /* a whole segment is added */
+#define GIN_SEGMENT_REPLACE 3 /* a segment is replaced */
+#define GIN_SEGMENT_ADDITEMS 4 /* items are added to existing segment */
+
typedef struct
{
OffsetNumber offset;