Diffstat (limited to 'src/backend/access/heap/heapam.c')
-rw-r--r--  src/backend/access/heap/heapam.c  |  638
1 file changed, 574 insertions(+), 64 deletions(-)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 53e997cd553..5b9cfb26cf7 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -55,6 +55,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
+#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
@@ -102,6 +103,8 @@ static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 in
int *remaining);
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
uint16 infomask, Relation rel, int *remaining);
+static void index_delete_sort(TM_IndexDeleteOp *delstate);
+static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
bool *copy);
@@ -166,18 +169,33 @@ static const struct
#ifdef USE_PREFETCH
/*
- * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use
- * this structure to coordinate prefetching activity.
+ * heap_index_delete_tuples and index_delete_prefetch_buffer use this
+ * structure to coordinate prefetching activity
*/
typedef struct
{
BlockNumber cur_hblkno;
int next_item;
- int nitems;
- ItemPointerData *tids;
-} XidHorizonPrefetchState;
+ int ndeltids;
+ TM_IndexDelete *deltids;
+} IndexDeletePrefetchState;
#endif
+/* heap_index_delete_tuples bottom-up index deletion costing constants */
+#define BOTTOMUP_MAX_NBLOCKS 6
+#define BOTTOMUP_TOLERANCE_NBLOCKS 3
+
+/*
+ * heap_index_delete_tuples uses this when determining which heap blocks it
+ * must visit to help its bottom-up index deletion caller
+ */
+typedef struct IndexDeleteCounts
+{
+ int16 npromisingtids; /* Number of "promising" TIDs in group */
+ int16 ntids; /* Number of TIDs in group */
+ int16 ifirsttid; /* Offset to group's first deltid */
+} IndexDeleteCounts;
+
/*
* This table maps tuple lock strength values for each particular
* MultiXactStatus value.
@@ -6936,28 +6954,31 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
#ifdef USE_PREFETCH
/*
- * Helper function for heap_compute_xid_horizon_for_tuples. Issue prefetch
- * requests for the number of buffers indicated by prefetch_count. The
- * prefetch_state keeps track of all the buffers that we can prefetch and
- * which ones have already been prefetched; each call to this function picks
- * up where the previous call left off.
+ * Helper function for heap_index_delete_tuples. Issues prefetch requests for
+ * prefetch_count buffers. The prefetch_state keeps track of all the buffers
+ * we can prefetch, and which have already been prefetched; each call to this
+ * function picks up where the previous call left off.
+ *
+ * Note: we expect the deltids array to be sorted in an order that groups TIDs
+ * by heap block, with all TIDs for each block appearing together in exactly
+ * one group.
*/
static void
-xid_horizon_prefetch_buffer(Relation rel,
- XidHorizonPrefetchState *prefetch_state,
- int prefetch_count)
+index_delete_prefetch_buffer(Relation rel,
+ IndexDeletePrefetchState *prefetch_state,
+ int prefetch_count)
{
BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
int count = 0;
int i;
- int nitems = prefetch_state->nitems;
- ItemPointerData *tids = prefetch_state->tids;
+ int ndeltids = prefetch_state->ndeltids;
+ TM_IndexDelete *deltids = prefetch_state->deltids;
for (i = prefetch_state->next_item;
- i < nitems && count < prefetch_count;
+ i < ndeltids && count < prefetch_count;
i++)
{
- ItemPointer htid = &tids[i];
+ ItemPointer htid = &deltids[i].tid;
if (cur_hblkno == InvalidBlockNumber ||
ItemPointerGetBlockNumber(htid) != cur_hblkno)
@@ -6978,24 +6999,20 @@ xid_horizon_prefetch_buffer(Relation rel,
#endif
/*
- * Get the latestRemovedXid from the heap pages pointed at by the index
- * tuples being deleted.
+ * heapam implementation of tableam's index_delete_tuples interface.
*
- * We used to do this during recovery rather than on the primary, but that
- * approach now appears inferior. It meant that the primary could generate
- * a lot of work for the standby without any back-pressure to slow down the
- * primary, and it required the standby to have reached consistency, whereas
- * we want to have correct information available even before that point.
+ * This helper function is called by index AMs during index tuple deletion.
+ * See tableam header comments for an explanation of the interface implemented
+ * here and a general theory of operation. Note that each call here is either
+ * a simple index deletion call, or a bottom-up index deletion call.
*
* It's possible for this to generate a fair amount of I/O, since we may be
* deleting hundreds of tuples from a single index block. To amortize that
* cost to some degree, this uses prefetching and combines repeat accesses to
- * the same block.
+ * the same heap block.
*/
TransactionId
-heap_compute_xid_horizon_for_tuples(Relation rel,
- ItemPointerData *tids,
- int nitems)
+heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
{
/* Initial assumption is that earlier pruning took care of conflict */
TransactionId latestRemovedXid = InvalidTransactionId;
@@ -7005,28 +7022,44 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
OffsetNumber maxoff = InvalidOffsetNumber;
TransactionId priorXmax;
#ifdef USE_PREFETCH
- XidHorizonPrefetchState prefetch_state;
+ IndexDeletePrefetchState prefetch_state;
int prefetch_distance;
#endif
+ SnapshotData SnapshotNonVacuumable;
+ int finalndeltids = 0,
+ nblocksaccessed = 0;
+
+ /* State that's only used in bottom-up index deletion case */
+ int nblocksfavorable = 0;
+ int curtargetfreespace = delstate->bottomupfreespace,
+ lastfreespace = 0,
+ actualfreespace = 0;
+ bool bottomup_final_block = false;
+
+ InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
+
+ /* Sort caller's deltids array by TID for further processing */
+ index_delete_sort(delstate);
/*
- * Sort to avoid repeated lookups for the same page, and to make it more
- * likely to access items in an efficient order. In particular, this
- * ensures that if there are multiple pointers to the same page, they all
- * get processed looking up and locking the page just once.
+ * Bottom-up case: resort deltids array in an order attuned to where the
+ * greatest number of promising TIDs are to be found, and determine how
+ * many blocks from the start of sorted array should be considered
+ * favorable. This will also shrink the deltids array in order to
+ * eliminate completely unfavorable blocks up front.
*/
- qsort((void *) tids, nitems, sizeof(ItemPointerData),
- (int (*) (const void *, const void *)) ItemPointerCompare);
+ if (delstate->bottomup)
+ nblocksfavorable = bottomup_sort_and_shrink(delstate);
#ifdef USE_PREFETCH
/* Initialize prefetch state. */
prefetch_state.cur_hblkno = InvalidBlockNumber;
prefetch_state.next_item = 0;
- prefetch_state.nitems = nitems;
- prefetch_state.tids = tids;
+ prefetch_state.ndeltids = delstate->ndeltids;
+ prefetch_state.deltids = delstate->deltids;
/*
- * Compute the prefetch distance that we will attempt to maintain.
+ * Determine the prefetch distance that we will attempt to maintain.
*
* Since the caller holds a buffer lock somewhere in rel, we'd better make
* sure that isn't a catalog relation before we call code that does
@@ -7038,33 +7071,111 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
prefetch_distance =
get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
+ /* Cap initial prefetch distance for bottom-up deletion caller */
+ if (delstate->bottomup)
+ {
+ Assert(nblocksfavorable >= 1);
+ Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
+ prefetch_distance = Min(prefetch_distance, nblocksfavorable);
+ }
+
/* Start prefetching. */
- xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
+ index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
#endif
- /* Iterate over all tids, and check their horizon */
- for (int i = 0; i < nitems; i++)
+ /* Iterate over deltids, determine which to delete, check their horizon */
+ Assert(delstate->ndeltids > 0);
+ for (int i = 0; i < delstate->ndeltids; i++)
{
- ItemPointer htid = &tids[i];
+ TM_IndexDelete *ideltid = &delstate->deltids[i];
+ TM_IndexStatus *istatus = delstate->status + ideltid->id;
+ ItemPointer htid = &ideltid->tid;
OffsetNumber offnum;
/*
- * Read heap buffer, but avoid refetching if it's the same block as
- * required for the last tid.
+ * Read buffer, and perform required extra steps each time a new block
+ * is encountered. Avoid refetching if it's the same block as the one
+ * from the last htid.
*/
if (blkno == InvalidBlockNumber ||
ItemPointerGetBlockNumber(htid) != blkno)
{
- /* release old buffer */
- if (BufferIsValid(buf))
+ /*
+ * Consider giving up early for bottom-up index deletion caller
+ * first. (Only prefetch next-next block afterwards, when it
+ * becomes clear that we're at least going to access the next
+ * block in line.)
+ *
+ * Sometimes the first block frees so much space for bottom-up
+ * caller that the deletion process can end without accessing any
+ * more blocks. It is usually necessary to access 2 or 3 blocks
+ * per bottom-up deletion operation, though.
+ */
+ if (delstate->bottomup)
{
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- ReleaseBuffer(buf);
+ /*
+ * We often allow caller to delete a few additional items
+ * whose entries we reached after the point that space target
+ * from caller was satisfied. The cost of accessing the page
+ * was already paid at that point, so it made sense to finish
+ * it off. When that happened, we finalize everything here
+ * (by finishing off the whole bottom-up deletion operation
+ * without needlessly paying the cost of accessing any more
+ * blocks).
+ */
+ if (bottomup_final_block)
+ break;
+
+ /*
+ * Give up when we didn't enable our caller to free any
+ * additional space as a result of processing the page that we
+ * just finished up with. This rule is the main way in which
+ * we keep the cost of bottom-up deletion under control.
+ */
+ if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
+ break;
+ lastfreespace = actualfreespace; /* for next time */
+
+ /*
+ * Deletion operation (which is bottom-up) will definitely
+ * access the next block in line. Prepare for that now.
+ *
+ * Decay target free space so that we don't hang on for too
+ * long with a marginal case. (Space target is only truly
+ * helpful when it allows us to recognize that we don't need
+ * to access more than 1 or 2 blocks to satisfy caller due to
+ * agreeable workload characteristics.)
+ *
+ * We are a bit more patient when we encounter contiguous
+ * blocks, though: these are treated as favorable blocks. The
+ * decay process is only applied when the next block in line
+ * is not a favorable/contiguous block. This is not an
+ * exception to the general rule; we still insist on finding
+ * at least one deletable item per block accessed. See
+ * bottomup_nblocksfavorable() for full details of the theory
+ * behind favorable blocks and heap block locality in general.
+ *
+ * Note: The first block in line is always treated as a
+ * favorable block, so the earliest possible point that the
+ * decay can be applied is just before we access the second
+ * block in line. The Assert() verifies this for us.
+ */
+ Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
+ if (nblocksfavorable > 0)
+ nblocksfavorable--;
+ else
+ curtargetfreespace /= 2;
}
- blkno = ItemPointerGetBlockNumber(htid);
+ /* release old buffer */
+ if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+ blkno = ItemPointerGetBlockNumber(htid);
buf = ReadBuffer(rel, blkno);
+ nblocksaccessed++;
+ Assert(!delstate->bottomup ||
+ nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
#ifdef USE_PREFETCH
@@ -7072,7 +7183,7 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
* To maintain the prefetch distance, prefetch one more page for
* each page we read.
*/
- xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
+ index_delete_prefetch_buffer(rel, &prefetch_state, 1);
#endif
LockBuffer(buf, BUFFER_LOCK_SHARE);
@@ -7081,6 +7192,31 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
maxoff = PageGetMaxOffsetNumber(page);
}
+ if (istatus->knowndeletable)
+ Assert(!delstate->bottomup && !istatus->promising);
+ else
+ {
+ ItemPointerData tmp = *htid;
+ HeapTupleData heapTuple;
+
+ /* Are any tuples from this HOT chain non-vacuumable? */
+ if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
+ &heapTuple, NULL, true))
+ continue; /* can't delete entry */
+
+ /* Caller will delete, since whole HOT chain is vacuumable */
+ istatus->knowndeletable = true;
+
+ /* Maintain index free space info for bottom-up deletion case */
+ if (delstate->bottomup)
+ {
+ Assert(istatus->freespace > 0);
+ actualfreespace += istatus->freespace;
+ if (actualfreespace >= curtargetfreespace)
+ bottomup_final_block = true;
+ }
+ }
+
/*
* Maintain latestRemovedXid value for deletion operation as a whole
* by advancing current value using heap tuple headers. This is
@@ -7108,17 +7244,18 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
}
/*
- * We'll often encounter LP_DEAD line pointers. No need to do
- * anything more with htid when that happens. This is okay
- * because the earlier pruning operation that made the line
- * pointer LP_DEAD in the first place must have considered the
- * tuple header as part of generating its own latestRemovedXid
- * value.
+ * We'll often encounter LP_DEAD line pointers (especially with an
+ * entry marked knowndeletable by our caller up front). No heap
+ * tuple headers get examined for an htid that leads us to an
+ * LP_DEAD item. This is okay because the earlier pruning
+ * operation that made the line pointer LP_DEAD in the first place
+ * must have considered the original tuple header as part of
+ * generating its own latestRemovedXid value.
*
- * Relying on XLOG_HEAP2_CLEANUP_INFO records like this is the
- * same strategy that index vacuuming uses in all cases. Index
- * VACUUM WAL records don't even have a latestRemovedXid field of
- * their own for this reason.
+ * Relying on XLOG_HEAP2_CLEAN records like this is the same
+ * strategy that index vacuuming uses in all cases. Index VACUUM
+ * WAL records don't even have a latestRemovedXid field of their
+ * own for this reason.
*/
if (!ItemIdIsNormal(lp))
break;
@@ -7148,15 +7285,388 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
priorXmax = HeapTupleHeaderGetUpdateXid(htup);
}
+
+ /* Enable further/final shrinking of deltids for caller */
+ finalndeltids = i + 1;
}
- if (BufferIsValid(buf))
+ UnlockReleaseBuffer(buf);
+
+ /*
+ * Shrink deltids array to exclude non-deletable entries at the end. This
+ * is not just a minor optimization. Final deltids array size might be
+ * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
+ * ndeltids being zero in all cases with zero total deletable entries.
+ */
+ Assert(finalndeltids > 0 || delstate->bottomup);
+ delstate->ndeltids = finalndeltids;
+
+ return latestRemovedXid;
+}
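For orientation, here is a minimal sketch of a simple-deletion caller, assuming only the TM_IndexDeleteOp/TM_IndexDelete/TM_IndexStatus fields referenced above and the table_index_delete_tuples() tableam wrapper. The function name and the nitems/htids/offnums inputs are hypothetical stand-ins for whatever state a real index AM collects from LP_DEAD-marked index tuples; this is an illustration, not part of the patch:

/*
 * Sketch only (not from this patch): a hypothetical simple-deletion
 * caller.  "nitems", "htids", and "offnums" stand in for state that a
 * real index AM would take from LP_DEAD-marked tuples on one leaf page.
 */
static TransactionId
sketch_simple_index_delete(Relation heapRel, int nitems,
						   ItemPointerData *htids, OffsetNumber *offnums)
{
	TM_IndexDeleteOp delstate;

	delstate.bottomup = false;		/* simple deletion call */
	delstate.bottomupfreespace = 0; /* only used by bottom-up callers */
	delstate.ndeltids = nitems;
	delstate.deltids = palloc(nitems * sizeof(TM_IndexDelete));
	delstate.status = palloc(nitems * sizeof(TM_IndexStatus));

	for (int i = 0; i < nitems; i++)
	{
		delstate.deltids[i].tid = htids[i];
		delstate.deltids[i].id = i; /* back-link into status array */
		delstate.status[i].idxoffnum = offnums[i];
		delstate.status[i].knowndeletable = true;	/* LP_DEAD in index */
		delstate.status[i].promising = false;		/* bottom-up-only field */
		delstate.status[i].freespace = 0;			/* bottom-up-only field */
	}

	/* Dispatches to heap_index_delete_tuples() for heap relations */
	return table_index_delete_tuples(heapRel, &delstate);
}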
+
+/*
+ * Specialized inlineable comparison function for index_delete_sort()
+ */
+static inline int
+index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
+{
+ ItemPointer tid1 = &deltid1->tid;
+ ItemPointer tid2 = &deltid2->tid;
+
+ {
+ BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
+ BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
+
+ if (blk1 != blk2)
+ return (blk1 < blk2) ? -1 : 1;
+ }
{
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- ReleaseBuffer(buf);
+ OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
+ OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
+
+ if (pos1 != pos2)
+ return (pos1 < pos2) ? -1 : 1;
}
- return latestRemovedXid;
+ pg_unreachable();
+
+ return 0;
+}
+
+/*
+ * Sort deltids array from delstate by TID. This prepares it for further
+ * processing by heap_index_delete_tuples().
+ *
+ * This operation becomes a noticeable consumer of CPU cycles with some
+ * workloads, so we go to the trouble of specialization/micro optimization.
+ * We use shellsort for this because it's easy to specialize, compiles to
+ * relatively few instructions, and is adaptive to presorted inputs/subsets
+ * (which are typical here).
+ */
+static void
+index_delete_sort(TM_IndexDeleteOp *delstate)
+{
+ TM_IndexDelete *deltids = delstate->deltids;
+ int ndeltids = delstate->ndeltids;
+ int low = 0;
+
+ /*
+ * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
+ *
+ * This implementation is fast with array sizes up to ~4500. This covers
+ * all supported BLCKSZ values.
+ */
+ const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
+
+ /* Think carefully before changing anything here -- keep swaps cheap */
+ StaticAssertStmt(sizeof(TM_IndexDelete) <= 8,
+ "element size exceeds 8 bytes");
+
+ for (int g = 0; g < lengthof(gaps); g++)
+ {
+ for (int hi = gaps[g], i = low + hi; i < ndeltids; i++)
+ {
+ TM_IndexDelete d = deltids[i];
+ int j = i;
+
+ while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
+ {
+ deltids[j] = deltids[j - hi];
+ j -= hi;
+ }
+ deltids[j] = d;
+ }
+ }
+}
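For readers less familiar with shellsort, a minimal standalone sketch of the same algorithm over plain ints (illustration only, with a hypothetical name; the production sort above differs just in element type and in using the inlined TID comparator):

/*
 * Illustration only (not part of this patch): the same shellsort shape
 * on plain ints, using the identical Sedgewick-Incerpi gap sequence.
 */
static void
sketch_shellsort_int(int *a, int n)
{
	static const int gaps[] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};

	for (int g = 0; g < (int) (sizeof(gaps) / sizeof(gaps[0])); g++)
	{
		/* Gapped insertion sort pass for the current gap */
		for (int hi = gaps[g], i = hi; i < n; i++)
		{
			int		d = a[i];
			int		j = i;

			while (j >= hi && a[j - hi] > d)
			{
				a[j] = a[j - hi];
				j -= hi;
			}
			a[j] = d;
		}
	}
}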
+
+/*
+ * Returns how many blocks should be considered favorable/contiguous for a
+ * bottom-up index deletion pass. This is a number of heap blocks that starts
+ * from and includes the first block in line.
+ *
+ * There is always at least one favorable block during bottom-up index
+ * deletion. In the worst case (i.e. with totally random heap blocks) the
+ * first block in line (the only favorable block) can be thought of as a
+ * degenerate array of contiguous blocks that consists of a single block.
+ * heap_index_delete_tuples() will expect this.
+ *
+ * Caller passes blockgroups, a description of the final order that deltids
+ * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
+ * processing. Note that deltids need not actually be sorted just yet (caller
+ * only passes deltids to us so that we can interpret blockgroups).
+ *
+ * You might guess that the existence of contiguous blocks cannot matter much,
+ * since in general the main factor that determines which blocks we visit is
+ * the number of promising TIDs, which is a fixed hint from the index AM.
+ * We're not really targeting the general case, though -- the actual goal is
+ * to adapt our behavior to a wide variety of naturally occurring conditions.
+ * The effects of most of the heuristics we apply are only noticeable in the
+ * aggregate, over time and across many _related_ bottom-up index deletion
+ * passes.
+ *
+ * Deeming certain blocks favorable allows heapam to recognize and adapt to
+ * workloads where heap blocks visited during bottom-up index deletion can be
+ * accessed contiguously, in the sense that each newly visited block is the
+ * neighbor of the block that bottom-up deletion just finished processing (or
+ * close enough to it). It will likely be cheaper to access more favorable
+ * blocks sooner rather than later (e.g. in this pass, not across a series of
+ * related bottom-up passes). Either way it is probably only a matter of time
+ * (or a matter of further correlated version churn) before all blocks that
+ * appear together as a single large batch of favorable blocks get accessed by
+ * _some_ bottom-up pass. Large batches of favorable blocks tend to either
+ * appear almost constantly or not even once (it all depends on per-index
+ * workload characteristics).
+ *
+ * Note that the blockgroups sort order applies a power-of-two bucketing
+ * scheme that creates opportunities for contiguous groups of blocks to get
+ * batched together, at least with workloads that are naturally amenable to
+ * being driven by heap block locality. This doesn't just enhance the spatial
+ * locality of bottom-up heap block processing in the obvious way. It also
+ * enables temporal locality of access, since sorting by heap block number
+ * naturally tends to make the bottom-up processing order deterministic.
+ *
+ * Consider the following example to get a sense of how temporal locality
+ * might matter: There is a heap relation with several indexes, each of which
+ * is low to medium cardinality. It is subject to constant non-HOT updates.
+ * The updates are skewed (in one part of the primary key, perhaps). None of
+ * the indexes are logically modified by the UPDATE statements (if they were
+ * then bottom-up index deletion would not be triggered in the first place).
+ * Naturally, each new round of index tuples (for each heap tuple that gets a
+ * heap_update() call) will have the same heap TID in each and every index.
+ * Since these indexes are low cardinality and never get logically modified,
+ * heapam processing during bottom-up deletion passes will access heap blocks
+ * in approximately sequential order. Temporal locality of access occurs due
+ * to bottom-up deletion passes behaving very similarly across each of the
+ * indexes at any given moment. This keeps the number of buffer misses needed
+ * to visit heap blocks to a minimum.
+ */
+static int
+bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
+ TM_IndexDelete *deltids)
+{
+ int64 lastblock = -1;
+ int nblocksfavorable = 0;
+
+ Assert(nblockgroups >= 1);
+ Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
+
+ /*
+ * We tolerate heap blocks that will be accessed only slightly out of
+ * physical order. Small blips occur when a pair of almost-contiguous
+ * blocks happen to fall into different buckets (perhaps due only to a
+ * small difference in npromisingtids that the bucketing scheme didn't
+ * quite manage to ignore). We effectively ignore these blips by applying
+ * a small tolerance. The precise tolerance we use is a little arbitrary,
+ * but it works well enough in practice.
+ */
+ for (int b = 0; b < nblockgroups; b++)
+ {
+ IndexDeleteCounts *group = blockgroups + b;
+ TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
+ BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
+
+ if (lastblock != -1 &&
+ ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
+ (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
+ break;
+
+ nblocksfavorable++;
+ lastblock = block;
+ }
+
+ /* Always indicate that there is at least 1 favorable block */
+ Assert(nblocksfavorable >= 1);
+
+ return nblocksfavorable;
+}
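As a worked example (hypothetical block numbers): if the sorted block groups begin at heap blocks 20, 22, 19, and 900, then with BOTTOMUP_TOLERANCE_NBLOCKS at 3 the steps 20 -> 22 and 22 -> 19 both stay within the +/- 3 tolerance, while 900 falls far outside it, so the function returns nblocksfavorable = 3.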
+
+/*
+ * qsort comparison function for bottomup_sort_and_shrink()
+ */
+static int
+bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
+{
+ const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
+ const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
+
+ /*
+ * Most significant field is npromisingtids (which we invert the order of
+ * so as to sort in desc order).
+ *
+ * Caller should have already normalized npromisingtids fields into
+ * power-of-two values (buckets).
+ */
+ if (group1->npromisingtids > group2->npromisingtids)
+ return -1;
+ if (group1->npromisingtids < group2->npromisingtids)
+ return 1;
+
+ /*
+ * Tiebreak: desc ntids sort order.
+ *
+ * We cannot expect power-of-two values for ntids fields. We should
+ * behave as if they were already rounded up for us instead.
+ */
+ if (group1->ntids != group2->ntids)
+ {
+ uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids);
+ uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids);
+
+ if (ntids1 > ntids2)
+ return -1;
+ if (ntids1 < ntids2)
+ return 1;
+ }
+
+ /*
+ * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
+ * block in deltids array) order.
+ *
+ * This is equivalent to sorting in ascending heap block number order
+ * (among otherwise equal subsets of the array). This approach allows us
+ * to avoid accessing the out-of-line TID. (We rely on the assumption
+ * that the deltids array was sorted in ascending heap TID order when
+ * these offsets to the first TID from each heap block group were formed.)
+ */
+ if (group1->ifirsttid > group2->ifirsttid)
+ return 1;
+ if (group1->ifirsttid < group2->ifirsttid)
+ return -1;
+
+ pg_unreachable();
+
+ return 0;
+}
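To see the dynamic bucketing in the ntids tiebreak (hypothetical values): two groups with normalized npromisingtids of 8 and raw ntids of 5 and 7 compare as equal on both counts, since pg_nextpower2_32() rounds 5 and 7 up to 8 alike; the ascending ifirsttid tiebreak then decides, putting the group whose TIDs appear earlier in the TID-sorted deltids array (i.e. the lower heap block) first.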
+
+/*
+ * heap_index_delete_tuples() helper function for bottom-up deletion callers.
+ *
+ * Sorts deltids array in the order needed for useful processing by bottom-up
+ * deletion. The array should already be sorted in TID order when we're
+ * called. The sort process groups heap TIDs from deltids into heap block
+ * groupings. Earlier/more-promising groups/blocks are usually those that are
+ * known to have the most "promising" TIDs.
+ *
+ * Sets new size of deltids array (ndeltids) in state. deltids will only have
+ * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
+ * return. This often means that deltids will be shrunk to a small fraction
+ * of its original size (we eliminate many heap blocks from consideration for
+ * caller up front).
+ *
+ * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
+ * for a definition and full details.
+ */
+static int
+bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
+{
+ IndexDeleteCounts *blockgroups;
+ TM_IndexDelete *reordereddeltids;
+ BlockNumber curblock = InvalidBlockNumber;
+ int nblockgroups = 0;
+ int ncopied = 0;
+ int nblocksfavorable = 0;
+
+ Assert(delstate->bottomup);
+ Assert(delstate->ndeltids > 0);
+
+ /* Calculate per-heap-block count of TIDs */
+ blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
+ for (int i = 0; i < delstate->ndeltids; i++)
+ {
+ TM_IndexDelete *ideltid = &delstate->deltids[i];
+ TM_IndexStatus *istatus = delstate->status + ideltid->id;
+ ItemPointer htid = &ideltid->tid;
+ bool promising = istatus->promising;
+
+ if (curblock != ItemPointerGetBlockNumber(htid))
+ {
+ /* New block group */
+ nblockgroups++;
+
+ Assert(curblock < ItemPointerGetBlockNumber(htid) ||
+ !BlockNumberIsValid(curblock));
+
+ curblock = ItemPointerGetBlockNumber(htid);
+ blockgroups[nblockgroups - 1].ifirsttid = i;
+ blockgroups[nblockgroups - 1].ntids = 1;
+ blockgroups[nblockgroups - 1].npromisingtids = 0;
+ }
+ else
+ {
+ blockgroups[nblockgroups - 1].ntids++;
+ }
+
+ if (promising)
+ blockgroups[nblockgroups - 1].npromisingtids++;
+ }
+
+ /*
+ * We're about ready to sort block groups to determine the optimal order
+ * for visiting heap blocks. But before we do, round the number of
+ * promising tuples for each block group up to the nearest power-of-two
+ * (except for block groups where npromisingtids is already 0).
+ *
+ * This scheme divides heap blocks/block groups into buckets. Each bucket
+ * contains blocks that have _approximately_ the same number of promising
+ * TIDs as each other. The goal is to ignore relatively small differences
+ * in the total number of promising entries, so that the whole process can
+ * give a little weight to heapam factors (like heap block locality)
+ * instead. This isn't a trade-off, really -- we have nothing to lose.
+ * It would be foolish to interpret small differences in npromisingtids
+ * values as anything more than noise.
+ *
+ * We tiebreak on ntids when sorting block group subsets that have the
+ * same npromisingtids, but this has the same issues as npromisingtids,
+ * and so ntids is subject to the same power-of-two bucketing scheme.
+ * The only reason that we don't fix ntids in the same way here too is
+ * that we'll need accurate ntids values after the sort. We handle
+ * ntids bucketization dynamically instead (in the sort comparator).
+ *
+ * See bottomup_nblocksfavorable() for a full explanation of when and how
+ * heap locality/favorable blocks can significantly influence when and how
+ * heap blocks are accessed.
+ */
+ for (int b = 0; b < nblockgroups; b++)
+ {
+ IndexDeleteCounts *group = blockgroups + b;
+
+ /* Better off falling back on nhtids with low npromisingtids */
+ if (group->npromisingtids <= 4)
+ group->npromisingtids = 4;
+ else
+ group->npromisingtids =
+ pg_nextpower2_32((uint32) group->npromisingtids);
+ }
+
+ /* Sort groups and rearrange caller's deltids array */
+ qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
+ bottomup_sort_and_shrink_cmp);
+ reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
+
+ nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
+ /* Determine number of favorable blocks at the start of final deltids */
+ nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
+ delstate->deltids);
+
+ for (int b = 0; b < nblockgroups; b++)
+ {
+ IndexDeleteCounts *group = blockgroups + b;
+ TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
+
+ memcpy(reordereddeltids + ncopied, firstdtid,
+ sizeof(TM_IndexDelete) * group->ntids);
+ ncopied += group->ntids;
+ }
+
+ /* Copy final grouped and sorted TIDs back into start of caller's array */
+ memcpy(delstate->deltids, reordereddeltids,
+ sizeof(TM_IndexDelete) * ncopied);
+ delstate->ndeltids = ncopied;
+
+ pfree(reordereddeltids);
+ pfree(blockgroups);
+
+ return nblocksfavorable;
}
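As an end-to-end example with hypothetical numbers: if deltids arrives with 120 TIDs spread across 40 heap blocks, the counting pass builds 40 IndexDeleteCounts groups, the qsort orders them by bucketed npromisingtids (then bucketed ntids, then ifirsttid), and only the first BOTTOMUP_MAX_NBLOCKS = 6 groups survive the shrink. If those 6 groups together hold 25 TIDs, heap_index_delete_tuples() goes on to process just those 25 entries, with ndeltids now 25 rather than 120.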
/*