Diffstat (limited to 'src/backend/access/heap/heapam.c')
-rw-r--r-- | src/backend/access/heap/heapam.c | 638
1 file changed, 574 insertions, 64 deletions
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 53e997cd553..5b9cfb26cf7 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -55,6 +55,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "port/atomics.h" +#include "port/pg_bitutils.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -102,6 +103,8 @@ static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 in int *remaining); static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining); +static void index_delete_sort(TM_IndexDeleteOp *delstate); +static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed, bool *copy); @@ -166,18 +169,33 @@ static const struct #ifdef USE_PREFETCH /* - * heap_compute_xid_horizon_for_tuples and xid_horizon_prefetch_buffer use - * this structure to coordinate prefetching activity. + * heap_index_delete_tuples and index_delete_prefetch_buffer use this + * structure to coordinate prefetching activity */ typedef struct { BlockNumber cur_hblkno; int next_item; - int nitems; - ItemPointerData *tids; -} XidHorizonPrefetchState; + int ndeltids; + TM_IndexDelete *deltids; +} IndexDeletePrefetchState; #endif +/* heap_index_delete_tuples bottom-up index deletion costing constants */ +#define BOTTOMUP_MAX_NBLOCKS 6 +#define BOTTOMUP_TOLERANCE_NBLOCKS 3 + +/* + * heap_index_delete_tuples uses this when determining which heap blocks it + * must visit to help its bottom-up index deletion caller + */ +typedef struct IndexDeleteCounts +{ + int16 npromisingtids; /* Number of "promising" TIDs in group */ + int16 ntids; /* Number of TIDs in group */ + int16 ifirsttid; /* Offset to group's first deltid */ +} IndexDeleteCounts; + /* * This table maps tuple lock strength values for each particular * MultiXactStatus value. @@ -6936,28 +6954,31 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, #ifdef USE_PREFETCH /* - * Helper function for heap_compute_xid_horizon_for_tuples. Issue prefetch - * requests for the number of buffers indicated by prefetch_count. The - * prefetch_state keeps track of all the buffers that we can prefetch and - * which ones have already been prefetched; each call to this function picks - * up where the previous call left off. + * Helper function for heap_index_delete_tuples. Issues prefetch requests for + * prefetch_count buffers. The prefetch_state keeps track of all the buffers + * we can prefetch, and which have already been prefetched; each call to this + * function picks up where the previous call left off. + * + * Note: we expect the deltids array to be sorted in an order that groups TIDs + * by heap block, with all TIDs for each block appearing together in exactly + * one group. 
*/ static void -xid_horizon_prefetch_buffer(Relation rel, - XidHorizonPrefetchState *prefetch_state, - int prefetch_count) +index_delete_prefetch_buffer(Relation rel, + IndexDeletePrefetchState *prefetch_state, + int prefetch_count) { BlockNumber cur_hblkno = prefetch_state->cur_hblkno; int count = 0; int i; - int nitems = prefetch_state->nitems; - ItemPointerData *tids = prefetch_state->tids; + int ndeltids = prefetch_state->ndeltids; + TM_IndexDelete *deltids = prefetch_state->deltids; for (i = prefetch_state->next_item; - i < nitems && count < prefetch_count; + i < ndeltids && count < prefetch_count; i++) { - ItemPointer htid = &tids[i]; + ItemPointer htid = &deltids[i].tid; if (cur_hblkno == InvalidBlockNumber || ItemPointerGetBlockNumber(htid) != cur_hblkno) @@ -6978,24 +6999,20 @@ xid_horizon_prefetch_buffer(Relation rel, #endif /* - * Get the latestRemovedXid from the heap pages pointed at by the index - * tuples being deleted. + * heapam implementation of tableam's index_delete_tuples interface. * - * We used to do this during recovery rather than on the primary, but that - * approach now appears inferior. It meant that the primary could generate - * a lot of work for the standby without any back-pressure to slow down the - * primary, and it required the standby to have reached consistency, whereas - * we want to have correct information available even before that point. + * This helper function is called by index AMs during index tuple deletion. + * See tableam header comments for an explanation of the interface implemented + * here and a general theory of operation. Note that each call here is either + * a simple index deletion call, or a bottom-up index deletion call. * * It's possible for this to generate a fair amount of I/O, since we may be * deleting hundreds of tuples from a single index block. To amortize that * cost to some degree, this uses prefetching and combines repeat accesses to - * the same block. + * the same heap block. */ TransactionId -heap_compute_xid_horizon_for_tuples(Relation rel, - ItemPointerData *tids, - int nitems) +heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) { /* Initial assumption is that earlier pruning took care of conflict */ TransactionId latestRemovedXid = InvalidTransactionId; @@ -7005,28 +7022,44 @@ heap_compute_xid_horizon_for_tuples(Relation rel, OffsetNumber maxoff = InvalidOffsetNumber; TransactionId priorXmax; #ifdef USE_PREFETCH - XidHorizonPrefetchState prefetch_state; + IndexDeletePrefetchState prefetch_state; int prefetch_distance; #endif + SnapshotData SnapshotNonVacuumable; + int finalndeltids = 0, + nblocksaccessed = 0; + + /* State that's only used in bottom-up index deletion case */ + int nblocksfavorable = 0; + int curtargetfreespace = delstate->bottomupfreespace, + lastfreespace = 0, + actualfreespace = 0; + bool bottomup_final_block = false; + + InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel)); + + /* Sort caller's deltids array by TID for further processing */ + index_delete_sort(delstate); /* - * Sort to avoid repeated lookups for the same page, and to make it more - * likely to access items in an efficient order. In particular, this - * ensures that if there are multiple pointers to the same page, they all - * get processed looking up and locking the page just once. 
+ * Bottom-up case: resort deltids array in an order attuned to where the + * greatest number of promising TIDs are to be found, and determine how + * many blocks from the start of sorted array should be considered + * favorable. This will also shrink the deltids array in order to + * eliminate completely unfavorable blocks up front. */ - qsort((void *) tids, nitems, sizeof(ItemPointerData), - (int (*) (const void *, const void *)) ItemPointerCompare); + if (delstate->bottomup) + nblocksfavorable = bottomup_sort_and_shrink(delstate); #ifdef USE_PREFETCH /* Initialize prefetch state. */ prefetch_state.cur_hblkno = InvalidBlockNumber; prefetch_state.next_item = 0; - prefetch_state.nitems = nitems; - prefetch_state.tids = tids; + prefetch_state.ndeltids = delstate->ndeltids; + prefetch_state.deltids = delstate->deltids; /* - * Compute the prefetch distance that we will attempt to maintain. + * Determine the prefetch distance that we will attempt to maintain. * * Since the caller holds a buffer lock somewhere in rel, we'd better make * sure that isn't a catalog relation before we call code that does @@ -7038,33 +7071,111 @@ heap_compute_xid_horizon_for_tuples(Relation rel, prefetch_distance = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* Cap initial prefetch distance for bottom-up deletion caller */ + if (delstate->bottomup) + { + Assert(nblocksfavorable >= 1); + Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS); + prefetch_distance = Min(prefetch_distance, nblocksfavorable); + } + /* Start prefetching. */ - xid_horizon_prefetch_buffer(rel, &prefetch_state, prefetch_distance); + index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance); #endif - /* Iterate over all tids, and check their horizon */ - for (int i = 0; i < nitems; i++) + /* Iterate over deltids, determine which to delete, check their horizon */ + Assert(delstate->ndeltids > 0); + for (int i = 0; i < delstate->ndeltids; i++) { - ItemPointer htid = &tids[i]; + TM_IndexDelete *ideltid = &delstate->deltids[i]; + TM_IndexStatus *istatus = delstate->status + ideltid->id; + ItemPointer htid = &ideltid->tid; OffsetNumber offnum; /* - * Read heap buffer, but avoid refetching if it's the same block as - * required for the last tid. + * Read buffer, and perform required extra steps each time a new block + * is encountered. Avoid refetching if it's the same block as the one + * from the last htid. */ if (blkno == InvalidBlockNumber || ItemPointerGetBlockNumber(htid) != blkno) { - /* release old buffer */ - if (BufferIsValid(buf)) + /* + * Consider giving up early for bottom-up index deletion caller + * first. (Only prefetch next-next block afterwards, when it + * becomes clear that we're at least going to access the next + * block in line.) + * + * Sometimes the first block frees so much space for bottom-up + * caller that the deletion process can end without accessing any + * more blocks. It is usually necessary to access 2 or 3 blocks + * per bottom-up deletion operation, though. + */ + if (delstate->bottomup) { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buf); + /* + * We often allow caller to delete a few additional items + * whose entries we reached after the point that space target + * from caller was satisfied. The cost of accessing the page + * was already paid at that point, so it made sense to finish + * it off. 
When that happened, we finalize everything here + * (by finishing off the whole bottom-up deletion operation + * without needlessly paying the cost of accessing any more + * blocks). + */ + if (bottomup_final_block) + break; + + /* + * Give up when we didn't enable our caller to free any + * additional space as a result of processing the page that we + * just finished up with. This rule is the main way in which + * we keep the cost of bottom-up deletion under control. + */ + if (nblocksaccessed >= 1 && actualfreespace == lastfreespace) + break; + lastfreespace = actualfreespace; /* for next time */ + + /* + * Deletion operation (which is bottom-up) will definitely + * access the next block in line. Prepare for that now. + * + * Decay target free space so that we don't hang on for too + * long with a marginal case. (Space target is only truly + * helpful when it allows us to recognize that we don't need + * to access more than 1 or 2 blocks to satisfy caller due to + * agreeable workload characteristics.) + * + * We are a bit more patient when we encounter contiguous + * blocks, though: these are treated as favorable blocks. The + * decay process is only applied when the next block in line + * is not a favorable/contiguous block. This is not an + * exception to the general rule; we still insist on finding + * at least one deletable item per block accessed. See + * bottomup_nblocksfavorable() for full details of the theory + * behind favorable blocks and heap block locality in general. + * + * Note: The first block in line is always treated as a + * favorable block, so the earliest possible point that the + * decay can be applied is just before we access the second + * block in line. The Assert() verifies this for us. + */ + Assert(nblocksaccessed > 0 || nblocksfavorable > 0); + if (nblocksfavorable > 0) + nblocksfavorable--; + else + curtargetfreespace /= 2; } - blkno = ItemPointerGetBlockNumber(htid); + /* release old buffer */ + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); + blkno = ItemPointerGetBlockNumber(htid); buf = ReadBuffer(rel, blkno); + nblocksaccessed++; + Assert(!delstate->bottomup || + nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS); #ifdef USE_PREFETCH @@ -7072,7 +7183,7 @@ heap_compute_xid_horizon_for_tuples(Relation rel, * To maintain the prefetch distance, prefetch one more page for * each page we read. */ - xid_horizon_prefetch_buffer(rel, &prefetch_state, 1); + index_delete_prefetch_buffer(rel, &prefetch_state, 1); #endif LockBuffer(buf, BUFFER_LOCK_SHARE); @@ -7081,6 +7192,31 @@ heap_compute_xid_horizon_for_tuples(Relation rel, maxoff = PageGetMaxOffsetNumber(page); } + if (istatus->knowndeletable) + Assert(!delstate->bottomup && !istatus->promising); + else + { + ItemPointerData tmp = *htid; + HeapTupleData heapTuple; + + /* Are any tuples from this HOT chain non-vacuumable? */ + if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable, + &heapTuple, NULL, true)) + continue; /* can't delete entry */ + + /* Caller will delete, since whole HOT chain is vacuumable */ + istatus->knowndeletable = true; + + /* Maintain index free space info for bottom-up deletion case */ + if (delstate->bottomup) + { + Assert(istatus->freespace > 0); + actualfreespace += istatus->freespace; + if (actualfreespace >= curtargetfreespace) + bottomup_final_block = true; + } + } + /* * Maintain latestRemovedXid value for deletion operation as a whole * by advancing current value using heap tuple headers. 
This is @@ -7108,17 +7244,18 @@ heap_compute_xid_horizon_for_tuples(Relation rel, } /* - * We'll often encounter LP_DEAD line pointers. No need to do - * anything more with htid when that happens. This is okay - * because the earlier pruning operation that made the line - * pointer LP_DEAD in the first place must have considered the - * tuple header as part of generating its own latestRemovedXid - * value. + * We'll often encounter LP_DEAD line pointers (especially with an + * entry marked knowndeletable by our caller up front). No heap + * tuple headers get examined for an htid that leads us to an + * LP_DEAD item. This is okay because the earlier pruning + * operation that made the line pointer LP_DEAD in the first place + * must have considered the original tuple header as part of + * generating its own latestRemovedXid value. * - * Relying on XLOG_HEAP2_CLEANUP_INFO records like this is the - * same strategy that index vacuuming uses in all cases. Index - * VACUUM WAL records don't even have a latestRemovedXid field of - * their own for this reason. + * Relying on XLOG_HEAP2_CLEAN records like this is the same + * strategy that index vacuuming uses in all cases. Index VACUUM + * WAL records don't even have a latestRemovedXid field of their + * own for this reason. */ if (!ItemIdIsNormal(lp)) break; @@ -7148,15 +7285,388 @@ heap_compute_xid_horizon_for_tuples(Relation rel, offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetUpdateXid(htup); } + + /* Enable further/final shrinking of deltids for caller */ + finalndeltids = i + 1; } - if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); + + /* + * Shrink deltids array to exclude non-deletable entries at the end. This + * is not just a minor optimization. Final deltids array size might be + * zero for a bottom-up caller. Index AM is explicitly allowed to rely on + * ndeltids being zero in all cases with zero total deletable entries. + */ + Assert(finalndeltids > 0 || delstate->bottomup); + delstate->ndeltids = finalndeltids; + + return latestRemovedXid; +} + +/* + * Specialized inlineable comparison function for index_delete_sort() + */ +static inline int +index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2) +{ + ItemPointer tid1 = &deltid1->tid; + ItemPointer tid2 = &deltid2->tid; + + { + BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); + BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } { - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buf); + OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); + + if (pos1 != pos2) + return (pos1 < pos2) ? -1 : 1; } - return latestRemovedXid; + pg_unreachable(); + + return 0; +} + +/* + * Sort deltids array from delstate by TID. This prepares it for further + * processing by heap_index_delete_tuples(). + * + * This operation becomes a noticeable consumer of CPU cycles with some + * workloads, so we go to the trouble of specialization/micro optimization. + * We use shellsort for this because it's easy to specialize, compiles to + * relatively few instructions, and is adaptive to presorted inputs/subsets + * (which are typical here). + */ +static void +index_delete_sort(TM_IndexDeleteOp *delstate) +{ + TM_IndexDelete *deltids = delstate->deltids; + int ndeltids = delstate->ndeltids; + int low = 0; + + /* + * Shellsort gap sequence (taken from Sedgewick-Incerpi paper). 
+ * + * This implementation is fast with array sizes up to ~4500. This covers + * all supported BLCKSZ values. + */ + const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1}; + + /* Think carefully before changing anything here -- keep swaps cheap */ + StaticAssertStmt(sizeof(TM_IndexDelete) <= 8, + "element size exceeds 8 bytes"); + + for (int g = 0; g < lengthof(gaps); g++) + { + for (int hi = gaps[g], i = low + hi; i < ndeltids; i++) + { + TM_IndexDelete d = deltids[i]; + int j = i; + + while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0) + { + deltids[j] = deltids[j - hi]; + j -= hi; + } + deltids[j] = d; + } + } +} + +/* + * Returns how many blocks should be considered favorable/contiguous for a + * bottom-up index deletion pass. This is a number of heap blocks that starts + * from and includes the first block in line. + * + * There is always at least one favorable block during bottom-up index + * deletion. In the worst case (i.e. with totally random heap blocks) the + * first block in line (the only favorable block) can be thought of as a + * degenerate array of contiguous blocks that consists of a single block. + * heap_index_delete_tuples() will expect this. + * + * Caller passes blockgroups, a description of the final order that deltids + * will be sorted in for heap_index_delete_tuples() bottom-up index deletion + * processing. Note that deltids need not actually be sorted just yet (caller + * only passes deltids to us so that we can interpret blockgroups). + * + * You might guess that the existence of contiguous blocks cannot matter much, + * since in general the main factor that determines which blocks we visit is + * the number of promising TIDs, which is a fixed hint from the index AM. + * We're not really targeting the general case, though -- the actual goal is + * to adapt our behavior to a wide variety of naturally occurring conditions. + * The effects of most of the heuristics we apply are only noticeable in the + * aggregate, over time and across many _related_ bottom-up index deletion + * passes. + * + * Deeming certain blocks favorable allows heapam to recognize and adapt to + * workloads where heap blocks visited during bottom-up index deletion can be + * accessed contiguously, in the sense that each newly visited block is the + * neighbor of the block that bottom-up deletion just finished processing (or + * close enough to it). It will likely be cheaper to access more favorable + * blocks sooner rather than later (e.g. in this pass, not across a series of + * related bottom-up passes). Either way it is probably only a matter of time + * (or a matter of further correlated version churn) before all blocks that + * appear together as a single large batch of favorable blocks get accessed by + * _some_ bottom-up pass. Large batches of favorable blocks tend to either + * appear almost constantly or not even once (it all depends on per-index + * workload characteristics). + * + * Note that the blockgroups sort order applies a power-of-two bucketing + * scheme that creates opportunities for contiguous groups of blocks to get + * batched together, at least with workloads that are naturally amenable to + * being driven by heap block locality. This doesn't just enhance the spatial + * locality of bottom-up heap block processing in the obvious way. It also + * enables temporal locality of access, since sorting by heap block number + * naturally tends to make the bottom-up processing order deterministic. 
+ * + * Consider the following example to get a sense of how temporal locality + * might matter: There is a heap relation with several indexes, each of which + * is low to medium cardinality. It is subject to constant non-HOT updates. + * The updates are skewed (in one part of the primary key, perhaps). None of + * the indexes are logically modified by the UPDATE statements (if they were + * then bottom-up index deletion would not be triggered in the first place). + * Naturally, each new round of index tuples (for each heap tuple that gets a + * heap_update() call) will have the same heap TID in each and every index. + * Since these indexes are low cardinality and never get logically modified, + * heapam processing during bottom-up deletion passes will access heap blocks + * in approximately sequential order. Temporal locality of access occurs due + * to bottom-up deletion passes behaving very similarly across each of the + * indexes at any given moment. This keeps the number of buffer misses needed + * to visit heap blocks to a minimum. + */ +static int +bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, + TM_IndexDelete *deltids) +{ + int64 lastblock = -1; + int nblocksfavorable = 0; + + Assert(nblockgroups >= 1); + Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS); + + /* + * We tolerate heap blocks that will be accessed only slightly out of + * physical order. Small blips occur when a pair of almost-contiguous + * blocks happen to fall into different buckets (perhaps due only to a + * small difference in npromisingtids that the bucketing scheme didn't + * quite manage to ignore). We effectively ignore these blips by applying + * a small tolerance. The precise tolerance we use is a little arbitrary, + * but it works well enough in practice. + */ + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + TM_IndexDelete *firstdtid = deltids + group->ifirsttid; + BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid); + + if (lastblock != -1 && + ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS || + (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS)) + break; + + nblocksfavorable++; + lastblock = block; + } + + /* Always indicate that there is at least 1 favorable block */ + Assert(nblocksfavorable >= 1); + + return nblocksfavorable; +} + +/* + * qsort comparison function for bottomup_sort_and_shrink() + */ +static int +bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2) +{ + const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1; + const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2; + + /* + * Most significant field is npromisingtids (which we invert the order of + * so as to sort in desc order). + * + * Caller should have already normalized npromisingtids fields into + * power-of-two values (buckets). + */ + if (group1->npromisingtids > group2->npromisingtids) + return -1; + if (group1->npromisingtids < group2->npromisingtids) + return 1; + + /* + * Tiebreak: desc ntids sort order. + * + * We cannot expect power-of-two values for ntids fields. We should + * behave as if they were already rounded up for us instead. + */ + if (group1->ntids != group2->ntids) + { + uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids); + uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids); + + if (ntids1 > ntids2) + return -1; + if (ntids1 < ntids2) + return 1; + } + + /* + * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for + * block in deltids array) order. 
+ * + * This is equivalent to sorting in ascending heap block number order + * (among otherwise equal subsets of the array). This approach allows us + * to avoid accessing the out-of-line TID. (We rely on the assumption + * that the deltids array was sorted in ascending heap TID order when + * these offsets to the first TID from each heap block group were formed.) + */ + if (group1->ifirsttid > group2->ifirsttid) + return 1; + if (group1->ifirsttid < group2->ifirsttid) + return -1; + + pg_unreachable(); + + return 0; +} + +/* + * heap_index_delete_tuples() helper function for bottom-up deletion callers. + * + * Sorts deltids array in the order needed for useful processing by bottom-up + * deletion. The array should already be sorted in TID order when we're + * called. The sort process groups heap TIDs from deltids into heap block + * groupings. Earlier/more-promising groups/blocks are usually those that are + * known to have the most "promising" TIDs. + * + * Sets new size of deltids array (ndeltids) in state. deltids will only have + * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we + * return. This often means that deltids will be shrunk to a small fraction + * of its original size (we eliminate many heap blocks from consideration for + * caller up front). + * + * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable() + * for a definition and full details. + */ +static int +bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) +{ + IndexDeleteCounts *blockgroups; + TM_IndexDelete *reordereddeltids; + BlockNumber curblock = InvalidBlockNumber; + int nblockgroups = 0; + int ncopied = 0; + int nblocksfavorable = 0; + + Assert(delstate->bottomup); + Assert(delstate->ndeltids > 0); + + /* Calculate per-heap-block count of TIDs */ + blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); + for (int i = 0; i < delstate->ndeltids; i++) + { + TM_IndexDelete *ideltid = &delstate->deltids[i]; + TM_IndexStatus *istatus = delstate->status + ideltid->id; + ItemPointer htid = &ideltid->tid; + bool promising = istatus->promising; + + if (curblock != ItemPointerGetBlockNumber(htid)) + { + /* New block group */ + nblockgroups++; + + Assert(curblock < ItemPointerGetBlockNumber(htid) || + !BlockNumberIsValid(curblock)); + + curblock = ItemPointerGetBlockNumber(htid); + blockgroups[nblockgroups - 1].ifirsttid = i; + blockgroups[nblockgroups - 1].ntids = 1; + blockgroups[nblockgroups - 1].npromisingtids = 0; + } + else + { + blockgroups[nblockgroups - 1].ntids++; + } + + if (promising) + blockgroups[nblockgroups - 1].npromisingtids++; + } + + /* + * We're about ready to sort block groups to determine the optimal order + * for visiting heap blocks. But before we do, round the number of + * promising tuples for each block group up to the nearest power-of-two + * (except for block groups where npromisingtids is already 0). + * + * This scheme divides heap blocks/block groups into buckets. Each bucket + * contains blocks that have _approximately_ the same number of promising + * TIDs as each other. The goal is to ignore relatively small differences + * in the total number of promising entries, so that the whole process can + * give a little weight to heapam factors (like heap block locality) + * instead. This isn't a trade-off, really -- we have nothing to lose. + * It would be foolish to interpret small differences in npromisingtids + * values as anything more than noise. 
+ * + * We tiebreak on nhtids when sorting block group subsets that have the + * same npromisingtids, but this has the same issues as npromisingtids, + * and so nhtids is subject to the same power-of-two bucketing scheme. + * The only reason that we don't fix nhtids in the same way here too is + * that we'll need accurate nhtids values after the sort. We handle + * nhtids bucketization dynamically instead (in the sort comparator). + * + * See bottomup_nblocksfavorable() for a full explanation of when and how + * heap locality/favorable blocks can significantly influence when and how + * heap blocks are accessed. + */ + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + + /* Better off falling back on nhtids with low npromisingtids */ + if (group->npromisingtids <= 4) + group->npromisingtids = 4; + else + group->npromisingtids = + pg_nextpower2_32((uint32) group->npromisingtids); + } + + /* Sort groups and rearrange caller's deltids array */ + qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts), + bottomup_sort_and_shrink_cmp); + reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete)); + + nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups); + /* Determine number of favorable blocks at the start of final deltids */ + nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups, + delstate->deltids); + + for (int b = 0; b < nblockgroups; b++) + { + IndexDeleteCounts *group = blockgroups + b; + TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid; + + memcpy(reordereddeltids + ncopied, firstdtid, + sizeof(TM_IndexDelete) * group->ntids); + ncopied += group->ntids; + } + + /* Copy final grouped and sorted TIDs back into start of caller's array */ + memcpy(delstate->deltids, reordereddeltids, + sizeof(TM_IndexDelete) * ncopied); + delstate->ndeltids = ncopied; + + pfree(reordereddeltids); + pfree(blockgroups); + + return nblocksfavorable; } /* |
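For orientation, here is a minimal caller-side sketch (not part of the patch) of a plain, non-bottom-up deletion request as an index AM might build one. It relies on the TM_IndexDeleteOp / TM_IndexDelete / TM_IndexStatus fields used in the diff above, assumes table_index_delete_tuples() is the tableam wrapper that reaches heap_index_delete_tuples(), and assumes TM_IndexStatus carries the index page offset as idxoffnum; the deadhtids/deadoffsets inputs and the helper itself are invented purely for illustration.

#include "postgres.h"

#include "access/tableam.h"
#include "utils/rel.h"

/*
 * Hypothetical index AM helper: ask heapam which of the already
 * LP_DEAD-marked index entries on a leaf page are safe to delete.  The
 * deadhtids[]/deadoffsets[] inputs and this function are illustrative only.
 */
static TransactionId
simple_delete_sketch(Relation heapRel, ItemPointerData *deadhtids,
					 OffsetNumber *deadoffsets, int ndead)
{
	TM_IndexDeleteOp delstate;
	TransactionId latestRemovedXid;

	delstate.bottomup = false;	/* simple deletion, not bottom-up */
	delstate.bottomupfreespace = 0;
	delstate.ndeltids = 0;
	delstate.deltids = palloc(ndead * sizeof(TM_IndexDelete));
	delstate.status = palloc(ndead * sizeof(TM_IndexStatus));

	for (int i = 0; i < ndead; i++)
	{
		TM_IndexDelete *ideltid = &delstate.deltids[delstate.ndeltids];
		TM_IndexStatus *istatus = &delstate.status[delstate.ndeltids];

		ideltid->tid = deadhtids[i];	/* heap TID from index tuple */
		ideltid->id = delstate.ndeltids;	/* back-link into status[] */
		istatus->idxoffnum = deadoffsets[i];	/* index page offset */
		istatus->knowndeletable = true; /* LP_DEAD in the index */
		istatus->promising = false; /* bottom-up-only field */
		istatus->freespace = 0;		/* bottom-up-only field */
		delstate.ndeltids++;
	}

	/*
	 * heapam sorts deltids by TID, visits the heap blocks, and computes
	 * latestRemovedXid from the affected tuple headers.  Entries not
	 * already marked knowndeletable would be checked against the
	 * non-vacuumable snapshot and flagged here; ndeltids may shrink.
	 */
	latestRemovedXid = table_index_delete_tuples(heapRel, &delstate);

	pfree(delstate.deltids);
	pfree(delstate.status);

	return latestRemovedXid;
}

A bottom-up caller would instead set bottomup = true, pass the leaf page's free space target in bottomupfreespace, leave knowndeletable false, and mark likely-duplicate entries promising together with a freespace estimate for each; those hints are what drive the block-group ordering implemented by bottomup_sort_and_shrink() above.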
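The effect of the power-of-two bucketing described above can also be seen with a toy, standalone example (not PostgreSQL code): next_power_of_2() stands in for pg_nextpower2_32(), the Group struct mirrors IndexDeleteCounts, and the sample counts are invented. Small differences in npromisingtids (7 vs. 5 below) land in the same bucket, so ascending heap block order decides; only the clearly larger group (12 promising TIDs) jumps ahead.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	int16_t		npromisingtids; /* "promising" TIDs in block group */
	int16_t		ntids;			/* all TIDs in block group */
	int16_t		ifirsttid;		/* offset of group's first TID (groups are
								 * listed in ascending heap block order) */
} Group;

static uint32_t
next_power_of_2(uint32_t n)
{
	uint32_t	p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static int
group_cmp(const void *a, const void *b)
{
	const Group *g1 = (const Group *) a;
	const Group *g2 = (const Group *) b;

	/* descending order of (already bucketed) npromisingtids */
	if (g1->npromisingtids != g2->npromisingtids)
		return (g1->npromisingtids > g2->npromisingtids) ? -1 : 1;

	/* tiebreak: descending ntids, bucketed on the fly */
	if (next_power_of_2(g1->ntids) != next_power_of_2(g2->ntids))
		return (next_power_of_2(g1->ntids) > next_power_of_2(g2->ntids)) ? -1 : 1;

	/* tiebreak: ascending offset == ascending heap block number */
	return (g1->ifirsttid < g2->ifirsttid) ? -1 : 1;
}

int
main(void)
{
	/* one group per heap block, listed in heap block order */
	Group		groups[] = {{7, 20, 0}, {5, 18, 20}, {12, 30, 38}, {1, 9, 68}};
	int			ngroups = sizeof(groups) / sizeof(groups[0]);

	/* bucket npromisingtids: <= 4 becomes 4, otherwise round up to 2^k */
	for (int i = 0; i < ngroups; i++)
		groups[i].npromisingtids = (groups[i].npromisingtids <= 4) ? 4 :
			(int16_t) next_power_of_2((uint32_t) groups[i].npromisingtids);

	qsort(groups, ngroups, sizeof(Group), group_cmp);

	/*
	 * Expected order: ifirsttid=38 first (bucket 16), then ifirsttid=0 and
	 * ifirsttid=20 (both bucket 8, so heap block order decides), then
	 * ifirsttid=68 (bucket 4).
	 */
	for (int i = 0; i < ngroups; i++)
		printf("ifirsttid=%d bucket=%d ntids=%d\n",
			   groups[i].ifirsttid, groups[i].npromisingtids, groups[i].ntids);

	return 0;
}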