Diffstat (limited to 'src/backend/access/heap/visibilitymap.c')
-rw-r--r-- | src/backend/access/heap/visibilitymap.c | 191
1 file changed, 109 insertions, 82 deletions
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index fc28f3f8c5d..2e64fc3dfe8 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -15,39 +15,42 @@
  *		visibilitymap_pin	 - pin a map page for setting a bit
  *		visibilitymap_pin_ok - check whether correct map page is already pinned
  *		visibilitymap_set	 - set a bit in a previously pinned page
- *		visibilitymap_test	 - test if a bit is set
+ *		visibilitymap_get_status - get status of bits
  *		visibilitymap_count	 - count number of bits set in visibility map
  *		visibilitymap_truncate	- truncate the visibility map
  *
  * NOTES
  *
- * The visibility map is a bitmap with one bit per heap page. A set bit means
- * that all tuples on the page are known visible to all transactions, and
- * therefore the page doesn't need to be vacuumed. The map is conservative in
- * the sense that we make sure that whenever a bit is set, we know the
- * condition is true, but if a bit is not set, it might or might not be true.
+ * The visibility map is a bitmap with two bits (all-visible and all-frozen)
+ * per heap page. A set all-visible bit means that all tuples on the page are
+ * known visible to all transactions, and therefore the page doesn't need to
+ * be vacuumed. A set all-frozen bit means that all tuples on the page are
+ * completely frozen, and therefore the page doesn't need to be vacuumed even
+ * if a whole-table-scanning vacuum is required (e.g. anti-wraparound vacuum).
+ * The all-frozen bit must be set only when the page is already all-visible.
  *
- * Clearing a visibility map bit is not separately WAL-logged. The callers
+ * The map is conservative in the sense that we make sure that whenever a bit
+ * is set, we know the condition is true, but if a bit is not set, it might or
+ * might not be true.
+ *
+ * Clearing both visibility map bits is not separately WAL-logged. The callers
  * must make sure that whenever a bit is cleared, the bit is cleared on WAL
  * replay of the updating operation as well.
  *
  * When we *set* a visibility map during VACUUM, we must write WAL. This may
  * seem counterintuitive, since the bit is basically a hint: if it is clear,
- * it may still be the case that every tuple on the page is visible to all
- * transactions; we just don't know that for certain.  The difficulty is that
- * there are two bits which are typically set together: the PD_ALL_VISIBLE bit
- * on the page itself, and the visibility map bit.  If a crash occurs after the
- * visibility map page makes it to disk and before the updated heap page makes
- * it to disk, redo must set the bit on the heap page.  Otherwise, the next
- * insert, update, or delete on the heap page will fail to realize that the
- * visibility map bit must be cleared, possibly causing index-only scans to
- * return wrong answers.
+ * it may still be the case that every tuple on the page is all-visible or
+ * all-frozen; we just don't know that for certain.  The difficulty is that
+ * there are two bits which are typically set together: the PD_ALL_VISIBLE
+ * or PD_ALL_FROZEN bit on the page itself, and the corresponding visibility
+ * map bit.  If a crash occurs after the visibility map page makes it to disk
+ * and before the updated heap page makes it to disk, redo must set the bit on
+ * the heap page.  Otherwise, the next insert, update, or delete on the heap
+ * page will fail to realize that the visibility map bit must be cleared,
+ * possibly causing index-only scans to return wrong answers.
  *
  * VACUUM will normally skip pages for which the visibility map bit is set;
  * such pages can't contain any dead tuples and therefore don't need vacuuming.
- * The visibility map is not used for anti-wraparound vacuums, because
- * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
- * present in the table, even on pages that don't have any dead tuples.
  *
  * LOCKING
  *
@@ -101,38 +104,50 @@
  */
 #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
 
-/* Number of bits allocated for each heap block. */
-#define BITS_PER_HEAPBLOCK 1
-
-/* Number of heap blocks we can represent in one byte. */
-#define HEAPBLOCKS_PER_BYTE 8
-
 /* Number of heap blocks we can represent in one visibility map page. */
 #define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
 
 /* Mapping from heap block number to the right bit in the visibility map */
 #define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
 #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE)
-
-/* table for fast counting of set bits */
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+#define HEAPBLK_TO_MAPBIT(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
+/* tables for fast counting of set bits for visible and frozen */
+static const uint8 number_of_ones_for_visible[256] = {
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4
+};
+static const uint8 number_of_ones_for_frozen[256] = {
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4
 };
 
 /* prototypes for internal routines */
@@ -141,7 +156,7 @@ static void vm_extend(Relation rel, BlockNumber nvmblocks);
 
 /*
- * visibilitymap_clear - clear a bit in visibility map
+ * visibilitymap_clear - clear all bits in visibility map
  *
  * You must pass a buffer containing the correct map page to this function.
  * Call visibilitymap_pin first to pin the right one. This function doesn't do
@@ -153,7 +168,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	int			mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
-	uint8		mask = 1 << mapBit;
+	uint8		mask = VISIBILITYMAP_VALID_BITS << mapBit;
 	char	   *map;
 
 #ifdef TRACE_VISIBILITYMAP
@@ -186,7 +201,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
  * visibilitymap_set to actually set the bit.
  *
  * On entry, *buf should be InvalidBuffer or a valid buffer returned by
- * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
  * relation. On return, *buf is a valid buffer with the map page containing
  * the bit for heapBlk.
  *
@@ -212,7 +227,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
  * visibilitymap_pin_ok - do we already have the correct page pinned?
  *
  * On entry, buf should be InvalidBuffer or a valid buffer returned by
- * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
  * relation. The return value indicates whether the buffer covers the
  * given heapBlk.
  */
@@ -225,19 +240,22 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
 }
 
 /*
- * visibilitymap_set - set a bit on a previously pinned page
+ * visibilitymap_set - set bit(s) on a previously pinned page
  *
  * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
  * or InvalidXLogRecPtr in normal running.  The page LSN is advanced to the
  * one provided; in normal running, we generate a new XLOG record and set the
  * page LSN to that value.  cutoff_xid is the largest xmin on the page being
  * marked all-visible; it is needed for Hot Standby, and can be
- * InvalidTransactionId if the page contains no tuples.
+ * InvalidTransactionId if the page contains no tuples.  It can also be set
+ * to InvalidTransactionId when a page that is already all-visible is being
+ * marked all-frozen.
  *
- * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
- * this function. Except in recovery, caller should also pass the heap
- * buffer. When checksums are enabled and we're not in recovery, we must add
- * the heap buffer to the WAL chain to protect it from being torn.
+ * Caller is expected to set the heap page's PD_ALL_VISIBLE or PD_ALL_FROZEN
+ * bit before calling this function. Except in recovery, caller should also
+ * pass the heap buffer and the flags indicating which bit(s) to set.
+ * When checksums are enabled and we're not in recovery, we must add the heap
+ * buffer to the WAL chain to protect it from being torn.
  *
  * You must pass a buffer containing the correct map page to this function.
  * Call visibilitymap_pin first to pin the right one. This function doesn't do
@@ -245,13 +263,14 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
  */
 void
 visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
-				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
+				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
+				  uint8 flags)
 {
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
 	Page		page;
-	char	   *map;
+	uint8	   *map;
 
 #ifdef TRACE_VISIBILITYMAP
 	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
@@ -259,6 +278,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 
 	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
 	Assert(InRecovery || BufferIsValid(heapBuf));
+	Assert(flags & VISIBILITYMAP_VALID_BITS);
 
 	/* Check that we have the right heap page pinned, if present */
 	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
@@ -269,14 +289,14 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 		elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
 
 	page = BufferGetPage(vmBuf);
-	map = PageGetContents(page);
+	map = (uint8 *) PageGetContents(page);
 	LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
 
-	if (!(map[mapByte] & (1 << mapBit)))
+	if (flags != (map[mapByte] >> mapBit & VISIBILITYMAP_VALID_BITS))
 	{
 		START_CRIT_SECTION();
 
-		map[mapByte] |= (1 << mapBit);
+		map[mapByte] |= (flags << mapBit);
 		MarkBufferDirty(vmBuf);
 
 		if (RelationNeedsWAL(rel))
@@ -285,7 +305,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 			{
 				Assert(!InRecovery);
 				recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
-										  cutoff_xid);
+										  cutoff_xid, flags);
 
 				/*
 				 * If data checksums are enabled (or wal_log_hints=on), we
@@ -295,8 +315,10 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 				{
 					Page		heapPage = BufferGetPage(heapBuf);
 
-					/* caller is expected to set PD_ALL_VISIBLE first */
-					Assert(PageIsAllVisible(heapPage));
+					/* Caller is expected to set page-level bits first. */
+					Assert((flags & VISIBILITYMAP_ALL_VISIBLE) == 0 || PageIsAllVisible(heapPage));
+					Assert((flags & VISIBILITYMAP_ALL_FROZEN) == 0 || PageIsAllFrozen(heapPage));
+
 					PageSetLSN(heapPage, recptr);
 				}
 			}
@@ -310,15 +332,17 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 }
 
 /*
- * visibilitymap_test - test if a bit is set
+ * visibilitymap_get_status - get status of bits
  *
- * Are all tuples on heapBlk visible to all, according to the visibility map?
+ * Are all tuples on heapBlk visible to all, or all frozen, according
+ * to the visibility map?
  *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
- * earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
- * releasing *buf after it's done testing and setting bits.
+ * releasing *buf after it's done testing and setting bits, and must pass the
+ * flags whose values it needs to check in the visibility map.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it. In fact,
@@ -327,17 +351,16 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 * we might see the old value. It is the caller's responsibility to deal with
 * all concurrency issues!
 */
-bool
-visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
+uint8
+visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
 {
 	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
 	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
 	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
-	bool		result;
 	char	   *map;
 
 #ifdef TRACE_VISIBILITYMAP
-	elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
+	elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
 #endif
 
 	/* Reuse the old pinned buffer if possible */
@@ -360,13 +383,11 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
 	map = PageGetContents(BufferGetPage(*buf));
 
 	/*
-	 * A single-bit read is atomic.  There could be memory-ordering effects
+	 * A single-byte read is atomic.  There could be memory-ordering effects
 	 * here, but for performance reasons we make it the caller's job to worry
 	 * about that.
 	 */
-	result = (map[mapByte] & (1 << mapBit)) ? true : false;
-
-	return result;
+	return ((map[mapByte] >> mapBit) & VISIBILITYMAP_VALID_BITS);
 }
 
 /*
@@ -374,14 +395,20 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
 *
 * Note: we ignore the possibility of race conditions when the table is being
 * extended concurrently with the call. New pages added to the table aren't
- * going to be marked all-visible, so they won't affect the result.
+ * going to be marked all-visible or all-frozen, so they won't affect the result.
 */
-BlockNumber
-visibilitymap_count(Relation rel)
+void
+visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
 {
-	BlockNumber result = 0;
 	BlockNumber mapBlock;
 
+	/* all_visible must be specified */
+	Assert(all_visible);
+
+	*all_visible = 0;
+	if (all_frozen)
+		*all_frozen = 0;
+
 	for (mapBlock = 0;; mapBlock++)
 	{
 		Buffer		mapBuffer;
@@ -406,13 +433,13 @@ visibilitymap_count(Relation rel)
 		for (i = 0; i < MAPSIZE; i++)
 		{
-			result += number_of_ones[map[i]];
+			*all_visible += number_of_ones_for_visible[map[i]];
+			if (all_frozen)
+				*all_frozen += number_of_ones_for_frozen[map[i]];
 		}
 
 		ReleaseBuffer(mapBuffer);
 	}
-
-	return result;
 }
 
 /*
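
A note on the new layout, for readers of this patch: each heap block now owns a two-bit slot in the map, so a map byte covers four heap blocks and HEAPBLK_TO_MAPBIT scales the in-byte position by BITS_PER_HEAPBLOCK. The sketch below mirrors the patched read and set paths against a plain byte array. The constants are assumed to match visibilitymap.h after this commit (the header is not part of this diff), and vm_set_flags/vm_get_status are hypothetical stand-ins, not the PostgreSQL functions.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Post-patch constants; in PostgreSQL these live in visibilitymap.h,
     * which this diff does not show, so treat the values as assumptions. */
    #define BITS_PER_HEAPBLOCK          2
    #define HEAPBLOCKS_PER_BYTE         4   /* 8 bits / 2 bits per heap block */
    #define VISIBILITYMAP_ALL_VISIBLE   0x01
    #define VISIBILITYMAP_ALL_FROZEN    0x02
    #define VISIBILITYMAP_VALID_BITS    0x03

    /* Byte and bit position within one map page, as in the patched macros
     * (the block-level HEAPBLK_TO_MAPBLOCK step is elided here). */
    #define TO_MAPBYTE(blk) ((blk) / HEAPBLOCKS_PER_BYTE)
    #define TO_MAPBIT(blk)  (((blk) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)

    /* Mirrors the set path: OR the requested flags into the two-bit slot.
     * The patched visibilitymap_set only does this (and writes WAL) when
     * the stored status differs from the requested flags. */
    static void
    vm_set_flags(uint8_t *map, unsigned heapBlk, uint8_t flags)
    {
        assert(flags & VISIBILITYMAP_VALID_BITS);
        if (flags != ((map[TO_MAPBYTE(heapBlk)] >> TO_MAPBIT(heapBlk)) &
                      VISIBILITYMAP_VALID_BITS))
            map[TO_MAPBYTE(heapBlk)] |= (uint8_t) (flags << TO_MAPBIT(heapBlk));
    }

    /* Mirrors visibilitymap_get_status: shift the slot down and mask off
     * the two valid bits. */
    static uint8_t
    vm_get_status(const uint8_t *map, unsigned heapBlk)
    {
        return (map[TO_MAPBYTE(heapBlk)] >> TO_MAPBIT(heapBlk)) &
               VISIBILITYMAP_VALID_BITS;
    }

    int
    main(void)
    {
        uint8_t map[2] = {0, 0};    /* covers heap blocks 0..7 */

        vm_set_flags(map, 5, VISIBILITYMAP_ALL_VISIBLE);
        vm_set_flags(map, 5, VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);

        /* Block 5 lives in byte 1, bits 2-3. */
        printf("status of block 5: 0x%x\n", vm_get_status(map, 5));  /* 0x3 */
        printf("status of block 0: 0x%x\n", vm_get_status(map, 0));  /* 0x0 */
        return 0;
    }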
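The two replacement lookup tables follow directly from that layout: all-visible flags occupy the even bit positions of each map byte (mask 0x55) and all-frozen flags the odd positions (mask 0xAA), so each table entry is just a masked population count. A minimal sketch, assuming that reading of the tables, which regenerates and spot-checks them:

    #include <assert.h>
    #include <stdint.h>

    /* Derived from the two-bit layout, not taken verbatim from the patch:
     * even bit positions hold all-visible, odd positions all-frozen. */
    #define VISIBLE_MASK 0x55   /* 01010101 */
    #define FROZEN_MASK  0xAA   /* 10101010 */

    static int
    popcount8(uint8_t b)
    {
        int n = 0;

        for (; b != 0; b &= b - 1)  /* clear lowest set bit */
            n++;
        return n;
    }

    int
    main(void)
    {
        uint8_t visible[256];
        uint8_t frozen[256];

        for (int b = 0; b < 256; b++)
        {
            visible[b] = (uint8_t) popcount8((uint8_t) b & VISIBLE_MASK);
            frozen[b] = (uint8_t) popcount8((uint8_t) b & FROZEN_MASK);
        }

        /* Spot-check against the tables in the patch. */
        assert(visible[0x01] == 1 && frozen[0x01] == 0);  /* block 0 all-visible */
        assert(visible[0x02] == 0 && frozen[0x02] == 1);  /* block 0 all-frozen */
        assert(visible[0xFF] == 4 && frozen[0xFF] == 4);  /* all four blocks, both bits */
        return 0;
    }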
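The reworked visibilitymap_count is then a single pass over the map bytes, with one table lookup per counter, skipping the frozen tally when the caller passes NULL (the Assert in the patch makes all_visible mandatory). A self-contained sketch of the same loop over one in-memory page, using masked popcounts in place of the tables (equivalent; the tables just trade memory for speed), with count_map_bytes as a hypothetical stand-in:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static unsigned
    popcount8(uint8_t b)
    {
        unsigned n = 0;

        for (; b != 0; b &= b - 1)
            n++;
        return n;
    }

    /* Single-pass tally shaped like the loop in the patched
     * visibilitymap_count; all_frozen may be NULL, mirroring the new API. */
    static void
    count_map_bytes(const uint8_t *map, size_t nbytes,
                    uint64_t *all_visible, uint64_t *all_frozen)
    {
        *all_visible = 0;
        if (all_frozen)
            *all_frozen = 0;

        for (size_t i = 0; i < nbytes; i++)
        {
            *all_visible += popcount8(map[i] & 0x55);
            if (all_frozen)
                *all_frozen += popcount8(map[i] & 0xAA);
        }
    }

    int
    main(void)
    {
        uint8_t  map[4];
        uint64_t vis, froz;

        memset(map, 0xFF, sizeof(map));  /* 16 heap blocks, all bits set */
        count_map_bytes(map, sizeof(map), &vis, &froz);
        printf("all-visible: %llu, all-frozen: %llu\n",
               (unsigned long long) vis, (unsigned long long) froz);  /* 16, 16 */
        return 0;
    }

Keeping all_frozen optional presumably spares callers that only need the all-visible count from paying for the second lookup per byte.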