aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/heap/visibilitymap.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/heap/visibilitymap.c')
-rw-r--r--src/backend/access/heap/visibilitymap.c191
1 files changed, 109 insertions, 82 deletions
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index fc28f3f8c5d..2e64fc3dfe8 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -15,39 +15,42 @@
* visibilitymap_pin - pin a map page for setting a bit
* visibilitymap_pin_ok - check whether correct map page is already pinned
* visibilitymap_set - set a bit in a previously pinned page
- * visibilitymap_test - test if a bit is set
+ * visibilitymap_get_status - get status of bits
* visibilitymap_count - count number of bits set in visibility map
* visibilitymap_truncate - truncate the visibility map
*
* NOTES
*
- * The visibility map is a bitmap with one bit per heap page. A set bit means
- * that all tuples on the page are known visible to all transactions, and
- * therefore the page doesn't need to be vacuumed. The map is conservative in
- * the sense that we make sure that whenever a bit is set, we know the
- * condition is true, but if a bit is not set, it might or might not be true.
+ * The visibility map is a bitmap with two bits (all-visible and all-frozen)
+ * per heap page. A set all-visible bit means that all tuples on the page are
+ * known visible to all transactions, and therefore the page doesn't need to
+ * be vacuumed. A set all-frozen bit means that all tuples on the page are
+ * completely frozen, and therefore the page doesn't need to be vacuumed even
+ * if whole table scanning vacuum is required (e.g. anti-wraparound vacuum).
+ * The all-frozen bit must be set only when the page is already all-visible.
*
- * Clearing a visibility map bit is not separately WAL-logged. The callers
+ * The map is conservative in the sense that we make sure that whenever a bit
+ * is set, we know the condition is true, but if a bit is not set, it might or
+ * might not be true.
+ *
+ * Clearing both visibility map bits is not separately WAL-logged. The callers
* must make sure that whenever a bit is cleared, the bit is cleared on WAL
* replay of the updating operation as well.
*
* When we *set* a visibility map during VACUUM, we must write WAL. This may
* seem counterintuitive, since the bit is basically a hint: if it is clear,
- * it may still be the case that every tuple on the page is visible to all
- * transactions; we just don't know that for certain. The difficulty is that
- * there are two bits which are typically set together: the PD_ALL_VISIBLE bit
- * on the page itself, and the visibility map bit. If a crash occurs after the
- * visibility map page makes it to disk and before the updated heap page makes
- * it to disk, redo must set the bit on the heap page. Otherwise, the next
- * insert, update, or delete on the heap page will fail to realize that the
- * visibility map bit must be cleared, possibly causing index-only scans to
- * return wrong answers.
+ * it may still be the case that every tuple on the page is all-visible or
+ * all-frozen we just don't know that for certain. The difficulty is that
+ * there are two bits which are typically set together: the PD_ALL_VISIBLE
+ * or PD_ALL_FROZEN bit on the page itself, and the corresponding visibility
+ * map bit. If a crash occurs after the visibility map page makes it to disk
+ * and before the updated heap page makes it to disk, redo must set the bit on
+ * the heap page. Otherwise, the next insert, update, or delete on the heap
+ * page will fail to realize that the visibility map bit must be cleared,
+ * possibly causing index-only scans to return wrong answers.
*
* VACUUM will normally skip pages for which the visibility map bit is set;
* such pages can't contain any dead tuples and therefore don't need vacuuming.
- * The visibility map is not used for anti-wraparound vacuums, because
- * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
- * present in the table, even on pages that don't have any dead tuples.
*
* LOCKING
*
@@ -101,38 +104,50 @@
*/
#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-/* Number of bits allocated for each heap block. */
-#define BITS_PER_HEAPBLOCK 1
-
-/* Number of heap blocks we can represent in one byte. */
-#define HEAPBLOCKS_PER_BYTE 8
-
/* Number of heap blocks we can represent in one visibility map page. */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
/* Mapping from heap block number to the right bit in the visibility map */
#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE)
-
-/* table for fast counting of set bits */
-static const uint8 number_of_ones[256] = {
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+#define HEAPBLK_TO_MAPBIT(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
+/* tables for fast counting of set bits for visible and frozen */
+static const uint8 number_of_ones_for_visible[256] = {
+ 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+ 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
+ 1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
+ 2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4
+};
+static const uint8 number_of_ones_for_frozen[256] = {
+ 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+ 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+ 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+ 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
+ 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
+ 2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4
};
/* prototypes for internal routines */
@@ -141,7 +156,7 @@ static void vm_extend(Relation rel, BlockNumber nvmblocks);
/*
- * visibilitymap_clear - clear a bit in visibility map
+ * visibilitymap_clear - clear all bits in visibility map
*
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
@@ -153,7 +168,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
int mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
- uint8 mask = 1 << mapBit;
+ uint8 mask = VISIBILITYMAP_VALID_BITS << mapBit;
char *map;
#ifdef TRACE_VISIBILITYMAP
@@ -186,7 +201,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
* visibilitymap_set to actually set the bit.
*
* On entry, *buf should be InvalidBuffer or a valid buffer returned by
- * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
* relation. On return, *buf is a valid buffer with the map page containing
* the bit for heapBlk.
*
@@ -212,7 +227,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
* visibilitymap_pin_ok - do we already have the correct page pinned?
*
* On entry, buf should be InvalidBuffer or a valid buffer returned by
- * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
* relation. The return value indicates whether the buffer covers the
* given heapBlk.
*/
@@ -225,19 +240,22 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
}
/*
- * visibilitymap_set - set a bit on a previously pinned page
+ * visibilitymap_set - set bit(s) on a previously pinned page
*
* recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
* or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
* one provided; in normal running, we generate a new XLOG record and set the
* page LSN to that value. cutoff_xid is the largest xmin on the page being
* marked all-visible; it is needed for Hot Standby, and can be
- * InvalidTransactionId if the page contains no tuples.
+ * InvalidTransactionId if the page contains no tuples. It can also be set
+ * to InvalidTransactionId when a page that is already all-visible is being
+ * marked all-frozen.
*
- * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
- * this function. Except in recovery, caller should also pass the heap
- * buffer. When checksums are enabled and we're not in recovery, we must add
- * the heap buffer to the WAL chain to protect it from being torn.
+ * Caller is expected to set the heap page's PD_ALL_VISIBLE or PD_ALL_FROZEN
+ * bit before calling this function. Except in recovery, caller should also
+ * pass the heap buffer and flags which indicates what flag we want to set.
+ * When checksums are enabled and we're not in recovery, we must add the heap
+ * buffer to the WAL chain to protect it from being torn.
*
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
@@ -245,13 +263,14 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
*/
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
- XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
+ XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid,
+ uint8 flags)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
Page page;
- char *map;
+ uint8 *map;
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
@@ -259,6 +278,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
Assert(InRecovery || BufferIsValid(heapBuf));
+ Assert(flags & VISIBILITYMAP_VALID_BITS);
/* Check that we have the right heap page pinned, if present */
if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
@@ -269,14 +289,14 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
elog(ERROR, "wrong VM buffer passed to visibilitymap_set");
page = BufferGetPage(vmBuf);
- map = PageGetContents(page);
+ map = (uint8 *)PageGetContents(page);
LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);
- if (!(map[mapByte] & (1 << mapBit)))
+ if (flags != (map[mapByte] >> mapBit & VISIBILITYMAP_VALID_BITS))
{
START_CRIT_SECTION();
- map[mapByte] |= (1 << mapBit);
+ map[mapByte] |= (flags << mapBit);
MarkBufferDirty(vmBuf);
if (RelationNeedsWAL(rel))
@@ -285,7 +305,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
{
Assert(!InRecovery);
recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
- cutoff_xid);
+ cutoff_xid, flags);
/*
* If data checksums are enabled (or wal_log_hints=on), we
@@ -295,8 +315,10 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
{
Page heapPage = BufferGetPage(heapBuf);
- /* caller is expected to set PD_ALL_VISIBLE first */
- Assert(PageIsAllVisible(heapPage));
+ /* Caller is expected to set page-level bits first. */
+ Assert((flags & VISIBILITYMAP_ALL_VISIBLE) == 0 || PageIsAllVisible(heapPage));
+ Assert((flags & VISIBILITYMAP_ALL_FROZEN) == 0 || PageIsAllFrozen(heapPage));
+
PageSetLSN(heapPage, recptr);
}
}
@@ -310,15 +332,17 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
}
/*
- * visibilitymap_test - test if a bit is set
+ * visibilitymap_get_status - get status of bits
*
- * Are all tuples on heapBlk visible to all, according to the visibility map?
+ * Are all tuples on heapBlk visible to all or are marked frozen, according
+ * to the visibility map?
*
* On entry, *buf should be InvalidBuffer or a valid buffer returned by an
- * earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
* relation. On return, *buf is a valid buffer with the map page containing
* the bit for heapBlk, or InvalidBuffer. The caller is responsible for
- * releasing *buf after it's done testing and setting bits.
+ * releasing *buf after it's done testing and setting bits, and must pass flags
+ * for which it needs to check the value in visibility map.
*
* NOTE: This function is typically called without a lock on the heap page,
* so somebody else could change the bit just after we look at it. In fact,
@@ -327,17 +351,16 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
* we might see the old value. It is the caller's responsibility to deal with
* all concurrency issues!
*/
-bool
-visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
+uint8
+visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
- bool result;
char *map;
#ifdef TRACE_VISIBILITYMAP
- elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
+ elog(DEBUG1, "vm_get_status %s %d", RelationGetRelationName(rel), heapBlk);
#endif
/* Reuse the old pinned buffer if possible */
@@ -360,13 +383,11 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
map = PageGetContents(BufferGetPage(*buf));
/*
- * A single-bit read is atomic. There could be memory-ordering effects
+ * A single byte read is atomic. There could be memory-ordering effects
* here, but for performance reasons we make it the caller's job to worry
* about that.
*/
- result = (map[mapByte] & (1 << mapBit)) ? true : false;
-
- return result;
+ return ((map[mapByte] >> mapBit) & VISIBILITYMAP_VALID_BITS);
}
/*
@@ -374,14 +395,20 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
*
* Note: we ignore the possibility of race conditions when the table is being
* extended concurrently with the call. New pages added to the table aren't
- * going to be marked all-visible, so they won't affect the result.
+ * going to be marked all-visible or all-frozen, so they won't affect the result.
*/
-BlockNumber
-visibilitymap_count(Relation rel)
+void
+visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen)
{
- BlockNumber result = 0;
BlockNumber mapBlock;
+ /* all_visible must be specified */
+ Assert(all_visible);
+
+ *all_visible = 0;
+ if (all_frozen)
+ *all_frozen = 0;
+
for (mapBlock = 0;; mapBlock++)
{
Buffer mapBuffer;
@@ -406,13 +433,13 @@ visibilitymap_count(Relation rel)
for (i = 0; i < MAPSIZE; i++)
{
- result += number_of_ones[map[i]];
+ *all_visible += number_of_ones_for_visible[map[i]];
+ if (all_frozen)
+ *all_frozen += number_of_ones_for_frozen[map[i]];
}
ReleaseBuffer(mapBuffer);
}
-
- return result;
}
/*