-rw-r--r--  src/backend/access/heap/heapam.c           18
-rw-r--r--  src/backend/access/heap/visibilitymap.c    11
-rw-r--r--  src/backend/commands/vacuumlazy.c          34
-rw-r--r--  src/backend/executor/nodeIndexonlyscan.c   13
4 files changed, 61 insertions, 15 deletions
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index ce52e2dd48e..2d81383ae8a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2149,15 +2149,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
&vmbuffer, NULL);
page = BufferGetPage(buffer);
- if (PageIsAllVisible(page))
- {
- all_visible_cleared = true;
- PageClearAllVisible(page);
- visibilitymap_clear(relation,
- BufferGetBlockNumber(buffer),
- vmbuffer);
- }
-
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
@@ -2172,6 +2163,15 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
RelationPutHeapTuple(relation, buffer, heaptup);
}
+ if (PageIsAllVisible(page))
+ {
+ all_visible_cleared = true;
+ PageClearAllVisible(page);
+ visibilitymap_clear(relation,
+ BufferGetBlockNumber(buffer),
+ vmbuffer);
+ }
+
/*
* XXX Should we set PageSetPrunable on this page ? See heap_insert()
*/
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 5696abe4d2a..9152c7d1511 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -293,6 +293,13 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
* relation. On return, *buf is a valid buffer with the map page containing
* the bit for heapBlk, or InvalidBuffer. The caller is responsible for
* releasing *buf after it's done testing and setting bits.
+ *
+ * NOTE: This function is typically called without a lock on the heap page,
+ * so somebody else could change the bit just after we look at it. In fact,
+ * since we don't lock the visibility map page either, it's even possible that
+ someone else could have changed the bit just before we look at it, and yet
+ * we might see the old value. It is the caller's responsibility to deal with
+ * all concurrency issues!
*/
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
@@ -327,7 +334,9 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
map = PageGetContents(BufferGetPage(*buf));
/*
- * We don't need to lock the page, as we're only looking at a single bit.
+ * A single-bit read is atomic. There could be memory-ordering effects
+ * here, but for performance reasons we make it the caller's job to worry
+ * about that.
*/
result = (map[mapByte] & (1 << mapBit)) ? true : false;
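
The comments added above make the contract explicit: visibilitymap_test reads the bit with no lock on the heap page or on the map page, relying only on a single-byte read being atomic, and leaves every staleness and ordering question to the caller. A minimal standalone sketch of such an unlocked bit test, with simplified stand-ins for the real constants and a hypothetical helper name (map_bit_is_set is not the PostgreSQL function):

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-ins for the real visibility map constants. */
#define MAPSIZE               8192   /* usable bytes per map page */
#define HEAPBLOCKS_PER_BYTE      8   /* one visibility bit per heap block */

typedef uint32_t BlockNumber;

/*
 * Test the bit for heapBlk in an in-memory copy of one map page, holding no
 * lock.  A one-byte read can never be torn, so the result is always some
 * value the byte really held; it just may be stale by the time we use it.
 */
static bool
map_bit_is_set(const uint8_t *map, BlockNumber heapBlk)
{
    uint32_t mapByte = (heapBlk / HEAPBLOCKS_PER_BYTE) % MAPSIZE;
    uint32_t mapBit  = heapBlk % HEAPBLOCKS_PER_BYTE;

    return (map[mapByte] & (1 << mapBit)) != 0;
}

The only guarantee a caller gets is "no torn reads"; the vacuumlazy.c and nodeIndexonlyscan.c hunks below show the two ways callers are expected to cope with a possibly stale answer.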
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 0e0193d40e1..3ff56a73664 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -419,6 +419,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
* Note: if scan_all is true, we won't actually skip any pages; but we
* maintain next_not_all_visible_block anyway, so as to set up the
* all_visible_according_to_vm flag correctly for each page.
+ *
+ * Note: The value returned by visibilitymap_test could be slightly
+ * out-of-date, since we make this test before reading the corresponding
+ * heap page or locking the buffer. This is OK. If we mistakenly think
+ * that the page is all-visible when in fact the flag's just been cleared,
+ * we might fail to vacuum the page. But it's OK to skip pages when
+ * scan_all is not set, so no great harm done; the next vacuum will find
+ * them. If we make the reverse mistake and vacuum a page unnecessarily,
+ * it'll just be a no-op.
*/
for (next_not_all_visible_block = 0;
next_not_all_visible_block < nblocks;
@@ -852,22 +861,37 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
freespace = PageGetHeapFreeSpace(page);
/* mark page all-visible, if appropriate */
- if (all_visible && !all_visible_according_to_vm)
+ if (all_visible)
{
if (!PageIsAllVisible(page))
{
PageSetAllVisible(page);
MarkBufferDirty(buf);
+ visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
+ visibility_cutoff_xid);
+ }
+ else if (!all_visible_according_to_vm)
+ {
+ /*
+ * It should never be the case that the visibility map page
+ * is set while the page-level bit is clear, but the reverse
+ * is allowed. Set the visibility map bit as well so that
+ * we get back in sync.
+ */
+ visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
+ visibility_cutoff_xid);
}
- visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
- visibility_cutoff_xid);
}
/*
* As of PostgreSQL 9.2, the visibility map bit should never be set if
- * the page-level bit is clear.
+ * the page-level bit is clear. However, it's possible that the bit
+ * got cleared after we checked it and before we took the buffer
+ * content lock, so we must recheck before jumping to the conclusion
+ * that something bad has happened.
*/
- else if (all_visible_according_to_vm && !PageIsAllVisible(page))
+ else if (all_visible_according_to_vm && !PageIsAllVisible(page)
+ && visibilitymap_test(onerel, blkno, &vmbuffer))
{
elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
relname, blkno);
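
The vacuumlazy.c changes above follow a recheck-under-the-lock pattern: all_visible_according_to_vm was sampled before the heap page was read and locked, so it may be stale, and the WARNING is raised only if visibilitymap_test still reports the bit set while the buffer content lock is held. A standalone sketch of the same pattern, assuming toy names (ToyPage, check_page) and a pthread mutex standing in for the buffer content lock:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct ToyPage
{
    pthread_mutex_t content_lock;     /* models the buffer content lock */
    bool            page_all_visible; /* models PD_ALL_VISIBLE, read under lock */
    atomic_bool     vm_bit;           /* models the VM bit, read without lock */
} ToyPage;

static void
check_page(ToyPage *p, unsigned blkno)
{
    /* Unlocked peek, like testing the VM before reading the heap page. */
    bool vm_hint = atomic_load(&p->vm_bit);

    pthread_mutex_lock(&p->content_lock);

    /*
     * The hint may be stale: both bits may have been cleared after we took
     * the peek.  Recheck the map bit before claiming the two bits disagree.
     */
    if (vm_hint && !p->page_all_visible && atomic_load(&p->vm_bit))
        fprintf(stderr,
                "page %u is not marked all-visible but VM bit is set\n",
                blkno);

    pthread_mutex_unlock(&p->content_lock);
}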
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 4abd805aa31..af31671b3eb 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -82,6 +82,19 @@ IndexOnlyNext(IndexOnlyScanState *node)
* We can skip the heap fetch if the TID references a heap page on
* which all tuples are known visible to everybody. In any case,
* we'll use the index tuple not the heap tuple as the data source.
+ *
+ * Note on Memory Ordering Effects: visibilitymap_test does not lock
+ * the visibility map buffer, and therefore the result we read here
+ * could be slightly stale. However, it can't be stale enough to
+ * matter. It suffices to show that (1) there is a read barrier
+ * between the time we read the index TID and the time we test the
+ * visibility map; and (2) there is a write barrier between the time
+ * some other concurrent process clears the visibility map bit and the
+ * time it inserts the index TID. Since acquiring or releasing a
+ * LWLock interposes a full barrier, this is easy to show: (1) is
+ * satisfied by the release of the index buffer content lock after
+ * reading the TID; and (2) is satisfied by the acquisition of the
+ * buffer content lock in order to insert the TID.
*/
if (!visibilitymap_test(scandesc->heapRelation,
ItemPointerGetBlockNumber(tid),
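
The memory-ordering note above is the heart of the index-only-scan side of this patch: a write barrier separates "clear the visibility map bit" from "insert the index TID" in the inserting backend, and a read barrier separates "read the TID" from "test the bit" in the scanning backend, so a scan that sees the new TID cannot also see a stale all-visible bit. Below is a standalone C11 model of that pairing, using explicit fences where PostgreSQL relies on the full barriers implied by LWLock acquire and release; all names here (vm_all_visible, tid_slot, inserter, scanner) are hypothetical.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool vm_all_visible = true; /* the VM bit for one heap page */
static atomic_uint tid_slot = 0;          /* an index entry; 0 = not inserted yet */

/* Inserting backend: clear the bit, then publish the new index entry. */
static void *
inserter(void *arg)
{
    (void) arg;

    atomic_store_explicit(&vm_all_visible, false, memory_order_relaxed);

    /*
     * Write barrier: in PostgreSQL this is implied by acquiring the buffer
     * content lock in order to insert the TID.
     */
    atomic_thread_fence(memory_order_release);

    atomic_store_explicit(&tid_slot, 42, memory_order_relaxed);
    return NULL;
}

/* Index-only scan: read the index entry, then test the visibility map bit. */
static void *
scanner(void *arg)
{
    (void) arg;

    unsigned tid = atomic_load_explicit(&tid_slot, memory_order_relaxed);

    /*
     * Read barrier: in PostgreSQL this is implied by releasing the index
     * buffer content lock after reading the TID.
     */
    atomic_thread_fence(memory_order_acquire);

    bool all_visible = atomic_load_explicit(&vm_all_visible,
                                            memory_order_relaxed);

    /* If we saw the new TID, the fences force us to also see the cleared bit. */
    if (tid != 0 && all_visible)
        fprintf(stderr, "impossible: saw TID %u with a stale VM bit\n", tid);
    return NULL;
}

int
main(void)
{
    pthread_t t1, t2;

    pthread_create(&t1, NULL, inserter, NULL);
    pthread_create(&t2, NULL, scanner, NULL);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}

If the scanner reads tid_slot as zero it simply has not seen the new entry yet, which is harmless; the fences only need to guarantee that whoever does see the entry also sees the cleared bit.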