author    | Andres Freund <andres@anarazel.de> | 2019-01-15 12:06:19 -0800
committer | Andres Freund <andres@anarazel.de> | 2019-01-15 12:06:19 -0800
commit    | 285d8e12055f27bce5675e93fef365b6c337f2b3 (patch)
tree      | ddfaa994ec647460be3247289c80caa53719a74f /src/backend/access
parent    | 148e632c05412aa46b450d31cc598a0a33222792 (diff)
Move vacuumlazy.c into access/heap.
It's heap-table-storage-specific code that can't realistically be
generalized into table-AM-agnostic code.
Author: Andres Freund
Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
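The rationale above is about keeping AM-specific work behind an AM-specific entry point. As a rough illustration only — the struct and helper names below are invented for this sketch and are not part of this commit or of any PostgreSQL API — vacuum dispatch through a per-table-AM callback table could look like the following, reusing the heap_vacuum_rel() signature that appears in the diff further down:

/* Illustrative sketch only: the names here are invented; only
 * heap_vacuum_rel() and its signature come from the diff below.
 * Assumes the usual PostgreSQL headers providing Relation,
 * VacuumParams and BufferAccessStrategy.
 */
typedef struct SketchTableAmRoutine
{
	/* AM-specific VACUUM entry point */
	void		(*relation_vacuum) (Relation onerel, int options,
									VacuumParams *params,
									BufferAccessStrategy bstrategy);
} SketchTableAmRoutine;

/* The heap AM supplies its own implementation ... */
static const SketchTableAmRoutine heap_sketch_routine = {
	.relation_vacuum = heap_vacuum_rel,
};

/* ... and AM-agnostic code dispatches through the routine table,
 * never needing to know that the heap uses a two-pass lazy vacuum. */
static void
table_relation_vacuum_sketch(Relation onerel, int options,
							 VacuumParams *params,
							 BufferAccessStrategy bstrategy)
{
	heap_sketch_routine.relation_vacuum(onerel, options, params, bstrategy);
}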
Diffstat (limited to 'src/backend/access')
-rw-r--r-- | src/backend/access/heap/Makefile     |    3
-rw-r--r-- | src/backend/access/heap/vacuumlazy.c | 2296
2 files changed, 2298 insertions, 1 deletion
diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index b83d496bcd7..7e7324a9166 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -12,6 +12,7 @@ subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o visibilitymap.o +OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o \ + vacuumlazy.o visibilitymap.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c new file mode 100644 index 00000000000..2d317a94620 --- /dev/null +++ b/src/backend/access/heap/vacuumlazy.c @@ -0,0 +1,2296 @@ +/*------------------------------------------------------------------------- + * + * vacuumlazy.c + * Concurrent ("lazy") vacuuming. + * + * + * The major space usage for LAZY VACUUM is storage for the array of dead tuple + * TIDs. We want to ensure we can vacuum even the very largest relations with + * finite memory space usage. To do that, we set upper bounds on the number of + * tuples we will keep track of at once. + * + * We are willing to use at most maintenance_work_mem (or perhaps + * autovacuum_work_mem) memory space to keep track of dead tuples. We + * initially allocate an array of TIDs of that size, with an upper limit that + * depends on table size (this limit ensures we don't allocate a huge area + * uselessly for vacuuming small tables). If the array threatens to overflow, + * we suspend the heap scan phase and perform a pass of index cleanup and page + * compaction, then resume the heap scan with an empty TID array. + * + * If we're processing a table with no indexes, we can just vacuum each page + * as we go; there's no need to save up multiple tuples to minimize the number + * of index scans performed. So we don't use maintenance_work_mem memory for + * the TID array, just enough to hold as many heap tuples as fit on one page. + * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/heap/vacuumlazy.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/genam.h" +#include "access/heapam.h" +#include "access/heapam_xlog.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/transam.h" +#include "access/visibilitymap.h" +#include "access/xlog.h" +#include "catalog/storage.h" +#include "commands/dbcommands.h" +#include "commands/progress.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/instr_time.h" +#include "postmaster/autovacuum.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "storage/lmgr.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/pg_rusage.h" +#include "utils/timestamp.h" +#include "utils/tqual.h" + + +/* + * Space/time tradeoff parameters: do these need to be user-tunable? + * + * To consider truncating the relation, we want there to be at least + * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever + * is less) potentially-freeable pages. + */ +#define REL_TRUNCATE_MINIMUM 1000 +#define REL_TRUNCATE_FRACTION 16 + +/* + * Timing parameters for truncate locking heuristics. 
+ * + * These were not exposed as user tunable GUC values because it didn't seem + * that the potential for improvement was great enough to merit the cost of + * supporting them. + */ +#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */ +#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */ +#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */ + +/* + * When a table has no indexes, vacuum the FSM after every 8GB, approximately + * (it won't be exact because we only vacuum FSM after processing a heap page + * that has some removable tuples). When there are indexes, this is ignored, + * and we vacuum FSM after each index/heap cleaning pass. + */ +#define VACUUM_FSM_EVERY_PAGES \ + ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) + +/* + * Guesstimation of number of dead tuples per page. This is used to + * provide an upper limit to memory allocated when vacuuming small + * tables. + */ +#define LAZY_ALLOC_TUPLES MaxHeapTuplesPerPage + +/* + * Before we consider skipping a page that's marked as clean in + * visibility map, we must've seen at least this many clean pages. + */ +#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) + +/* + * Size of the prefetch window for lazy vacuum backwards truncation scan. + * Needs to be a power of 2. + */ +#define PREFETCH_SIZE ((BlockNumber) 32) + +typedef struct LVRelStats +{ + /* hasindex = true means two-pass strategy; false means one-pass */ + bool hasindex; + /* Overall statistics about rel */ + BlockNumber old_rel_pages; /* previous value of pg_class.relpages */ + BlockNumber rel_pages; /* total number of pages */ + BlockNumber scanned_pages; /* number of pages we examined */ + BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ + BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ + BlockNumber tupcount_pages; /* pages whose tuples we counted */ + double old_live_tuples; /* previous value of pg_class.reltuples */ + double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ + double new_dead_tuples; /* new estimated total # of dead tuples */ + BlockNumber pages_removed; + double tuples_deleted; + BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */ + /* List of TIDs of tuples we intend to delete */ + /* NB: this list is ordered by TID address */ + int num_dead_tuples; /* current # of entries */ + int max_dead_tuples; /* # slots allocated in array */ + ItemPointer dead_tuples; /* array of ItemPointerData */ + int num_index_scans; + TransactionId latestRemovedXid; + bool lock_waiter_detected; +} LVRelStats; + + +/* A few variables that don't seem worth passing around as parameters */ +static int elevel = -1; + +static TransactionId OldestXmin; +static TransactionId FreezeLimit; +static MultiXactId MultiXactCutoff; + +static BufferAccessStrategy vac_strategy; + + +/* non-export function prototypes */ +static void lazy_scan_heap(Relation onerel, int options, + LVRelStats *vacrelstats, Relation *Irel, int nindexes, + bool aggressive); +static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats); +static bool lazy_check_needs_freeze(Buffer buf, bool *hastup); +static void lazy_vacuum_index(Relation indrel, + IndexBulkDeleteResult **stats, + LVRelStats *vacrelstats); +static void lazy_cleanup_index(Relation indrel, + IndexBulkDeleteResult *stats, + LVRelStats *vacrelstats); +static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, + int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer); 
+static bool should_attempt_truncation(LVRelStats *vacrelstats); +static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats); +static BlockNumber count_nondeletable_pages(Relation onerel, + LVRelStats *vacrelstats); +static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks); +static void lazy_record_dead_tuple(LVRelStats *vacrelstats, + ItemPointer itemptr); +static bool lazy_tid_reaped(ItemPointer itemptr, void *state); +static int vac_cmp_itemptr(const void *left, const void *right); +static bool heap_page_is_all_visible(Relation rel, Buffer buf, + TransactionId *visibility_cutoff_xid, bool *all_frozen); + + +/* + * vacuum_heap_rel() -- perform VACUUM for one heap relation + * + * This routine vacuums a single heap, cleans out its indexes, and + * updates its relpages and reltuples statistics. + * + * At entry, we have already established a transaction and opened + * and locked the relation. + */ +void +heap_vacuum_rel(Relation onerel, int options, VacuumParams *params, + BufferAccessStrategy bstrategy) +{ + LVRelStats *vacrelstats; + Relation *Irel; + int nindexes; + PGRUsage ru0; + TimestampTz starttime = 0; + long secs; + int usecs; + double read_rate, + write_rate; + bool aggressive; /* should we scan all unfrozen pages? */ + bool scanned_all_unfrozen; /* actually scanned all such pages? */ + TransactionId xidFullScanLimit; + MultiXactId mxactFullScanLimit; + BlockNumber new_rel_pages; + BlockNumber new_rel_allvisible; + double new_live_tuples; + TransactionId new_frozen_xid; + MultiXactId new_min_multi; + + Assert(params != NULL); + + /* measure elapsed time iff autovacuum logging requires it */ + if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) + { + pg_rusage_init(&ru0); + starttime = GetCurrentTimestamp(); + } + + if (options & VACOPT_VERBOSE) + elevel = INFO; + else + elevel = DEBUG2; + + pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM, + RelationGetRelid(onerel)); + + vac_strategy = bstrategy; + + vacuum_set_xid_limits(onerel, + params->freeze_min_age, + params->freeze_table_age, + params->multixact_freeze_min_age, + params->multixact_freeze_table_age, + &OldestXmin, &FreezeLimit, &xidFullScanLimit, + &MultiXactCutoff, &mxactFullScanLimit); + + /* + * We request an aggressive scan if the table's frozen Xid is now older + * than or equal to the requested Xid full-table scan limit; or if the + * table's minimum MultiXactId is older than or equal to the requested + * mxid full-table scan limit; or if DISABLE_PAGE_SKIPPING was specified. + */ + aggressive = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, + xidFullScanLimit); + aggressive |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid, + mxactFullScanLimit); + if (options & VACOPT_DISABLE_PAGE_SKIPPING) + aggressive = true; + + vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); + + vacrelstats->old_rel_pages = onerel->rd_rel->relpages; + vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; + vacrelstats->num_index_scans = 0; + vacrelstats->pages_removed = 0; + vacrelstats->lock_waiter_detected = false; + + /* Open all indexes of the relation */ + vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); + vacrelstats->hasindex = (nindexes > 0); + + /* Do the vacuuming */ + lazy_scan_heap(onerel, options, vacrelstats, Irel, nindexes, aggressive); + + /* Done with indexes */ + vac_close_indexes(nindexes, Irel, NoLock); + + /* + * Compute whether we actually scanned the all unfrozen pages. 
If we did, + * we can adjust relfrozenxid and relminmxid. + * + * NB: We need to check this before truncating the relation, because that + * will change ->rel_pages. + */ + if ((vacrelstats->scanned_pages + vacrelstats->frozenskipped_pages) + < vacrelstats->rel_pages) + { + Assert(!aggressive); + scanned_all_unfrozen = false; + } + else + scanned_all_unfrozen = true; + + /* + * Optionally truncate the relation. + */ + if (should_attempt_truncation(vacrelstats)) + lazy_truncate_heap(onerel, vacrelstats); + + /* Report that we are now doing final cleanup */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); + + /* + * Update statistics in pg_class. + * + * A corner case here is that if we scanned no pages at all because every + * page is all-visible, we should not update relpages/reltuples, because + * we have no new information to contribute. In particular this keeps us + * from replacing relpages=reltuples=0 (which means "unknown tuple + * density") with nonzero relpages and reltuples=0 (which means "zero + * tuple density") unless there's some actual evidence for the latter. + * + * It's important that we use tupcount_pages and not scanned_pages for the + * check described above; scanned_pages counts pages where we could not + * get cleanup lock, and which were processed only for frozenxid purposes. + * + * We do update relallvisible even in the corner case, since if the table + * is all-visible we'd definitely like to know that. But clamp the value + * to be not more than what we're setting relpages to. + * + * Also, don't change relfrozenxid/relminmxid if we skipped any pages, + * since then we don't know for certain that all tuples have a newer xmin. + */ + new_rel_pages = vacrelstats->rel_pages; + new_live_tuples = vacrelstats->new_live_tuples; + if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0) + { + new_rel_pages = vacrelstats->old_rel_pages; + new_live_tuples = vacrelstats->old_live_tuples; + } + + visibilitymap_count(onerel, &new_rel_allvisible, NULL); + if (new_rel_allvisible > new_rel_pages) + new_rel_allvisible = new_rel_pages; + + new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId; + new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId; + + vac_update_relstats(onerel, + new_rel_pages, + new_live_tuples, + new_rel_allvisible, + vacrelstats->hasindex, + new_frozen_xid, + new_min_multi, + false); + + /* report results to the stats collector, too */ + pgstat_report_vacuum(RelationGetRelid(onerel), + onerel->rd_rel->relisshared, + new_live_tuples, + vacrelstats->new_dead_tuples); + pgstat_progress_end_command(); + + /* and log the action if appropriate */ + if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) + { + TimestampTz endtime = GetCurrentTimestamp(); + + if (params->log_min_duration == 0 || + TimestampDifferenceExceeds(starttime, endtime, + params->log_min_duration)) + { + StringInfoData buf; + char *msgfmt; + + TimestampDifference(starttime, endtime, &secs, &usecs); + + read_rate = 0; + write_rate = 0; + if ((secs > 0) || (usecs > 0)) + { + read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) / + (secs + usecs / 1000000.0); + write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) / + (secs + usecs / 1000000.0); + } + + /* + * This is pretty messy, but we split it up so that we can skip + * emitting individual parts of the message when not applicable. 
+ */ + initStringInfo(&buf); + if (params->is_wraparound) + { + if (aggressive) + msgfmt = _("automatic aggressive vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); + else + msgfmt = _("automatic vacuum to prevent wraparound of table \"%s.%s.%s\": index scans: %d\n"); + } + else + { + if (aggressive) + msgfmt = _("automatic aggressive vacuum of table \"%s.%s.%s\": index scans: %d\n"); + else + msgfmt = _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"); + } + appendStringInfo(&buf, msgfmt, + get_database_name(MyDatabaseId), + get_namespace_name(RelationGetNamespace(onerel)), + RelationGetRelationName(onerel), + vacrelstats->num_index_scans); + appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"), + vacrelstats->pages_removed, + vacrelstats->rel_pages, + vacrelstats->pinskipped_pages, + vacrelstats->frozenskipped_pages); + appendStringInfo(&buf, + _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable, oldest xmin: %u\n"), + vacrelstats->tuples_deleted, + vacrelstats->new_rel_tuples, + vacrelstats->new_dead_tuples, + OldestXmin); + appendStringInfo(&buf, + _("buffer usage: %d hits, %d misses, %d dirtied\n"), + VacuumPageHit, + VacuumPageMiss, + VacuumPageDirty); + appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), + read_rate, write_rate); + appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); + + ereport(LOG, + (errmsg_internal("%s", buf.data))); + pfree(buf.data); + } + } +} + +/* + * For Hot Standby we need to know the highest transaction id that will + * be removed by any change. VACUUM proceeds in a number of passes so + * we need to consider how each pass operates. The first phase runs + * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it + * progresses - these will have a latestRemovedXid on each record. + * In some cases this removes all of the tuples to be removed, though + * often we have dead tuples with index pointers so we must remember them + * for removal in phase 3. Index records for those rows are removed + * in phase 2 and index blocks do not have MVCC information attached. + * So before we can allow removal of any index tuples we need to issue + * a WAL record containing the latestRemovedXid of rows that will be + * removed in phase three. This allows recovery queries to block at the + * correct place, i.e. before phase two, rather than during phase three + * which would be after the rows have become inaccessible. + */ +static void +vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats) +{ + /* + * Skip this for relations for which no WAL is to be written, or if we're + * not trying to support archive recovery. + */ + if (!RelationNeedsWAL(rel) || !XLogIsNeeded()) + return; + + /* + * No need to write the record at all unless it contains a valid value + */ + if (TransactionIdIsValid(vacrelstats->latestRemovedXid)) + (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid); +} + +/* + * lazy_scan_heap() -- scan an open heap relation + * + * This routine prunes each page in the heap, which will among other + * things truncate dead tuples to dead line pointers, defragment the + * page, and set commit status bits (see heap_page_prune). It also builds + * lists of dead tuples and pages with free space, calculates statistics + * on the number of live tuples in the heap, and marks pages as + * all-visible if appropriate. 
When done, or when we run low on space for + * dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap + * to reclaim dead line pointers. + * + * If there are no indexes then we can reclaim line pointers on the fly; + * dead line pointers need only be retained until all index pointers that + * reference them have been killed. + */ +static void +lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, + Relation *Irel, int nindexes, bool aggressive) +{ + BlockNumber nblocks, + blkno; + HeapTupleData tuple; + char *relname; + TransactionId relfrozenxid = onerel->rd_rel->relfrozenxid; + TransactionId relminmxid = onerel->rd_rel->relminmxid; + BlockNumber empty_pages, + vacuumed_pages, + next_fsm_block_to_vacuum; + double num_tuples, /* total number of nonremovable tuples */ + live_tuples, /* live tuples (reltuples estimate) */ + tups_vacuumed, /* tuples cleaned up by vacuum */ + nkeep, /* dead-but-not-removable tuples */ + nunused; /* unused item pointers */ + IndexBulkDeleteResult **indstats; + int i; + PGRUsage ru0; + Buffer vmbuffer = InvalidBuffer; + BlockNumber next_unskippable_block; + bool skipping_blocks; + xl_heap_freeze_tuple *frozen; + StringInfoData buf; + const int initprog_index[] = { + PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_TOTAL_HEAP_BLKS, + PROGRESS_VACUUM_MAX_DEAD_TUPLES + }; + int64 initprog_val[3]; + + pg_rusage_init(&ru0); + + relname = RelationGetRelationName(onerel); + if (aggressive) + ereport(elevel, + (errmsg("aggressively vacuuming \"%s.%s\"", + get_namespace_name(RelationGetNamespace(onerel)), + relname))); + else + ereport(elevel, + (errmsg("vacuuming \"%s.%s\"", + get_namespace_name(RelationGetNamespace(onerel)), + relname))); + + empty_pages = vacuumed_pages = 0; + next_fsm_block_to_vacuum = (BlockNumber) 0; + num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0; + + indstats = (IndexBulkDeleteResult **) + palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); + + nblocks = RelationGetNumberOfBlocks(onerel); + vacrelstats->rel_pages = nblocks; + vacrelstats->scanned_pages = 0; + vacrelstats->tupcount_pages = 0; + vacrelstats->nonempty_pages = 0; + vacrelstats->latestRemovedXid = InvalidTransactionId; + + lazy_space_alloc(vacrelstats, nblocks); + frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage); + + /* Report that we're scanning the heap, advertising total # of blocks */ + initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; + initprog_val[1] = nblocks; + initprog_val[2] = vacrelstats->max_dead_tuples; + pgstat_progress_update_multi_param(3, initprog_index, initprog_val); + + /* + * Except when aggressive is set, we want to skip pages that are + * all-visible according to the visibility map, but only when we can skip + * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading + * sequentially, the OS should be doing readahead for us, so there's no + * gain in skipping a page now and then; that's likely to disable + * readahead and so be counterproductive. Also, skipping even a single + * page means that we can't update relfrozenxid, so we only want to do it + * if we can skip a goodly number of pages. + * + * When aggressive is set, we can't skip pages just because they are + * all-visible, but we can still skip pages that are all-frozen, since + * such pages do not need freezing and do not affect the value that we can + * safely set for relfrozenxid or relminmxid. 
+ * + * Before entering the main loop, establish the invariant that + * next_unskippable_block is the next block number >= blkno that we can't + * skip based on the visibility map, either all-visible for a regular scan + * or all-frozen for an aggressive scan. We set it to nblocks if there's + * no such block. We also set up the skipping_blocks flag correctly at + * this stage. + * + * Note: The value returned by visibilitymap_get_status could be slightly + * out-of-date, since we make this test before reading the corresponding + * heap page or locking the buffer. This is OK. If we mistakenly think + * that the page is all-visible or all-frozen when in fact the flag's just + * been cleared, we might fail to vacuum the page. It's easy to see that + * skipping a page when aggressive is not set is not a very big deal; we + * might leave some dead tuples lying around, but the next vacuum will + * find them. But even when aggressive *is* set, it's still OK if we miss + * a page whose all-frozen marking has just been cleared. Any new XIDs + * just added to that page are necessarily newer than the GlobalXmin we + * computed, so they'll have no effect on the value to which we can safely + * set relfrozenxid. A similar argument applies for MXIDs and relminmxid. + * + * We will scan the table's last page, at least to the extent of + * determining whether it has tuples or not, even if it should be skipped + * according to the above rules; except when we've already determined that + * it's not worth trying to truncate the table. This avoids having + * lazy_truncate_heap() take access-exclusive lock on the table to attempt + * a truncation that just fails immediately because there are tuples in + * the last page. This is worth avoiding mainly because such a lock must + * be replayed on any hot standby, where it can be disruptive. 
+ */ + next_unskippable_block = 0; + if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) + { + while (next_unskippable_block < nblocks) + { + uint8 vmstatus; + + vmstatus = visibilitymap_get_status(onerel, next_unskippable_block, + &vmbuffer); + if (aggressive) + { + if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0) + break; + } + else + { + if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0) + break; + } + vacuum_delay_point(); + next_unskippable_block++; + } + } + + if (next_unskippable_block >= SKIP_PAGES_THRESHOLD) + skipping_blocks = true; + else + skipping_blocks = false; + + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buf; + Page page; + OffsetNumber offnum, + maxoff; + bool tupgone, + hastup; + int prev_dead_count; + int nfrozen; + Size freespace; + bool all_visible_according_to_vm = false; + bool all_visible; + bool all_frozen = true; /* provided all_visible is also true */ + bool has_dead_tuples; + TransactionId visibility_cutoff_xid = InvalidTransactionId; + + /* see note above about forcing scanning of last page */ +#define FORCE_CHECK_PAGE() \ + (blkno == nblocks - 1 && should_attempt_truncation(vacrelstats)) + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + + if (blkno == next_unskippable_block) + { + /* Time to advance next_unskippable_block */ + next_unskippable_block++; + if ((options & VACOPT_DISABLE_PAGE_SKIPPING) == 0) + { + while (next_unskippable_block < nblocks) + { + uint8 vmskipflags; + + vmskipflags = visibilitymap_get_status(onerel, + next_unskippable_block, + &vmbuffer); + if (aggressive) + { + if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0) + break; + } + else + { + if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0) + break; + } + vacuum_delay_point(); + next_unskippable_block++; + } + } + + /* + * We know we can't skip the current block. But set up + * skipping_blocks to do the right thing at the following blocks. + */ + if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD) + skipping_blocks = true; + else + skipping_blocks = false; + + /* + * Normally, the fact that we can't skip this block must mean that + * it's not all-visible. But in an aggressive vacuum we know only + * that it's not all-frozen, so it might still be all-visible. + */ + if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) + all_visible_according_to_vm = true; + } + else + { + /* + * The current block is potentially skippable; if we've seen a + * long enough run of skippable blocks to justify skipping it, and + * we're not forced to check it, then go ahead and skip. + * Otherwise, the page must be at least all-visible if not + * all-frozen, so we can set all_visible_according_to_vm = true. + */ + if (skipping_blocks && !FORCE_CHECK_PAGE()) + { + /* + * Tricky, tricky. If this is in aggressive vacuum, the page + * must have been all-frozen at the time we checked whether it + * was skippable, but it might not be any more. We must be + * careful to count it as a skipped all-frozen page in that + * case, or else we'll think we can't update relfrozenxid and + * relminmxid. If it's not an aggressive vacuum, we don't + * know whether it was all-frozen, so we have to recheck; but + * in this case an approximate answer is OK. 
+ */ + if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) + vacrelstats->frozenskipped_pages++; + continue; + } + all_visible_according_to_vm = true; + } + + vacuum_delay_point(); + + /* + * If we are close to overrunning the available space for dead-tuple + * TIDs, pause and do a cycle of vacuuming before we tackle this page. + */ + if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && + vacrelstats->num_dead_tuples > 0) + { + const int hvp_index[] = { + PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_NUM_INDEX_VACUUMS + }; + int64 hvp_val[2]; + + /* + * Before beginning index vacuuming, we release any pin we may + * hold on the visibility map page. This isn't necessary for + * correctness, but we do it anyway to avoid holding the pin + * across a lengthy, unrelated operation. + */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + /* Log cleanup info before we touch indexes */ + vacuum_log_cleanup_info(onerel, vacrelstats); + + /* Report that we are now vacuuming indexes */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_INDEX); + + /* Remove index entries */ + for (i = 0; i < nindexes; i++) + lazy_vacuum_index(Irel[i], + &indstats[i], + vacrelstats); + + /* + * Report that we are now vacuuming the heap. We also increase + * the number of index scans here; note that by using + * pgstat_progress_update_multi_param we can update both + * parameters atomically. + */ + hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP; + hvp_val[1] = vacrelstats->num_index_scans + 1; + pgstat_progress_update_multi_param(2, hvp_index, hvp_val); + + /* Remove tuples from heap */ + lazy_vacuum_heap(onerel, vacrelstats); + + /* + * Forget the now-vacuumed tuples, and press on, but be careful + * not to reset latestRemovedXid since we want that value to be + * valid. + */ + vacrelstats->num_dead_tuples = 0; + vacrelstats->num_index_scans++; + + /* + * Vacuum the Free Space Map to make newly-freed space visible on + * upper-level FSM pages. Note we have not yet processed blkno. + */ + FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno); + next_fsm_block_to_vacuum = blkno; + + /* Report that we are once again scanning the heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_SCAN_HEAP); + } + + /* + * Pin the visibility map page in case we need to mark the page + * all-visible. In most cases this will be very cheap, because we'll + * already have the correct page pinned anyway. However, it's + * possible that (a) next_unskippable_block is covered by a different + * VM page than the current block or (b) we released our pin and did a + * cycle of index vacuuming. + * + */ + visibilitymap_pin(onerel, blkno, &vmbuffer); + + buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, + RBM_NORMAL, vac_strategy); + + /* We need buffer cleanup lock so that we can prune HOT chains. */ + if (!ConditionalLockBufferForCleanup(buf)) + { + /* + * If we're not performing an aggressive scan to guard against XID + * wraparound, and we don't want to forcibly check the page, then + * it's OK to skip vacuuming pages we get a lock conflict on. They + * will be dealt with in some future vacuum. + */ + if (!aggressive && !FORCE_CHECK_PAGE()) + { + ReleaseBuffer(buf); + vacrelstats->pinskipped_pages++; + continue; + } + + /* + * Read the page with share lock to see if any xids on it need to + * be frozen. If not we just skip the page, after updating our + * scan statistics. 
If there are some, we wait for cleanup lock. + * + * We could defer the lock request further by remembering the page + * and coming back to it later, or we could even register + * ourselves for multiple buffers and then service whichever one + * is received first. For now, this seems good enough. + * + * If we get here with aggressive false, then we're just forcibly + * checking the page, and so we don't want to insist on getting + * the lock; we only need to know if the page contains tuples, so + * that we can update nonempty_pages correctly. It's convenient + * to use lazy_check_needs_freeze() for both situations, though. + */ + LockBuffer(buf, BUFFER_LOCK_SHARE); + if (!lazy_check_needs_freeze(buf, &hastup)) + { + UnlockReleaseBuffer(buf); + vacrelstats->scanned_pages++; + vacrelstats->pinskipped_pages++; + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + continue; + } + if (!aggressive) + { + /* + * Here, we must not advance scanned_pages; that would amount + * to claiming that the page contains no freezable tuples. + */ + UnlockReleaseBuffer(buf); + vacrelstats->pinskipped_pages++; + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + continue; + } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBufferForCleanup(buf); + /* drop through to normal processing */ + } + + vacrelstats->scanned_pages++; + vacrelstats->tupcount_pages++; + + page = BufferGetPage(buf); + + if (PageIsNew(page)) + { + /* + * An all-zeroes page could be left over if a backend extends the + * relation but crashes before initializing the page. Reclaim such + * pages for use. + * + * We have to be careful here because we could be looking at a + * page that someone has just added to the relation and not yet + * been able to initialize (see RelationGetBufferForTuple). To + * protect against that, release the buffer lock, grab the + * relation extension lock momentarily, and re-lock the buffer. If + * the page is still uninitialized by then, it must be left over + * from a crashed backend, and we can initialize it. + * + * We don't really need the relation lock when this is a new or + * temp relation, but it's probably not worth the code space to + * check that, since this surely isn't a critical path. + * + * Note: the comparable code in vacuum.c need not worry because + * it's got exclusive lock on the whole relation. + */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockRelationForExtension(onerel, ExclusiveLock); + UnlockRelationForExtension(onerel, ExclusiveLock); + LockBufferForCleanup(buf); + if (PageIsNew(page)) + { + ereport(WARNING, + (errmsg("relation \"%s\" page %u is uninitialized --- fixing", + relname, blkno))); + PageInit(page, BufferGetPageSize(buf), 0); + empty_pages++; + } + freespace = PageGetHeapFreeSpace(page); + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); + + RecordPageWithFreeSpace(onerel, blkno, freespace); + continue; + } + + if (PageIsEmpty(page)) + { + empty_pages++; + freespace = PageGetHeapFreeSpace(page); + + /* empty pages are always all-visible and all-frozen */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page + * due to an ERROR. Since heap extension is not WAL-logged, + * recovery might try to replay our record setting the page + * all-visible and find that the page isn't initialized, which + * will cause a PANIC. 
To prevent that, check whether the + * page has been previously WAL-logged, and if not, do that + * now. + */ + if (RelationNeedsWAL(onerel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + END_CRIT_SECTION(); + } + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, blkno, freespace); + continue; + } + + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, + &vacrelstats->latestRemovedXid); + + /* + * Now scan the page to collect vacuumable items and check for tuples + * requiring freezing. + */ + all_visible = true; + has_dead_tuples = false; + nfrozen = 0; + hastup = false; + prev_dead_count = vacrelstats->num_dead_tuples; + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Note: If you change anything in the loop below, also look at + * heap_page_is_all_visible to see if that needs to be changed. + */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* Unused items require no processing, but we count 'em */ + if (!ItemIdIsUsed(itemid)) + { + nunused += 1; + continue; + } + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting (at + * least in the common case where heap_page_prune() just freed up + * a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + all_visible = false; + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(onerel); + + tupgone = false; + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * VACUUM and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as + * VACUUM can't run inside a transaction block, which makes some + * cases impossible (e.g. in-progress insert from the same + * transaction). + */ + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as if + * it were RECENTLY_DEAD. Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a lot + * cheaper to get rid of it in the next pruning pass than + * to treat it like an indexed tuple. 
+ * + * If this were to happen for a tuple that actually needed + * to be deleted, we'd be in trouble, because it'd + * possibly leave a tuple below the relation's xmin + * horizon alive. heap_prepare_freeze_tuple() is prepared + * to detect that case and abort the transaction, + * preventing corruption. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + nkeep += 1; + else + tupgone = true; /* we can delete the tuple */ + all_visible = false; + break; + case HEAPTUPLE_LIVE: + /* + * Count it as live. Not only is this natural, but it's + * also what acquire_sample_rows() does. + */ + live_tuples += 1; + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check + * that the tuple is hinted xmin-committed because of + * that. + */ + if (all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old + * enough that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, visibility_cutoff_xid)) + visibility_cutoff_xid = xmin; + } + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must not remove it + * from relation. + */ + nkeep += 1; + all_visible = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * This is an expected case during concurrent vacuum. + * + * We do not count these rows as live, because we expect + * the inserting transaction to update the counters at + * commit, and we assume that will happen only after we + * report our results. This assumption is a bit shaky, + * but it is what acquire_sample_rows() does, so be + * consistent. + */ + all_visible = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + + /* + * Count such rows as live. As above, we assume the + * deleting transaction will commit and update the + * counters after we report. + */ + live_tuples += 1; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + if (tupgone) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, + &vacrelstats->latestRemovedXid); + tups_vacuumed += 1; + has_dead_tuples = true; + } + else + { + bool tuple_totally_frozen; + + num_tuples += 1; + hastup = true; + + /* + * Each non-removable tuple must be checked to see if it needs + * freezing. Note we already have exclusive buffer lock. + */ + if (heap_prepare_freeze_tuple(tuple.t_data, + relfrozenxid, relminmxid, + FreezeLimit, MultiXactCutoff, + &frozen[nfrozen], + &tuple_totally_frozen)) + frozen[nfrozen++].offset = offnum; + + if (!tuple_totally_frozen) + all_frozen = false; + } + } /* scan along page */ + + /* + * If we froze any tuples, mark the buffer dirty, and write a WAL + * record recording the changes. We must log the changes to be + * crash-safe against future truncation of CLOG. 
+ */ + if (nfrozen > 0) + { + START_CRIT_SECTION(); + + MarkBufferDirty(buf); + + /* execute collected freezes */ + for (i = 0; i < nfrozen; i++) + { + ItemId itemid; + HeapTupleHeader htup; + + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); + + heap_execute_freeze_tuple(htup, &frozen[i]); + } + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; + + recptr = log_heap_freeze(onerel, buf, FreezeLimit, + frozen, nfrozen); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + /* + * If there are no indexes then we can vacuum the page right now + * instead of doing a second scan. + */ + if (nindexes == 0 && + vacrelstats->num_dead_tuples > 0) + { + /* Remove tuples from heap */ + lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); + has_dead_tuples = false; + + /* + * Forget the now-vacuumed tuples, and press on, but be careful + * not to reset latestRemovedXid since we want that value to be + * valid. + */ + vacrelstats->num_dead_tuples = 0; + vacuumed_pages++; + + /* + * Periodically do incremental FSM vacuuming to make newly-freed + * space visible on upper FSM pages. Note: although we've cleaned + * the current block, we haven't yet updated its FSM entry (that + * happens further down), so passing end == blkno is correct. + */ + if (blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES) + { + FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, + blkno); + next_fsm_block_to_vacuum = blkno; + } + } + + freespace = PageGetHeapFreeSpace(page); + + /* mark page all-visible, if appropriate */ + if (all_visible && !all_visible_according_to_vm) + { + uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + + if (all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set the both bits + * so that we get back in sync. + * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid, flags); + } + + /* + * As of PostgreSQL 9.2, the visibility map bit should never be set if + * the page-level bit is clear. However, it's possible that the bit + * got cleared after we checked it and before we took the buffer + * content lock, so we must recheck before jumping to the conclusion + * that something bad has happened. + */ + else if (all_visible_according_to_vm && !PageIsAllVisible(page) + && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) + { + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + relname, blkno); + visibilitymap_clear(onerel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * It's possible for the value returned by GetOldestXmin() to move + * backwards, so it's not wrong for us to see tuples that appear to + * not be visible to everyone yet, while PD_ALL_VISIBLE is already + * set. 
The real safe xmin value never moves backwards, but + * GetOldestXmin() is conservative and sometimes returns a value + * that's unnecessarily small, so if we see that contradiction it just + * means that the tuples that we think are not visible to everyone yet + * actually are, and the PD_ALL_VISIBLE flag is correct. + * + * There should never be dead tuples on a page with PD_ALL_VISIBLE + * set, however. + */ + else if (PageIsAllVisible(page) && has_dead_tuples) + { + elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", + relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_clear(onerel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + + /* + * If the all-visible page is turned out to be all-frozen but not + * marked, we should so mark it. Note that all_frozen is only valid + * if all_visible is true, so we must check both. + */ + else if (all_visible_according_to_vm && all_visible && all_frozen && + !VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) + { + /* + * We can pass InvalidTransactionId as the cutoff XID here, + * because setting the all-frozen bit doesn't cause recovery + * conflicts. + */ + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_FROZEN); + } + + UnlockReleaseBuffer(buf); + + /* Remember the location of the last page with nonremovable tuples */ + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + + /* + * If we remembered any tuples for deletion, then the page will be + * visited again by lazy_vacuum_heap, which will compute and record + * its post-compaction free space. If not, then we're done with this + * page, so remember its free space as-is. (This path will always be + * taken if there are no indexes.) + */ + if (vacrelstats->num_dead_tuples == prev_dead_count) + RecordPageWithFreeSpace(onerel, blkno, freespace); + } + + /* report that everything is scanned and vacuumed */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + + pfree(frozen); + + /* save stats for use later */ + vacrelstats->tuples_deleted = tups_vacuumed; + vacrelstats->new_dead_tuples = nkeep; + + /* now we can compute the new value for pg_class.reltuples */ + vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel, + nblocks, + vacrelstats->tupcount_pages, + live_tuples); + + /* also compute total number of surviving heap entries */ + vacrelstats->new_rel_tuples = + vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples; + + /* + * Release any remaining pin on visibility map page. + */ + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + /* If any tuples need to be deleted, perform final vacuum cycle */ + /* XXX put a threshold on min number of tuples here? 
*/ + if (vacrelstats->num_dead_tuples > 0) + { + const int hvp_index[] = { + PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_NUM_INDEX_VACUUMS + }; + int64 hvp_val[2]; + + /* Log cleanup info before we touch indexes */ + vacuum_log_cleanup_info(onerel, vacrelstats); + + /* Report that we are now vacuuming indexes */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_INDEX); + + /* Remove index entries */ + for (i = 0; i < nindexes; i++) + lazy_vacuum_index(Irel[i], + &indstats[i], + vacrelstats); + + /* Report that we are now vacuuming the heap */ + hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP; + hvp_val[1] = vacrelstats->num_index_scans + 1; + pgstat_progress_update_multi_param(2, hvp_index, hvp_val); + + /* Remove tuples from heap */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_VACUUM_HEAP); + lazy_vacuum_heap(onerel, vacrelstats); + vacrelstats->num_index_scans++; + } + + /* + * Vacuum the remainder of the Free Space Map. We must do this whether or + * not there were indexes. + */ + if (blkno > next_fsm_block_to_vacuum) + FreeSpaceMapVacuumRange(onerel, next_fsm_block_to_vacuum, blkno); + + /* report all blocks vacuumed; and that we're cleaning up */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_INDEX_CLEANUP); + + /* Do post-vacuum cleanup and statistics update for each index */ + for (i = 0; i < nindexes; i++) + lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); + + /* If no indexes, make log report that lazy_vacuum_heap would've made */ + if (vacuumed_pages) + ereport(elevel, + (errmsg("\"%s\": removed %.0f row versions in %u pages", + RelationGetRelationName(onerel), + tups_vacuumed, vacuumed_pages))); + + /* + * This is pretty messy, but we split it up so that we can skip emitting + * individual parts of the message when not applicable. + */ + initStringInfo(&buf); + appendStringInfo(&buf, + _("%.0f dead row versions cannot be removed yet, oldest xmin: %u\n"), + nkeep, OldestXmin); + appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"), + nunused); + appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins, ", + "Skipped %u pages due to buffer pins, ", + vacrelstats->pinskipped_pages), + vacrelstats->pinskipped_pages); + appendStringInfo(&buf, ngettext("%u frozen page.\n", + "%u frozen pages.\n", + vacrelstats->frozenskipped_pages), + vacrelstats->frozenskipped_pages); + appendStringInfo(&buf, ngettext("%u page is entirely empty.\n", + "%u pages are entirely empty.\n", + empty_pages), + empty_pages); + appendStringInfo(&buf, _("%s."), pg_rusage_show(&ru0)); + + ereport(elevel, + (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", + RelationGetRelationName(onerel), + tups_vacuumed, num_tuples, + vacrelstats->scanned_pages, nblocks), + errdetail_internal("%s", buf.data))); + pfree(buf.data); +} + + +/* + * lazy_vacuum_heap() -- second pass over the heap + * + * This routine marks dead tuples as unused and compacts out free + * space on their pages. Pages not having dead tuples recorded from + * lazy_scan_heap are not visited at all. + * + * Note: the reason for doing this as a second pass is we cannot remove + * the tuples until we've removed their index entries, and we want to + * process index entry removal in batches as large as possible. 
+ */ +static void +lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) +{ + int tupindex; + int npages; + PGRUsage ru0; + Buffer vmbuffer = InvalidBuffer; + + pg_rusage_init(&ru0); + npages = 0; + + tupindex = 0; + while (tupindex < vacrelstats->num_dead_tuples) + { + BlockNumber tblk; + Buffer buf; + Page page; + Size freespace; + + vacuum_delay_point(); + + tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); + buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, + vac_strategy); + if (!ConditionalLockBufferForCleanup(buf)) + { + ReleaseBuffer(buf); + ++tupindex; + continue; + } + tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats, + &vmbuffer); + + /* Now that we've compacted the page, record its available space */ + page = BufferGetPage(buf); + freespace = PageGetHeapFreeSpace(page); + + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, tblk, freespace); + npages++; + } + + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } + + ereport(elevel, + (errmsg("\"%s\": removed %d row versions in %d pages", + RelationGetRelationName(onerel), + tupindex, npages), + errdetail_internal("%s", pg_rusage_show(&ru0)))); +} + +/* + * lazy_vacuum_page() -- free dead tuples on a page + * and repair its fragmentation. + * + * Caller must hold pin and buffer cleanup lock on the buffer. + * + * tupindex is the index in vacrelstats->dead_tuples of the first dead + * tuple for this page. We assume the rest follow sequentially. + * The return value is the first tupindex after the tuples of this page. + */ +static int +lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, + int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer) +{ + Page page = BufferGetPage(buffer); + OffsetNumber unused[MaxOffsetNumber]; + int uncnt = 0; + TransactionId visibility_cutoff_xid; + bool all_frozen; + + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + + START_CRIT_SECTION(); + + for (; tupindex < vacrelstats->num_dead_tuples; tupindex++) + { + BlockNumber tblk; + OffsetNumber toff; + ItemId itemid; + + tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); + if (tblk != blkno) + break; /* past end of tuples for this block */ + toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); + itemid = PageGetItemId(page, toff); + ItemIdSetUnused(itemid); + unused[uncnt++] = toff; + } + + PageRepairFragmentation(page); + + /* + * Mark buffer dirty before we write WAL. + */ + MarkBufferDirty(buffer); + + /* XLOG stuff */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; + + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + unused, uncnt, + vacrelstats->latestRemovedXid); + PageSetLSN(page, recptr); + } + + /* + * End critical section, so we safely can do visibility tests (which + * possibly need to perform IO and allocate memory!). If we crash now the + * page (including the corresponding vm bit) might not be marked all + * visible, but that's fine. A later vacuum will fix that. + */ + END_CRIT_SECTION(); + + /* + * Now that we have removed the dead tuples from the page, once again + * check if the page has become all-visible. The page is already marked + * dirty, exclusively locked, and, if needed, a full page image has been + * emitted in the log_heap_clean() above. + */ + if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid, + &all_frozen)) + PageSetAllVisible(page); + + /* + * All the changes to the heap page have been done. 
If the all-visible + * flag is now set, also set the VM all-visible bit (and, if possible, the + * all-frozen bit) unless this has already been done previously. + */ + if (PageIsAllVisible(page)) + { + uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer); + uint8 flags = 0; + + /* Set the VM all-frozen bit to flag, if needed */ + if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0) + flags |= VISIBILITYMAP_ALL_VISIBLE; + if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + Assert(BufferIsValid(*vmbuffer)); + if (flags != 0) + visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, + *vmbuffer, visibility_cutoff_xid, flags); + } + + return tupindex; +} + +/* + * lazy_check_needs_freeze() -- scan page to see if any tuples + * need to be cleaned to avoid wraparound + * + * Returns true if the page needs to be vacuumed using cleanup lock. + * Also returns a flag indicating whether page contains any tuples at all. + */ +static bool +lazy_check_needs_freeze(Buffer buf, bool *hastup) +{ + Page page = BufferGetPage(buf); + OffsetNumber offnum, + maxoff; + HeapTupleHeader tupleheader; + + *hastup = false; + + /* If we hit an uninitialized page, we want to force vacuuming it. */ + if (PageIsNew(page)) + return true; + + /* Quick out for ordinary empty page. */ + if (PageIsEmpty(page)) + return false; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* this should match hastup test in count_nondeletable_pages() */ + if (ItemIdIsUsed(itemid)) + *hastup = true; + + /* dead and redirect items never need freezing */ + if (!ItemIdIsNormal(itemid)) + continue; + + tupleheader = (HeapTupleHeader) PageGetItem(page, itemid); + + if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, + MultiXactCutoff, buf)) + return true; + } /* scan along page */ + + return false; +} + + +/* + * lazy_vacuum_index() -- vacuum one index relation. + * + * Delete all the index entries pointing to tuples listed in + * vacrelstats->dead_tuples, and update running statistics. + */ +static void +lazy_vacuum_index(Relation indrel, + IndexBulkDeleteResult **stats, + LVRelStats *vacrelstats) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.estimated_count = true; + ivinfo.message_level = elevel; + /* We can only provide an approximate value of num_heap_tuples here */ + ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; + ivinfo.strategy = vac_strategy; + + /* Do bulk deletion */ + *stats = index_bulk_delete(&ivinfo, *stats, + lazy_tid_reaped, (void *) vacrelstats); + + ereport(elevel, + (errmsg("scanned index \"%s\" to remove %d row versions", + RelationGetRelationName(indrel), + vacrelstats->num_dead_tuples), + errdetail_internal("%s", pg_rusage_show(&ru0)))); +} + +/* + * lazy_cleanup_index() -- do post-vacuum cleanup for one index relation. 
+ */ +static void +lazy_cleanup_index(Relation indrel, + IndexBulkDeleteResult *stats, + LVRelStats *vacrelstats) +{ + IndexVacuumInfo ivinfo; + PGRUsage ru0; + + pg_rusage_init(&ru0); + + ivinfo.index = indrel; + ivinfo.analyze_only = false; + ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages); + ivinfo.message_level = elevel; + + /* + * Now we can provide a better estimate of total number of surviving + * tuples (we assume indexes are more interested in that than in the + * number of nominally live tuples). + */ + ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; + ivinfo.strategy = vac_strategy; + + stats = index_vacuum_cleanup(&ivinfo, stats); + + if (!stats) + return; + + /* + * Now update statistics in pg_class, but only if the index says the count + * is accurate. + */ + if (!stats->estimated_count) + vac_update_relstats(indrel, + stats->num_pages, + stats->num_index_tuples, + 0, + false, + InvalidTransactionId, + InvalidMultiXactId, + false); + + ereport(elevel, + (errmsg("index \"%s\" now contains %.0f row versions in %u pages", + RelationGetRelationName(indrel), + stats->num_index_tuples, + stats->num_pages), + errdetail("%.0f index row versions were removed.\n" + "%u index pages have been deleted, %u are currently reusable.\n" + "%s.", + stats->tuples_removed, + stats->pages_deleted, stats->pages_free, + pg_rusage_show(&ru0)))); + + pfree(stats); +} + +/* + * should_attempt_truncation - should we attempt to truncate the heap? + * + * Don't even think about it unless we have a shot at releasing a goodly + * number of pages. Otherwise, the time taken isn't worth it. + * + * Also don't attempt it if we are doing early pruning/vacuuming, because a + * scan which cannot find a truncated heap page cannot determine that the + * snapshot is too old to read that page. We might be able to get away with + * truncating all except one of the pages, setting its LSN to (at least) the + * maximum of the truncated range if we also treated an index leaf tuple + * pointing to a missing heap page as something to trigger the "snapshot too + * old" error, but that seems fragile and seems like it deserves its own patch + * if we consider it. + * + * This is split out so that we can test whether truncation is going to be + * called for before we actually do it. If you change the logic here, be + * careful to depend only on fields that lazy_scan_heap updates on-the-fly. + */ +static bool +should_attempt_truncation(LVRelStats *vacrelstats) +{ + BlockNumber possibly_freeable; + + possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; + if (possibly_freeable > 0 && + (possibly_freeable >= REL_TRUNCATE_MINIMUM || + possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) && + old_snapshot_threshold < 0) + return true; + else + return false; +} + +/* + * lazy_truncate_heap - try to truncate off any empty pages at the end + */ +static void +lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) +{ + BlockNumber old_rel_pages = vacrelstats->rel_pages; + BlockNumber new_rel_pages; + PGRUsage ru0; + int lock_retry; + + pg_rusage_init(&ru0); + + /* Report that we are now truncating */ + pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, + PROGRESS_VACUUM_PHASE_TRUNCATE); + + /* + * Loop until no more truncating can be done. + */ + do + { + /* + * We need full exclusive lock on the relation in order to do + * truncation. 
If we can't get it, give up rather than waiting --- we + * don't want to block other backends, and we don't want to deadlock + * (which is quite possible considering we already hold a lower-grade + * lock). + */ + vacrelstats->lock_waiter_detected = false; + lock_retry = 0; + while (true) + { + if (ConditionalLockRelation(onerel, AccessExclusiveLock)) + break; + + /* + * Check for interrupts while trying to (re-)acquire the exclusive + * lock. + */ + CHECK_FOR_INTERRUPTS(); + + if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / + VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) + { + /* + * We failed to establish the lock in the specified number of + * retries. This means we give up truncating. + */ + vacrelstats->lock_waiter_detected = true; + ereport(elevel, + (errmsg("\"%s\": stopping truncate due to conflicting lock request", + RelationGetRelationName(onerel)))); + return; + } + + pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L); + } + + /* + * Now that we have exclusive lock, look to see if the rel has grown + * whilst we were vacuuming with non-exclusive lock. If so, give up; + * the newly added pages presumably contain non-deletable tuples. + */ + new_rel_pages = RelationGetNumberOfBlocks(onerel); + if (new_rel_pages != old_rel_pages) + { + /* + * Note: we intentionally don't update vacrelstats->rel_pages with + * the new rel size here. If we did, it would amount to assuming + * that the new pages are empty, which is unlikely. Leaving the + * numbers alone amounts to assuming that the new pages have the + * same tuple density as existing ones, which is less unlikely. + */ + UnlockRelation(onerel, AccessExclusiveLock); + return; + } + + /* + * Scan backwards from the end to verify that the end pages actually + * contain no tuples. This is *necessary*, not optional, because + * other backends could have added tuples to these pages whilst we + * were vacuuming. + */ + new_rel_pages = count_nondeletable_pages(onerel, vacrelstats); + + if (new_rel_pages >= old_rel_pages) + { + /* can't do anything after all */ + UnlockRelation(onerel, AccessExclusiveLock); + return; + } + + /* + * Okay to truncate. + */ + RelationTruncate(onerel, new_rel_pages); + + /* + * We can release the exclusive lock as soon as we have truncated. + * Other backends can't safely access the relation until they have + * processed the smgr invalidation that smgrtruncate sent out ... but + * that should happen as part of standard invalidation processing once + * they acquire lock on the relation. + */ + UnlockRelation(onerel, AccessExclusiveLock); + + /* + * Update statistics. Here, it *is* correct to adjust rel_pages + * without also touching reltuples, since the tuple count wasn't + * changed by the truncation. + */ + vacrelstats->pages_removed += old_rel_pages - new_rel_pages; + vacrelstats->rel_pages = new_rel_pages; + + ereport(elevel, + (errmsg("\"%s\": truncated %u to %u pages", + RelationGetRelationName(onerel), + old_rel_pages, new_rel_pages), + errdetail_internal("%s", + pg_rusage_show(&ru0)))); + old_rel_pages = new_rel_pages; + } while (new_rel_pages > vacrelstats->nonempty_pages && + vacrelstats->lock_waiter_detected); +} + +/* + * Rescan end pages to verify that they are (still) empty of tuples. + * + * Returns number of nondeletable pages (last nonempty page + 1). 
+ */ +static BlockNumber +count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) +{ + BlockNumber blkno; + BlockNumber prefetchedUntil; + instr_time starttime; + + /* Initialize the starttime if we check for conflicting lock requests */ + INSTR_TIME_SET_CURRENT(starttime); + + /* + * Start checking blocks at what we believe relation end to be and move + * backwards. (Strange coding of loop control is needed because blkno is + * unsigned.) To make the scan faster, we prefetch a few blocks at a time + * in forward direction, so that OS-level readahead can kick in. + */ + blkno = vacrelstats->rel_pages; + StaticAssertStmt((PREFETCH_SIZE & (PREFETCH_SIZE - 1)) == 0, + "prefetch size must be power of 2"); + prefetchedUntil = InvalidBlockNumber; + while (blkno > vacrelstats->nonempty_pages) + { + Buffer buf; + Page page; + OffsetNumber offnum, + maxoff; + bool hastup; + + /* + * Check if another process requests a lock on our relation. We are + * holding an AccessExclusiveLock here, so they will be waiting. We + * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we + * only check if that interval has elapsed once every 32 blocks to + * keep the number of system calls and actual shared lock table + * lookups to a minimum. + */ + if ((blkno % 32) == 0) + { + instr_time currenttime; + instr_time elapsed; + + INSTR_TIME_SET_CURRENT(currenttime); + elapsed = currenttime; + INSTR_TIME_SUBTRACT(elapsed, starttime); + if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) + >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) + { + if (LockHasWaitersRelation(onerel, AccessExclusiveLock)) + { + ereport(elevel, + (errmsg("\"%s\": suspending truncate due to conflicting lock request", + RelationGetRelationName(onerel)))); + + vacrelstats->lock_waiter_detected = true; + return blkno; + } + starttime = currenttime; + } + } + + /* + * We don't insert a vacuum delay point here, because we have an + * exclusive lock on the table which we want to hold for as short a + * time as possible. We still need to check for interrupts however. + */ + CHECK_FOR_INTERRUPTS(); + + blkno--; + + /* If we haven't prefetched this lot yet, do so now. */ + if (prefetchedUntil > blkno) + { + BlockNumber prefetchStart; + BlockNumber pblkno; + + prefetchStart = blkno & ~(PREFETCH_SIZE - 1); + for (pblkno = prefetchStart; pblkno <= blkno; pblkno++) + { + PrefetchBuffer(onerel, MAIN_FORKNUM, pblkno); + CHECK_FOR_INTERRUPTS(); + } + prefetchedUntil = prefetchStart; + } + + buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, + RBM_NORMAL, vac_strategy); + + /* In this phase we only need shared access to the buffer */ + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + + if (PageIsNew(page) || PageIsEmpty(page)) + { + /* PageIsNew probably shouldn't happen... */ + UnlockReleaseBuffer(buf); + continue; + } + + hastup = false; + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* + * Note: any non-unused item should be taken as a reason to keep + * this page. We formerly thought that DEAD tuples could be + * thrown away, but that's not so, because we'd not have cleaned + * out their index entries. 
+ */ + if (ItemIdIsUsed(itemid)) + { + hastup = true; + break; /* can stop scanning */ + } + } /* scan along page */ + + UnlockReleaseBuffer(buf); + + /* Done scanning if we found a tuple here */ + if (hastup) + return blkno + 1; + } + + /* + * If we fall out of the loop, all the previously-thought-to-be-empty + * pages still are; we need not bother to look at the last known-nonempty + * page. + */ + return vacrelstats->nonempty_pages; +} + +/* + * lazy_space_alloc - space allocation decisions for lazy vacuum + * + * See the comments at the head of this file for rationale. + */ +static void +lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks) +{ + long maxtuples; + int vac_work_mem = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; + + if (vacrelstats->hasindex) + { + maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData); + maxtuples = Min(maxtuples, INT_MAX); + maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData)); + + /* curious coding here to ensure the multiplication can't overflow */ + if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks) + maxtuples = relblocks * LAZY_ALLOC_TUPLES; + + /* stay sane if small maintenance_work_mem */ + maxtuples = Max(maxtuples, MaxHeapTuplesPerPage); + } + else + { + maxtuples = MaxHeapTuplesPerPage; + } + + vacrelstats->num_dead_tuples = 0; + vacrelstats->max_dead_tuples = (int) maxtuples; + vacrelstats->dead_tuples = (ItemPointer) + palloc(maxtuples * sizeof(ItemPointerData)); +} + +/* + * lazy_record_dead_tuple - remember one deletable tuple + */ +static void +lazy_record_dead_tuple(LVRelStats *vacrelstats, + ItemPointer itemptr) +{ + /* + * The array shouldn't overflow under normal behavior, but perhaps it + * could if we are given a really small maintenance_work_mem. In that + * case, just forget the last few tuples (we'll get 'em next time). + */ + if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) + { + vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr; + vacrelstats->num_dead_tuples++; + pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, + vacrelstats->num_dead_tuples); + } +} + +/* + * lazy_tid_reaped() -- is a particular tid deletable? + * + * This has the right signature to be an IndexBulkDeleteCallback. + * + * Assumes dead_tuples array is in sorted order. + */ +static bool +lazy_tid_reaped(ItemPointer itemptr, void *state) +{ + LVRelStats *vacrelstats = (LVRelStats *) state; + ItemPointer res; + + res = (ItemPointer) bsearch((void *) itemptr, + (void *) vacrelstats->dead_tuples, + vacrelstats->num_dead_tuples, + sizeof(ItemPointerData), + vac_cmp_itemptr); + + return (res != NULL); +} + +/* + * Comparator routines for use with qsort() and bsearch(). + */ +static int +vac_cmp_itemptr(const void *left, const void *right) +{ + BlockNumber lblk, + rblk; + OffsetNumber loff, + roff; + + lblk = ItemPointerGetBlockNumber((ItemPointer) left); + rblk = ItemPointerGetBlockNumber((ItemPointer) right); + + if (lblk < rblk) + return -1; + if (lblk > rblk) + return 1; + + loff = ItemPointerGetOffsetNumber((ItemPointer) left); + roff = ItemPointerGetOffsetNumber((ItemPointer) right); + + if (loff < roff) + return -1; + if (loff > roff) + return 1; + + return 0; +} + +/* + * Check if every tuple in the given page is visible to all current and future + * transactions. Also return the visibility_cutoff_xid which is the highest + * xmin amongst the visible tuples. 
Set *all_frozen to true if every tuple + * on this page is frozen. + */ +static bool +heap_page_is_all_visible(Relation rel, Buffer buf, + TransactionId *visibility_cutoff_xid, + bool *all_frozen) +{ + Page page = BufferGetPage(buf); + BlockNumber blockno = BufferGetBlockNumber(buf); + OffsetNumber offnum, + maxoff; + bool all_visible = true; + + *visibility_cutoff_xid = InvalidTransactionId; + *all_frozen = true; + + /* + * This is a stripped down version of the line pointer scan in + * lazy_scan_heap(). So if you change anything here, also check that code. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff && all_visible; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + itemid = PageGetItemId(page, offnum); + + /* Unused or redirect line pointers are of no interest */ + if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) + continue; + + ItemPointerSet(&(tuple.t_self), blockno, offnum); + + /* + * Dead line pointers can have index pointers pointing to them. So + * they can't be treated as visible + */ + if (ItemIdIsDead(itemid)) + { + all_visible = false; + *all_frozen = false; + break; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_LIVE: + { + TransactionId xmin; + + /* Check comments in lazy_scan_heap. */ + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + *all_frozen = false; + break; + } + + /* + * The inserter definitely committed. But is it old enough + * that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + *all_frozen = false; + break; + } + + /* Track newest xmin on page. */ + if (TransactionIdFollows(xmin, *visibility_cutoff_xid)) + *visibility_cutoff_xid = xmin; + + /* Check whether this tuple is already frozen or not */ + if (all_visible && *all_frozen && + heap_tuple_needs_eventual_freeze(tuple.t_data)) + *all_frozen = false; + } + break; + + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + case HEAPTUPLE_DELETE_IN_PROGRESS: + { + all_visible = false; + *all_frozen = false; + break; + } + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } /* scan along page */ + + return all_visible; +} |
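
The visibility-map handling in lazy_vacuum_page() is narrower than it may look: only the bits that are not already recorded for the page get set, and the all-frozen bit is added only when the page actually qualifies. A minimal standalone sketch of that decision, using hypothetical demo flag values rather than the real constants from visibilitymap.h:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical bit values for the demo; the real flags live in visibilitymap.h. */
#define DEMO_VM_ALL_VISIBLE 0x01
#define DEMO_VM_ALL_FROZEN  0x02

/*
 * Decide which visibility-map bits still need to be set for a page that has
 * just been found all-visible, given the bits already recorded for it.
 * Mirrors the flag computation in lazy_vacuum_page().
 */
static uint8_t
demo_vm_flags_to_set(uint8_t current_status, bool all_frozen)
{
    uint8_t flags = 0;

    if ((current_status & DEMO_VM_ALL_VISIBLE) == 0)
        flags |= DEMO_VM_ALL_VISIBLE;
    if ((current_status & DEMO_VM_ALL_FROZEN) == 0 && all_frozen)
        flags |= DEMO_VM_ALL_FROZEN;

    return flags;
}

int
main(void)
{
    /* Nothing recorded yet, page fully frozen: both bits get set in one call. */
    printf("0x%02x\n", demo_vm_flags_to_set(0, true));
    /* Already all-visible, not frozen: nothing left to do. */
    printf("0x%02x\n", demo_vm_flags_to_set(DEMO_VM_ALL_VISIBLE, false));
    return 0;
}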
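
lazy_truncate_heap() bounds how long it will poll for the AccessExclusiveLock: it retries ConditionalLockRelation() up to VACUUM_TRUNCATE_LOCK_TIMEOUT / VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL times, sleeping one interval between attempts, and gives up rather than block other backends. A compilable sketch of that retry budget, with assumed millisecond values standing in for the real #defines (which are set near the top of this file and not reproduced here) and a stub that always reports the lock as busy:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Assumed timings for the demo; the real values are #defined at the top of
 * vacuumlazy.c and may differ. */
#define DEMO_LOCK_TIMEOUT_MS        5000
#define DEMO_LOCK_WAIT_INTERVAL_MS  50

/* Stand-in for ConditionalLockRelation(); always "busy" in this demo. */
static bool
demo_try_lock(void)
{
    return false;
}

/*
 * Retry skeleton used by lazy_truncate_heap(): poll for the exclusive lock,
 * sleeping between attempts, and give up once the retry budget is spent so
 * other backends are never blocked for long.
 */
static bool
demo_acquire_with_budget(void)
{
    int lock_retry = 0;

    for (;;)
    {
        if (demo_try_lock())
            return true;

        if (++lock_retry > DEMO_LOCK_TIMEOUT_MS / DEMO_LOCK_WAIT_INTERVAL_MS)
        {
            printf("giving up after %d attempts\n", lock_retry - 1);
            return false;
        }

        usleep(DEMO_LOCK_WAIT_INTERVAL_MS * 1000L);
    }
}

int
main(void)
{
    demo_acquire_with_budget();
    return 0;
}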
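
count_nondeletable_pages() walks the heap backwards but prefetches forwards, in aligned windows, so OS-level readahead still helps; the StaticAssertStmt guarantees the window size is a power of two, which is what lets blkno & ~(PREFETCH_SIZE - 1) round down to the start of the current window. A small sketch of that alignment arithmetic with an assumed 32-block window:

#include <stdio.h>
#include <stdint.h>

#define DEMO_PREFETCH_SIZE 32   /* assumed window; must be a power of two */

/*
 * Round a block number down to the start of its prefetch window, as
 * count_nondeletable_pages() does before issuing forward-order prefetches.
 * The mask trick only works because the window size is a power of two.
 */
static uint32_t
demo_prefetch_start(uint32_t blkno)
{
    return blkno & ~(uint32_t) (DEMO_PREFETCH_SIZE - 1);
}

int
main(void)
{
    uint32_t blkno = 1000;      /* current block of the backward scan */
    uint32_t start = demo_prefetch_start(blkno);
    uint32_t pblkno;

    /* Prefetch the window in ascending order so OS readahead can kick in,
     * even though the scan itself visits blocks in descending order. */
    for (pblkno = start; pblkno <= blkno; pblkno++)
        printf("prefetch block %u\n", pblkno);

    return 0;
}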
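
lazy_space_alloc() turns the memory budget into a TID count and then clamps it three ways: to INT_MAX, to the largest array the allocator will hand out, and to what the table could possibly need, with a floor of one page's worth of tuples so a tiny maintenance_work_mem still works. A sketch of the same arithmetic; the TID size, allocation cap, and per-page tuple count below are illustrative stand-ins, not the values used by the file:

#include <stdio.h>
#include <limits.h>

#define DEMO_TID_SIZE           6           /* bytes per stored TID; assumed for the demo */
#define DEMO_MAX_ALLOC          0x3fffffff  /* stand-in for the allocator's size cap */
#define DEMO_TUPLES_PER_BLOCK   291         /* illustrative "dead tuples per page" figure */

/*
 * Given a memory budget in kilobytes and the relation size in blocks,
 * compute how many dead-tuple TIDs to track at once, mirroring the
 * clamping order used by lazy_space_alloc() for a table with indexes.
 */
static long
demo_max_dead_tuples(long work_mem_kb, long relblocks)
{
    long maxtuples = (work_mem_kb * 1024L) / DEMO_TID_SIZE;

    if (maxtuples > INT_MAX)
        maxtuples = INT_MAX;
    if (maxtuples > DEMO_MAX_ALLOC / DEMO_TID_SIZE)
        maxtuples = DEMO_MAX_ALLOC / DEMO_TID_SIZE;

    /* Never allocate more slots than the table could conceivably need;
     * dividing first keeps the comparison free of overflow. */
    if (maxtuples / DEMO_TUPLES_PER_BLOCK > relblocks)
        maxtuples = relblocks * DEMO_TUPLES_PER_BLOCK;

    /* Stay sane with a very small budget: always fit one full page. */
    if (maxtuples < DEMO_TUPLES_PER_BLOCK)
        maxtuples = DEMO_TUPLES_PER_BLOCK;

    return maxtuples;
}

int
main(void)
{
    printf("64MB budget, 10 blocks -> %ld slots\n", demo_max_dead_tuples(65536, 10));
    printf("1MB budget, 1e6 blocks -> %ld slots\n", demo_max_dead_tuples(1024, 1000000));
    return 0;
}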
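
lazy_tid_reaped() can answer each index-callback probe with a plain bsearch() because dead TIDs are recorded in block/offset order during the heap scan, and vac_cmp_itemptr() compares in exactly that order. A self-contained sketch of the pattern, using a simplified two-field TID struct instead of ItemPointerData's packed representation; the qsort() call only re-establishes the ordering invariant that the heap scan provides for free:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

/* Simplified stand-in for a heap TID: block number plus line pointer offset. */
typedef struct
{
    uint32_t    block;
    uint16_t    offset;
} DemoTid;

/* Order by block number first, then by offset, as vac_cmp_itemptr() does. */
static int
demo_cmp_tid(const void *left, const void *right)
{
    const DemoTid *l = (const DemoTid *) left;
    const DemoTid *r = (const DemoTid *) right;

    if (l->block != r->block)
        return (l->block < r->block) ? -1 : 1;
    if (l->offset != r->offset)
        return (l->offset < r->offset) ? -1 : 1;
    return 0;
}

/* Equivalent of the lazy_tid_reaped() membership test over a sorted array. */
static bool
demo_tid_reaped(const DemoTid *tid, const DemoTid *dead, size_t ndead)
{
    return bsearch(tid, dead, ndead, sizeof(DemoTid), demo_cmp_tid) != NULL;
}

int
main(void)
{
    DemoTid dead[] = {{7, 2}, {3, 5}, {7, 1}, {12, 4}};
    size_t  ndead = sizeof(dead) / sizeof(dead[0]);
    DemoTid probe = {7, 2};

    /* Establish the sorted-order invariant before probing. */
    qsort(dead, ndead, sizeof(DemoTid), demo_cmp_tid);

    printf("(7,2) reaped? %s\n",
           demo_tid_reaped(&probe, dead, ndead) ? "yes" : "no");
    return 0;
}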