diff options
author | Teodor Sigaev <teodor@sigaev.ru> | 2018-04-04 19:29:00 +0300 |
---|---|---|
committer | Teodor Sigaev <teodor@sigaev.ru> | 2018-04-04 19:29:00 +0300 |
commit | 857f9c36cda520030381bd8c2af20adf0ce0e1d4 (patch) | |
tree | 3d896351d041c5745111e5ae5dc2c11177dfd31c /src | |
parent | eac93e20afe434a79e81558c17a7a1408cf9d74a (diff) | |
download | postgresql-857f9c36cda520030381bd8c2af20adf0ce0e1d4.tar.gz postgresql-857f9c36cda520030381bd8c2af20adf0ce0e1d4.zip |
Skip full index scan during cleanup of B-tree indexes when possible
Vacuum of an index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, autovacuum isn't intended to touch this table. However,
a user may run vacuum manually in order to fill the visibility map and get the
benefits of index-only scans. Then ambulkdelete wouldn't be called for indexes
of such a table (because no heap tuples were deleted); only amvacuumcleanup would
be called. In this case, amvacuumcleanup would perform a full index scan for
two objectives: put recyclable pages into the free space map and update index
statistics.
This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and index
statistics aren't stale. In order to check the first condition, we store
the oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin,
there are some recyclable pages. In order to check the second condition, we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then the statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be set both as a reloption and as a GUC (the GUC value is the default used when the reloption is not set).
This patch bumps the B-tree meta-page version. The upgrade of the meta-page is
performed "on the fly": during VACUUM the meta-page is rewritten with the new
version. No special handling in pg_upgrade is required.
Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/access/common/reloptions.c | 13 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c | 12 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c | 150 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 118 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtxlog.c | 6 | ||||
-rw-r--r-- | src/backend/utils/init/globals.c | 2 | ||||
-rw-r--r-- | src/backend/utils/misc/guc.c | 10 | ||||
-rw-r--r-- | src/include/access/nbtree.h | 11 | ||||
-rw-r--r-- | src/include/access/nbtxlog.h | 4 | ||||
-rw-r--r-- | src/include/miscadmin.h | 2 | ||||
-rw-r--r-- | src/include/utils/rel.h | 2 | ||||
-rw-r--r-- | src/test/regress/expected/btree_index.out | 29 | ||||
-rw-r--r-- | src/test/regress/sql/btree_index.sql | 19 |
13 files changed, 358 insertions, 20 deletions
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 35c09987adb..69ab2f101c7 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -409,6 +409,15 @@ static relopt_real realRelOpts[] = }, 0, -1.0, DBL_MAX }, + { + { + "vacuum_cleanup_index_scale_factor", + "Number of tuple inserts prior to index cleanup as a fraction of reltuples.", + RELOPT_KIND_BTREE, + ShareUpdateExclusiveLock + }, + -1, 0.0, 100.0 + }, /* list terminator */ {{NULL}} }; @@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind) {"user_catalog_table", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, user_catalog_table)}, {"parallel_workers", RELOPT_TYPE_INT, - offsetof(StdRdOptions, parallel_workers)} + offsetof(StdRdOptions, parallel_workers)}, + {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, + offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)} }; options = parseRelOptions(reloptions, validate, kind, &numoptions); diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 40111990c5e..fd7360278db 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel, if (BufferIsValid(metabuf)) { + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); metad->btm_fastroot = itup_blkno; metad->btm_fastlevel = lpageop->btpo.level; MarkBufferDirty(metabuf); @@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel, xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_heap_tuples = + metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata)); @@ -2049,6 +2055,10 
@@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + /* * Create downlink item for left page (old root). Since this will be the * first item in a non-leaf page, it implicitly has minus-infinity key @@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) md.level = metad->btm_level; md.fastroot = rootblknum; md.fastlevel = metad->btm_level; + md.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 92afe2de383..505a67e6ed2 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) metad->btm_level = level; metad->btm_fastroot = rootbknum; metad->btm_fastlevel = level; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); metaopaque->btpo_flags = BTP_META; @@ -74,6 +76,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level) } /* + * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new. + * + * This routine does purely in-memory image upgrade. Caller is + * responsible for locking, WAL-logging etc. 
+ */ +void +_bt_upgrademetapage(Page page) +{ + BTMetaPageData *metad; + BTPageOpaque metaopaque; + + metad = BTPageGetMeta(page); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* It must be really a meta page of upgradable version */ + Assert(metaopaque->btpo_flags & BTP_META); + Assert(metad->btm_version < BTREE_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + + /* Set version number and fill extra fields added into version 3 */ + metad->btm_version = BTREE_VERSION; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; + + /* Adjust pd_lower (see _bt_initmetapage() for details) */ + ((PageHeader) page)->pd_lower = + ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; +} + +/* + * _bt_update_meta_cleanup_info() -- Update cleanup-related information in + * the metapage. + * + * This routine checks if provided cleanup-related information is matching + * to those written in the metapage. On mismatch, metapage is overritten. 
+ */ +void +_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact, + float8 numHeapTuples) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + bool needsRewrite = false; + XLogRecPtr recptr; + + /* read the metapage and check if it needs rewrite */ + metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + /* outdated version of metapage always needs rewrite */ + if (metad->btm_version < BTREE_VERSION) + needsRewrite = true; + else if (metad->btm_oldest_btpo_xact != oldestBtpoXact || + metad->btm_last_cleanup_num_heap_tuples != numHeapTuples) + needsRewrite = true; + + if (!needsRewrite) + { + _bt_relbuf(rel, metabuf); + return; + } + + /* trade in our read lock for a write lock */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(metabuf, BT_WRITE); + + START_CRIT_SECTION(); + + /* upgrade meta-page if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + + /* update cleanup-related infromation */ + metad->btm_oldest_btpo_xact = oldestBtpoXact; + metad->btm_last_cleanup_num_heap_tuples = numHeapTuples; + MarkBufferDirty(metabuf); + + /* write wal record if needed */ + if (RelationNeedsWAL(rel)) + { + xl_btree_metadata md; + + XLogBeginInsert(); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); + + md.root = metad->btm_root; + md.level = metad->btm_level; + md.fastroot = metad->btm_fastroot; + md.fastlevel = metad->btm_fastlevel; + md.oldest_btpo_xact = oldestBtpoXact; + md.last_cleanup_num_heap_tuples = numHeapTuples; + + XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); + + PageSetLSN(metapg, recptr); + } + + END_CRIT_SECTION(); + _bt_relbuf(rel, metabuf); +} + +/* * _bt_getroot() -- Get the root page of the btree. 
* * Since the root page can move around the btree file, we have to read @@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access) metad = (BTMetaPageData *) rel->rd_amcache; /* We shouldn't have cached it if any of these fail */ Assert(metad->btm_magic == BTREE_MAGIC); - Assert(metad->btm_version == BTREE_VERSION); + Assert(metad->btm_version >= BTREE_MIN_VERSION); + Assert(metad->btm_version <= BTREE_VERSION); Assert(metad->btm_root != P_NONE); rootblkno = metad->btm_fastroot; @@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* if no root page initialized yet, do it */ if (metad->btm_root == P_NONE) @@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access) LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(metabuf, BT_WRITE); + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); + /* * Race condition: if someone else initialized the metadata between * the time we released the read lock and acquired the write lock, we @@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access) metad->btm_level = 0; metad->btm_fastroot = rootblkno; metad->btm_fastlevel = 0; + metad->btm_oldest_btpo_xact = InvalidTransactionId; + metad->btm_last_cleanup_num_heap_tuples = -1.0; MarkBufferDirty(rootbuf); MarkBufferDirty(metabuf); @@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access) md.level = 0; md.fastroot = rootblkno; md.fastlevel = 0; + 
md.oldest_btpo_xact = InvalidTransactionId; + md.last_cleanup_num_heap_tuples = -1.0; XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata)); @@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* if no root page initialized yet, fail */ if (metad->btm_root == P_NONE) @@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel) errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); - if (metad->btm_version != BTREE_VERSION) + if (metad->btm_version < BTREE_MIN_VERSION || + metad->btm_version > BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("version mismatch in index \"%s\": file version %d, code version %d", + errmsg("version mismatch in index \"%s\": file version %d, " + "current version %d, minimal supported version %d", RelationGetRelationName(rel), - metad->btm_version, BTREE_VERSION))); + metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); /* * If there's no root page yet, _bt_getroot() doesn't expect a cache @@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { + /* upgrade metapage if needed */ + if (metad->btm_version < BTREE_VERSION) + _bt_upgrademetapage(metapg); metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); @@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer 
leafbuf, bool *rightsib_empty) xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; + xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact; + xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples; XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata)); xlinfo = XLOG_BTREE_UNLINK_PAGE_META; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6fca8e358fe..06badc90ba1 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -19,11 +19,14 @@ #include "postgres.h" #include "access/nbtree.h" +#include "access/nbtxlog.h" #include "access/relscan.h" #include "access/xlog.h" #include "commands/vacuum.h" +#include "miscadmin.h" #include "nodes/execnodes.h" #include "pgstat.h" +#include "postmaster/autovacuum.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" @@ -45,6 +48,7 @@ typedef struct BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */ BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ BlockNumber totFreePages; /* true total # of free pages */ + TransactionId oldestBtpoXact; MemoryContext pagedelcontext; } BTVacState; @@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc; static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, - BTCycleId cycleid); + BTCycleId cycleid, TransactionId *oldestBtpoXact); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); @@ -774,6 +778,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) } /* + * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that + * btbulkdelete() wasn't called. 
+ */ +static bool +_bt_vacuum_needs_cleanup(IndexVacuumInfo *info) +{ + Buffer metabuf; + Page metapg; + BTPageOpaque metaopaque; + BTMetaPageData *metad; + bool result = false; + + metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ); + metapg = BufferGetPage(metabuf); + metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); + metad = BTPageGetMeta(metapg); + + if (metad->btm_version < BTREE_VERSION) + { + /* + * Do cleanup if metapage needs upgrade, because we don't have + * cleanup-related meta-information yet. + */ + result = true; + } + else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && + TransactionIdPrecedes(metad->btm_oldest_btpo_xact, + RecentGlobalXmin)) + { + /* + * If oldest btpo.xact in the deleted pages is older than + * RecentGlobalXmin, then at least one deleted page can be recycled. + */ + result = true; + } + else + { + StdRdOptions *relopts; + float8 cleanup_scale_factor; + + /* + * If table receives large enough amount of insertions and no cleanup + * was performed, then index might appear to have stalled statistics. + * In order to evade that, we perform cleanup when table receives + * vacuum_cleanup_index_scale_factor fractions of insertions. + */ + relopts = (StdRdOptions *) info->index->rd_options; + cleanup_scale_factor = (relopts && + relopts->vacuum_cleanup_index_scale_factor >= 0) + ? relopts->vacuum_cleanup_index_scale_factor + : vacuum_cleanup_index_scale_factor; + + if (cleanup_scale_factor < 0 || + metad->btm_last_cleanup_num_heap_tuples < 0 || + info->num_heap_tuples > (1.0 + cleanup_scale_factor) * + metad->btm_last_cleanup_num_heap_tuples) + result = true; + } + + _bt_relbuf(info->index, metabuf); + return result; +} + +/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. 
@@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* The ENSURE stuff ensures we clean up shared memory on failure */ PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); { + TransactionId oldestBtpoXact; + cycleid = _bt_start_vacuum(rel); - btvacuumscan(info, stats, callback, callback_state, cycleid); + btvacuumscan(info, stats, callback, callback_state, cycleid, + &oldestBtpoXact); + + /* + * Update cleanup-related information in metapage. These information + * is used only for cleanup but keeping up them to date can avoid + * unnecessary cleanup even after bulkdelete. + */ + _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, + info->num_heap_tuples); } PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)); _bt_end_vacuum(rel); @@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) /* * If btbulkdelete was called, we need not do anything, just return the - * stats from the latest btbulkdelete call. If it wasn't called, we must - * still do a pass over the index, to recycle any newly-recyclable pages - * and to obtain index statistics. + * stats from the latest btbulkdelete call. If it wasn't called, we might + * still need to do a pass over the index, to recycle any newly-recyclable + * pages and to obtain index statistics. _bt_vacuum_needs_cleanup checks + * is there are newly-recyclable or stalled index statistics. * * Since we aren't going to actually delete any leaf items, there's no * need to go through all the vacuum-cycle-ID pushups. 
*/ if (stats == NULL) { + TransactionId oldestBtpoXact; + + /* Check if we need a cleanup */ + if (!_bt_vacuum_needs_cleanup(info)) + return NULL; + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - btvacuumscan(info, stats, NULL, NULL, 0); + btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact); + + /* Update cleanup-related information in the metapage */ + _bt_update_meta_cleanup_info(info->index, oldestBtpoXact, + info->num_heap_tuples); } /* @@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, - BTCycleId cycleid) + BTCycleId cycleid, TransactionId *oldestBtpoXact) { Relation rel = info->index; BTVacState vstate; @@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastBlockLocked = BTREE_METAPAGE; vstate.totFreePages = 0; + vstate.oldestBtpoXact = InvalidTransactionId; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; + + if (oldestBtpoXact) + *oldestBtpoXact = vstate.oldestBtpoXact; } /* @@ -1070,6 +1164,11 @@ restart: { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; + + /* Update the oldest btpo.xact */ + if (!TransactionIdIsValid(vstate->oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) + vstate->oldestBtpoXact = opaque->btpo.xact; } else if (P_ISHALFDEAD(opaque)) { @@ -1238,7 +1337,12 @@ restart: /* count only this page, else may double-count parent */ if (ndel) + { stats->pages_deleted++; + if 
(!TransactionIdIsValid(vstate->oldestBtpoXact) || + TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact)) + vstate->oldestBtpoXact = opaque->btpo.xact; + } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 233c3965d95..b565bcb5401 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id) md->btm_level = xlrec->level; md->btm_fastroot = xlrec->fastroot; md->btm_fastlevel = xlrec->fastlevel; + md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact; + md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; @@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record) } } - void btree_redo(XLogReaderState *record) { @@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record) case XLOG_BTREE_REUSE_PAGE: btree_xlog_reuse_page(record); break; + case XLOG_BTREE_META_CLEANUP: + _bt_restore_meta(record, 0); + break; default: elog(PANIC, "btree_redo: unknown op code %u", info); } diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 446040d8160..c1f0441b081 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -138,3 +138,5 @@ int VacuumPageDirty = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; + +double vacuum_cleanup_index_scale_factor; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 4ffc8451ca4..260ae264d88 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] = NULL, NULL, NULL }, + { + {"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM, + gettext_noop("Number of tuple inserts prior 
to index cleanup as a fraction of reltuples."), + NULL + }, + &vacuum_cleanup_index_scale_factor, + 0.1, 0.0, 100.0, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 2b0b1da7636..f532f3ffff3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -102,6 +102,11 @@ typedef struct BTMetaPageData uint32 btm_level; /* tree level of the root page */ BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ + /* following fields are available since page version 3 */ + TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among of + * deleted pages */ + float4 btm_last_cleanup_num_heap_tuples; /* number of heap tuples + * during last cleanup */ } BTMetaPageData; #define BTPageGetMeta(p) \ @@ -109,7 +114,8 @@ typedef struct BTMetaPageData #define BTREE_METAPAGE 0 /* first page is meta */ #define BTREE_MAGIC 0x053162 /* magic number of btree pages */ -#define BTREE_VERSION 2 /* current version number */ +#define BTREE_VERSION 3 /* current version number */ +#define BTREE_MIN_VERSION 2 /* minimal supported version number */ /* * Maximum size of a btree index entry, including its tuple header. 
@@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack); * prototypes for functions in nbtpage.c */ extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level); +extern void _bt_update_meta_cleanup_info(Relation rel, + TransactionId oldestBtpoXact, float8 numHeapTuples); +extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); extern int _bt_getrootheight(Relation rel); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 8297df75fe8..a8ccdcec426 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -38,6 +38,8 @@ * vacuum */ #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from * FSM */ +#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the + * metapage */ /* * All that we need to regenerate the meta-data page @@ -48,6 +50,8 @@ typedef struct xl_btree_metadata uint32 level; BlockNumber fastroot; uint32 fastlevel; + TransactionId oldest_btpo_xact; + double last_cleanup_num_heap_tuples; } xl_btree_metadata; /* diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index a4574cd5331..a429a19964e 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -256,6 +256,8 @@ extern int VacuumPageDirty; extern int VacuumCostBalance; extern bool VacuumCostActive; +extern double vacuum_cleanup_index_scale_factor; + /* in tcop/postgres.c */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index c26c395b0bd..9826c67fc41 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -287,6 +287,8 @@ typedef struct StdRdOptions { int32 vl_len_; /* varlena header (do not touch directly!) 
*/ int fillfactor; /* page fill factor in percent (0..100) */ + /* fraction of newly inserted tuples prior to trigger index cleanup */ + float8 vacuum_cleanup_index_scale_factor; int toast_tuple_target; /* target for tuple toasting */ AutoVacOpts autovacuum; /* autovacuum-related options */ bool user_catalog_table; /* use as an additional catalog relation */ diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 755cd177925..4778ac14a4c 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -150,3 +150,32 @@ vacuum btree_tall_tbl; -- need to insert some rows to cause the fast root page to split. insert into btree_tall_tbl (id, t) select g, repeat('x', 100) from generate_series(1, 500) g; +-- +-- Test vacuum_cleanup_index_scale_factor +-- +-- Simple create +create table btree_test(a int); +create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + reloptions +------------------------------------------ + {vacuum_cleanup_index_scale_factor=40.0} +(1 row) + +-- Fail while setting improper values +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0); +ERROR: value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor" +DETAIL: Valid values are between "0.000000" and "100.000000". 
+create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string'); +ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": string +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true); +ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": true +-- Simple ALTER INDEX +alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + reloptions +------------------------------------------ + {vacuum_cleanup_index_scale_factor=70.0} +(1 row) + diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 65b08c82824..21171f77625 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -92,3 +92,22 @@ vacuum btree_tall_tbl; -- need to insert some rows to cause the fast root page to split. 
insert into btree_tall_tbl (id, t) select g, repeat('x', 100) from generate_series(1, 500) g; + +-- +-- Test vacuum_cleanup_index_scale_factor +-- + +-- Simple create +create table btree_test(a int); +create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; + +-- Fail while setting improper values +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string'); +create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true); + +-- Simple ALTER INDEX +alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0); +select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass; |