diff options
author | Andres Freund <andres@anarazel.de> | 2019-03-28 20:01:14 -0700 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2019-03-28 20:01:43 -0700 |
commit | d25f519107bff602e1ebc81853fe592d020c118d (patch) | |
tree | 00cfff63480c0d555f372ba2e1866d6622740432 /src/backend/access/heap/heapam_handler.c | |
parent | 7e69323bf72a924fd1b04a7a91da343a0cda91cf (diff) | |
download | postgresql-d25f519107bff602e1ebc81853fe592d020c118d.tar.gz postgresql-d25f519107bff602e1ebc81853fe592d020c118d.zip |
tableam: relation creation, VACUUM FULL/CLUSTER, SET TABLESPACE.
This moves the responsibility for:
- creating the storage necessary for a relation, including creating a
new relfilenode for a relation with existing storage
- non-transactional truncation of a relation
- VACUUM FULL / CLUSTER's rewrite of a table
below tableam.
This is fairly straight forward, with a bit of complexity smattered in
to move the computation of xid / multixid horizons below the AM, as
they don't make sense for every table AM.
Author: Andres Freund
Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
Diffstat (limited to 'src/backend/access/heap/heapam_handler.c')
-rw-r--r-- | src/backend/access/heap/heapam_handler.c | 451 |
1 files changed, 451 insertions, 0 deletions
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 1e4394a665b..581a6bd9d16 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -23,16 +23,32 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/multixact.h" +#include "access/rewriteheap.h" #include "access/tableam.h" #include "access/xact.h" #include "catalog/catalog.h" #include "catalog/index.h" +#include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "commands/progress.h" #include "executor/executor.h" +#include "pgstat.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" +#include "storage/bufmgr.h" #include "storage/lmgr.h" +#include "storage/predicate.h" #include "storage/procarray.h" +#include "storage/smgr.h" #include "utils/builtins.h" +#include "utils/rel.h" + + +static void +reform_and_rewrite_tuple(HeapTuple tuple, + Relation OldHeap, Relation NewHeap, + Datum *values, bool *isnull, RewriteState rwstate); static const TableAmRoutine heapam_methods; @@ -523,6 +539,388 @@ tuple_lock_retry: * ------------------------------------------------------------------------ */ +static void +heapam_relation_set_new_filenode(Relation rel, char persistence, + TransactionId *freezeXid, + MultiXactId *minmulti) +{ + /* + * Initialize to the minimum XID that could put tuples in the table. We + * know that no xacts older than RecentXmin are still running, so that + * will do. + */ + *freezeXid = RecentXmin; + + /* + * Similarly, initialize the minimum Multixact to the first value that + * could possibly be stored in tuples in the table. Running transactions + * could reuse values from their local cache, so we are careful to + * consider all currently running multis. + * + * XXX this could be refined further, but is it worth the hassle? + */ + *minmulti = GetOldestMultiXactId(); + + RelationCreateStorage(rel->rd_node, persistence); + + /* + * If required, set up an init fork for an unlogged table so that it can + * be correctly reinitialized on restart. An immediate sync is required + * even if the page has been logged, because the write did not go through + * shared_buffers and therefore a concurrent checkpoint may have moved the + * redo pointer past our xlog record. Recovery may as well remove it + * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE + * record. Therefore, logging is necessary even if wal_level=minimal. + */ + if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) + { + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + RelationOpenSmgr(rel); + smgrcreate(rel->rd_smgr, INIT_FORKNUM, false); + log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM); + smgrimmedsync(rel->rd_smgr, INIT_FORKNUM); + } +} + +static void +heapam_relation_nontransactional_truncate(Relation rel) +{ + RelationTruncate(rel, 0); +} + +static void +heapam_relation_copy_data(Relation rel, RelFileNode newrnode) +{ + SMgrRelation dstrel; + + dstrel = smgropen(newrnode, rel->rd_backend); + RelationOpenSmgr(rel); + + /* + * Create and copy all forks of the relation, and schedule unlinking of + * old physical files. + * + * NOTE: any conflict in relfilenode value will be caught in + * RelationCreateStorage(). + */ + RelationCreateStorage(newrnode, rel->rd_rel->relpersistence); + + /* copy main fork */ + RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM, + rel->rd_rel->relpersistence); + + /* copy those extra forks that exist */ + for (ForkNumber forkNum = MAIN_FORKNUM + 1; + forkNum <= MAX_FORKNUM; forkNum++) + { + if (smgrexists(rel->rd_smgr, forkNum)) + { + smgrcreate(dstrel, forkNum, false); + + /* + * WAL log creation if the relation is persistent, or this is the + * init fork of an unlogged relation. + */ + if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT || + (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && + forkNum == INIT_FORKNUM)) + log_smgrcreate(&newrnode, forkNum); + RelationCopyStorage(rel->rd_smgr, dstrel, forkNum, + rel->rd_rel->relpersistence); + } + } + + + /* drop old relation, and close new one */ + RelationDropStorage(rel); + smgrclose(dstrel); +} + +static void +heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap, + Relation OldIndex, bool use_sort, + TransactionId OldestXmin, + TransactionId FreezeXid, + MultiXactId MultiXactCutoff, + double *num_tuples, + double *tups_vacuumed, + double *tups_recently_dead) +{ + RewriteState rwstate; + IndexScanDesc indexScan; + TableScanDesc tableScan; + HeapScanDesc heapScan; + bool use_wal; + bool is_system_catalog; + Tuplesortstate *tuplesort; + TupleDesc oldTupDesc = RelationGetDescr(OldHeap); + TupleDesc newTupDesc = RelationGetDescr(NewHeap); + TupleTableSlot *slot; + int natts; + Datum *values; + bool *isnull; + BufferHeapTupleTableSlot *hslot; + + /* Remember if it's a system catalog */ + is_system_catalog = IsSystemRelation(OldHeap); + + /* + * We need to log the copied data in WAL iff WAL archiving/streaming is + * enabled AND it's a WAL-logged rel. + */ + use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap); + + /* use_wal off requires smgr_targblock be initially invalid */ + Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber); + + /* Preallocate values/isnull arrays */ + natts = newTupDesc->natts; + values = (Datum *) palloc(natts * sizeof(Datum)); + isnull = (bool *) palloc(natts * sizeof(bool)); + + /* Initialize the rewrite operation */ + rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid, + MultiXactCutoff, use_wal); + + + /* Set up sorting if wanted */ + if (use_sort) + tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, + maintenance_work_mem, + NULL, false); + else + tuplesort = NULL; + + /* + * Prepare to scan the OldHeap. To ensure we see recently-dead tuples + * that still need to be copied, we scan with SnapshotAny and use + * HeapTupleSatisfiesVacuum for the visibility test. + */ + if (OldIndex != NULL && !use_sort) + { + const int ci_index[] = { + PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_INDEX_RELID + }; + int64 ci_val[2]; + + /* Set phase and OIDOldIndex to columns */ + ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP; + ci_val[1] = RelationGetRelid(OldIndex); + pgstat_progress_update_multi_param(2, ci_index, ci_val); + + tableScan = NULL; + heapScan = NULL; + indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0); + index_rescan(indexScan, NULL, 0, NULL, 0); + } + else + { + /* In scan-and-sort mode and also VACUUM FULL, set phase */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP); + + tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL); + heapScan = (HeapScanDesc) tableScan; + indexScan = NULL; + + /* Set total heap blocks */ + pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, + heapScan->rs_nblocks); + } + + slot = table_slot_create(OldHeap, NULL); + hslot = (BufferHeapTupleTableSlot *) slot; + + /* + * Scan through the OldHeap, either in OldIndex order or sequentially; + * copy each tuple into the NewHeap, or transiently to the tuplesort + * module. Note that we don't bother sorting dead tuples (they won't get + * to the new table anyway). + */ + for (;;) + { + HeapTuple tuple; + Buffer buf; + bool isdead; + + CHECK_FOR_INTERRUPTS(); + + if (indexScan != NULL) + { + if (!index_getnext_slot(indexScan, ForwardScanDirection, slot)) + break; + + /* Since we used no scan keys, should never need to recheck */ + if (indexScan->xs_recheck) + elog(ERROR, "CLUSTER does not support lossy index conditions"); + } + else + { + if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot)) + break; + + /* In scan-and-sort mode and also VACUUM FULL, set heap blocks scanned */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, + heapScan->rs_cblock + 1); + } + + tuple = ExecFetchSlotHeapTuple(slot, false, NULL); + buf = hslot->buffer; + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + /* Definitely dead */ + isdead = true; + break; + case HEAPTUPLE_RECENTLY_DEAD: + *tups_recently_dead += 1; + /* fall through */ + case HEAPTUPLE_LIVE: + /* Live or recently dead, must copy it */ + isdead = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + + /* + * Since we hold exclusive lock on the relation, normally the + * only way to see this is if it was inserted earlier in our + * own transaction. However, it can happen in system + * catalogs, since we tend to release write lock before commit + * there. Give a warning if neither case applies; but in any + * case we had better copy it. + */ + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data))) + elog(WARNING, "concurrent insert in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); + /* treat as live */ + isdead = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + + /* + * Similar situation to INSERT_IN_PROGRESS case. + */ + if (!is_system_catalog && + !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data))) + elog(WARNING, "concurrent delete in progress within table \"%s\"", + RelationGetRelationName(OldHeap)); + /* treat as recently dead */ + *tups_recently_dead += 1; + isdead = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + isdead = false; /* keep compiler quiet */ + break; + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + if (isdead) + { + *tups_vacuumed += 1; + /* heap rewrite module still needs to see it... */ + if (rewrite_heap_dead_tuple(rwstate, tuple)) + { + /* A previous recently-dead tuple is now known dead */ + *tups_vacuumed += 1; + *tups_recently_dead -= 1; + } + continue; + } + + *num_tuples += 1; + if (tuplesort != NULL) + { + tuplesort_putheaptuple(tuplesort, tuple); + + /* In scan-and-sort mode, report increase in number of tuples scanned */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + *num_tuples); + } + else + { + const int ct_index[] = { + PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, + PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN + }; + int64 ct_val[2]; + + reform_and_rewrite_tuple(tuple, OldHeap, NewHeap, + values, isnull, rwstate); + + /* + * In indexscan mode and also VACUUM FULL, report increase in + * number of tuples scanned and written + */ + ct_val[0] = *num_tuples; + ct_val[1] = *num_tuples; + pgstat_progress_update_multi_param(2, ct_index, ct_val); + } + } + + if (indexScan != NULL) + index_endscan(indexScan); + if (tableScan != NULL) + table_endscan(tableScan); + if (slot) + ExecDropSingleTupleTableSlot(slot); + + /* + * In scan-and-sort mode, complete the sort, then read out all live tuples + * from the tuplestore and write them to the new relation. + */ + if (tuplesort != NULL) + { + double n_tuples = 0; + /* Report that we are now sorting tuples */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_SORT_TUPLES); + + tuplesort_performsort(tuplesort); + + /* Report that we are now writing new heap */ + pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, + PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP); + + for (;;) + { + HeapTuple tuple; + + CHECK_FOR_INTERRUPTS(); + + tuple = tuplesort_getheaptuple(tuplesort, true); + if (tuple == NULL) + break; + + n_tuples += 1; + reform_and_rewrite_tuple(tuple, + OldHeap, NewHeap, + values, isnull, + rwstate); + /* Report n_tuples */ + pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, + n_tuples); + } + + tuplesort_end(tuplesort); + } + + /* Write out any remaining tuples, and fsync if needed */ + end_heap_rewrite(rwstate); + + /* Clean up */ + pfree(values); + pfree(isnull); +} + static double heapam_index_build_range_scan(Relation heapRelation, Relation indexRelation, @@ -1256,6 +1654,55 @@ heapam_index_validate_scan(Relation heapRelation, } +/* ---------------------------------------------------------------------------- + * Helper functions for the above. + * ---------------------------------------------------------------------------- + */ + +/* + * Reconstruct and rewrite the given tuple + * + * We cannot simply copy the tuple as-is, for several reasons: + * + * 1. We'd like to squeeze out the values of any dropped columns, both + * to save space and to ensure we have no corner-case failures. (It's + * possible for example that the new table hasn't got a TOAST table + * and so is unable to store any large values of dropped cols.) + * + * 2. The tuple might not even be legal for the new table; this is + * currently only known to happen as an after-effect of ALTER TABLE + * SET WITHOUT OIDS. + * + * So, we must reconstruct the tuple from component Datums. + */ +static void +reform_and_rewrite_tuple(HeapTuple tuple, + Relation OldHeap, Relation NewHeap, + Datum *values, bool *isnull, RewriteState rwstate) +{ + TupleDesc oldTupDesc = RelationGetDescr(OldHeap); + TupleDesc newTupDesc = RelationGetDescr(NewHeap); + HeapTuple copiedTuple; + int i; + + heap_deform_tuple(tuple, oldTupDesc, values, isnull); + + /* Be sure to null out any dropped columns */ + for (i = 0; i < newTupDesc->natts; i++) + { + if (TupleDescAttr(newTupDesc, i)->attisdropped) + isnull[i] = true; + } + + copiedTuple = heap_form_tuple(newTupDesc, values, isnull); + + /* The heap rewrite module does the rest */ + rewrite_heap_tuple(rwstate, tuple, copiedTuple); + + heap_freetuple(copiedTuple); +} + + /* ------------------------------------------------------------------------ * Definition of the heap table access method. * ------------------------------------------------------------------------ @@ -1292,6 +1739,10 @@ static const TableAmRoutine heapam_methods = { .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples, + .relation_set_new_filenode = heapam_relation_set_new_filenode, + .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, + .relation_copy_data = heapam_relation_copy_data, + .relation_copy_for_cluster = heapam_relation_copy_for_cluster, .index_build_range_scan = heapam_index_build_range_scan, .index_validate_scan = heapam_index_validate_scan, }; |