diff options
Diffstat (limited to 'src/include')
-rw-r--r-- | src/include/access/heapam.h | 5 |
-rw-r--r-- | src/include/access/nbtree.h | 19 |
-rw-r--r-- | src/include/access/nbtxlog.h | 93 |
-rw-r--r-- | src/include/access/tableam.h | 128 |
-rw-r--r-- | src/include/access/xlog_internal.h | 2 |
5 files changed, 183 insertions(+), 64 deletions(-)
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 95cb1e346b2..d96a47b1cec 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -166,9 +166,8 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup); -extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel, - ItemPointerData *items, - int nitems); +extern TransactionId heap_index_delete_tuples(Relation rel, + TM_IndexDeleteOp *delstate); /* in heap/pruneheap.c */ struct GlobalVisState; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 7f8489aac2a..cad4f2bdeb9 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -17,6 +17,7 @@ #include "access/amapi.h" #include "access/itup.h" #include "access/sdir.h" +#include "access/tableam.h" #include "access/xlogreader.h" #include "catalog/pg_am_d.h" #include "catalog/pg_index.h" @@ -168,7 +169,7 @@ typedef struct BTMetaPageData /* * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples * that may be stored on a btree leaf page. It is used to size the - * per-page temporary buffers used by index scans. + * per-page temporary buffers. * * Note: we don't bother considering per-tuple overheads here to keep * things simple (value is based on how many elements a single array of @@ -766,8 +767,9 @@ typedef struct BTDedupStateData typedef BTDedupStateData *BTDedupState; /* - * BTVacuumPostingData is state that represents how to VACUUM a posting list - * tuple when some (though not all) of its TIDs are to be deleted. + * BTVacuumPostingData is state that represents how to VACUUM (or delete) a + * posting list tuple when some (though not all) of its TIDs are to be + * deleted. * * Convention is that itup field is the original posting list tuple on input, * and palloc()'d final tuple used to overwrite existing tuple on output. 
@@ -1031,6 +1033,8 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem, Size newitemsz, bool checkingunique); +extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, + Size newitemsz); extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, OffsetNumber baseoff); extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup); @@ -1045,7 +1049,8 @@ extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, * prototypes for functions in nbtinsert.c */ extern bool _bt_doinsert(Relation rel, IndexTuple itup, - IndexUniqueCheck checkUnique, Relation heapRel); + IndexUniqueCheck checkUnique, bool indexUnchanged, + Relation heapRel); extern void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack); extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child); @@ -1083,9 +1088,9 @@ extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable); -extern void _bt_delitems_delete(Relation rel, Buffer buf, - OffsetNumber *deletable, int ndeletable, - Relation heapRel); +extern void _bt_delitems_delete_check(Relation rel, Buffer buf, + Relation heapRel, + TM_IndexDeleteOp *delstate); extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact); diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index f5d3e9f5e09..7ae5c98c2b8 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -177,24 +177,6 @@ typedef struct xl_btree_dedup #define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16)) /* - * This is what we need to know about delete of individual leaf index tuples. 
- * The WAL record can represent deletion of any number of index tuples on a - * single index page when *not* executed by VACUUM. Deletion of a subset of - * the TIDs within a posting list tuple is not supported. - * - * Backup Blk 0: index page - */ -typedef struct xl_btree_delete -{ - TransactionId latestRemovedXid; - uint32 ndeleted; - - /* DELETED TARGET OFFSET NUMBERS FOLLOW */ -} xl_btree_delete; - -#define SizeOfBtreeDelete (offsetof(xl_btree_delete, ndeleted) + sizeof(uint32)) - -/* * This is what we need to know about page reuse within btree. This record * only exists to generate a conflict point for Hot Standby. * @@ -211,31 +193,30 @@ typedef struct xl_btree_reuse_page #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) /* - * This is what we need to know about which TIDs to remove from an individual - * posting list tuple during vacuuming. An array of these may appear at the - * end of xl_btree_vacuum records. - */ -typedef struct xl_btree_update -{ - uint16 ndeletedtids; - - /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */ -} xl_btree_update; - -#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16)) - -/* - * This is what we need to know about a VACUUM of a leaf page. The WAL record - * can represent deletion of any number of index tuples on a single index page - * when executed by VACUUM. It can also support "updates" of index tuples, - * which is how deletes of a subset of TIDs contained in an existing posting - * list tuple are implemented. (Updates are only used when there will be some - * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can - * just be deleted). + * xl_btree_vacuum and xl_btree_delete records describe deletion of index + * tuples on a leaf page. The former variant is used by VACUUM, while the + * latter variant is used by the ad-hoc deletions that sometimes take place + * when btinsert() is called. + * + * The records are very similar. 
The only difference is that xl_btree_delete + * has to include a latestRemovedXid field to generate recovery conflicts. + * (VACUUM operations can just rely on earlier conflicts generated during + * pruning of the table whose TIDs the to-be-deleted index tuples point to. + * There are also small differences between each REDO routine that we don't go + * into here.) + * + * xl_btree_vacuum and xl_btree_delete both represent deletion of any number + * of index tuples on a single leaf page using page offset numbers. Both also + * support "updates" of index tuples, which is how deletes of a subset of TIDs + * contained in an existing posting list tuple are implemented. * * Updated posting list tuples are represented using xl_btree_update metadata. - * The REDO routine uses each xl_btree_update (plus its corresponding original - * index tuple from the target leaf page) to generate the final updated tuple. + * The REDO routines each use the xl_btree_update entries (plus each + * corresponding original index tuple from the target leaf page) to generate + * the final updated tuple. + * + * Updates are only used when there will be some remaining TIDs left by the + * REDO routine. Otherwise the posting list tuple just gets deleted outright. 
*/ typedef struct xl_btree_vacuum { @@ -244,11 +225,39 @@ typedef struct xl_btree_vacuum /* DELETED TARGET OFFSET NUMBERS FOLLOW */ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ - /* UPDATED TUPLES METADATA ARRAY FOLLOWS */ + /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */ } xl_btree_vacuum; #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) +typedef struct xl_btree_delete +{ + TransactionId latestRemovedXid; + uint16 ndeleted; + uint16 nupdated; + + /* DELETED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ + /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */ +} xl_btree_delete; + +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16)) + +/* + * The offsets that appear in xl_btree_update metadata are offsets into the + * original posting list from tuple, not page offset numbers. These are + * 0-based. The page offset number for the original posting list tuple comes + * from the main xl_btree_vacuum/xl_btree_delete record. + */ +typedef struct xl_btree_update +{ + uint16 ndeletedtids; + + /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */ +} xl_btree_update; + +#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16)) + /* * This is what we need to know about marking an empty subtree for deletion. * The target identifies the tuple removed from the parent page (note that we diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index c2357a72e48..33bffb6815b 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -128,6 +128,106 @@ typedef struct TM_FailureData bool traversed; } TM_FailureData; +/* + * State used when calling table_index_delete_tuples(). + * + * Represents the status of table tuples, referenced by table TID and taken by + * index AM from index tuples. 
State consists of high level parameters of the + * deletion operation, plus two mutable palloc()'d arrays for information + * about the status of individual table tuples. These are conceptually one + * single array. Using two arrays keeps the TM_IndexDelete struct small, + * which makes sorting the first array (the deltids array) fast. + * + * Some index AM callers perform simple index tuple deletion (by specifying + * bottomup = false), and include only known-dead deltids. These known-dead + * entries are all marked knowndeletable = true directly (typically these are + * TIDs from LP_DEAD-marked index tuples), but that isn't strictly required. + * + * Callers that specify bottomup = true are "bottom-up index deletion" + * callers. The considerations for the tableam are more subtle with these + * callers because they ask the tableam to perform highly speculative work, + * and might only expect the tableam to check a small fraction of all entries. + * Caller is not allowed to specify knowndeletable = true for any entry + * because everything is highly speculative. Bottom-up caller provides + * context and hints to tableam -- see comments below for details on how index + * AMs and tableams should coordinate during bottom-up index deletion. + * + * Simple index deletion callers may ask the tableam to perform speculative + * work, too. This is a little like bottom-up deletion, but not too much. + * The tableam will only perform speculative work when it's practically free + * to do so in passing for simple deletion caller (while always performing + * whatever work is needed to enable knowndeletable/LP_DEAD index tuples to + * be deleted within index AM). This is the real reason why it's possible for + * simple index deletion caller to specify knowndeletable = false up front + * (this means "check if it's possible for me to delete corresponding index + * tuple when it's cheap to do so in passing"). 
The index AM should only + * include "extra" entries for index tuples whose TIDs point to a table block + * that tableam is expected to have to visit anyway (in the event of a block + * orientated tableam). The tableam isn't strictly obligated to check these + * "extra" TIDs, but a block-based AM should always manage to do so in + * practice. + * + * The final contents of the deltids/status arrays are interesting to callers + * that ask tableam to perform speculative work (i.e. when _any_ items have + * knowndeletable set to false up front). These index AM callers will + * naturally need to consult final state to determine which index tuples are + * in fact deletable. + * + * The index AM can keep track of which index tuple relates to which deltid by + * setting idxoffnum (and/or relying on each entry being uniquely identifiable + * using tid), which is important when the final contents of the array will + * need to be interpreted -- the array can shrink from initial size after + * tableam processing and/or have entries in a new order (tableam may sort + * deltids array for its own reasons). Bottom-up callers may find that final + * ndeltids is 0 on return from call to tableam, in which case no index tuple + * deletions are possible. Simple deletion callers can rely on any entries + * they know to be deletable appearing in the final array as deletable. + */ +typedef struct TM_IndexDelete +{ + ItemPointerData tid; /* table TID from index tuple */ + int16 id; /* Offset into TM_IndexStatus array */ +} TM_IndexDelete; + +typedef struct TM_IndexStatus +{ + OffsetNumber idxoffnum; /* Index am page offset number */ + bool knowndeletable; /* Currently known to be deletable? */ + + /* Bottom-up index deletion specific fields follow */ + bool promising; /* Promising (duplicate) index tuple? */ + int16 freespace; /* Space freed in index if deleted */ +} TM_IndexStatus; + +/* + * Index AM/tableam coordination is central to the design of bottom-up index + * deletion. 
The index AM provides hints about where to look to the tableam + * by marking some entries as "promising". Index AM does this with duplicate + * index tuples that are strongly suspected to be old versions left behind by + * UPDATEs that did not logically modify indexed values. Index AM may find it + * helpful to only mark entries as promising when they're thought to have been + * affected by such an UPDATE in the recent past. + * + * Bottom-up index deletion casts a wide net at first, usually by including + * all TIDs on a target index page. It is up to the tableam to worry about + * the cost of checking transaction status information. The tableam is in + * control, but needs careful guidance from the index AM. Index AM requests + * that bottomupfreespace target be met, while tableam measures progress + * towards that goal by tallying the per-entry freespace value for known + * deletable entries. (All !bottomup callers can just set these space related + * fields to zero.) + */ +typedef struct TM_IndexDeleteOp +{ + bool bottomup; /* Bottom-up (not simple) deletion? 
*/ + int bottomupfreespace; /* Bottom-up space target */ + + /* Mutable per-TID information follows (index AM initializes entries) */ + int ndeltids; /* Current # of deltids/status elements */ + TM_IndexDelete *deltids; + TM_IndexStatus *status; +} TM_IndexDeleteOp; + /* "options" flag bits for table_tuple_insert */ /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */ #define TABLE_INSERT_SKIP_FSM 0x0002 @@ -342,10 +442,9 @@ typedef struct TableAmRoutine TupleTableSlot *slot, Snapshot snapshot); - /* see table_compute_xid_horizon_for_tuples() */ - TransactionId (*compute_xid_horizon_for_tuples) (Relation rel, - ItemPointerData *items, - int nitems); + /* see table_index_delete_tuples() */ + TransactionId (*index_delete_tuples) (Relation rel, + TM_IndexDeleteOp *delstate); /* ------------------------------------------------------------------------ @@ -1122,16 +1221,23 @@ table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, } /* - * Compute the newest xid among the tuples pointed to by items. This is used - * to compute what snapshots to conflict with when replaying WAL records for - * page-level index vacuums. + * Determine which index tuples are safe to delete based on their table TID. + * + * Determines which entries from index AM caller's TM_IndexDeleteOp state + * point to vacuumable table tuples. Entries that are found by tableam to be + * vacuumable are naturally safe for index AM to delete, and so get directly + * marked as deletable. See comments above TM_IndexDelete and comments above + * TM_IndexDeleteOp for full details. + * + * Returns a latestRemovedXid transaction ID that caller generally places in + * its index deletion WAL record. This might be used during subsequent REDO + * of the WAL record when in Hot Standby mode -- a recovery conflict for the + * index deletion operation might be required on the standby. 
*/ static inline TransactionId -table_compute_xid_horizon_for_tuples(Relation rel, - ItemPointerData *items, - int nitems) +table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) { - return rel->rd_tableam->compute_xid_horizon_for_tuples(rel, items, nitems); + return rel->rd_tableam->index_delete_tuples(rel, delstate); } diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 9585ad17b3c..224cae0246f 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD108 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { |