Diffstat (limited to 'src/include')
-rw-r--r--  src/include/access/heapam.h          5
-rw-r--r--  src/include/access/nbtree.h         19
-rw-r--r--  src/include/access/nbtxlog.h        93
-rw-r--r--  src/include/access/tableam.h       128
-rw-r--r--  src/include/access/xlog_internal.h   2
5 files changed, 183 insertions, 64 deletions
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 95cb1e346b2..d96a47b1cec 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -166,9 +166,8 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid);
extern void simple_heap_update(Relation relation, ItemPointer otid,
HeapTuple tup);
-extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel,
- ItemPointerData *items,
- int nitems);
+extern TransactionId heap_index_delete_tuples(Relation rel,
+ TM_IndexDeleteOp *delstate);
/* in heap/pruneheap.c */
struct GlobalVisState;
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7f8489aac2a..cad4f2bdeb9 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -17,6 +17,7 @@
#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
+#include "access/tableam.h"
#include "access/xlogreader.h"
#include "catalog/pg_am_d.h"
#include "catalog/pg_index.h"
@@ -168,7 +169,7 @@ typedef struct BTMetaPageData
/*
 * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs that
 * may be stored on a btree leaf page. It is used to size the
- * per-page temporary buffers used by index scans.
+ * per-page temporary buffers.
*
* Note: we don't bother considering per-tuple overheads here to keep
* things simple (value is based on how many elements a single array of
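The comment change above reflects that MaxTIDsPerBTreePage now sizes the per-page temporary buffers used by deletion code as well as those used by index scans. A minimal sketch of the sizing pattern, assuming the existing BTScanPosItem and TM_IndexDelete types (the local variable names are illustrative only):

    /* index scans stage at most one entry per heap TID on the leaf page */
    BTScanPosItem items[MaxTIDsPerBTreePage];

    /* deletion code can size its deltids array with the same bound */
    TM_IndexDelete *deltids = (TM_IndexDelete *)
        palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));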
@@ -766,8 +767,9 @@ typedef struct BTDedupStateData
typedef BTDedupStateData *BTDedupState;
/*
- * BTVacuumPostingData is state that represents how to VACUUM a posting list
- * tuple when some (though not all) of its TIDs are to be deleted.
+ * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
+ * posting list tuple when some (though not all) of its TIDs are to be
+ * deleted.
*
* Convention is that itup field is the original posting list tuple on input,
* and palloc()'d final tuple used to overwrite existing tuple on output.
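To make the input convention concrete: a caller that wants to delete the TIDs at posting-list positions 1 and 3 of a tuple at page offset off might set up the state as below. This is a hedged sketch assuming the BTVacuumPostingData layout from nbtree.h (itup, updatedoffset, ndeletedtids, deletetids[]); origtuple and off stand in for values the caller already has, and the palloc()'d output tuple is built later:

    BTVacuumPosting vacposting;

    /* fixed-size header plus room for two deleted TID positions */
    vacposting = (BTVacuumPosting)
        palloc(offsetof(BTVacuumPostingData, deletetids) + 2 * sizeof(uint16));
    vacposting->itup = origtuple;   /* original posting list tuple (input) */
    vacposting->updatedoffset = off;
    vacposting->ndeletedtids = 2;
    vacposting->deletetids[0] = 1;  /* 0-based positions within posting list */
    vacposting->deletetids[1] = 3;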
@@ -1031,6 +1033,8 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
IndexTuple newitem, Size newitemsz,
bool checkingunique);
+extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
+ Size newitemsz);
extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
OffsetNumber baseoff);
extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
@@ -1045,7 +1049,8 @@ extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
* prototypes for functions in nbtinsert.c
*/
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
- IndexUniqueCheck checkUnique, Relation heapRel);
+ IndexUniqueCheck checkUnique, bool indexUnchanged,
+ Relation heapRel);
extern void _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child);
@@ -1083,9 +1088,9 @@ extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
-extern void _bt_delitems_delete(Relation rel, Buffer buf,
- OffsetNumber *deletable, int ndeletable,
- Relation heapRel);
+extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
+ Relation heapRel,
+ TM_IndexDeleteOp *delstate);
extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
TransactionId *oldestBtpoXact);
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index f5d3e9f5e09..7ae5c98c2b8 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -177,24 +177,6 @@ typedef struct xl_btree_dedup
#define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16))
/*
- * This is what we need to know about delete of individual leaf index tuples.
- * The WAL record can represent deletion of any number of index tuples on a
- * single index page when *not* executed by VACUUM. Deletion of a subset of
- * the TIDs within a posting list tuple is not supported.
- *
- * Backup Blk 0: index page
- */
-typedef struct xl_btree_delete
-{
- TransactionId latestRemovedXid;
- uint32 ndeleted;
-
- /* DELETED TARGET OFFSET NUMBERS FOLLOW */
-} xl_btree_delete;
-
-#define SizeOfBtreeDelete (offsetof(xl_btree_delete, ndeleted) + sizeof(uint32))
-
-/*
* This is what we need to know about page reuse within btree. This record
* only exists to generate a conflict point for Hot Standby.
*
@@ -211,31 +193,30 @@ typedef struct xl_btree_reuse_page
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
/*
- * This is what we need to know about which TIDs to remove from an individual
- * posting list tuple during vacuuming. An array of these may appear at the
- * end of xl_btree_vacuum records.
- */
-typedef struct xl_btree_update
-{
- uint16 ndeletedtids;
-
- /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
-} xl_btree_update;
-
-#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
-
-/*
- * This is what we need to know about a VACUUM of a leaf page. The WAL record
- * can represent deletion of any number of index tuples on a single index page
- * when executed by VACUUM. It can also support "updates" of index tuples,
- * which is how deletes of a subset of TIDs contained in an existing posting
- * list tuple are implemented. (Updates are only used when there will be some
- * remaining TIDs once VACUUM finishes; otherwise the posting list tuple can
- * just be deleted).
+ * xl_btree_vacuum and xl_btree_delete records describe deletion of index
+ * tuples on a leaf page. The former variant is used by VACUUM, while the
+ * latter variant is used by the ad-hoc deletions that sometimes take place
+ * when btinsert() is called.
+ *
+ * The records are very similar. The only difference is that xl_btree_delete
+ * has to include a latestRemovedXid field to generate recovery conflicts.
+ * (VACUUM operations can just rely on earlier conflicts generated during
+ * pruning of the table whose TIDs the to-be-deleted index tuples point to.
+ * There are also small differences between each REDO routine that we don't go
+ * into here.)
+ *
+ * xl_btree_vacuum and xl_btree_delete both represent deletion of any number
+ * of index tuples on a single leaf page using page offset numbers. Both also
+ * support "updates" of index tuples, which is how deletes of a subset of TIDs
+ * contained in an existing posting list tuple are implemented.
*
* Updated posting list tuples are represented using xl_btree_update metadata.
- * The REDO routine uses each xl_btree_update (plus its corresponding original
- * index tuple from the target leaf page) to generate the final updated tuple.
+ * The REDO routines each use the xl_btree_update entries (plus each
+ * corresponding original index tuple from the target leaf page) to generate
+ * the final updated tuple.
+ *
+ * Updates are only used when there will be some remaining TIDs left by the
+ * REDO routine. Otherwise the posting list tuple just gets deleted outright.
*/
typedef struct xl_btree_vacuum
{
@@ -244,11 +225,39 @@ typedef struct xl_btree_vacuum
/* DELETED TARGET OFFSET NUMBERS FOLLOW */
/* UPDATED TARGET OFFSET NUMBERS FOLLOW */
- /* UPDATED TUPLES METADATA ARRAY FOLLOWS */
+ /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
} xl_btree_vacuum;
#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
+typedef struct xl_btree_delete
+{
+ TransactionId latestRemovedXid;
+ uint16 ndeleted;
+ uint16 nupdated;
+
+ /* DELETED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TARGET OFFSET NUMBERS FOLLOW */
+ /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */
+} xl_btree_delete;
+
+#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16))
+
+/*
+ * The offsets that appear in xl_btree_update metadata are offsets into the
+ * posting list of the original tuple, not page offset numbers.  These are
+ * 0-based. The page offset number for the original posting list tuple comes
+ * from the main xl_btree_vacuum/xl_btree_delete record.
+ */
+typedef struct xl_btree_update
+{
+ uint16 ndeletedtids;
+
+ /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */
+} xl_btree_update;
+
+#define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16))
+
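Because xl_btree_vacuum and xl_btree_delete now share this trailing-array layout, a REDO routine can locate all three arrays with plain pointer arithmetic. A sketch consistent with the layout comments above; it assumes ptr points at the start of the per-page data registered with the record (names are illustrative, not the actual nbtxlog.c code):

    OffsetNumber *deleted = (OffsetNumber *) ptr;
    OffsetNumber *updatedoffsets = (OffsetNumber *)
        (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
    xl_btree_update *firstupdate = (xl_btree_update *)
        ((char *) updatedoffsets + xlrec->nupdated * sizeof(OffsetNumber));

    /* each xl_btree_update entry is followed by ndeletedtids uint16 values */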
/*
* This is what we need to know about marking an empty subtree for deletion.
* The target identifies the tuple removed from the parent page (note that we
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index c2357a72e48..33bffb6815b 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -128,6 +128,106 @@ typedef struct TM_FailureData
bool traversed;
} TM_FailureData;
+/*
+ * State used when calling table_index_delete_tuples().
+ *
+ * Represents the status of table tuples, referenced by table TID and taken by
+ * index AM from index tuples. State consists of high level parameters of the
+ * deletion operation, plus two mutable palloc()'d arrays for information
+ * about the status of individual table tuples. These are conceptually one
+ * single array. Using two arrays keeps the TM_IndexDelete struct small,
+ * which makes sorting the first array (the deltids array) fast.
+ *
+ * Some index AM callers perform simple index tuple deletion (by specifying
+ * bottomup = false), and include only known-dead deltids. These known-dead
+ * entries are all marked knowndeletable = true directly (typically these are
+ * TIDs from LP_DEAD-marked index tuples), but that isn't strictly required.
+ *
+ * Callers that specify bottomup = true are "bottom-up index deletion"
+ * callers. The considerations for the tableam are more subtle with these
+ * callers because they ask the tableam to perform highly speculative work,
+ * and might only expect the tableam to check a small fraction of all entries.
+ * Caller is not allowed to specify knowndeletable = true for any entry
+ * because everything is highly speculative. Bottom-up caller provides
+ * context and hints to tableam -- see comments below for details on how index
+ * AMs and tableams should coordinate during bottom-up index deletion.
+ *
+ * Simple index deletion callers may ask the tableam to perform speculative
+ * work, too. This is a little like bottom-up deletion, but not too much.
+ * The tableam will only perform speculative work when it's practically free
+ * to do so in passing for simple deletion caller (while always performing
+ * whatever work is needed to enable knowndeletable/LP_DEAD index tuples to
+ * be deleted within index AM). This is the real reason why it's possible for
+ * simple index deletion caller to specify knowndeletable = false up front
+ * (this means "check if it's possible for me to delete corresponding index
+ * tuple when it's cheap to do so in passing"). The index AM should only
+ * include "extra" entries for index tuples whose TIDs point to a table block
+ * that tableam is expected to have to visit anyway (in the event of a block
+ * orientated tableam). The tableam isn't strictly obligated to check these
+ * "extra" TIDs, but a block-based AM should always manage to do so in
+ * practice.
+ *
+ * The final contents of the deltids/status arrays are interesting to callers
+ * that ask tableam to perform speculative work (i.e. when _any_ items have
+ * knowndeletable set to false up front). These index AM callers will
+ * naturally need to consult final state to determine which index tuples are
+ * in fact deletable.
+ *
+ * The index AM can keep track of which index tuple relates to which deltid by
+ * setting idxoffnum (and/or relying on each entry being uniquely identifiable
+ * using tid), which is important when the final contents of the array will
+ * need to be interpreted -- the array can shrink from initial size after
+ * tableam processing and/or have entries in a new order (tableam may sort
+ * deltids array for its own reasons). Bottom-up callers may find that final
+ * ndeltids is 0 on return from call to tableam, in which case no index tuple
+ * deletions are possible. Simple deletion callers can rely on any entries
+ * they know to be deletable appearing in the final array as deletable.
+ */
+typedef struct TM_IndexDelete
+{
+ ItemPointerData tid; /* table TID from index tuple */
+ int16 id; /* Offset into TM_IndexStatus array */
+} TM_IndexDelete;
+
+typedef struct TM_IndexStatus
+{
+ OffsetNumber idxoffnum; /* Index am page offset number */
+ bool knowndeletable; /* Currently known to be deletable? */
+
+ /* Bottom-up index deletion specific fields follow */
+ bool promising; /* Promising (duplicate) index tuple? */
+ int16 freespace; /* Space freed in index if deleted */
+} TM_IndexStatus;
+
+/*
+ * Index AM/tableam coordination is central to the design of bottom-up index
+ * deletion. The index AM provides hints about where to look to the tableam
+ * by marking some entries as "promising". Index AM does this with duplicate
+ * index tuples that are strongly suspected to be old versions left behind by
+ * UPDATEs that did not logically modify indexed values. Index AM may find it
+ * helpful to only mark entries as promising when they're thought to have been
+ * affected by such an UPDATE in the recent past.
+ *
+ * Bottom-up index deletion casts a wide net at first, usually by including
+ * all TIDs on a target index page. It is up to the tableam to worry about
+ * the cost of checking transaction status information. The tableam is in
+ * control, but needs careful guidance from the index AM. Index AM requests
+ * that bottomupfreespace target be met, while tableam measures progress
+ * towards that goal by tallying the per-entry freespace value for known
+ * deletable entries. (All !bottomup callers can just set these space related
+ * fields to zero.)
+ */
+typedef struct TM_IndexDeleteOp
+{
+ bool bottomup; /* Bottom-up (not simple) deletion? */
+ int bottomupfreespace; /* Bottom-up space target */
+
+ /* Mutable per-TID information follows (index AM initializes entries) */
+ int ndeltids; /* Current # of deltids/status elements */
+ TM_IndexDelete *deltids;
+ TM_IndexStatus *status;
+} TM_IndexDeleteOp;
+
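As a concrete illustration of the coordination described above, a bottom-up caller might initialize one deltids/status pair per candidate TID as below. This is a hedged sketch only; htid, idxoffnum, and itupsz stand in for values the index AM already has for the index tuple being considered:

    delstate.deltids[i].tid = *htid;            /* table TID from index tuple */
    delstate.deltids[i].id = i;                 /* links entry to status[] */
    delstate.status[i].idxoffnum = idxoffnum;
    delstate.status[i].knowndeletable = false;  /* never true for bottom-up */
    delstate.status[i].promising = true;        /* suspected duplicate version */
    delstate.status[i].freespace = itupsz;      /* space freed if deleted */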
/* "options" flag bits for table_tuple_insert */
/* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
#define TABLE_INSERT_SKIP_FSM 0x0002
@@ -342,10 +442,9 @@ typedef struct TableAmRoutine
TupleTableSlot *slot,
Snapshot snapshot);
- /* see table_compute_xid_horizon_for_tuples() */
- TransactionId (*compute_xid_horizon_for_tuples) (Relation rel,
- ItemPointerData *items,
- int nitems);
+ /* see table_index_delete_tuples() */
+ TransactionId (*index_delete_tuples) (Relation rel,
+ TM_IndexDeleteOp *delstate);
/* ------------------------------------------------------------------------
@@ -1122,16 +1221,23 @@ table_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
}
/*
- * Compute the newest xid among the tuples pointed to by items. This is used
- * to compute what snapshots to conflict with when replaying WAL records for
- * page-level index vacuums.
+ * Determine which index tuples are safe to delete based on their table TID.
+ *
+ * Determines which entries from index AM caller's TM_IndexDeleteOp state
+ * point to vacuumable table tuples. Entries that are found by tableam to be
+ * vacuumable are naturally safe for index AM to delete, and so get directly
+ * marked as deletable. See comments above TM_IndexDelete and comments above
+ * TM_IndexDeleteOp for full details.
+ *
+ * Returns a latestRemovedXid transaction ID that caller generally places in
+ * its index deletion WAL record. This might be used during subsequent REDO
+ * of the WAL record when in Hot Standby mode -- a recovery conflict for the
+ * index deletion operation might be required on the standby.
*/
static inline TransactionId
-table_compute_xid_horizon_for_tuples(Relation rel,
- ItemPointerData *items,
- int nitems)
+table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
{
- return rel->rd_tableam->compute_xid_horizon_for_tuples(rel, items, nitems);
+ return rel->rd_tableam->index_delete_tuples(rel, delstate);
}
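Putting the pieces together, a simple-deletion caller (bottomup = false) might build its delstate from LP_DEAD-marked items and call through the new entry point as below. A minimal sketch under the structures defined above; ndeaditems and the deaditems array are assumed caller context, and the returned XID would go into the deletion WAL record:

    TM_IndexDeleteOp delstate;
    TransactionId latestRemovedXid;

    delstate.bottomup = false;
    delstate.bottomupfreespace = 0;     /* only meaningful for bottom-up */
    delstate.ndeltids = ndeaditems;
    delstate.deltids = palloc(ndeaditems * sizeof(TM_IndexDelete));
    delstate.status = palloc(ndeaditems * sizeof(TM_IndexStatus));

    for (int i = 0; i < ndeaditems; i++)
    {
        delstate.deltids[i].tid = deaditems[i].htid;
        delstate.deltids[i].id = i;
        delstate.status[i].idxoffnum = deaditems[i].offnum;
        delstate.status[i].knowndeletable = true;   /* LP_DEAD-marked */
        delstate.status[i].promising = false;
        delstate.status[i].freespace = 0;
    }

    latestRemovedXid = table_index_delete_tuples(rel, &delstate);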
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 9585ad17b3c..224cae0246f 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD108 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{