Diffstat (limited to 'src'): 61 files changed, 3440 insertions, 501 deletions
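
The change that recurs throughout the GIN, GiST, hash, and heap hunks below is purely mechanical: PageAddItem() gains one extra trailing boolean, which the index access methods pass as false and the heap WAL-redo paths pass as true. As a reading aid only (page, itup, offnum, rel are generic placeholders, not lifted from any one call site), the before/after call pattern is:

    /* 8.2-style call, as removed by this patch */
    off = PageAddItem(page, (Item) itup, IndexTupleSize(itup), offnum, false);

    /* 8.3-style call, as added by this patch; index AMs pass (..., false, false),
     * while heap_xlog_insert/heap_xlog_update below pass (..., true, true) */
    off = PageAddItem(page, (Item) itup, IndexTupleSize(itup), offnum, false, false);
    if (off == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to index page in \"%s\"",
             RelationGetRelationName(rel));
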
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index a5253da0211..70867ac40ba 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.8 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.9 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ @@ -359,7 +359,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd *prdata = rdata; data.updateBlkno = entryPreparePage(btree, page, off); - placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false); + placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false); if (placed != off) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); @@ -488,7 +488,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); ptr += MAXALIGN(IndexTupleSize(itup)); @@ -563,11 +563,11 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf) page = BufferGetPage(root); itup = ginPageGetLinkItup(lbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); itup = ginPageGetLinkItup(rbuf); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index root page"); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 91f7f3e5f8b..1f26869d646 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.16 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.17 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ @@ -544,7 +544,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 itup = GinFormTuple(&gvs->ginstate, value, GinGetPosting(itup), newN); PageIndexTupleDelete(tmppage, i); - if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false) != i) + if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gvs->index)); diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index db2d6b39336..bf2174c37c7 100644 --- 
a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.8 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.9 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -199,7 +199,7 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record) itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert)); - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); @@ -281,7 +281,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->separator; i++) { - if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); @@ -289,7 +289,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record) for (i = data->separator; i < data->nitem; i++) { - if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); @@ -375,7 +375,7 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record) for (i = 0; i < data->nitem; i++) { - if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in %u/%u/%u", data->node.spcNode, data->node.dbNode, data->node.relNode); itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup))); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index fce9a94ebae..0c1b94d7d38 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.146 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.147 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -366,7 +366,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item 
to index page in \"%s\"", RelationGetRelationName(state->r)); data += IndexTupleSize((IndexTuple) data); } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 6d4f31d53b2..409377d1d14 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.23 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.24 2007/09/20 17:56:30 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -42,7 +42,7 @@ gistfillbuffer(Relation r, Page page, IndexTuple *itup, for (i = 0; i < len; i++) { l = PageAddItem(page, (Item) itup[i], IndexTupleSize(itup[i]), - off, false); + off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(r)); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 0abd0197ad3..212995e7c57 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.31 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.32 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -201,7 +201,7 @@ vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon, data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { - if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index)); data += IndexTupleSize((IndexTuple) data); } diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index c82ad0ad9fb..8cbf0294acf 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.46 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.47 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -200,7 +200,7 @@ _hash_pgaddtup(Relation rel, page = BufferGetPage(buf); itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); - if (PageAddItem(page, (Item) itup, itemsize, itup_off, false) + if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 7e87f308b26..e4ea24a62d1 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.59 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.60 2007/09/20 17:56:30 tgl Exp $ * * NOTES * Overflow 
pages look like ordinary relation pages. @@ -684,7 +684,7 @@ _hash_squeezebucket(Relation rel, * we have found room so insert on the "write" page. */ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage)); - if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false) + if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 4b1450926d3..807dbed8a8c 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.69 2007/09/12 22:10:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.70 2007/09/20 17:56:30 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -830,7 +830,7 @@ _hash_splitbucket(Relation rel, } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); - if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false) + if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile index ac2401232bb..aff2847bab5 100644 --- a/src/backend/access/heap/Makefile +++ b/src/backend/access/heap/Makefile @@ -4,7 +4,7 @@ # Makefile for access/heap # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.16 2007/06/08 18:23:52 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.17 2007/09/20 17:56:30 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/access/heap top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o +OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o all: SUBSYS.o diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT new file mode 100644 index 00000000000..8cf0fa44de6 --- /dev/null +++ b/src/backend/access/heap/README.HOT @@ -0,0 +1,489 @@ +$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.1 2007/09/20 17:56:30 tgl Exp $ + + Heap Only Tuples (HOT) + +Introduction +------------ + +The Heap Only Tuple (HOT) feature eliminates redundant index entries and +allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples +without performing a table-wide vacuum. It does this by allowing +single-page vacuuming, also called "defragmentation". + +Note: there is a Glossary at the end of this document that may be helpful +for first-time readers. + + +Technical Challenges +-------------------- + +Page-at-a-time vacuuming is normally impractical because of the costs of +finding and removing the index entries that link to the tuples to be +reclaimed. Standard vacuuming scans the indexes to ensure all such index +entries are removed, amortizing the index scan cost across as many dead +tuples as possible; this approach does not scale down well to the case of +reclaiming just a few tuples. In principle one could recompute the index +keys and do standard index searches to find the index entries, but this is +risky in the presence of possibly-buggy user-defined functions in +functional indexes. 
An allegedly immutable function that in fact is not +immutable might prevent us from re-finding an index entry (and we cannot +throw an error for not finding it, in view of the fact that dead index +entries are sometimes reclaimed early). That would lead to a seriously +corrupt index, in the form of entries pointing to tuple slots that by now +contain some unrelated content. In any case we would prefer to be able +to do vacuuming without invoking any user-written code. + +HOT solves this problem for a restricted but useful special case: +where a tuple is repeatedly updated in ways that do not change its +indexed columns. (Here, "indexed column" means any column referenced +at all in an index definition, including for example columns that are +tested in a partial-index predicate but are not stored in the index.) + +An additional property of HOT is that it reduces index size by avoiding +the creation of identically-keyed index entries. This improves search +speeds. + + +Update Chains With a Single Index Entry +--------------------------------------- + +Without HOT, every version of a row in an update chain has its own index +entries, even if all indexed columns are the same. With HOT, a new tuple +placed on the same page and with all indexed columns the same as its +parent row version does not get new index entries. This means there is +only one index entry for the entire update chain on the heap page. +An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag. +The prior row version is marked HEAP_HOT_UPDATED, and (as always in an +update chain) its t_ctid field links forward to the newer version. + +For example: + + Index points to 1 + lp [1] [2] + + [111111111]->[2222222222] + +In the above diagram, the index points to line pointer 1, and tuple 1 is +marked as HEAP_HOT_UPDATED. Tuple 2 is a HOT tuple, meaning it has +no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE. +Although tuple 2 is not directly referenced by the index, it can still be +found by an index search: after traversing from the index to tuple 1, +the index search proceeds forward to child tuples as long as it sees the +HEAP_HOT_UPDATED flag set. Since we restrict the HOT chain to lie within +a single page, this requires no additional page fetches and doesn't +introduce much performance penalty. + +Eventually, tuple 1 will no longer be visible to any transaction. +At that point its space could be reclaimed, but its line pointer cannot, +since the index still links to that line pointer and we still need to +be able to find tuple 2 in an index search. HOT handles this by turning +line pointer 1 into a "redirecting line pointer", which links to tuple 2 +but has no actual tuple attached. This state of affairs looks like + + Index points to 1 + lp [1]->[2] + + [2222222222] + +If now the row is updated again, to version 3, the page looks like this: + + Index points to 1 + lp [1]->[2] [3] + + [2222222222]->[3333333333] + +At some later time when no transaction can see tuple 2 in its snapshot, +tuple 2 and its line pointer can be pruned entirely: + + Index points to 1 + lp [1]------>[3] + + [3333333333] + +This is safe because no index entry points to line pointer 2. Subsequent +insertions into the page can now recycle both line pointer 2 and the +space formerly used by tuple 2. + +If an update changes any indexed column, or there is not room on the +same page for the new tuple, then the HOT chain ends: the last member +has a regular t_ctid link to the next version and is not marked +HEAP_HOT_UPDATED. 
(In principle we could continue a HOT chain across +pages, but this would destroy the desired property of being able to +reclaim space with just page-local manipulations. Anyway, we don't +want to have to chase through multiple heap pages to get from an index +entry to the desired tuple, so it seems better to create a new index +entry for the new tuple.) If further updates occur, the next version +could become the root of a new HOT chain. + +Line pointer 1 has to remain as long as there is any non-dead member of +the chain on the page. When there is not, it is marked "dead". +This lets us reclaim the last child line pointer and associated tuple +immediately. The next regular VACUUM pass can reclaim the index entries +pointing at the line pointer and then the line pointer itself. Since a +line pointer is small compared to a tuple, this does not represent an +undue space cost. + +Note: we can use a "dead" line pointer for any DELETEd tuple, +whether it was part of a HOT chain or not. This allows space reclamation +in advance of running VACUUM for plain DELETEs as well as HOT updates. + +The requirement for doing a HOT update is that none of the indexed +columns are changed. This is checked at execution time by comparing the +binary representation of the old and new values. We insist on bitwise +equality rather than using datatype-specific equality routines. The +main reason to avoid the latter is that there might be multiple notions +of equality for a datatype, and we don't know exactly which one is +relevant for the indexes at hand. We assume that bitwise equality +guarantees equality for all purposes. + + +Abort Cases +----------- + +If a heap-only tuple's xmin is aborted, then it can be removed immediately: +it was never visible to any other transaction, and all descendant row +versions must be aborted as well. Therefore we need not consider it part +of a HOT chain. By the same token, if a HOT-updated tuple's xmax is +aborted, there is no need to follow the chain link. However, there is a +race condition here: the transaction that did the HOT update might abort +between the time we inspect the HOT-updated tuple and the time we reach +the descendant heap-only tuple. It is conceivable that someone prunes +the heap-only tuple before that, and even conceivable that the line pointer +is re-used for another purpose. Therefore, when following a HOT chain, +it is always necessary to be prepared for the possibility that the +linked-to item pointer is unused, dead, or redirected; and if it is a +normal item pointer, we still have to check that XMIN of the tuple matches +the XMAX of the tuple we left. Otherwise we should assume that we have +come to the end of the HOT chain. Note that this sort of XMIN/XMAX +matching is required when following ordinary update chains anyway. + +(Early versions of the HOT code assumed that holding pin on the page +buffer while following a HOT link would prevent this type of problem, +but checking XMIN/XMAX matching is a much more robust solution.) + + +Index/Sequential Scans +---------------------- + +When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose +xmax is not aborted, we need to follow its t_ctid link and check that +entry as well; possibly repeatedly until we reach the end of the HOT +chain. (When using an MVCC snapshot it is possible to optimize this a +bit: there can be at most one visible tuple in the chain, so we can stop +when we find it. This rule does not work for non-MVCC snapshots, though.) 
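
The chain-following rules just described (follow a redirect only at the chain start, verify that each member's xmin matches the previous member's xmax, stop at the first tuple visible under the snapshot) are implemented by heap_hot_search_buffer(), added to heapam.c later in this diff. A condensed sketch of that loop, with the all_dead bookkeeping and some sanity checks omitted for brevity:

    /* dp is the page, tid the root TID, buffer is pinned and share-locked */
    offnum = ItemPointerGetOffsetNumber(tid);
    at_chain_start = true;
    prev_xmax = InvalidTransactionId;

    for (;;)
    {
        ItemId        lp = PageGetItemId(dp, offnum);
        HeapTupleData heapTuple;

        if (!ItemIdIsNormal(lp))
        {
            /* a redirect is only expected at the start of the chain */
            if (ItemIdIsRedirected(lp) && at_chain_start)
            {
                offnum = ItemIdGetRedirect(lp);
                at_chain_start = false;
                continue;
            }
            break;                  /* unused or dead: end of chain */
        }

        heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
        heapTuple.t_len = ItemIdGetLength(lp);

        /* xmin must match the prior member's xmax, else the chain is broken */
        if (TransactionIdIsValid(prev_xmax) &&
            !TransactionIdEquals(prev_xmax,
                                 HeapTupleHeaderGetXmin(heapTuple.t_data)))
            break;

        if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
        {
            ItemPointerSetOffsetNumber(tid, offnum);
            return true;            /* found the visible chain member */
        }

        if (!HeapTupleIsHotUpdated(&heapTuple))
            break;                  /* end of chain */

        /* advance to the next member, which must be on the same page */
        offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
        at_chain_start = false;
        prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data);
    }
    return false;
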
+ +Sequential scans do not need to pay attention to the HOT links because +they scan every item pointer on the page anyway. The same goes for a +bitmap heap scan with a lossy bitmap. + + +Pruning +------- + +HOT pruning means updating item pointers so that HOT chains are +reduced in length, by collapsing out line pointers for intermediate dead +tuples. Although this makes those line pointers available for re-use, +it does not immediately make the space occupied by their tuples available. + + +Defragmentation +--------------- + +Defragmentation centralizes unused space. After we have converted root +line pointers to redirected line pointers and pruned away any dead +intermediate line pointers, the tuples they linked to are free space. +But unless that space is adjacent to the central "hole" on the page +(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion. +Defragmentation moves the surviving tuples to coalesce all the free +space into one "hole". This is done with the same PageRepairFragmentation +function that regular VACUUM uses. + + +When can/should we prune or defragment? +--------------------------------------- + +This is the most interesting question in HOT implementation, since there +is no simple right answer: we must use heuristics to determine when it's +most efficient to perform pruning and/or defragmenting. + +We cannot prune or defragment unless we can get a "buffer cleanup lock" +on the target page; otherwise, pruning might destroy line pointers that +other backends have live references to, and defragmenting might move +tuples that other backends have live pointers to. Thus the general +approach must be to heuristically decide if we should try to prune +or defragment, and if so try to acquire the buffer cleanup lock without +blocking. If we succeed we can proceed with our housekeeping work. +If we cannot get the lock (which should not happen often, except under +very heavy contention) then the housekeeping has to be postponed till +some other time. The worst-case consequence of this is only that an +UPDATE cannot be made HOT but has to link to a new tuple version placed on +some other page, for lack of centralized space on the original page. + +Ideally we would do defragmenting only when we are about to attempt +heap_update on a HOT-safe tuple. The difficulty with this approach +is that the update query has certainly got a pin on the old tuple, and +therefore our attempt to acquire a buffer cleanup lock will always fail. +(This corresponds to the idea that we don't want to move the old tuple +out from under where the query's HeapTuple pointer points. It might +be possible to finesse that, but it seems fragile.) + +Pruning, however, is potentially useful even when we are not about to +insert a new tuple, since shortening a HOT chain reduces the cost of +subsequent index searches. However it is unclear that this gain is +large enough to accept any extra maintenance burden for. + +The currently planned heuristic is to prune and defrag when first accessing +a page that potentially has prunable tuples (flagged by the PD_PRUNABLE +page hint bit) and that either has free space less than MAX(fillfactor +target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to +find enough free space to store an updated tuple version. (These rules +are subject to change.) 
+ +We have effectively implemented the "truncate dead tuples to just line +pointer" idea that has been proposed and rejected before because of fear +of line pointer bloat: we might end up with huge numbers of line pointers +and just a few actual tuples on a page. To limit the damage in the worst +case, and to keep various work arrays as well as the bitmaps in bitmap +scans reasonably sized, the maximum number of line pointers per page +is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that +could fit without HOT pruning). + + +VACUUM +------ + +There is little change to regular vacuum. It performs pruning to remove +dead heap-only tuples, and cleans up any dead line pointers as if they were +regular dead tuples. + + +VACUUM FULL +----------- + +VACUUM FULL performs an extra operation of collapsing out redirecting line +pointers, by moving the first non-DEAD tuple of each HOT chain to the root +position and clearing its heap-only-tuple flag. This effectively changes +the user-visible CTID of that tuple. This would be completely unsafe +during normal concurrent operation, but since VACUUM FULL takes full +exclusive lock on the table, it should be OK. (Note that VACUUM FULL has +always felt free to change tuples' CTIDs by moving them across pages.) +Eliminating redirection links means that the main body of VACUUM FULL +doesn't have to deal with them, which seems a good thing since VACUUM FULL +is horrendously complex already. + +When VACUUM FULL tries to move tuple chains, it does not distinguish regular +and heap-only tuples, but just moves both types the same. This is OK because +it will move the entire non-DEAD tail of an update chain and remove index +entries for each item moved. At worst, we'll uselessly search for index +entries matching the heap-only tuples included in the move. + + +Statistics +---------- + +Currently, we count HOT updates the same as cold updates for statistics +purposes, though there is an additional per-table counter that counts +only HOT updates. When a page pruning operation is able to remove a +physical tuple by eliminating an intermediate heap-only tuple or +replacing a physical root tuple by a redirect pointer, a decrement in +the table's number of dead tuples is reported to pgstats, which may +postpone autovacuuming. Note that we do not count replacing a root tuple +by a DEAD item pointer as decrementing n_dead_tuples; we still want +autovacuum to run to clean up the index entries and DEAD item. + +This area probably needs further work ... + + +CREATE INDEX +------------ + +CREATE INDEX presents a problem for HOT updates. While the existing HOT +chains all have the same index values for existing indexes, the columns +in the new index might change within a pre-existing HOT chain, creating +a "broken" chain that can't be indexed properly. + +To address this issue, regular (non-concurrent) CREATE INDEX makes the +new index usable only by transactions newer than the CREATE INDEX +command. This prevents transactions that can see the inconsistent HOT +chains from trying to use the new index and getting incorrect results. +New transactions can only see the rows visible after the index was +created, hence the HOT chains are consistent for them. + +Entries in the new index point to root tuples (tuples with current index +pointers) so that our index uses the same index pointers as all other +indexes on the table. However the row we want to index is actually at +the *end* of the chain, ie, the most recent live tuple on the HOT chain. 
+That is the one we compute the index entry values for, but the TID +we put into the index is that of the root tuple. Since transactions that +will be allowed to use the new index cannot see any of the older tuple +versions in the chain, the fact that they might not match the index entry +isn't a problem. (Such transactions will check the tuple visibility +information of the older versions and ignore them, without ever looking at +their contents, so the content inconsistency is OK.) Subsequent updates +to the live tuple will be allowed to extend the HOT chain only if they are +HOT-safe for all the indexes. + +Because we have ShareLock on the table, any DELETE_IN_PROGRESS or +INSERT_IN_PROGRESS tuples should have come from our own transaction. +Therefore we can consider them committed since if the CREATE INDEX +commits, they will be committed, and if it aborts the index is discarded. +An exception to this is that early lock release is customary for system +catalog updates, and so we might find such tuples when reindexing a system +catalog. In that case we deal with it by waiting for the source +transaction to commit or roll back. (We could do that for user tables +too, but since the case is unexpected we prefer to throw an error.) + +Practically, we prevent old transactions from using the new index by +setting pg_index.indcheckxmin to TRUE. Queries are allowed to use such an +index only after pg_index.xmin is below their TransactionXmin horizon, +thereby ensuring that any incompatible rows in HOT chains are dead to them. +(pg_index.xmin will be the XID of the CREATE INDEX transaction. The reason +for using xmin rather than a normal column is that the regular vacuum +freezing mechanism will take care of converting xmin to FrozenTransactionId +before it can wrap around.) + +This means in particular that the transaction creating the index will be +unable to use the index. We alleviate that problem somewhat by not setting +indcheckxmin unless the table actually contains HOT chains with +RECENTLY_DEAD members. (In 8.4 we may be able to improve the situation, +at least for non-serializable transactions, because we expect to be able to +advance TransactionXmin intratransaction.) + +Another unpleasant consequence is that it is now risky to use SnapshotAny +in an index scan: if the index was created more recently than the last +vacuum, it's possible that some of the visited tuples do not match the +index entry they are linked to. This does not seem to be a fatal +objection, since there are few users of SnapshotAny and most use seqscans. +The only exception at this writing is CLUSTER, which is okay because it +does not require perfect ordering of the indexscan readout (and especially +so because CLUSTER tends to write recently-dead tuples out of order anyway). + + +CREATE INDEX CONCURRENTLY +------------------------- + +In the concurrent case we must take a different approach. We create the +pg_index entry immediately, before we scan the table. The pg_index entry +is marked as "not ready for inserts". Then we commit and wait for any +transactions which have the table open to finish. This ensures that no +new HOT updates will change the key value for our new index, because all +transactions will see the existence of the index and will respect its +constraint on which updates can be HOT. Other transactions must include +such an index when determining HOT-safety of updates, even though they +must ignore it for both insertion and searching purposes. 
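
The "determining HOT-safety of updates" step referred to here is the binary column comparison described earlier and added later in this diff as HeapSatisfiesHOTUpdate(). Condensed, the check walks the set of indexed columns (obtained from RelationGetIndexAttrBitmap and consumed destructively by bms_first_member) and requires bitwise equality for each:

    while ((attrnum = bms_first_member(hot_attrs)) >= 0)
    {
        /* adjust for system attributes */
        attrnum += FirstLowInvalidHeapAttributeNumber;

        if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
                                    oldtup, newtup))
            return false;       /* an indexed column changed: not HOT-safe */
    }
    return true;                /* bitwise-equal on every indexed column */
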
+ +We must do this to avoid making incorrect index entries. For example, +suppose we are building an index on column X and we make an index entry for +a non-HOT tuple with X=1. Then some other backend, unaware that X is an +indexed column, HOT-updates the row to have X=2, and commits. We now have +an index entry for X=1 pointing at a HOT chain whose live row has X=2. +We could make an index entry with X=2 during the validation pass, but +there is no nice way to get rid of the wrong entry with X=1. So we must +have the HOT-safety property enforced before we start to build the new +index. + +After waiting for transactions which had the table open, we build the index +for all rows that are valid in a fresh snapshot. Any tuples visible in the +snapshot will have only valid forward-growing HOT chains. (They might have +older HOT updates behind them which are broken, but this is OK for the same +reason it's OK in a regular index build.) As above, we point the index +entry at the root of the HOT-update chain but we use the key value from the +live tuple. + +We mark the index open for inserts (but still not ready for reads) then +we again wait for transactions which have the table open. Then we take +a second reference snapshot and validate the index. This searches for +tuples missing from the index, and inserts any missing ones. Again, +the index entries have to have TIDs equal to HOT-chain root TIDs, but +the value to be inserted is the one from the live tuple. + +Then we wait until every transaction that could have a snapshot older than +the second reference snapshot is finished. This ensures that nobody is +alive any longer who could need to see any tuples that might be missing +from the index, as well as ensuring that no one can see any inconsistent +rows in a broken HOT chain (the first condition is stronger than the +second). Finally, we can mark the index valid for searches. + + +Limitations and Restrictions +---------------------------- + +It is worth noting that HOT forever forecloses alternative approaches +to vacuuming, specifically the recompute-the-index-keys approach alluded +to in Technical Challenges above. It'll be tough to recompute the index +keys for a root line pointer you don't have data for anymore ... + + +Glossary +-------- + +Broken HOT Chain + + A HOT chain in which the key value for an index has changed. + + This is not allowed to occur normally but if a new index is created + it can happen. In that case various strategies are used to ensure + that no transaction for which the older tuples are visible can + use the index. + +Cold update + + A normal, non-HOT update, in which index entries are made for + the new version of the tuple. + +Dead line pointer + + A stub line pointer, that does not point to anything, but cannot + be removed or reused yet because there are index pointers to it. + Semantically same as a dead tuple. It has state LP_DEAD. + +Heap-only tuple + + A heap tuple with no index pointers, which can only be reached + from indexes indirectly through its ancestral root tuple. + Marked with HEAP_ONLY_TUPLE flag. + +HOT-safe + + A proposed tuple update is said to be HOT-safe if it changes + none of the tuple's indexed columns. It will only become an + actual HOT update if we can find room on the same page for + the new tuple version. + +HOT update + + An UPDATE where the new tuple becomes a heap-only tuple, and no + new index entries are made. + +HOT-updated tuple + + An updated tuple, for which the next tuple in the chain is a + heap-only tuple. 
Marked with HEAP_HOT_UPDATED flag. + +Indexed column + + A column used in an index definition. The column might not + actually be stored in the index --- it could be used in a + functional index's expression, or used in a partial index + predicate. HOT treats all these cases alike. + +Redirecting line pointer + + A line pointer that points to another line pointer and has no + associated tuple. It has the special lp_flags state LP_REDIRECT, + and lp_off is the OffsetNumber of the line pointer it links to. + This is used when a root tuple becomes dead but we cannot prune + the line pointer because there are non-dead heap-only tuples + further down the chain. + +Root tuple + + The first tuple in a HOT update chain; the one that indexes point to. + +Update chain + + A chain of updated tuples, in which each tuple's ctid points to + the next tuple in the chain. A HOT update chain is an update chain + (or portion of an update chain) that consists of a root tuple and + one or more heap-only tuples. A complete update chain can contain + both HOT and non-HOT (cold) updated tuples. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 09a70d813f7..d5a2f9a43d1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.240 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.241 2007/09/20 17:56:30 tgl Exp $ * * * INTERFACE ROUTINES @@ -52,6 +52,7 @@ #include "pgstat.h" #include "storage/procarray.h" #include "storage/smgr.h" +#include "utils/datum.h" #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/relcache.h" @@ -64,6 +65,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool is_bitmapscan); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move); +static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, + HeapTuple oldtup, HeapTuple newtup); /* ---------------------------------------------------------------- @@ -184,6 +187,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page) snapshot = scan->rs_snapshot; /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + + /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be * visible are guaranteed good as long as we hold the buffer pin. @@ -316,7 +324,7 @@ heapgettup(HeapScanDesc scan, * forward scanners. */ scan->rs_syncscan = false; - /* start from last page of the scan */ + /* start from last page of the scan */ if (scan->rs_startblock > 0) page = scan->rs_startblock - 1; else @@ -368,6 +376,7 @@ heapgettup(HeapScanDesc scan, dp = (Page) BufferGetPage(scan->rs_cbuf); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); @@ -583,7 +592,7 @@ heapgettup_pagemode(HeapScanDesc scan, * forward scanners. 
*/ scan->rs_syncscan = false; - /* start from last page of the scan */ + /* start from last page of the scan */ if (scan->rs_startblock > 0) page = scan->rs_startblock - 1; else @@ -632,6 +641,7 @@ heapgettup_pagemode(HeapScanDesc scan, dp = (Page) BufferGetPage(scan->rs_cbuf); lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self)); lpp = PageGetItemId(dp, lineoff); + Assert(ItemIdIsNormal(lpp)); tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp); tuple->t_len = ItemIdGetLength(lpp); @@ -1246,6 +1256,9 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) * for statistical purposes. (This could be the heap rel itself, an * associated index, or NULL to not count the fetch at all.) * + * heap_fetch does not follow HOT chains: only the exact TID requested will + * be fetched. + * * It is somewhat inconsistent that we ereport() on invalid block number but * return false on invalid item number. There are a couple of reasons though. * One is that the caller can relatively easily check the block number for @@ -1390,6 +1403,143 @@ heap_release_fetch(Relation relation, } /* + * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot + * + * On entry, *tid is the TID of a tuple (either a simple tuple, or the root + * of a HOT chain), and buffer is the buffer holding this tuple. We search + * for the first chain member satisfying the given snapshot. If one is + * found, we update *tid to reference that tuple's offset number, and + * return TRUE. If no match, return FALSE without modifying *tid. + * + * If all_dead is not NULL, we check non-visible tuples to see if they are + * globally dead; *all_dead is set TRUE if all members of the HOT chain + * are vacuumable, FALSE if not. + * + * Unlike heap_fetch, the caller must already have pin and (at least) share + * lock on the buffer; it is still pinned/locked at exit. Also unlike + * heap_fetch, we do not report any pgstats count; caller may do so if wanted. + */ +bool +heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot, + bool *all_dead) +{ + Page dp = (Page) BufferGetPage(buffer); + TransactionId prev_xmax = InvalidTransactionId; + OffsetNumber offnum; + bool at_chain_start; + + if (all_dead) + *all_dead = true; + + Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = true; + + /* Scan through possible multiple members of HOT-chain */ + for (;;) + { + ItemId lp; + HeapTupleData heapTuple; + + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + break; + + lp = PageGetItemId(dp, offnum); + + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } + + heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple.t_len = ItemIdGetLength(lp); + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. + */ + if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is broken. 
+ */ + if (TransactionIdIsValid(prev_xmax) && + !TransactionIdEquals(prev_xmax, + HeapTupleHeaderGetXmin(heapTuple.t_data))) + break; + + /* If it's visible per the snapshot, we must return it */ + if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer)) + { + ItemPointerSetOffsetNumber(tid, offnum); + if (all_dead) + *all_dead = false; + return true; + } + + /* + * If we can't see it, maybe no one else can either. At caller + * request, check whether all chain members are dead to all + * transactions. + */ + if (all_dead && *all_dead && + HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin, + buffer) != HEAPTUPLE_DEAD) + *all_dead = false; + + /* + * Check to see if HOT chain continues past this tuple; if so + * fetch the next offnum and loop around. + */ + if (HeapTupleIsHotUpdated(&heapTuple)) + { + Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) == + ItemPointerGetBlockNumber(tid)); + offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid); + at_chain_start = false; + prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data); + } + else + break; /* end of chain */ + } + + return false; +} + +/* + * heap_hot_search - search HOT chain for tuple satisfying snapshot + * + * This has the same API as heap_hot_search_buffer, except that the caller + * does not provide the buffer containing the page, rather we access it + * locally. + */ +bool +heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot, + bool *all_dead) +{ + bool result; + Buffer buffer; + + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + return result; +} + +/* * heap_get_latest_tid - get the latest tid of a specified tuple * * Actually, this gets the latest version that is visible according to @@ -1594,6 +1744,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); tup->t_data->t_infomask |= HEAP_XMAX_INVALID; HeapTupleHeaderSetXmin(tup->t_data, xid); HeapTupleHeaderSetCmin(tup->t_data, cid); @@ -1628,6 +1779,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, RelationPutHeapTuple(relation, buffer, heaptup); + /* + * XXX Should we set PageSetPrunable on this page ? + * + * The inserting transaction may eventually abort thus making this tuple + * DEAD and hence available for pruning. Though we don't want to optimize + * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the + * aborted tuple will never be pruned until next vacuum is triggered. + * + * If you do add PageSetPrunable here, add it in heap_xlog_insert too. + */ + MarkBufferDirty(buffer); /* XLOG stuff */ @@ -1904,12 +2066,21 @@ l1: START_CRIT_SECTION(); + /* + * If this transaction commits, the tuple will become DEAD sooner or + * later. Set hint bit that this page is a candidate for pruning. If + * the transaction finally aborts, the subsequent page pruning will be + * a no-op and the hint will be cleared. 
+ */ + PageSetPrunable((Page) dp); + /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleHeaderClearHotUpdated(tp.t_data); HeapTupleHeaderSetXmax(tp.t_data, xid); HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); /* Make sure there is no forward chain link in t_ctid */ @@ -2045,7 +2216,8 @@ simple_heap_delete(Relation relation, ItemPointer tid) * * On success, the header fields of *newtup are updated to match the new * stored tuple; in particular, newtup->t_self is set to the TID where the - * new tuple was inserted. However, any TOAST changes in the new tuple's + * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT + * update was done. However, any TOAST changes in the new tuple's * data are not reflected into *newtup. * * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. @@ -2060,6 +2232,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); + Bitmapset *hot_attrs; ItemId lp; HeapTupleData oldtup; HeapTuple heaptup; @@ -2072,9 +2245,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, pagefree; bool have_tuple_lock = false; bool iscombo; + bool use_hot_update = false; Assert(ItemPointerIsValid(otid)); + /* + * Fetch the list of attributes to be checked for HOT update. This is + * wasted effort if we fail to update or have to put the new tuple on + * a different page. But we must compute the list before obtaining + * buffer lock --- in the worst case, if we are doing an update on one + * of the relevant system catalogs, we could deadlock if we try to + * fetch the list later. In any case, the relcache caches the data + * so this is usually pretty cheap. + * + * Note that we get a copy here, so we need not worry about relcache + * flush happening midway through. + */ + hot_attrs = RelationGetIndexAttrBitmap(relation); + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid)); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -2208,6 +2396,7 @@ l2: UnlockReleaseBuffer(buffer); if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + bms_free(hot_attrs); return result; } @@ -2227,6 +2416,7 @@ l2: } newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED); HeapTupleHeaderSetXmin(newtup->t_data, xid); HeapTupleHeaderSetCmin(newtup->t_data, cid); @@ -2261,17 +2451,20 @@ l2: HeapTupleHasExternal(newtup) || newtup->t_len > TOAST_TUPLE_THRESHOLD); - pagefree = PageGetFreeSpace((Page) dp); + pagefree = PageGetHeapFreeSpace((Page) dp); newtupsize = MAXALIGN(newtup->t_len); if (need_toast || newtupsize > pagefree) { + /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleClearHotUpdated(&oldtup); + /* ... and store info about transaction updating this tuple */ HeapTupleHeaderSetXmax(oldtup.t_data, xid); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); /* temporarily make it look not-updated */ @@ -2324,7 +2517,7 @@ l2: /* Re-acquire the lock on the old tuple's page. 
*/ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* Re-check using the up-to-date free space */ - pagefree = PageGetFreeSpace((Page) dp); + pagefree = PageGetHeapFreeSpace((Page) dp); if (newtupsize > pagefree) { /* @@ -2357,18 +2550,66 @@ l2: * one pin is held. */ + if (newbuf == buffer) + { + /* + * Since the new tuple is going into the same page, we might be able + * to do a HOT update. Check if any of the index columns have been + * changed. If not, then HOT update is possible. + */ + if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup)) + use_hot_update = true; + } + else + { + /* Set a hint that the old page could use prune/defrag */ + PageSetFull(dp); + } + /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); + /* + * If this transaction commits, the old tuple will become DEAD sooner or + * later. Set hint bit that this page is a candidate for pruning. If + * the transaction finally aborts, the subsequent page pruning will be + * a no-op and the hint will be cleared. + * + * XXX Should we set hint on newbuf as well? If the transaction + * aborts, there would be a prunable tuple in the newbuf; but for now + * we choose not to optimize for aborts. Note that heap_xlog_update + * must be kept in sync if this changes. + */ + PageSetPrunable(dp); + + if (use_hot_update) + { + /* Mark the old tuple as HOT-updated */ + HeapTupleSetHotUpdated(&oldtup); + /* And mark the new tuple as heap-only */ + HeapTupleSetHeapOnly(heaptup); + /* Mark the caller's copy too, in case different from heaptup */ + HeapTupleSetHeapOnly(newtup); + } + else + { + /* Make sure tuples are correctly marked as not-HOT */ + HeapTupleClearHotUpdated(&oldtup); + HeapTupleClearHeapOnly(heaptup); + HeapTupleClearHeapOnly(newtup); + } + RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */ if (!already_marked) { + /* Clear obsolete visibility flags ... */ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + /* ... and store info about transaction updating this tuple */ HeapTupleHeaderSetXmax(oldtup.t_data, xid); HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); } @@ -2427,7 +2668,7 @@ l2: if (have_tuple_lock) UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); - pgstat_count_heap_update(relation); + pgstat_count_heap_update(relation, use_hot_update); /* * If heaptup is a private copy, release it. Don't forget to copy t_self @@ -2439,10 +2680,120 @@ l2: heap_freetuple(heaptup); } + bms_free(hot_attrs); + return HeapTupleMayBeUpdated; } /* + * Check if the specified attribute's value is same in both given tuples. + * Subroutine for HeapSatisfiesHOTUpdate. + */ +static bool +heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, + HeapTuple tup1, HeapTuple tup2) +{ + Datum value1, value2; + bool isnull1, isnull2; + Form_pg_attribute att; + + /* + * If it's a whole-tuple reference, say "not equal". It's not really + * worth supporting this case, since it could only succeed after a + * no-op update, which is hardly a case worth optimizing for. + */ + if (attrnum == 0) + return false; + + /* + * Likewise, automatically say "not equal" for any system attribute + * other than OID and tableOID; we cannot expect these to be consistent + * in a HOT chain, or even to be set correctly yet in the new tuple. + */ + if (attrnum < 0) + { + if (attrnum != ObjectIdAttributeNumber && + attrnum != TableOidAttributeNumber) + return false; + } + + /* + * Extract the corresponding values. 
XXX this is pretty inefficient + * if there are many indexed columns. Should HeapSatisfiesHOTUpdate + * do a single heap_deform_tuple call on each tuple, instead? But + * that doesn't work for system columns ... + */ + value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1); + value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2); + + /* + * If one value is NULL and other is not, then they are certainly + * not equal + */ + if (isnull1 != isnull2) + return false; + + /* + * If both are NULL, they can be considered equal. + */ + if (isnull1) + return true; + + /* + * We do simple binary comparison of the two datums. This may be overly + * strict because there can be multiple binary representations for the + * same logical value. But we should be OK as long as there are no false + * positives. Using a type-specific equality operator is messy because + * there could be multiple notions of equality in different operator + * classes; furthermore, we cannot safely invoke user-defined functions + * while holding exclusive buffer lock. + */ + if (attrnum <= 0) + { + /* The only allowed system columns are OIDs, so do this */ + return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); + } + else + { + Assert(attrnum <= tupdesc->natts); + att = tupdesc->attrs[attrnum - 1]; + return datumIsEqual(value1, value2, att->attbyval, att->attlen); + } +} + +/* + * Check if the old and new tuples represent a HOT-safe update. To be able + * to do a HOT update, we must not have changed any columns used in index + * definitions. + * + * The set of attributes to be checked is passed in (we dare not try to + * compute it while holding exclusive buffer lock...) NOTE that hot_attrs + * is destructively modified! That is OK since this is invoked at most once + * by heap_update(). + * + * Returns true if safe to do HOT update. + */ +static bool +HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, + HeapTuple oldtup, HeapTuple newtup) +{ + int attrnum; + + while ((attrnum = bms_first_member(hot_attrs)) >= 0) + { + /* Adjust for system attributes */ + attrnum += FirstLowInvalidHeapAttributeNumber; + + /* If the attribute value has changed, we can't do HOT update */ + if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum, + oldtup, newtup)) + return false; + } + + return true; +} + +/* * simple_heap_update - replace a tuple * * This routine may be used to update a tuple when concurrent updates of @@ -2865,6 +3216,7 @@ l3: * avoids possibly generating a useless combo CID. */ tuple->t_data->t_infomask = new_infomask; + HeapTupleHeaderClearHotUpdated(tuple->t_data); HeapTupleHeaderSetXmax(tuple->t_data, xid); /* Make sure there is no forward chain link in t_ctid */ tuple->t_data->t_ctid = *tid; @@ -3110,6 +3462,7 @@ recheck_xmax: */ tuple->t_infomask &= ~HEAP_XMAX_COMMITTED; tuple->t_infomask |= HEAP_XMAX_INVALID; + HeapTupleHeaderClearHotUpdated(tuple); changed = true; } } @@ -3245,21 +3598,29 @@ heap_restrpos(HeapScanDesc scan) * Perform XLogInsert for a heap-clean operation. Caller must already * have modified the buffer and marked it dirty. * - * Note: for historical reasons, the entries in the unused[] array should - * be zero-based tuple indexes, not one-based. + * Note: prior to Postgres 8.3, the entries in the nowunused[] array were + * zero-based tuple indexes. Now they are one-based like other uses + * of OffsetNumber. 
*/ XLogRecPtr -log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) +log_heap_clean(Relation reln, Buffer buffer, + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused, + bool redirect_move) { xl_heap_clean xlrec; + uint8 info; XLogRecPtr recptr; - XLogRecData rdata[2]; + XLogRecData rdata[4]; /* Caller should not call me on a temp relation */ Assert(!reln->rd_istemp); xlrec.node = reln->rd_node; xlrec.block = BufferGetBlockNumber(buffer); + xlrec.nredirected = nredirected; + xlrec.ndead = ndead; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapClean; @@ -3267,14 +3628,17 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) rdata[0].next = &(rdata[1]); /* - * The unused-offsets array is not actually in the buffer, but pretend - * that it is. When XLogInsert stores the whole buffer, the offsets array - * need not be stored too. + * The OffsetNumber arrays are not actually in the buffer, but we pretend + * that they are. When XLogInsert stores the whole buffer, the offset + * arrays need not be stored too. Note that even if all three arrays + * are empty, we want to expose the buffer as a candidate for whole-page + * storage, since this record type implies a defragmentation operation + * even if no item pointers changed state. */ - if (uncnt > 0) + if (nredirected > 0) { - rdata[1].data = (char *) unused; - rdata[1].len = uncnt * sizeof(OffsetNumber); + rdata[1].data = (char *) redirected; + rdata[1].len = nredirected * sizeof(OffsetNumber) * 2; } else { @@ -3283,9 +3647,38 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) } rdata[1].buffer = buffer; rdata[1].buffer_std = true; - rdata[1].next = NULL; + rdata[1].next = &(rdata[2]); + + if (ndead > 0) + { + rdata[2].data = (char *) nowdead; + rdata[2].len = ndead * sizeof(OffsetNumber); + } + else + { + rdata[2].data = NULL; + rdata[2].len = 0; + } + rdata[2].buffer = buffer; + rdata[2].buffer_std = true; + rdata[2].next = &(rdata[3]); + + if (nunused > 0) + { + rdata[3].data = (char *) nowunused; + rdata[3].len = nunused * sizeof(OffsetNumber); + } + else + { + rdata[3].data = NULL; + rdata[3].len = 0; + } + rdata[3].buffer = buffer; + rdata[3].buffer_std = true; + rdata[3].next = NULL; - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata); + info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN; + recptr = XLogInsert(RM_HEAP2_ID, info, rdata); return recptr; } @@ -3293,8 +3686,6 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt) /* * Perform XLogInsert for a heap-freeze operation. Caller must already * have modified the buffer and marked it dirty. - * - * Unlike log_heap_clean(), the offsets[] entries are one-based. */ XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, @@ -3363,17 +3754,28 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from, } xlhdr; int hsize = SizeOfHeapHeader; xl_heap_update xlrec; + uint8 info; XLogRecPtr recptr; XLogRecData rdata[4]; Page page = BufferGetPage(newbuf); - uint8 info = (move) ? 
XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE; /* Caller should not call me on a temp relation */ Assert(!reln->rd_istemp); + if (move) + { + Assert(!HeapTupleIsHeapOnly(newtup)); + info = XLOG_HEAP_MOVE; + } + else if (HeapTupleIsHeapOnly(newtup)) + info = XLOG_HEAP_HOT_UPDATE; + else + info = XLOG_HEAP_UPDATE; + xlrec.target.node = reln->rd_node; xlrec.target.tid = from; xlrec.newtid = newtup->t_self; + rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfHeapUpdate; rdata[0].buffer = InvalidBuffer; @@ -3489,13 +3891,21 @@ log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page) return recptr; } +/* + * Handles CLEAN and CLEAN_MOVE record types + */ static void -heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) +heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move) { xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record); Relation reln; Buffer buffer; Page page; + OffsetNumber *offnum; + OffsetNumber *end; + int nredirected; + int ndead; + int i; if (record->xl_info & XLR_BKP_BLOCK_1) return; @@ -3512,25 +3922,63 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record) return; } - if (record->xl_len > SizeOfHeapClean) - { - OffsetNumber *unused; - OffsetNumber *unend; - ItemId lp; + nredirected = xlrec->nredirected; + ndead = xlrec->ndead; + offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); + end = (OffsetNumber *) ((char *) xlrec + record->xl_len); - unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean); - unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + /* Update all redirected or moved line pointers */ + for (i = 0; i < nredirected; i++) + { + OffsetNumber fromoff = *offnum++; + OffsetNumber tooff = *offnum++; + ItemId fromlp = PageGetItemId(page, fromoff); - while (unused < unend) + if (clean_move) { - /* unused[] entries are zero-based */ - lp = PageGetItemId(page, *unused + 1); - ItemIdSetUnused(lp); - unused++; + /* Physically move the "to" item to the "from" slot */ + ItemId tolp = PageGetItemId(page, tooff); + HeapTupleHeader htup; + + *fromlp = *tolp; + ItemIdSetUnused(tolp); + + /* We also have to clear the tuple's heap-only bit */ + Assert(ItemIdIsNormal(fromlp)); + htup = (HeapTupleHeader) PageGetItem(page, fromlp); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + HeapTupleHeaderClearHeapOnly(htup); + } + else + { + /* Just insert a REDIRECT link at fromoff */ + ItemIdSetRedirect(fromlp, tooff); } } - PageRepairFragmentation(page, NULL); + /* Update all now-dead line pointers */ + for (i = 0; i < ndead; i++) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetDead(lp); + } + + /* Update all now-unused line pointers */ + while (offnum < end) + { + OffsetNumber off = *offnum++; + ItemId lp = PageGetItemId(page, off); + + ItemIdSetUnused(lp); + } + + /* + * Finally, repair any fragmentation, and update the page's hint bit + * about whether it has free pointers. 
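In replay, the three lists drive three kinds of line-pointer updates: redirected pairs become redirect links (or, in the CLEAN_MOVE variant, a physical copy of the target item over the root slot), the dead list marks items dead, and the rest become unused, after which the page is defragmented. Below is a simplified model of those state transitions using a toy slot struct, not the backend's ItemIdData; it only shows the ordinary (non-move) case.

    #include <stdio.h>

    typedef enum { SLOT_UNUSED, SLOT_NORMAL, SLOT_REDIRECT, SLOT_DEAD } SlotState;

    typedef struct
    {
        SlotState state;
        int       link;   /* redirect target when state == SLOT_REDIRECT */
    } ToySlot;

    static void set_redirect(ToySlot *s, int to) { s->state = SLOT_REDIRECT; s->link = to; }
    static void set_dead(ToySlot *s)             { s->state = SLOT_DEAD; }
    static void set_unused(ToySlot *s)           { s->state = SLOT_UNUSED; }

    int
    main(void)
    {
        ToySlot page[8] = {{SLOT_UNUSED, 0}};
        int     redirected[] = {1, 4};     /* one (from, to) pair */
        int     nowdead[] = {2};
        int     nowunused[] = {3};

        page[1].state = page[2].state = page[3].state = page[4].state = SLOT_NORMAL;

        set_redirect(&page[redirected[0]], redirected[1]);
        set_dead(&page[nowdead[0]]);
        set_unused(&page[nowunused[0]]);

        printf("slot1=%d->%d slot2=%d slot3=%d\n",
               page[1].state, page[1].link, page[2].state, page[3].state);
        return 0;
    }
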
+ */ + PageRepairFragmentation(page); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); @@ -3655,8 +4103,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record) HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page); + /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; PageSetLSN(page, lsn); @@ -3736,7 +4189,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) HeapTupleHeaderSetCmin(htup, FirstCommandId); htup->t_ctid = xlrec->target.tid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true); + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_insert_redo: failed to add tuple"); PageSetLSN(page, lsn); @@ -3746,10 +4199,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record) } /* - * Handles UPDATE & MOVE + * Handles UPDATE, HOT_UPDATE & MOVE */ static void -heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) +heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update) { xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record); Relation reln = XLogOpenRelation(xlrec->target.node); @@ -3808,6 +4261,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) HEAP_XMIN_INVALID | HEAP_MOVED_IN); htup->t_infomask |= HEAP_MOVED_OFF; + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXvac(htup, record->xl_xid); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->target.tid; @@ -3819,12 +4273,19 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move) HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + if (hot_update) + HeapTupleHeaderSetHotUpdated(htup); + else + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, record->xl_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Set forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; } + /* Mark the page as a candidate for pruning */ + PageSetPrunable(page); + /* * this test is ugly, but necessary to avoid thinking that insert change * is already applied @@ -3914,7 +4375,7 @@ newsame:; /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = xlrec->newtid; - offnum = PageAddItem(page, (Item) htup, newlen, offnum, true); + offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "heap_update_redo: failed to add tuple"); PageSetLSN(page, lsn); @@ -3971,6 +4432,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record) htup->t_infomask |= HEAP_XMAX_SHARED_LOCK; else htup->t_infomask |= HEAP_XMAX_EXCL_LOCK; + HeapTupleHeaderClearHotUpdated(htup); HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); HeapTupleHeaderSetCmax(htup, FirstCommandId, false); /* Make sure there is no forward chain link in t_ctid */ @@ -4039,25 +4501,35 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP_INSERT) - heap_xlog_insert(lsn, record); - else if (info == XLOG_HEAP_DELETE) - heap_xlog_delete(lsn, record); - else if (info == XLOG_HEAP_UPDATE) - heap_xlog_update(lsn, record, false); - else if (info == XLOG_HEAP_MOVE) - heap_xlog_update(lsn, record, true); - else if (info == XLOG_HEAP_CLEAN) - heap_xlog_clean(lsn, record); - else if (info == 
XLOG_HEAP_NEWPAGE) - heap_xlog_newpage(lsn, record); - else if (info == XLOG_HEAP_LOCK) - heap_xlog_lock(lsn, record); - else if (info == XLOG_HEAP_INPLACE) - heap_xlog_inplace(lsn, record); - else - elog(PANIC, "heap_redo: unknown op code %u", info); + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP_INSERT: + heap_xlog_insert(lsn, record); + break; + case XLOG_HEAP_DELETE: + heap_xlog_delete(lsn, record); + break; + case XLOG_HEAP_UPDATE: + heap_xlog_update(lsn, record, false, false); + break; + case XLOG_HEAP_MOVE: + heap_xlog_update(lsn, record, true, false); + break; + case XLOG_HEAP_HOT_UPDATE: + heap_xlog_update(lsn, record, false, true); + break; + case XLOG_HEAP_NEWPAGE: + heap_xlog_newpage(lsn, record); + break; + case XLOG_HEAP_LOCK: + heap_xlog_lock(lsn, record); + break; + case XLOG_HEAP_INPLACE: + heap_xlog_inplace(lsn, record); + break; + default: + elog(PANIC, "heap_redo: unknown op code %u", info); + } } void @@ -4065,11 +4537,20 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record) { uint8 info = record->xl_info & ~XLR_INFO_MASK; - info &= XLOG_HEAP_OPMASK; - if (info == XLOG_HEAP2_FREEZE) - heap_xlog_freeze(lsn, record); - else - elog(PANIC, "heap2_redo: unknown op code %u", info); + switch (info & XLOG_HEAP_OPMASK) + { + case XLOG_HEAP2_FREEZE: + heap_xlog_freeze(lsn, record); + break; + case XLOG_HEAP2_CLEAN: + heap_xlog_clean(lsn, record, false); + break; + case XLOG_HEAP2_CLEAN_MOVE: + heap_xlog_clean(lsn, record, true); + break; + default: + elog(PANIC, "heap2_redo: unknown op code %u", info); + } } static void @@ -4130,13 +4611,18 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec) ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid))); } - else if (info == XLOG_HEAP_CLEAN) + else if (info == XLOG_HEAP_HOT_UPDATE) { - xl_heap_clean *xlrec = (xl_heap_clean *) rec; + xl_heap_update *xlrec = (xl_heap_update *) rec; - appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block); + if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? 
*/ + appendStringInfo(buf, "hot_update(init): "); + else + appendStringInfo(buf, "hot_update: "); + out_target(buf, &(xlrec->target)); + appendStringInfo(buf, "; new %u/%u", + ItemPointerGetBlockNumber(&(xlrec->newtid)), + ItemPointerGetOffsetNumber(&(xlrec->newtid))); } else if (info == XLOG_HEAP_NEWPAGE) { @@ -4187,6 +4673,22 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec) xlrec->node.relNode, xlrec->block, xlrec->cutoff_xid); } + else if (info == XLOG_HEAP2_CLEAN) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } + else if (info == XLOG_HEAP2_CLEAN_MOVE) + { + xl_heap_clean *xlrec = (xl_heap_clean *) rec; + + appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u", + xlrec->node.spcNode, xlrec->node.dbNode, + xlrec->node.relNode, xlrec->block); + } else appendStringInfo(buf, "UNKNOWN"); } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 6dbdf13fbe0..cd13d8f87c9 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.66 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.67 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -41,7 +41,7 @@ RelationPutHeapTuple(Relation relation, pageHeader = BufferGetPage(buffer); offnum = PageAddItem(pageHeader, (Item) tuple->t_data, - tuple->t_len, InvalidOffsetNumber, false); + tuple->t_len, InvalidOffsetNumber, false, true); if (offnum == InvalidOffsetNumber) elog(PANIC, "failed to add tuple to page"); @@ -218,7 +218,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * we're done. 
*/ pageHeader = (Page) BufferGetPage(buffer); - pageFreeSpace = PageGetFreeSpace(pageHeader); + pageFreeSpace = PageGetHeapFreeSpace(pageHeader); if (len + saveFreeSpace <= pageFreeSpace) { /* use this page as future insert target, too */ @@ -311,7 +311,7 @@ RelationGetBufferForTuple(Relation relation, Size len, PageInit(pageHeader, BufferGetPageSize(buffer), 0); - if (len > PageGetFreeSpace(pageHeader)) + if (len > PageGetHeapFreeSpace(pageHeader)) { /* We should not get here given the test at the top */ elog(PANIC, "tuple is too big: size %lu", (unsigned long) len); diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c new file mode 100644 index 00000000000..d5496689003 --- /dev/null +++ b/src/backend/access/heap/pruneheap.c @@ -0,0 +1,702 @@ +/*------------------------------------------------------------------------- + * + * pruneheap.c + * heap page pruning and HOT-chain management code + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.1 2007/09/20 17:56:30 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/transam.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/inval.h" + + +/* Local functions */ +static int heap_prune_chain(Relation relation, Buffer buffer, + OffsetNumber rootoffnum, + TransactionId OldestXmin, + OffsetNumber *redirected, int *nredirected, + OffsetNumber *nowdead, int *ndead, + OffsetNumber *nowunused, int *nunused, + bool redirect_move); +static void heap_prune_record_redirect(OffsetNumber *redirected, + int *nredirected, + OffsetNumber offnum, + OffsetNumber rdoffnum); +static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead, + OffsetNumber offnum); +static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused, + OffsetNumber offnum); + + +/* + * Optionally prune and repair fragmentation in the specified page. + * + * This is an opportunistic function. It will perform housekeeping + * only if the page heuristically looks like a candidate for pruning and we + * can acquire buffer cleanup lock without blocking. + * + * Note: this is called quite often. It's important that it fall out quickly + * if there's not any use in pruning. + * + * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD + * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + */ +void +heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin) +{ + PageHeader dp = (PageHeader) BufferGetPage(buffer); + Size minfree; + + /* + * Let's see if we really need pruning. + * + * Forget it if page is not hinted to contain something prunable + */ + if (!PageIsPrunable(dp)) + return; + + /* + * We prune when a previous UPDATE failed to find enough space on the + * page for a new tuple version, or when free space falls below the + * relation's fill-factor target (but not less than 10%). + * + * Checking free space here is questionable since we aren't holding + * any lock on the buffer; in the worst case we could get a bogus + * answer. It's unlikely to be *seriously* wrong, though, since + * reading either pd_lower or pd_upper is probably atomic. 
Avoiding + * taking a lock seems better than sometimes getting a wrong answer + * in what is after all just a heuristic estimate. + */ + minfree = RelationGetTargetPageFreeSpace(relation, + HEAP_DEFAULT_FILLFACTOR); + minfree = Max(minfree, BLCKSZ / 10); + + if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree) + { + /* OK, try to get exclusive buffer lock */ + if (!ConditionalLockBufferForCleanup(buffer)) + return; + + /* + * Now that we have buffer lock, get accurate information about the + * page's free space, and recheck the heuristic about whether to prune. + */ + if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree) + { + /* OK to prune (though not to remove redirects) */ + (void) heap_page_prune(relation, buffer, OldestXmin, false, true); + } + + /* And release buffer lock */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } +} + + +/* + * Prune and repair fragmentation in the specified page. + * + * Caller must have pin and buffer cleanup lock on the page. + * + * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD + * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + * + * If redirect_move is set, we remove redirecting line pointers by + * updating the root line pointer to point directly to the first non-dead + * tuple in the chain. NOTE: eliminating the redirect changes the first + * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL. + * The only reason we support this capability at all is that by using it, + * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a + * good thing since VACUUM FULL is overly complicated already. + * + * If report_stats is true then we send the number of reclaimed heap-only + * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will + * send its own new total to pgstats, and we don't want this delta applied + * on top of that.) + * + * Returns the number of tuples deleted from the page. + */ +int +heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, + bool redirect_move, bool report_stats) +{ + int ndeleted = 0; + Page page = BufferGetPage(buffer); + OffsetNumber offnum, + maxoff; + OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPage]; + OffsetNumber nowunused[MaxHeapTuplesPerPage]; + int nredirected = 0; + int ndead = 0; + int nunused = 0; + + START_CRIT_SECTION(); + + /* + * Mark the page as clear of prunable tuples. If we find a tuple which + * may soon become prunable, we shall set the hint again. Also clear + * the "page is full" flag, since there's no point in repeating the + * prune/defrag process until something else happens to the page. + */ + PageClearPrunable(page); + PageClearFull(page); + + /* Scan the page */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + /* Nothing to do if slot is empty or already dead */ + if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) + continue; + + /* Process this item or chain of items */ + ndeleted += heap_prune_chain(relation, buffer, offnum, + OldestXmin, + redirected, &nredirected, + nowdead, &ndead, + nowunused, &nunused, + redirect_move); + } + + /* Have we pruned any items? */ + if (nredirected > 0 || ndead > 0 || nunused > 0) + { + /* + * Repair page fragmentation, and update the page's hint bit about + * whether it has free line pointers. 
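The pruning trigger checked above is deliberately cheap: prune when the page has been marked full, or when its free space has dropped below the larger of the relation's fill-factor target and one tenth of the block size. A standalone sketch of that threshold test follows; the constants and function names here are illustrative toys, not the backend's macros.

    #include <stdio.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define TOY_BLCKSZ 8192

    static size_t
    prune_threshold(size_t fillfactor_target)
    {
        size_t minfree = fillfactor_target;

        /* never let the trigger point fall below 10% of the page */
        if (minfree < TOY_BLCKSZ / 10)
            minfree = TOY_BLCKSZ / 10;
        return minfree;
    }

    static bool
    should_try_prune(bool page_is_full, size_t free_space, size_t fillfactor_target)
    {
        return page_is_full || free_space < prune_threshold(fillfactor_target);
    }

    int
    main(void)
    {
        /* e.g. a 10% fill-factor reservation is 819 bytes on an 8K page */
        printf("prune? %d\n", should_try_prune(false, 500, TOY_BLCKSZ / 10));
        printf("prune? %d\n", should_try_prune(false, 2000, TOY_BLCKSZ / 10));
        return 0;
    }
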
+ */ + PageRepairFragmentation((Page) page); + + MarkBufferDirty(buffer); + + /* + * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did + */ + if (!relation->rd_istemp) + { + XLogRecPtr recptr; + + recptr = log_heap_clean(relation, buffer, + redirected, nredirected, + nowdead, ndead, + nowunused, nunused, + redirect_move); + PageSetTLI(BufferGetPage(buffer), ThisTimeLineID); + PageSetLSN(BufferGetPage(buffer), recptr); + } + } + + END_CRIT_SECTION(); + + /* + * If requested, report the number of tuples reclaimed to pgstats. + * This is ndeleted minus ndead, because we don't want to count a now-DEAD + * root item as a deletion for this purpose. + */ + if (report_stats && ndeleted > ndead) + pgstat_update_heap_dead_tuples(relation, ndeleted - ndead); + + /* + * XXX Should we update the FSM information of this page ? + * + * There are two schools of thought here. We may not want to update + * FSM information so that the page is not used for unrelated + * UPDATEs/INSERTs and any free space in this page will remain + * available for further UPDATEs in *this* page, thus improving + * chances for doing HOT updates. + * + * But for a large table and where a page does not receive further + * UPDATEs for a long time, we might waste this space by not + * updating the FSM information. The relation may get extended and + * fragmented further. + * + * One possibility is to leave "fillfactor" worth of space in this + * page and update FSM with the remaining space. + * + * In any case, the current FSM implementation doesn't accept + * one-page-at-a-time updates, so this is all academic for now. + */ + + return ndeleted; +} + + +/* + * Prune specified item pointer or a HOT chain originating at that item. + * + * If the item is an index-referenced tuple (i.e. not a heap-only tuple), + * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT + * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. + * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really + * DEAD, the OldestXmin test is just too coarse to detect it. + * + * The root line pointer is redirected to the tuple immediately after the + * latest DEAD tuple. If all tuples in the chain are DEAD, the root line + * pointer is marked LP_DEAD. (This includes the case of a DEAD simple + * tuple, which we treat as a chain of length 1.) + * + * OldestXmin is the cutoff XID used to identify dead tuples. + * + * Redirected items are added to the redirected[] array (two entries per + * redirection); items set to LP_DEAD state are added to nowdead[]; and + * items set to LP_UNUSED state are added to nowunused[]. (These arrays + * will be used to generate a WAL record after all chains are pruned.) + * + * If redirect_move is true, we get rid of redirecting line pointers. + * + * Returns the number of tuples deleted from the page. 
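The chain-pruning rule described in the header comment above can be stated compactly: remember the latest DEAD member seen while scanning, advancing past RECENTLY_DEAD members only in case a DEAD one follows; if the last dead member ends the chain the root becomes LP_DEAD, otherwise the root is redirected to the first surviving member. The toy below applies that rule to an array of visibility states; it mirrors the latestdead bookkeeping only in spirit, since the real code also validates xmin/xmax linkage and handles aborted heap-only tuples.

    #include <stdio.h>

    typedef enum { TUP_DEAD, TUP_RECENTLY_DEAD, TUP_LIVE } ToyVis;

    /*
     * Return the chain index the root should point at after pruning,
     * or -1 if the whole chain is dead (root becomes LP_DEAD).
     */
    static int
    prune_target(const ToyVis *chain, int nchain)
    {
        int latestdead = -1;
        int i;

        for (i = 0; i < nchain; i++)
        {
            if (chain[i] == TUP_DEAD)
                latestdead = i;
            else if (chain[i] != TUP_RECENTLY_DEAD)
                break;              /* a live member ends the scan */
        }

        if (latestdead < 0)
            return 0;               /* nothing prunable; keep pointing at start */
        if (latestdead == nchain - 1)
            return -1;              /* whole chain dead */
        return latestdead + 1;      /* redirect past the last dead member */
    }

    int
    main(void)
    {
        ToyVis chain1[] = {TUP_DEAD, TUP_RECENTLY_DEAD, TUP_DEAD, TUP_LIVE};
        ToyVis chain2[] = {TUP_DEAD, TUP_DEAD};

        printf("chain1 -> %d\n", prune_target(chain1, 4));  /* 3: the live tuple */
        printf("chain2 -> %d\n", prune_target(chain2, 2));  /* -1: mark root dead */
        return 0;
    }
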
+ */ +static int +heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, + TransactionId OldestXmin, + OffsetNumber *redirected, int *nredirected, + OffsetNumber *nowdead, int *ndead, + OffsetNumber *nowunused, int *nunused, + bool redirect_move) +{ + int ndeleted = 0; + Page dp = (Page) BufferGetPage(buffer); + TransactionId priorXmax = InvalidTransactionId; + ItemId rootlp; + HeapTupleHeader htup; + OffsetNumber latestdead = InvalidOffsetNumber, + maxoff = PageGetMaxOffsetNumber(dp), + offnum; + OffsetNumber chainitems[MaxHeapTuplesPerPage]; + int nchain = 0, + i; + + rootlp = PageGetItemId(dp, rootoffnum); + + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + */ + if (ItemIdIsNormal(rootlp)) + { + htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + if (HeapTupleHeaderIsHeapOnly(htup)) + { + /* + * If the tuple is DEAD and doesn't chain to anything else, mark it + * unused immediately. (If it does chain, we can only remove it as + * part of pruning its chain.) + * + * We need this primarily to handle aborted HOT updates, that is, + * XMIN_INVALID heap-only tuples. Those might not be linked to + * by any chain, since the parent tuple might be re-updated before + * any pruning occurs. So we have to be able to reap them + * separately from chain-pruning. + * + * Note that we might first arrive at a dead heap-only tuple + * either here or while following a chain below. Whichever path + * gets there first will mark the tuple unused. + */ + if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer) + == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) + { + ItemIdSetUnused(rootlp); + heap_prune_record_unused(nowunused, nunused, rootoffnum); + ndeleted++; + } + + /* Nothing more to do */ + return ndeleted; + } + } + + /* Start from the root tuple */ + offnum = rootoffnum; + + /* while not end of the chain */ + for (;;) + { + ItemId lp; + bool tupdead, + recent_dead; + + /* Some sanity checks */ + if (offnum < FirstOffsetNumber || offnum > maxoff) + break; + + lp = PageGetItemId(dp, offnum); + + if (!ItemIdIsUsed(lp)) + break; + + /* + * If we are looking at the redirected root line pointer, + * jump to the first normal tuple in the chain. If we find + * a redirect somewhere else, stop --- it must not be same chain. + */ + if (ItemIdIsRedirected(lp)) + { + if (nchain > 0) + break; /* not at start of chain */ + chainitems[nchain++] = offnum; + offnum = ItemIdGetRedirect(rootlp); + continue; + } + + /* + * Likewise, a dead item pointer can't be part of the chain. + * (We already eliminated the case of dead root tuple outside + * this function.) + */ + if (ItemIdIsDead(lp)) + break; + + Assert(ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Check the tuple XMIN against prior XMAX, if any + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) + break; + + /* + * OK, this tuple is indeed a member of the chain. + */ + chainitems[nchain++] = offnum; + + /* + * Check tuple's visibility status. + */ + tupdead = recent_dead = false; + + switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)) + { + case HEAPTUPLE_DEAD: + tupdead = true; + break; + + case HEAPTUPLE_RECENTLY_DEAD: + recent_dead = true; + /* + * This tuple may soon become DEAD. Re-set the hint bit so + * that the page is reconsidered for pruning in future. + */ + PageSetPrunable(dp); + break; + + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* + * This tuple may soon become DEAD. 
Re-set the hint bit so + * that the page is reconsidered for pruning in future. + */ + PageSetPrunable(dp); + break; + + case HEAPTUPLE_LIVE: + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* + * If we wanted to optimize for aborts, we might consider + * marking the page prunable when we see INSERT_IN_PROGRESS. + * But we don't. See related decisions about when to mark + * the page prunable in heapam.c. + */ + break; + + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + + /* + * Remember the last DEAD tuple seen. We will advance past + * RECENTLY_DEAD tuples just in case there's a DEAD one after them; + * but we can't advance past anything else. (XXX is it really worth + * continuing to scan beyond RECENTLY_DEAD? The case where we will + * find another DEAD tuple is a fairly unusual corner case.) + */ + if (tupdead) + latestdead = offnum; + else if (!recent_dead) + break; + + /* + * If the tuple is not HOT-updated, then we are at the end of this + * HOT-update chain. + */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + /* + * Advance to next chain member. + */ + Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == + BufferGetBlockNumber(buffer)); + offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + + /* + * If we found a DEAD tuple in the chain, adjust the HOT chain so that all + * the DEAD tuples at the start of the chain are removed and the root line + * pointer is appropriately redirected. + */ + if (OffsetNumberIsValid(latestdead)) + { + /* + * Mark as unused each intermediate item that we are able to remove + * from the chain. + * + * When the previous item is the last dead tuple seen, we are at + * the right candidate for redirection. + */ + for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) + { + ItemId lp = PageGetItemId(dp, chainitems[i]); + + ItemIdSetUnused(lp); + heap_prune_record_unused(nowunused, nunused, chainitems[i]); + ndeleted++; + } + + /* + * If the root entry had been a normal tuple, we are deleting it, + * so count it in the result. But changing a redirect (even to + * DEAD state) doesn't count. + */ + if (ItemIdIsNormal(rootlp)) + ndeleted++; + + /* + * If the DEAD tuple is at the end of the chain, the entire chain is + * dead and the root line pointer can be marked dead. Otherwise + * just redirect the root to the correct chain member. + */ + if (i >= nchain) + { + ItemIdSetDead(rootlp); + heap_prune_record_dead(nowdead, ndead, rootoffnum); + } + else + { + ItemIdSetRedirect(rootlp, chainitems[i]); + heap_prune_record_redirect(redirected, nredirected, + rootoffnum, + chainitems[i]); + } + } + else if (nchain < 2 && ItemIdIsRedirected(rootlp)) + { + /* + * We found a redirect item that doesn't point to a valid follow-on + * item. This can happen if the loop in heap_page_prune caused us + * to visit the dead successor of a redirect item before visiting + * the redirect item. We can clean up by setting the redirect item + * to DEAD state. + */ + ItemIdSetDead(rootlp); + heap_prune_record_dead(nowdead, ndead, rootoffnum); + } + + /* + * If requested, eliminate LP_REDIRECT items by moving tuples. Note that + * if the root item is LP_REDIRECT and doesn't point to a valid follow-on + * item, we already killed it above. 
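Under redirect_move (used only by VACUUM FULL), an LP_REDIRECT root is eliminated by copying the first chain member's line pointer into the root slot, freeing the old slot, and clearing the tuple's heap-only flag, since index entries now point straight at it. A toy version of that collapse is sketched below with simplified slot structs; the real code additionally issues a cache invalidation because the tuple's effective CTID changes.

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum { SLOT_UNUSED, SLOT_NORMAL, SLOT_REDIRECT } SlotState;

    typedef struct
    {
        SlotState state;
        int       link;        /* redirect target, or a tuple id for NORMAL */
        bool      heap_only;   /* models the pointed-to tuple's HEAP_ONLY flag */
    } ToySlot;

    /* Collapse root -> first: root takes over first's item, first is freed */
    static void
    collapse_redirect(ToySlot *slots, int root)
    {
        int first = slots[root].link;

        slots[root] = slots[first];
        slots[root].heap_only = false;   /* now directly index-referenced */
        slots[first].state = SLOT_UNUSED;
    }

    int
    main(void)
    {
        ToySlot slots[6] = {{SLOT_UNUSED, 0, false}};

        slots[1] = (ToySlot){SLOT_REDIRECT, 3, false};
        slots[3] = (ToySlot){SLOT_NORMAL, 42, true};

        collapse_redirect(slots, 1);
        printf("slot1: state=%d tuple=%d heap_only=%d; slot3: state=%d\n",
               slots[1].state, slots[1].link, (int) slots[1].heap_only,
               slots[3].state);
        return 0;
    }
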
+ */ + if (redirect_move && ItemIdIsRedirected(rootlp)) + { + OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp); + ItemId firstlp = PageGetItemId(dp, firstoffnum); + HeapTupleData firsttup; + + Assert(ItemIdIsNormal(firstlp)); + /* Set up firsttup to reference the tuple at its existing CTID */ + firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); + firsttup.t_len = ItemIdGetLength(firstlp); + ItemPointerSet(&firsttup.t_self, + BufferGetBlockNumber(buffer), + firstoffnum); + firsttup.t_tableOid = RelationGetRelid(relation); + + /* + * Mark the tuple for invalidation. Needed because we're changing + * its CTID. + */ + CacheInvalidateHeapTuple(relation, &firsttup); + + /* + * Change heap-only status of the tuple because after the line + * pointer manipulation, it's no longer a heap-only tuple, but is + * directly pointed to by index entries. + */ + Assert(HeapTupleIsHeapOnly(&firsttup)); + HeapTupleClearHeapOnly(&firsttup); + + /* Now move the item pointer */ + *rootlp = *firstlp; + ItemIdSetUnused(firstlp); + + /* + * If latestdead is valid, we have already recorded the redirection + * above. Otherwise, do it now. + * + * We don't record firstlp in the nowunused[] array, since the + * redirection entry is enough to tell heap_xlog_clean what to do. + */ + if (!OffsetNumberIsValid(latestdead)) + heap_prune_record_redirect(redirected, nredirected, rootoffnum, + firstoffnum); + } + + return ndeleted; +} + + +/* Record newly-redirected item pointer */ +static void +heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected, + OffsetNumber offnum, OffsetNumber rdoffnum) +{ + Assert(*nredirected < MaxHeapTuplesPerPage); + redirected[*nredirected * 2] = offnum; + redirected[*nredirected * 2 + 1] = rdoffnum; + (*nredirected)++; +} + +/* Record newly-dead item pointer */ +static void +heap_prune_record_dead(OffsetNumber *nowdead, int *ndead, + OffsetNumber offnum) +{ + Assert(*ndead < MaxHeapTuplesPerPage); + nowdead[*ndead] = offnum; + (*ndead)++; +} + +/* Record newly-unused item pointer */ +static void +heap_prune_record_unused(OffsetNumber *nowunused, int *nunused, + OffsetNumber offnum) +{ + Assert(*nunused < MaxHeapTuplesPerPage); + nowunused[*nunused] = offnum; + (*nunused)++; +} + + +/* + * For all items in this page, find their respective root line pointers. + * If item k is part of a HOT-chain with root at item j, then we set + * root_offsets[k - 1] = j. + * + * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. + * We zero out all unused entries. + * + * The function must be called with at least share lock on the buffer, to + * prevent concurrent prune operations. + * + * Note: The information collected here is valid only as long as the caller + * holds a pin on the buffer. Once pin is released, a tuple might be pruned + * and reused by a completely unrelated tuple. + */ +void +heap_get_root_tuples(Page page, OffsetNumber *root_offsets) +{ + OffsetNumber offnum, maxoff; + + MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++) + { + ItemId lp = PageGetItemId(page, offnum); + HeapTupleHeader htup; + OffsetNumber nextoffnum; + TransactionId priorXmax; + + /* skip unused and dead items */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + continue; + + if (ItemIdIsNormal(lp)) + { + htup = (HeapTupleHeader) PageGetItem(page, lp); + + /* + * Check if this tuple is part of a HOT-chain rooted at some other + * tuple. 
If so, skip it for now; we'll process it when we find + * its root. + */ + if (HeapTupleHeaderIsHeapOnly(htup)) + continue; + + /* + * This is either a plain tuple or the root of a HOT-chain. + * Remember it in the mapping. + */ + root_offsets[offnum - 1] = offnum; + + /* If it's not the start of a HOT-chain, we're done with it */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + continue; + + /* Set up to scan the HOT-chain */ + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + else + { + /* Must be a redirect item. We do not set its root_offsets entry */ + Assert(ItemIdIsRedirected(lp)); + /* Set up to scan the HOT-chain */ + nextoffnum = ItemIdGetRedirect(lp); + priorXmax = InvalidTransactionId; + } + + /* + * Now follow the HOT-chain and collect other tuples in the chain. + * + * Note: Even though this is a nested loop, the complexity of the + * function is O(N) because a tuple in the page should be visited not + * more than twice, once in the outer loop and once in HOT-chain + * chases. + */ + for (;;) + { + lp = PageGetItemId(page, nextoffnum); + + /* Check for broken chains */ + if (!ItemIdIsNormal(lp)) + break; + + htup = (HeapTupleHeader) PageGetItem(page, lp); + + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) + break; + + /* Remember the root line pointer for this item */ + root_offsets[nextoffnum - 1] = offnum; + + /* Advance to next chain member, if any */ + if (!HeapTupleHeaderIsHotUpdated(htup)) + break; + + nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); + priorXmax = HeapTupleHeaderGetXmax(htup); + } + } +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 60aab58de38..e8c5eec50ac 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -96,7 +96,7 @@ * Portions Copyright (c) 1994-5, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.6 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.7 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -320,12 +320,14 @@ rewrite_heap_tuple(RewriteState state, * Copy the original tuple's visibility information into new_tuple. * * XXX we might later need to copy some t_infomask2 bits, too? + * Right now, we intentionally clear the HOT status bits. */ memcpy(&new_tuple->t_data->t_choice.t_heap, &old_tuple->t_data->t_choice.t_heap, sizeof(HeapTupleFields)); new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; + new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; new_tuple->t_data->t_infomask |= old_tuple->t_data->t_infomask & HEAP_XACT_MASK; @@ -593,7 +595,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* Now we can check to see if there's enough free space already. 
*/ if (state->rs_buffer_valid) { - pageFreeSpace = PageGetFreeSpace(page); + pageFreeSpace = PageGetHeapFreeSpace(page); if (len + saveFreeSpace > pageFreeSpace) { @@ -628,7 +630,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) /* And now we can insert the tuple into the page */ newoff = PageAddItem(page, (Item) heaptup->t_data, len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0009739180c..7bf1e43cb45 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.62 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.63 2007/09/20 17:56:30 tgl Exp $ * * NOTES * many of the old access method routines have been turned into @@ -21,6 +21,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "miscadmin.h" #include "pgstat.h" @@ -95,6 +96,9 @@ RelationGetIndexScan(Relation indexRelation, ItemPointerSetInvalid(&scan->xs_ctup.t_self); scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; + scan->xs_prev_xmax = InvalidTransactionId; + scan->xs_next_hot = InvalidOffsetNumber; + scan->xs_hot_dead = false; /* * Let the AM fill in the key and any opaque data it wants. diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index d905013a5fc..fd727ca68c8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.98 2007/05/27 03:50:38 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.99 2007/09/20 17:56:30 tgl Exp $ * * INTERFACE ROUTINES * index_open - open an index relation by relation OID @@ -64,6 +64,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "pgstat.h" #include "utils/relcache.h" @@ -313,6 +314,8 @@ index_rescan(IndexScanDesc scan, ScanKey key) scan->xs_cbuf = InvalidBuffer; } + scan->xs_next_hot = InvalidOffsetNumber; + scan->kill_prior_tuple = false; /* for safety */ FunctionCall2(procedure, @@ -370,6 +373,14 @@ index_markpos(IndexScanDesc scan) * NOTE: this only restores the internal scan state of the index AM. * The current result tuple (scan->xs_ctup) doesn't change. See comments * for ExecRestrPos(). + * + * NOTE: in the presence of HOT chains, mark/restore only works correctly + * if the scan's snapshot is MVCC-safe; that ensures that there's at most one + * returnable tuple in each HOT chain, and so restoring the prior state at the + * granularity of the index AM is sufficient. Since the only current user + * of mark/restore functionality is nodeMergejoin.c, this effectively means + * that merge-join plans only work for MVCC snapshots. This could be fixed + * if necessary, but for now it seems unimportant. 
* ---------------- */ void @@ -377,9 +388,13 @@ index_restrpos(IndexScanDesc scan) { FmgrInfo *procedure; + Assert(IsMVCCSnapshot(scan->xs_snapshot)); + SCAN_CHECKS; GET_SCAN_PROCEDURE(amrestrpos); + scan->xs_next_hot = InvalidOffsetNumber; + scan->kill_prior_tuple = false; /* for safety */ FunctionCall1(procedure, PointerGetDatum(scan)); @@ -398,72 +413,224 @@ HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { HeapTuple heapTuple = &scan->xs_ctup; + ItemPointer tid = &heapTuple->t_self; FmgrInfo *procedure; SCAN_CHECKS; GET_SCAN_PROCEDURE(amgettuple); - /* just make sure this is false... */ - scan->kill_prior_tuple = false; + /* + * We always reset xs_hot_dead; if we are here then either we are just + * starting the scan, or we previously returned a visible tuple, and in + * either case it's inappropriate to kill the prior index entry. + */ + scan->xs_hot_dead = false; for (;;) { - bool found; + OffsetNumber offnum; + bool at_chain_start; + Page dp; - /* - * The AM's gettuple proc finds the next tuple matching the scan keys. - */ - found = DatumGetBool(FunctionCall2(procedure, - PointerGetDatum(scan), - Int32GetDatum(direction))); + if (scan->xs_next_hot != InvalidOffsetNumber) + { + /* + * We are resuming scan of a HOT chain after having returned + * an earlier member. Must still hold pin on current heap page. + */ + Assert(BufferIsValid(scan->xs_cbuf)); + Assert(ItemPointerGetBlockNumber(tid) == + BufferGetBlockNumber(scan->xs_cbuf)); + Assert(TransactionIdIsValid(scan->xs_prev_xmax)); + offnum = scan->xs_next_hot; + at_chain_start = false; + scan->xs_next_hot = InvalidOffsetNumber; + } + else + { + bool found; + Buffer prev_buf; + + /* + * If we scanned a whole HOT chain and found only dead tuples, + * tell index AM to kill its entry for that TID. + */ + scan->kill_prior_tuple = scan->xs_hot_dead; + + /* + * The AM's gettuple proc finds the next index entry matching the + * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). 
+ */ + found = DatumGetBool(FunctionCall2(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction))); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + + /* If we're out of index entries, break out of outer loop */ + if (!found) + break; + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Switch to correct buffer if we don't have it already */ + prev_buf = scan->xs_cbuf; + scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, + scan->heapRelation, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != scan->xs_cbuf) + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, + RecentGlobalXmin); + + /* Prepare to scan HOT chain starting at index-referenced offnum */ + offnum = ItemPointerGetOffsetNumber(tid); + at_chain_start = true; + + /* We don't know what the first tuple's xmin should be */ + scan->xs_prev_xmax = InvalidTransactionId; + + /* Initialize flag to detect if all entries are dead */ + scan->xs_hot_dead = true; + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); - /* Reset kill flag immediately for safety */ - scan->kill_prior_tuple = false; + dp = (Page) BufferGetPage(scan->xs_cbuf); - if (!found) + /* Scan through possible multiple members of HOT-chain */ + for (;;) { - /* Release any held pin on a heap page */ - if (BufferIsValid(scan->xs_cbuf)) - { - ReleaseBuffer(scan->xs_cbuf); - scan->xs_cbuf = InvalidBuffer; - } - return NULL; /* failure exit */ - } + ItemId lp; + ItemPointer ctid; - pgstat_count_index_tuples(scan->indexRelation, 1); + /* check for bogus TID */ + if (offnum < FirstOffsetNumber || + offnum > PageGetMaxOffsetNumber(dp)) + break; - /* - * Fetch the heap tuple and see if it matches the snapshot. - */ - if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot, - heapTuple, &scan->xs_cbuf, true, - scan->indexRelation)) - break; + lp = PageGetItemId(dp, offnum); - /* Skip if no undeleted tuple at this location */ - if (heapTuple->t_data == NULL) - continue; + /* check for unused, dead, or redirected items */ + if (!ItemIdIsNormal(lp)) + { + /* We should only see a redirect at start of chain */ + if (ItemIdIsRedirected(lp) && at_chain_start) + { + /* Follow the redirect */ + offnum = ItemIdGetRedirect(lp); + at_chain_start = false; + continue; + } + /* else must be end of chain */ + break; + } - /* - * If we can't see it, maybe no one else can either. Check to see if - * the tuple is dead to all transactions. If so, signal the index AM - * to not return it on future indexscans. - * - * We told heap_release_fetch to keep a pin on the buffer, so we can - * re-access the tuple here. But we must re-lock the buffer first. - */ - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + /* + * We must initialize all of *heapTuple (ie, scan->xs_ctup) + * since it is returned to the executor on success. + */ + heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp); + heapTuple->t_len = ItemIdGetLength(lp); + ItemPointerSetOffsetNumber(tid, offnum); + heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation); + ctid = &heapTuple->t_data->t_ctid; + + /* + * Shouldn't see a HEAP_ONLY tuple at chain start. (This test + * should be unnecessary, since the chain root can't be removed + * while we have pin on the index entry, but let's make it anyway.) 
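The chain walk in index_getnext begins at the offset the index entry references, with at_chain_start set; a redirect is followed only in that state (a redirect met later cannot belong to the same chain), and a heap-only tuple at the very start of a chain means the entry is stale, so the walk stops. The sketch below models just those entry checks with toy slot kinds rather than the backend's ItemId flags.

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum { SLOT_UNUSED, SLOT_NORMAL, SLOT_REDIRECT, SLOT_DEAD } SlotKind;

    typedef struct
    {
        SlotKind kind;
        int      link;        /* redirect target when kind == SLOT_REDIRECT */
        bool     heap_only;   /* tuple's HEAP_ONLY flag when kind == SLOT_NORMAL */
    } ToySlot;

    /*
     * Resolve the first chain member to visit, starting from the offset the
     * index entry points at.  Returns -1 if the chain cannot be followed.
     */
    static int
    chain_entry(const ToySlot *slots, int offnum)
    {
        bool at_chain_start = true;

        if (slots[offnum].kind == SLOT_REDIRECT)
        {
            offnum = slots[offnum].link;   /* follow a redirect at chain start */
            at_chain_start = false;
        }

        if (slots[offnum].kind != SLOT_NORMAL)
            return -1;                     /* dead, unused, or double redirect */
        if (at_chain_start && slots[offnum].heap_only)
            return -1;                     /* heap-only tuple can't start a chain */
        return offnum;
    }

    int
    main(void)
    {
        ToySlot slots[4] = {
            {SLOT_UNUSED, 0, false},
            {SLOT_REDIRECT, 3, false},     /* root was pruned and redirected */
            {SLOT_NORMAL, 0, true},        /* heap-only member */
            {SLOT_NORMAL, 0, true},        /* heap-only member (chain tip) */
        };

        printf("start at %d\n", chain_entry(slots, 1));   /* follows 1 -> 3 */
        printf("start at %d\n", chain_entry(slots, 2));   /* -1: bogus entry */
        return 0;
    }
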
+ */ + if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) + break; + + /* + * The xmin should match the previous xmax value, else chain is + * broken. (Note: this test is not optional because it protects + * us against the case where the prior chain member's xmax + * aborted since we looked at it.) + */ + if (TransactionIdIsValid(scan->xs_prev_xmax) && + !TransactionIdEquals(scan->xs_prev_xmax, + HeapTupleHeaderGetXmin(heapTuple->t_data))) + break; + + /* If it's visible per the snapshot, we must return it */ + if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, + scan->xs_cbuf)) + { + /* + * If the snapshot is MVCC, we know that it could accept + * at most one member of the HOT chain, so we can skip + * examining any more members. Otherwise, check for + * continuation of the HOT-chain, and set state for next time. + */ + if (IsMVCCSnapshot(scan->xs_snapshot)) + scan->xs_next_hot = InvalidOffsetNumber; + else if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(ctid) == + ItemPointerGetBlockNumber(tid)); + scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid); + scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + } + else + scan->xs_next_hot = InvalidOffsetNumber; + + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + pgstat_count_heap_fetch(scan->indexRelation); + + return heapTuple; + } - if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, - scan->xs_cbuf) == HEAPTUPLE_DEAD) - scan->kill_prior_tuple = true; + /* + * If we can't see it, maybe no one else can either. Check to see + * if the tuple is dead to all transactions. If we find that all + * the tuples in the HOT chain are dead, we'll signal the index AM + * to not return that TID on future indexscans. + */ + if (scan->xs_hot_dead && + HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, + scan->xs_cbuf) != HEAPTUPLE_DEAD) + scan->xs_hot_dead = false; + + /* + * Check to see if HOT chain continues past this tuple; if so + * fetch the next offnum (we don't bother storing it into + * xs_next_hot, but must store xs_prev_xmax), and loop around. 
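Each subsequent hop is validated by checking that the member's xmin equals the xmax remembered from the previous member; a mismatch means the chain is broken (for example because the prior member's updater aborted) and the walk must stop rather than return an unrelated tuple. Here is a standalone sketch of that linkage check over a toy chain, with plain integers standing in for TransactionIds.

    #include <stdio.h>

    typedef struct
    {
        unsigned xmin;
        unsigned xmax;      /* 0 means "invalid" in this toy */
        int      next;      /* next chain member, or -1 at end of chain */
    } ToyTuple;

    /* Walk the chain from 'start', stopping if the xmin/xmax linkage breaks */
    static int
    walk_chain(const ToyTuple *tuples, int start)
    {
        unsigned prior_xmax = 0;
        int      visited = 0;
        int      i = start;

        while (i >= 0)
        {
            if (prior_xmax != 0 && tuples[i].xmin != prior_xmax)
                break;      /* broken chain: not a member after all */
            visited++;
            prior_xmax = tuples[i].xmax;
            i = tuples[i].next;
        }
        return visited;
    }

    int
    main(void)
    {
        ToyTuple chain[] = {
            {100, 101, 1},
            {101, 102, 2},
            {999, 103, -1},   /* xmin does not match prior xmax: stop before it */
        };

        printf("members visited: %d\n", walk_chain(chain, 0));   /* prints 2 */
        return 0;
    }
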
+ */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + Assert(ItemPointerGetBlockNumber(ctid) == + ItemPointerGetBlockNumber(tid)); + offnum = ItemPointerGetOffsetNumber(ctid); + at_chain_start = false; + scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); + } + else + break; /* end of chain */ + } /* loop over a single HOT chain */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + /* Loop around to ask index AM for another TID */ + scan->xs_next_hot = InvalidOffsetNumber; + } + + /* Release any held pin on a heap page */ + if (BufferIsValid(scan->xs_cbuf)) + { + ReleaseBuffer(scan->xs_cbuf); + scan->xs_cbuf = InvalidBuffer; } - /* Success exit */ - return heapTuple; + return NULL; /* failure exit */ } /* ---------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 7dbaa2c245f..5f7ecbe16da 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.160 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -193,8 +193,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ for (;;) { - HeapTupleData htup; - Buffer hbuffer; ItemId curitemid; IndexTuple curitup; BlockNumber nblkno; @@ -223,6 +221,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ if (!ItemIdIsDead(curitemid)) { + ItemPointerData htid; + bool all_dead; + /* * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's * how we handling NULLs - and so we must not use _bt_compare @@ -234,17 +235,20 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, /* okay, we gotta fetch the heap tuple ... */ curitup = (IndexTuple) PageGetItem(page, curitemid); - htup.t_self = curitup->t_tid; - if (heap_fetch(heapRel, &SnapshotDirty, &htup, &hbuffer, - true, NULL)) + htid = curitup->t_tid; + + /* + * We check the whole HOT-chain to see if there is any tuple + * that satisfies SnapshotDirty. This is necessary because + * we have just a single index entry for the entire chain. + */ + if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead)) { /* it is a duplicate */ TransactionId xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? SnapshotDirty.xmin : SnapshotDirty.xmax; - ReleaseBuffer(hbuffer); - /* * If this tuple is being updated by other transaction * then we have to wait for its commit/abort. @@ -263,15 +267,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * is itself now committed dead --- if so, don't complain. * This is a waste of time in normal scenarios but we must * do it to support CREATE INDEX CONCURRENTLY. + * + * We must follow HOT-chains here because during + * concurrent index build, we insert the root TID though + * the actual tuple may be somewhere in the HOT-chain. + * While following the chain we might not stop at the exact + * tuple which triggered the insert, but that's OK because + * if we find a live tuple anywhere in this chain, we have + * a unique key conflict. The other live tuple is not part + * of this chain because it had a different index entry. 
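Because there is now a single index entry per HOT chain, the uniqueness check asks whether any member of the chain satisfies the dirty snapshot, and also learns whether the whole chain is dead so the index entry can be marked killed. The toy below imitates that interface in a heavily simplified form; real visibility and SnapshotDirty semantics are of course richer, and the actual helper stops at the first visible member.

    #include <stdio.h>
    #include <stdbool.h>

    typedef enum { VIS_DEAD, VIS_VISIBLE } ToyVis;

    /*
     * Toy counterpart of a chain search: report whether any member of the
     * chain is visible, and optionally whether every member is dead.
     */
    static bool
    chain_search(const ToyVis *chain, int nchain, bool *all_dead)
    {
        bool found = false;
        bool dead = true;
        int  i;

        for (i = 0; i < nchain; i++)
        {
            if (chain[i] == VIS_VISIBLE)
                found = true;
            if (chain[i] != VIS_DEAD)
                dead = false;
        }
        if (all_dead)
            *all_dead = dead;
        return found;
    }

    int
    main(void)
    {
        ToyVis chain[] = {VIS_DEAD, VIS_VISIBLE};
        bool   all_dead;

        if (chain_search(chain, 2, &all_dead))
            printf("duplicate key: some chain member is live\n");
        else if (all_dead)
            printf("chain entirely dead: mark index entry killed\n");
        return 0;
    }
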
*/ - htup.t_self = itup->t_tid; - if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer, - false, NULL)) + htid = itup->t_tid; + if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL)) { /* Normal case --- it's still live */ - ReleaseBuffer(hbuffer); } - else if (htup.t_data != NULL) + else { /* * It's been deleted, so no error, and no need to @@ -279,39 +290,27 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, */ break; } - else - { - /* couldn't find the tuple?? */ - elog(ERROR, "failed to fetch tuple being inserted"); - } ereport(ERROR, (errcode(ERRCODE_UNIQUE_VIOLATION), errmsg("duplicate key value violates unique constraint \"%s\"", RelationGetRelationName(rel)))); } - else if (htup.t_data != NULL) + else if (all_dead) { /* - * Hmm, if we can't see the tuple, maybe it can be marked - * killed. This logic should match index_getnext and - * btgettuple. + * The conflicting tuple (or whole HOT chain) is dead to + * everyone, so we may as well mark the index entry + * killed. */ - LockBuffer(hbuffer, BUFFER_LOCK_SHARE); - if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin, - hbuffer) == HEAPTUPLE_DEAD) - { - ItemIdMarkDead(curitemid); - opaque->btpo_flags |= BTP_HAS_GARBAGE; - /* be sure to mark the proper buffer dirty... */ - if (nbuf != InvalidBuffer) - SetBufferCommitInfoNeedsSave(nbuf); - else - SetBufferCommitInfoNeedsSave(buf); - } - LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK); + ItemIdMarkDead(curitemid); + opaque->btpo_flags |= BTP_HAS_GARBAGE; + /* be sure to mark the proper buffer dirty... */ + if (nbuf != InvalidBuffer) + SetBufferCommitInfoNeedsSave(nbuf); + else + SetBufferCommitInfoNeedsSave(buf); } - ReleaseBuffer(hbuffer); } } @@ -840,7 +839,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, itemsz = ItemIdGetLength(itemid); item = (IndexTuple) PageGetItem(origpage, itemid); if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add hikey to the right sibling"); rightoff = OffsetNumberNext(rightoff); } @@ -865,7 +864,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, item = (IndexTuple) PageGetItem(origpage, itemid); } if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add hikey to the left sibling"); leftoff = OffsetNumberNext(leftoff); @@ -1700,7 +1699,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * benefit of _bt_restore_page(). */ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add leftkey to new root page"); pfree(new_item); @@ -1718,7 +1717,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) * insert the right page pointer into the new root page. 
*/ if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add rightkey to new root page"); pfree(new_item); @@ -1805,7 +1804,7 @@ _bt_pgaddtup(Relation rel, } if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add item to the %s for \"%s\"", where, RelationGetRelationName(rel)); } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 118dc22bb35..6293792b9f5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -57,7 +57,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.113 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -400,7 +400,7 @@ _bt_sortaddtup(Page page, } if (PageAddItem(page, (Item) itup, itemsize, itup_off, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to the index page"); } diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index db64422b19f..499129c48f1 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.46 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -141,8 +141,8 @@ _bt_restore_page(Page page, char *from, int len) memcpy(&itupdata, from, sizeof(IndexTupleData)); itemsz = IndexTupleDSize(itupdata); itemsz = MAXALIGN(itemsz); - if (PageAddItem(page, (Item) from, itemsz, - FirstOffsetNumber, false) == InvalidOffsetNumber) + if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber, + false, false) == InvalidOffsetNumber) elog(PANIC, "_bt_restore_page: cannot add item to page"); from += itemsz; } @@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, { if (PageAddItem(page, (Item) datapos, datalen, ItemPointerGetOffsetNumber(&(xlrec->target.tid)), - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "btree_insert_redo: failed to add item"); PageSetLSN(page, lsn); @@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot, if (onleft) { if (PageAddItem(lpage, newitem, newitemsz, newitemoff, - false) == InvalidOffsetNumber) + false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add new item to left page after split"); } @@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot, hiItem = PageGetItem(rpage, hiItemId); if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId), - P_HIKEY, false) == InvalidOffsetNumber) + P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); /* Fix opaque fields */ diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 9aa58e35f9a..8137377e7a5 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 
2007/05/30 20:11:55 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.285 2007/09/20 17:56:30 tgl Exp $ * * * INTERFACE ROUTINES @@ -410,6 +410,9 @@ UpdateIndexRelation(Oid indexoid, values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary); values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false); values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid); + values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false); + /* we set isvalid and isready the same way */ + values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid); values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey); values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass); values[Anum_pg_index_indoption - 1] = PointerGetDatum(indoption); @@ -944,7 +947,11 @@ BuildIndexInfo(Relation index) /* other info */ ii->ii_Unique = indexStruct->indisunique; - ii->ii_Concurrent = false; /* assume normal case */ + ii->ii_ReadyForInserts = indexStruct->indisready; + + /* initialize index-build state to default */ + ii->ii_Concurrent = false; + ii->ii_BrokenHotChain = false; return ii; } @@ -1309,6 +1316,35 @@ index_build(Relation heapRelation, Assert(PointerIsValid(stats)); /* + * If we found any potentially broken HOT chains, mark the index as + * not being usable until the current transaction is below the event + * horizon. See src/backend/access/heap/README.HOT for discussion. + */ + if (indexInfo->ii_BrokenHotChain) + { + Oid indexId = RelationGetRelid(indexRelation); + Relation pg_index; + HeapTuple indexTuple; + Form_pg_index indexForm; + + pg_index = heap_open(IndexRelationId, RowExclusiveLock); + + indexTuple = SearchSysCacheCopy(INDEXRELID, + ObjectIdGetDatum(indexId), + 0, 0, 0); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", indexId); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + indexForm->indcheckxmin = true; + simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); + CatalogUpdateIndexes(pg_index, indexTuple); + + heap_freetuple(indexTuple); + heap_close(pg_index, RowExclusiveLock); + } + + /* * Update heap and index pg_class rows */ index_update_stats(heapRelation, @@ -1346,6 +1382,11 @@ index_build(Relation heapRelation, * must keep track of the number of index tuples; we don't do so here because * the AM might reject some of the tuples for its own reasons, such as being * unable to store NULLs. + * + * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect + * any potentially broken HOT chains. Currently, we set this if there are + * any RECENTLY_DEAD entries in a HOT chain, without trying very hard to + * detect whether they're really incompatible with the chain tip. */ double IndexBuildHeapScan(Relation heapRelation, @@ -1365,6 +1406,8 @@ IndexBuildHeapScan(Relation heapRelation, ExprContext *econtext; Snapshot snapshot; TransactionId OldestXmin; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; /* * sanity checks @@ -1427,15 +1470,47 @@ IndexBuildHeapScan(Relation heapRelation, CHECK_FOR_INTERRUPTS(); + /* + * When dealing with a HOT-chain of updated tuples, we want to + * index the values of the live tuple (if any), but index it + * under the TID of the chain's root tuple. This approach is + * necessary to preserve the HOT-chain structure in the heap. + * So we need to be able to find the root item offset for every + * tuple that's in a HOT-chain. 
When first reaching a new page + * of the relation, call heap_get_root_tuples() to build a map + * of root item offsets on the page. + * + * It might look unsafe to use this information across buffer + * lock/unlock. However, we hold ShareLock on the table so no + * ordinary insert/update/delete should occur; and we hold pin on + * the buffer continuously while visiting the page, so no pruning + * operation can occur either. + * + * Note the implied assumption that there is no more than one live + * tuple per HOT-chain ... + */ + if (scan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(scan->rs_cbuf); + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + root_blkno = scan->rs_cblock; + } + if (snapshot == SnapshotAny) { /* do our own time qual check */ bool indexIt; + recheck: /* * We could possibly get away with not locking the buffer here, * since caller should hold ShareLock on the relation, but let's - * be conservative about it. + * be conservative about it. (This remark is still correct + * even with HOT-pruning: our pin on the buffer prevents pruning.) */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); @@ -1458,10 +1533,29 @@ IndexBuildHeapScan(Relation heapRelation, * If tuple is recently deleted then we must index it * anyway to preserve MVCC semantics. (Pre-existing * transactions could try to use the index after we finish - * building it, and may need to see such tuples.) Exclude - * it from unique-checking, however. + * building it, and may need to see such tuples.) + * + * However, if it was HOT-updated then we must only index + * the live tuple at the end of the HOT-chain. Since this + * breaks semantics for pre-existing snapshots, mark + * the index as unusable for them. + * + * If we've already decided that the index will be unsafe + * for old snapshots, we may as well stop indexing + * recently-dead tuples, since there's no longer any + * point. */ - indexIt = true; + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else if (indexInfo->ii_BrokenHotChain) + indexIt = false; + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ tupleIsAlive = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: @@ -1473,12 +1567,31 @@ IndexBuildHeapScan(Relation heapRelation, * followed by CREATE INDEX within a transaction.) An * exception occurs when reindexing a system catalog, * because we often release lock on system catalogs before - * committing. + * committing. In that case we wait for the inserting + * transaction to finish and check again. (We could do + * that on user tables too, but since the case is not + * expected it seems better to throw an error.) */ if (!TransactionIdIsCurrentTransactionId( - HeapTupleHeaderGetXmin(heapTuple->t_data)) - && !IsSystemRelation(heapRelation)) - elog(ERROR, "concurrent insert in progress"); + HeapTupleHeaderGetXmin(heapTuple->t_data))) + { + if (!IsSystemRelation(heapRelation)) + elog(ERROR, "concurrent insert in progress"); + else + { + /* + * Must drop the lock on the buffer before we wait + */ + TransactionId xwait = HeapTupleHeaderGetXmin(heapTuple->t_data); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait); + goto recheck; + } + } + /* + * We must index such tuples, since if the index build + * commits then they're good. 
+ */ indexIt = true; tupleIsAlive = true; break; @@ -1491,19 +1604,48 @@ IndexBuildHeapScan(Relation heapRelation, * followed by CREATE INDEX within a transaction.) An * exception occurs when reindexing a system catalog, * because we often release lock on system catalogs before - * committing. + * committing. In that case we wait for the deleting + * transaction to finish and check again. (We could do + * that on user tables too, but since the case is not + * expected it seems better to throw an error.) */ Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (!TransactionIdIsCurrentTransactionId( - HeapTupleHeaderGetXmax(heapTuple->t_data)) - && !IsSystemRelation(heapRelation)) - elog(ERROR, "concurrent delete in progress"); - indexIt = true; + HeapTupleHeaderGetXmax(heapTuple->t_data))) + { + if (!IsSystemRelation(heapRelation)) + elog(ERROR, "concurrent delete in progress"); + else + { + /* + * Must drop the lock on the buffer before we wait + */ + TransactionId xwait = HeapTupleHeaderGetXmax(heapTuple->t_data); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + XactLockTableWait(xwait); + goto recheck; + } + } + /* + * Otherwise, we have to treat these tuples just like + * RECENTLY_DELETED ones. + */ + if (HeapTupleIsHotUpdated(heapTuple)) + { + indexIt = false; + /* mark the index as unsafe for old snapshots */ + indexInfo->ii_BrokenHotChain = true; + } + else if (indexInfo->ii_BrokenHotChain) + indexIt = false; + else + indexIt = true; + /* In any case, exclude the tuple from unique-checking */ tupleIsAlive = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - indexIt = tupleIsAlive = false; /* keep compiler quiet */ + indexIt = tupleIsAlive = false; /* keep compiler quiet */ break; } @@ -1552,9 +1694,33 @@ IndexBuildHeapScan(Relation heapRelation, * pass the values[] and isnull[] arrays, instead. */ - /* Call the AM's callback routine to process the tuple */ - callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, - callback_state); + if (HeapTupleIsHeapOnly(heapTuple)) + { + /* + * For a heap-only tuple, pretend its TID is that of the root. + * See src/backend/access/heap/README.HOT for discussion. + */ + HeapTupleData rootTuple; + OffsetNumber offnum; + + rootTuple = *heapTuple; + offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + + Assert(OffsetNumberIsValid(root_offsets[offnum - 1])); + + ItemPointerSetOffsetNumber(&rootTuple.t_self, + root_offsets[offnum - 1]); + + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive, + callback_state); + } + else + { + /* Call the AM's callback routine to process the tuple */ + callback(indexRelation, heapTuple, values, isnull, tupleIsAlive, + callback_state); + } } heap_endscan(scan); @@ -1574,8 +1740,15 @@ IndexBuildHeapScan(Relation heapRelation, /* * validate_index - support code for concurrent index builds * - * We do a concurrent index build by first building the index normally via - * index_create(), while holding a weak lock that allows concurrent + * We do a concurrent index build by first inserting the catalog entry for the + * index via index_create(), marking it not indisready and not indisvalid. + * Then we commit our transaction and start a new one, then we wait for all + * transactions that could have been modifying the table to terminate. 
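/*
 * Editorial summary (not patch text) of the pg_index flag timeline that the
 * rewritten comment here describes for CREATE INDEX CONCURRENTLY:
 *
 *   txn 1: insert catalog row            indisready = false  indisvalid = false
 *          commit, wait out writers
 *   txn 2: build the index, then set     indisready = true   indisvalid = false
 *          commit, wait out writers
 *   txn 3: validate_index() under a
 *          reference snapshot, wait out
 *          older snapshots, then set     indisready = true   indisvalid = true
 *
 * The "wait out" step is the loop DefineIndex() runs between phases; the
 * declaration of old_lockholders is added only to make this fragment
 * self-contained:
 */
VirtualTransactionId *old_lockholders;

old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
while (VirtualTransactionIdIsValid(*old_lockholders))
{
	VirtualXactLockTableWait(*old_lockholders);
	old_lockholders++;
}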
Now + * we know that any subsequently-started transactions will see the index and + * honor its constraints on HOT updates; so while existing HOT-chains might + * be broken with respect to the index, no currently live tuple will have an + * incompatible HOT update done to it. We now build the index normally via + * index_build(), while holding a weak lock that allows concurrent * insert/update/delete. Also, we index only tuples that are valid * as of the start of the scan (see IndexBuildHeapScan), whereas a normal * build takes care to include recently-dead tuples. This is OK because @@ -1586,11 +1759,10 @@ IndexBuildHeapScan(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Next, we commit the transaction so that the index becomes visible to other - * backends, but it is marked not "indisvalid" to prevent the planner from - * relying on it for indexscans. Then we wait for all transactions that - * could have been modifying the table to terminate. At this point we - * know that any subsequently-started transactions will see the index and + * Next, we mark the index "indisready" (but still not "indisvalid") and + * commit the second transaction and start a third. Again we wait for all + * transactions that could have been modifying the table to terminate. Now + * we know that any subsequently-started transactions will see the index and * insert their new tuples into it. We then take a new reference snapshot * which is passed to validate_index(). Any tuples that are valid according * to this snap, but are not in the index, must be added to the index. @@ -1610,7 +1782,7 @@ IndexBuildHeapScan(Relation heapRelation, * Building a unique index this way is tricky: we might try to insert a * tuple that is already dead or is in process of being deleted, and we * mustn't have a uniqueness failure against an updated version of the same - * row. We can check the tuple to see if it's already dead and tell + * row. We could try to check the tuple to see if it's already dead and tell * index_insert() not to do the uniqueness check, but that still leaves us * with a race condition against an in-progress update. To handle that, * we expect the index AM to recheck liveness of the to-be-inserted tuple @@ -1620,7 +1792,8 @@ IndexBuildHeapScan(Relation heapRelation, * were alive at the time of the reference snapshot are gone; this is * necessary to be sure there are none left with a serializable snapshot * older than the reference (and hence possibly able to see tuples we did - * not index). Then we mark the index valid and commit. + * not index). Then we mark the index "indisvalid" and commit. Subsequent + * transactions will be able to use it for queries. * * Doing two full table scans is a brute-force strategy. 
We could try to be * cleverer, eg storing new tuples in a special area of the table (perhaps @@ -1727,6 +1900,9 @@ validate_index_heapscan(Relation heapRelation, TupleTableSlot *slot; EState *estate; ExprContext *econtext; + BlockNumber root_blkno = InvalidBlockNumber; + OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + bool in_index[MaxHeapTuplesPerPage]; /* state variables for the merge */ ItemPointer indexcursor = NULL; @@ -1768,39 +1944,86 @@ validate_index_heapscan(Relation heapRelation, while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { ItemPointer heapcursor = &heapTuple->t_self; + ItemPointerData rootTuple; + OffsetNumber root_offnum; CHECK_FOR_INTERRUPTS(); state->htups += 1; /* + * As commented in IndexBuildHeapScan, we should index heap-only tuples + * under the TIDs of their root tuples; so when we advance onto a new + * heap page, build a map of root item offsets on the page. + * + * This complicates merging against the tuplesort output: we will + * visit the live tuples in order by their offsets, but the root + * offsets that we need to compare against the index contents might + * be ordered differently. So we might have to "look back" within + * the tuplesort output, but only within the current page. We handle + * that by keeping a bool array in_index[] showing all the + * already-passed-over tuplesort output TIDs of the current page. + * We clear that array here, when advancing onto a new heap page. + */ + if (scan->rs_cblock != root_blkno) + { + Page page = BufferGetPage(scan->rs_cbuf); + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + memset(in_index, 0, sizeof(in_index)); + + root_blkno = scan->rs_cblock; + } + + /* Convert actual tuple TID to root TID */ + rootTuple = *heapcursor; + root_offnum = ItemPointerGetOffsetNumber(heapcursor); + + if (HeapTupleIsHeapOnly(heapTuple)) + { + root_offnum = root_offsets[root_offnum - 1]; + Assert(OffsetNumberIsValid(root_offnum)); + ItemPointerSetOffsetNumber(&rootTuple, root_offnum); + } + + /* * "merge" by skipping through the index tuples until we find or pass - * the current heap tuple. + * the current root tuple. */ while (!tuplesort_empty && (!indexcursor || - ItemPointerCompare(indexcursor, heapcursor) < 0)) + ItemPointerCompare(indexcursor, &rootTuple) < 0)) { Datum ts_val; bool ts_isnull; if (indexcursor) + { + /* + * Remember index items seen earlier on the current heap page + */ + if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) + in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; pfree(indexcursor); + } + tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, &ts_val, &ts_isnull); Assert(tuplesort_empty || !ts_isnull); indexcursor = (ItemPointer) DatumGetPointer(ts_val); } - if (tuplesort_empty || - ItemPointerCompare(indexcursor, heapcursor) > 0) + /* + * If the tuplesort has overshot *and* we didn't see a match earlier, + * then this tuple is missing from the index, so insert it. + */ + if ((tuplesort_empty || + ItemPointerCompare(indexcursor, &rootTuple) > 0) && + !in_index[root_offnum - 1]) { - /* - * We've overshot which means this heap tuple is missing from the - * index, so insert it. 
- */ - bool check_unique; - MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ @@ -1828,39 +2051,29 @@ validate_index_heapscan(Relation heapRelation, isnull); /* - * If the tuple is already committed dead, we still have to put it - * in the index (because some xacts might be able to see it), but - * we might as well suppress uniqueness checking. This is just an - * optimization because the index AM is not supposed to raise a - * uniqueness failure anyway. - */ - if (indexInfo->ii_Unique) - { - /* must lock buffer to call HeapTupleSatisfiesVisibility */ - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - if (HeapTupleSatisfiesVisibility(heapTuple, SnapshotNow, - scan->rs_cbuf)) - check_unique = true; - else - check_unique = false; - - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - } - else - check_unique = false; - - /* * You'd think we should go ahead and build the index tuple here, * but some index AMs want to do further processing on the data * first. So pass the values[] and isnull[] arrays, instead. */ + + /* + * If the tuple is already committed dead, you might think we + * could suppress uniqueness checking, but this is no longer + * true in the presence of HOT, because the insert is actually + * a proxy for a uniqueness check on the whole HOT-chain. That + * is, the tuple we have here could be dead because it was already + * HOT-updated, and if so the updating transaction will not have + * thought it should insert index entries. The index AM will + * check the whole HOT-chain and correctly detect a conflict + * if there is one. + */ + index_insert(indexRelation, values, isnull, - heapcursor, + &rootTuple, heapRelation, - check_unique); + indexInfo->ii_Unique); state->tups_inserted += 1; } @@ -1983,9 +2196,9 @@ reindex_index(Oid indexId) ResetReindexProcessing(); /* - * If the index is marked invalid (ie, it's from a failed CREATE INDEX - * CONCURRENTLY), we can now mark it valid. This allows REINDEX to be - * used to clean up in such cases. + * If the index is marked invalid or not ready (ie, it's from a failed + * CREATE INDEX CONCURRENTLY), we can now mark it valid. This allows + * REINDEX to be used to clean up in such cases. */ pg_index = heap_open(IndexRelationId, RowExclusiveLock); @@ -1996,9 +2209,10 @@ reindex_index(Oid indexId) elog(ERROR, "cache lookup failed for index %u", indexId); indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - if (!indexForm->indisvalid) + if (!indexForm->indisvalid || !indexForm->indisready) { indexForm->indisvalid = true; + indexForm->indisready = true; simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); CatalogUpdateIndexes(pg_index, indexTuple); } diff --git a/src/backend/catalog/indexing.c b/src/backend/catalog/indexing.c index e6ef88fd4ab..6f71022ffd2 100644 --- a/src/backend/catalog/indexing.c +++ b/src/backend/catalog/indexing.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/indexing.c,v 1.114 2007/01/05 22:19:24 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/indexing.c,v 1.115 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -78,6 +78,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + /* HOT update does not require index inserts */ + if (HeapTupleIsHeapOnly(heapTuple)) + return; + /* * Get information from the state structure. Fall out if nothing to do. 
*/ @@ -101,6 +105,10 @@ CatalogIndexInsert(CatalogIndexState indstate, HeapTuple heapTuple) indexInfo = indexInfoArray[i]; + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + /* * Expressional and partial indexes on system catalogs are not * supported diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index e4ae0f39d4e..3e76bd17253 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -3,7 +3,7 @@ * * Copyright (c) 1996-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.44 2007/09/11 08:51:22 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.45 2007/09/20 17:56:30 tgl Exp $ */ CREATE VIEW pg_roles AS @@ -207,6 +207,7 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_tuples_inserted(C.oid) AS n_tup_ins, pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(C.oid) AS n_tup_del, + pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(C.oid) AS n_live_tup, pg_stat_get_dead_tuples(C.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(C.oid) as last_vacuum, diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 2fe44f59f8c..86d1def1cc8 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.7 2007/07/25 22:16:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/catalog/toasting.c,v 1.8 2007/09/20 17:56:30 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -225,7 +225,9 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid) indexInfo->ii_Predicate = NIL; indexInfo->ii_PredicateState = NIL; indexInfo->ii_Unique = true; + indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; + indexInfo->ii_BrokenHotChain = false; classObjectId[0] = OID_BTREE_OPS_OID; classObjectId[1] = INT4_BTREE_OPS_OID; diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index ebac5957bd2..943978e589a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.165 2007/09/10 21:59:37 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.166 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -119,6 +119,7 @@ DefineIndex(RangeVar *heapRelation, Oid namespaceId; Oid tablespaceId; Relation rel; + Relation indexRelation; HeapTuple tuple; Form_pg_am accessMethodForm; bool amcanorder; @@ -420,7 +421,10 @@ DefineIndex(RangeVar *heapRelation, indexInfo->ii_Predicate = make_ands_implicit(predicate); indexInfo->ii_PredicateState = NIL; indexInfo->ii_Unique = unique; + /* In a concurrent build, mark it not-ready-for-inserts */ + indexInfo->ii_ReadyForInserts = !concurrent; indexInfo->ii_Concurrent = concurrent; + indexInfo->ii_BrokenHotChain = false; classObjectId = (Oid *) palloc(numberOfAttributes * sizeof(Oid)); coloptions = (int16 *) palloc(numberOfAttributes * sizeof(int16)); @@ -439,23 +443,38 @@ DefineIndex(RangeVar *heapRelation, primary ? 
"PRIMARY KEY" : "UNIQUE", indexRelationName, RelationGetRelationName(rel)))); - /* save lockrelid for below, then close rel */ + /* save lockrelid and locktag for below, then close rel */ heaprelid = rel->rd_lockInfo.lockRelId; + SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); heap_close(rel, NoLock); + if (!concurrent) + { + indexRelationId = + index_create(relationId, indexRelationName, indexRelationId, + indexInfo, accessMethodId, tablespaceId, classObjectId, + coloptions, reloptions, primary, isconstraint, + allowSystemTableMods, skip_build, concurrent); + + return; /* We're done, in the standard case */ + } + + /* + * For a concurrent build, we next insert the catalog entry and add + * constraints. We don't build the index just yet; we must first make + * the catalog entry so that the new index is visible to updating + * transactions. That will prevent them from making incompatible HOT + * updates. The new index will be marked not indisready and not + * indisvalid, so that no one else tries to either insert into it or use + * it for queries. We pass skip_build = true to prevent the build. + */ indexRelationId = index_create(relationId, indexRelationName, indexRelationId, indexInfo, accessMethodId, tablespaceId, classObjectId, coloptions, reloptions, primary, isconstraint, - allowSystemTableMods, skip_build, concurrent); - - if (!concurrent) - return; /* We're done, in the standard case */ + allowSystemTableMods, true, concurrent); /* - * Phase 2 of concurrent index build (see comments for validate_index() - * for an overview of how this works) - * * We must commit our current transaction so that the index becomes * visible; then start another. Note that all the data structures we just * built are lost in the commit. The only data we keep past here are the @@ -476,6 +495,9 @@ DefineIndex(RangeVar *heapRelation, StartTransactionCommand(); /* + * Phase 2 of concurrent index build (see comments for validate_index() + * for an overview of how this works) + * * Now we must wait until no running transaction could have the table open * with the old list of indexes. To do this, inquire which xacts * currently would conflict with ShareLock on the table -- ie, which ones @@ -494,7 +516,91 @@ DefineIndex(RangeVar *heapRelation, * check for that. Also, prepared xacts are not reported, which is * fine since they certainly aren't going to do anything more. */ - SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId); + old_lockholders = GetLockConflicts(&heaplocktag, ShareLock); + + while (VirtualTransactionIdIsValid(*old_lockholders)) + { + VirtualXactLockTableWait(*old_lockholders); + old_lockholders++; + } + + /* + * At this moment we are sure that there are no transactions with the + * table open for write that don't have this new index in their list of + * indexes. We have waited out all the existing transactions and any new + * transaction will have the new index in its list, but the index is still + * marked as "not-ready-for-inserts". The index is consulted while + * deciding HOT-safety though. This arrangement ensures that no new HOT + * chains can be created where the new tuple and the old tuple in the + * chain have different index keys. + * + * We now take a new snapshot, and build the index using all tuples that + * are visible in this snapshot. We can be sure that any HOT updates + * to these tuples will be compatible with the index, since any updates + * made by transactions that didn't know about the index are now committed + * or rolled back. 
Thus, each visible tuple is either the end of its + * HOT-chain or the extension of the chain is HOT-safe for this index. + */ + + /* Open and lock the parent heap relation */ + rel = heap_openrv(heapRelation, ShareUpdateExclusiveLock); + + /* And the target index relation */ + indexRelation = index_open(indexRelationId, RowExclusiveLock); + + /* Set ActiveSnapshot since functions in the indexes may need it */ + ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); + + /* We have to re-build the IndexInfo struct, since it was lost in commit */ + indexInfo = BuildIndexInfo(indexRelation); + Assert(!indexInfo->ii_ReadyForInserts); + indexInfo->ii_Concurrent = true; + indexInfo->ii_BrokenHotChain = false; + + /* Now build the index */ + index_build(rel, indexRelation, indexInfo, primary); + + /* Close both the relations, but keep the locks */ + heap_close(rel, NoLock); + index_close(indexRelation, NoLock); + + /* + * Update the pg_index row to mark the index as ready for inserts. + * Once we commit this transaction, any new transactions that + * open the table must insert new entries into the index for insertions + * and non-HOT updates. + */ + pg_index = heap_open(IndexRelationId, RowExclusiveLock); + + indexTuple = SearchSysCacheCopy(INDEXRELID, + ObjectIdGetDatum(indexRelationId), + 0, 0, 0); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", indexRelationId); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + Assert(!indexForm->indisready); + Assert(!indexForm->indisvalid); + + indexForm->indisready = true; + + simple_heap_update(pg_index, &indexTuple->t_self, indexTuple); + CatalogUpdateIndexes(pg_index, indexTuple); + + heap_close(pg_index, RowExclusiveLock); + + /* + * Commit this transaction to make the indisready update visible. + */ + CommitTransactionCommand(); + StartTransactionCommand(); + + /* + * Phase 3 of concurrent index build + * + * We once again wait until no transaction can have the table open with + * the index marked as read-only for updates. + */ old_lockholders = GetLockConflicts(&heaplocktag, ShareLock); while (VirtualTransactionIdIsValid(*old_lockholders)) @@ -505,7 +611,7 @@ DefineIndex(RangeVar *heapRelation, /* * Now take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. Beware! There might be still snapshots + * to filter candidate tuples. Beware! There might still be snapshots * in use that treat some transaction as in-progress that our reference * snapshot treats as committed. If such a recently-committed transaction * deleted tuples in the table, we will not include them in the index; yet @@ -560,7 +666,7 @@ DefineIndex(RangeVar *heapRelation, elog(ERROR, "cache lookup failed for index %u", indexRelationId); indexForm = (Form_pg_index) GETSTRUCT(indexTuple); - Assert(indexForm->indexrelid = indexRelationId); + Assert(indexForm->indisready); Assert(!indexForm->indisvalid); indexForm->indisvalid = true; @@ -575,7 +681,8 @@ DefineIndex(RangeVar *heapRelation, * relcache entries for the index itself, but we should also send a * relcache inval on the parent table to force replanning of cached plans. * Otherwise existing sessions might fail to use the new index where it - * would be useful. + * would be useful. (Note that our earlier commits did not create + * reasons to replan; relcache flush on the index itself was sufficient.) 
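/*
 * Editorial cross-reference (not patch text): the "not ready for inserts"
 * state managed here is honored at the two index-insertion choke points this
 * commit touches, CatalogIndexInsert() and ExecInsertIndexTuples().  Both
 * add the same guard inside their per-index loop; the loop-bound name below
 * follows the executor code loosely and is otherwise an assumption.
 */
for (i = 0; i < numIndices; i++)
{
	indexInfo = indexInfoArray[i];

	/* If the index is marked as read-only (not yet ready), ignore it */
	if (!indexInfo->ii_ReadyForInserts)
		continue;

	/* ... form values[]/isnull[] and call index_insert() as before ... */
}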
*/ CacheInvalidateRelcacheByRelid(heaprelid.relId); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 73024a7e703..25d1e2311b6 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.145 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.146 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1281,7 +1281,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) itemsz = record->xl_len - sizeof(xl_seq_rec); itemsz = MAXALIGN(itemsz); if (PageAddItem(page, (Item) item, itemsz, - FirstOffsetNumber, false) == InvalidOffsetNumber) + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(PANIC, "seq_redo: failed to add item to page"); PageSetLSN(page, lsn); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f9b9423534e..5630fc2730d 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.358 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.359 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -124,10 +124,11 @@ typedef VTupleMoveData *VTupleMove; typedef struct VRelStats { /* miscellaneous statistics */ - BlockNumber rel_pages; - double rel_tuples; - Size min_tlen; - Size max_tlen; + BlockNumber rel_pages; /* pages in relation */ + double rel_tuples; /* tuples that remain after vacuuming */ + double rel_indexed_tuples; /* indexed tuples that remain */ + Size min_tlen; /* min surviving tuple size */ + Size max_tlen; /* max surviving tuple size */ bool hasindex; /* vtlinks array for tuple chain following - sorted by new_tid */ int num_vtlinks; @@ -1177,6 +1178,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) vacrelstats = (VRelStats *) palloc(sizeof(VRelStats)); vacrelstats->rel_pages = 0; vacrelstats->rel_tuples = 0; + vacrelstats->rel_indexed_tuples = 0; vacrelstats->hasindex = false; /* scan the heap */ @@ -1195,13 +1197,13 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt) { for (i = 0; i < nindexes; i++) vacuum_index(&vacuum_pages, Irel[i], - vacrelstats->rel_tuples, 0); + vacrelstats->rel_indexed_tuples, 0); } else { /* just scan indexes to update statistic */ for (i = 0; i < nindexes; i++) - scan_index(Irel[i], vacrelstats->rel_tuples); + scan_index(Irel[i], vacrelstats->rel_indexed_tuples); } } @@ -1256,6 +1258,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, BlockNumber empty_pages, empty_end_pages; double num_tuples, + num_indexed_tuples, tups_vacuumed, nkeep, nunused; @@ -1278,7 +1281,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, relname))); empty_pages = empty_end_pages = 0; - num_tuples = tups_vacuumed = nkeep = nunused = 0; + num_tuples = num_indexed_tuples = tups_vacuumed = nkeep = nunused = 0; free_space = 0; nblocks = RelationGetNumberOfBlocks(onerel); @@ -1313,9 +1316,13 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, * background writer will try to write the page if it's already marked * dirty. To ensure that invalid data doesn't get written to disk, we * must take exclusive buffer lock wherever we potentially modify - * pages. + * pages. In fact, we insist on cleanup lock so that we can safely + * call heap_page_prune(). 
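/*
 * Editorial sketch (not patch text) of the pruning call VACUUM FULL now
 * issues once it holds the cleanup lock.  The fourth argument is the
 * redirect_move option named in the surrounding comment; the fifth is taken
 * here to be a report-statistics flag, an assumption inferred from the call
 * sites rather than stated in this diff.  Lazy VACUUM and the opportunistic
 * path pass false for both.
 */
LockBufferForCleanup(buf);
tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
								 true,		/* redirect_move: collapse redirects */
								 false);	/* assumed: don't report stats */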
(This might be overkill, since the bgwriter + * pays no attention to individual tuples, but on the other hand it's + * unlikely that the bgwriter has this particular page pinned at this + * instant. So violating the coding rule would buy us little anyway.) */ - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buf); vacpage->blkno = blkno; vacpage->offsets_used = 0; @@ -1356,6 +1363,21 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, continue; } + /* + * Prune all HOT-update chains in this page. + * + * We use the redirect_move option so that redirecting line pointers + * get collapsed out; this allows us to not worry about them below. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, + true, false); + + /* + * Now scan the page to collect vacuumable items and check for + * tuples requiring freezing. + */ nfrozen = 0; notup = true; maxoff = PageGetMaxOffsetNumber(page); @@ -1369,7 +1391,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* * Collect un-used items too - it's possible to have indexes - * pointing here after crash. + * pointing here after crash. (That's an ancient comment and + * is likely obsolete with WAL, but we might as well continue + * to check for such problems.) */ if (!ItemIdIsUsed(itemid)) { @@ -1378,6 +1402,23 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, continue; } + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting + * (at least in the common case where heap_page_prune() just + * freed up a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + vacpage->offsets[vacpage->offsets_free++] = offnum; + continue; + } + + /* Shouldn't have any redirected items anymore */ + if (!ItemIdIsNormal(itemid)) + elog(ERROR, "relation \"%s\" TID %u/%u: unexpected redirect item", + relname, blkno, offnum); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); @@ -1410,12 +1451,45 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, } break; case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ /* - * We need not require XMIN_COMMITTED or XMAX_COMMITTED to - * be set, since we will remove the tuple without any - * further examination of its hint bits. + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition, though it + * does suggest that someone released a lock early. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it as if it + * were RECENTLY_DEAD, and abandon shrinking. (XXX is it + * worth trying to make the shrinking code smart enough + * to handle this? It's an unusual corner case.) + * + * DEAD heap-only tuples can safely be removed if they + * aren't themselves HOT-updated, although this is a bit + * inefficient since we'll uselessly try to remove + * index entries for them. 
*/ + if (HeapTupleIsHotUpdated(&tuple)) + { + nkeep += 1; + if (do_shrinking) + ereport(LOG, + (errmsg("relation \"%s\" TID %u/%u: dead HOT-updated tuple --- cannot shrink relation", + relname, blkno, offnum))); + do_shrinking = false; + } + else + { + tupgone = true; /* we can delete the tuple */ + /* + * We need not require XMIN_COMMITTED or + * XMAX_COMMITTED to be set, since we will remove the + * tuple without any further examination of its hint + * bits. + */ + } break; case HEAPTUPLE_RECENTLY_DEAD: @@ -1530,6 +1604,8 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, else { num_tuples += 1; + if (!HeapTupleIsHeapOnly(&tuple)) + num_indexed_tuples += 1; notup = false; if (tuple.t_len < min_tlen) min_tlen = tuple.t_len; @@ -1549,7 +1625,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, if (tempPage != NULL) { /* Some tuples are removable; figure free space after removal */ - PageRepairFragmentation(tempPage, NULL); + PageRepairFragmentation(tempPage); vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage); pfree(tempPage); do_reap = true; @@ -1558,7 +1634,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, { /* Just use current available space */ vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page); - /* Need to reap the page if it has LP_UNUSED line pointers */ + /* Need to reap the page if it has UNUSED or DEAD line pointers */ do_reap = (vacpage->offsets_free > 0); } @@ -1621,6 +1697,7 @@ scan_heap(VRelStats *vacrelstats, Relation onerel, /* save stats in the rel list for use later */ vacrelstats->rel_tuples = num_tuples; + vacrelstats->rel_indexed_tuples = num_indexed_tuples; vacrelstats->rel_pages = nblocks; if (num_tuples == 0) min_tlen = max_tlen = 0; @@ -1720,6 +1797,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, num_fraged_pages, vacuumed_pages; int keep_tuples = 0; + int keep_indexed_tuples = 0; PGRUsage ru0; pg_rusage_init(&ru0); @@ -1845,6 +1923,16 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + if (ItemIdIsDead(itemid)) + { + /* just remember it for vacuum_page() */ + vacpage->offsets[vacpage->offsets_free++] = offnum; + continue; + } + + /* Shouldn't have any redirected items now */ + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple_len = tuple.t_len = ItemIdGetLength(itemid); ItemPointerSet(&(tuple.t_self), blkno, offnum); @@ -1906,12 +1994,28 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (i >= vacpage->offsets_free) /* not found */ { vacpage->offsets[vacpage->offsets_free++] = offnum; + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. + */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples--; keep_tuples--; } } else { vacpage->offsets[vacpage->offsets_free++] = offnum; + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. 
+ */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples--; keep_tuples--; } continue; @@ -2028,7 +2132,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, break; } nextItemid = PageGetItemId(nextPage, nextOffnum); - if (!ItemIdIsUsed(nextItemid)) + if (!ItemIdIsNormal(nextItemid)) { ReleaseBuffer(nextBuf); break; @@ -2166,7 +2270,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, Pitemid = PageGetItemId(Ppage, ItemPointerGetOffsetNumber(&(tp.t_self))); /* this can't happen since we saw tuple earlier: */ - if (!ItemIdIsUsed(Pitemid)) + if (!ItemIdIsNormal(Pitemid)) elog(ERROR, "parent itemid marked as unused"); PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); @@ -2268,6 +2372,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, dst_buffer, dst_page, destvacpage, &ec, &Ctid, vtmove[ti].cleanVpd); + /* + * If the tuple we are moving is a heap-only tuple, + * this move will generate an additional index entry, + * so increment the rel_indexed_tuples count. + */ + if (HeapTupleHeaderIsHeapOnly(tuple.t_data)) + vacrelstats->rel_indexed_tuples++; + num_moved++; if (destvacpage->blkno > last_move_dest_block) last_move_dest_block = destvacpage->blkno; @@ -2280,7 +2392,31 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, vacpage->offsets[vacpage->offsets_free++] = ItemPointerGetOffsetNumber(&(tuple.t_self)); else + { + /* + * When we move tuple chains, we may need to move + * tuples from a block that we haven't yet scanned in + * the outer walk-along-the-relation loop. Note that we + * can't be moving a tuple from a block that we have + * already scanned because if such a tuple exists, then + * we must have moved the chain along with that tuple + * when we scanned that block. IOW the test of + * (Cbuf != buf) guarantees that the tuple we are + * looking at right now is in a block which is yet to + * be scanned. + * + * We maintain two counters to correctly count the + * moved-off tuples from blocks that are not yet + * scanned (keep_tuples) and how many of them have + * index pointers (keep_indexed_tuples). The main + * reason to track the latter is to help verify + * that indexes have the expected number of entries + * when all the dust settles. + */ + if (!HeapTupleHeaderIsHeapOnly(tuple.t_data)) + keep_indexed_tuples++; keep_tuples++; + } ReleaseBuffer(dst_buffer); ReleaseBuffer(Cbuf); @@ -2328,6 +2464,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, move_plain_tuple(onerel, buf, page, &tuple, dst_buffer, dst_page, dst_vacpage, &ec); + /* + * If the tuple we are moving is a heap-only tuple, + * this move will generate an additional index entry, + * so increment the rel_indexed_tuples count. + */ + if (HeapTupleHeaderIsHeapOnly(tuple.t_data)) + vacrelstats->rel_indexed_tuples++; + num_moved++; if (dst_vacpage->blkno > last_move_dest_block) last_move_dest_block = dst_vacpage->blkno; @@ -2361,6 +2505,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -2389,6 +2536,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { vacpage->offsets[vacpage->offsets_free++] = off; Assert(keep_tuples > 0); + /* + * If this is not a heap-only tuple, there must be an + * index entry for this item which will be removed in + * the index cleanup. Decrement the keep_indexed_tuples + * count to remember this. 
+ */ + if (!HeapTupleHeaderIsHeapOnly(htup)) + keep_indexed_tuples--; keep_tuples--; } } @@ -2396,6 +2551,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { vacpage->offsets[vacpage->offsets_free++] = off; Assert(keep_tuples > 0); + if (!HeapTupleHeaderIsHeapOnly(htup)) + keep_indexed_tuples--; keep_tuples--; } } @@ -2529,11 +2686,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, * page during chain moves but not been scanned over subsequently. * The tuple ids of these tuples are not recorded as free offsets * for any VacPage, so they will not be cleared from the indexes. + * keep_indexed_tuples is the portion of these that are expected + * to have index entries. */ Assert(keep_tuples >= 0); for (i = 0; i < nindexes; i++) vacuum_index(&Nvacpagelist, Irel[i], - vacrelstats->rel_tuples, keep_tuples); + vacrelstats->rel_indexed_tuples, + keep_indexed_tuples); } /* @@ -2551,7 +2711,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, OffsetNumber unused[MaxOffsetNumber]; OffsetNumber offnum, maxoff; - int uncnt; + int uncnt = 0; int num_tuples = 0; buf = ReadBufferWithStrategy(onerel, vacpage->blkno, vac_strategy); @@ -2567,6 +2727,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -2584,12 +2747,14 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, ItemIdSetUnused(itemid); num_tuples++; + + unused[uncnt++] = offnum; } Assert(vacpage->offsets_free == num_tuples); START_CRIT_SECTION(); - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buf); @@ -2598,7 +2763,10 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buf, unused, uncnt); + recptr = log_heap_clean(onerel, buf, + NULL, 0, NULL, 0, + unused, uncnt, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -2706,15 +2874,17 @@ move_chain_tuple(Relation rel, /* * Update the state of the copied tuple, and store it on the destination - * page. + * page. The copied tuple is never part of a HOT chain. */ newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF); newtup.t_data->t_infomask |= HEAP_MOVED_IN; + HeapTupleHeaderClearHotUpdated(newtup.t_data); + HeapTupleHeaderClearHeapOnly(newtup.t_data); HeapTupleHeaderSetXvac(newtup.t_data, myXID); newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain", (unsigned long) tuple_len, dst_vacpage->blkno); @@ -2809,17 +2979,19 @@ move_plain_tuple(Relation rel, START_CRIT_SECTION(); /* - * Mark new tuple as MOVED_IN by me. + * Mark new tuple as MOVED_IN by me; also mark it not HOT. 
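/*
 * Editorial note (not patch text) on the widened PageAddItem() call used
 * throughout this commit: the fifth argument is the pre-existing overwrite
 * flag, and the new sixth argument appears to say whether the target is a
 * heap page, so the routine can enforce MaxHeapTuplesPerPage; that reading
 * and the parameter name are assumptions, since only call sites are visible
 * here.  Index and WAL-replay callers pass false, while the tuple-moving
 * code below passes true:
 */
newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len,
					 InvalidOffsetNumber,
					 false,		/* don't overwrite an existing item */
					 true);		/* assumed: target is a heap page */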
*/ newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF); newtup.t_data->t_infomask |= HEAP_MOVED_IN; + HeapTupleHeaderClearHotUpdated(newtup.t_data); + HeapTupleHeaderClearHeapOnly(newtup.t_data); HeapTupleHeaderSetXvac(newtup.t_data, myXID); /* add tuple to the page */ newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, - InvalidOffsetNumber, false); + InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)", (unsigned long) tuple_len, @@ -2934,6 +3106,9 @@ update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages, if (!ItemIdIsUsed(itemid)) continue; + /* Shouldn't be any DEAD or REDIRECT items anymore */ + Assert(ItemIdIsNormal(itemid)); + htup = (HeapTupleHeader) PageGetItem(page, itemid); if (htup->t_infomask & HEAP_XMIN_COMMITTED) continue; @@ -3019,10 +3194,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { - OffsetNumber unused[MaxOffsetNumber]; - int uncnt; Page page = BufferGetPage(buffer); - ItemId itemid; int i; /* There shouldn't be any tuples moved onto the page yet! */ @@ -3032,11 +3204,12 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) for (i = 0; i < vacpage->offsets_free; i++) { - itemid = PageGetItemId(page, vacpage->offsets[i]); + ItemId itemid = PageGetItemId(page, vacpage->offsets[i]); + ItemIdSetUnused(itemid); } - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -3045,7 +3218,10 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buffer, unused, uncnt); + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + vacpage->offsets, vacpage->offsets_free, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -3527,8 +3703,7 @@ enough_space(VacPage vacpage, Size len) static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page) { - PageHeader pd = (PageHeader) page; - Size freespace = pd->pd_upper - pd->pd_lower; + Size freespace = PageGetHeapFreeSpace(page); Size targetfree; targetfree = RelationGetTargetPageFreeSpace(relation, diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 3faf172acbf..b9050719cb4 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -36,7 +36,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.96 2007/09/16 02:37:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.97 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -326,8 +326,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); - /* Initially, we only need shared access to the buffer */ - LockBuffer(buf, BUFFER_LOCK_SHARE); + /* We need buffer cleanup lock so that we can prune HOT chains. */ + LockBufferForCleanup(buf); page = BufferGetPage(buf); @@ -341,11 +341,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). 
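/*
 * Editorial note (not patch text): lazy VACUUM's free-space accounting now
 * calls PageGetHeapFreeSpace() instead of PageGetFreeSpace().  The
 * heap-specific variant is understood to also treat a page as full once no
 * further line pointers can be added (MaxHeapTuplesPerPage) -- a behavior
 * assumed here, not shown in this diff -- which matters now that pruning can
 * leave extra redirect and dead line pointers behind.  Hypothetical usage:
 */
Size		freespace = PageGetHeapFreeSpace(page);

lazy_record_free_space(vacrelstats, blkno, freespace);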
To - * interlock against that, release the buffer read lock (which we - * must do anyway) and grab the relation extension lock before - * re-locking in exclusive mode. If the page is still - * uninitialized by then, it must be left over from a crashed - * backend, and we can initialize it. + * protect against that, release the buffer lock, grab the + * relation extension lock momentarily, and re-lock the buffer. + * If the page is still uninitialized by then, it must be left + * over from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to @@ -357,7 +356,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, @@ -366,7 +365,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); @@ -377,11 +376,23 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, { empty_pages++; lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); continue; } + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, + false, false); + + /* + * Now scan the page to collect vacuumable items and check for + * tuples requiring freezing. + */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; @@ -394,22 +405,64 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, itemid = PageGetItemId(page, offnum); + /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting + * (at least in the common case where heap_page_prune() just + * freed up a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + continue; + } + + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple.t_self), blkno, offnum); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: - tupgone = true; /* we can delete the tuple */ + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. + * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as + * if it were RECENTLY_DEAD. 
Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a + * lot cheaper to get rid of it in the next pruning pass + * than to treat it like an indexed tuple. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + nkeep += 1; + else + tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ @@ -449,11 +502,10 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, /* * Each non-removable tuple must be checked to see if it - * needs freezing. If we already froze anything, then - * we've already switched the buffer lock to exclusive. + * needs freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, - (nfrozen > 0) ? InvalidBuffer : buf)) + InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ @@ -485,9 +537,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { - /* Trade in buffer share lock for super-exclusive lock */ - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockBufferForCleanup(buf); /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ @@ -505,7 +554,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ @@ -598,7 +647,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); lazy_record_free_space(vacrelstats, tblk, - PageGetFreeSpace(page)); + PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); npages++; } @@ -615,7 +664,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. * - * Caller must hold pin and lock on the buffer. + * Caller must hold pin and buffer cleanup lock on the buffer. * * tupindex is the index in vacrelstats->dead_tuples of the first dead * tuple for this page. We assume the rest follow sequentially. 
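/*
 * Editorial sketch (not patch text) of the reshaped log_heap_clean() call
 * the vacuum paths now make.  The three array/count pairs are read here as
 * describing, in order, redirected item pairs, items marked dead, and items
 * marked unused, with the trailing bool flagging a redirect_move prune;
 * these meanings are inferred from the call sites, not stated in this diff.
 * The vacuum paths only have now-unused items to report:
 */
recptr = log_heap_clean(onerel, buffer,
						NULL, 0,		/* no redirected items */
						NULL, 0,		/* no newly dead items */
						unused, uncnt,	/* items just set unused */
						false);			/* assumed: not a redirect_move prune */
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);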
@@ -625,10 +674,9 @@ static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, int tupindex, LVRelStats *vacrelstats) { - OffsetNumber unused[MaxOffsetNumber]; - int uncnt; Page page = BufferGetPage(buffer); - ItemId itemid; + OffsetNumber unused[MaxOffsetNumber]; + int uncnt = 0; START_CRIT_SECTION(); @@ -636,6 +684,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { BlockNumber tblk; OffsetNumber toff; + ItemId itemid; tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); if (tblk != blkno) @@ -643,9 +692,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]); itemid = PageGetItemId(page, toff); ItemIdSetUnused(itemid); + unused[uncnt++] = toff; } - uncnt = PageRepairFragmentation(page, unused); + PageRepairFragmentation(page); MarkBufferDirty(buffer); @@ -654,7 +704,10 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, { XLogRecPtr recptr; - recptr = log_heap_clean(onerel, buffer, unused, uncnt); + recptr = log_heap_clean(onerel, buffer, + NULL, 0, NULL, 0, + unused, uncnt, + false); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } @@ -980,7 +1033,7 @@ lazy_record_dead_tuple(LVRelStats *vacrelstats, /* * The array shouldn't overflow under normal behavior, but perhaps it * could if we are given a really small maintenance_work_mem. In that - * case, just forget the last few tuples. + * case, just forget the last few tuples (we'll get 'em next time). */ if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples) { diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 7e5873b89df..485f6ddc1ee 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -26,7 +26,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.297 2007/09/07 20:59:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.298 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1813,8 +1813,10 @@ lreplace:; * * Note: heap_update returns the tid (location) of the new tuple in the * t_self field. + * + * If it's a HOT update, we mustn't insert new index entries. */ - if (resultRelInfo->ri_NumIndices > 0) + if (resultRelInfo->ri_NumIndices > 0 && !HeapTupleIsHeapOnly(tuple)) ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false); /* AFTER ROW UPDATE Triggers */ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 1d478062998..790a9dccc10 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.150 2007/08/15 21:39:50 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.151 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -981,6 +981,10 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * stuff as it only exists here because the genam stuff * doesn't provide the functionality needed by the * executor.. -cim 9/27/89 + * + * CAUTION: this must not be called for a HOT update. + * We can't defend against that here for lack of info. + * Should we change the API to make it safer? 
* ---------------------------------------------------------------- */ void @@ -1029,6 +1033,10 @@ ExecInsertIndexTuples(TupleTableSlot *slot, indexInfo = indexInfoArray[i]; + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + /* Check for partial index */ if (indexInfo->ii_Predicate != NIL) { diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f1e30aeb8f0..87e0063a03a 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -21,7 +21,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.19 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.20 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -240,12 +240,7 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) BlockNumber page = tbmres->blockno; Buffer buffer; Snapshot snapshot; - Page dp; int ntup; - int curslot; - int minslot; - int maxslot; - int maxoff; /* * Acquire pin on the target heap page, trading in any pin we held before. @@ -258,6 +253,13 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) buffer = scan->rs_cbuf; snapshot = scan->rs_snapshot; + ntup = 0; + + /* + * Prune and repair fragmentation for the whole page, if possible. + */ + heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); + /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be @@ -265,71 +267,51 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) */ LockBuffer(buffer, BUFFER_LOCK_SHARE); - dp = (Page) BufferGetPage(buffer); - maxoff = PageGetMaxOffsetNumber(dp); - /* - * Determine how many entries we need to look at on this page. If the - * bitmap is lossy then we need to look at each physical item pointer; - * otherwise we just look through the offsets listed in tbmres. + * We need two separate strategies for lossy and non-lossy cases. */ if (tbmres->ntuples >= 0) { - /* non-lossy case */ - minslot = 0; - maxslot = tbmres->ntuples - 1; - } - else - { - /* lossy case */ - minslot = FirstOffsetNumber; - maxslot = maxoff; - } - - ntup = 0; - for (curslot = minslot; curslot <= maxslot; curslot++) - { - OffsetNumber targoffset; - ItemId lp; - HeapTupleData loctup; - bool valid; - - if (tbmres->ntuples >= 0) - { - /* non-lossy case */ - targoffset = tbmres->offsets[curslot]; - } - else - { - /* lossy case */ - targoffset = (OffsetNumber) curslot; - } - /* - * We'd better check for out-of-range offnum in case of VACUUM since - * the TID was obtained. + * Bitmap is non-lossy, so we just look through the offsets listed in + * tbmres; but we have to follow any HOT chain starting at each such + * offset. */ - if (targoffset < FirstOffsetNumber || targoffset > maxoff) - continue; + int curslot; - lp = PageGetItemId(dp, targoffset); + for (curslot = 0; curslot < tbmres->ntuples; curslot++) + { + OffsetNumber offnum = tbmres->offsets[curslot]; + ItemPointerData tid; + ItemPointerSet(&tid, page, offnum); + if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL)) + scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); + } + } + else + { /* - * Must check for deleted tuple. + * Bitmap is lossy, so we must examine each item pointer on the page. + * But we can ignore HOT chains, since we'll check each tuple anyway. 
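/*
 * Editorial note (not patch text) on heap_hot_search_buffer() as used in the
 * non-lossy bitmap path above: given a candidate root TID and a pinned,
 * share-locked buffer, it follows the HOT chain to the member visible under
 * the snapshot, updating the TID in place and returning true if one is
 * found.  The final NULL argument is taken to be an optional output
 * reporting whether the whole chain is dead; that reading is an assumption,
 * since only the call site appears in this diff.
 */
ItemPointerData tid;

ItemPointerSet(&tid, page, offnum);
if (heap_hot_search_buffer(&tid, buffer, snapshot, NULL))
	scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);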
*/ - if (!ItemIdIsNormal(lp)) - continue; + Page dp = (Page) BufferGetPage(buffer); + OffsetNumber maxoff = PageGetMaxOffsetNumber(dp); + OffsetNumber offnum; - /* - * check time qualification of tuple, remember it if valid - */ - loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); - loctup.t_len = ItemIdGetLength(lp); - ItemPointerSet(&(loctup.t_self), page, targoffset); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++) + { + ItemId lp; + HeapTupleData loctup; - valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); - if (valid) - scan->rs_vistuples[ntup++] = targoffset; + lp = PageGetItemId(dp, offnum); + if (!ItemIdIsNormal(lp)) + continue; + loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + loctup.t_len = ItemIdGetLength(lp); + if (HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer)) + scan->rs_vistuples[ntup++] = offnum; + } } LockBuffer(buffer, BUFFER_LOCK_UNLOCK); diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index af94ad1a3b8..875e4da2914 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.180 2007/08/15 19:15:46 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.181 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1407,6 +1407,7 @@ _SPI_prepare_plan(const char *src, SPIPlanPtr plan) plansource->num_params = nargs; plansource->fully_planned = true; plansource->fixed_result = false; + /* no need to set search_path, generation or saved_xmin */ plansource->resultDesc = PlanCacheComputeResultDesc(stmt_list); plansource->plan = cplan; @@ -1973,6 +1974,7 @@ _SPI_copy_plan(SPIPlanPtr plan, MemoryContext parentcxt) newsource->num_params = newplan->nargs; newsource->fully_planned = plansource->fully_planned; newsource->fixed_result = plansource->fixed_result; + /* no need to worry about seach_path, generation or saved_xmin */ if (plansource->resultDesc) newsource->resultDesc = CreateTupleDescCopy(plansource->resultDesc); newsource->plan = newcplan; diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 2f3e00d6a26..53e35b01acc 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -23,7 +23,7 @@ * Copyright (c) 2003-2007, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.12 2007/04/26 23:24:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.13 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,6 +32,7 @@ #include <limits.h> #include "access/htup.h" +#include "nodes/bitmapset.h" #include "nodes/tidbitmap.h" #include "storage/bufpage.h" #include "utils/hsearch.h" @@ -61,9 +62,7 @@ */ #define PAGES_PER_CHUNK (BLCKSZ / 32) -/* The bitmap unit size can be adjusted by changing these declarations: */ -#define BITS_PER_BITMAPWORD 32 -typedef uint32 bitmapword; /* must be an unsigned type */ +/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */ #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD) #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e2396d42ca6..e36ba97f6b8 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: 
pgsql/src/backend/optimizer/plan/planner.c,v 1.221 2007/05/26 18:23:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.222 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -134,6 +134,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) glob->subrtables = NIL; glob->rewindPlanIDs = NULL; glob->finalrtable = NIL; + glob->transientPlan = false; /* Determine what fraction of the plan is likely to be scanned */ if (cursorOptions & CURSOR_OPT_FAST_PLAN) @@ -183,6 +184,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) result->commandType = parse->commandType; result->canSetTag = parse->canSetTag; + result->transientPlan = glob->transientPlan; result->planTree = top_plan; result->rtable = glob->finalrtable; result->resultRelations = root->resultRelations; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 70b3d7d43f5..21dd342593a 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.136 2007/05/31 16:57:34 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.137 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/transam.h" #include "catalog/pg_inherits.h" #include "nodes/makefuncs.h" #include "optimizer/clauses.h" @@ -164,6 +165,20 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, continue; } + /* + * If the index is valid, but cannot yet be used, ignore it; + * but mark the plan we are generating as transient. + * See src/backend/access/heap/README.HOT for discussion. + */ + if (index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), + TransactionXmin)) + { + root->glob->transientPlan = true; + index_close(indexRelation, NoLock); + continue; + } + info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index c501c827922..efb1ad9343d 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -8,12 +8,13 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/var.c,v 1.70 2007/06/11 01:16:23 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/var.c,v 1.71 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" +#include "access/htup.h" #include "optimizer/clauses.h" #include "optimizer/prep.h" #include "optimizer/var.h" @@ -54,6 +55,7 @@ typedef struct static bool pull_varnos_walker(Node *node, pull_varnos_context *context); +static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos); static bool contain_var_reference_walker(Node *node, contain_var_reference_context *context); static bool contain_var_clause_walker(Node *node, void *context); @@ -134,6 +136,47 @@ pull_varnos_walker(Node *node, pull_varnos_context *context) (void *) context); } +/* + * pull_varattnos + * Find all the distinct attribute numbers present in an expression tree, + * and add them to the initial contents of *varattnos. + * Only Vars that reference RTE 1 of rtable level zero are considered. 
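A minimal sketch of the new indcheckxmin test in get_relation_info(): an index whose pg_index row is not yet old enough is left out of the plan, and the plan is flagged transient so it gets rebuilt once TransactionXmin advances. TransactionId comparison is reduced to plain integers here, ignoring wraparound, and the struct is a stand-in.

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int TransactionId;

    struct toy_index
    {
        bool          indcheckxmin;   /* must we wait for xmin to be old? */
        TransactionId xmin;           /* xmin of the pg_index row */
    };

    int main(void)
    {
        TransactionId TransactionXmin = 100;   /* illustrative horizon */
        struct toy_index idx = { true, 150 };  /* index created "recently" */
        bool transientPlan = false;
        bool usable = true;

        if (idx.indcheckxmin && !(idx.xmin < TransactionXmin))
        {
            transientPlan = true;   /* replan later, when the horizon moves */
            usable = false;         /* don't use this index now */
        }

        printf("index usable: %d, plan transient: %d\n", usable, transientPlan);
        return 0;
    }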
+ * + * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that + * we can include system attributes (e.g., OID) in the bitmap representation. + * + * Currently, this does not support subqueries nor expressions containing + * references to multiple tables; not needed since it's only applied to + * index expressions and predicates. + */ +void +pull_varattnos(Node *node, Bitmapset **varattnos) +{ + (void) pull_varattnos_walker(node, varattnos); +} + +static bool +pull_varattnos_walker(Node *node, Bitmapset **varattnos) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + Assert(var->varno == 1); + *varattnos = bms_add_member(*varattnos, + var->varattno - FirstLowInvalidHeapAttributeNumber); + return false; + } + /* Should not find a subquery or subplan */ + Assert(!IsA(node, Query)); + Assert(!is_subplan(node)); + + return expression_tree_walker(node, pull_varattnos_walker, + (void *) varattnos); +} + /* * contain_var_reference diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 0d59d2e3463..9e088780d4c 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -13,7 +13,7 @@ * * Copyright (c) 2001-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.163 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.164 2007/09/20 17:56:31 tgl Exp $ * ---------- */ #include "postgres.h" @@ -1294,7 +1294,7 @@ pgstat_count_heap_insert(Relation rel) * pgstat_count_heap_update - count a tuple update */ void -pgstat_count_heap_update(Relation rel) +pgstat_count_heap_update(Relation rel, bool hot) { PgStat_TableStatus *pgstat_info = rel->pgstat_info; @@ -1304,6 +1304,9 @@ pgstat_count_heap_update(Relation rel) /* t_tuples_updated is nontransactional, so just advance it */ pgstat_info->t_counts.t_tuples_updated++; + /* ditto for the hot_update counter */ + if (hot) + pgstat_info->t_counts.t_tuples_hot_updated++; /* We have to log the transactional effect at the proper level */ if (pgstat_info->trans == NULL || @@ -1340,6 +1343,23 @@ pgstat_count_heap_delete(Relation rel) } } +/* + * pgstat_update_heap_dead_tuples - update dead-tuples count + * + * The semantics of this are that we are reporting the nontransactional + * recovery of "delta" dead tuples; so t_new_dead_tuples decreases + * rather than increasing, and the change goes straight into the per-table + * counter, not into transactional state. 
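A small standalone illustration of why pull_varattnos() (and RelationGetIndexAttrBitmap() later in this patch) offsets attribute numbers by FirstLowInvalidHeapAttributeNumber: system columns have negative attnos, and a bitmap can only hold non-negative members. The offset value and the single-word bitset below are stand-ins for the real Bitmapset API.

    #include <stdio.h>

    #define FIRST_LOW_INVALID_HEAP_ATTNO (-8)   /* illustrative stand-in */

    int main(void)
    {
        unsigned int bitmap = 0;
        int attnos[] = { -2 /* e.g. oid */, 1, 3 };  /* columns used by an index */

        for (int i = 0; i < 3; i++)
        {
            int member = attnos[i] - FIRST_LOW_INVALID_HEAP_ATTNO;  /* >= 0 */
            bitmap |= 1u << member;
        }

        /* membership test mirrors bms_is_member(attno - offset, bitmap) */
        int probe = 3 - FIRST_LOW_INVALID_HEAP_ATTNO;
        printf("column 3 referenced by an index: %s\n",
               (bitmap & (1u << probe)) ? "yes" : "no");
        return 0;
    }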
+ */ +void +pgstat_update_heap_dead_tuples(Relation rel, int delta) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_collect_tuplelevel && pgstat_info != NULL) + pgstat_info->t_counts.t_new_dead_tuples -= delta; +} + /* ---------- * AtEOXact_PgStat @@ -2901,6 +2921,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_inserted = tabmsg[i].t_counts.t_tuples_inserted; tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated; tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated; tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples; tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples; tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched; @@ -2923,6 +2944,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_inserted += tabmsg[i].t_counts.t_tuples_inserted; tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated; tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated; tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples; tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples; tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched; @@ -2931,6 +2953,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) /* Clamp n_live_tuples in case of negative new_live_tuples */ tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); /* * Add per-table stats to the per-database entry, too. @@ -3115,6 +3139,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) else tabentry->vacuum_timestamp = msg->m_vacuumtime; tabentry->n_live_tuples = msg->m_tuples; + /* Resetting dead_tuples to 0 is an approximation ... */ tabentry->n_dead_tuples = 0; if (msg->m_analyze) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 12564a69ee4..9c0ef67f6bb 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.223 2007/06/30 19:12:01 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.224 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2067,6 +2067,55 @@ LockBufferForCleanup(Buffer buffer) } /* + * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock + * + * We won't loop, but just check once to see if the pin count is OK. If + * not, return FALSE with no lock held. 
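A toy model of the dead-tuple accounting just added: pruning reports a nontransactional recovery of N dead tuples, so the per-table counter is decremented and later clamped to zero when the collector folds the message in (the counts are only approximate). The single variable below stands in for both the backend-side and collector-side counters.

    #include <stdio.h>

    static long clamp_nonnegative(long v)
    {
        return v < 0 ? 0 : v;
    }

    int main(void)
    {
        long n_dead_tuples = 5;

        /* pgstat_update_heap_dead_tuples(rel, 8) in the real code */
        n_dead_tuples -= 8;

        /* the collector clamps when folding tabstat messages into the entry */
        n_dead_tuples = clamp_nonnegative(n_dead_tuples);

        printf("n_dead_tuples = %ld\n", n_dead_tuples);
        return 0;
    }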
+ */ +bool +ConditionalLockBufferForCleanup(Buffer buffer) +{ + volatile BufferDesc *bufHdr; + + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + { + /* There should be exactly one pin */ + Assert(LocalRefCount[-buffer - 1] > 0); + if (LocalRefCount[-buffer - 1] != 1) + return false; + /* Nobody else to wait for */ + return true; + } + + /* There should be exactly one local pin */ + Assert(PrivateRefCount[buffer - 1] > 0); + if (PrivateRefCount[buffer - 1] != 1) + return false; + + /* Try to acquire lock */ + if (!ConditionalLockBuffer(buffer)) + return false; + + bufHdr = &BufferDescriptors[buffer - 1]; + LockBufHdr(bufHdr); + Assert(bufHdr->refcount > 0); + if (bufHdr->refcount == 1) + { + /* Successfully acquired exclusive lock with pincount 1 */ + UnlockBufHdr(bufHdr); + return true; + } + + /* Failed, so release the lock */ + UnlockBufHdr(bufHdr); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + return false; +} + + +/* * Functions for buffer I/O handling * * Note: We assume that nested buffer I/O never occurs. diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 3ce2f04bd8e..b382e4d0240 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -8,12 +8,13 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.73 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.74 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" +#include "access/htup.h" #include "storage/bufpage.h" @@ -108,6 +109,9 @@ PageHeaderIsValid(PageHeader page) * If offsetNumber is not valid, then assign one by finding the first * one that is both unused and deallocated. * + * If is_heap is true, we enforce that there can't be more than + * MaxHeapTuplesPerPage line pointers on the page. + * * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! */ OffsetNumber @@ -115,7 +119,8 @@ PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, - bool overwrite) + bool overwrite, + bool is_heap) { PageHeader phdr = (PageHeader) page; Size alignedSize; @@ -200,6 +205,12 @@ PageAddItem(Page page, return InvalidOffsetNumber; } + if (is_heap && offsetNumber > MaxHeapTuplesPerPage) + { + elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); + return InvalidOffsetNumber; + } + /* * Compute new lower and upper pointers for page, see if it'll fit. * @@ -315,11 +326,10 @@ itemoffcompare(const void *itemidp1, const void *itemidp2) * * This routine is usable for heap pages only, but see PageIndexMultiDelete. * - * Returns number of unused line pointers on page. If "unused" is not NULL - * then the unused[] array is filled with indexes of unused line pointers. + * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated. 
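A sketch of the intended calling pattern for ConditionalLockBufferForCleanup(): try for a cleanup lock without waiting, and simply skip the optional work if anyone else holds a pin. The helper below only models the pin-count rule; the real function also takes the buffer content lock conditionally.

    #include <stdbool.h>
    #include <stdio.h>

    static bool conditional_cleanup_lock(int pincount)
    {
        /* cleanup requires being the sole pin holder */
        return pincount == 1;
    }

    int main(void)
    {
        int pincount = 2;   /* pretend another backend has the page pinned */

        if (conditional_cleanup_lock(pincount))
            printf("got cleanup lock: prune and defragment the page\n");
        else
            printf("page is busy: skip opportunistic cleanup this time\n");
        return 0;
    }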
*/ -int -PageRepairFragmentation(Page page, OffsetNumber *unused) +void +PageRepairFragmentation(Page page) { Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; @@ -329,7 +339,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) ItemId lp; int nline, nstorage, - nused; + nunused; int i; Size totallen; Offset upper; @@ -352,13 +362,12 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) pd_lower, pd_upper, pd_special))); nline = PageGetMaxOffsetNumber(page); - nused = nstorage = 0; - for (i = 0; i < nline; i++) + nunused = nstorage = 0; + for (i = FirstOffsetNumber; i <= nline; i++) { - lp = PageGetItemId(page, i + 1); + lp = PageGetItemId(page, i); if (ItemIdIsUsed(lp)) { - nused++; if (ItemIdHasStorage(lp)) nstorage++; } @@ -366,9 +375,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) { /* Unused entries should have lp_len = 0, but make sure */ ItemIdSetUnused(lp); - /* Report to caller if asked for */ - if (unused) - unused[i - nused] = (OffsetNumber) i; + nunused++; } } @@ -431,18 +438,19 @@ PageRepairFragmentation(Page page, OffsetNumber *unused) } /* Set hint bit for PageAddItem */ - if (nused < nline) + if (nunused > 0) PageSetHasFreeLinePointers(page); else PageClearHasFreeLinePointers(page); - - return (nline - nused); } /* * PageGetFreeSpace * Returns the size of the free (allocatable) space on a page, * reduced by the space needed for a new line pointer. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. */ Size PageGetFreeSpace(Page page) @@ -465,7 +473,8 @@ PageGetFreeSpace(Page page) /* * PageGetExactFreeSpace - * Returns the size of the free (allocatable) space on a page. + * Returns the size of the free (allocatable) space on a page, + * without any consideration for adding/removing line pointers. */ Size PageGetExactFreeSpace(Page page) @@ -484,6 +493,73 @@ PageGetExactFreeSpace(Page page) /* + * PageGetHeapFreeSpace + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for a new line pointer. + * + * The difference between this and PageGetFreeSpace is that this will return + * zero if there are already MaxHeapTuplesPerPage line pointers in the page + * and none are free. We use this to enforce that no more than + * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although + * no more tuples than that could fit anyway, in the presence of redirected + * or dead line pointers it'd be possible to have too many line pointers. + * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit + * on the number of line pointers, we make this extra check.) + */ +Size +PageGetHeapFreeSpace(Page page) +{ + Size space; + + space = PageGetFreeSpace(page); + if (space > 0) + { + OffsetNumber offnum, nline; + + /* + * Are there already MaxHeapTuplesPerPage line pointers in the page? + */ + nline = PageGetMaxOffsetNumber(page); + if (nline >= MaxHeapTuplesPerPage) + { + if (PageHasFreeLinePointers((PageHeader) page)) + { + /* + * Since this is just a hint, we must confirm that there is + * indeed a free line pointer + */ + for (offnum = FirstOffsetNumber; offnum <= nline; offnum++) + { + ItemId lp = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(lp)) + break; + } + + if (offnum > nline) + { + /* + * The hint is wrong, but we can't clear it here since + * we don't have the ability to mark the page dirty. 
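A toy version of the rule PageGetHeapFreeSpace() adds on top of PageGetFreeSpace(): report zero space once a page already carries the maximum number of line pointers and none of them is reusable, even if bytes are free. The limit and sizes are invented; the real code also consults the PD_HAS_FREE_LINES hint before scanning.

    #include <stdio.h>

    #define MAX_HEAP_TUPLES_PER_PAGE 4   /* tiny stand-in for the real limit */

    static unsigned heap_free_space(unsigned freebytes, int nline, int nfree_lp)
    {
        if (freebytes > 0 &&
            nline >= MAX_HEAP_TUPLES_PER_PAGE &&
            nfree_lp == 0)
            return 0;            /* no room for another line pointer */
        return freebytes;
    }

    int main(void)
    {
        printf("%u\n", heap_free_space(512, 4, 0));  /* 0: line-pointer limit hit */
        printf("%u\n", heap_free_space(512, 4, 1));  /* 512: a pointer is reusable */
        return 0;
    }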
+ */ + space = 0; + } + } + else + { + /* + * Although the hint might be wrong, PageAddItem will believe + * it anyway, so we must believe it too. + */ + space = 0; + } + } + } + return space; +} + + +/* * PageIndexTupleDelete * * This routine does the work of removing a tuple from an index page. diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 55951cf71b9..954e174bb71 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/pgstatfuncs.c,v 1.44 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/pgstatfuncs.c,v 1.45 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,6 +28,7 @@ extern Datum pg_stat_get_tuples_fetched(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_inserted(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS); +extern Datum pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS); extern Datum pg_stat_get_dead_tuples(PG_FUNCTION_ARGS); extern Datum pg_stat_get_blocks_fetched(PG_FUNCTION_ARGS); @@ -170,6 +171,22 @@ pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS) Datum +pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->tuples_hot_updated); + + PG_RETURN_INT64(result); +} + + +Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 21aed6eadbe..43297281f5f 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -33,13 +33,14 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/plancache.c,v 1.10 2007/06/05 20:00:41 wieck Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/plancache.c,v 1.11 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "utils/plancache.h" +#include "access/transam.h" #include "catalog/namespace.h" #include "executor/executor.h" #include "optimizer/clauses.h" @@ -79,6 +80,7 @@ static void ScanQueryForRelids(Query *parsetree, void *arg); static bool ScanQueryWalker(Node *node, ScanQueryWalkerContext *context); static bool rowmark_member(List *rowMarks, int rt_index); +static bool plan_list_is_transient(List *stmt_list); static void PlanCacheCallback(Datum arg, Oid relid); static void InvalRelid(Oid relid, LOCKMODE lockmode, InvalRelidContext *context); @@ -322,6 +324,13 @@ StoreCachedPlan(CachedPlanSource *plansource, plan->stmt_list = stmt_list; plan->fully_planned = plansource->fully_planned; plan->dead = false; + if (plansource->fully_planned && plan_list_is_transient(stmt_list)) + { + Assert(TransactionIdIsNormal(TransactionXmin)); + plan->saved_xmin = TransactionXmin; + } + else + plan->saved_xmin = InvalidTransactionId; plan->refcount = 1; /* for the parent's link */ plan->generation = ++(plansource->generation); plan->context = plan_context; @@ -412,6 +421,15 @@ RevalidateCachedPlan(CachedPlanSource *plansource, bool useResOwner) 
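A minimal sketch of the saved_xmin bookkeeping in plancache.c: StoreCachedPlan() records the TransactionXmin a transient plan was built under, and RevalidateCachedPlan() marks the plan dead once that value has moved on. TransactionIds are plain integers here and the struct is a stand-in.

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int TransactionId;
    #define InvalidTransactionId 0

    struct toy_plan
    {
        bool          dead;
        TransactionId saved_xmin;
    };

    int main(void)
    {
        TransactionId TransactionXmin = 100;
        struct toy_plan plan = { false, InvalidTransactionId };
        bool transient = true;

        /* StoreCachedPlan(): remember the horizon for transient plans only */
        plan.saved_xmin = transient ? TransactionXmin : InvalidTransactionId;

        /* ... later, RevalidateCachedPlan() runs with an advanced horizon ... */
        TransactionXmin = 120;
        if (!plan.dead &&
            plan.saved_xmin != InvalidTransactionId &&
            plan.saved_xmin != TransactionXmin)
            plan.dead = true;    /* force a replan */

        printf("plan dead: %d\n", plan.dead);
        return 0;
    }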
AcquirePlannerLocks(plan->stmt_list, true); /* + * If plan was transient, check to see if TransactionXmin has + * advanced, and if so invalidate it. + */ + if (!plan->dead && + TransactionIdIsValid(plan->saved_xmin) && + !TransactionIdEquals(plan->saved_xmin, TransactionXmin)) + plan->dead = true; + + /* * By now, if any invalidation has happened, PlanCacheCallback * will have marked the plan dead. */ @@ -790,6 +808,28 @@ rowmark_member(List *rowMarks, int rt_index) } /* + * plan_list_is_transient: check if any of the plans in the list are transient. + */ +static bool +plan_list_is_transient(List *stmt_list) +{ + ListCell *lc; + + foreach(lc, stmt_list) + { + PlannedStmt *plannedstmt = (PlannedStmt *) lfirst(lc); + + if (!IsA(plannedstmt, PlannedStmt)) + continue; /* Ignore utility statements */ + + if (plannedstmt->transientPlan) + return true; + } + + return false; +} + +/* * PlanCacheComputeResultDesc: given a list of either fully-planned statements * or Queries, determine the result tupledesc it will produce. Returns NULL * if the execution will not return tuples. diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f69fb0c9362..8efa9e6c4e7 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.262 2007/07/25 22:16:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.263 2007/09/20 17:56:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -34,6 +34,7 @@ #include "access/reloptions.h" #include "access/xact.h" #include "catalog/catalog.h" +#include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/namespace.h" #include "catalog/pg_amop.h" @@ -51,6 +52,7 @@ #include "optimizer/clauses.h" #include "optimizer/planmain.h" #include "optimizer/prep.h" +#include "optimizer/var.h" #include "rewrite/rewriteDefine.h" #include "storage/fd.h" #include "storage/smgr.h" @@ -1658,6 +1660,10 @@ RelationReloadIndexInfo(Relation relation) index = (Form_pg_index) GETSTRUCT(tuple); relation->rd_index->indisvalid = index->indisvalid; + relation->rd_index->indcheckxmin = index->indcheckxmin; + relation->rd_index->indisready = index->indisready; + HeapTupleHeaderSetXmin(relation->rd_indextuple->t_data, + HeapTupleHeaderGetXmin(tuple->t_data)); ReleaseSysCache(tuple); } @@ -1762,6 +1768,7 @@ RelationClearRelation(Relation relation, bool rebuild) if (relation->rd_options) pfree(relation->rd_options); list_free(relation->rd_indexlist); + bms_free(relation->rd_indexattr); if (relation->rd_indexcxt) MemoryContextDelete(relation->rd_indexcxt); @@ -2969,6 +2976,7 @@ RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex) relation->rd_indexvalid = 2; /* mark list as forced */ /* must flag that we have a forced index list */ need_eoxact_work = true; + /* we deliberately do not change rd_indexattr */ } /* @@ -3140,6 +3148,91 @@ RelationGetIndexPredicate(Relation relation) return result; } +/* + * RelationGetIndexAttrBitmap -- get a bitmap of index attribute numbers + * + * The result has a bit set for each attribute used anywhere in the index + * definitions of all the indexes on this relation. (This includes not only + * simple index keys, but attributes used in expressions and partial-index + * predicates.) 
+ * + * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that + * we can include system attributes (e.g., OID) in the bitmap representation. + * + * The returned result is palloc'd in the caller's memory context and should + * be bms_free'd when not needed anymore. + */ +Bitmapset * +RelationGetIndexAttrBitmap(Relation relation) +{ + Bitmapset *indexattrs; + List *indexoidlist; + ListCell *l; + MemoryContext oldcxt; + + /* Quick exit if we already computed the result. */ + if (relation->rd_indexattr != NULL) + return bms_copy(relation->rd_indexattr); + + /* Fast path if definitely no indexes */ + if (!RelationGetForm(relation)->relhasindex) + return NULL; + + /* + * Get cached list of index OIDs + */ + indexoidlist = RelationGetIndexList(relation); + + /* Fall out if no indexes (but relhasindex was set) */ + if (indexoidlist == NIL) + return NULL; + + /* + * For each index, add referenced attributes to indexattrs. + */ + indexattrs = NULL; + foreach(l, indexoidlist) + { + Oid indexOid = lfirst_oid(l); + Relation indexDesc; + IndexInfo *indexInfo; + int i; + + indexDesc = index_open(indexOid, AccessShareLock); + + /* Extract index key information from the index's pg_index row */ + indexInfo = BuildIndexInfo(indexDesc); + + /* Collect simple attribute references */ + for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) + { + int attrnum = indexInfo->ii_KeyAttrNumbers[i]; + + if (attrnum != 0) + indexattrs = bms_add_member(indexattrs, + attrnum - FirstLowInvalidHeapAttributeNumber); + } + + /* Collect all attributes used in expressions, too */ + pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs); + + /* Collect all attributes in the index predicate, too */ + pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs); + + index_close(indexDesc, AccessShareLock); + } + + list_free(indexoidlist); + + /* Now save a copy of the bitmap in the relcache entry. 
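A rough sketch of the caching pattern RelationGetIndexAttrBitmap() follows: compute the attribute set once, keep a private copy attached to the long-lived relcache entry, and hand callers their own copy to modify or free. The static pointer and single-word set below are stand-ins for rd_indexattr and the Bitmapset API.

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned int *cached;   /* stand-in for relation->rd_indexattr */

    static unsigned int *get_index_attrs(void)
    {
        unsigned int *result = malloc(sizeof *result);

        if (cached)                      /* fast path: already computed */
        {
            *result = *cached;
            return result;
        }

        *result = (1u << 3) | (1u << 9); /* pretend these attrs are indexed */

        cached = malloc(sizeof *cached); /* save a private copy for next time */
        *cached = *result;

        return result;                   /* caller may modify or free this one */
    }

    int main(void)
    {
        unsigned int *a = get_index_attrs();
        unsigned int *b = get_index_attrs();   /* served from the cache */

        printf("%#x %#x\n", *a, *b);
        free(a);
        free(b);
        return 0;
    }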
*/ + oldcxt = MemoryContextSwitchTo(CacheMemoryContext); + relation->rd_indexattr = bms_copy(indexattrs); + MemoryContextSwitchTo(oldcxt); + + /* We return our original working copy for caller to play with */ + return indexattrs; +} + /* * load_relcache_init_file, write_relcache_init_file @@ -3465,6 +3558,7 @@ load_relcache_init_file(void) rel->rd_refcnt = 0; rel->rd_indexvalid = 0; rel->rd_indexlist = NIL; + rel->rd_indexattr = NULL; rel->rd_oidindex = InvalidOid; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index cade6a26aa9..fbe24c8e45f 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.126 2007/06/09 18:49:55 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.127 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -153,6 +153,10 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, extern bool heap_release_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, Relation stats_relation); +extern bool heap_hot_search_buffer(ItemPointer tid, Buffer buffer, + Snapshot snapshot, bool *all_dead); +extern bool heap_hot_search(ItemPointer tid, Relation relation, + Snapshot snapshot, bool *all_dead); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); @@ -183,6 +187,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, extern void heap_markpos(HeapScanDesc scan); extern void heap_restrpos(HeapScanDesc scan); +extern void heap_sync(Relation relation); + extern void heap_redo(XLogRecPtr lsn, XLogRecord *rptr); extern void heap_desc(StringInfo buf, uint8 xl_info, char *rec); extern void heap2_redo(XLogRecPtr lsn, XLogRecord *rptr); @@ -192,7 +198,10 @@ extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from, Buffer newbuf, HeapTuple newtup); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, - OffsetNumber *unused, int uncnt); + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused, + bool redirect_move); extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber *offsets, int offcnt); @@ -240,7 +249,13 @@ extern MinimalTuple minimal_tuple_from_heap_tuple(HeapTuple htup); extern HeapTuple heap_addheader(int natts, bool withoid, Size structlen, void *structure); -extern void heap_sync(Relation relation); +/* in heap/pruneheap.c */ +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + TransactionId OldestXmin); +extern int heap_page_prune(Relation relation, Buffer buffer, + TransactionId OldestXmin, + bool redirect_move, bool report_stats); +extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); /* in heap/syncscan.c */ extern void ss_report_location(Relation rel, BlockNumber location); diff --git a/src/include/access/htup.h b/src/include/access/htup.h index ee816c568a8..32a27c972cb 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the 
University of California * - * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.93 2007/04/06 04:21:43 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.94 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -184,8 +184,12 @@ typedef HeapTupleHeaderData *HeapTupleHeader; /* * information stored in t_infomask2: */ -#define HEAP_NATTS_MASK 0x7FF /* 11 bits for number of attributes */ -/* bits 0xF800 are currently unused */ +#define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */ +/* bits 0x3800 are available */ +#define HEAP_HOT_UPDATED 0x4000 /* tuple was HOT-updated */ +#define HEAP_ONLY_TUPLE 0x8000 /* this is heap-only tuple */ + +#define HEAP2_XACT_MASK 0xC000 /* visibility-related bits */ /* * HeapTupleHeader accessor macros @@ -201,7 +205,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader; #define HeapTupleHeaderSetXmin(tup, xid) \ ( \ - TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmin) \ + (tup)->t_choice.t_heap.t_xmin = (xid) \ ) #define HeapTupleHeaderGetXmax(tup) \ @@ -211,7 +215,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader; #define HeapTupleHeaderSetXmax(tup, xid) \ ( \ - TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmax) \ + (tup)->t_choice.t_heap.t_xmax = (xid) \ ) /* @@ -255,7 +259,7 @@ do { \ #define HeapTupleHeaderSetXvac(tup, xid) \ do { \ Assert((tup)->t_infomask & HEAP_MOVED); \ - TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field3.t_xvac); \ + (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ } while (0) #define HeapTupleHeaderGetDatumLength(tup) \ @@ -298,6 +302,43 @@ do { \ *((Oid *) ((char *)(tup) + (tup)->t_hoff - sizeof(Oid))) = (oid); \ } while (0) +/* + * Note that we stop considering a tuple HOT-updated as soon as it is known + * aborted or the would-be updating transaction is known aborted. For best + * efficiency, check tuple visibility before using this macro, so that the + * INVALID bits will be as up to date as possible. + */ +#define HeapTupleHeaderIsHotUpdated(tup) \ +( \ + ((tup)->t_infomask2 & HEAP_HOT_UPDATED) != 0 && \ + ((tup)->t_infomask & (HEAP_XMIN_INVALID | HEAP_XMAX_INVALID)) == 0 \ +) + +#define HeapTupleHeaderSetHotUpdated(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_HOT_UPDATED \ +) + +#define HeapTupleHeaderClearHotUpdated(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_HOT_UPDATED \ +) + +#define HeapTupleHeaderIsHeapOnly(tup) \ +( \ + (tup)->t_infomask2 & HEAP_ONLY_TUPLE \ +) + +#define HeapTupleHeaderSetHeapOnly(tup) \ +( \ + (tup)->t_infomask2 |= HEAP_ONLY_TUPLE \ +) + +#define HeapTupleHeaderClearHeapOnly(tup) \ +( \ + (tup)->t_infomask2 &= ~HEAP_ONLY_TUPLE \ +) + #define HeapTupleHeaderGetNatts(tup) \ ((tup)->t_infomask2 & HEAP_NATTS_MASK) @@ -331,6 +372,11 @@ do { \ * fit on one heap page. (Note that indexes could have more, because they * use a smaller tuple header.) We arrive at the divisor because each tuple * must be maxaligned, and it must have an associated item pointer. + * + * Note: with HOT, there could theoretically be more line pointers (not actual + * tuples) than this on a heap page. However we constrain the number of line + * pointers to this anyway, to avoid excessive line-pointer bloat and not + * require increases in the size of work arrays. 
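A toy demonstration of the two new t_infomask2 bits defined above: HEAP_HOT_UPDATED marks a tuple whose successor lives on the same page, and HEAP_ONLY_TUPLE marks a tuple reachable only through its HOT chain (it has no index entries of its own). The bit values are the ones from the patch; the struct is a stand-in for the tuple header.

    #include <stdio.h>

    #define HEAP_NATTS_MASK   0x07FF
    #define HEAP_HOT_UPDATED  0x4000
    #define HEAP_ONLY_TUPLE   0x8000

    struct toy_header
    {
        unsigned short t_infomask2;
    };

    int main(void)
    {
        struct toy_header oldtup = { 12 };   /* 12 attributes, no flags */
        struct toy_header newtup = { 12 };

        oldtup.t_infomask2 |= HEAP_HOT_UPDATED;  /* HeapTupleHeaderSetHotUpdated */
        newtup.t_infomask2 |= HEAP_ONLY_TUPLE;   /* HeapTupleHeaderSetHeapOnly */

        printf("old: natts=%u hot-updated=%d\n",
               (unsigned) (oldtup.t_infomask2 & HEAP_NATTS_MASK),
               (oldtup.t_infomask2 & HEAP_HOT_UPDATED) != 0);
        printf("new: natts=%u heap-only=%d\n",
               (unsigned) (newtup.t_infomask2 & HEAP_NATTS_MASK),
               (newtup.t_infomask2 & HEAP_ONLY_TUPLE) != 0);
        return 0;
    }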
*/ #define MaxHeapTuplesPerPage \ ((int) ((BLCKSZ - offsetof(PageHeaderData, pd_linp)) / \ @@ -484,6 +530,24 @@ typedef HeapTupleData *HeapTuple; #define HeapTupleHasExternal(tuple) \ (((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0) +#define HeapTupleIsHotUpdated(tuple) \ + HeapTupleHeaderIsHotUpdated((tuple)->t_data) + +#define HeapTupleSetHotUpdated(tuple) \ + HeapTupleHeaderSetHotUpdated((tuple)->t_data) + +#define HeapTupleClearHotUpdated(tuple) \ + HeapTupleHeaderClearHotUpdated((tuple)->t_data) + +#define HeapTupleIsHeapOnly(tuple) \ + HeapTupleHeaderIsHeapOnly((tuple)->t_data) + +#define HeapTupleSetHeapOnly(tuple) \ + HeapTupleHeaderSetHeapOnly((tuple)->t_data) + +#define HeapTupleClearHeapOnly(tuple) \ + HeapTupleHeaderClearHeapOnly((tuple)->t_data) + #define HeapTupleGetOid(tuple) \ HeapTupleHeaderGetOid((tuple)->t_data) @@ -497,27 +561,30 @@ typedef HeapTupleData *HeapTuple; * XLOG allows to store some information in high 4 bits of log * record xl_info field. We use 3 for opcode and one for init bit. */ -#define XLOG_HEAP_INSERT 0x00 -#define XLOG_HEAP_DELETE 0x10 -#define XLOG_HEAP_UPDATE 0x20 -#define XLOG_HEAP_MOVE 0x30 -#define XLOG_HEAP_CLEAN 0x40 -#define XLOG_HEAP_NEWPAGE 0x50 -#define XLOG_HEAP_LOCK 0x60 -#define XLOG_HEAP_INPLACE 0x70 -#define XLOG_HEAP_OPMASK 0x70 +#define XLOG_HEAP_INSERT 0x00 +#define XLOG_HEAP_DELETE 0x10 +#define XLOG_HEAP_UPDATE 0x20 +#define XLOG_HEAP_MOVE 0x30 +#define XLOG_HEAP_HOT_UPDATE 0x40 +#define XLOG_HEAP_NEWPAGE 0x50 +#define XLOG_HEAP_LOCK 0x60 +#define XLOG_HEAP_INPLACE 0x70 + +#define XLOG_HEAP_OPMASK 0x70 /* * When we insert 1st item on new page in INSERT/UPDATE * we can (and we do) restore entire page in redo */ -#define XLOG_HEAP_INIT_PAGE 0x80 +#define XLOG_HEAP_INIT_PAGE 0x80 /* * We ran out of opcodes, so heapam.c now has a second RmgrId. These opcodes * are associated with RM_HEAP2_ID, but are not logically different from * the ones above associated with RM_HEAP_ID. We apply XLOG_HEAP_OPMASK, * although currently XLOG_HEAP_INIT_PAGE is not used for any of these. */ -#define XLOG_HEAP2_FREEZE 0x00 +#define XLOG_HEAP2_FREEZE 0x00 +#define XLOG_HEAP2_CLEAN 0x10 +#define XLOG_HEAP2_CLEAN_MOVE 0x20 /* * All what we need to find changed tuple @@ -569,7 +636,7 @@ typedef struct xl_heap_insert #define SizeOfHeapInsert (offsetof(xl_heap_insert, target) + SizeOfHeapTid) -/* This is what we need to know about update|move */ +/* This is what we need to know about update|move|hot_update */ typedef struct xl_heap_update { xl_heaptid target; /* deleted tuple id */ @@ -580,15 +647,34 @@ typedef struct xl_heap_update #define SizeOfHeapUpdate (offsetof(xl_heap_update, newtid) + SizeOfIptrData) -/* This is what we need to know about vacuum page cleanup */ +/* + * This is what we need to know about vacuum page cleanup/redirect + * + * The array of OffsetNumbers following the fixed part of the record contains: + * * for each redirected item: the item offset, then the offset redirected to + * * for each now-dead item: the item offset + * * for each now-unused item: the item offset + * The total number of OffsetNumbers is therefore 2*nredirected+ndead+nunused. + * Note that nunused is not explicitly stored, but may be found by reference + * to the total record length. + * + * If the opcode is CLEAN_MOVE instead of CLEAN, then each redirection pair + * should be interpreted as physically moving the "to" item pointer to the + * "from" slot, rather than placing a redirection item in the "from" slot. 
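A small sketch of the offset-number layout the reworked xl_heap_clean record describes above: 2*nredirected entries forming (from, to) pairs, then ndead entries, then the now-unused entries whose count is implied by the total record length. Plain arrays stand in for the WAL record.

    #include <stdio.h>

    typedef unsigned short OffsetNumber;

    int main(void)
    {
        OffsetNumber offsets[] = { 2, 5,    /* redirect item 2 -> item 5 */
                                   7,       /* now dead */
                                   3, 4 };  /* now unused */
        int nredirected = 1;
        int ndead = 1;
        int total = (int) (sizeof(offsets) / sizeof(offsets[0]));
        int nunused = total - 2 * nredirected - ndead;   /* derived, not stored */

        printf("redirected pairs: %d, dead: %d, unused: %d\n",
               nredirected, ndead, nunused);
        for (int i = 0; i < nredirected; i++)
            printf("redirect %u -> %u\n",
                   (unsigned) offsets[2 * i], (unsigned) offsets[2 * i + 1]);
        return 0;
    }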
+ * The moved pointers should be replaced by LP_UNUSED items (there will not + * be explicit entries in the "now-unused" list for this). Also, the + * HEAP_ONLY bit in the moved tuples must be turned off. + */ typedef struct xl_heap_clean { RelFileNode node; BlockNumber block; - /* UNUSED OFFSET NUMBERS FOLLOW AT THE END */ + uint16 nredirected; + uint16 ndead; + /* OFFSET NUMBERS FOLLOW */ } xl_heap_clean; -#define SizeOfHeapClean (offsetof(xl_heap_clean, block) + sizeof(BlockNumber)) +#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16)) /* This is for replacing a page's contents in toto */ /* NB: this is used for indexes as well as heaps */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 15b9b8a3374..b145e09e36a 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.56 2007/06/09 18:49:55 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/relscan.h,v 1.57 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -82,6 +82,9 @@ typedef struct IndexScanDescData HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + TransactionId xs_prev_xmax; /* previous HOT chain member's XMAX, if any */ + OffsetNumber xs_next_hot; /* next member of HOT chain, if any */ + bool xs_hot_dead; /* T if all members of HOT chain are dead */ } IndexScanDescData; typedef IndexScanDescData *IndexScanDesc; diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index e21606a1259..c295aab857c 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.425 2007/09/18 17:41:17 adunstan Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.426 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200709181 +#define CATALOG_VERSION_NO 200709201 #endif diff --git a/src/include/catalog/pg_attribute.h b/src/include/catalog/pg_attribute.h index 7970cac3c48..006a8e4392d 100644 --- a/src/include/catalog/pg_attribute.h +++ b/src/include/catalog/pg_attribute.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_attribute.h,v 1.132 2007/09/03 00:39:21 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_attribute.h,v 1.133 2007/09/20 17:56:32 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -471,10 +471,12 @@ DATA(insert ( 1259 tableoid 26 0 4 -7 0 -1 -1 t p i t f f t 0)); { 0, {"indisprimary"}, 16, -1, 1, 5, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ { 0, {"indisclustered"}, 16, -1, 1, 6, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ { 0, {"indisvalid"}, 16, -1, 1, 7, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ -{ 0, 
{"indkey"}, 22, -1, -1, 8, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ -{ 0, {"indclass"}, 30, -1, -1, 9, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ -{ 0, {"indoption"}, 22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ -{ 0, {"indexprs"}, 25, -1, -1, 11, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \ -{ 0, {"indpred"}, 25, -1, -1, 12, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 } +{ 0, {"indcheckxmin"}, 16, -1, 1, 8, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ +{ 0, {"indisready"}, 16, -1, 1, 9, 0, -1, -1, true, 'p', 'c', true, false, false, true, 0 }, \ +{ 0, {"indkey"}, 22, -1, -1, 10, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ +{ 0, {"indclass"}, 30, -1, -1, 11, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ +{ 0, {"indoption"}, 22, -1, -1, 12, 1, -1, -1, false, 'p', 'i', true, false, false, true, 0 }, \ +{ 0, {"indexprs"}, 25, -1, -1, 13, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 }, \ +{ 0, {"indpred"}, 25, -1, -1, 14, 0, -1, -1, false, 'x', 'i', false, false, false, true, 0 } #endif /* PG_ATTRIBUTE_H */ diff --git a/src/include/catalog/pg_index.h b/src/include/catalog/pg_index.h index 31c6e25fb0d..f74ff4af0b9 100644 --- a/src/include/catalog/pg_index.h +++ b/src/include/catalog/pg_index.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_index.h,v 1.43 2007/01/09 02:14:15 tgl Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_index.h,v 1.44 2007/09/20 17:56:32 tgl Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -42,6 +42,8 @@ CATALOG(pg_index,2610) BKI_WITHOUT_OIDS bool indisprimary; /* is this index for primary key? */ bool indisclustered; /* is this the index last clustered by? */ bool indisvalid; /* is this index valid for use by queries? */ + bool indcheckxmin; /* must we wait for xmin to be old? */ + bool indisready; /* is this index ready for inserts? 
*/ /* VARIABLE LENGTH FIELDS: */ int2vector indkey; /* column numbers of indexed cols, or 0 */ @@ -65,7 +67,7 @@ typedef FormData_pg_index *Form_pg_index; * compiler constants for pg_index * ---------------- */ -#define Natts_pg_index 12 +#define Natts_pg_index 14 #define Anum_pg_index_indexrelid 1 #define Anum_pg_index_indrelid 2 #define Anum_pg_index_indnatts 3 @@ -73,11 +75,13 @@ typedef FormData_pg_index *Form_pg_index; #define Anum_pg_index_indisprimary 5 #define Anum_pg_index_indisclustered 6 #define Anum_pg_index_indisvalid 7 -#define Anum_pg_index_indkey 8 -#define Anum_pg_index_indclass 9 -#define Anum_pg_index_indoption 10 -#define Anum_pg_index_indexprs 11 -#define Anum_pg_index_indpred 12 +#define Anum_pg_index_indcheckxmin 8 +#define Anum_pg_index_indisready 9 +#define Anum_pg_index_indkey 10 +#define Anum_pg_index_indclass 11 +#define Anum_pg_index_indoption 12 +#define Anum_pg_index_indexprs 13 +#define Anum_pg_index_indpred 14 /* * Index AMs that support ordered scans must support these two indoption diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 5f534839b23..3eaead16bda 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.470 2007/09/18 17:41:17 adunstan Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.471 2007/09/20 17:56:32 tgl Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2873,6 +2873,8 @@ DATA(insert OID = 1932 ( pg_stat_get_tuples_updated PGNSP PGUID 12 1 0 f f t f DESCR("statistics: number of tuples updated"); DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_deleted - _null_ _null_ )); DESCR("statistics: number of tuples deleted"); +DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_tuples_hot_updated - _null_ _null_ )); +DESCR("statistics: number of tuples hot updated"); DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_live_tuples - _null_ _null_ )); DESCR("statistics: number of live tuples"); DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 f f t f s 1 20 "26" _null_ _null_ _null_ pg_stat_get_dead_tuples - _null_ _null_ )); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index d886c0149fc..82f851eeacc 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.177 2007/08/15 21:39:50 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.178 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -37,7 +37,12 @@ * Predicate partial-index predicate, or NIL if none * PredicateState exec state for predicate, or NIL if none * Unique is it a unique index? + * ReadyForInserts is it valid for inserts? * Concurrent are we doing a concurrent index build? + * BrokenHotChain did we detect any broken HOT chains? 
+ * + * ii_Concurrent and ii_BrokenHotChain are used only during index build; + * they're conventionally set to false otherwise. * ---------------- */ typedef struct IndexInfo @@ -50,7 +55,9 @@ typedef struct IndexInfo List *ii_Predicate; /* list of Expr */ List *ii_PredicateState; /* list of ExprState */ bool ii_Unique; + bool ii_ReadyForInserts; bool ii_Concurrent; + bool ii_BrokenHotChain; } IndexInfo; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 9d336e4b889..992b47f58d8 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.94 2007/04/27 22:05:49 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/plannodes.h,v 1.95 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,6 +39,8 @@ typedef struct PlannedStmt bool canSetTag; /* do I set the command result tag? */ + bool transientPlan; /* redo plan when TransactionXmin changes? */ + struct Plan *planTree; /* tree of Plan nodes */ List *rtable; /* list of RangeTblEntry nodes */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index ab26491a629..32c699b6de6 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.145 2007/08/31 01:44:06 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.146 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,6 +71,8 @@ typedef struct PlannerGlobal Bitmapset *rewindPlanIDs; /* indices of subplans that require REWIND */ List *finalrtable; /* "flat" rangetable for executor */ + + bool transientPlan; /* redo plan when TransactionXmin changes? 
*/ } PlannerGlobal; /* macro for fetching the Plan associated with a SubPlan node */ diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 34f8c73f3f6..824ba5a1a4b 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/var.h,v 1.35 2007/01/05 22:19:56 momjian Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/var.h,v 1.36 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -18,6 +18,7 @@ extern Relids pull_varnos(Node *node); +extern void pull_varattnos(Node *node, Bitmapset **varattnos); extern bool contain_var_reference(Node *node, int varno, int varattno, int levelsup); extern bool contain_var_clause(Node *node); diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 93f08cd2fbf..9cdeb2ee909 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -5,7 +5,7 @@ * * Copyright (c) 2001-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/pgstat.h,v 1.65 2007/09/11 03:28:05 tgl Exp $ + * $PostgreSQL: pgsql/src/include/pgstat.h,v 1.66 2007/09/20 17:56:32 tgl Exp $ * ---------- */ #ifndef PGSTAT_H @@ -55,10 +55,10 @@ typedef int64 PgStat_Counter; * the index AM, while tuples_fetched is the number of tuples successfully * fetched by heap_fetch under the control of simple indexscans for this index. * - * tuples_inserted/tuples_updated/tuples_deleted count attempted actions, + * tuples_inserted/updated/deleted/hot_updated count attempted actions, * regardless of whether the transaction committed. new_live_tuples and * new_dead_tuples are properly adjusted depending on commit or abort. - * Note that new_live_tuples can be negative! + * Note that new_live_tuples and new_dead_tuples can be negative! 
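A toy model of the split introduced in pgstat_count_heap_update(): every update still bumps tuples_updated, and a HOT update additionally bumps the new tuples_hot_updated counter, so the difference between the two is the number of updates that had to insert new index entries.

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_counts
    {
        long updated;
        long hot_updated;
    };

    static void count_update(struct toy_counts *c, bool hot)
    {
        c->updated++;
        if (hot)
            c->hot_updated++;
    }

    int main(void)
    {
        struct toy_counts c = { 0, 0 };

        count_update(&c, true);
        count_update(&c, true);
        count_update(&c, false);

        printf("updated=%ld hot=%ld index-touching=%ld\n",
               c.updated, c.hot_updated, c.updated - c.hot_updated);
        return 0;
    }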
* ---------- */ typedef struct PgStat_TableCounts @@ -71,6 +71,7 @@ typedef struct PgStat_TableCounts PgStat_Counter t_tuples_inserted; PgStat_Counter t_tuples_updated; PgStat_Counter t_tuples_deleted; + PgStat_Counter t_tuples_hot_updated; PgStat_Counter t_new_live_tuples; PgStat_Counter t_new_dead_tuples; @@ -323,7 +324,7 @@ typedef union PgStat_Msg * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BC96 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BC97 /* ---------- * PgStat_StatDBEntry The collector's data per database @@ -367,6 +368,7 @@ typedef struct PgStat_StatTabEntry PgStat_Counter tuples_inserted; PgStat_Counter tuples_updated; PgStat_Counter tuples_deleted; + PgStat_Counter tuples_hot_updated; PgStat_Counter n_live_tuples; PgStat_Counter n_dead_tuples; @@ -545,8 +547,9 @@ extern void pgstat_initstats(Relation rel); } while (0) extern void pgstat_count_heap_insert(Relation rel); -extern void pgstat_count_heap_update(Relation rel); +extern void pgstat_count_heap_update(Relation rel, bool hot); extern void pgstat_count_heap_delete(Relation rel); +extern void pgstat_update_heap_dead_tuples(Relation rel, int delta); extern void AtEOXact_PgStat(bool isCommit); extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 7a4190d044b..1324befa1e2 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.106 2007/07/25 12:22:53 mha Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.107 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -156,6 +156,7 @@ extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); +extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern void AbortBufferIO(void); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 7e6e429108d..8ca2dd8e38f 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.73 2007/09/12 22:10:26 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.74 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -140,10 +140,21 @@ typedef PageHeaderData *PageHeader; * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before * pd_lower. This should be considered a hint rather than the truth, since * changes to it are not WAL-logged. + * + * PD_PRUNABLE is set if there are any prunable tuples in the page. + * This should be considered a hint rather than the truth, since + * the transaction which generates a prunable tuple may or may not commit. + * Also there is a lag before a tuple is declared dead. + * + * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the + * page for its new tuple version; this suggests that a prune is needed. + * Again, this is just a hint. 
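A toy demonstration of the new pd_flags hint bits described above; the bit values match the definitions that follow, and the "page header" is reduced to a single field.

    #include <stdio.h>

    #define PD_HAS_FREE_LINES  0x0001
    #define PD_PRUNABLE        0x0002
    #define PD_PAGE_FULL       0x0004
    #define PD_VALID_FLAG_BITS 0x0007

    struct toy_page
    {
        unsigned short pd_flags;
    };

    int main(void)
    {
        struct toy_page page = { 0 };

        page.pd_flags |= PD_PRUNABLE;    /* PageSetPrunable: a prunable tuple may exist */
        page.pd_flags |= PD_PAGE_FULL;   /* PageSetFull: an UPDATE ran out of room */

        printf("prunable=%d full=%d all-bits-valid=%d\n",
               (page.pd_flags & PD_PRUNABLE) != 0,
               (page.pd_flags & PD_PAGE_FULL) != 0,
               (page.pd_flags & ~PD_VALID_FLAG_BITS) == 0);
        return 0;
    }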
*/ #define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ +#define PD_PRUNABLE 0x0002 /* are there any prunable tuples? */ +#define PD_PAGE_FULL 0x0004 /* not enough free space for new tuple? */ -#define PD_VALID_FLAG_BITS 0x0001 /* OR of all valid pd_flags bits */ +#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. @@ -337,6 +348,20 @@ typedef PageHeaderData *PageHeader; #define PageClearHasFreeLinePointers(page) \ (((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES) +#define PageIsPrunable(page) \ + (((PageHeader) (page))->pd_flags & PD_PRUNABLE) +#define PageSetPrunable(page) \ + (((PageHeader) (page))->pd_flags |= PD_PRUNABLE) +#define PageClearPrunable(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_PRUNABLE) + +#define PageIsFull(page) \ + (((PageHeader) (page))->pd_flags & PD_PAGE_FULL) +#define PageSetFull(page) \ + (((PageHeader) (page))->pd_flags |= PD_PAGE_FULL) +#define PageClearFull(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL) + /* ---------------------------------------------------------------- * extern declarations @@ -346,12 +371,13 @@ typedef PageHeaderData *PageHeader; extern void PageInit(Page page, Size pageSize, Size specialSize); extern bool PageHeaderIsValid(PageHeader page); extern OffsetNumber PageAddItem(Page page, Item item, Size size, - OffsetNumber offsetNumber, bool overwrite); + OffsetNumber offsetNumber, bool overwrite, bool is_heap); extern Page PageGetTempPage(Page page, Size specialSize); extern void PageRestoreTempPage(Page tempPage, Page oldPage); -extern int PageRepairFragmentation(Page page, OffsetNumber *unused); +extern void PageRepairFragmentation(Page page); extern Size PageGetFreeSpace(Page page); extern Size PageGetExactFreeSpace(Page page); +extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index d8152142aac..0a91e886e29 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.7 2007/06/05 20:00:41 wieck Exp $ + * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.8 2007/09/20 17:56:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -75,6 +75,8 @@ typedef struct CachedPlan List *stmt_list; /* list of statement or Query nodes */ bool fully_planned; /* do we cache planner or rewriter output? 
diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h
index d8152142aac..0a91e886e29 100644
--- a/src/include/utils/plancache.h
+++ b/src/include/utils/plancache.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.7 2007/06/05 20:00:41 wieck Exp $
+ * $PostgreSQL: pgsql/src/include/utils/plancache.h,v 1.8 2007/09/20 17:56:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -75,6 +75,8 @@ typedef struct CachedPlan
 	List	   *stmt_list;		/* list of statement or Query nodes */
 	bool		fully_planned;	/* do we cache planner or rewriter output? */
 	bool		dead;			/* if true, do not use */
+	TransactionId saved_xmin;	/* if valid, replan when TransactionXmin
+								 * changes from this value */
 	int			refcount;		/* count of live references to this struct */
 	int			generation;		/* counter, starting at 1, for replans */
 	MemoryContext context;		/* context containing this CachedPlan */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index bc6bf190b86..48569c583b2 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.101 2007/05/27 03:50:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.102 2007/09/20 17:56:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,7 @@
 #include "catalog/pg_class.h"
 #include "catalog/pg_index.h"
 #include "fmgr.h"
+#include "nodes/bitmapset.h"
 #include "rewrite/prs2lock.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
@@ -145,6 +146,7 @@ typedef struct RelationData
 	TupleDesc	rd_att;			/* tuple descriptor */
 	Oid			rd_id;			/* relation's object id */
 	List	   *rd_indexlist;	/* list of OIDs of indexes on relation */
+	Bitmapset  *rd_indexattr;	/* identifies columns used in indexes */
 	Oid			rd_oidindex;	/* OID of unique index on OID, if any */
 	LockInfoData rd_lockInfo;	/* lock mgr's info for locking relation */
 	RuleLock   *rd_rules;		/* rewrite rules */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 25b60082a09..a2b6f21248f 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.59 2007/03/29 00:15:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.60 2007/09/20 17:56:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -29,6 +29,7 @@ extern List *RelationGetIndexList(Relation relation);
 extern Oid	RelationGetOidIndex(Relation relation);
 extern List *RelationGetIndexExpressions(Relation relation);
 extern List *RelationGetIndexPredicate(Relation relation);
+extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation);
 extern void RelationSetIndexList(Relation relation,
 					 List *indexIds, Oid oidIndex);
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index adec0e6c847..3483ba15554 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -415,6 +415,7 @@ Table "public.concur_heap"
  f2     | text |
 Indexes:
     "concur_index2" UNIQUE, btree (f1)
+    "concur_index3" UNIQUE, btree (f2) INVALID
     "concur_index1" btree (f2, f1)
     "concur_index4" btree (f2) WHERE f1 = 'a'::text
     "concur_index5" btree (f2) WHERE f1 = 'x'::text
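Editor's note: the new rd_indexattr field and RelationGetIndexAttrBitmap() expose a cached bitmap of every column referenced by the relation's indexes, which is exactly the information needed to decide whether an UPDATE touches any indexed column. A rough sketch of such a check; the helper name is hypothetical, and the offset convention used for bitmap members is an assumption (it is stated in a comment so it can be adjusted if the real convention differs).

#include "postgres.h"
#include "access/htup.h"
#include "nodes/bitmapset.h"
#include "utils/rel.h"
#include "utils/relcache.h"

/*
 * Hypothetical helper, for illustration only: does any index of "rel"
 * reference attribute "attnum"?  Assumes the bitmap stores attribute
 * numbers offset by FirstLowInvalidHeapAttributeNumber so that system
 * columns can be represented.
 */
static bool
attr_is_indexed(Relation rel, AttrNumber attnum)
{
	Bitmapset  *indexattrs = RelationGetIndexAttrBitmap(rel);

	return bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber,
						 indexattrs);
}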
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index bafce821eba..3fc65ea2350 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1291,13 +1291,13 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
 pg_shadow | SELECT pg_authid.rolname AS usename, pg_authid.oid AS usesysid, pg_authid.rolcreatedb AS usecreatedb, pg_authid.rolsuper AS usesuper, pg_authid.rolcatupdate AS usecatupd, pg_authid.rolpassword AS passwd, (pg_authid.rolvaliduntil)::abstime AS valuntil, pg_authid.rolconfig AS useconfig FROM pg_authid WHERE pg_authid.rolcanlogin;
 pg_stat_activity | SELECT d.oid AS datid, d.datname, pg_stat_get_backend_pid(s.backendid) AS procpid, pg_stat_get_backend_userid(s.backendid) AS usesysid, u.rolname AS usename, pg_stat_get_backend_activity(s.backendid) AS current_query, pg_stat_get_backend_waiting(s.backendid) AS waiting, pg_stat_get_backend_xact_start(s.backendid) AS xact_start, pg_stat_get_backend_activity_start(s.backendid) AS query_start, pg_stat_get_backend_start(s.backendid) AS backend_start, pg_stat_get_backend_client_addr(s.backendid) AS client_addr, pg_stat_get_backend_client_port(s.backendid) AS client_port FROM pg_database d, (SELECT pg_stat_get_backend_idset() AS backendid) s, pg_authid u WHERE ((pg_stat_get_backend_dbid(s.backendid) = d.oid) AND (pg_stat_get_backend_userid(s.backendid) = u.oid));
 pg_stat_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, pg_stat_get_numscans(i.oid) AS idx_scan, pg_stat_get_tuples_returned(i.oid) AS idx_tup_read, pg_stat_get_tuples_fetched(i.oid) AS idx_tup_fetch FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
- pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
+ pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname;
 pg_stat_bgwriter | SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints_timed, pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req, pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint, pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean, pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean;
 pg_stat_database | SELECT d.oid AS datid, d.datname, pg_stat_get_db_numbackends(d.oid) AS numbackends, pg_stat_get_db_xact_commit(d.oid) AS xact_commit, pg_stat_get_db_xact_rollback(d.oid) AS xact_rollback, (pg_stat_get_db_blocks_fetched(d.oid) - pg_stat_get_db_blocks_hit(d.oid)) AS blks_read, pg_stat_get_db_blocks_hit(d.oid) AS blks_hit, pg_stat_get_db_tuples_returned(d.oid) AS tup_returned, pg_stat_get_db_tuples_fetched(d.oid) AS tup_fetched, pg_stat_get_db_tuples_inserted(d.oid) AS tup_inserted, pg_stat_get_db_tuples_updated(d.oid) AS tup_updated, pg_stat_get_db_tuples_deleted(d.oid) AS tup_deleted FROM pg_database d;
 pg_stat_sys_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_indexes.schemaname ~ '^pg_toast'::text));
- pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
+ pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text));
 pg_stat_user_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_indexes.schemaname !~ '^pg_toast'::text));
- pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
+ pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text));
 pg_statio_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, (pg_stat_get_blocks_fetched(i.oid) - pg_stat_get_blocks_hit(i.oid)) AS idx_blks_read, pg_stat_get_blocks_hit(i.oid) AS idx_blks_hit FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"]));
 pg_statio_all_sequences | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS blks_read, pg_stat_get_blocks_hit(c.oid) AS blks_hit FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'S'::"char");
 pg_statio_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS heap_blks_read, pg_stat_get_blocks_hit(c.oid) AS heap_blks_hit, (sum((pg_stat_get_blocks_fetched(i.indexrelid) - pg_stat_get_blocks_hit(i.indexrelid))))::bigint AS idx_blks_read, (sum(pg_stat_get_blocks_hit(i.indexrelid)))::bigint AS idx_blks_hit, (pg_stat_get_blocks_fetched(t.oid) - pg_stat_get_blocks_hit(t.oid)) AS toast_blks_read, pg_stat_get_blocks_hit(t.oid) AS toast_blks_hit, (pg_stat_get_blocks_fetched(x.oid) - pg_stat_get_blocks_hit(x.oid)) AS tidx_blks_read, pg_stat_get_blocks_hit(x.oid) AS tidx_blks_hit FROM ((((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_class t ON ((c.reltoastrelid = t.oid))) LEFT JOIN pg_class x ON ((t.reltoastidxid = x.oid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname, t.oid, x.oid;
 |
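Editor's note: the updated views rely on a new SQL-callable counter accessor, pg_stat_get_tuples_hot_updated(), whose implementation is not part of this excerpt. A sketch of what it presumably looks like, following the template of the existing per-table pg_stat_get_* accessors and the tuples_hot_updated field added to PgStat_StatTabEntry above.

#include "postgres.h"
#include "fmgr.h"
#include "pgstat.h"

/*
 * Sketch only: the real function (in pgstatfuncs.c) is not shown in this
 * excerpt.  It should mirror the other per-table counter accessors: look up
 * the table's stats entry and return the counter, or 0 if there is none.
 */
Datum
pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS)
{
	Oid			relid = PG_GETARG_OID(0);
	int64		result;
	PgStat_StatTabEntry *tabentry;

	if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL)
		result = 0;
	else
		result = (int64) (tabentry->tuples_hot_updated);

	PG_RETURN_INT64(result);
}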