Diffstat (limited to 'src/backend/access'):

 src/backend/access/gin/ginentrypage.c  |  10
 src/backend/access/gin/ginvacuum.c     |   4
 src/backend/access/gin/ginxlog.c       |  10
 src/backend/access/gist/gist.c         |   4
 src/backend/access/gist/gistutil.c     |   4
 src/backend/access/gist/gistvacuum.c   |   4
 src/backend/access/hash/hashinsert.c   |   4
 src/backend/access/hash/hashovfl.c     |   4
 src/backend/access/hash/hashpage.c     |   4
 src/backend/access/heap/Makefile       |   4
 src/backend/access/heap/README.HOT     | 489
 src/backend/access/heap/heapam.c       | 640
 src/backend/access/heap/hio.c          |   8
 src/backend/access/heap/pruneheap.c    | 702
 src/backend/access/heap/rewriteheap.c  |   8
 src/backend/access/index/genam.c       |   6
 src/backend/access/index/indexam.c     | 259
 src/backend/access/nbtree/nbtinsert.c  |  81
 src/backend/access/nbtree/nbtsort.c    |   4
 src/backend/access/nbtree/nbtxlog.c    |  12
 20 files changed, 2063 insertions(+), 198 deletions(-)
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index a5253da0211..70867ac40ba 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.8 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.9 2007/09/20 17:56:30 tgl Exp $
*-------------------------------------------------------------------------
*/
@@ -359,7 +359,7 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd
*prdata = rdata;
data.updateBlkno = entryPreparePage(btree, page, off);
- placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false);
+ placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
if (placed != off)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(btree->index));
@@ -488,7 +488,7 @@ entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogR
lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
}
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(btree->index));
ptr += MAXALIGN(IndexTupleSize(itup));
@@ -563,11 +563,11 @@ entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
page = BufferGetPage(root);
itup = ginPageGetLinkItup(lbuf);
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
itup = ginPageGetLinkItup(rbuf);
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
}
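
Every PageAddItem() call touched by this patch gains a trailing "false".
For context, the widened prototype in src/include/storage/bufpage.h
presumably reads roughly as follows; this is reconstructed from the call
sites here rather than quoted from the header, so treat the parameter
names as an approximation:

    extern OffsetNumber PageAddItem(Page page, Item item, Size size,
                                    OffsetNumber offsetNumber,
                                    bool overwrite,   /* may replace an unused item */
                                    bool is_heap);    /* enforce MaxHeapTuplesPerPage */

Index access methods, as in the hunks above and below, pass false for the
new flag; the heap insertion path passes true, which is what lets pruning
cap the number of line pointers on a heap page (see README.HOT below).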
diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c
index 91f7f3e5f8b..1f26869d646 100644
--- a/src/backend/access/gin/ginvacuum.c
+++ b/src/backend/access/gin/ginvacuum.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.16 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/gin/ginvacuum.c,v 1.17 2007/09/20 17:56:30 tgl Exp $
*-------------------------------------------------------------------------
*/
@@ -544,7 +544,7 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3
itup = GinFormTuple(&gvs->ginstate, value, GinGetPosting(itup), newN);
PageIndexTupleDelete(tmppage, i);
- if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false) != i)
+ if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(gvs->index));
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index db2d6b39336..bf2174c37c7 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.8 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.9 2007/09/20 17:56:30 tgl Exp $
*-------------------------------------------------------------------------
*/
#include "postgres.h"
@@ -199,7 +199,7 @@ ginRedoInsert(XLogRecPtr lsn, XLogRecord *record)
itup = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsert));
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false) == InvalidOffsetNumber)
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), data->offset, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
@@ -281,7 +281,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
for (i = 0; i < data->separator; i++)
{
- if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(lpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
@@ -289,7 +289,7 @@ ginRedoSplit(XLogRecPtr lsn, XLogRecord *record)
for (i = data->separator; i < data->nitem; i++)
{
- if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(rpage, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
@@ -375,7 +375,7 @@ ginRedoVacuumPage(XLogRecPtr lsn, XLogRecord *record)
for (i = 0; i < data->nitem; i++)
{
- if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in %u/%u/%u",
data->node.spcNode, data->node.dbNode, data->node.relNode);
itup = (IndexTuple) (((char *) itup) + MAXALIGN(IndexTupleSize(itup)));
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index fce9a94ebae..0c1b94d7d38 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.146 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/gist/gist.c,v 1.147 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -366,7 +366,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate)
data = (char *) (ptr->list);
for (i = 0; i < ptr->block.num; i++)
{
- if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
data += IndexTupleSize((IndexTuple) data);
}
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 6d4f31d53b2..409377d1d14 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.23 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/gist/gistutil.c,v 1.24 2007/09/20 17:56:30 tgl Exp $
*-------------------------------------------------------------------------
*/
#include "postgres.h"
@@ -42,7 +42,7 @@ gistfillbuffer(Relation r, Page page, IndexTuple *itup,
for (i = 0; i < len; i++)
{
l = PageAddItem(page, (Item) itup[i], IndexTupleSize(itup[i]),
- off, false);
+ off, false, false);
if (l == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(r));
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 0abd0197ad3..212995e7c57 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.31 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/gist/gistvacuum.c,v 1.32 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -201,7 +201,7 @@ vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon,
data = (char *) (ptr->list);
for (i = 0; i < ptr->block.num; i++)
{
- if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index));
data += IndexTupleSize((IndexTuple) data);
}
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index c82ad0ad9fb..8cbf0294acf 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.46 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.47 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -200,7 +200,7 @@ _hash_pgaddtup(Relation rel,
page = BufferGetPage(buf);
itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
- if (PageAddItem(page, (Item) itup, itemsize, itup_off, false)
+ if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
== InvalidOffsetNumber)
elog(ERROR, "failed to add index item to \"%s\"",
RelationGetRelationName(rel));
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 7e87f308b26..e4ea24a62d1 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.59 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.60 2007/09/20 17:56:30 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
@@ -684,7 +684,7 @@ _hash_squeezebucket(Relation rel,
* we have found room so insert on the "write" page.
*/
woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
- if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false)
+ if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, false, false)
== InvalidOffsetNumber)
elog(ERROR, "failed to add index item to \"%s\"",
RelationGetRelationName(rel));
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 4b1450926d3..807dbed8a8c 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.69 2007/09/12 22:10:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.70 2007/09/20 17:56:30 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -830,7 +830,7 @@ _hash_splitbucket(Relation rel,
}
noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
- if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false)
+ if (PageAddItem(npage, (Item) itup, itemsz, noffnum, false, false)
== InvalidOffsetNumber)
elog(ERROR, "failed to add index item to \"%s\"",
RelationGetRelationName(rel));
diff --git a/src/backend/access/heap/Makefile b/src/backend/access/heap/Makefile
index ac2401232bb..aff2847bab5 100644
--- a/src/backend/access/heap/Makefile
+++ b/src/backend/access/heap/Makefile
@@ -4,7 +4,7 @@
# Makefile for access/heap
#
# IDENTIFICATION
-# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.16 2007/06/08 18:23:52 tgl Exp $
+# $PostgreSQL: pgsql/src/backend/access/heap/Makefile,v 1.17 2007/09/20 17:56:30 tgl Exp $
#
#-------------------------------------------------------------------------
@@ -12,7 +12,7 @@ subdir = src/backend/access/heap
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = heapam.o hio.o rewriteheap.o syncscan.o tuptoaster.o
+OBJS = heapam.o hio.o pruneheap.o rewriteheap.o syncscan.o tuptoaster.o
all: SUBSYS.o
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
new file mode 100644
index 00000000000..8cf0fa44de6
--- /dev/null
+++ b/src/backend/access/heap/README.HOT
@@ -0,0 +1,489 @@
+$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.1 2007/09/20 17:56:30 tgl Exp $
+
+ Heap Only Tuples (HOT)
+
+Introduction
+------------
+
+The Heap Only Tuple (HOT) feature eliminates redundant index entries and
+allows the re-use of space taken by DELETEd or obsoleted UPDATEd tuples
+without performing a table-wide vacuum. It does this by allowing
+single-page vacuuming, also called "defragmentation".
+
+Note: there is a Glossary at the end of this document that may be helpful
+for first-time readers.
+
+
+Technical Challenges
+--------------------
+
+Page-at-a-time vacuuming is normally impractical because of the costs of
+finding and removing the index entries that link to the tuples to be
+reclaimed. Standard vacuuming scans the indexes to ensure all such index
+entries are removed, amortizing the index scan cost across as many dead
+tuples as possible; this approach does not scale down well to the case of
+reclaiming just a few tuples. In principle one could recompute the index
+keys and do standard index searches to find the index entries, but this is
+risky in the presence of possibly-buggy user-defined functions in
+functional indexes. An allegedly immutable function that in fact is not
+immutable might prevent us from re-finding an index entry (and we cannot
+throw an error for not finding it, in view of the fact that dead index
+entries are sometimes reclaimed early). That would lead to a seriously
+corrupt index, in the form of entries pointing to tuple slots that by now
+contain some unrelated content. In any case we would prefer to be able
+to do vacuuming without invoking any user-written code.
+
+HOT solves this problem for a restricted but useful special case:
+where a tuple is repeatedly updated in ways that do not change its
+indexed columns. (Here, "indexed column" means any column referenced
+at all in an index definition, including for example columns that are
+tested in a partial-index predicate but are not stored in the index.)
+
+An additional property of HOT is that it reduces index size by avoiding
+the creation of identically-keyed index entries. This improves search
+speeds.
+
+
+Update Chains With a Single Index Entry
+---------------------------------------
+
+Without HOT, every version of a row in an update chain has its own index
+entries, even if all indexed columns are the same. With HOT, a new tuple
+placed on the same page and with all indexed columns the same as its
+parent row version does not get new index entries. This means there is
+only one index entry for the entire update chain on the heap page.
+An index-entry-less tuple is marked with the HEAP_ONLY_TUPLE flag.
+The prior row version is marked HEAP_HOT_UPDATED, and (as always in an
+update chain) its t_ctid field links forward to the newer version.
+
+For example:
+
+ Index points to 1
+ lp [1] [2]
+
+ [111111111]->[2222222222]
+
+In the above diagram, the index points to line pointer 1, and tuple 1 is
+marked as HEAP_HOT_UPDATED. Tuple 2 is a HOT tuple, meaning it has
+no index entry pointing to it, and is marked as HEAP_ONLY_TUPLE.
+Although tuple 2 is not directly referenced by the index, it can still be
+found by an index search: after traversing from the index to tuple 1,
+the index search proceeds forward to child tuples as long as it sees the
+HEAP_HOT_UPDATED flag set. Since we restrict the HOT chain to lie within
+a single page, this requires no additional page fetches and doesn't
+introduce much performance penalty.
+
+Eventually, tuple 1 will no longer be visible to any transaction.
+At that point its space could be reclaimed, but its line pointer cannot,
+since the index still links to that line pointer and we still need to
+be able to find tuple 2 in an index search. HOT handles this by turning
+line pointer 1 into a "redirecting line pointer", which links to tuple 2
+but has no actual tuple attached. This state of affairs looks like
+
+ Index points to 1
+ lp [1]->[2]
+
+ [2222222222]
+
+If now the row is updated again, to version 3, the page looks like this:
+
+ Index points to 1
+ lp [1]->[2] [3]
+
+ [2222222222]->[3333333333]
+
+At some later time when no transaction can see tuple 2 in its snapshot,
+tuple 2 and its line pointer can be pruned entirely:
+
+ Index points to 1
+ lp [1]------>[3]
+
+ [3333333333]
+
+This is safe because no index entry points to line pointer 2. Subsequent
+insertions into the page can now recycle both line pointer 2 and the
+space formerly used by tuple 2.
+
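To make the diagrams above concrete, here is a small self-contained toy
model (plain C, compilable on its own; every identifier in it is invented
for illustration and is not a PostgreSQL API). It walks from the line
pointer an index entry references, through an optional redirect, to the
end of the HOT chain; the real code added below in heapam.c,
heap_hot_search_buffer(), does the same walk but also applies snapshot
visibility and XMIN/XMAX checks at each step:

    #include <stdio.h>

    typedef enum { TOY_UNUSED, TOY_NORMAL, TOY_REDIRECT, TOY_DEAD } ToyLpKind;

    typedef struct
    {
        ToyLpKind   kind;
        int         target;     /* TOY_REDIRECT: line pointer linked to */
        const char *tuple;      /* TOY_NORMAL: tuple contents */
        int         next;       /* HOT link to next version, -1 at chain end */
    } ToyLp;

    /* Resolve an index entry pointing at line pointer 'lp' to the newest version. */
    static const char *
    follow_hot_chain(const ToyLp *page, int lp)
    {
        if (page[lp].kind == TOY_REDIRECT)
            lp = page[lp].target;       /* a redirect is only legal at chain start */

        while (page[lp].kind == TOY_NORMAL)
        {
            if (page[lp].next < 0)
                return page[lp].tuple;  /* end of chain */
            lp = page[lp].next;
        }
        return NULL;                    /* unused or dead item: chain is gone */
    }

    int
    main(void)
    {
        /* The "lp [1]->[2] [3]" picture: 1 redirects to 2, 2 was HOT-updated to 3 */
        ToyLp page[4] = {
            { .kind = TOY_UNUSED },
            { .kind = TOY_REDIRECT, .target = 2 },
            { .kind = TOY_NORMAL, .tuple = "2222222222", .next = 3 },
            { .kind = TOY_NORMAL, .tuple = "3333333333", .next = -1 },
        };

        printf("index entry for lp 1 reaches: %s\n", follow_hot_chain(page, 1));
        return 0;
    }
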
+If an update changes any indexed column, or there is not room on the
+same page for the new tuple, then the HOT chain ends: the last member
+has a regular t_ctid link to the next version and is not marked
+HEAP_HOT_UPDATED. (In principle we could continue a HOT chain across
+pages, but this would destroy the desired property of being able to
+reclaim space with just page-local manipulations. Anyway, we don't
+want to have to chase through multiple heap pages to get from an index
+entry to the desired tuple, so it seems better to create a new index
+entry for the new tuple.) If further updates occur, the next version
+could become the root of a new HOT chain.
+
+Line pointer 1 has to remain as long as there is any non-dead member of
+the chain on the page. When there is not, it is marked "dead".
+This lets us reclaim the last child line pointer and associated tuple
+immediately. The next regular VACUUM pass can reclaim the index entries
+pointing at the line pointer and then the line pointer itself. Since a
+line pointer is small compared to a tuple, this does not represent an
+undue space cost.
+
+Note: we can use a "dead" line pointer for any DELETEd tuple,
+whether it was part of a HOT chain or not. This allows space reclamation
+in advance of running VACUUM for plain DELETEs as well as HOT updates.
+
+The requirement for doing a HOT update is that none of the indexed
+columns are changed. This is checked at execution time by comparing the
+binary representation of the old and new values. We insist on bitwise
+equality rather than using datatype-specific equality routines. The
+main reason to avoid the latter is that there might be multiple notions
+of equality for a datatype, and we don't know exactly which one is
+relevant for the indexes at hand. We assume that bitwise equality
+guarantees equality for all purposes.
+
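A minimal sketch of that bitwise test for a single user column, assuming
the existing heap_getattr() and datumIsEqual() primitives; the routine
this patch actually adds, heap_tuple_attr_equals() in the heapam.c hunks
below, additionally handles system columns and whole-row references:

    #include "postgres.h"
    #include "access/heapam.h"      /* heap_getattr() */
    #include "utils/datum.h"        /* datumIsEqual() */

    /* True if user attribute 'attrnum' has identical bits in both versions. */
    static bool
    attr_unchanged(TupleDesc tupdesc, int attrnum,
                   HeapTuple oldtup, HeapTuple newtup)
    {
        Form_pg_attribute att = tupdesc->attrs[attrnum - 1];
        bool        oldnull,
                    newnull;
        Datum       oldval = heap_getattr(oldtup, attrnum, tupdesc, &oldnull);
        Datum       newval = heap_getattr(newtup, attrnum, tupdesc, &newnull);

        if (oldnull || newnull)
            return oldnull && newnull;  /* NULL only matches NULL */

        /*
         * Bitwise comparison: it may report "changed" for logically equal
         * values, which merely forgoes a HOT update, never correctness.
         */
        return datumIsEqual(oldval, newval, att->attbyval, att->attlen);
    }
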
+
+Abort Cases
+-----------
+
+If a heap-only tuple's xmin is aborted, then it can be removed immediately:
+it was never visible to any other transaction, and all descendant row
+versions must be aborted as well. Therefore we need not consider it part
+of a HOT chain. By the same token, if a HOT-updated tuple's xmax is
+aborted, there is no need to follow the chain link. However, there is a
+race condition here: the transaction that did the HOT update might abort
+between the time we inspect the HOT-updated tuple and the time we reach
+the descendant heap-only tuple. It is conceivable that someone prunes
+the heap-only tuple before that, and even conceivable that the line pointer
+is re-used for another purpose. Therefore, when following a HOT chain,
+it is always necessary to be prepared for the possibility that the
+linked-to item pointer is unused, dead, or redirected; and if it is a
+normal item pointer, we still have to check that XMIN of the tuple matches
+the XMAX of the tuple we left. Otherwise we should assume that we have
+come to the end of the HOT chain. Note that this sort of XMIN/XMAX
+matching is required when following ordinary update chains anyway.
+
+(Early versions of the HOT code assumed that holding pin on the page
+buffer while following a HOT link would prevent this type of problem,
+but checking XMIN/XMAX matching is a much more robust solution.)
+
+
+Index/Sequential Scans
+----------------------
+
+When doing an index scan, whenever we reach a HEAP_HOT_UPDATED tuple whose
+xmax is not aborted, we need to follow its t_ctid link and check that
+entry as well; possibly repeatedly until we reach the end of the HOT
+chain. (When using an MVCC snapshot it is possible to optimize this a
+bit: there can be at most one visible tuple in the chain, so we can stop
+when we find it. This rule does not work for non-MVCC snapshots, though.)
+
+Sequential scans do not need to pay attention to the HOT links because
+they scan every item pointer on the page anyway. The same goes for a
+bitmap heap scan with a lossy bitmap.
+
+
+Pruning
+-------
+
+HOT pruning means updating item pointers so that HOT chains are
+reduced in length, by collapsing out line pointers for intermediate dead
+tuples. Although this makes those line pointers available for re-use,
+it does not immediately make the space occupied by their tuples available.
+
+
+Defragmentation
+---------------
+
+Defragmentation centralizes unused space. After we have converted root
+line pointers to redirected line pointers and pruned away any dead
+intermediate line pointers, the tuples they linked to are free space.
+But unless that space is adjacent to the central "hole" on the page
+(the pd_lower-to-pd_upper area) it cannot be used by tuple insertion.
+Defragmentation moves the surviving tuples to coalesce all the free
+space into one "hole". This is done with the same PageRepairFragmentation
+function that regular VACUUM uses.
+
+
+When can/should we prune or defragment?
+---------------------------------------
+
+This is the most interesting question in HOT implementation, since there
+is no simple right answer: we must use heuristics to determine when it's
+most efficient to perform pruning and/or defragmenting.
+
+We cannot prune or defragment unless we can get a "buffer cleanup lock"
+on the target page; otherwise, pruning might destroy line pointers that
+other backends have live references to, and defragmenting might move
+tuples that other backends have live pointers to. Thus the general
+approach must be to heuristically decide if we should try to prune
+or defragment, and if so try to acquire the buffer cleanup lock without
+blocking. If we succeed we can proceed with our housekeeping work.
+If we cannot get the lock (which should not happen often, except under
+very heavy contention) then the housekeeping has to be postponed till
+some other time. The worst-case consequence of this is only that an
+UPDATE cannot be made HOT but has to link to a new tuple version placed on
+some other page, for lack of centralized space on the original page.
+
+Ideally we would do defragmenting only when we are about to attempt
+heap_update on a HOT-safe tuple. The difficulty with this approach
+is that the update query has certainly got a pin on the old tuple, and
+therefore our attempt to acquire a buffer cleanup lock will always fail.
+(This corresponds to the idea that we don't want to move the old tuple
+out from under where the query's HeapTuple pointer points. It might
+be possible to finesse that, but it seems fragile.)
+
+Pruning, however, is potentially useful even when we are not about to
+insert a new tuple, since shortening a HOT chain reduces the cost of
+subsequent index searches. However, it is unclear whether this gain is
+large enough to justify any extra maintenance burden.
+
+The currently planned heuristic is to prune and defrag when first accessing
+a page that potentially has prunable tuples (flagged by the PD_PRUNABLE
+page hint bit) and that either has free space less than MAX(fillfactor
+target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to
+find enough free space to store an updated tuple version. (These rules
+are subject to change.)
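
A sketch of how that heuristic could be wired into the page-access path;
compare the heap_page_prune_opt() call added to heapgetpage() in the
heapam.c hunks below. The shape follows the rules just stated, but apart
from PageGetHeapFreeSpace() and the heap_page_prune_opt() name, the
helper names and the heap_page_prune() signature here are assumptions,
not quotations from pruneheap.c:

    #include "postgres.h"
    #include "access/heapam.h"
    #include "access/hio.h"         /* HEAP_DEFAULT_FILLFACTOR */
    #include "storage/bufmgr.h"
    #include "storage/bufpage.h"
    #include "utils/rel.h"

    /* Illustrative only -- not the committed pruneheap.c implementation. */
    static void
    prune_page_if_worthwhile(Relation relation, Buffer buffer,
                             TransactionId OldestXmin)
    {
        Page    page = BufferGetPage(buffer);
        Size    minfree;

        if (!PageIsPrunable(page))      /* PD_PRUNABLE hint bit not set */
            return;

        /* threshold: MAX(fillfactor target free space, BLCKSZ/10) */
        minfree = RelationGetTargetPageFreeSpace(relation,
                                                 HEAP_DEFAULT_FILLFACTOR);
        minfree = Max(minfree, BLCKSZ / 10);

        if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
        {
            /* never block: skip the housekeeping if others hold the page */
            if (ConditionalLockBufferForCleanup(buffer))
            {
                heap_page_prune(relation, buffer, OldestXmin);  /* prune + defrag */
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
            }
        }
    }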
+
+We have effectively implemented the "truncate dead tuples to just line
+pointer" idea that has been proposed and rejected before because of fear
+of line pointer bloat: we might end up with huge numbers of line pointers
+and just a few actual tuples on a page. To limit the damage in the worst
+case, and to keep various work arrays as well as the bitmaps in bitmap
+scans reasonably sized, the maximum number of line pointers per page
+is arbitrarily capped at MaxHeapTuplesPerPage (the most tuples that
+could fit without HOT pruning).
+
+
+VACUUM
+------
+
+There is little change to regular vacuum. It performs pruning to remove
+dead heap-only tuples, and cleans up any dead line pointers as if they were
+regular dead tuples.
+
+
+VACUUM FULL
+-----------
+
+VACUUM FULL performs an extra operation of collapsing out redirecting line
+pointers, by moving the first non-DEAD tuple of each HOT chain to the root
+position and clearing its heap-only-tuple flag. This effectively changes
+the user-visible CTID of that tuple. This would be completely unsafe
+during normal concurrent operation, but since VACUUM FULL takes full
+exclusive lock on the table, it should be OK. (Note that VACUUM FULL has
+always felt free to change tuples' CTIDs by moving them across pages.)
+Eliminating redirection links means that the main body of VACUUM FULL
+doesn't have to deal with them, which seems a good thing since VACUUM FULL
+is horrendously complex already.
+
+When VACUUM FULL tries to move tuple chains, it does not distinguish regular
+and heap-only tuples, but just moves both types the same. This is OK because
+it will move the entire non-DEAD tail of an update chain and remove index
+entries for each item moved. At worst, we'll uselessly search for index
+entries matching the heap-only tuples included in the move.
+
+
+Statistics
+----------
+
+Currently, we count HOT updates the same as cold updates for statistics
+purposes, though there is an additional per-table counter that counts
+only HOT updates. When a page pruning operation is able to remove a
+physical tuple by eliminating an intermediate heap-only tuple or
+replacing a physical root tuple by a redirect pointer, a decrement in
+the table's number of dead tuples is reported to pgstats, which may
+postpone autovacuuming. Note that we do not count replacing a root tuple
+by a DEAD item pointer as decrementing n_dead_tuples; we still want
+autovacuum to run to clean up the index entries and DEAD item.
+
+This area probably needs further work ...
+
+
+CREATE INDEX
+------------
+
+CREATE INDEX presents a problem for HOT updates. While the existing HOT
+chains all have the same index values for existing indexes, the columns
+in the new index might change within a pre-existing HOT chain, creating
+a "broken" chain that can't be indexed properly.
+
+To address this issue, regular (non-concurrent) CREATE INDEX makes the
+new index usable only by transactions newer than the CREATE INDEX
+command. This prevents transactions that can see the inconsistent HOT
+chains from trying to use the new index and getting incorrect results.
+New transactions can only see the rows visible after the index was
+created, hence the HOT chains are consistent for them.
+
+Entries in the new index point to root tuples (tuples with current index
+pointers) so that our index uses the same index pointers as all other
+indexes on the table. However the row we want to index is actually at
+the *end* of the chain, ie, the most recent live tuple on the HOT chain.
+That is the one we compute the index entry values for, but the TID
+we put into the index is that of the root tuple. Since transactions that
+will be allowed to use the new index cannot see any of the older tuple
+versions in the chain, the fact that they might not match the index entry
+isn't a problem. (Such transactions will check the tuple visibility
+information of the older versions and ignore them, without ever looking at
+their contents, so the content inconsistency is OK.) Subsequent updates
+to the live tuple will be allowed to extend the HOT chain only if they are
+HOT-safe for all the indexes.
+
+Because we have ShareLock on the table, any DELETE_IN_PROGRESS or
+INSERT_IN_PROGRESS tuples should have come from our own transaction.
+Therefore we can consider them committed since if the CREATE INDEX
+commits, they will be committed, and if it aborts the index is discarded.
+An exception to this is that early lock release is customary for system
+catalog updates, and so we might find such tuples when reindexing a system
+catalog. In that case we deal with it by waiting for the source
+transaction to commit or roll back. (We could do that for user tables
+too, but since the case is unexpected we prefer to throw an error.)
+
+Practically, we prevent old transactions from using the new index by
+setting pg_index.indcheckxmin to TRUE. Queries are allowed to use such an
+index only after pg_index.xmin is below their TransactionXmin horizon,
+thereby ensuring that any incompatible rows in HOT chains are dead to them.
+(pg_index.xmin will be the XID of the CREATE INDEX transaction. The reason
+for using xmin rather than a normal column is that the regular vacuum
+freezing mechanism will take care of converting xmin to FrozenTransactionId
+before it can wrap around.)
+
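A sketch of the usability test this implies, roughly what the planner's
index-gathering code has to do; aside from the indcheckxmin flag and the
xmin-versus-TransactionXmin comparison described above, the surrounding
names and plumbing here are assumptions:

    #include "postgres.h"
    #include "access/htup.h"        /* HeapTupleHeaderGetXmin() */
    #include "access/transam.h"     /* TransactionIdPrecedes() */
    #include "catalog/pg_index.h"   /* Form_pg_index */
    #include "utils/tqual.h"        /* TransactionXmin */

    /* May this backend's queries use the index described by its pg_index row? */
    static bool
    index_usable_yet(Form_pg_index indexForm, HeapTuple indexTuple)
    {
        if (!indexForm->indcheckxmin)
            return true;            /* no potentially broken HOT chains to fear */

        /*
         * Usable only once the CREATE INDEX transaction -- whose XID is the
         * pg_index row's xmin -- precedes our TransactionXmin horizon, so
         * that no snapshot we can use still sees the pre-existing chain
         * members the index does not describe.
         */
        return TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexTuple->t_data),
                                     TransactionXmin);
    }
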
+This means in particular that the transaction creating the index will be
+unable to use the index. We alleviate that problem somewhat by not setting
+indcheckxmin unless the table actually contains HOT chains with
+RECENTLY_DEAD members. (In 8.4 we may be able to improve the situation,
+at least for non-serializable transactions, because we expect to be able to
+advance TransactionXmin intratransaction.)
+
+Another unpleasant consequence is that it is now risky to use SnapshotAny
+in an index scan: if the index was created more recently than the last
+vacuum, it's possible that some of the visited tuples do not match the
+index entry they are linked to. This does not seem to be a fatal
+objection, since there are few users of SnapshotAny and most use seqscans.
+The only exception at this writing is CLUSTER, which is okay because it
+does not require perfect ordering of the indexscan readout (and especially
+so because CLUSTER tends to write recently-dead tuples out of order anyway).
+
+
+CREATE INDEX CONCURRENTLY
+-------------------------
+
+In the concurrent case we must take a different approach. We create the
+pg_index entry immediately, before we scan the table. The pg_index entry
+is marked as "not ready for inserts". Then we commit and wait for any
+transactions which have the table open to finish. This ensures that no
+new HOT updates will change the key value for our new index, because all
+transactions will see the existence of the index and will respect its
+constraint on which updates can be HOT. Other transactions must include
+such an index when determining HOT-safety of updates, even though they
+must ignore it for both insertion and searching purposes.
+
+We must do this to avoid making incorrect index entries. For example,
+suppose we are building an index on column X and we make an index entry for
+a non-HOT tuple with X=1. Then some other backend, unaware that X is an
+indexed column, HOT-updates the row to have X=2, and commits. We now have
+an index entry for X=1 pointing at a HOT chain whose live row has X=2.
+We could make an index entry with X=2 during the validation pass, but
+there is no nice way to get rid of the wrong entry with X=1. So we must
+have the HOT-safety property enforced before we start to build the new
+index.
+
+After waiting for transactions which had the table open, we build the index
+for all rows that are valid in a fresh snapshot. Any tuples visible in the
+snapshot will have only valid forward-growing HOT chains. (They might have
+older HOT updates behind them which are broken, but this is OK for the same
+reason it's OK in a regular index build.) As above, we point the index
+entry at the root of the HOT-update chain but we use the key value from the
+live tuple.
+
+We mark the index open for inserts (but still not ready for reads) then
+we again wait for transactions which have the table open. Then we take
+a second reference snapshot and validate the index. This searches for
+tuples missing from the index, and inserts any missing ones. Again,
+the index entries have to have TIDs equal to HOT-chain root TIDs, but
+the value to be inserted is the one from the live tuple.
+
+Then we wait until every transaction that could have a snapshot older than
+the second reference snapshot is finished. This ensures that nobody is
+alive any longer who could need to see any tuples that might be missing
+from the index, as well as ensuring that no one can see any inconsistent
+rows in a broken HOT chain (the first condition is stronger than the
+second). Finally, we can mark the index valid for searches.
+
+
+Limitations and Restrictions
+----------------------------
+
+It is worth noting that HOT forever forecloses alternative approaches
+to vacuuming, specifically the recompute-the-index-keys approach alluded
+to in Technical Challenges above. It'll be tough to recompute the index
+keys for a root line pointer you don't have data for anymore ...
+
+
+Glossary
+--------
+
+Broken HOT Chain
+
+ A HOT chain in which the key value for an index has changed.
+
+ This is not allowed to occur normally but if a new index is created
+ it can happen. In that case various strategies are used to ensure
+ that no transaction for which the older tuples are visible can
+ use the index.
+
+Cold update
+
+ A normal, non-HOT update, in which index entries are made for
+ the new version of the tuple.
+
+Dead line pointer
+
+ A stub line pointer that does not point to anything, but cannot
+ be removed or reused yet because there are index pointers to it.
+ Semantically the same as a dead tuple. It has state LP_DEAD.
+
+Heap-only tuple
+
+ A heap tuple with no index pointers, which can only be reached
+ from indexes indirectly through its ancestral root tuple.
+ Marked with HEAP_ONLY_TUPLE flag.
+
+HOT-safe
+
+ A proposed tuple update is said to be HOT-safe if it changes
+ none of the tuple's indexed columns. It will only become an
+ actual HOT update if we can find room on the same page for
+ the new tuple version.
+
+HOT update
+
+ An UPDATE where the new tuple becomes a heap-only tuple, and no
+ new index entries are made.
+
+HOT-updated tuple
+
+ An updated tuple, for which the next tuple in the chain is a
+ heap-only tuple. Marked with HEAP_HOT_UPDATED flag.
+
+Indexed column
+
+ A column used in an index definition. The column might not
+ actually be stored in the index --- it could be used in a
+ functional index's expression, or used in a partial index
+ predicate. HOT treats all these cases alike.
+
+Redirecting line pointer
+
+ A line pointer that points to another line pointer and has no
+ associated tuple. It has the special lp_flags state LP_REDIRECT,
+ and lp_off is the OffsetNumber of the line pointer it links to.
+ This is used when a root tuple becomes dead but we cannot prune
+ the line pointer because there are non-dead heap-only tuples
+ further down the chain.
+
+Root tuple
+
+ The first tuple in a HOT update chain; the one that indexes point to.
+
+Update chain
+
+ A chain of updated tuples, in which each tuple's ctid points to
+ the next tuple in the chain. A HOT update chain is an update chain
+ (or portion of an update chain) that consists of a root tuple and
+ one or more heap-only tuples. A complete update chain can contain
+ both HOT and non-HOT (cold) updated tuples.
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 09a70d813f7..d5a2f9a43d1 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.240 2007/09/12 22:10:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.241 2007/09/20 17:56:30 tgl Exp $
*
*
* INTERFACE ROUTINES
@@ -52,6 +52,7 @@
#include "pgstat.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
+#include "utils/datum.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
@@ -64,6 +65,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
bool is_bitmapscan);
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
+static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
+ HeapTuple oldtup, HeapTuple newtup);
/* ----------------------------------------------------------------
@@ -184,6 +187,11 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
snapshot = scan->rs_snapshot;
/*
+ * Prune and repair fragmentation for the whole page, if possible.
+ */
+ heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
+
+ /*
* We must hold share lock on the buffer content while examining tuple
* visibility. Afterwards, however, the tuples we have found to be
* visible are guaranteed good as long as we hold the buffer pin.
@@ -316,7 +324,7 @@ heapgettup(HeapScanDesc scan,
* forward scanners.
*/
scan->rs_syncscan = false;
- /* start from last page of the scan */
+ /* start from last page of the scan */
if (scan->rs_startblock > 0)
page = scan->rs_startblock - 1;
else
@@ -368,6 +376,7 @@ heapgettup(HeapScanDesc scan,
dp = (Page) BufferGetPage(scan->rs_cbuf);
lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
lpp = PageGetItemId(dp, lineoff);
+ Assert(ItemIdIsNormal(lpp));
tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
tuple->t_len = ItemIdGetLength(lpp);
@@ -583,7 +592,7 @@ heapgettup_pagemode(HeapScanDesc scan,
* forward scanners.
*/
scan->rs_syncscan = false;
- /* start from last page of the scan */
+ /* start from last page of the scan */
if (scan->rs_startblock > 0)
page = scan->rs_startblock - 1;
else
@@ -632,6 +641,7 @@ heapgettup_pagemode(HeapScanDesc scan,
dp = (Page) BufferGetPage(scan->rs_cbuf);
lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
lpp = PageGetItemId(dp, lineoff);
+ Assert(ItemIdIsNormal(lpp));
tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
tuple->t_len = ItemIdGetLength(lpp);
@@ -1246,6 +1256,9 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
* for statistical purposes. (This could be the heap rel itself, an
* associated index, or NULL to not count the fetch at all.)
*
+ * heap_fetch does not follow HOT chains: only the exact TID requested will
+ * be fetched.
+ *
* It is somewhat inconsistent that we ereport() on invalid block number but
* return false on invalid item number. There are a couple of reasons though.
* One is that the caller can relatively easily check the block number for
@@ -1390,6 +1403,143 @@ heap_release_fetch(Relation relation,
}
/*
+ * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
+ *
+ * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
+ * of a HOT chain), and buffer is the buffer holding this tuple. We search
+ * for the first chain member satisfying the given snapshot. If one is
+ * found, we update *tid to reference that tuple's offset number, and
+ * return TRUE. If no match, return FALSE without modifying *tid.
+ *
+ * If all_dead is not NULL, we check non-visible tuples to see if they are
+ * globally dead; *all_dead is set TRUE if all members of the HOT chain
+ * are vacuumable, FALSE if not.
+ *
+ * Unlike heap_fetch, the caller must already have pin and (at least) share
+ * lock on the buffer; it is still pinned/locked at exit. Also unlike
+ * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
+ */
+bool
+heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
+ bool *all_dead)
+{
+ Page dp = (Page) BufferGetPage(buffer);
+ TransactionId prev_xmax = InvalidTransactionId;
+ OffsetNumber offnum;
+ bool at_chain_start;
+
+ if (all_dead)
+ *all_dead = true;
+
+ Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
+ offnum = ItemPointerGetOffsetNumber(tid);
+ at_chain_start = true;
+
+ /* Scan through possible multiple members of HOT-chain */
+ for (;;)
+ {
+ ItemId lp;
+ HeapTupleData heapTuple;
+
+ /* check for bogus TID */
+ if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
+ break;
+
+ lp = PageGetItemId(dp, offnum);
+
+ /* check for unused, dead, or redirected items */
+ if (!ItemIdIsNormal(lp))
+ {
+ /* We should only see a redirect at start of chain */
+ if (ItemIdIsRedirected(lp) && at_chain_start)
+ {
+ /* Follow the redirect */
+ offnum = ItemIdGetRedirect(lp);
+ at_chain_start = false;
+ continue;
+ }
+ /* else must be end of chain */
+ break;
+ }
+
+ heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+ heapTuple.t_len = ItemIdGetLength(lp);
+
+ /*
+ * Shouldn't see a HEAP_ONLY tuple at chain start.
+ */
+ if (at_chain_start && HeapTupleIsHeapOnly(&heapTuple))
+ break;
+
+ /*
+ * The xmin should match the previous xmax value, else chain is broken.
+ */
+ if (TransactionIdIsValid(prev_xmax) &&
+ !TransactionIdEquals(prev_xmax,
+ HeapTupleHeaderGetXmin(heapTuple.t_data)))
+ break;
+
+ /* If it's visible per the snapshot, we must return it */
+ if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
+ {
+ ItemPointerSetOffsetNumber(tid, offnum);
+ if (all_dead)
+ *all_dead = false;
+ return true;
+ }
+
+ /*
+ * If we can't see it, maybe no one else can either. At caller
+ * request, check whether all chain members are dead to all
+ * transactions.
+ */
+ if (all_dead && *all_dead &&
+ HeapTupleSatisfiesVacuum(heapTuple.t_data, RecentGlobalXmin,
+ buffer) != HEAPTUPLE_DEAD)
+ *all_dead = false;
+
+ /*
+ * Check to see if HOT chain continues past this tuple; if so
+ * fetch the next offnum and loop around.
+ */
+ if (HeapTupleIsHotUpdated(&heapTuple))
+ {
+ Assert(ItemPointerGetBlockNumber(&heapTuple.t_data->t_ctid) ==
+ ItemPointerGetBlockNumber(tid));
+ offnum = ItemPointerGetOffsetNumber(&heapTuple.t_data->t_ctid);
+ at_chain_start = false;
+ prev_xmax = HeapTupleHeaderGetXmax(heapTuple.t_data);
+ }
+ else
+ break; /* end of chain */
+ }
+
+ return false;
+}
+
+/*
+ * heap_hot_search - search HOT chain for tuple satisfying snapshot
+ *
+ * This has the same API as heap_hot_search_buffer, except that the caller
+ * does not provide the buffer containing the page, rather we access it
+ * locally.
+ */
+bool
+heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
+ bool *all_dead)
+{
+ bool result;
+ Buffer buffer;
+
+ buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+ result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+ return result;
+}
+
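As a usage note (an illustration, not code from this patch): a caller that
has only the TID stored in an index entry -- a uniqueness check, for
instance -- might wrap the new function like this; the function and
variable names below are invented:

    #include "postgres.h"
    #include "access/heapam.h"      /* heap_hot_search() */
    #include "access/itup.h"        /* IndexTuple */
    #include "storage/itemptr.h"    /* ItemPointerData */
    #include "utils/tqual.h"        /* Snapshot */

    static bool
    visible_heap_version_exists(Relation heapRel, IndexTuple itup,
                                Snapshot snapshot, bool *kill_index_entry)
    {
        ItemPointerData htid = itup->t_tid;     /* root of a possible HOT chain */
        bool            all_dead;
        bool            visible;

        visible = heap_hot_search(&htid, heapRel, snapshot, &all_dead);

        /*
         * If no chain member is visible and every member is vacuumable, the
         * index entry that led us here can itself be marked dead.
         */
        *kill_index_entry = (!visible && all_dead);

        return visible;     /* on success, htid points at the visible member */
    }
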
+/*
* heap_get_latest_tid - get the latest tid of a specified tuple
*
* Actually, this gets the latest version that is visible according to
@@ -1594,6 +1744,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
}
tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
+ tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
HeapTupleHeaderSetXmin(tup->t_data, xid);
HeapTupleHeaderSetCmin(tup->t_data, cid);
@@ -1628,6 +1779,17 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
RelationPutHeapTuple(relation, buffer, heaptup);
+ /*
+ * XXX Should we set PageSetPrunable on this page ?
+ *
+ * The inserting transaction may eventually abort thus making this tuple
+ * DEAD and hence available for pruning. Though we don't want to optimize
+ * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
+ * aborted tuple will never be pruned until the next vacuum is triggered.
+ *
+ * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
+ */
+
MarkBufferDirty(buffer);
/* XLOG stuff */
@@ -1904,12 +2066,21 @@ l1:
START_CRIT_SECTION();
+ /*
+ * If this transaction commits, the tuple will become DEAD sooner or
+ * later. Set hint bit that this page is a candidate for pruning. If
+ * the transaction finally aborts, the subsequent page pruning will be
+ * a no-op and the hint will be cleared.
+ */
+ PageSetPrunable((Page) dp);
+
/* store transaction information of xact deleting the tuple */
tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
HEAP_XMAX_INVALID |
HEAP_XMAX_IS_MULTI |
HEAP_IS_LOCKED |
HEAP_MOVED);
+ HeapTupleHeaderClearHotUpdated(tp.t_data);
HeapTupleHeaderSetXmax(tp.t_data, xid);
HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
/* Make sure there is no forward chain link in t_ctid */
@@ -2045,7 +2216,8 @@ simple_heap_delete(Relation relation, ItemPointer tid)
*
* On success, the header fields of *newtup are updated to match the new
* stored tuple; in particular, newtup->t_self is set to the TID where the
- * new tuple was inserted. However, any TOAST changes in the new tuple's
+ * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
+ * update was done. However, any TOAST changes in the new tuple's
* data are not reflected into *newtup.
*
* In the failure cases, the routine returns the tuple's t_ctid and t_xmax.
@@ -2060,6 +2232,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
{
HTSU_Result result;
TransactionId xid = GetCurrentTransactionId();
+ Bitmapset *hot_attrs;
ItemId lp;
HeapTupleData oldtup;
HeapTuple heaptup;
@@ -2072,9 +2245,24 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
pagefree;
bool have_tuple_lock = false;
bool iscombo;
+ bool use_hot_update = false;
Assert(ItemPointerIsValid(otid));
+ /*
+ * Fetch the list of attributes to be checked for HOT update. This is
+ * wasted effort if we fail to update or have to put the new tuple on
+ * a different page. But we must compute the list before obtaining
+ * buffer lock --- in the worst case, if we are doing an update on one
+ * of the relevant system catalogs, we could deadlock if we try to
+ * fetch the list later. In any case, the relcache caches the data
+ * so this is usually pretty cheap.
+ *
+ * Note that we get a copy here, so we need not worry about relcache
+ * flush happening midway through.
+ */
+ hot_attrs = RelationGetIndexAttrBitmap(relation);
+
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(otid));
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -2208,6 +2396,7 @@ l2:
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
+ bms_free(hot_attrs);
return result;
}
@@ -2227,6 +2416,7 @@ l2:
}
newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
+ newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);
HeapTupleHeaderSetXmin(newtup->t_data, xid);
HeapTupleHeaderSetCmin(newtup->t_data, cid);
@@ -2261,17 +2451,20 @@ l2:
HeapTupleHasExternal(newtup) ||
newtup->t_len > TOAST_TUPLE_THRESHOLD);
- pagefree = PageGetFreeSpace((Page) dp);
+ pagefree = PageGetHeapFreeSpace((Page) dp);
newtupsize = MAXALIGN(newtup->t_len);
if (need_toast || newtupsize > pagefree)
{
+ /* Clear obsolete visibility flags ... */
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
HEAP_XMAX_INVALID |
HEAP_XMAX_IS_MULTI |
HEAP_IS_LOCKED |
HEAP_MOVED);
+ HeapTupleClearHotUpdated(&oldtup);
+ /* ... and store info about transaction updating this tuple */
HeapTupleHeaderSetXmax(oldtup.t_data, xid);
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
/* temporarily make it look not-updated */
@@ -2324,7 +2517,7 @@ l2:
/* Re-acquire the lock on the old tuple's page. */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/* Re-check using the up-to-date free space */
- pagefree = PageGetFreeSpace((Page) dp);
+ pagefree = PageGetHeapFreeSpace((Page) dp);
if (newtupsize > pagefree)
{
/*
@@ -2357,18 +2550,66 @@ l2:
* one pin is held.
*/
+ if (newbuf == buffer)
+ {
+ /*
+ * Since the new tuple is going into the same page, we might be able
+ * to do a HOT update. Check if any of the index columns have been
+ * changed. If not, then HOT update is possible.
+ */
+ if (HeapSatisfiesHOTUpdate(relation, hot_attrs, &oldtup, heaptup))
+ use_hot_update = true;
+ }
+ else
+ {
+ /* Set a hint that the old page could use prune/defrag */
+ PageSetFull(dp);
+ }
+
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
+ /*
+ * If this transaction commits, the old tuple will become DEAD sooner or
+ * later. Set hint bit that this page is a candidate for pruning. If
+ * the transaction finally aborts, the subsequent page pruning will be
+ * a no-op and the hint will be cleared.
+ *
+ * XXX Should we set hint on newbuf as well? If the transaction
+ * aborts, there would be a prunable tuple in the newbuf; but for now
+ * we choose not to optimize for aborts. Note that heap_xlog_update
+ * must be kept in sync if this changes.
+ */
+ PageSetPrunable(dp);
+
+ if (use_hot_update)
+ {
+ /* Mark the old tuple as HOT-updated */
+ HeapTupleSetHotUpdated(&oldtup);
+ /* And mark the new tuple as heap-only */
+ HeapTupleSetHeapOnly(heaptup);
+ /* Mark the caller's copy too, in case different from heaptup */
+ HeapTupleSetHeapOnly(newtup);
+ }
+ else
+ {
+ /* Make sure tuples are correctly marked as not-HOT */
+ HeapTupleClearHotUpdated(&oldtup);
+ HeapTupleClearHeapOnly(heaptup);
+ HeapTupleClearHeapOnly(newtup);
+ }
+
RelationPutHeapTuple(relation, newbuf, heaptup); /* insert new tuple */
if (!already_marked)
{
+ /* Clear obsolete visibility flags ... */
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
HEAP_XMAX_INVALID |
HEAP_XMAX_IS_MULTI |
HEAP_IS_LOCKED |
HEAP_MOVED);
+ /* ... and store info about transaction updating this tuple */
HeapTupleHeaderSetXmax(oldtup.t_data, xid);
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
}
@@ -2427,7 +2668,7 @@ l2:
if (have_tuple_lock)
UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock);
- pgstat_count_heap_update(relation);
+ pgstat_count_heap_update(relation, use_hot_update);
/*
* If heaptup is a private copy, release it. Don't forget to copy t_self
@@ -2439,10 +2680,120 @@ l2:
heap_freetuple(heaptup);
}
+ bms_free(hot_attrs);
+
return HeapTupleMayBeUpdated;
}
/*
+ * Check if the specified attribute's value is same in both given tuples.
+ * Subroutine for HeapSatisfiesHOTUpdate.
+ */
+static bool
+heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
+ HeapTuple tup1, HeapTuple tup2)
+{
+ Datum value1, value2;
+ bool isnull1, isnull2;
+ Form_pg_attribute att;
+
+ /*
+ * If it's a whole-tuple reference, say "not equal". It's not really
+ * worth supporting this case, since it could only succeed after a
+ * no-op update, which is hardly a case worth optimizing for.
+ */
+ if (attrnum == 0)
+ return false;
+
+ /*
+ * Likewise, automatically say "not equal" for any system attribute
+ * other than OID and tableOID; we cannot expect these to be consistent
+ * in a HOT chain, or even to be set correctly yet in the new tuple.
+ */
+ if (attrnum < 0)
+ {
+ if (attrnum != ObjectIdAttributeNumber &&
+ attrnum != TableOidAttributeNumber)
+ return false;
+ }
+
+ /*
+ * Extract the corresponding values. XXX this is pretty inefficient
+ * if there are many indexed columns. Should HeapSatisfiesHOTUpdate
+ * do a single heap_deform_tuple call on each tuple, instead? But
+ * that doesn't work for system columns ...
+ */
+ value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
+ value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
+
+ /*
+ * If one value is NULL and the other is not, then they are certainly
+ * not equal.
+ */
+ if (isnull1 != isnull2)
+ return false;
+
+ /*
+ * If both are NULL, they can be considered equal.
+ */
+ if (isnull1)
+ return true;
+
+ /*
+ * We do simple binary comparison of the two datums. This may be overly
+ * strict because there can be multiple binary representations for the
+ * same logical value. But we should be OK as long as there are no false
+ * positives. Using a type-specific equality operator is messy because
+ * there could be multiple notions of equality in different operator
+ * classes; furthermore, we cannot safely invoke user-defined functions
+ * while holding exclusive buffer lock.
+ */
+ if (attrnum <= 0)
+ {
+ /* The only allowed system columns are OIDs, so do this */
+ return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
+ }
+ else
+ {
+ Assert(attrnum <= tupdesc->natts);
+ att = tupdesc->attrs[attrnum - 1];
+ return datumIsEqual(value1, value2, att->attbyval, att->attlen);
+ }
+}
+
+/*
+ * Check if the old and new tuples represent a HOT-safe update. To be able
+ * to do a HOT update, we must not have changed any columns used in index
+ * definitions.
+ *
+ * The set of attributes to be checked is passed in (we dare not try to
+ * compute it while holding exclusive buffer lock...) NOTE that hot_attrs
+ * is destructively modified! That is OK since this is invoked at most once
+ * by heap_update().
+ *
+ * Returns true if safe to do HOT update.
+ */
+static bool
+HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
+ HeapTuple oldtup, HeapTuple newtup)
+{
+ int attrnum;
+
+ while ((attrnum = bms_first_member(hot_attrs)) >= 0)
+ {
+ /* Adjust for system attributes */
+ attrnum += FirstLowInvalidHeapAttributeNumber;
+
+ /* If the attribute value has changed, we can't do HOT update */
+ if (!heap_tuple_attr_equals(RelationGetDescr(relation), attrnum,
+ oldtup, newtup))
+ return false;
+ }
+
+ return true;
+}
+
+/*
* simple_heap_update - replace a tuple
*
* This routine may be used to update a tuple when concurrent updates of
@@ -2865,6 +3216,7 @@ l3:
* avoids possibly generating a useless combo CID.
*/
tuple->t_data->t_infomask = new_infomask;
+ HeapTupleHeaderClearHotUpdated(tuple->t_data);
HeapTupleHeaderSetXmax(tuple->t_data, xid);
/* Make sure there is no forward chain link in t_ctid */
tuple->t_data->t_ctid = *tid;
@@ -3110,6 +3462,7 @@ recheck_xmax:
*/
tuple->t_infomask &= ~HEAP_XMAX_COMMITTED;
tuple->t_infomask |= HEAP_XMAX_INVALID;
+ HeapTupleHeaderClearHotUpdated(tuple);
changed = true;
}
}
@@ -3245,21 +3598,29 @@ heap_restrpos(HeapScanDesc scan)
* Perform XLogInsert for a heap-clean operation. Caller must already
* have modified the buffer and marked it dirty.
*
- * Note: for historical reasons, the entries in the unused[] array should
- * be zero-based tuple indexes, not one-based.
+ * Note: prior to Postgres 8.3, the entries in the nowunused[] array were
+ * zero-based tuple indexes. Now they are one-based like other uses
+ * of OffsetNumber.
*/
XLogRecPtr
-log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
+log_heap_clean(Relation reln, Buffer buffer,
+ OffsetNumber *redirected, int nredirected,
+ OffsetNumber *nowdead, int ndead,
+ OffsetNumber *nowunused, int nunused,
+ bool redirect_move)
{
xl_heap_clean xlrec;
+ uint8 info;
XLogRecPtr recptr;
- XLogRecData rdata[2];
+ XLogRecData rdata[4];
/* Caller should not call me on a temp relation */
Assert(!reln->rd_istemp);
xlrec.node = reln->rd_node;
xlrec.block = BufferGetBlockNumber(buffer);
+ xlrec.nredirected = nredirected;
+ xlrec.ndead = ndead;
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapClean;
@@ -3267,14 +3628,17 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
rdata[0].next = &(rdata[1]);
/*
- * The unused-offsets array is not actually in the buffer, but pretend
- * that it is. When XLogInsert stores the whole buffer, the offsets array
- * need not be stored too.
+ * The OffsetNumber arrays are not actually in the buffer, but we pretend
+ * that they are. When XLogInsert stores the whole buffer, the offset
+ * arrays need not be stored too. Note that even if all three arrays
+ * are empty, we want to expose the buffer as a candidate for whole-page
+ * storage, since this record type implies a defragmentation operation
+ * even if no item pointers changed state.
*/
- if (uncnt > 0)
+ if (nredirected > 0)
{
- rdata[1].data = (char *) unused;
- rdata[1].len = uncnt * sizeof(OffsetNumber);
+ rdata[1].data = (char *) redirected;
+ rdata[1].len = nredirected * sizeof(OffsetNumber) * 2;
}
else
{
@@ -3283,9 +3647,38 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
}
rdata[1].buffer = buffer;
rdata[1].buffer_std = true;
- rdata[1].next = NULL;
+ rdata[1].next = &(rdata[2]);
+
+ if (ndead > 0)
+ {
+ rdata[2].data = (char *) nowdead;
+ rdata[2].len = ndead * sizeof(OffsetNumber);
+ }
+ else
+ {
+ rdata[2].data = NULL;
+ rdata[2].len = 0;
+ }
+ rdata[2].buffer = buffer;
+ rdata[2].buffer_std = true;
+ rdata[2].next = &(rdata[3]);
+
+ if (nunused > 0)
+ {
+ rdata[3].data = (char *) nowunused;
+ rdata[3].len = nunused * sizeof(OffsetNumber);
+ }
+ else
+ {
+ rdata[3].data = NULL;
+ rdata[3].len = 0;
+ }
+ rdata[3].buffer = buffer;
+ rdata[3].buffer_std = true;
+ rdata[3].next = NULL;
- recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CLEAN, rdata);
+ info = redirect_move ? XLOG_HEAP2_CLEAN_MOVE : XLOG_HEAP2_CLEAN;
+ recptr = XLogInsert(RM_HEAP2_ID, info, rdata);
return recptr;
}
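
The redo routine further below depends on the payload layout implied by this rdata chain: the fixed xl_heap_clean header, then nredirected pairs of OffsetNumbers, then ndead single offsets, with whatever remains being the now-unused offsets (their count is not stored; it is inferred from the record length). A sketch of that decoding, assuming the record body is simply the concatenation of the non-empty rdata entries above; the sketch_ name is hypothetical.

static void
sketch_decode_clean_record(xl_heap_clean *xlrec, uint32 len)
{
    OffsetNumber *redirected = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
    OffsetNumber *nowdead = redirected + 2 * xlrec->nredirected;
    OffsetNumber *nowunused = nowdead + xlrec->ndead;
    OffsetNumber *end = (OffsetNumber *) ((char *) xlrec + len);
    int         nunused = end - nowunused;  /* inferred, not stored */

    /* replay redirected pairs, dead items, then unused items, as below */
    (void) nunused;
}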
@@ -3293,8 +3686,6 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)
/*
* Perform XLogInsert for a heap-freeze operation. Caller must already
* have modified the buffer and marked it dirty.
- *
- * Unlike log_heap_clean(), the offsets[] entries are one-based.
*/
XLogRecPtr
log_heap_freeze(Relation reln, Buffer buffer,
@@ -3363,17 +3754,28 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
} xlhdr;
int hsize = SizeOfHeapHeader;
xl_heap_update xlrec;
+ uint8 info;
XLogRecPtr recptr;
XLogRecData rdata[4];
Page page = BufferGetPage(newbuf);
- uint8 info = (move) ? XLOG_HEAP_MOVE : XLOG_HEAP_UPDATE;
/* Caller should not call me on a temp relation */
Assert(!reln->rd_istemp);
+ if (move)
+ {
+ Assert(!HeapTupleIsHeapOnly(newtup));
+ info = XLOG_HEAP_MOVE;
+ }
+ else if (HeapTupleIsHeapOnly(newtup))
+ info = XLOG_HEAP_HOT_UPDATE;
+ else
+ info = XLOG_HEAP_UPDATE;
+
xlrec.target.node = reln->rd_node;
xlrec.target.tid = from;
xlrec.newtid = newtup->t_self;
+
rdata[0].data = (char *) &xlrec;
rdata[0].len = SizeOfHeapUpdate;
rdata[0].buffer = InvalidBuffer;
@@ -3489,13 +3891,21 @@ log_newpage(RelFileNode *rnode, BlockNumber blkno, Page page)
return recptr;
}
+/*
+ * Handles CLEAN and CLEAN_MOVE record types
+ */
static void
-heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
+heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
{
xl_heap_clean *xlrec = (xl_heap_clean *) XLogRecGetData(record);
Relation reln;
Buffer buffer;
Page page;
+ OffsetNumber *offnum;
+ OffsetNumber *end;
+ int nredirected;
+ int ndead;
+ int i;
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
@@ -3512,25 +3922,63 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record)
return;
}
- if (record->xl_len > SizeOfHeapClean)
- {
- OffsetNumber *unused;
- OffsetNumber *unend;
- ItemId lp;
+ nredirected = xlrec->nredirected;
+ ndead = xlrec->ndead;
+ offnum = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
+ end = (OffsetNumber *) ((char *) xlrec + record->xl_len);
- unused = (OffsetNumber *) ((char *) xlrec + SizeOfHeapClean);
- unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+ /* Update all redirected or moved line pointers */
+ for (i = 0; i < nredirected; i++)
+ {
+ OffsetNumber fromoff = *offnum++;
+ OffsetNumber tooff = *offnum++;
+ ItemId fromlp = PageGetItemId(page, fromoff);
- while (unused < unend)
+ if (clean_move)
{
- /* unused[] entries are zero-based */
- lp = PageGetItemId(page, *unused + 1);
- ItemIdSetUnused(lp);
- unused++;
+ /* Physically move the "to" item to the "from" slot */
+ ItemId tolp = PageGetItemId(page, tooff);
+ HeapTupleHeader htup;
+
+ *fromlp = *tolp;
+ ItemIdSetUnused(tolp);
+
+ /* We also have to clear the tuple's heap-only bit */
+ Assert(ItemIdIsNormal(fromlp));
+ htup = (HeapTupleHeader) PageGetItem(page, fromlp);
+ Assert(HeapTupleHeaderIsHeapOnly(htup));
+ HeapTupleHeaderClearHeapOnly(htup);
+ }
+ else
+ {
+ /* Just insert a REDIRECT link at fromoff */
+ ItemIdSetRedirect(fromlp, tooff);
}
}
- PageRepairFragmentation(page, NULL);
+ /* Update all now-dead line pointers */
+ for (i = 0; i < ndead; i++)
+ {
+ OffsetNumber off = *offnum++;
+ ItemId lp = PageGetItemId(page, off);
+
+ ItemIdSetDead(lp);
+ }
+
+ /* Update all now-unused line pointers */
+ while (offnum < end)
+ {
+ OffsetNumber off = *offnum++;
+ ItemId lp = PageGetItemId(page, off);
+
+ ItemIdSetUnused(lp);
+ }
+
+ /*
+ * Finally, repair any fragmentation, and update the page's hint bit
+	 * about whether it has free line pointers.
+ */
+ PageRepairFragmentation(page);
PageSetLSN(page, lsn);
PageSetTLI(page, ThisTimeLineID);
@@ -3655,8 +4103,13 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
HEAP_XMAX_IS_MULTI |
HEAP_IS_LOCKED |
HEAP_MOVED);
+ HeapTupleHeaderClearHotUpdated(htup);
HeapTupleHeaderSetXmax(htup, record->xl_xid);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
+
+ /* Mark the page as a candidate for pruning */
+ PageSetPrunable(page);
+
/* Make sure there is no forward chain link in t_ctid */
htup->t_ctid = xlrec->target.tid;
PageSetLSN(page, lsn);
@@ -3736,7 +4189,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
HeapTupleHeaderSetCmin(htup, FirstCommandId);
htup->t_ctid = xlrec->target.tid;
- offnum = PageAddItem(page, (Item) htup, newlen, offnum, true);
+ offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "heap_insert_redo: failed to add tuple");
PageSetLSN(page, lsn);
@@ -3746,10 +4199,10 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
}
/*
- * Handles UPDATE & MOVE
+ * Handles UPDATE, HOT_UPDATE & MOVE
*/
static void
-heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
+heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
{
xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
Relation reln = XLogOpenRelation(xlrec->target.node);
@@ -3808,6 +4261,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
HEAP_XMIN_INVALID |
HEAP_MOVED_IN);
htup->t_infomask |= HEAP_MOVED_OFF;
+ HeapTupleHeaderClearHotUpdated(htup);
HeapTupleHeaderSetXvac(htup, record->xl_xid);
/* Make sure there is no forward chain link in t_ctid */
htup->t_ctid = xlrec->target.tid;
@@ -3819,12 +4273,19 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
HEAP_XMAX_IS_MULTI |
HEAP_IS_LOCKED |
HEAP_MOVED);
+ if (hot_update)
+ HeapTupleHeaderSetHotUpdated(htup);
+ else
+ HeapTupleHeaderClearHotUpdated(htup);
HeapTupleHeaderSetXmax(htup, record->xl_xid);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
/* Set forward chain link in t_ctid */
htup->t_ctid = xlrec->newtid;
}
+ /* Mark the page as a candidate for pruning */
+ PageSetPrunable(page);
+
/*
* this test is ugly, but necessary to avoid thinking that insert change
* is already applied
@@ -3914,7 +4375,7 @@ newsame:;
/* Make sure there is no forward chain link in t_ctid */
htup->t_ctid = xlrec->newtid;
- offnum = PageAddItem(page, (Item) htup, newlen, offnum, true);
+ offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "heap_update_redo: failed to add tuple");
PageSetLSN(page, lsn);
@@ -3971,6 +4432,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
else
htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+ HeapTupleHeaderClearHotUpdated(htup);
HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
/* Make sure there is no forward chain link in t_ctid */
@@ -4039,25 +4501,35 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
- info &= XLOG_HEAP_OPMASK;
- if (info == XLOG_HEAP_INSERT)
- heap_xlog_insert(lsn, record);
- else if (info == XLOG_HEAP_DELETE)
- heap_xlog_delete(lsn, record);
- else if (info == XLOG_HEAP_UPDATE)
- heap_xlog_update(lsn, record, false);
- else if (info == XLOG_HEAP_MOVE)
- heap_xlog_update(lsn, record, true);
- else if (info == XLOG_HEAP_CLEAN)
- heap_xlog_clean(lsn, record);
- else if (info == XLOG_HEAP_NEWPAGE)
- heap_xlog_newpage(lsn, record);
- else if (info == XLOG_HEAP_LOCK)
- heap_xlog_lock(lsn, record);
- else if (info == XLOG_HEAP_INPLACE)
- heap_xlog_inplace(lsn, record);
- else
- elog(PANIC, "heap_redo: unknown op code %u", info);
+ switch (info & XLOG_HEAP_OPMASK)
+ {
+ case XLOG_HEAP_INSERT:
+ heap_xlog_insert(lsn, record);
+ break;
+ case XLOG_HEAP_DELETE:
+ heap_xlog_delete(lsn, record);
+ break;
+ case XLOG_HEAP_UPDATE:
+ heap_xlog_update(lsn, record, false, false);
+ break;
+ case XLOG_HEAP_MOVE:
+ heap_xlog_update(lsn, record, true, false);
+ break;
+ case XLOG_HEAP_HOT_UPDATE:
+ heap_xlog_update(lsn, record, false, true);
+ break;
+ case XLOG_HEAP_NEWPAGE:
+ heap_xlog_newpage(lsn, record);
+ break;
+ case XLOG_HEAP_LOCK:
+ heap_xlog_lock(lsn, record);
+ break;
+ case XLOG_HEAP_INPLACE:
+ heap_xlog_inplace(lsn, record);
+ break;
+ default:
+ elog(PANIC, "heap_redo: unknown op code %u", info);
+ }
}
void
@@ -4065,11 +4537,20 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
- info &= XLOG_HEAP_OPMASK;
- if (info == XLOG_HEAP2_FREEZE)
- heap_xlog_freeze(lsn, record);
- else
- elog(PANIC, "heap2_redo: unknown op code %u", info);
+ switch (info & XLOG_HEAP_OPMASK)
+ {
+ case XLOG_HEAP2_FREEZE:
+ heap_xlog_freeze(lsn, record);
+ break;
+ case XLOG_HEAP2_CLEAN:
+ heap_xlog_clean(lsn, record, false);
+ break;
+ case XLOG_HEAP2_CLEAN_MOVE:
+ heap_xlog_clean(lsn, record, true);
+ break;
+ default:
+ elog(PANIC, "heap2_redo: unknown op code %u", info);
+ }
}
static void
@@ -4130,13 +4611,18 @@ heap_desc(StringInfo buf, uint8 xl_info, char *rec)
ItemPointerGetBlockNumber(&(xlrec->newtid)),
ItemPointerGetOffsetNumber(&(xlrec->newtid)));
}
- else if (info == XLOG_HEAP_CLEAN)
+ else if (info == XLOG_HEAP_HOT_UPDATE)
{
- xl_heap_clean *xlrec = (xl_heap_clean *) rec;
+ xl_heap_update *xlrec = (xl_heap_update *) rec;
- appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
- xlrec->node.spcNode, xlrec->node.dbNode,
- xlrec->node.relNode, xlrec->block);
+ if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? */
+ appendStringInfo(buf, "hot_update(init): ");
+ else
+ appendStringInfo(buf, "hot_update: ");
+ out_target(buf, &(xlrec->target));
+ appendStringInfo(buf, "; new %u/%u",
+ ItemPointerGetBlockNumber(&(xlrec->newtid)),
+ ItemPointerGetOffsetNumber(&(xlrec->newtid)));
}
else if (info == XLOG_HEAP_NEWPAGE)
{
@@ -4187,6 +4673,22 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->node.relNode, xlrec->block,
xlrec->cutoff_xid);
}
+ else if (info == XLOG_HEAP2_CLEAN)
+ {
+ xl_heap_clean *xlrec = (xl_heap_clean *) rec;
+
+ appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->block);
+ }
+ else if (info == XLOG_HEAP2_CLEAN_MOVE)
+ {
+ xl_heap_clean *xlrec = (xl_heap_clean *) rec;
+
+ appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->block);
+ }
else
appendStringInfo(buf, "UNKNOWN");
}
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index 6dbdf13fbe0..cd13d8f87c9 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.66 2007/09/12 22:10:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.67 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -41,7 +41,7 @@ RelationPutHeapTuple(Relation relation,
pageHeader = BufferGetPage(buffer);
offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
- tuple->t_len, InvalidOffsetNumber, false);
+ tuple->t_len, InvalidOffsetNumber, false, true);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "failed to add tuple to page");
@@ -218,7 +218,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
* we're done.
*/
pageHeader = (Page) BufferGetPage(buffer);
- pageFreeSpace = PageGetFreeSpace(pageHeader);
+ pageFreeSpace = PageGetHeapFreeSpace(pageHeader);
if (len + saveFreeSpace <= pageFreeSpace)
{
/* use this page as future insert target, too */
@@ -311,7 +311,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
PageInit(pageHeader, BufferGetPageSize(buffer), 0);
- if (len > PageGetFreeSpace(pageHeader))
+ if (len > PageGetHeapFreeSpace(pageHeader))
{
/* We should not get here given the test at the top */
elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
new file mode 100644
index 00000000000..d5496689003
--- /dev/null
+++ b/src/backend/access/heap/pruneheap.c
@@ -0,0 +1,702 @@
+/*-------------------------------------------------------------------------
+ *
+ * pruneheap.c
+ * heap page pruning and HOT-chain management code
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.1 2007/09/20 17:56:30 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "utils/inval.h"
+
+
+/* Local functions */
+static int heap_prune_chain(Relation relation, Buffer buffer,
+ OffsetNumber rootoffnum,
+ TransactionId OldestXmin,
+ OffsetNumber *redirected, int *nredirected,
+ OffsetNumber *nowdead, int *ndead,
+ OffsetNumber *nowunused, int *nunused,
+ bool redirect_move);
+static void heap_prune_record_redirect(OffsetNumber *redirected,
+ int *nredirected,
+ OffsetNumber offnum,
+ OffsetNumber rdoffnum);
+static void heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
+ OffsetNumber offnum);
+static void heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
+ OffsetNumber offnum);
+
+
+/*
+ * Optionally prune and repair fragmentation in the specified page.
+ *
+ * This is an opportunistic function. It will perform housekeeping
+ * only if the page heuristically looks like a candidate for pruning and we
+ * can acquire buffer cleanup lock without blocking.
+ *
+ * Note: this is called quite often. It's important that it fall out quickly
+ * if there's not any use in pruning.
+ *
+ * Caller must have pin on the buffer, and must *not* have a lock on it.
+ *
+ * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
+ * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
+ */
+void
+heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
+{
+ PageHeader dp = (PageHeader) BufferGetPage(buffer);
+ Size minfree;
+
+ /*
+ * Let's see if we really need pruning.
+ *
+ * Forget it if page is not hinted to contain something prunable
+ */
+ if (!PageIsPrunable(dp))
+ return;
+
+ /*
+ * We prune when a previous UPDATE failed to find enough space on the
+ * page for a new tuple version, or when free space falls below the
+ * relation's fill-factor target (but not less than 10%).
+ *
+ * Checking free space here is questionable since we aren't holding
+ * any lock on the buffer; in the worst case we could get a bogus
+ * answer. It's unlikely to be *seriously* wrong, though, since
+ * reading either pd_lower or pd_upper is probably atomic. Avoiding
+ * taking a lock seems better than sometimes getting a wrong answer
+ * in what is after all just a heuristic estimate.
+ */
+ minfree = RelationGetTargetPageFreeSpace(relation,
+ HEAP_DEFAULT_FILLFACTOR);
+ minfree = Max(minfree, BLCKSZ / 10);
+
+ if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
+ {
+ /* OK, try to get exclusive buffer lock */
+ if (!ConditionalLockBufferForCleanup(buffer))
+ return;
+
+ /*
+ * Now that we have buffer lock, get accurate information about the
+ * page's free space, and recheck the heuristic about whether to prune.
+ */
+ if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
+ {
+ /* OK to prune (though not to remove redirects) */
+ (void) heap_page_prune(relation, buffer, OldestXmin, false, true);
+ }
+
+ /* And release buffer lock */
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ }
+}
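+
+To make the threshold concrete: RelationGetTargetPageFreeSpace(rel, HEAP_DEFAULT_FILLFACTOR) is BLCKSZ * (100 - fillfactor) / 100, and the default heap fillfactor is 100, so for an ordinary table minfree ends up as BLCKSZ / 10 (819 bytes on 8 kB pages); only a lower fillfactor raises the bar. The function is meant to be called right after a heap page is pinned and before any lock is taken, as the indexam.c change later in this patch does; a minimal sketch of that calling pattern follows (the sketch_ name is hypothetical).
+
+static Buffer
+sketch_read_and_maybe_prune(Relation relation, BlockNumber blkno)
+{
+    Buffer      buf = ReadBuffer(relation, blkno);
+
+    /* opportunistic pruning: pin held, no lock yet, as required above */
+    heap_page_prune_opt(relation, buf, RecentGlobalXmin);
+
+    LockBuffer(buf, BUFFER_LOCK_SHARE);
+    return buf;
+}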
+
+
+/*
+ * Prune and repair fragmentation in the specified page.
+ *
+ * Caller must have pin and buffer cleanup lock on the page.
+ *
+ * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
+ * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
+ *
+ * If redirect_move is set, we remove redirecting line pointers by
+ * updating the root line pointer to point directly to the first non-dead
+ * tuple in the chain. NOTE: eliminating the redirect changes the first
+ * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL.
+ * The only reason we support this capability at all is that by using it,
+ * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a
+ * good thing since VACUUM FULL is overly complicated already.
+ *
+ * If report_stats is true then we send the number of reclaimed heap-only
+ * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will
+ * send its own new total to pgstats, and we don't want this delta applied
+ * on top of that.)
+ *
+ * Returns the number of tuples deleted from the page.
+ */
+int
+heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
+ bool redirect_move, bool report_stats)
+{
+ int ndeleted = 0;
+ Page page = BufferGetPage(buffer);
+ OffsetNumber offnum,
+ maxoff;
+ OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
+ OffsetNumber nowdead[MaxHeapTuplesPerPage];
+ OffsetNumber nowunused[MaxHeapTuplesPerPage];
+ int nredirected = 0;
+ int ndead = 0;
+ int nunused = 0;
+
+ START_CRIT_SECTION();
+
+ /*
+ * Mark the page as clear of prunable tuples. If we find a tuple which
+ * may soon become prunable, we shall set the hint again. Also clear
+ * the "page is full" flag, since there's no point in repeating the
+ * prune/defrag process until something else happens to the page.
+ */
+ PageClearPrunable(page);
+ PageClearFull(page);
+
+ /* Scan the page */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ /* Nothing to do if slot is empty or already dead */
+ if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
+ continue;
+
+ /* Process this item or chain of items */
+ ndeleted += heap_prune_chain(relation, buffer, offnum,
+ OldestXmin,
+ redirected, &nredirected,
+ nowdead, &ndead,
+ nowunused, &nunused,
+ redirect_move);
+ }
+
+ /* Have we pruned any items? */
+ if (nredirected > 0 || ndead > 0 || nunused > 0)
+ {
+ /*
+ * Repair page fragmentation, and update the page's hint bit about
+ * whether it has free line pointers.
+ */
+ PageRepairFragmentation((Page) page);
+
+ MarkBufferDirty(buffer);
+
+ /*
+ * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did
+ */
+ if (!relation->rd_istemp)
+ {
+ XLogRecPtr recptr;
+
+ recptr = log_heap_clean(relation, buffer,
+ redirected, nredirected,
+ nowdead, ndead,
+ nowunused, nunused,
+ redirect_move);
+ PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
+ PageSetLSN(BufferGetPage(buffer), recptr);
+ }
+ }
+
+ END_CRIT_SECTION();
+
+ /*
+ * If requested, report the number of tuples reclaimed to pgstats.
+ * This is ndeleted minus ndead, because we don't want to count a now-DEAD
+ * root item as a deletion for this purpose.
+ */
+ if (report_stats && ndeleted > ndead)
+ pgstat_update_heap_dead_tuples(relation, ndeleted - ndead);
+
+ /*
+ * XXX Should we update the FSM information of this page ?
+ *
+ * There are two schools of thought here. We may not want to update
+ * FSM information so that the page is not used for unrelated
+ * UPDATEs/INSERTs and any free space in this page will remain
+ * available for further UPDATEs in *this* page, thus improving
+ * chances for doing HOT updates.
+ *
+ * But for a large table and where a page does not receive further
+ * UPDATEs for a long time, we might waste this space by not
+ * updating the FSM information. The relation may get extended and
+ * fragmented further.
+ *
+ * One possibility is to leave "fillfactor" worth of space in this
+ * page and update FSM with the remaining space.
+ *
+ * In any case, the current FSM implementation doesn't accept
+ * one-page-at-a-time updates, so this is all academic for now.
+ */
+
+ return ndeleted;
+}
+
+
+/*
+ * Prune specified item pointer or a HOT chain originating at that item.
+ *
+ * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
+ * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
+ * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
+ * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
+ * DEAD; the OldestXmin test is just too coarse to detect it.
+ *
+ * The root line pointer is redirected to the tuple immediately after the
+ * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
+ * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
+ * tuple, which we treat as a chain of length 1.)
+ *
+ * OldestXmin is the cutoff XID used to identify dead tuples.
+ *
+ * Redirected items are added to the redirected[] array (two entries per
+ * redirection); items set to LP_DEAD state are added to nowdead[]; and
+ * items set to LP_UNUSED state are added to nowunused[]. (These arrays
+ * will be used to generate a WAL record after all chains are pruned.)
+ *
+ * If redirect_move is true, we get rid of redirecting line pointers.
+ *
+ * Returns the number of tuples deleted from the page.
+ */
+static int
+heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
+ TransactionId OldestXmin,
+ OffsetNumber *redirected, int *nredirected,
+ OffsetNumber *nowdead, int *ndead,
+ OffsetNumber *nowunused, int *nunused,
+ bool redirect_move)
+{
+ int ndeleted = 0;
+ Page dp = (Page) BufferGetPage(buffer);
+ TransactionId priorXmax = InvalidTransactionId;
+ ItemId rootlp;
+ HeapTupleHeader htup;
+ OffsetNumber latestdead = InvalidOffsetNumber,
+ maxoff = PageGetMaxOffsetNumber(dp),
+ offnum;
+ OffsetNumber chainitems[MaxHeapTuplesPerPage];
+ int nchain = 0,
+ i;
+
+ rootlp = PageGetItemId(dp, rootoffnum);
+
+ /*
+ * If it's a heap-only tuple, then it is not the start of a HOT chain.
+ */
+ if (ItemIdIsNormal(rootlp))
+ {
+ htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
+ if (HeapTupleHeaderIsHeapOnly(htup))
+ {
+ /*
+ * If the tuple is DEAD and doesn't chain to anything else, mark it
+ * unused immediately. (If it does chain, we can only remove it as
+ * part of pruning its chain.)
+ *
+ * We need this primarily to handle aborted HOT updates, that is,
+ * XMIN_INVALID heap-only tuples. Those might not be linked to
+ * by any chain, since the parent tuple might be re-updated before
+ * any pruning occurs. So we have to be able to reap them
+ * separately from chain-pruning.
+ *
+ * Note that we might first arrive at a dead heap-only tuple
+ * either here or while following a chain below. Whichever path
+ * gets there first will mark the tuple unused.
+ */
+ if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)
+ == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
+ {
+ ItemIdSetUnused(rootlp);
+ heap_prune_record_unused(nowunused, nunused, rootoffnum);
+ ndeleted++;
+ }
+
+ /* Nothing more to do */
+ return ndeleted;
+ }
+ }
+
+ /* Start from the root tuple */
+ offnum = rootoffnum;
+
+ /* while not end of the chain */
+ for (;;)
+ {
+ ItemId lp;
+ bool tupdead,
+ recent_dead;
+
+ /* Some sanity checks */
+ if (offnum < FirstOffsetNumber || offnum > maxoff)
+ break;
+
+ lp = PageGetItemId(dp, offnum);
+
+ if (!ItemIdIsUsed(lp))
+ break;
+
+ /*
+ * If we are looking at the redirected root line pointer,
+ * jump to the first normal tuple in the chain. If we find
+	 * a redirect somewhere else, stop --- it must not be the same chain.
+ */
+ if (ItemIdIsRedirected(lp))
+ {
+ if (nchain > 0)
+ break; /* not at start of chain */
+ chainitems[nchain++] = offnum;
+ offnum = ItemIdGetRedirect(rootlp);
+ continue;
+ }
+
+ /*
+ * Likewise, a dead item pointer can't be part of the chain.
+ * (We already eliminated the case of dead root tuple outside
+ * this function.)
+ */
+ if (ItemIdIsDead(lp))
+ break;
+
+ Assert(ItemIdIsNormal(lp));
+ htup = (HeapTupleHeader) PageGetItem(dp, lp);
+
+ /*
+ * Check the tuple XMIN against prior XMAX, if any
+ */
+ if (TransactionIdIsValid(priorXmax) &&
+ !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
+ break;
+
+ /*
+ * OK, this tuple is indeed a member of the chain.
+ */
+ chainitems[nchain++] = offnum;
+
+ /*
+ * Check tuple's visibility status.
+ */
+ tupdead = recent_dead = false;
+
+ switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer))
+ {
+ case HEAPTUPLE_DEAD:
+ tupdead = true;
+ break;
+
+ case HEAPTUPLE_RECENTLY_DEAD:
+ recent_dead = true;
+ /*
+ * This tuple may soon become DEAD. Re-set the hint bit so
+ * that the page is reconsidered for pruning in future.
+ */
+ PageSetPrunable(dp);
+ break;
+
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+ /*
+ * This tuple may soon become DEAD. Re-set the hint bit so
+ * that the page is reconsidered for pruning in future.
+ */
+ PageSetPrunable(dp);
+ break;
+
+ case HEAPTUPLE_LIVE:
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+ /*
+ * If we wanted to optimize for aborts, we might consider
+ * marking the page prunable when we see INSERT_IN_PROGRESS.
+ * But we don't. See related decisions about when to mark
+ * the page prunable in heapam.c.
+ */
+ break;
+
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ break;
+ }
+
+ /*
+ * Remember the last DEAD tuple seen. We will advance past
+ * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
+ * but we can't advance past anything else. (XXX is it really worth
+ * continuing to scan beyond RECENTLY_DEAD? The case where we will
+ * find another DEAD tuple is a fairly unusual corner case.)
+ */
+ if (tupdead)
+ latestdead = offnum;
+ else if (!recent_dead)
+ break;
+
+ /*
+ * If the tuple is not HOT-updated, then we are at the end of this
+ * HOT-update chain.
+ */
+ if (!HeapTupleHeaderIsHotUpdated(htup))
+ break;
+
+ /*
+ * Advance to next chain member.
+ */
+ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
+ BufferGetBlockNumber(buffer));
+ offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+ priorXmax = HeapTupleHeaderGetXmax(htup);
+ }
+
+ /*
+ * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
+ * the DEAD tuples at the start of the chain are removed and the root line
+ * pointer is appropriately redirected.
+ */
+ if (OffsetNumberIsValid(latestdead))
+ {
+ /*
+ * Mark as unused each intermediate item that we are able to remove
+ * from the chain.
+ *
+ * When the previous item is the last dead tuple seen, we are at
+ * the right candidate for redirection.
+ */
+ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
+ {
+ ItemId lp = PageGetItemId(dp, chainitems[i]);
+
+ ItemIdSetUnused(lp);
+ heap_prune_record_unused(nowunused, nunused, chainitems[i]);
+ ndeleted++;
+ }
+
+ /*
+ * If the root entry had been a normal tuple, we are deleting it,
+ * so count it in the result. But changing a redirect (even to
+ * DEAD state) doesn't count.
+ */
+ if (ItemIdIsNormal(rootlp))
+ ndeleted++;
+
+ /*
+ * If the DEAD tuple is at the end of the chain, the entire chain is
+ * dead and the root line pointer can be marked dead. Otherwise
+ * just redirect the root to the correct chain member.
+ */
+ if (i >= nchain)
+ {
+ ItemIdSetDead(rootlp);
+ heap_prune_record_dead(nowdead, ndead, rootoffnum);
+ }
+ else
+ {
+ ItemIdSetRedirect(rootlp, chainitems[i]);
+ heap_prune_record_redirect(redirected, nredirected,
+ rootoffnum,
+ chainitems[i]);
+ }
+ }
+ else if (nchain < 2 && ItemIdIsRedirected(rootlp))
+ {
+ /*
+ * We found a redirect item that doesn't point to a valid follow-on
+ * item. This can happen if the loop in heap_page_prune caused us
+ * to visit the dead successor of a redirect item before visiting
+ * the redirect item. We can clean up by setting the redirect item
+ * to DEAD state.
+ */
+ ItemIdSetDead(rootlp);
+ heap_prune_record_dead(nowdead, ndead, rootoffnum);
+ }
+
+ /*
+ * If requested, eliminate LP_REDIRECT items by moving tuples. Note that
+ * if the root item is LP_REDIRECT and doesn't point to a valid follow-on
+ * item, we already killed it above.
+ */
+ if (redirect_move && ItemIdIsRedirected(rootlp))
+ {
+ OffsetNumber firstoffnum = ItemIdGetRedirect(rootlp);
+ ItemId firstlp = PageGetItemId(dp, firstoffnum);
+ HeapTupleData firsttup;
+
+ Assert(ItemIdIsNormal(firstlp));
+ /* Set up firsttup to reference the tuple at its existing CTID */
+ firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp);
+ firsttup.t_len = ItemIdGetLength(firstlp);
+ ItemPointerSet(&firsttup.t_self,
+ BufferGetBlockNumber(buffer),
+ firstoffnum);
+ firsttup.t_tableOid = RelationGetRelid(relation);
+
+ /*
+ * Mark the tuple for invalidation. Needed because we're changing
+ * its CTID.
+ */
+ CacheInvalidateHeapTuple(relation, &firsttup);
+
+ /*
+ * Change heap-only status of the tuple because after the line
+ * pointer manipulation, it's no longer a heap-only tuple, but is
+ * directly pointed to by index entries.
+ */
+ Assert(HeapTupleIsHeapOnly(&firsttup));
+ HeapTupleClearHeapOnly(&firsttup);
+
+ /* Now move the item pointer */
+ *rootlp = *firstlp;
+ ItemIdSetUnused(firstlp);
+
+ /*
+ * If latestdead is valid, we have already recorded the redirection
+ * above. Otherwise, do it now.
+ *
+ * We don't record firstlp in the nowunused[] array, since the
+ * redirection entry is enough to tell heap_xlog_clean what to do.
+ */
+ if (!OffsetNumberIsValid(latestdead))
+ heap_prune_record_redirect(redirected, nredirected, rootoffnum,
+ firstoffnum);
+ }
+
+ return ndeleted;
+}
+
+
+/* Record newly-redirected item pointer */
+static void
+heap_prune_record_redirect(OffsetNumber *redirected, int *nredirected,
+ OffsetNumber offnum, OffsetNumber rdoffnum)
+{
+ Assert(*nredirected < MaxHeapTuplesPerPage);
+ redirected[*nredirected * 2] = offnum;
+ redirected[*nredirected * 2 + 1] = rdoffnum;
+ (*nredirected)++;
+}
+
+/* Record newly-dead item pointer */
+static void
+heap_prune_record_dead(OffsetNumber *nowdead, int *ndead,
+ OffsetNumber offnum)
+{
+ Assert(*ndead < MaxHeapTuplesPerPage);
+ nowdead[*ndead] = offnum;
+ (*ndead)++;
+}
+
+/* Record newly-unused item pointer */
+static void
+heap_prune_record_unused(OffsetNumber *nowunused, int *nunused,
+ OffsetNumber offnum)
+{
+ Assert(*nunused < MaxHeapTuplesPerPage);
+ nowunused[*nunused] = offnum;
+ (*nunused)++;
+}
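+
+heap_prune_record_redirect packs two OffsetNumbers per redirection, which is why redirected[] is sized MaxHeapTuplesPerPage * 2 in heap_page_prune and why log_heap_clean multiplies nredirected by two. An illustration (not part of the patch; the sketch_ name is hypothetical) of the resulting layout and of a consumer walking it two entries at a time, as heap_xlog_clean does:
+
+static void
+sketch_redirect_pairs(Page page)
+{
+    OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
+    int         nredirected = 0;
+    int         i;
+
+    /* redirect item 5 to item 7, and item 2 to item 9 */
+    heap_prune_record_redirect(redirected, &nredirected, 5, 7);
+    heap_prune_record_redirect(redirected, &nredirected, 2, 9);
+    /* redirected[] now holds {5, 7, 2, 9} and nredirected is 2 */
+
+    for (i = 0; i < nredirected; i++)
+    {
+        OffsetNumber fromoff = redirected[2 * i];
+        OffsetNumber tooff = redirected[2 * i + 1];
+
+        ItemIdSetRedirect(PageGetItemId(page, fromoff), tooff);
+    }
+}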
+
+
+/*
+ * For all items in this page, find their respective root line pointers.
+ * If item k is part of a HOT-chain with root at item j, then we set
+ * root_offsets[k - 1] = j.
+ *
+ * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
+ * We zero out all unused entries.
+ *
+ * The function must be called with at least share lock on the buffer, to
+ * prevent concurrent prune operations.
+ *
+ * Note: The information collected here is valid only as long as the caller
+ * holds a pin on the buffer. Once pin is released, a tuple might be pruned
+ * and reused by a completely unrelated tuple.
+ */
+void
+heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
+{
+ OffsetNumber offnum, maxoff;
+
+ MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
+ {
+ ItemId lp = PageGetItemId(page, offnum);
+ HeapTupleHeader htup;
+ OffsetNumber nextoffnum;
+ TransactionId priorXmax;
+
+ /* skip unused and dead items */
+ if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
+ continue;
+
+ if (ItemIdIsNormal(lp))
+ {
+ htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+ /*
+ * Check if this tuple is part of a HOT-chain rooted at some other
+ * tuple. If so, skip it for now; we'll process it when we find
+ * its root.
+ */
+ if (HeapTupleHeaderIsHeapOnly(htup))
+ continue;
+
+ /*
+ * This is either a plain tuple or the root of a HOT-chain.
+ * Remember it in the mapping.
+ */
+ root_offsets[offnum - 1] = offnum;
+
+ /* If it's not the start of a HOT-chain, we're done with it */
+ if (!HeapTupleHeaderIsHotUpdated(htup))
+ continue;
+
+ /* Set up to scan the HOT-chain */
+ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+ priorXmax = HeapTupleHeaderGetXmax(htup);
+ }
+ else
+ {
+ /* Must be a redirect item. We do not set its root_offsets entry */
+ Assert(ItemIdIsRedirected(lp));
+ /* Set up to scan the HOT-chain */
+ nextoffnum = ItemIdGetRedirect(lp);
+ priorXmax = InvalidTransactionId;
+ }
+
+ /*
+ * Now follow the HOT-chain and collect other tuples in the chain.
+ *
+ * Note: Even though this is a nested loop, the complexity of the
+		 * function is O(N) because each tuple in the page is visited at
+		 * most twice: once in the outer loop and once while chasing a
+		 * HOT chain.
+ */
+ for (;;)
+ {
+ lp = PageGetItemId(page, nextoffnum);
+
+ /* Check for broken chains */
+ if (!ItemIdIsNormal(lp))
+ break;
+
+ htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+ if (TransactionIdIsValid(priorXmax) &&
+ !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
+ break;
+
+ /* Remember the root line pointer for this item */
+ root_offsets[nextoffnum - 1] = offnum;
+
+ /* Advance to next chain member, if any */
+ if (!HeapTupleHeaderIsHotUpdated(htup))
+ break;
+
+ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+ priorXmax = HeapTupleHeaderGetXmax(htup);
+ }
+ }
+}
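+
+heap_get_root_tuples exists for callers that must reference heap-only tuples by the TID of their chain root, for example during index builds elsewhere in this patch. A minimal usage sketch under those assumptions, with the caller holding a pin and at least share lock on the buffer as required above; the sketch_ name is hypothetical.
+
+static void
+sketch_root_tid(Buffer buffer, OffsetNumber offnum, ItemPointer result)
+{
+    OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+
+    heap_get_root_tuples(BufferGetPage(buffer), root_offsets);
+
+    /* unused entries are zeroed, i.e. InvalidOffsetNumber */
+    if (root_offsets[offnum - 1] == InvalidOffsetNumber)
+        elog(ERROR, "failed to find parent tuple");
+
+    ItemPointerSet(result, BufferGetBlockNumber(buffer),
+                   root_offsets[offnum - 1]);
+}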
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 60aab58de38..e8c5eec50ac 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -96,7 +96,7 @@
* Portions Copyright (c) 1994-5, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.6 2007/09/12 22:10:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/heap/rewriteheap.c,v 1.7 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -320,12 +320,14 @@ rewrite_heap_tuple(RewriteState state,
* Copy the original tuple's visibility information into new_tuple.
*
* XXX we might later need to copy some t_infomask2 bits, too?
+ * Right now, we intentionally clear the HOT status bits.
*/
memcpy(&new_tuple->t_data->t_choice.t_heap,
&old_tuple->t_data->t_choice.t_heap,
sizeof(HeapTupleFields));
new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK;
+ new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK;
new_tuple->t_data->t_infomask |=
old_tuple->t_data->t_infomask & HEAP_XACT_MASK;
@@ -593,7 +595,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
/* Now we can check to see if there's enough free space already. */
if (state->rs_buffer_valid)
{
- pageFreeSpace = PageGetFreeSpace(page);
+ pageFreeSpace = PageGetHeapFreeSpace(page);
if (len + saveFreeSpace > pageFreeSpace)
{
@@ -628,7 +630,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
/* And now we can insert the tuple into the page */
newoff = PageAddItem(page, (Item) heaptup->t_data, len,
- InvalidOffsetNumber, false);
+ InvalidOffsetNumber, false, true);
if (newoff == InvalidOffsetNumber)
elog(ERROR, "failed to add tuple");
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 0009739180c..7bf1e43cb45 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.62 2007/05/27 03:50:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.63 2007/09/20 17:56:30 tgl Exp $
*
* NOTES
* many of the old access method routines have been turned into
@@ -21,6 +21,7 @@
#include "access/genam.h"
#include "access/heapam.h"
+#include "access/transam.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -95,6 +96,9 @@ RelationGetIndexScan(Relation indexRelation,
ItemPointerSetInvalid(&scan->xs_ctup.t_self);
scan->xs_ctup.t_data = NULL;
scan->xs_cbuf = InvalidBuffer;
+ scan->xs_prev_xmax = InvalidTransactionId;
+ scan->xs_next_hot = InvalidOffsetNumber;
+ scan->xs_hot_dead = false;
/*
* Let the AM fill in the key and any opaque data it wants.
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index d905013a5fc..fd727ca68c8 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.98 2007/05/27 03:50:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.99 2007/09/20 17:56:30 tgl Exp $
*
* INTERFACE ROUTINES
* index_open - open an index relation by relation OID
@@ -64,6 +64,7 @@
#include "access/genam.h"
#include "access/heapam.h"
+#include "access/transam.h"
#include "pgstat.h"
#include "utils/relcache.h"
@@ -313,6 +314,8 @@ index_rescan(IndexScanDesc scan, ScanKey key)
scan->xs_cbuf = InvalidBuffer;
}
+ scan->xs_next_hot = InvalidOffsetNumber;
+
scan->kill_prior_tuple = false; /* for safety */
FunctionCall2(procedure,
@@ -370,6 +373,14 @@ index_markpos(IndexScanDesc scan)
* NOTE: this only restores the internal scan state of the index AM.
* The current result tuple (scan->xs_ctup) doesn't change. See comments
* for ExecRestrPos().
+ *
+ * NOTE: in the presence of HOT chains, mark/restore only works correctly
+ * if the scan's snapshot is MVCC-safe; that ensures that there's at most one
+ * returnable tuple in each HOT chain, and so restoring the prior state at the
+ * granularity of the index AM is sufficient. Since the only current user
+ * of mark/restore functionality is nodeMergejoin.c, this effectively means
+ * that merge-join plans only work for MVCC snapshots. This could be fixed
+ * if necessary, but for now it seems unimportant.
* ----------------
*/
void
@@ -377,9 +388,13 @@ index_restrpos(IndexScanDesc scan)
{
FmgrInfo *procedure;
+ Assert(IsMVCCSnapshot(scan->xs_snapshot));
+
SCAN_CHECKS;
GET_SCAN_PROCEDURE(amrestrpos);
+ scan->xs_next_hot = InvalidOffsetNumber;
+
scan->kill_prior_tuple = false; /* for safety */
FunctionCall1(procedure, PointerGetDatum(scan));
@@ -398,72 +413,224 @@ HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
HeapTuple heapTuple = &scan->xs_ctup;
+ ItemPointer tid = &heapTuple->t_self;
FmgrInfo *procedure;
SCAN_CHECKS;
GET_SCAN_PROCEDURE(amgettuple);
- /* just make sure this is false... */
- scan->kill_prior_tuple = false;
+ /*
+ * We always reset xs_hot_dead; if we are here then either we are just
+ * starting the scan, or we previously returned a visible tuple, and in
+ * either case it's inappropriate to kill the prior index entry.
+ */
+ scan->xs_hot_dead = false;
for (;;)
{
- bool found;
+ OffsetNumber offnum;
+ bool at_chain_start;
+ Page dp;
- /*
- * The AM's gettuple proc finds the next tuple matching the scan keys.
- */
- found = DatumGetBool(FunctionCall2(procedure,
- PointerGetDatum(scan),
- Int32GetDatum(direction)));
+ if (scan->xs_next_hot != InvalidOffsetNumber)
+ {
+ /*
+ * We are resuming scan of a HOT chain after having returned
+ * an earlier member. Must still hold pin on current heap page.
+ */
+ Assert(BufferIsValid(scan->xs_cbuf));
+ Assert(ItemPointerGetBlockNumber(tid) ==
+ BufferGetBlockNumber(scan->xs_cbuf));
+ Assert(TransactionIdIsValid(scan->xs_prev_xmax));
+ offnum = scan->xs_next_hot;
+ at_chain_start = false;
+ scan->xs_next_hot = InvalidOffsetNumber;
+ }
+ else
+ {
+ bool found;
+ Buffer prev_buf;
+
+ /*
+ * If we scanned a whole HOT chain and found only dead tuples,
+ * tell index AM to kill its entry for that TID.
+ */
+ scan->kill_prior_tuple = scan->xs_hot_dead;
+
+ /*
+ * The AM's gettuple proc finds the next index entry matching the
+ * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).
+ */
+ found = DatumGetBool(FunctionCall2(procedure,
+ PointerGetDatum(scan),
+ Int32GetDatum(direction)));
+
+ /* Reset kill flag immediately for safety */
+ scan->kill_prior_tuple = false;
+
+ /* If we're out of index entries, break out of outer loop */
+ if (!found)
+ break;
+
+ pgstat_count_index_tuples(scan->indexRelation, 1);
+
+ /* Switch to correct buffer if we don't have it already */
+ prev_buf = scan->xs_cbuf;
+ scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
+ scan->heapRelation,
+ ItemPointerGetBlockNumber(tid));
+
+ /*
+ * Prune page, but only if we weren't already on this page
+ */
+ if (prev_buf != scan->xs_cbuf)
+ heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
+ RecentGlobalXmin);
+
+ /* Prepare to scan HOT chain starting at index-referenced offnum */
+ offnum = ItemPointerGetOffsetNumber(tid);
+ at_chain_start = true;
+
+ /* We don't know what the first tuple's xmin should be */
+ scan->xs_prev_xmax = InvalidTransactionId;
+
+ /* Initialize flag to detect if all entries are dead */
+ scan->xs_hot_dead = true;
+ }
+
+ /* Obtain share-lock on the buffer so we can examine visibility */
+ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
- /* Reset kill flag immediately for safety */
- scan->kill_prior_tuple = false;
+ dp = (Page) BufferGetPage(scan->xs_cbuf);
- if (!found)
+ /* Scan through possible multiple members of HOT-chain */
+ for (;;)
{
- /* Release any held pin on a heap page */
- if (BufferIsValid(scan->xs_cbuf))
- {
- ReleaseBuffer(scan->xs_cbuf);
- scan->xs_cbuf = InvalidBuffer;
- }
- return NULL; /* failure exit */
- }
+ ItemId lp;
+ ItemPointer ctid;
- pgstat_count_index_tuples(scan->indexRelation, 1);
+ /* check for bogus TID */
+ if (offnum < FirstOffsetNumber ||
+ offnum > PageGetMaxOffsetNumber(dp))
+ break;
- /*
- * Fetch the heap tuple and see if it matches the snapshot.
- */
- if (heap_release_fetch(scan->heapRelation, scan->xs_snapshot,
- heapTuple, &scan->xs_cbuf, true,
- scan->indexRelation))
- break;
+ lp = PageGetItemId(dp, offnum);
- /* Skip if no undeleted tuple at this location */
- if (heapTuple->t_data == NULL)
- continue;
+ /* check for unused, dead, or redirected items */
+ if (!ItemIdIsNormal(lp))
+ {
+ /* We should only see a redirect at start of chain */
+ if (ItemIdIsRedirected(lp) && at_chain_start)
+ {
+ /* Follow the redirect */
+ offnum = ItemIdGetRedirect(lp);
+ at_chain_start = false;
+ continue;
+ }
+ /* else must be end of chain */
+ break;
+ }
- /*
- * If we can't see it, maybe no one else can either. Check to see if
- * the tuple is dead to all transactions. If so, signal the index AM
- * to not return it on future indexscans.
- *
- * We told heap_release_fetch to keep a pin on the buffer, so we can
- * re-access the tuple here. But we must re-lock the buffer first.
- */
- LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
+ /*
+ * We must initialize all of *heapTuple (ie, scan->xs_ctup)
+ * since it is returned to the executor on success.
+ */
+ heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
+ heapTuple->t_len = ItemIdGetLength(lp);
+ ItemPointerSetOffsetNumber(tid, offnum);
+ heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
+ ctid = &heapTuple->t_data->t_ctid;
+
+ /*
+ * Shouldn't see a HEAP_ONLY tuple at chain start. (This test
+ * should be unnecessary, since the chain root can't be removed
+ * while we have pin on the index entry, but let's make it anyway.)
+ */
+ if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
+ break;
+
+ /*
+ * The xmin should match the previous xmax value, else chain is
+ * broken. (Note: this test is not optional because it protects
+ * us against the case where the prior chain member's xmax
+ * aborted since we looked at it.)
+ */
+ if (TransactionIdIsValid(scan->xs_prev_xmax) &&
+ !TransactionIdEquals(scan->xs_prev_xmax,
+ HeapTupleHeaderGetXmin(heapTuple->t_data)))
+ break;
+
+ /* If it's visible per the snapshot, we must return it */
+ if (HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
+ scan->xs_cbuf))
+ {
+ /*
+ * If the snapshot is MVCC, we know that it could accept
+ * at most one member of the HOT chain, so we can skip
+ * examining any more members. Otherwise, check for
+ * continuation of the HOT-chain, and set state for next time.
+ */
+ if (IsMVCCSnapshot(scan->xs_snapshot))
+ scan->xs_next_hot = InvalidOffsetNumber;
+ else if (HeapTupleIsHotUpdated(heapTuple))
+ {
+ Assert(ItemPointerGetBlockNumber(ctid) ==
+ ItemPointerGetBlockNumber(tid));
+ scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
+ scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+ }
+ else
+ scan->xs_next_hot = InvalidOffsetNumber;
+
+ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ pgstat_count_heap_fetch(scan->indexRelation);
+
+ return heapTuple;
+ }
- if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
- scan->xs_cbuf) == HEAPTUPLE_DEAD)
- scan->kill_prior_tuple = true;
+ /*
+ * If we can't see it, maybe no one else can either. Check to see
+ * if the tuple is dead to all transactions. If we find that all
+ * the tuples in the HOT chain are dead, we'll signal the index AM
+ * to not return that TID on future indexscans.
+ */
+ if (scan->xs_hot_dead &&
+ HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
+ scan->xs_cbuf) != HEAPTUPLE_DEAD)
+ scan->xs_hot_dead = false;
+
+ /*
+ * Check to see if HOT chain continues past this tuple; if so
+ * fetch the next offnum (we don't bother storing it into
+ * xs_next_hot, but must store xs_prev_xmax), and loop around.
+ */
+ if (HeapTupleIsHotUpdated(heapTuple))
+ {
+ Assert(ItemPointerGetBlockNumber(ctid) ==
+ ItemPointerGetBlockNumber(tid));
+ offnum = ItemPointerGetOffsetNumber(ctid);
+ at_chain_start = false;
+ scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
+ }
+ else
+ break; /* end of chain */
+ } /* loop over a single HOT chain */
LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ /* Loop around to ask index AM for another TID */
+ scan->xs_next_hot = InvalidOffsetNumber;
+ }
+
+ /* Release any held pin on a heap page */
+ if (BufferIsValid(scan->xs_cbuf))
+ {
+ ReleaseBuffer(scan->xs_cbuf);
+ scan->xs_cbuf = InvalidBuffer;
}
- /* Success exit */
- return heapTuple;
+ return NULL; /* failure exit */
}
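
Both this loop and heap_prune_chain in pruneheap.c validate chain membership the same way: a tuple continues the HOT chain only if its xmin equals the previous member's xmax, which guards against a line pointer having been recycled after the earlier member's updater aborted. A one-function sketch of that invariant (the sketch_ name is hypothetical):

static bool
sketch_chain_continues(HeapTupleHeader prev_member, HeapTupleHeader candidate)
{
    return TransactionIdEquals(HeapTupleHeaderGetXmax(prev_member),
                               HeapTupleHeaderGetXmin(candidate));
}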
/* ----------------
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 7dbaa2c245f..5f7ecbe16da 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.159 2007/09/12 22:10:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.160 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -193,8 +193,6 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
*/
for (;;)
{
- HeapTupleData htup;
- Buffer hbuffer;
ItemId curitemid;
IndexTuple curitup;
BlockNumber nblkno;
@@ -223,6 +221,9 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
*/
if (!ItemIdIsDead(curitemid))
{
+ ItemPointerData htid;
+ bool all_dead;
+
/*
				 * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this is
				 * how we handle NULLs - and so we must not use _bt_compare
@@ -234,17 +235,20 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
/* okay, we gotta fetch the heap tuple ... */
curitup = (IndexTuple) PageGetItem(page, curitemid);
- htup.t_self = curitup->t_tid;
- if (heap_fetch(heapRel, &SnapshotDirty, &htup, &hbuffer,
- true, NULL))
+ htid = curitup->t_tid;
+
+ /*
+ * We check the whole HOT-chain to see if there is any tuple
+ * that satisfies SnapshotDirty. This is necessary because
+ * we have just a single index entry for the entire chain.
+ */
+ if (heap_hot_search(&htid, heapRel, &SnapshotDirty, &all_dead))
{
/* it is a duplicate */
TransactionId xwait =
(TransactionIdIsValid(SnapshotDirty.xmin)) ?
SnapshotDirty.xmin : SnapshotDirty.xmax;
- ReleaseBuffer(hbuffer);
-
/*
* If this tuple is being updated by other transaction
* then we have to wait for its commit/abort.
@@ -263,15 +267,22 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
* is itself now committed dead --- if so, don't complain.
* This is a waste of time in normal scenarios but we must
* do it to support CREATE INDEX CONCURRENTLY.
+ *
+ * We must follow HOT-chains here because during
+ * concurrent index build, we insert the root TID though
+ * the actual tuple may be somewhere in the HOT-chain.
+ * While following the chain we might not stop at the exact
+ * tuple which triggered the insert, but that's OK because
+ * if we find a live tuple anywhere in this chain, we have
+ * a unique key conflict. The other live tuple is not part
+ * of this chain because it had a different index entry.
*/
- htup.t_self = itup->t_tid;
- if (heap_fetch(heapRel, SnapshotSelf, &htup, &hbuffer,
- false, NULL))
+ htid = itup->t_tid;
+ if (heap_hot_search(&htid, heapRel, SnapshotSelf, NULL))
{
/* Normal case --- it's still live */
- ReleaseBuffer(hbuffer);
}
- else if (htup.t_data != NULL)
+ else
{
/*
* It's been deleted, so no error, and no need to
@@ -279,39 +290,27 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
*/
break;
}
- else
- {
- /* couldn't find the tuple?? */
- elog(ERROR, "failed to fetch tuple being inserted");
- }
ereport(ERROR,
(errcode(ERRCODE_UNIQUE_VIOLATION),
errmsg("duplicate key value violates unique constraint \"%s\"",
RelationGetRelationName(rel))));
}
- else if (htup.t_data != NULL)
+ else if (all_dead)
{
/*
- * Hmm, if we can't see the tuple, maybe it can be marked
- * killed. This logic should match index_getnext and
- * btgettuple.
+ * The conflicting tuple (or whole HOT chain) is dead to
+ * everyone, so we may as well mark the index entry
+ * killed.
*/
- LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
- if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
- hbuffer) == HEAPTUPLE_DEAD)
- {
- ItemIdMarkDead(curitemid);
- opaque->btpo_flags |= BTP_HAS_GARBAGE;
- /* be sure to mark the proper buffer dirty... */
- if (nbuf != InvalidBuffer)
- SetBufferCommitInfoNeedsSave(nbuf);
- else
- SetBufferCommitInfoNeedsSave(buf);
- }
- LockBuffer(hbuffer, BUFFER_LOCK_UNLOCK);
+ ItemIdMarkDead(curitemid);
+ opaque->btpo_flags |= BTP_HAS_GARBAGE;
+ /* be sure to mark the proper buffer dirty... */
+ if (nbuf != InvalidBuffer)
+ SetBufferCommitInfoNeedsSave(nbuf);
+ else
+ SetBufferCommitInfoNeedsSave(buf);
}
- ReleaseBuffer(hbuffer);
}
}
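
heap_hot_search is implemented in heapam.c and its body is outside this section; judging from its use here, the contract the unique check relies on is: given a chain-root TID, report whether any member of that HOT chain satisfies the snapshot, updating the TID to the satisfying member, and, when the all_dead argument is supplied, report whether every member of the chain is dead to all transactions so the index entry may be marked killed. A hedged sketch of that calling pattern (the sketch_ name is hypothetical):

static bool
sketch_probe_for_duplicate(Relation heapRel, IndexTuple curitup,
                           Snapshot snapshot, bool *chain_all_dead)
{
    /* the index entry stores the root TID of the whole HOT chain */
    ItemPointerData htid = curitup->t_tid;

    return heap_hot_search(&htid, heapRel, snapshot, chain_all_dead);
}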
@@ -840,7 +839,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add hikey to the right sibling");
rightoff = OffsetNumberNext(rightoff);
}
@@ -865,7 +864,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
item = (IndexTuple) PageGetItem(origpage, itemid);
}
if (PageAddItem(leftpage, (Item) item, itemsz, leftoff,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add hikey to the left sibling");
leftoff = OffsetNumberNext(leftoff);
@@ -1700,7 +1699,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
* benefit of _bt_restore_page().
*/
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add leftkey to new root page");
pfree(new_item);
@@ -1718,7 +1717,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
* insert the right page pointer into the new root page.
*/
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add rightkey to new root page");
pfree(new_item);
@@ -1805,7 +1804,7 @@ _bt_pgaddtup(Relation rel,
}
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add item to the %s for \"%s\"",
where, RelationGetRelationName(rel));
}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 118dc22bb35..6293792b9f5 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -57,7 +57,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.112 2007/09/12 22:10:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.113 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -400,7 +400,7 @@ _bt_sortaddtup(Page page,
}
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to the index page");
}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index db64422b19f..499129c48f1 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.45 2007/09/12 22:10:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.46 2007/09/20 17:56:30 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -141,8 +141,8 @@ _bt_restore_page(Page page, char *from, int len)
memcpy(&itupdata, from, sizeof(IndexTupleData));
itemsz = IndexTupleDSize(itupdata);
itemsz = MAXALIGN(itemsz);
- if (PageAddItem(page, (Item) from, itemsz,
- FirstOffsetNumber, false) == InvalidOffsetNumber)
+ if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber,
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "_bt_restore_page: cannot add item to page");
from += itemsz;
}
@@ -238,7 +238,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
{
if (PageAddItem(page, (Item) datapos, datalen,
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "btree_insert_redo: failed to add item");
PageSetLSN(page, lsn);
@@ -389,7 +389,7 @@ btree_xlog_split(bool onleft, bool isroot,
if (onleft)
{
if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
- false) == InvalidOffsetNumber)
+ false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add new item to left page after split");
}
@@ -398,7 +398,7 @@ btree_xlog_split(bool onleft, bool isroot,
hiItem = PageGetItem(rpage, hiItemId);
if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId),
- P_HIKEY, false) == InvalidOffsetNumber)
+ P_HIKEY, false, false) == InvalidOffsetNumber)
elog(PANIC, "failed to add high key to left page after split");
/* Fix opaque fields */