Diffstat (limited to 'src/backend/catalog/index.c')
 src/backend/catalog/index.c | 348 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 281 insertions(+), 67 deletions(-)
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 9aa58e35f9a..8137377e7a5 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.284 2007/05/30 20:11:55 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.285 2007/09/20 17:56:30 tgl Exp $
*
*
* INTERFACE ROUTINES
@@ -410,6 +410,9 @@ UpdateIndexRelation(Oid indexoid,
values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid);
+ values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false);
+ /* we set isvalid and isready the same way */
+ values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid);
values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
values[Anum_pg_index_indoption - 1] = PointerGetDatum(indoption);
@@ -944,7 +947,11 @@ BuildIndexInfo(Relation index)
/* other info */
ii->ii_Unique = indexStruct->indisunique;
- ii->ii_Concurrent = false; /* assume normal case */
+ ii->ii_ReadyForInserts = indexStruct->indisready;
+
+ /* initialize index-build state to default */
+ ii->ii_Concurrent = false;
+ ii->ii_BrokenHotChain = false;
return ii;
}
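
For reference, the flags handled above are fields of IndexInfo in
src/include/nodes/execnodes.h. A minimal compilable sketch of just the
members this patch touches (the struct name is shortened here for
illustration; the real struct also carries key counts, expression state,
and predicate state):

    #include <stdbool.h>

    /* sketch of the IndexInfo members used by the HOT patch */
    typedef struct IndexInfoSketch
    {
        bool    ii_Unique;          /* enforce uniqueness during build? */
        bool    ii_ReadyForInserts; /* mirrors pg_index.indisready */
        bool    ii_Concurrent;      /* concurrent build in progress? */
        bool    ii_BrokenHotChain;  /* output flag: set during the build
                                     * if a broken HOT chain is seen */
    } IndexInfoSketch;
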
@@ -1309,6 +1316,35 @@ index_build(Relation heapRelation,
Assert(PointerIsValid(stats));
/*
+ * If we found any potentially broken HOT chains, mark the index as
+ * not being usable until the current transaction is below the event
+ * horizon. See src/backend/access/heap/README.HOT for discussion.
+ */
+ if (indexInfo->ii_BrokenHotChain)
+ {
+ Oid indexId = RelationGetRelid(indexRelation);
+ Relation pg_index;
+ HeapTuple indexTuple;
+ Form_pg_index indexForm;
+
+ pg_index = heap_open(IndexRelationId, RowExclusiveLock);
+
+ indexTuple = SearchSysCacheCopy(INDEXRELID,
+ ObjectIdGetDatum(indexId),
+ 0, 0, 0);
+ if (!HeapTupleIsValid(indexTuple))
+ elog(ERROR, "cache lookup failed for index %u", indexId);
+ indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
+
+ indexForm->indcheckxmin = true;
+ simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
+ CatalogUpdateIndexes(pg_index, indexTuple);
+
+ heap_freetuple(indexTuple);
+ heap_close(pg_index, RowExclusiveLock);
+ }
+
+ /*
* Update heap and index pg_class rows
*/
index_update_stats(heapRelation,
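
The indcheckxmin flag set here is consumed on the planner side: an index
whose pg_index row has indcheckxmin = true is simply ignored until that
row's xmin falls below the executing transaction's xmin horizon, i.e.
until no snapshot old enough to see the broken HOT chains can still
exist. A hedged paraphrase of that test (the real check lives in the
planner's relation-info lookup, get_relation_info() in
src/backend/optimizer/util/plancat.c; the exact spelling may differ):

    /* paraphrase: skip an index not yet safe for our snapshot */
    if (index->indcheckxmin &&
        !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data),
                               TransactionXmin))
        continue;       /* pretend the index doesn't exist */
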
@@ -1346,6 +1382,11 @@ index_build(Relation heapRelation,
* must keep track of the number of index tuples; we don't do so here because
* the AM might reject some of the tuples for its own reasons, such as being
* unable to store NULLs.
+ *
+ * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect
+ * any potentially broken HOT chains. Currently, we set this if there are
+ * any RECENTLY_DEAD entries in a HOT chain, without trying very hard to
+ * detect whether they're really incompatible with the chain tip.
*/
double
IndexBuildHeapScan(Relation heapRelation,
@@ -1365,6 +1406,8 @@ IndexBuildHeapScan(Relation heapRelation,
ExprContext *econtext;
Snapshot snapshot;
TransactionId OldestXmin;
+ BlockNumber root_blkno = InvalidBlockNumber;
+ OffsetNumber root_offsets[MaxHeapTuplesPerPage];
/*
* sanity checks
@@ -1427,15 +1470,47 @@ IndexBuildHeapScan(Relation heapRelation,
CHECK_FOR_INTERRUPTS();
+ /*
+ * When dealing with a HOT-chain of updated tuples, we want to
+ * index the values of the live tuple (if any), but index it
+ * under the TID of the chain's root tuple. This approach is
+ * necessary to preserve the HOT-chain structure in the heap.
+ * So we need to be able to find the root item offset for every
+ * tuple that's in a HOT-chain. When first reaching a new page
+ * of the relation, call heap_get_root_tuples() to build a map
+ * of root item offsets on the page.
+ *
+ * It might look unsafe to use this information across buffer
+ * lock/unlock. However, we hold ShareLock on the table so no
+ * ordinary insert/update/delete should occur; and we hold pin on
+ * the buffer continuously while visiting the page, so no pruning
+ * operation can occur either.
+ *
+ * Note the implied assumption that there is no more than one live
+ * tuple per HOT-chain ...
+ */
+ if (scan->rs_cblock != root_blkno)
+ {
+ Page page = BufferGetPage(scan->rs_cbuf);
+
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+ heap_get_root_tuples(page, root_offsets);
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ root_blkno = scan->rs_cblock;
+ }
+
if (snapshot == SnapshotAny)
{
/* do our own time qual check */
bool indexIt;
+ recheck:
/*
* We could possibly get away with not locking the buffer here,
* since caller should hold ShareLock on the relation, but let's
- * be conservative about it.
+ * be conservative about it. (This remark is still correct
+ * even with HOT-pruning: our pin on the buffer prevents pruning.)
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
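
heap_get_root_tuples() is added elsewhere in the HOT patch (in
src/backend/access/heap/pruneheap.c); it fills root_offsets[] so that
root_offsets[i - 1] holds the root line pointer of the chain containing
the tuple at offset i. A self-contained toy sketch of the idea, with
plain arrays standing in for the page (next_off[] plays the role of the
t_ctid links and heap_only[] of the HEAP_ONLY_TUPLE flag; both arrays
are hypothetical):

    #include <stdio.h>

    #define NOFFS     6
    #define CHAIN_END 0

    /* next_off[i-1]: offset of tuple i's HOT successor, or CHAIN_END */
    static const int next_off[NOFFS]  = {3, 0, 5, 0, 0, 0};
    /* heap_only[i-1]: tuple i has no index entries of its own */
    static const int heap_only[NOFFS] = {0, 0, 1, 0, 1, 0};

    int
    main(void)
    {
        int root[NOFFS] = {0};

        /* every non-heap-only tuple starts a chain: walk it and stamp
         * each member with the root's offset, much as the real function
         * does with line pointers plus xmin/xmax matching */
        for (int off = 1; off <= NOFFS; off++)
        {
            if (heap_only[off - 1])
                continue;
            for (int cur = off; cur != CHAIN_END; cur = next_off[cur - 1])
                root[cur - 1] = off;
        }

        for (int off = 1; off <= NOFFS; off++)
            printf("offset %d -> root %d\n", off, root[off - 1]);
        return 0;
    }

Here offsets 1 -> 3 -> 5 form one HOT chain, so tuples 3 and 5 are
reported under root 1, which is exactly the TID substitution performed
further down in this function.
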
@@ -1458,10 +1533,29 @@ IndexBuildHeapScan(Relation heapRelation,
* If tuple is recently deleted then we must index it
* anyway to preserve MVCC semantics. (Pre-existing
* transactions could try to use the index after we finish
- * building it, and may need to see such tuples.) Exclude
- * it from unique-checking, however.
+ * building it, and may need to see such tuples.)
+ *
+ * However, if it was HOT-updated then we must only index
+ * the live tuple at the end of the HOT-chain. Since this
+ * breaks semantics for pre-existing snapshots, mark
+ * the index as unusable for them.
+ *
+ * If we've already decided that the index will be unsafe
+ * for old snapshots, we may as well stop indexing
+ * recently-dead tuples, since there's no longer any
+ * point.
*/
- indexIt = true;
+ if (HeapTupleIsHotUpdated(heapTuple))
+ {
+ indexIt = false;
+ /* mark the index as unsafe for old snapshots */
+ indexInfo->ii_BrokenHotChain = true;
+ }
+ else if (indexInfo->ii_BrokenHotChain)
+ indexIt = false;
+ else
+ indexIt = true;
+ /* In any case, exclude the tuple from unique-checking */
tupleIsAlive = false;
break;
case HEAPTUPLE_INSERT_IN_PROGRESS:
@@ -1473,12 +1567,31 @@ IndexBuildHeapScan(Relation heapRelation,
* followed by CREATE INDEX within a transaction.) An
* exception occurs when reindexing a system catalog,
* because we often release lock on system catalogs before
- * committing.
+ * committing. In that case we wait for the inserting
+ * transaction to finish and check again. (We could do
+ * that on user tables too, but since the case is not
+ * expected it seems better to throw an error.)
*/
if (!TransactionIdIsCurrentTransactionId(
- HeapTupleHeaderGetXmin(heapTuple->t_data))
- && !IsSystemRelation(heapRelation))
- elog(ERROR, "concurrent insert in progress");
+ HeapTupleHeaderGetXmin(heapTuple->t_data)))
+ {
+ if (!IsSystemRelation(heapRelation))
+ elog(ERROR, "concurrent insert in progress");
+ else
+ {
+ /*
+ * Must drop the lock on the buffer before we wait
+ */
+ TransactionId xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ XactLockTableWait(xwait);
+ goto recheck;
+ }
+ }
+ /*
+ * We must index such tuples, since if the index build
+ * commits then they're good.
+ */
indexIt = true;
tupleIsAlive = true;
break;
@@ -1491,19 +1604,48 @@ IndexBuildHeapScan(Relation heapRelation,
* followed by CREATE INDEX within a transaction.) An
* exception occurs when reindexing a system catalog,
* because we often release lock on system catalogs before
- * committing.
+ * committing. In that case we wait for the deleting
+ * transaction to finish and check again. (We could do
+ * that on user tables too, but since the case is not
+ * expected it seems better to throw an error.)
*/
Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI));
if (!TransactionIdIsCurrentTransactionId(
- HeapTupleHeaderGetXmax(heapTuple->t_data))
- && !IsSystemRelation(heapRelation))
- elog(ERROR, "concurrent delete in progress");
- indexIt = true;
+ HeapTupleHeaderGetXmax(heapTuple->t_data)))
+ {
+ if (!IsSystemRelation(heapRelation))
+ elog(ERROR, "concurrent delete in progress");
+ else
+ {
+ /*
+ * Must drop the lock on the buffer before we wait
+ */
+ TransactionId xwait = HeapTupleHeaderGetXmax(heapTuple->t_data);
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ XactLockTableWait(xwait);
+ goto recheck;
+ }
+ }
+ /*
+ * Otherwise, we have to treat these tuples just like
+ * RECENTLY_DEAD ones.
+ */
+ if (HeapTupleIsHotUpdated(heapTuple))
+ {
+ indexIt = false;
+ /* mark the index as unsafe for old snapshots */
+ indexInfo->ii_BrokenHotChain = true;
+ }
+ else if (indexInfo->ii_BrokenHotChain)
+ indexIt = false;
+ else
+ indexIt = true;
+ /* In any case, exclude the tuple from unique-checking */
tupleIsAlive = false;
break;
default:
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
- indexIt = tupleIsAlive = false; /* keep compiler quiet */
+ indexIt = tupleIsAlive = false; /* keep compiler quiet */
break;
}
@@ -1552,9 +1694,33 @@ IndexBuildHeapScan(Relation heapRelation,
* pass the values[] and isnull[] arrays, instead.
*/
- /* Call the AM's callback routine to process the tuple */
- callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
- callback_state);
+ if (HeapTupleIsHeapOnly(heapTuple))
+ {
+ /*
+ * For a heap-only tuple, pretend its TID is that of the root.
+ * See src/backend/access/heap/README.HOT for discussion.
+ */
+ HeapTupleData rootTuple;
+ OffsetNumber offnum;
+
+ rootTuple = *heapTuple;
+ offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
+
+ Assert(OffsetNumberIsValid(root_offsets[offnum - 1]));
+
+ ItemPointerSetOffsetNumber(&rootTuple.t_self,
+ root_offsets[offnum - 1]);
+
+ /* Call the AM's callback routine to process the tuple */
+ callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
+ callback_state);
+ }
+ else
+ {
+ /* Call the AM's callback routine to process the tuple */
+ callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
+ callback_state);
+ }
}
heap_endscan(scan);
@@ -1574,8 +1740,15 @@ IndexBuildHeapScan(Relation heapRelation,
/*
* validate_index - support code for concurrent index builds
*
- * We do a concurrent index build by first building the index normally via
- * index_create(), while holding a weak lock that allows concurrent
+ * We do a concurrent index build by first inserting the catalog entry for the
+ * index via index_create(), marking it not indisready and not indisvalid.
+ * Then we commit our transaction and start a new one, then we wait for all
+ * transactions that could have been modifying the table to terminate. Now
+ * we know that any subsequently-started transactions will see the index and
+ * honor its constraints on HOT updates; so while existing HOT-chains might
+ * be broken with respect to the index, no currently live tuple will have an
+ * incompatible HOT update done to it. We now build the index normally via
+ * index_build(), while holding a weak lock that allows concurrent
* insert/update/delete. Also, we index only tuples that are valid
* as of the start of the scan (see IndexBuildHeapScan), whereas a normal
* build takes care to include recently-dead tuples. This is OK because
@@ -1586,11 +1759,10 @@ IndexBuildHeapScan(Relation heapRelation,
* if we used HeapTupleSatisfiesVacuum). This leaves us with an index that
* does not contain any tuples added to the table while we built the index.
*
- * Next, we commit the transaction so that the index becomes visible to other
- * backends, but it is marked not "indisvalid" to prevent the planner from
- * relying on it for indexscans. Then we wait for all transactions that
- * could have been modifying the table to terminate. At this point we
- * know that any subsequently-started transactions will see the index and
+ * Next, we mark the index "indisready" (but still not "indisvalid") and
+ * commit the second transaction and start a third. Again we wait for all
+ * transactions that could have been modifying the table to terminate. Now
+ * we know that any subsequently-started transactions will see the index and
* insert their new tuples into it. We then take a new reference snapshot
* which is passed to validate_index(). Any tuples that are valid according
* to this snap, but are not in the index, must be added to the index.
@@ -1610,7 +1782,7 @@ IndexBuildHeapScan(Relation heapRelation,
* Building a unique index this way is tricky: we might try to insert a
* tuple that is already dead or is in process of being deleted, and we
* mustn't have a uniqueness failure against an updated version of the same
- * row. We can check the tuple to see if it's already dead and tell
+ * row. We could try to check the tuple to see if it's already dead and tell
* index_insert() not to do the uniqueness check, but that still leaves us
* with a race condition against an in-progress update. To handle that,
* we expect the index AM to recheck liveness of the to-be-inserted tuple
@@ -1620,7 +1792,8 @@ IndexBuildHeapScan(Relation heapRelation,
* were alive at the time of the reference snapshot are gone; this is
* necessary to be sure there are none left with a serializable snapshot
* older than the reference (and hence possibly able to see tuples we did
- * not index). Then we mark the index valid and commit.
+ * not index). Then we mark the index "indisvalid" and commit. Subsequent
+ * transactions will be able to use it for queries.
*
* Doing two full table scans is a brute-force strategy. We could try to be
* cleverer, eg storing new tuples in a special area of the table (perhaps
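
For orientation, the overall shape of validate_index() (defined in this
file, outside the hunks shown here) is a two-pass merge. The following
is a condensed paraphrase with argument lists abbreviated, not the
verbatim code:

    /* pass 1: dump every TID currently in the index into a tuplesort,
     * using the AM's bulk-delete API with a callback that deletes
     * nothing and merely records each TID */
    state.tuplesort = tuplesort_begin_datum(TIDOID, TIDLessOperator, ...);
    (void) index_bulk_delete(&ivinfo, NULL,
                             validate_index_callback, (void *) &state);
    tuplesort_performsort(state.tuplesort);

    /* pass 2: scan the heap under the reference snapshot and insert any
     * visible tuple whose root TID is absent from the sorted stream */
    validate_index_heapscan(heapRelation, indexRelation, indexInfo,
                            snapshot, &state);
    tuplesort_end(state.tuplesort);
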
@@ -1727,6 +1900,9 @@ validate_index_heapscan(Relation heapRelation,
TupleTableSlot *slot;
EState *estate;
ExprContext *econtext;
+ BlockNumber root_blkno = InvalidBlockNumber;
+ OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+ bool in_index[MaxHeapTuplesPerPage];
/* state variables for the merge */
ItemPointer indexcursor = NULL;
@@ -1768,39 +1944,86 @@ validate_index_heapscan(Relation heapRelation,
while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
ItemPointer heapcursor = &heapTuple->t_self;
+ ItemPointerData rootTuple;
+ OffsetNumber root_offnum;
CHECK_FOR_INTERRUPTS();
state->htups += 1;
/*
+ * As commented in IndexBuildHeapScan, we should index heap-only tuples
+ * under the TIDs of their root tuples; so when we advance onto a new
+ * heap page, build a map of root item offsets on the page.
+ *
+ * This complicates merging against the tuplesort output: we will
+ * visit the live tuples in order by their offsets, but the root
+ * offsets that we need to compare against the index contents might
+ * be ordered differently. So we might have to "look back" within
+ * the tuplesort output, but only within the current page. We handle
+ * that by keeping a bool array in_index[] showing all the
+ * already-passed-over tuplesort output TIDs of the current page.
+ * We clear that array here, when advancing onto a new heap page.
+ */
+ if (scan->rs_cblock != root_blkno)
+ {
+ Page page = BufferGetPage(scan->rs_cbuf);
+
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
+ heap_get_root_tuples(page, root_offsets);
+ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ memset(in_index, 0, sizeof(in_index));
+
+ root_blkno = scan->rs_cblock;
+ }
+
+ /* Convert actual tuple TID to root TID */
+ rootTuple = *heapcursor;
+ root_offnum = ItemPointerGetOffsetNumber(heapcursor);
+
+ if (HeapTupleIsHeapOnly(heapTuple))
+ {
+ root_offnum = root_offsets[root_offnum - 1];
+ Assert(OffsetNumberIsValid(root_offnum));
+ ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
+ }
+
+ /*
* "merge" by skipping through the index tuples until we find or pass
- * the current heap tuple.
+ * the current root tuple.
*/
while (!tuplesort_empty &&
(!indexcursor ||
- ItemPointerCompare(indexcursor, heapcursor) < 0))
+ ItemPointerCompare(indexcursor, &rootTuple) < 0))
{
Datum ts_val;
bool ts_isnull;
if (indexcursor)
+ {
+ /*
+ * Remember index items seen earlier on the current heap page
+ */
+ if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
+ in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
pfree(indexcursor);
+ }
+
tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
&ts_val, &ts_isnull);
Assert(tuplesort_empty || !ts_isnull);
indexcursor = (ItemPointer) DatumGetPointer(ts_val);
}
- if (tuplesort_empty ||
- ItemPointerCompare(indexcursor, heapcursor) > 0)
+ /*
+ * If the tuplesort has overshot *and* we didn't see a match earlier,
+ * then this tuple is missing from the index, so insert it.
+ */
+ if ((tuplesort_empty ||
+ ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
+ !in_index[root_offnum - 1])
{
- /*
- * We've overshot which means this heap tuple is missing from the
- * index, so insert it.
- */
- bool check_unique;
-
MemoryContextReset(econtext->ecxt_per_tuple_memory);
/* Set up for predicate or expression evaluation */
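
Because the merge above compares root TIDs that can arrive out of
physical order, the role of in_index[] is easy to miss. A self-contained
toy model of the merge on a single page (index_tids, heap_roots, and the
messages are all hypothetical):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define MAX_OFFS 8

    int
    main(void)
    {
        /* sorted offsets (TIDs on one page) already in the index */
        int  index_tids[] = {1, 2};
        int  n_index = 2, icur = 0;
        bool in_index[MAX_OFFS];

        /* root offsets in heap-scan order: the live member of the chain
         * rooted at offset 1 is a heap-only tuple at offset 3, so root 1
         * is visited *after* the plain tuple at offset 2 */
        int  heap_roots[] = {2, 1};

        memset(in_index, 0, sizeof(in_index));
        for (int i = 0; i < 2; i++)
        {
            int root = heap_roots[i];

            /* advance the cursor, remembering every TID passed over */
            while (icur < n_index && index_tids[icur] < root)
                in_index[index_tids[icur++] - 1] = true;

            if (icur < n_index && index_tids[icur] == root)
            {
                in_index[index_tids[icur++] - 1] = true;
                printf("root %d: found at cursor\n", root);
            }
            else if (in_index[root - 1])
                printf("root %d: already passed, look-back hit\n", root);
            else
                printf("root %d: missing, would insert\n", root);
        }
        return 0;
    }

Without the in_index[] look-back, the overshoot test alone would wrongly
conclude that root 1 is missing and insert a duplicate index entry.
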
@@ -1828,39 +2051,29 @@ validate_index_heapscan(Relation heapRelation,
isnull);
/*
- * If the tuple is already committed dead, we still have to put it
- * in the index (because some xacts might be able to see it), but
- * we might as well suppress uniqueness checking. This is just an
- * optimization because the index AM is not supposed to raise a
- * uniqueness failure anyway.
- */
- if (indexInfo->ii_Unique)
- {
- /* must lock buffer to call HeapTupleSatisfiesVisibility */
- LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-
- if (HeapTupleSatisfiesVisibility(heapTuple, SnapshotNow,
- scan->rs_cbuf))
- check_unique = true;
- else
- check_unique = false;
-
- LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
- }
- else
- check_unique = false;
-
- /*
* You'd think we should go ahead and build the index tuple here,
* but some index AMs want to do further processing on the data
* first. So pass the values[] and isnull[] arrays, instead.
*/
+
+ /*
+ * If the tuple is already committed dead, you might think we
+ * could suppress uniqueness checking, but this is no longer
+ * true in the presence of HOT, because the insert is actually
+ * a proxy for a uniqueness check on the whole HOT-chain. That
+ * is, the tuple we have here could be dead because it was already
+ * HOT-updated, and if so the updating transaction will not have
+ * thought it should insert index entries. The index AM will
+ * check the whole HOT-chain and correctly detect a conflict
+ * if there is one.
+ */
+
index_insert(indexRelation,
values,
isnull,
- heapcursor,
+ &rootTuple,
heapRelation,
- check_unique);
+ indexInfo->ii_Unique);
state->tups_inserted += 1;
}
@@ -1983,9 +2196,9 @@ reindex_index(Oid indexId)
ResetReindexProcessing();
/*
- * If the index is marked invalid (ie, it's from a failed CREATE INDEX
- * CONCURRENTLY), we can now mark it valid. This allows REINDEX to be
- * used to clean up in such cases.
+ * If the index is marked invalid or not ready (ie, it's from a failed
+ * CREATE INDEX CONCURRENTLY), we can now mark it valid. This allows
+ * REINDEX to be used to clean up in such cases.
*/
pg_index = heap_open(IndexRelationId, RowExclusiveLock);
@@ -1996,9 +2209,10 @@ reindex_index(Oid indexId)
elog(ERROR, "cache lookup failed for index %u", indexId);
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
- if (!indexForm->indisvalid)
+ if (!indexForm->indisvalid || !indexForm->indisready)
{
indexForm->indisvalid = true;
+ indexForm->indisready = true;
simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
CatalogUpdateIndexes(pg_index, indexTuple);
}