about | summary | refs | log | tree | commit | diff
path: root/src/backend/access
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access')
-rw-r--r-- src/backend/access/brin/brin.c 2
-rw-r--r-- src/backend/access/common/reloptions.c 17
-rw-r--r-- src/backend/access/common/tupdesc.c 15
-rw-r--r-- src/backend/access/gist/gistutil.c 14
-rw-r--r-- src/backend/access/gist/gistvalidate.c 6
-rw-r--r-- src/backend/access/heap/heapam.c 35
-rw-r--r-- src/backend/access/heap/heapam_handler.c 2
-rw-r--r-- src/backend/access/heap/heapam_xlog.c 7
-rw-r--r-- src/backend/access/heap/vacuumlazy.c 148
-rw-r--r-- src/backend/access/nbtree/nbtpreprocesskeys.c 412
-rw-r--r-- src/backend/access/nbtree/nbtree.c 32
-rw-r--r-- src/backend/access/nbtree/nbtsearch.c 530
-rw-r--r-- src/backend/access/nbtree/nbtsort.c 2
-rw-r--r-- src/backend/access/nbtree/nbtutils.c 390
-rw-r--r-- src/backend/access/rmgrdesc/replorigindesc.c 2
-rw-r--r-- src/backend/access/rmgrdesc/xactdesc.c 6
-rw-r--r-- src/backend/access/rmgrdesc/xlogdesc.c 6
-rw-r--r-- src/backend/access/transam/commit_ts.c 7
-rw-r--r-- src/backend/access/transam/multixact.c 16
-rw-r--r-- src/backend/access/transam/timeline.c 4
-rw-r--r-- src/backend/access/transam/twophase.c 253
-rw-r--r-- src/backend/access/transam/xact.c 13
-rw-r--r-- src/backend/access/transam/xlog.c 61
-rw-r--r-- src/backend/access/transam/xlogbackup.c 8
-rw-r--r-- src/backend/access/transam/xlogprefetcher.c 16
-rw-r--r-- src/backend/access/transam/xlogreader.c 62
-rw-r--r-- src/backend/access/transam/xlogrecovery.c 178
-rw-r--r-- src/backend/access/transam/xlogutils.c 2
28 files changed, 1320 insertions, 926 deletions
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 01e1db7f856..4204088fa0d 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -68,7 +68,7 @@ typedef struct BrinShared
int scantuplesortstates;
/* Query ID, for report in worker processes */
- uint64 queryid;
+ int64 queryid;
/*
* workersdonecv is used to monitor the progress of workers. All parallel
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 46c1dce222d..50747c16396 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -1243,8 +1243,9 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
}
else
{
- text *t;
+ const char *name;
const char *value;
+ text *t;
Size len;
/*
@@ -1291,11 +1292,19 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
* have just "name", assume "name=true" is meant. Note: the
* namespace is not output.
*/
+ name = def->defname;
if (def->arg != NULL)
value = defGetString(def);
else
value = "true";
+ /* Insist that name not contain "=", else "a=b=c" is ambiguous */
+ if (strchr(name, '=') != NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid option name \"%s\": must not contain \"=\"",
+ name)));
+
/*
* This is not a great place for this test, but there's no other
* convenient place to filter the option out. As WITH (oids =
@@ -1303,7 +1312,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
* amount of ugly.
*/
if (acceptOidsOff && def->defnamespace == NULL &&
- strcmp(def->defname, "oids") == 0)
+ strcmp(name, "oids") == 0)
{
if (defGetBoolean(def))
ereport(ERROR,
@@ -1313,11 +1322,11 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
continue;
}
- len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value);
+ len = VARHDRSZ + strlen(name) + 1 + strlen(value);
/* +1 leaves room for sprintf's trailing null */
t = (text *) palloc(len + 1);
SET_VARSIZE(t, len);
- sprintf(VARDATA(t), "%s=%s", def->defname, value);
+ sprintf(VARDATA(t), "%s=%s", name, value);
astate = accumArrayResult(astate, PointerGetDatum(t),
false, TEXTOID,
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index ffd0c78f905..020d00cd01c 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -142,11 +142,18 @@ void
verify_compact_attribute(TupleDesc tupdesc, int attnum)
{
#ifdef USE_ASSERT_CHECKING
- CompactAttribute *cattr = &tupdesc->compact_attrs[attnum];
+ CompactAttribute cattr;
Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum);
CompactAttribute tmp;
/*
+ * Make a temp copy of the TupleDesc's CompactAttribute. This may be a
+ * shared TupleDesc and the attcacheoff might get changed by another
+ * backend.
+ */
+ memcpy(&cattr, &tupdesc->compact_attrs[attnum], sizeof(CompactAttribute));
+
+ /*
* Populate the temporary CompactAttribute from the corresponding
* Form_pg_attribute
*/
@@ -156,11 +163,11 @@ verify_compact_attribute(TupleDesc tupdesc, int attnum)
* Make the attcacheoff match since it's been reset to -1 by
* populate_compact_attribute_internal. Same with attnullability.
*/
- tmp.attcacheoff = cattr->attcacheoff;
- tmp.attnullability = cattr->attnullability;
+ tmp.attcacheoff = cattr.attcacheoff;
+ tmp.attnullability = cattr.attnullability;
/* Check the freshly populated CompactAttribute matches the TupleDesc's */
- Assert(memcmp(&tmp, cattr, sizeof(CompactAttribute)) == 0);
+ Assert(memcmp(&tmp, &cattr, sizeof(CompactAttribute)) == 0);
#endif
}
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index a6b701943d3..c0aa7d0222f 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -1058,11 +1058,11 @@ gistGetFakeLSN(Relation rel)
}
/*
- * This is a stratnum support function for GiST opclasses that use the
- * RT*StrategyNumber constants.
+ * This is a stratnum translation support function for GiST opclasses that use
+ * the RT*StrategyNumber constants.
*/
Datum
-gist_stratnum_common(PG_FUNCTION_ARGS)
+gist_translate_cmptype_common(PG_FUNCTION_ARGS)
{
CompareType cmptype = PG_GETARG_INT32(0);
@@ -1090,9 +1090,9 @@ gist_stratnum_common(PG_FUNCTION_ARGS)
/*
* Returns the opclass's private stratnum used for the given compare type.
*
- * Calls the opclass's GIST_STRATNUM_PROC support function, if any,
- * and returns the result.
- * Returns InvalidStrategy if the function is not defined.
+ * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any,
+ * and returns the result. Returns InvalidStrategy if the function is not
+ * defined.
*/
StrategyNumber
gisttranslatecmptype(CompareType cmptype, Oid opfamily)
@@ -1101,7 +1101,7 @@ gisttranslatecmptype(CompareType cmptype, Oid opfamily)
Datum result;
/* Check whether the function is provided. */
- funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_STRATNUM_PROC);
+ funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_TRANSLATE_CMPTYPE_PROC);
if (!OidIsValid(funcid))
return InvalidStrategy;
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
index 2a49e6d20f0..2ed6f74fce9 100644
--- a/src/backend/access/gist/gistvalidate.c
+++ b/src/backend/access/gist/gistvalidate.c
@@ -138,7 +138,7 @@ gistvalidate(Oid opclassoid)
ok = check_amproc_signature(procform->amproc, VOIDOID, true,
1, 1, INTERNALOID);
break;
- case GIST_STRATNUM_PROC:
+ case GIST_TRANSLATE_CMPTYPE_PROC:
ok = check_amproc_signature(procform->amproc, INT2OID, true,
1, 1, INT4OID) &&
procform->amproclefttype == ANYOID &&
@@ -265,7 +265,7 @@ gistvalidate(Oid opclassoid)
if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC ||
i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC ||
i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC ||
- i == GIST_STRATNUM_PROC)
+ i == GIST_TRANSLATE_CMPTYPE_PROC)
continue; /* optional methods */
ereport(INFO,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
@@ -336,7 +336,7 @@ gistadjustmembers(Oid opfamilyoid,
case GIST_FETCH_PROC:
case GIST_OPTIONS_PROC:
case GIST_SORTSUPPORT_PROC:
- case GIST_STRATNUM_PROC:
+ case GIST_TRANSLATE_CMPTYPE_PROC:
/* Optional, so force it to be a soft family dependency */
op->ref_is_hard = false;
op->ref_is_family = true;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9ec8cda1c68..0dcd6ee817e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -213,6 +213,27 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
#define TUPLOCK_from_mxstatus(status) \
(MultiXactStatusLock[(status)])
+/*
+ * Check that we have a valid snapshot if we might need TOAST access.
+ */
+static inline void
+AssertHasSnapshotForToast(Relation rel)
+{
+#ifdef USE_ASSERT_CHECKING
+
+ /* bootstrap mode in particular breaks this rule */
+ if (!IsNormalProcessingMode())
+ return;
+
+ /* if the relation doesn't have a TOAST table, we are good */
+ if (!OidIsValid(rel->rd_rel->reltoastrelid))
+ return;
+
+ Assert(HaveRegisteredOrActiveSnapshot());
+
+#endif /* USE_ASSERT_CHECKING */
+}
+
/* ----------------------------------------------------------------
* heap support routines
* ----------------------------------------------------------------
@@ -2066,6 +2087,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
RelationGetNumberOfAttributes(relation));
+ AssertHasSnapshotForToast(relation);
+
/*
* Fill in tuple header fields and toast the tuple if necessary.
*
@@ -2343,6 +2366,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
/* currently not needed (thus unsupported) for heap_multi_insert() */
Assert(!(options & HEAP_INSERT_NO_LOGICAL));
+ AssertHasSnapshotForToast(relation);
+
needwal = RelationNeedsWAL(relation);
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
@@ -2765,6 +2790,8 @@ heap_delete(Relation relation, ItemPointer tid,
Assert(ItemPointerIsValid(tid));
+ AssertHasSnapshotForToast(relation);
+
/*
* Forbid this during a parallel operation, lest it allocate a combo CID.
* Other workers might need that combo CID for visibility checks, and we
@@ -3260,6 +3287,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
RelationGetNumberOfAttributes(relation));
+ AssertHasSnapshotForToast(relation);
+
/*
* Forbid this during a parallel operation, lest it allocate a combo CID.
* Other workers might need that combo CID for visibility checks, and we
@@ -4953,7 +4982,7 @@ l3:
case LockWaitError:
if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
status, infomask, relation,
- NULL, log_lock_failure))
+ NULL, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
@@ -4991,7 +5020,7 @@ l3:
}
break;
case LockWaitError:
- if (!ConditionalXactLockTableWait(xwait, log_lock_failure))
+ if (!ConditionalXactLockTableWait(xwait, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
@@ -5256,7 +5285,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
break;
case LockWaitError:
- if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failure))
+ if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index ac082fefa77..cb4bc35c93e 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -464,7 +464,7 @@ tuple_lock_retry:
return TM_WouldBlock;
break;
case LockWaitError:
- if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure))
+ if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index 30f4c2d3c67..eb4bd3d6ae3 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -438,6 +438,9 @@ heap_xlog_insert(XLogReaderState *record)
ItemPointerSetBlockNumber(&target_tid, blkno);
ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
+ /* No freezing in the heap_insert() code path */
+ Assert(!(xlrec->flags & XLH_INSERT_ALL_FROZEN_SET));
+
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
@@ -508,10 +511,6 @@ heap_xlog_insert(XLogReaderState *record)
if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
- /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
- if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
- PageSetAllVisible(page);
-
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index f28326bad09..14036c27e87 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -423,7 +423,7 @@ typedef struct LVSavedErrInfo
/* non-export function prototypes */
static void lazy_scan_heap(LVRelState *vacrel);
static void heap_vacuum_eager_scan_setup(LVRelState *vacrel,
- VacuumParams *params);
+ const VacuumParams params);
static BlockNumber heap_vac_scan_next_block(ReadStream *stream,
void *callback_private_data,
void *per_buffer_data);
@@ -431,7 +431,7 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
BlockNumber blkno, Page page,
bool sharelock, Buffer vmbuffer);
-static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
+static int lazy_scan_prune(LVRelState *vacrel, Buffer buf,
BlockNumber blkno, Page page,
Buffer vmbuffer, bool all_visible_according_to_vm,
bool *has_lpdead_items, bool *vm_page_frozen);
@@ -485,7 +485,7 @@ static void restore_vacuum_error_info(LVRelState *vacrel,
* vacuum options or for relfrozenxid/relminmxid advancement.
*/
static void
-heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
+heap_vacuum_eager_scan_setup(LVRelState *vacrel, const VacuumParams params)
{
uint32 randseed;
BlockNumber allvisible;
@@ -504,7 +504,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
vacrel->eager_scan_remaining_successes = 0;
/* If eager scanning is explicitly disabled, just return. */
- if (params->max_eager_freeze_failure_rate == 0)
+ if (params.max_eager_freeze_failure_rate == 0)
return;
/*
@@ -581,11 +581,11 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
vacrel->next_eager_scan_region_start = randseed % EAGER_SCAN_REGION_SIZE;
- Assert(params->max_eager_freeze_failure_rate > 0 &&
- params->max_eager_freeze_failure_rate <= 1);
+ Assert(params.max_eager_freeze_failure_rate > 0 &&
+ params.max_eager_freeze_failure_rate <= 1);
vacrel->eager_scan_max_fails_per_region =
- params->max_eager_freeze_failure_rate *
+ params.max_eager_freeze_failure_rate *
EAGER_SCAN_REGION_SIZE;
/*
@@ -612,7 +612,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
* and locked the relation.
*/
void
-heap_vacuum_rel(Relation rel, VacuumParams *params,
+heap_vacuum_rel(Relation rel, const VacuumParams params,
BufferAccessStrategy bstrategy)
{
LVRelState *vacrel;
@@ -634,9 +634,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
ErrorContextCallback errcallback;
char **indnames = NULL;
- verbose = (params->options & VACOPT_VERBOSE) != 0;
+ verbose = (params.options & VACOPT_VERBOSE) != 0;
instrument = (verbose || (AmAutoVacuumWorkerProcess() &&
- params->log_min_duration >= 0));
+ params.log_min_duration >= 0));
if (instrument)
{
pg_rusage_init(&ru0);
@@ -699,9 +699,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
* The truncate param allows user to avoid attempting relation truncation,
* though it can't force truncation to happen.
*/
- Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
- Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
- params->truncate != VACOPTVALUE_AUTO);
+ Assert(params.index_cleanup != VACOPTVALUE_UNSPECIFIED);
+ Assert(params.truncate != VACOPTVALUE_UNSPECIFIED &&
+ params.truncate != VACOPTVALUE_AUTO);
/*
* While VacuumFailSafeActive is reset to false before calling this, we
@@ -711,14 +711,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
vacrel->consider_bypass_optimization = true;
vacrel->do_index_vacuuming = true;
vacrel->do_index_cleanup = true;
- vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
- if (params->index_cleanup == VACOPTVALUE_DISABLED)
+ vacrel->do_rel_truncate = (params.truncate != VACOPTVALUE_DISABLED);
+ if (params.index_cleanup == VACOPTVALUE_DISABLED)
{
/* Force disable index vacuuming up-front */
vacrel->do_index_vacuuming = false;
vacrel->do_index_cleanup = false;
}
- else if (params->index_cleanup == VACOPTVALUE_ENABLED)
+ else if (params.index_cleanup == VACOPTVALUE_ENABLED)
{
/* Force index vacuuming. Note that failsafe can still bypass. */
vacrel->consider_bypass_optimization = false;
@@ -726,7 +726,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
else
{
/* Default/auto, make all decisions dynamically */
- Assert(params->index_cleanup == VACOPTVALUE_AUTO);
+ Assert(params.index_cleanup == VACOPTVALUE_AUTO);
}
/* Initialize page counters explicitly (be tidy) */
@@ -757,7 +757,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
vacrel->vm_new_visible_pages = 0;
vacrel->vm_new_visible_frozen_pages = 0;
vacrel->vm_new_frozen_pages = 0;
- vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
/*
* Get cutoffs that determine which deleted tuples are considered DEAD,
@@ -776,7 +775,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
* to increase the number of dead tuples it can prune away.)
*/
vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs);
+ vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
vacrel->vistest = GlobalVisTestFor(rel);
+
/* Initialize state used to track oldest extant XID/MXID */
vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin;
vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact;
@@ -788,7 +789,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
*/
vacrel->skippedallvis = false;
skipwithvm = true;
- if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
+ if (params.options & VACOPT_DISABLE_PAGE_SKIPPING)
{
/*
* Force aggressive mode, and disable skipping blocks using the
@@ -829,7 +830,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
* is already dangerously old.)
*/
lazy_check_wraparound_failsafe(vacrel);
- dead_items_alloc(vacrel, params->nworkers);
+ dead_items_alloc(vacrel, params.nworkers);
/*
* Call lazy_scan_heap to perform all required heap pruning, index
@@ -946,9 +947,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
{
TimestampTz endtime = GetCurrentTimestamp();
- if (verbose || params->log_min_duration == 0 ||
+ if (verbose || params.log_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
- params->log_min_duration))
+ params.log_min_duration))
{
long secs_dur;
int usecs_dur;
@@ -983,10 +984,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
* Aggressiveness already reported earlier, in dedicated
* VACUUM VERBOSE ereport
*/
- Assert(!params->is_wraparound);
+ Assert(!params.is_wraparound);
msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n");
}
- else if (params->is_wraparound)
+ else if (params.is_wraparound)
{
/*
* While it's possible for a VACUUM to be both is_wraparound
@@ -1244,6 +1245,7 @@ lazy_scan_heap(LVRelState *vacrel)
Buffer buf;
Page page;
uint8 blk_info = 0;
+ int ndeleted = 0;
bool has_lpdead_items;
void *per_buffer_data = NULL;
bool vm_page_frozen = false;
@@ -1386,10 +1388,10 @@ lazy_scan_heap(LVRelState *vacrel)
* line pointers previously marked LP_DEAD.
*/
if (got_cleanup_lock)
- lazy_scan_prune(vacrel, buf, blkno, page,
- vmbuffer,
- blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM,
- &has_lpdead_items, &vm_page_frozen);
+ ndeleted = lazy_scan_prune(vacrel, buf, blkno, page,
+ vmbuffer,
+ blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM,
+ &has_lpdead_items, &vm_page_frozen);
/*
* Count an eagerly scanned page as a failure or a success.
@@ -1413,12 +1415,26 @@ lazy_scan_heap(LVRelState *vacrel)
if (vm_page_frozen)
{
- Assert(vacrel->eager_scan_remaining_successes > 0);
- vacrel->eager_scan_remaining_successes--;
+ if (vacrel->eager_scan_remaining_successes > 0)
+ vacrel->eager_scan_remaining_successes--;
if (vacrel->eager_scan_remaining_successes == 0)
{
/*
+ * Report only once that we disabled eager scanning. We
+ * may eagerly read ahead blocks in excess of the success
+ * or failure caps before attempting to freeze them, so we
+ * could reach here even after disabling additional eager
+ * scanning.
+ */
+ if (vacrel->eager_scan_max_fails_per_region > 0)
+ ereport(vacrel->verbose ? INFO : DEBUG2,
+ (errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of relation \"%s.%s.%s\"",
+ orig_eager_scan_success_limit,
+ vacrel->dbname, vacrel->relnamespace,
+ vacrel->relname)));
+
+ /*
* If we hit our success cap, permanently disable eager
* scanning by setting the other eager scan management
* fields to their disabled values.
@@ -1426,19 +1442,10 @@ lazy_scan_heap(LVRelState *vacrel)
vacrel->eager_scan_remaining_fails = 0;
vacrel->next_eager_scan_region_start = InvalidBlockNumber;
vacrel->eager_scan_max_fails_per_region = 0;
-
- ereport(vacrel->verbose ? INFO : DEBUG2,
- (errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of \"%s.%s.%s\"",
- orig_eager_scan_success_limit,
- vacrel->dbname, vacrel->relnamespace,
- vacrel->relname)));
}
}
- else
- {
- Assert(vacrel->eager_scan_remaining_fails > 0);
+ else if (vacrel->eager_scan_remaining_fails > 0)
vacrel->eager_scan_remaining_fails--;
- }
}
/*
@@ -1475,7 +1482,7 @@ lazy_scan_heap(LVRelState *vacrel)
* table has indexes. There will only be newly-freed space if we
* held the cleanup lock and lazy_scan_prune() was called.
*/
- if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items &&
+ if (got_cleanup_lock && vacrel->nindexes == 0 && ndeleted > 0 &&
blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
{
FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
@@ -1866,8 +1873,6 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
*/
if (!PageIsAllVisible(page))
{
- uint8 old_vmbits;
-
START_CRIT_SECTION();
/* mark buffer dirty before writing a WAL record */
@@ -1887,24 +1892,16 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
log_newpage_buffer(buf, true);
PageSetAllVisible(page);
- old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf,
- InvalidXLogRecPtr,
- vmbuffer, InvalidTransactionId,
- VISIBILITYMAP_ALL_VISIBLE |
- VISIBILITYMAP_ALL_FROZEN);
+ visibilitymap_set(vacrel->rel, blkno, buf,
+ InvalidXLogRecPtr,
+ vmbuffer, InvalidTransactionId,
+ VISIBILITYMAP_ALL_VISIBLE |
+ VISIBILITYMAP_ALL_FROZEN);
END_CRIT_SECTION();
- /*
- * If the page wasn't already set all-visible and/or all-frozen in
- * the VM, count it as newly set for logging.
- */
- if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
- {
- vacrel->vm_new_visible_pages++;
- vacrel->vm_new_visible_frozen_pages++;
- }
- else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0)
- vacrel->vm_new_frozen_pages++;
+ /* Count the newly all-frozen pages for logging */
+ vacrel->vm_new_visible_pages++;
+ vacrel->vm_new_visible_frozen_pages++;
}
freespace = PageGetHeapFreeSpace(page);
@@ -1940,8 +1937,10 @@ cmpOffsetNumbers(const void *a, const void *b)
* *vm_page_frozen is set to true if the page is newly set all-frozen in the
* VM. The caller currently only uses this for determining whether an eagerly
* scanned page was successfully set all-frozen.
+ *
+ * Returns the number of tuples deleted from the page during HOT pruning.
*/
-static void
+static int
lazy_scan_prune(LVRelState *vacrel,
Buffer buf,
BlockNumber blkno,
@@ -2212,6 +2211,8 @@ lazy_scan_prune(LVRelState *vacrel,
*vm_page_frozen = true;
}
}
+
+ return presult.ndeleted;
}
/*
@@ -2909,7 +2910,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
&all_frozen))
{
- uint8 old_vmbits;
uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
if (all_frozen)
@@ -2919,25 +2919,15 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
}
PageSetAllVisible(page);
- old_vmbits = visibilitymap_set(vacrel->rel, blkno, buffer,
- InvalidXLogRecPtr,
- vmbuffer, visibility_cutoff_xid,
- flags);
-
- /*
- * If the page wasn't already set all-visible and/or all-frozen in the
- * VM, count it as newly set for logging.
- */
- if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
- {
- vacrel->vm_new_visible_pages++;
- if (all_frozen)
- vacrel->vm_new_visible_frozen_pages++;
- }
+ visibilitymap_set(vacrel->rel, blkno, buffer,
+ InvalidXLogRecPtr,
+ vmbuffer, visibility_cutoff_xid,
+ flags);
- else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
- all_frozen)
- vacrel->vm_new_frozen_pages++;
+ /* Count the newly set VM page for logging */
+ vacrel->vm_new_visible_pages++;
+ if (all_frozen)
+ vacrel->vm_new_visible_frozen_pages++;
}
/* Revert to the previous phase information for error traceback */
diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index a136e4bbfdf..21c519cd108 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -16,6 +16,7 @@
#include "postgres.h"
#include "access/nbtree.h"
+#include "common/int.h"
#include "lib/qunique.h"
#include "utils/array.h"
#include "utils/lsyscache.h"
@@ -56,6 +57,8 @@ static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk,
BTArrayKeyInfo *array);
static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
BTArrayKeyInfo *array);
+static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap);
+static int _bt_reorder_array_cmp(const void *a, const void *b);
static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys);
static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
static int _bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out,
@@ -96,7 +99,7 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
* incomplete sets of cross-type operators, we may fail to detect redundant
* or contradictory keys, but we can survive that.)
*
- * The output keys must be sorted by index attribute. Presently we expect
+ * Required output keys are sorted by index attribute. Presently we expect
* (but verify) that the input keys are already so sorted --- this is done
* by match_clauses_to_index() in indxpath.c. Some reordering of the keys
* within each attribute may be done as a byproduct of the processing here.
@@ -127,29 +130,36 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg);
* This has the potential to be much more efficient than a full index scan
* (though it behaves like a full scan when there's many distinct "x" values).
*
- * If possible, redundant keys are eliminated: we keep only the tightest
+ * Typically, redundant keys are eliminated: we keep only the tightest
* >/>= bound and the tightest </<= bound, and if there's an = key then
* that's the only one returned. (So, we return either a single = key,
* or one or two boundary-condition keys for each attr.) However, if we
* cannot compare two keys for lack of a suitable cross-type operator,
- * we cannot eliminate either. If there are two such keys of the same
- * operator strategy, the second one is just pushed into the output array
- * without further processing here. We may also emit both >/>= or both
- * </<= keys if we can't compare them. The logic about required keys still
- * works if we don't eliminate redundant keys.
- *
- * Note that one reason we need direction-sensitive required-key flags is
- * precisely that we may not be able to eliminate redundant keys. Suppose
- * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
- * which key is more restrictive for lack of a suitable cross-type operator.
- * _bt_first will arbitrarily pick one of the keys to do the initial
- * positioning with. If it picks x > 4, then the x > 10 condition will fail
- * until we reach index entries > 10; but we can't stop the scan just because
- * x > 10 is failing. On the other hand, if we are scanning backwards, then
- * failure of either key is indeed enough to stop the scan. (In general, when
- * inequality keys are present, the initial-positioning code only promises to
- * position before the first possible match, not exactly at the first match,
- * for a forward scan; or after the last match for a backward scan.)
+ * we cannot eliminate either key.
+ *
+ * When all redundant keys could not be eliminated, we'll output a key array
+ * that can more or less be treated as if it had no redundant keys. Suppose
+ * we have "x > 4::int AND x > 10::bigint AND x < 70", and we are unable to
+ * determine which > key is more restrictive for lack of a suitable cross-type
+ * operator. We'll arbitrarily pick one of the > keys; the other > key won't
+ * be marked required. Obviously, the scan will be less efficient if we
+ * choose x > 4 over x > 10 -- but it can still largely proceed as if there
+ * was only a single > condition. "x > 10" will be placed at the end of the
+ * so->keyData[] output array. It'll always be evaluated last, after the keys
+ * that could be marked required in the usual way (after "x > 4 AND x < 70").
+ * This can sometimes result in so->keyData[] keys that aren't even in index
+ * attribute order (if the qual involves multiple attributes). The scan's
+ * required keys will still be in attribute order, though, so it can't matter.
+ *
+ * This scheme ensures that _bt_first always uses the same set of keys at the
+ * start of a forwards scan as those _bt_checkkeys uses to determine when to
+ * end a similar backwards scan (and vice-versa). _bt_advance_array_keys
+ * depends on this: it expects to be able to reliably predict what the next
+ * _bt_first call will do by testing whether _bt_checkkeys' routines report
+ * that the final tuple on the page is past the end of matches for the scan's
+ * keys with the scan direction flipped. If it is (if continuescan=false),
+ * then it follows that calling _bt_first will, at a minimum, relocate the
+ * scan to the very next leaf page (in the current scan direction).
*
* As a byproduct of this work, we can detect contradictory quals such
* as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false,
@@ -188,7 +198,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
int numberOfEqualCols;
ScanKey inkeys;
BTScanKeyPreproc xform[BTMaxStrategyNumber];
- bool test_result;
+ bool test_result,
+ redundant_key_kept = false;
AttrNumber attno;
ScanKey arrayKeyData;
int *keyDataMap = NULL;
@@ -388,7 +399,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
xform[j].inkey = NULL;
xform[j].inkeyi = -1;
}
- /* else, cannot determine redundancy, keep both keys */
+ else
+ redundant_key_kept = true;
}
/* track number of attrs for which we have "=" keys */
numberOfEqualCols++;
@@ -409,6 +421,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
else
xform[BTLessStrategyNumber - 1].inkey = NULL;
}
+ else
+ redundant_key_kept = true;
}
/* try to keep only one of >, >= */
@@ -426,6 +440,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
else
xform[BTGreaterStrategyNumber - 1].inkey = NULL;
}
+ else
+ redundant_key_kept = true;
}
/*
@@ -466,25 +482,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
/* check strategy this key's operator corresponds to */
j = inkey->sk_strategy - 1;
- /* if row comparison, push it directly to the output array */
- if (inkey->sk_flags & SK_ROW_HEADER)
- {
- ScanKey outkey = &so->keyData[new_numberOfKeys++];
-
- memcpy(outkey, inkey, sizeof(ScanKeyData));
- if (arrayKeyData)
- keyDataMap[new_numberOfKeys - 1] = i;
- if (numberOfEqualCols == attno - 1)
- _bt_mark_scankey_required(outkey);
-
- /*
- * We don't support RowCompare using equality; such a qual would
- * mess up the numberOfEqualCols tracking.
- */
- Assert(j != (BTEqualStrategyNumber - 1));
- continue;
- }
-
if (inkey->sk_strategy == BTEqualStrategyNumber &&
(inkey->sk_flags & SK_SEARCHARRAY))
{
@@ -593,9 +590,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
* the new scan key.
*
* Note: We do things this way around so that our arrays are
- * always in the same order as their corresponding scan keys,
- * even with incomplete opfamilies. _bt_advance_array_keys
- * depends on this.
+ * always in the same order as their corresponding scan keys.
+ * _bt_preprocess_array_keys_final expects this.
*/
ScanKey outkey = &so->keyData[new_numberOfKeys++];
@@ -607,6 +603,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
xform[j].inkey = inkey;
xform[j].inkeyi = i;
xform[j].arrayidx = arrayidx;
+ redundant_key_kept = true;
}
}
}
@@ -622,6 +619,15 @@ _bt_preprocess_keys(IndexScanDesc scan)
if (arrayKeyData)
_bt_preprocess_array_keys_final(scan, keyDataMap);
+ /*
+ * If there are remaining redundant inequality keys, we must make sure
+ * that each index attribute has no more than one required >/>= key, and
+ * no more than one required </<= key. Attributes that have one or more
+ * required = keys now must keep only one required key (the first = key).
+ */
+ if (unlikely(redundant_key_kept) && so->qual_ok)
+ _bt_unmark_keys(scan, keyDataMap);
+
/* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
}
@@ -746,9 +752,12 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
*
* Depending on the operator type, the key may be required for both scan
* directions or just one. Also, if the key is a row comparison header,
- * we have to mark its first subsidiary ScanKey as required. (Subsequent
- * subsidiary ScanKeys are normally for lower-order columns, and thus
- * cannot be required, since they're after the first non-equality scankey.)
+ * we have to mark the appropriate subsidiary ScanKeys as required. In such
+ * cases, the first subsidiary key is required, but subsequent ones are
+ * required only as long as they correspond to successive index columns and
+ * match the leading column as to sort direction. Otherwise the row
+ * comparison ordering is different from the index ordering and so we can't
+ * stop the scan on the basis of those lower-order columns.
*
* Note: when we set required-key flag bits in a subsidiary scankey, we are
* scribbling on a data structure belonging to the index AM's caller, not on
@@ -786,12 +795,25 @@ _bt_mark_scankey_required(ScanKey skey)
if (skey->sk_flags & SK_ROW_HEADER)
{
ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+ AttrNumber attno = skey->sk_attno;
/* First subkey should be same column/operator as the header */
- Assert(subkey->sk_flags & SK_ROW_MEMBER);
- Assert(subkey->sk_attno == skey->sk_attno);
+ Assert(subkey->sk_attno == attno);
Assert(subkey->sk_strategy == skey->sk_strategy);
- subkey->sk_flags |= addflags;
+
+ for (;;)
+ {
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ if (subkey->sk_attno != attno)
+ break; /* non-adjacent key, so not required */
+ if (subkey->sk_strategy != skey->sk_strategy)
+ break; /* wrong direction, so not required */
+ subkey->sk_flags |= addflags;
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ attno++;
+ }
}
}
@@ -847,8 +869,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
cmp_op;
StrategyNumber strat;
- Assert(!((leftarg->sk_flags | rightarg->sk_flags) &
- (SK_ROW_HEADER | SK_ROW_MEMBER)));
+ Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_MEMBER));
/*
* First, deal with cases where one or both args are NULL. This should
@@ -925,6 +946,16 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
}
/*
+ * We don't yet know how to determine redundancy when it involves a row
+ * compare key (barring simple cases involving IS NULL/IS NOT NULL)
+ */
+ if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_HEADER)
+ {
+ Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP));
+ return false;
+ }
+
+ /*
* If either leftarg or rightarg are equality-type array scankeys, we need
* specialized handling (since by now we know that IS NULL wasn't used)
*/
@@ -1468,6 +1499,283 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
}
/*
+ * _bt_unmark_keys() -- make superfluous required keys nonrequired after all
+ *
+ * When _bt_preprocess_keys fails to eliminate one or more redundant keys, it
+ * calls here to make sure that no index attribute has more than one > or >=
+ * key marked required, and no more than one required < or <= key. Attributes
+ * with = keys will always get one = key as their required key. All other
+ * keys that were initially marked required get "unmarked" here. That way,
+ * _bt_first and _bt_checkkeys will reliably agree on which keys to use to
+ * start and/or to end the scan.
+ *
+ * We also relocate keys that become/started out nonrequired to the end of
+ * so->keyData[]. That way, _bt_first and _bt_checkkeys cannot fail to reach
+ * a required key due to some earlier nonrequired key getting in the way.
+ *
+ * Only call here when _bt_compare_scankey_args returned false at least once
+ * (otherwise, calling here will just waste cycles).
+ */
+static void
+_bt_unmark_keys(IndexScanDesc scan, int *keyDataMap)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ AttrNumber attno;
+ bool *unmarkikey;
+ int nunmark,
+ nunmarked,
+ nkept,
+ firsti;
+ ScanKey keepKeys,
+ unmarkKeys;
+ FmgrInfo *keepOrderProcs = NULL,
+ *unmarkOrderProcs = NULL;
+ bool haveReqEquals,
+ haveReqForward,
+ haveReqBackward;
+
+ /*
+ * Do an initial pass over so->keyData[] that determines which keys to
+ * keep as required. We expect so->keyData[] to still be in attribute
+ * order when we're called (though we don't expect any particular order
+ * among each attribute's keys).
+ *
+ * When both equality and inequality keys remain on a single attribute, we
+ * *must* make sure that exactly one of the equalities remains required.
+ * Any requiredness markings that we might leave on later keys/attributes
+ * are predicated on there being required = keys on all prior columns.
+ */
+ unmarkikey = palloc0(so->numberOfKeys * sizeof(bool));
+ nunmark = 0;
+
+ /* Set things up for first key's attribute */
+ attno = so->keyData[0].sk_attno;
+ firsti = 0;
+ haveReqEquals = false;
+ haveReqForward = false;
+ haveReqBackward = false;
+ for (int i = 0; i < so->numberOfKeys; i++)
+ {
+ ScanKey origkey = &so->keyData[i];
+
+ if (origkey->sk_attno != attno)
+ {
+ /* Reset for next attribute */
+ attno = origkey->sk_attno;
+ firsti = i;
+
+ haveReqEquals = false;
+ haveReqForward = false;
+ haveReqBackward = false;
+ }
+
+ /* Equalities get priority over inequalities */
+ if (haveReqEquals)
+ {
+ /*
+ * We already found the first "=" key for this attribute. We've
+ * already decided that all its other keys will be unmarked.
+ */
+ Assert(!(origkey->sk_flags & SK_SEARCHNULL));
+ unmarkikey[i] = true;
+ nunmark++;
+ continue;
+ }
+ else if ((origkey->sk_flags & SK_BT_REQFWD) &&
+ (origkey->sk_flags & SK_BT_REQBKWD))
+ {
+ /*
+ * Found the first "=" key for attno. All other attno keys will
+ * be unmarked.
+ */
+ Assert(origkey->sk_strategy == BTEqualStrategyNumber);
+
+ haveReqEquals = true;
+ for (int j = firsti; j < i; j++)
+ {
+ /* Unmark any prior inequality keys on attno after all */
+ if (!unmarkikey[j])
+ {
+ unmarkikey[j] = true;
+ nunmark++;
+ }
+ }
+ continue;
+ }
+
+ /* Deal with inequalities next */
+ if ((origkey->sk_flags & SK_BT_REQFWD) && !haveReqForward)
+ {
+ haveReqForward = true;
+ continue;
+ }
+ else if ((origkey->sk_flags & SK_BT_REQBKWD) && !haveReqBackward)
+ {
+ haveReqBackward = true;
+ continue;
+ }
+
+ /*
+ * We have either a redundant inequality key that will be unmarked, or
+ * we have a key that wasn't marked required in the first place
+ */
+ unmarkikey[i] = true;
+ nunmark++;
+ }
+
+ /* Should only be called when _bt_compare_scankey_args reported failure */
+ Assert(nunmark > 0);
+
+ /*
+ * Next, allocate temp arrays: one for required keys that'll remain
+ * required, the other for all remaining keys
+ */
+ unmarkKeys = palloc(nunmark * sizeof(ScanKeyData));
+ keepKeys = palloc((so->numberOfKeys - nunmark) * sizeof(ScanKeyData));
+ nunmarked = 0;
+ nkept = 0;
+ if (so->numArrayKeys)
+ {
+ unmarkOrderProcs = palloc(nunmark * sizeof(FmgrInfo));
+ keepOrderProcs = palloc((so->numberOfKeys - nunmark) * sizeof(FmgrInfo));
+ }
+
+ /*
+ * Next, copy the contents of so->keyData[] into the appropriate temp
+ * array.
+ *
+ * Scans with = array keys need us to maintain invariants around the order
+ * of so->orderProcs[] and so->arrayKeys[] relative to so->keyData[]. See
+ * _bt_preprocess_array_keys_final for a full explanation.
+ */
+ for (int i = 0; i < so->numberOfKeys; i++)
+ {
+ ScanKey origkey = &so->keyData[i];
+ ScanKey unmark;
+
+ if (!unmarkikey[i])
+ {
+ /*
+ * Key gets to keep its original requiredness markings.
+ *
+ * Key will stay in its original position, unless we're going to
+ * unmark an earlier key (in which case it shifts to a lower offset).
+ */
+ memcpy(keepKeys + nkept, origkey, sizeof(ScanKeyData));
+
+ if (so->numArrayKeys)
+ {
+ keyDataMap[i] = nkept;
+ memcpy(keepOrderProcs + nkept, &so->orderProcs[i],
+ sizeof(FmgrInfo));
+ }
+
+ nkept++;
+ continue;
+ }
+
+ /*
+ * Key will be unmarked as needed, and moved to the end of the array,
+ * next to other keys that will become (or always were) nonrequired
+ */
+ unmark = unmarkKeys + nunmarked;
+ memcpy(unmark, origkey, sizeof(ScanKeyData));
+
+ if (so->numArrayKeys)
+ {
+ keyDataMap[i] = (so->numberOfKeys - nunmark) + nunmarked;
+ memcpy(&unmarkOrderProcs[nunmarked], &so->orderProcs[i],
+ sizeof(FmgrInfo));
+ }
+
+ /*
+ * Preprocessing only generates skip arrays when it knows that they'll
+ * be the only required = key on the attr. We'll never unmark them.
+ */
+ Assert(!(unmark->sk_flags & SK_BT_SKIP));
+
+ /*
+ * Also shouldn't have to unmark an IS NULL or an IS NOT NULL key.
+ * They aren't cross-type, so an incomplete opfamily can't matter.
+ */
+ Assert(!(unmark->sk_flags & SK_ISNULL) ||
+ !(unmark->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)));
+
+ /* Clear requiredness flags on redundant key (and on any subkeys) */
+ unmark->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD);
+ if (unmark->sk_flags & SK_ROW_HEADER)
+ {
+ ScanKey subkey = (ScanKey) DatumGetPointer(unmark->sk_argument);
+
+ Assert(subkey->sk_strategy == unmark->sk_strategy);
+ for (;;)
+ {
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ subkey->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD);
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
+ subkey++;
+ }
+ }
+
+ nunmarked++;
+ }
+
+ /* Copy both temp arrays back into so->keyData[] to reorder */
+ Assert(nkept == so->numberOfKeys - nunmark);
+ Assert(nunmarked == nunmark);
+ memcpy(so->keyData, keepKeys, sizeof(ScanKeyData) * nkept);
+ memcpy(so->keyData + nkept, unmarkKeys, sizeof(ScanKeyData) * nunmarked);
+
+ /* Done with temp arrays */
+ pfree(unmarkikey);
+ pfree(keepKeys);
+ pfree(unmarkKeys);
+
+ /*
+ * Now copy so->orderProcs[] temp entries needed by scans with = array
+ * keys back (just like with the so->keyData[] temp arrays)
+ */
+ if (so->numArrayKeys)
+ {
+ memcpy(so->orderProcs, keepOrderProcs, sizeof(FmgrInfo) * nkept);
+ memcpy(so->orderProcs + nkept, unmarkOrderProcs,
+ sizeof(FmgrInfo) * nunmarked);
+
+ /* Also fix-up array->scan_key references */
+ for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
+ {
+ BTArrayKeyInfo *array = &so->arrayKeys[arridx];
+
+ array->scan_key = keyDataMap[array->scan_key];
+ }
+
+ /*
+ * Sort so->arrayKeys[] based on its new BTArrayKeyInfo.scan_key
+ * offsets, so that its order matches so->keyData[] order as expected
+ */
+ qsort(so->arrayKeys, so->numArrayKeys, sizeof(BTArrayKeyInfo),
+ _bt_reorder_array_cmp);
+
+ /* Done with temp arrays */
+ pfree(unmarkOrderProcs);
+ pfree(keepOrderProcs);
+ }
+}
+
+/*
+ * qsort comparator for reordering so->arrayKeys[] BTArrayKeyInfo entries
+ */
+static int
+_bt_reorder_array_cmp(const void *a, const void *b)
+{
+ BTArrayKeyInfo *arraya = (BTArrayKeyInfo *) a;
+ BTArrayKeyInfo *arrayb = (BTArrayKeyInfo *) b;
+
+ return pg_cmp_s32(arraya->scan_key, arrayb->scan_key);
+}
+
+/*
* _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
*
* If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..fdff960c130 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
BTScanOpaque so = (BTScanOpaque) scan->opaque;
bool res;
+ Assert(scan->heapRelation != NULL);
+
/* btree indexes are never lossy */
scan->xs_recheck = false;
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
int64 ntids = 0;
ItemPointer heapTid;
+ Assert(scan->heapRelation == NULL);
+
/* Each loop iteration performs another primitive index scan */
do
{
@@ -393,6 +397,34 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
BTScanPosInvalidate(so->currPos);
}
+ /*
+ * We prefer to eagerly drop leaf page pins before btgettuple returns.
+ * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+ *
+ * We cannot safely drop leaf page pins during index-only scans due to a
+ * race condition involving VACUUM setting pages all-visible in the VM.
+ * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+ *
+ * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+ * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+ * The scheme used to detect unsafe TID recycling won't work when scanning
+ * unlogged relations (since it involves saving an affected page's LSN).
+ * Opt out of eager pin dropping during unlogged relation scans for now
+ * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+ *
+ * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+ * Pins cannot be held for more than an instant during bitmap scans either
+ * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+ *
+ * See nbtree/README section on making concurrent TID recycling safe.
+ *
+ * Note: so->dropPin should never change across rescans.
+ */
+ so->dropPin = (!scan->xs_want_itup &&
+ IsMVCCSnapshot(scan->xs_snapshot) &&
+ RelationNeedsWAL(scan->indexRelation) &&
+ scan->heapRelation != NULL);
+
so->markItemIndex = -1;
so->needPrimScan = false;
so->scanBehind = false;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..4af1ff1e9e5 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
#include "utils/rel.h"
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
Buffer buf, bool forupdate, BTStack stack,
int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
/*
* _bt_drop_lock_and_maybe_pin()
*
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf. If so->dropPin is set, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
*/
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
{
- _bt_unlockbuf(scan->indexRelation, sp->buf);
-
- if (IsMVCCSnapshot(scan->xs_snapshot) &&
- RelationNeedsWAL(scan->indexRelation) &&
- !scan->xs_want_itup)
+ if (!so->dropPin)
{
- ReleaseBuffer(sp->buf);
- sp->buf = InvalidBuffer;
+ /* Just drop the lock (not the pin) */
+ _bt_unlockbuf(rel, so->currPos.buf);
+ return;
}
+
+ /*
+ * Drop both the lock and the pin.
+ *
+ * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+ * when concurrent heap TID recycling by VACUUM might have taken place.
+ */
+ Assert(RelationNeedsWAL(rel));
+ so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+ _bt_relbuf(rel, so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
}
/*
@@ -866,8 +871,8 @@ _bt_compare(Relation rel,
* if backwards scan, the last item) in the tree that satisfies the
* qualifications in the scan key. On success exit, data about the
* matching tuple(s) on the page has been loaded into so->currPos. We'll
- * drop all locks and hold onto a pin on page's buffer, except when
- * _bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
+ * drop all locks and hold onto a pin on page's buffer, except during
+ * so->dropPin scans, when we drop both the lock and the pin.
* _bt_returnitem sets the next item to return to scan on success exit.
*
* If there are no matching items in the index, we return false, with no
@@ -955,46 +960,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
/*----------
* Examine the scan keys to discover where we need to start the scan.
+ * The selected scan keys (at most one per index column) are remembered by
+ * storing their addresses into the local startKeys[] array. The final
+ * startKeys[] entry's strategy is set in strat_total. (Actually, there
+ * are a couple of cases where we force a less/more restrictive strategy.)
*
- * We want to identify the keys that can be used as starting boundaries;
- * these are =, >, or >= keys for a forward scan or =, <, <= keys for
- * a backwards scan. We can use keys for multiple attributes so long as
- * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
- * a > or < boundary or find an attribute with no boundary (which can be
- * thought of as the same as "> -infinity"), we can't use keys for any
- * attributes to its right, because it would break our simplistic notion
- * of what initial positioning strategy to use.
+ * We must use the key that was marked required (in the direction opposite
+ * our own scan's) during preprocessing. Each index attribute can only
+ * have one such required key. In general, the keys that we use to find
+ * an initial position when scanning forwards are the same keys that end
+ * the scan on the leaf level when scanning backwards (and vice-versa).
*
* When the scan keys include cross-type operators, _bt_preprocess_keys
- * may not be able to eliminate redundant keys; in such cases we will
- * arbitrarily pick a usable one for each attribute. This is correct
- * but possibly not optimal behavior. (For example, with keys like
- * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
- * x=5 would be more efficient.) Since the situation only arises given
- * a poorly-worded query plus an incomplete opfamily, live with it.
+ * may not be able to eliminate redundant keys; in such cases it will
+ * arbitrarily pick a usable key for each attribute (and scan direction),
+ * ensuring that there is no more than one key required in each direction.
+ * We stop considering further keys once we reach the first nonrequired
+ * key (which must come after all required keys), so this can't affect us.
+ *
+ * The required keys that we use as starting boundaries have to be =, >,
+ * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
+ * We can use keys for multiple attributes so long as the prior attributes
+ * had only =, >= (resp. =, <=) keys. These rules are very similar to the
+ * rules that preprocessing used to determine which keys to mark required.
+ * We cannot always use every required key as a positioning key, though.
+ * Skip arrays necessitate independently applying our own rules here.
+ * Skip arrays are generally considered = array keys, but we'll
+ * nevertheless treat them as inequalities at certain points of the scan.
+ * When that happens, it _might_ have implications for the number of
+ * required keys that we can safely use for initial positioning purposes.
*
- * When both equality and inequality keys appear for a single attribute
- * (again, only possible when cross-type operators appear), we *must*
- * select one of the equality keys for the starting point, because
- * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
- * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
- * start at x=4, we will fail and stop before reaching x=10. If multiple
- * equality quals survive preprocessing, however, it doesn't matter which
- * one we use --- by definition, they are either redundant or
- * contradictory.
+ * For example, a forward scan with a skip array on its leading attribute
+ * (with no low_compare/high_compare) will have at least two required scan
+ * keys, but we won't use any of them as boundary keys during the scan's
+ * initial call here. Our positioning key during the first call here can
+ * be thought of as representing "> -infinity". Similarly, if such a skip
+ * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
+ * during the scan's initial call here; a lower-order key such as "b = 42"
+ * can't be used until the "a" array advances beyond MINVAL/low_compare.
*
- * In practice we rarely see any "attribute boundary key gaps" here.
- * Preprocessing can usually backfill skip array keys for any attributes
- * that were omitted from the original scan->keyData[] input keys. All
- * array keys are always considered = keys, but we'll sometimes need to
- * treat the current key value as if we were using an inequality strategy.
- * This happens with range skip arrays, which store inequality keys in the
- * array's low_compare/high_compare fields (used to find the first/last
- * set of matches, when = key will lack a usable sk_argument value).
- * These are always preferred over any redundant "standard" inequality
- * keys on the same column (per the usual rule about preferring = keys).
- * Note also that any column with an = skip array key can never have an
- * additional, contradictory = key.
+ * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
+ * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
+ * A subsequent call here might have us use "a = 'fop' AND b = 42". Note
+ * that we treat = and >= as equivalent when scanning forwards (just as we
+ * treat = and <= as equivalent when scanning backwards). We effectively
+ * do the same thing (though with a distinct "a" element/value) each time.
*
* All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
* array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
@@ -1006,21 +1016,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* traversing a lot of null entries at the start of the scan.
*
* In this loop, row-comparison keys are treated the same as keys on their
- * first (leftmost) columns. We'll add on lower-order columns of the row
- * comparison below, if possible.
+ * first (leftmost) columns. We'll add all lower-order columns of the row
+ * comparison that were marked required during preprocessing below.
*
- * The selected scan keys (at most one per index column) are remembered by
- * storing their addresses into the local startKeys[] array.
- *
- * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
- * the next primitive index scan (for scans with array keys) based in part
- * on an understanding of how it'll enable us to reposition the scan.
- * They're directly aware of how we'll sometimes cons up an explicit
- * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
- * symmetric "deduce NOT NULL" rule of their own. This allows top-level
- * scans to skip large groups of NULLs through repeated deductions about
- * key strictness (for a required inequality key) and whether NULLs in the
- * key's index column are stored last or first (relative to non-NULLs).
+ * _bt_advance_array_keys needs to know exactly how we'll reposition the
+ * scan (should it opt to schedule another primitive index scan). It is
+ * critical that primscans only be scheduled when they'll definitely make
+ * some useful progress. _bt_advance_array_keys does this by calling
+ * _bt_checkkeys routines that report whether a tuple is past the end of
+ * matches for the scan's keys (given the scan's current array elements).
+ * If the page's final tuple is "after the end of matches" for a scan that
+ * uses the *opposite* scan direction, then it must follow that it's also
+ * "before the start of matches" for the actual current scan direction.
+ * It is therefore essential that all of our initial positioning rules are
+ * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
* If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
* need to be kept in sync.
*----------
@@ -1029,18 +1038,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (so->numberOfKeys > 0)
{
AttrNumber curattr;
- ScanKey chosen;
+ ScanKey bkey;
ScanKey impliesNN;
ScanKey cur;
/*
- * chosen is the so-far-chosen key for the current attribute, if any.
- * We don't cast the decision in stone until we reach keys for the
- * next attribute.
+ * bkey will be set to the key that preprocessing left behind as the
+ * boundary key for this attribute, in this scan direction (if any)
*/
cur = so->keyData;
curattr = 1;
- chosen = NULL;
+ bkey = NULL;
/* Also remember any scankey that implies a NOT NULL constraint */
impliesNN = NULL;
@@ -1053,23 +1061,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
{
if (i >= so->numberOfKeys || cur->sk_attno != curattr)
{
+ /* Done looking for the curattr boundary key */
+ Assert(bkey == NULL ||
+ (bkey->sk_attno == curattr &&
+ (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
+ Assert(impliesNN == NULL ||
+ (impliesNN->sk_attno == curattr &&
+ (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
+
/*
- * Done looking at keys for curattr.
- *
* If this is a scan key for a skip array whose current
* element is MINVAL, choose low_compare (when scanning
* backwards it'll be MAXVAL, and we'll choose high_compare).
*
- * Note: if the array's low_compare key makes 'chosen' NULL,
+ * Note: if the array's low_compare key makes 'bkey' NULL,
* then we behave as if the array's first element is -inf,
* except when !array->null_elem implies a usable NOT NULL
* constraint.
*/
- if (chosen != NULL &&
- (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
+ if (bkey != NULL &&
+ (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
{
- int ikey = chosen - so->keyData;
- ScanKey skipequalitykey = chosen;
+ int ikey = bkey - so->keyData;
+ ScanKey skipequalitykey = bkey;
BTArrayKeyInfo *array = NULL;
for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
@@ -1082,35 +1096,35 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (ScanDirectionIsForward(dir))
{
Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
- chosen = array->low_compare;
+ bkey = array->low_compare;
}
else
{
Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
- chosen = array->high_compare;
+ bkey = array->high_compare;
}
- Assert(chosen == NULL ||
- chosen->sk_attno == skipequalitykey->sk_attno);
+ Assert(bkey == NULL ||
+ bkey->sk_attno == skipequalitykey->sk_attno);
if (!array->null_elem)
impliesNN = skipequalitykey;
else
- Assert(chosen == NULL && impliesNN == NULL);
+ Assert(bkey == NULL && impliesNN == NULL);
}
/*
* If we didn't find a usable boundary key, see if we can
* deduce a NOT NULL key
*/
- if (chosen == NULL && impliesNN != NULL &&
+ if (bkey == NULL && impliesNN != NULL &&
((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
ScanDirectionIsForward(dir) :
ScanDirectionIsBackward(dir)))
{
/* Yes, so build the key in notnullkeys[keysz] */
- chosen = &notnullkeys[keysz];
- ScanKeyEntryInitialize(chosen,
+ bkey = &notnullkeys[keysz];
+ ScanKeyEntryInitialize(bkey,
(SK_SEARCHNOTNULL | SK_ISNULL |
(impliesNN->sk_flags &
(SK_BT_DESC | SK_BT_NULLS_FIRST))),
@@ -1125,12 +1139,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/*
- * If we still didn't find a usable boundary key, quit; else
- * save the boundary key pointer in startKeys.
+ * If preprocessing didn't leave a usable boundary key, quit;
+ * else save the boundary key pointer in startKeys[]
*/
- if (chosen == NULL)
+ if (bkey == NULL)
break;
- startKeys[keysz++] = chosen;
+ startKeys[keysz++] = bkey;
/*
* We can only consider adding more boundary keys when the one
@@ -1138,7 +1152,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* (during backwards scans we can only do so when the key that
* we just added to startKeys[] uses the = or <= strategy)
*/
- strat_total = chosen->sk_strategy;
+ strat_total = bkey->sk_strategy;
if (strat_total == BTGreaterStrategyNumber ||
strat_total == BTLessStrategyNumber)
break;
@@ -1149,19 +1163,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* make strat_total > or < (and stop adding boundary keys).
* This can only happen with opclasses that lack skip support.
*/
- if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
+ if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
{
- Assert(chosen->sk_flags & SK_BT_SKIP);
+ Assert(bkey->sk_flags & SK_BT_SKIP);
Assert(strat_total == BTEqualStrategyNumber);
if (ScanDirectionIsForward(dir))
{
- Assert(!(chosen->sk_flags & SK_BT_PRIOR));
+ Assert(!(bkey->sk_flags & SK_BT_PRIOR));
strat_total = BTGreaterStrategyNumber;
}
else
{
- Assert(!(chosen->sk_flags & SK_BT_NEXT));
+ Assert(!(bkey->sk_flags & SK_BT_NEXT));
strat_total = BTLessStrategyNumber;
}
@@ -1175,24 +1189,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
/*
* Done if that was the last scan key output by preprocessing.
- * Also done if there is a gap index attribute that lacks a
- * usable key (only possible when preprocessing was unable to
- * generate a skip array key to "fill in the gap").
+ * Also done if we've now examined all keys marked required.
*/
if (i >= so->numberOfKeys ||
- cur->sk_attno != curattr + 1)
+ !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
break;
/*
* Reset for next attr.
*/
+ Assert(cur->sk_attno == curattr + 1);
curattr = cur->sk_attno;
- chosen = NULL;
+ bkey = NULL;
impliesNN = NULL;
}
/*
- * Can we use this key as a starting boundary for this attr?
+ * If we've located the starting boundary key for curattr, we have
+ * no interest in curattr's other required key
+ */
+ if (bkey != NULL)
+ continue;
+
+ /*
+ * Is this key the starting boundary key for curattr?
*
* If not, does it imply a NOT NULL constraint? (Because
* SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
@@ -1202,27 +1222,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
{
case BTLessStrategyNumber:
case BTLessEqualStrategyNumber:
- if (chosen == NULL)
- {
- if (ScanDirectionIsBackward(dir))
- chosen = cur;
- else
- impliesNN = cur;
- }
+ if (ScanDirectionIsBackward(dir))
+ bkey = cur;
+ else if (impliesNN == NULL)
+ impliesNN = cur;
break;
case BTEqualStrategyNumber:
- /* override any non-equality choice */
- chosen = cur;
+ bkey = cur;
break;
case BTGreaterEqualStrategyNumber:
case BTGreaterStrategyNumber:
- if (chosen == NULL)
- {
- if (ScanDirectionIsForward(dir))
- chosen = cur;
- else
- impliesNN = cur;
- }
+ if (ScanDirectionIsForward(dir))
+ bkey = cur;
+ else if (impliesNN == NULL)
+ impliesNN = cur;
break;
}
}
@@ -1248,16 +1261,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
Assert(keysz <= INDEX_MAX_KEYS);
for (int i = 0; i < keysz; i++)
{
- ScanKey cur = startKeys[i];
+ ScanKey bkey = startKeys[i];
- Assert(cur->sk_attno == i + 1);
+ Assert(bkey->sk_attno == i + 1);
- if (cur->sk_flags & SK_ROW_HEADER)
+ if (bkey->sk_flags & SK_ROW_HEADER)
{
/*
* Row comparison header: look to the first row member instead
*/
- ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
+ ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
+ bool loosen_strat = false,
+ tighten_strat = false;
/*
* Cannot be a NULL in the first row member: _bt_preprocess_keys
@@ -1265,122 +1280,160 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* ever getting this far
*/
Assert(subkey->sk_flags & SK_ROW_MEMBER);
- Assert(subkey->sk_attno == cur->sk_attno);
+ Assert(subkey->sk_attno == bkey->sk_attno);
Assert(!(subkey->sk_flags & SK_ISNULL));
/*
+ * This is either a > or >= key (during backwards scans it is
+ * either < or <=) that was marked required during preprocessing.
+ * Later so->keyData[] keys can't have been marked required, so
+ * our row compare header key must be the final startKeys[] entry.
+ */
+ Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
+ Assert(i == keysz - 1);
+
+ /*
* The member scankeys are already in insertion format (ie, they
* have sk_func = 3-way-comparison function)
*/
memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
/*
- * If the row comparison is the last positioning key we accepted,
- * try to add additional keys from the lower-order row members.
- * (If we accepted independent conditions on additional index
- * columns, we use those instead --- doesn't seem worth trying to
- * determine which is more restrictive.) Note that this is OK
- * even if the row comparison is of ">" or "<" type, because the
- * condition applied to all but the last row member is effectively
- * ">=" or "<=", and so the extra keys don't break the positioning
- * scheme. But, by the same token, if we aren't able to use all
- * the row members, then the part of the row comparison that we
- * did use has to be treated as just a ">=" or "<=" condition, and
- * so we'd better adjust strat_total accordingly.
+ * Now look to later row compare members.
+ *
+ * If there's an "index attribute gap" between two row compare
+ * members, the second member won't have been marked required, and
+ * so can't be used as a starting boundary key here. The part of
+ * the row comparison that we do still use has to be treated as a
+ * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)"
+ * with an omitted intervening index attribute "b" will use an
+ * insertion scan key "a >= 1". Even the first "a = 1" tuple on
+ * the leaf level might satisfy the row compare qual.
+ *
+ * We're able to use a _more_ restrictive strategy when we reach a
+ * NULL row compare member, since it is always unsatisfiable.
+ * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
+ * insertion scan key "a > 1". All tuples where "a = 1" cannot
+ * possibly satisfy the row compare qual, so this is safe.
*/
- if (i == keysz - 1)
+ Assert(!(subkey->sk_flags & SK_ROW_END));
+ for (;;)
{
- bool used_all_subkeys = false;
+ subkey++;
+ Assert(subkey->sk_flags & SK_ROW_MEMBER);
- Assert(!(subkey->sk_flags & SK_ROW_END));
- for (;;)
+ if (subkey->sk_flags & SK_ISNULL)
{
- subkey++;
- Assert(subkey->sk_flags & SK_ROW_MEMBER);
- if (subkey->sk_attno != keysz + 1)
- break; /* out-of-sequence, can't use it */
- if (subkey->sk_strategy != cur->sk_strategy)
- break; /* wrong direction, can't use it */
- if (subkey->sk_flags & SK_ISNULL)
- break; /* can't use null keys */
- Assert(keysz < INDEX_MAX_KEYS);
- memcpy(inskey.scankeys + keysz, subkey,
- sizeof(ScanKeyData));
- keysz++;
- if (subkey->sk_flags & SK_ROW_END)
- {
- used_all_subkeys = true;
- break;
- }
+ /*
+ * NULL member key, can only use earlier keys.
+ *
+ * We deliberately avoid checking if this key is marked
+ * required. All earlier keys are required, and this key
+ * is unsatisfiable either way, so we can't miss anything.
+ */
+ tighten_strat = true;
+ break;
}
- if (!used_all_subkeys)
+
+ if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
{
- switch (strat_total)
- {
- case BTLessStrategyNumber:
- strat_total = BTLessEqualStrategyNumber;
- break;
- case BTGreaterStrategyNumber:
- strat_total = BTGreaterEqualStrategyNumber;
- break;
- }
+ /* nonrequired member key, can only use earlier keys */
+ loosen_strat = true;
+ break;
}
- break; /* done with outer loop */
+
+ Assert(subkey->sk_attno == keysz + 1);
+ Assert(subkey->sk_strategy == bkey->sk_strategy);
+ Assert(keysz < INDEX_MAX_KEYS);
+
+ memcpy(inskey.scankeys + keysz, subkey,
+ sizeof(ScanKeyData));
+ keysz++;
+ if (subkey->sk_flags & SK_ROW_END)
+ break;
}
- }
- else
- {
- /*
- * Ordinary comparison key. Transform the search-style scan key
- * to an insertion scan key by replacing the sk_func with the
- * appropriate btree comparison function.
- *
- * If scankey operator is not a cross-type comparison, we can use
- * the cached comparison function; otherwise gotta look it up in
- * the catalogs. (That can't lead to infinite recursion, since no
- * indexscan initiated by syscache lookup will use cross-data-type
- * operators.)
- *
- * We support the convention that sk_subtype == InvalidOid means
- * the opclass input type; this is a hack to simplify life for
- * ScanKeyInit().
- */
- if (cur->sk_subtype == rel->rd_opcintype[i] ||
- cur->sk_subtype == InvalidOid)
+ Assert(!(loosen_strat && tighten_strat));
+ if (loosen_strat)
{
- FmgrInfo *procinfo;
-
- procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
- ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
- cur->sk_flags,
- cur->sk_attno,
- InvalidStrategy,
- cur->sk_subtype,
- cur->sk_collation,
- procinfo,
- cur->sk_argument);
+ /* Use less restrictive strategy (and fewer member keys) */
+ switch (strat_total)
+ {
+ case BTLessStrategyNumber:
+ strat_total = BTLessEqualStrategyNumber;
+ break;
+ case BTGreaterStrategyNumber:
+ strat_total = BTGreaterEqualStrategyNumber;
+ break;
+ }
}
- else
+ if (tighten_strat)
{
- RegProcedure cmp_proc;
-
- cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
- rel->rd_opcintype[i],
- cur->sk_subtype,
- BTORDER_PROC);
- if (!RegProcedureIsValid(cmp_proc))
- elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
- BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
- cur->sk_attno, RelationGetRelationName(rel));
- ScanKeyEntryInitialize(inskey.scankeys + i,
- cur->sk_flags,
- cur->sk_attno,
- InvalidStrategy,
- cur->sk_subtype,
- cur->sk_collation,
- cmp_proc,
- cur->sk_argument);
+ /* Use more restrictive strategy (and fewer member keys) */
+ switch (strat_total)
+ {
+ case BTLessEqualStrategyNumber:
+ strat_total = BTLessStrategyNumber;
+ break;
+ case BTGreaterEqualStrategyNumber:
+ strat_total = BTGreaterStrategyNumber;
+ break;
+ }
}
+
+ /* done adding to inskey (row comparison keys always come last) */
+ break;
+ }
+
+ /*
+ * Ordinary comparison key/search-style key.
+ *
+ * Transform the search-style scan key to an insertion scan key by
+ * replacing the sk_func with the appropriate btree 3-way-comparison
+ * function.
+ *
+ * If scankey operator is not a cross-type comparison, we can use the
+ * cached comparison function; otherwise gotta look it up in the
+ * catalogs. (That can't lead to infinite recursion, since no
+ * indexscan initiated by syscache lookup will use cross-data-type
+ * operators.)
+ *
+ * We support the convention that sk_subtype == InvalidOid means the
+ * opclass input type; this hack simplifies life for ScanKeyInit().
+ */
+ if (bkey->sk_subtype == rel->rd_opcintype[i] ||
+ bkey->sk_subtype == InvalidOid)
+ {
+ FmgrInfo *procinfo;
+
+ procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
+ ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
+ bkey->sk_flags,
+ bkey->sk_attno,
+ InvalidStrategy,
+ bkey->sk_subtype,
+ bkey->sk_collation,
+ procinfo,
+ bkey->sk_argument);
+ }
+ else
+ {
+ RegProcedure cmp_proc;
+
+ cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+ rel->rd_opcintype[i],
+ bkey->sk_subtype, BTORDER_PROC);
+ if (!RegProcedureIsValid(cmp_proc))
+ elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+ BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
+ bkey->sk_attno, RelationGetRelationName(rel));
+ ScanKeyEntryInitialize(inskey.scankeys + i,
+ bkey->sk_flags,
+ bkey->sk_attno,
+ InvalidStrategy,
+ bkey->sk_subtype,
+ bkey->sk_collation,
+ cmp_proc,
+ bkey->sk_argument);
}
}
@@ -1469,6 +1522,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (!BufferIsValid(so->currPos.buf))
{
+ Assert(!so->needPrimScan);
+
/*
* We only get here if the index is completely empty. Lock relation
* because nothing finer to lock exists. Without a buffer lock, it's
@@ -1487,7 +1542,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
if (!BufferIsValid(so->currPos.buf))
{
- Assert(!so->needPrimScan);
_bt_parallel_done(scan);
return false;
}
@@ -1610,7 +1664,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
so->currPos.prevPage = opaque->btpo_prev;
so->currPos.nextPage = opaque->btpo_next;
+ /* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
+ so->currPos.dir = dir;
+ so->currPos.nextTupleOffset = 0;
+ /* either moreRight or moreLeft should be set now (may be unset later) */
+ Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
+ so->currPos.moreLeft);
Assert(!P_IGNORE(opaque));
Assert(BTScanPosIsPinned(so->currPos));
Assert(!so->needPrimScan);
@@ -1626,14 +1686,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
so->currPos.currPage);
}
- /* initialize remaining currPos fields related to current page */
- so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
- so->currPos.dir = dir;
- so->currPos.nextTupleOffset = 0;
- /* either moreLeft or moreRight should be set now (may be unset later) */
- Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
- so->currPos.moreLeft);
-
PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
/* initialize local variables */
@@ -2107,10 +2159,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
*
* Wrapper on _bt_readnextpage that performs final steps for the current page.
*
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, so->currPos must be valid. Its buffer will be pinned, though
+ * never locked. (Actually, when so->dropPin there won't even be a pin held,
+ * though so->currPos.currPage must still be set to a valid block number.)
*/
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2251,12 +2302,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
*/
if (_bt_readpage(scan, dir, offnum, true))
{
+ Relation rel = scan->indexRelation;
+
/*
* _bt_readpage succeeded. Drop the lock (and maybe the pin) on
* so->currPos.buf in preparation for btgettuple returning tuples.
*/
Assert(BTScanPosIsPinned(so->currPos));
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ _bt_drop_lock_and_maybe_pin(rel, so);
return true;
}
@@ -2278,9 +2331,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
* previously-saved right link or left link. lastcurrblkno is the page that
* was current at the point where the blkno link was saved, which we use to
* reason about concurrent page splits/page deletions during backwards scans.
+ * In the common case where seized=false, blkno is either so->currPos.nextPage
+ * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
*
- * On entry, caller shouldn't hold any locks or pins on any page (we work
- * directly off of blkno and lastcurrblkno instead). Parallel scan callers
+ * On entry, so->currPos shouldn't be locked by caller. so->currPos.buf must
+ * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
+ * won't need to be read again in almost all cases). Parallel scan callers
* that seized the scan before calling here should pass seized=true; such a
* caller's blkno and lastcurrblkno arguments come from the seized scan.
* seized=false callers just pass us the blkno/lastcurrblkno taken from their
@@ -2294,11 +2350,11 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
*
* On success exit, so->currPos is updated to contain data from the next
* interesting page, and we return true. We hold a pin on the buffer on
- * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
- * to eagerly drop the pin (to avoid blocking VACUUM).
+ * success exit (except during so->dropPin index scans, when we drop the pin
+ * eagerly to avoid blocking VACUUM).
*
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, invalidate so->currPos, and return false.
+ * If there are no more matching records in the given direction, we invalidate
+ * so->currPos (while ensuring it retains no locks or pins), and return false.
*
* We always release the scan for a parallel scan caller, regardless of
* success or failure; we'll call _bt_parallel_release as soon as possible.
@@ -2413,7 +2469,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
*/
Assert(so->currPos.currPage == blkno);
Assert(BTScanPosIsPinned(so->currPos));
- _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ _bt_drop_lock_and_maybe_pin(rel, so);
return true;
}
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 3794cc924ad..9d70e89c1f3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -105,7 +105,7 @@ typedef struct BTShared
int scantuplesortstates;
/* Query ID, for report in worker processes */
- uint64 queryid;
+ int64 queryid;
/*
* workersdonecv is used to monitor the progress of workers. All parallel
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..9aed207995f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -44,7 +44,6 @@ static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *arra
static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
bool *skip_array_set);
-static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
bool readpagetup, int sktrig, bool *scanBehind);
@@ -52,7 +51,6 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
int sktrig, bool sktrig_required);
#ifdef USE_ASSERT_CHECKING
-static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
#endif
static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
@@ -1035,73 +1033,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
}
/*
- * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required
- *
- * Called when _bt_advance_array_keys decides to start a new primitive index
- * scan on the basis of the current scan position being before the position
- * that _bt_first is capable of repositioning the scan to by applying an
- * inequality operator required in the opposite-to-scan direction only.
- *
- * Although equality strategy scan keys (for both arrays and non-arrays alike)
- * are either marked required in both directions or in neither direction,
- * there is a sense in which non-required arrays behave like required arrays.
- * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
- * the scan key on "c" is non-required, but nevertheless enables positioning
- * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
- * first descent of the tree by _bt_first. Later on, there could also be a
- * second descent, that places the scan right before tuples >= "(200, 3, 5)".
- * _bt_first must never be allowed to build an insertion scan key whose "c"
- * entry is set to a value other than 5, the "c" array's first element/value.
- * (Actually, it's the first in the current scan direction. This example uses
- * a forward scan.)
- *
- * Calling here resets the array scan key elements for the scan's non-required
- * arrays. This is strictly necessary for correctness in a subset of cases
- * involving "required in opposite direction"-triggered primitive index scans.
- * Not all callers are at risk of _bt_first using a non-required array like
- * this, but advancement always resets the arrays when another primitive scan
- * is scheduled, just to keep things simple. Array advancement even makes
- * sure to reset non-required arrays during scans that have no inequalities.
- * (Advancement still won't call here when there are no inequalities, though
- * that's just because it's all handled indirectly instead.)
- *
- * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
- * everybody got this right.
- *
- * Note: In practice almost all SAOP arrays are marked required during
- * preprocessing (if necessary by generating skip arrays). It is hardly ever
- * truly necessary to call here, but consistently doing so is simpler.
- */
-static void
-_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
-{
- Relation rel = scan->indexRelation;
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int arrayidx = 0;
-
- for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
- {
- ScanKey cur = so->keyData + ikey;
- BTArrayKeyInfo *array = NULL;
-
- if (!(cur->sk_flags & SK_SEARCHARRAY) ||
- cur->sk_strategy != BTEqualStrategyNumber)
- continue;
-
- array = &so->arrayKeys[arrayidx++];
- Assert(array->scan_key == ikey);
-
- if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
- continue;
-
- Assert(array->num_elems != -1); /* No non-required skip arrays */
-
- _bt_array_set_low_or_high(rel, cur, array,
- ScanDirectionIsForward(dir));
- }
-}
-
-/*
* _bt_tuple_before_array_skeys() -- too early to advance required arrays?
*
* We always compare the tuple using the current array keys (which we assume
@@ -1380,8 +1311,6 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
*/
if (so->needPrimScan)
{
- Assert(_bt_verify_arrays_bt_first(scan, dir));
-
/*
* Flag was set -- must call _bt_first again, which will reset the
* scan's needPrimScan flag
@@ -2007,14 +1936,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
*/
else if (has_required_opposite_direction_only && pstate->finaltup &&
unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
- {
- /*
- * Make sure that any SAOP arrays that were not marked required by
- * preprocessing are reset to their first element for this direction
- */
- _bt_rewind_nonrequired_arrays(scan, dir);
goto new_prim_scan;
- }
continue_scan:
@@ -2045,8 +1967,6 @@ continue_scan:
*/
so->oppositeDirCheck = has_required_opposite_direction_only;
- _bt_rewind_nonrequired_arrays(scan, dir);
-
/*
* skip by setting "look ahead" mechanism's offnum for forwards scans
* (backwards scans check scanBehind flag directly instead)
@@ -2143,48 +2063,6 @@ end_toplevel_scan:
#ifdef USE_ASSERT_CHECKING
/*
- * Verify that the scan's qual state matches what we expect at the point that
- * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
- *
- * We enforce a rule against non-required array scan keys: they must start out
- * with whatever element is the first for the scan's current scan direction.
- * See _bt_rewind_nonrequired_arrays comments for an explanation.
- */
-static bool
-_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
-{
- BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int arrayidx = 0;
-
- for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
- {
- ScanKey cur = so->keyData + ikey;
- BTArrayKeyInfo *array = NULL;
- int first_elem_dir;
-
- if (!(cur->sk_flags & SK_SEARCHARRAY) ||
- cur->sk_strategy != BTEqualStrategyNumber)
- continue;
-
- array = &so->arrayKeys[arrayidx++];
-
- if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
- ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
- continue;
-
- if (ScanDirectionIsForward(dir))
- first_elem_dir = 0;
- else
- first_elem_dir = array->num_elems - 1;
-
- if (array->cur_elem != first_elem_dir)
- return false;
- }
-
- return _bt_verify_keys_with_arraykeys(scan);
-}
-
-/*
* Verify that the scan's "so->keyData[]" scan keys are in agreement with
* its array key state
*/
@@ -2194,6 +2072,7 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
BTScanOpaque so = (BTScanOpaque) scan->opaque;
int last_sk_attno = InvalidAttrNumber,
arrayidx = 0;
+ bool nonrequiredseen = false;
if (!so->qual_ok)
return false;
@@ -2217,8 +2096,16 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
if (array->num_elems != -1 &&
cur->sk_argument != array->elem_values[array->cur_elem])
return false;
- if (last_sk_attno > cur->sk_attno)
- return false;
+ if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
+ {
+ if (last_sk_attno > cur->sk_attno)
+ return false;
+ if (nonrequiredseen)
+ return false;
+ }
+ else
+ nonrequiredseen = true;
+
last_sk_attno = cur->sk_attno;
}
@@ -2551,37 +2438,12 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
{
/* Scan key isn't marked required (corner case) */
- Assert(!(key->sk_flags & SK_ROW_HEADER));
break; /* unsafe */
}
if (key->sk_flags & SK_ROW_HEADER)
{
- /*
- * RowCompare inequality.
- *
- * Only the first subkey from a RowCompare can ever be marked
- * required (that happens when the row header is marked required).
- * There is no simple, general way for us to transitively deduce
- * whether or not every tuple on the page satisfies a RowCompare
- * key based only on firsttup and lasttup -- so we just give up.
- */
- if (!start_past_saop_eq && !so->skipScan)
- break; /* unsafe to go further */
-
- /*
- * We have to be even more careful with RowCompares that come
- * after an array: we assume it's unsafe to even bypass the array.
- * Calling _bt_start_array_keys to recover the scan's arrays
- * following use of forcenonrequired mode isn't compatible with
- * _bt_check_rowcompare's continuescan=false behavior with NULL
- * row compare members. _bt_advance_array_keys must not make a
- * decision on the basis of a key not being satisfied in the
- * opposite-to-scan direction until the scan reaches a leaf page
- * where the same key begins to be satisfied in scan direction.
- * The _bt_first !used_all_subkeys behavior makes this limitation
- * hard to work around some other way.
- */
- return; /* completely unsafe to set pstate.startikey */
+ /* RowCompare inequalities currently aren't supported */
+ break; /* "unsafe" */
}
if (key->sk_strategy != BTEqualStrategyNumber)
{
@@ -3078,6 +2940,31 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
Assert(subkey->sk_flags & SK_ROW_MEMBER);
+ /* When a NULL row member is compared, the row never matches */
+ if (subkey->sk_flags & SK_ISNULL)
+ {
+ /*
+ * Unlike the simple-scankey case, this isn't a disallowed case
+ * (except when it's the first row element that has the NULL arg).
+ * But it can never match. If all the earlier row comparison
+ * columns are required for the scan direction, we can stop the
+ * scan, because there can't be another tuple that will succeed.
+ */
+ Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
+ subkey--;
+ if (forcenonrequired)
+ {
+ /* treating scan's keys as non-required */
+ }
+ else if ((subkey->sk_flags & SK_BT_REQFWD) &&
+ ScanDirectionIsForward(dir))
+ *continuescan = false;
+ else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+ ScanDirectionIsBackward(dir))
+ *continuescan = false;
+ return false;
+ }
+
if (subkey->sk_attno > tupnatts)
{
/*
@@ -3087,11 +2974,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(BTreeTupleIsPivot(tuple));
- cmpresult = 0;
- if (subkey->sk_flags & SK_ROW_END)
- break;
- subkey++;
- continue;
+ return true;
}
datum = index_getattr(tuple,
@@ -3101,6 +2984,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
if (isNull)
{
+ int reqflags;
+
if (forcenonrequired)
{
/* treating scan's keys as non-required */
@@ -3111,15 +2996,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* Since NULLs are sorted before non-NULLs, we know we have
* reached the lower limit of the range of values for this
* index attr. On a backward scan, we can stop if this qual
- * is one of the "must match" subset. We can stop regardless
- * of whether the qual is > or <, so long as it's required,
- * because it's not possible for any future tuples to pass. On
- * a forward scan, however, we must keep going, because we may
- * have initially positioned to the start of the index.
- * (_bt_advance_array_keys also relies on this behavior during
- * forward scans.)
+ * is one of the "must match" subset. However, on a forwards
+ * scan, we must keep going, because we may have initially
+ * positioned to the start of the index.
+ *
+ * All required NULLS FIRST > row members can use NULL tuple
+ * values to end backwards scans, just like with other values.
+ * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a
+ * backwards scan upon reaching the index's rightmost "a = 9"
+ * tuple whose "b" column contains a NULL (if not sooner).
+ * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42.
*/
- if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ reqflags = SK_BT_REQBKWD;
+
+ /*
+ * When a most significant required NULLS FIRST < row compare
+ * member sees NULL tuple values during a backwards scan, it
+ * signals the end of matches for the whole row compare/scan.
+ * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a
+ * backwards scan upon reaching the rightmost tuple whose "a"
+ * column has a NULL. The "a" NULL value is "<" 9, and yet
+ * our < row compare will still end the scan. (This isn't
+ * safe with later/lower-order row members. Notice that it
+ * can only happen with an "a" NULL some time after the scan
+ * completely stops needing to use its "b" and "c" members.)
+ */
+ if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+ reqflags |= SK_BT_REQFWD; /* safe, first row member */
+
+ if ((subkey->sk_flags & reqflags) &&
ScanDirectionIsBackward(dir))
*continuescan = false;
}
@@ -3129,15 +3034,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* Since NULLs are sorted after non-NULLs, we know we have
* reached the upper limit of the range of values for this
* index attr. On a forward scan, we can stop if this qual is
- * one of the "must match" subset. We can stop regardless of
- * whether the qual is > or <, so long as it's required,
- * because it's not possible for any future tuples to pass. On
- * a backward scan, however, we must keep going, because we
- * may have initially positioned to the end of the index.
- * (_bt_advance_array_keys also relies on this behavior during
- * backward scans.)
+ * one of the "must match" subset. However, on a backward
+ * scan, we must keep going, because we may have initially
+ * positioned to the end of the index.
+ *
+ * All required NULLS LAST < row members can use NULL tuple
+ * values to end forwards scans, just like with other values.
+ * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a
+ * forwards scan upon reaching the index's leftmost "a = 9"
+ * tuple whose "b" column contains a NULL (if not sooner).
+ * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42.
+ */
+ reqflags = SK_BT_REQFWD;
+
+ /*
+ * When a most significant required NULLS LAST > row compare
+ * member sees NULL tuple values during a forwards scan, it
+ * signals the end of matches for the whole row compare/scan.
+ * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a
+ * forwards scan upon reaching the leftmost tuple whose "a"
+ * column has a NULL. The "a" NULL value is ">" 9, and yet
+ * our > row compare will end the scan. (This isn't safe with
+ * later/lower-order row members. Notice that it can only
+ * happen with an "a" NULL some time after the scan completely
+ * stops needing to use its "b" and "c" members.)
*/
- if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+ if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+ reqflags |= SK_BT_REQBKWD; /* safe, first row member */
+
+ if ((subkey->sk_flags & reqflags) &&
ScanDirectionIsForward(dir))
*continuescan = false;
}
@@ -3148,30 +3073,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
return false;
}
- if (subkey->sk_flags & SK_ISNULL)
- {
- /*
- * Unlike the simple-scankey case, this isn't a disallowed case
- * (except when it's the first row element that has the NULL arg).
- * But it can never match. If all the earlier row comparison
- * columns are required for the scan direction, we can stop the
- * scan, because there can't be another tuple that will succeed.
- */
- Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
- subkey--;
- if (forcenonrequired)
- {
- /* treating scan's keys as non-required */
- }
- else if ((subkey->sk_flags & SK_BT_REQFWD) &&
- ScanDirectionIsForward(dir))
- *continuescan = false;
- else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
- ScanDirectionIsBackward(dir))
- *continuescan = false;
- return false;
- }
-
/* Perform the test --- three-way comparison not bool operator */
cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
subkey->sk_collation,
@@ -3330,87 +3231,85 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
* current page and killed tuples thereon (generally, this should only be
* called if so->numKilled > 0).
*
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer. Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
- *
- * We match items by heap TID before assuming they are the right ones to
- * delete. We cope with cases where items have moved right due to insertions.
- * If an item has moved off the current page due to a split, we'll fail to
- * find it and do nothing (this is not an error case --- we assume the item
- * will eventually get marked in a future indexscan).
+ * Caller should not have a lock on the so->currPos page, but must hold a
+ * buffer pin when !so->dropPin. When we return, it still won't be locked.
+ * It'll continue to hold whatever pins were held before calling here.
*
- * Note that if we hold a pin on the target page continuously from initially
- * reading the items until applying this function, VACUUM cannot have deleted
- * any items from the page, and so there is no need to search left from the
- * recorded offset. (This observation also guarantees that the item is still
- * the right one to delete, which might otherwise be questionable since heap
- * TIDs can get recycled.) This holds true even if the page has been modified
- * by inserts and page splits, so there is no need to consult the LSN.
+ * We match items by heap TID before assuming they are the right ones to set
+ * LP_DEAD. If the scan is one that holds a buffer pin on the target page
+ * continuously from initially reading the items until applying this function
+ * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
+ * page, so the page's TIDs can't have been recycled by now. There's no risk
+ * that we'll confuse a new index tuple that happens to use a recycled TID
+ * with a now-removed tuple with the same TID (that used to be on this same
+ * page). We can't rely on that during scans that drop buffer pins eagerly
+ * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * the page LSN having not changed since back when _bt_readpage saw the page.
+ * We totally give up on setting LP_DEAD bits when the page LSN changed.
*
- * If the pin was released after reading the page, then we re-read it. If it
- * has been modified since we read it (as determined by the LSN), we dare not
- * flag any entries because it is possible that the old entry was vacuumed
- * away and the TID was re-used by a completely different heap tuple.
+ * We give up much less often during !so->dropPin scans, but it still happens.
+ * We cope with cases where items have moved right due to insertions. If an
+ * item has moved off the current page due to a split, we'll fail to find it
+ * and just give up on it.
*/
void
_bt_killitems(IndexScanDesc scan)
{
+ Relation rel = scan->indexRelation;
BTScanOpaque so = (BTScanOpaque) scan->opaque;
Page page;
BTPageOpaque opaque;
OffsetNumber minoff;
OffsetNumber maxoff;
- int i;
int numKilled = so->numKilled;
bool killedsomething = false;
- bool droppedpin PG_USED_FOR_ASSERTS_ONLY;
+ Buffer buf;
+ Assert(numKilled > 0);
Assert(BTScanPosIsValid(so->currPos));
+ Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
- /*
- * Always reset the scan state, so we don't look for same items on other
- * pages.
- */
+ /* Always invalidate so->killedItems[] before leaving so->currPos */
so->numKilled = 0;
- if (BTScanPosIsPinned(so->currPos))
+ if (!so->dropPin)
{
/*
* We have held the pin on this page since we read the index tuples,
* so all we need to do is lock it. The pin will have prevented
- * re-use of any TID on the page, so there is no need to check the
- * LSN.
+ * concurrent VACUUMs from recycling any of the TIDs on the page.
*/
- droppedpin = false;
- _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
-
- page = BufferGetPage(so->currPos.buf);
+ Assert(BTScanPosIsPinned(so->currPos));
+ buf = so->currPos.buf;
+ _bt_lockbuf(rel, buf, BT_READ);
}
else
{
- Buffer buf;
+ XLogRecPtr latestlsn;
- droppedpin = true;
- /* Attempt to re-read the buffer, getting pin and lock. */
- buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+ Assert(!BTScanPosIsPinned(so->currPos));
+ Assert(RelationNeedsWAL(rel));
+ buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
- page = BufferGetPage(buf);
- if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
- so->currPos.buf = buf;
- else
+ latestlsn = BufferGetLSNAtomic(buf);
+ Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+ Assert(so->currPos.lsn <= latestlsn);
+ if (so->currPos.lsn != latestlsn)
{
- /* Modified while not pinned means hinting is not safe. */
- _bt_relbuf(scan->indexRelation, buf);
+ /* Modified, give up on hinting */
+ _bt_relbuf(rel, buf);
return;
}
+
+ /* Unmodified, hinting is safe */
}
+ page = BufferGetPage(buf);
opaque = BTPageGetOpaque(page);
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
- for (i = 0; i < numKilled; i++)
+ for (int i = 0; i < numKilled; i++)
{
int itemIndex = so->killedItems[i];
BTScanPosItem *kitem = &so->currPos.items[itemIndex];
@@ -3442,7 +3341,7 @@ _bt_killitems(IndexScanDesc scan)
* correctness.
*
* Note that the page may have been modified in almost any way
- * since we first read it (in the !droppedpin case), so it's
+ * since we first read it (in the !so->dropPin case), so it's
* possible that this posting list tuple wasn't a posting list
* tuple when we first encountered its heap TIDs.
*/
@@ -3458,7 +3357,7 @@ _bt_killitems(IndexScanDesc scan)
* though only in the common case where the page can't
* have been concurrently modified
*/
- Assert(kitem->indexOffset == offnum || !droppedpin);
+ Assert(kitem->indexOffset == offnum || !so->dropPin);
/*
* Read-ahead to later kitems here.
@@ -3522,10 +3421,13 @@ _bt_killitems(IndexScanDesc scan)
if (killedsomething)
{
opaque->btpo_flags |= BTP_HAS_GARBAGE;
- MarkBufferDirtyHint(so->currPos.buf, true);
+ MarkBufferDirtyHint(buf, true);
}
- _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+ if (!so->dropPin)
+ _bt_unlockbuf(rel, buf);
+ else
+ _bt_relbuf(rel, buf);
}
diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c
index 5dd74233996..35e3af2903e 100644
--- a/src/backend/access/rmgrdesc/replorigindesc.c
+++ b/src/backend/access/rmgrdesc/replorigindesc.c
@@ -29,7 +29,7 @@ replorigin_desc(StringInfo buf, XLogReaderState *record)
xlrec = (xl_replorigin_set *) rec;
- appendStringInfo(buf, "set %u; lsn %X/%X; force: %d",
+ appendStringInfo(buf, "set %u; lsn %X/%08X; force: %d",
xlrec->node_id,
LSN_FORMAT_ARGS(xlrec->remote_lsn),
xlrec->force);
diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c
index 305598e2865..f0f696855b9 100644
--- a/src/backend/access/rmgrdesc/xactdesc.c
+++ b/src/backend/access/rmgrdesc/xactdesc.c
@@ -359,7 +359,7 @@ xact_desc_commit(StringInfo buf, uint8 info, xl_xact_commit *xlrec, RepOriginId
if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN)
{
- appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s",
+ appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s",
origin_id,
LSN_FORMAT_ARGS(parsed.origin_lsn),
timestamptz_to_str(parsed.origin_timestamp));
@@ -384,7 +384,7 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec, RepOriginId or
if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN)
{
- appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s",
+ appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s",
origin_id,
LSN_FORMAT_ARGS(parsed.origin_lsn),
timestamptz_to_str(parsed.origin_timestamp));
@@ -418,7 +418,7 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI
* way as PrepareRedoAdd().
*/
if (origin_id != InvalidRepOriginId)
- appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s",
+ appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s",
origin_id,
LSN_FORMAT_ARGS(parsed.origin_lsn),
timestamptz_to_str(parsed.origin_timestamp));
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 58040f28656..cd6c2a2f650 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -65,7 +65,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
{
CheckPoint *checkpoint = (CheckPoint *) rec;
- appendStringInfo(buf, "redo %X/%X; "
+ appendStringInfo(buf, "redo %X/%08X; "
"tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; "
"oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; "
@@ -111,7 +111,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
XLogRecPtr startpoint;
memcpy(&startpoint, rec, sizeof(XLogRecPtr));
- appendStringInfo(buf, "%X/%X", LSN_FORMAT_ARGS(startpoint));
+ appendStringInfo(buf, "%X/%08X", LSN_FORMAT_ARGS(startpoint));
}
else if (info == XLOG_PARAMETER_CHANGE)
{
@@ -156,7 +156,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
xl_overwrite_contrecord xlrec;
memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord));
- appendStringInfo(buf, "lsn %X/%X; time %s",
+ appendStringInfo(buf, "lsn %X/%08X; time %s",
LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
timestamptz_to_str(xlrec.overwrite_time));
}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 113fae1437a..225ff7ca9f2 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -707,6 +707,13 @@ ActivateCommitTs(void)
TransactionId xid;
int64 pageno;
+ /*
+ * During bootstrap, we should not register commit timestamps so skip the
+ * activation in this case.
+ */
+ if (IsBootstrapProcessingMode())
+ return;
+
/* If we've done this already, there's nothing to do */
LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
if (commitTsShared->commitTsActive)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3c06ac45532..7a7afe3edc6 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1847,7 +1847,7 @@ AtPrepare_MultiXact(void)
* Clean up after successful PREPARE TRANSACTION
*/
void
-PostPrepare_MultiXact(TransactionId xid)
+PostPrepare_MultiXact(FullTransactionId fxid)
{
MultiXactId myOldestMember;
@@ -1858,7 +1858,7 @@ PostPrepare_MultiXact(TransactionId xid)
myOldestMember = OldestMemberMXactId[MyProcNumber];
if (MultiXactIdIsValid(myOldestMember))
{
- ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
+ ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
/*
* Even though storing MultiXactId is atomic, acquire lock to make
@@ -1896,10 +1896,10 @@ PostPrepare_MultiXact(TransactionId xid)
* Recover the state of a prepared transaction at startup
*/
void
-multixact_twophase_recover(TransactionId xid, uint16 info,
+multixact_twophase_recover(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
- ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
+ ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
MultiXactId oldestMember;
/*
@@ -1917,10 +1917,10 @@ multixact_twophase_recover(TransactionId xid, uint16 info,
* Similar to AtEOXact_MultiXact but for COMMIT PREPARED
*/
void
-multixact_twophase_postcommit(TransactionId xid, uint16 info,
+multixact_twophase_postcommit(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
- ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true);
+ ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
Assert(len == sizeof(MultiXactId));
@@ -1932,10 +1932,10 @@ multixact_twophase_postcommit(TransactionId xid, uint16 info,
* This is actually just the same as the COMMIT case.
*/
void
-multixact_twophase_postabort(TransactionId xid, uint16 info,
+multixact_twophase_postabort(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
- multixact_twophase_postcommit(xid, info, recdata, len);
+ multixact_twophase_postcommit(fxid, info, recdata, len);
}
/*
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index a27f27cc037..186eb91f609 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -154,7 +154,7 @@ readTimeLineHistory(TimeLineID targetTLI)
if (*ptr == '\0' || *ptr == '#')
continue;
- nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo);
+ nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo);
if (nfields < 1)
{
@@ -399,7 +399,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
* parent file failed to end with one.
*/
snprintf(buffer, sizeof(buffer),
- "%s%u\t%X/%X\t%s\n",
+ "%s%u\t%X/%08X\t%s\n",
(srcfd < 0) ? "" : "\n",
parentTLI,
LSN_FORMAT_ARGS(switchpoint),
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 73a80559194..85cbe397cb2 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -159,7 +159,7 @@ typedef struct GlobalTransactionData
*/
XLogRecPtr prepare_start_lsn; /* XLOG offset of prepare record start */
XLogRecPtr prepare_end_lsn; /* XLOG offset of prepare record end */
- TransactionId xid; /* The GXACT id */
+ FullTransactionId fxid; /* The GXACT full xid */
Oid owner; /* ID of user that executed the xact */
ProcNumber locking_backend; /* backend currently working on the xact */
@@ -197,6 +197,7 @@ static GlobalTransaction MyLockedGxact = NULL;
static bool twophaseExitRegistered = false;
+static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning);
static void RecordTransactionCommitPrepared(TransactionId xid,
int nchildren,
TransactionId *children,
@@ -216,19 +217,19 @@ static void RecordTransactionAbortPrepared(TransactionId xid,
int nstats,
xl_xact_stats_item *stats,
const char *gid);
-static void ProcessRecords(char *bufptr, TransactionId xid,
+static void ProcessRecords(char *bufptr, FullTransactionId fxid,
const TwoPhaseCallback callbacks[]);
static void RemoveGXact(GlobalTransaction gxact);
static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
-static char *ProcessTwoPhaseBuffer(TransactionId xid,
+static char *ProcessTwoPhaseBuffer(FullTransactionId fxid,
XLogRecPtr prepare_start_lsn,
bool fromdisk, bool setParent, bool setNextXid);
-static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid,
+static void MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid,
const char *gid, TimestampTz prepared_at, Oid owner,
Oid databaseid);
-static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning);
-static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
+static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning);
+static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len);
/*
* Initialization of shared memory
@@ -356,7 +357,7 @@ PostPrepare_Twophase(void)
* Reserve the GID for the given transaction.
*/
GlobalTransaction
-MarkAsPreparing(TransactionId xid, const char *gid,
+MarkAsPreparing(FullTransactionId fxid, const char *gid,
TimestampTz prepared_at, Oid owner, Oid databaseid)
{
GlobalTransaction gxact;
@@ -407,7 +408,7 @@ MarkAsPreparing(TransactionId xid, const char *gid,
gxact = TwoPhaseState->freeGXacts;
TwoPhaseState->freeGXacts = gxact->next;
- MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid);
+ MarkAsPreparingGuts(gxact, fxid, gid, prepared_at, owner, databaseid);
gxact->ondisk = false;
@@ -430,11 +431,13 @@ MarkAsPreparing(TransactionId xid, const char *gid,
* Note: This function should be called with appropriate locks held.
*/
static void
-MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
- TimestampTz prepared_at, Oid owner, Oid databaseid)
+MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid,
+ const char *gid, TimestampTz prepared_at, Oid owner,
+ Oid databaseid)
{
PGPROC *proc;
int i;
+ TransactionId xid = XidFromFullTransactionId(fxid);
Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
@@ -479,7 +482,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
proc->subxidStatus.count = 0;
gxact->prepared_at = prepared_at;
- gxact->xid = xid;
+ gxact->fxid = fxid;
gxact->owner = owner;
gxact->locking_backend = MyProcNumber;
gxact->valid = false;
@@ -797,12 +800,12 @@ pg_prepared_xact(PG_FUNCTION_ARGS)
* caller had better hold it.
*/
static GlobalTransaction
-TwoPhaseGetGXact(TransactionId xid, bool lock_held)
+TwoPhaseGetGXact(FullTransactionId fxid, bool lock_held)
{
GlobalTransaction result = NULL;
int i;
- static TransactionId cached_xid = InvalidTransactionId;
+ static FullTransactionId cached_fxid = {InvalidTransactionId};
static GlobalTransaction cached_gxact = NULL;
Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));
@@ -811,7 +814,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)
* During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
* repeatedly for the same XID. We can save work with a simple cache.
*/
- if (xid == cached_xid)
+ if (FullTransactionIdEquals(fxid, cached_fxid))
return cached_gxact;
if (!lock_held)
@@ -821,7 +824,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)
{
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
- if (gxact->xid == xid)
+ if (FullTransactionIdEquals(gxact->fxid, fxid))
{
result = gxact;
break;
@@ -832,9 +835,10 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)
LWLockRelease(TwoPhaseStateLock);
if (result == NULL) /* should not happen */
- elog(ERROR, "failed to find GlobalTransaction for xid %u", xid);
+ elog(ERROR, "failed to find GlobalTransaction for xid %u",
+ XidFromFullTransactionId(fxid));
- cached_xid = xid;
+ cached_fxid = fxid;
cached_gxact = result;
return result;
@@ -881,7 +885,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
*have_more = true;
break;
}
- result = gxact->xid;
+ result = XidFromFullTransactionId(gxact->fxid);
}
}
@@ -892,7 +896,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
/*
* TwoPhaseGetDummyProcNumber
- * Get the dummy proc number for prepared transaction specified by XID
+ * Get the dummy proc number for prepared transaction
*
* Dummy proc numbers are similar to proc numbers of real backends. They
* start at MaxBackends, and are unique across all currently active real
@@ -900,24 +904,24 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
* TwoPhaseStateLock will not be taken, so the caller had better hold it.
*/
ProcNumber
-TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held)
+TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held)
{
- GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
+ GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held);
return gxact->pgprocno;
}
/*
* TwoPhaseGetDummyProc
- * Get the PGPROC that represents a prepared transaction specified by XID
+ * Get the PGPROC that represents a prepared transaction
*
* If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
* caller had better hold it.
*/
PGPROC *
-TwoPhaseGetDummyProc(TransactionId xid, bool lock_held)
+TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held)
{
- GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
+ GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held);
return GetPGProcByNumber(gxact->pgprocno);
}
@@ -942,10 +946,8 @@ AdjustToFullTransactionId(TransactionId xid)
}
static inline int
-TwoPhaseFilePath(char *path, TransactionId xid)
+TwoPhaseFilePath(char *path, FullTransactionId fxid)
{
- FullTransactionId fxid = AdjustToFullTransactionId(xid);
-
return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X",
EpochFromFullTransactionId(fxid),
XidFromFullTransactionId(fxid));
@@ -1049,7 +1051,7 @@ void
StartPrepare(GlobalTransaction gxact)
{
PGPROC *proc = GetPGProcByNumber(gxact->pgprocno);
- TransactionId xid = gxact->xid;
+ TransactionId xid = XidFromFullTransactionId(gxact->fxid);
TwoPhaseFileHeader hdr;
TransactionId *children;
RelFileLocator *commitrels;
@@ -1281,10 +1283,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
* If it looks OK (has a valid magic number and CRC), return the palloc'd
* contents of the file, issuing an error when finding corrupted data. If
* missing_ok is true, which indicates that missing files can be safely
- * ignored, then return NULL. This state can be reached when doing recovery.
+ * ignored, then return NULL. This state can be reached when doing recovery
+ * after discarding two-phase files from frozen epochs.
*/
static char *
-ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
+ReadTwoPhaseFile(FullTransactionId fxid, bool missing_ok)
{
char path[MAXPGPATH];
char *buf;
@@ -1296,7 +1299,7 @@ ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
file_crc;
int r;
- TwoPhaseFilePath(path, xid);
+ TwoPhaseFilePath(path, fxid);
fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
if (fd < 0)
@@ -1426,12 +1429,12 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
if (errormsg)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not read two-phase state from WAL at %X/%X: %s",
+ errmsg("could not read two-phase state from WAL at %X/%08X: %s",
LSN_FORMAT_ARGS(lsn), errormsg)));
else
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not read two-phase state from WAL at %X/%X",
+ errmsg("could not read two-phase state from WAL at %X/%08X",
LSN_FORMAT_ARGS(lsn))));
}
@@ -1439,7 +1442,7 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
(XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("expected two-phase state data is not present in WAL at %X/%X",
+ errmsg("expected two-phase state data is not present in WAL at %X/%08X",
LSN_FORMAT_ARGS(lsn))));
if (len != NULL)
@@ -1461,6 +1464,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid)
char *buf;
TwoPhaseFileHeader *hdr;
bool result;
+ FullTransactionId fxid;
Assert(TransactionIdIsValid(xid));
@@ -1468,7 +1472,8 @@ StandbyTransactionIdIsPrepared(TransactionId xid)
return false; /* nothing to do */
/* Read and validate file */
- buf = ReadTwoPhaseFile(xid, true);
+ fxid = AdjustToFullTransactionId(xid);
+ buf = ReadTwoPhaseFile(fxid, true);
if (buf == NULL)
return false;
@@ -1488,6 +1493,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
{
GlobalTransaction gxact;
PGPROC *proc;
+ FullTransactionId fxid;
TransactionId xid;
bool ondisk;
char *buf;
@@ -1509,7 +1515,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
*/
gxact = LockGXact(gid, GetUserId());
proc = GetPGProcByNumber(gxact->pgprocno);
- xid = gxact->xid;
+ fxid = gxact->fxid;
+ xid = XidFromFullTransactionId(fxid);
/*
* Read and validate 2PC state data. State data will typically be stored
@@ -1517,7 +1524,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
* to disk if for some reason they have lived for a long time.
*/
if (gxact->ondisk)
- buf = ReadTwoPhaseFile(xid, false);
+ buf = ReadTwoPhaseFile(fxid, false);
else
XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
@@ -1636,11 +1643,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
/* And now do the callbacks */
if (isCommit)
- ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
+ ProcessRecords(bufptr, fxid, twophase_postcommit_callbacks);
else
- ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
+ ProcessRecords(bufptr, fxid, twophase_postabort_callbacks);
- PredicateLockTwoPhaseFinish(xid, isCommit);
+ PredicateLockTwoPhaseFinish(fxid, isCommit);
/*
* Read this value while holding the two-phase lock, as the on-disk 2PC
@@ -1664,7 +1671,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
* And now we can clean up any files we may have left.
*/
if (ondisk)
- RemoveTwoPhaseFile(xid, true);
+ RemoveTwoPhaseFile(fxid, true);
MyLockedGxact = NULL;
@@ -1677,7 +1684,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
* Scan 2PC state data in memory and call the indicated callbacks for each 2PC record.
*/
static void
-ProcessRecords(char *bufptr, TransactionId xid,
+ProcessRecords(char *bufptr, FullTransactionId fxid,
const TwoPhaseCallback callbacks[])
{
for (;;)
@@ -1691,24 +1698,28 @@ ProcessRecords(char *bufptr, TransactionId xid,
bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
if (callbacks[record->rmid] != NULL)
- callbacks[record->rmid] (xid, record->info, bufptr, record->len);
+ callbacks[record->rmid] (fxid, record->info, bufptr, record->len);
bufptr += MAXALIGN(record->len);
}
}
/*
- * Remove the 2PC file for the specified XID.
+ * Remove the 2PC file.
*
* If giveWarning is false, do not complain about file-not-present;
* this is an expected case during WAL replay.
+ *
+ * This routine is used at early stages of recovery where future and
+ * past orphaned files are checked, hence the FullTransactionId to build
+ * a complete file name fit for the removal.
*/
static void
-RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
+RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning)
{
char path[MAXPGPATH];
- TwoPhaseFilePath(path, xid);
+ TwoPhaseFilePath(path, fxid);
if (unlink(path))
if (errno != ENOENT || giveWarning)
ereport(WARNING,
@@ -1723,7 +1734,7 @@ RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
* Note: content and len don't include CRC.
*/
static void
-RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
+RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len)
{
char path[MAXPGPATH];
pg_crc32c statefile_crc;
@@ -1734,7 +1745,7 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
COMP_CRC32C(statefile_crc, content, len);
FIN_CRC32C(statefile_crc);
- TwoPhaseFilePath(path, xid);
+ TwoPhaseFilePath(path, fxid);
fd = OpenTransientFile(path,
O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
@@ -1846,7 +1857,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
int len;
XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len);
- RecreateTwoPhaseFile(gxact->xid, buf, len);
+ RecreateTwoPhaseFile(gxact->fxid, buf, len);
gxact->ondisk = true;
gxact->prepare_start_lsn = InvalidXLogRecPtr;
gxact->prepare_end_lsn = InvalidXLogRecPtr;
@@ -1897,19 +1908,17 @@ restoreTwoPhaseData(void)
if (strlen(clde->d_name) == 16 &&
strspn(clde->d_name, "0123456789ABCDEF") == 16)
{
- TransactionId xid;
FullTransactionId fxid;
char *buf;
fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16));
- xid = XidFromFullTransactionId(fxid);
- buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr,
+ buf = ProcessTwoPhaseBuffer(fxid, InvalidXLogRecPtr,
true, false, false);
if (buf == NULL)
continue;
- PrepareRedoAdd(buf, InvalidXLogRecPtr,
+ PrepareRedoAdd(fxid, buf, InvalidXLogRecPtr,
InvalidXLogRecPtr, InvalidRepOriginId);
}
}
@@ -1968,9 +1977,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
Assert(gxact->inredo);
- xid = gxact->xid;
-
- buf = ProcessTwoPhaseBuffer(xid,
+ buf = ProcessTwoPhaseBuffer(gxact->fxid,
gxact->prepare_start_lsn,
gxact->ondisk, false, true);
@@ -1981,6 +1988,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
* OK, we think this file is valid. Incorporate xid into the
* running-minimum result.
*/
+ xid = XidFromFullTransactionId(gxact->fxid);
if (TransactionIdPrecedes(xid, result))
result = xid;
@@ -2036,15 +2044,12 @@ StandbyRecoverPreparedTransactions(void)
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
- TransactionId xid;
char *buf;
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
Assert(gxact->inredo);
- xid = gxact->xid;
-
- buf = ProcessTwoPhaseBuffer(xid,
+ buf = ProcessTwoPhaseBuffer(gxact->fxid,
gxact->prepare_start_lsn,
gxact->ondisk, true, false);
if (buf != NULL)
@@ -2077,16 +2082,14 @@ RecoverPreparedTransactions(void)
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
- TransactionId xid;
char *buf;
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+ FullTransactionId fxid = gxact->fxid;
char *bufptr;
TwoPhaseFileHeader *hdr;
TransactionId *subxids;
const char *gid;
- xid = gxact->xid;
-
/*
* Reconstruct subtrans state for the transaction --- needed because
* pg_subtrans is not preserved over a restart. Note that we are
@@ -2096,17 +2099,20 @@ RecoverPreparedTransactions(void)
* SubTransSetParent has been set before, if the prepared transaction
* generated xid assignment records.
*/
- buf = ProcessTwoPhaseBuffer(xid,
+ buf = ProcessTwoPhaseBuffer(gxact->fxid,
gxact->prepare_start_lsn,
gxact->ondisk, true, false);
if (buf == NULL)
continue;
ereport(LOG,
- (errmsg("recovering prepared transaction %u from shared memory", xid)));
+ (errmsg("recovering prepared transaction %u of epoch %u from shared memory",
+ XidFromFullTransactionId(gxact->fxid),
+ EpochFromFullTransactionId(gxact->fxid))));
hdr = (TwoPhaseFileHeader *) buf;
- Assert(TransactionIdEquals(hdr->xid, xid));
+ Assert(TransactionIdEquals(hdr->xid,
+ XidFromFullTransactionId(gxact->fxid)));
bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
gid = (const char *) bufptr;
bufptr += MAXALIGN(hdr->gidlen);
@@ -2122,7 +2128,7 @@ RecoverPreparedTransactions(void)
* Recreate its GXACT and dummy PGPROC. But, check whether it was
* added in redo and already has a shmem entry for it.
*/
- MarkAsPreparingGuts(gxact, xid, gid,
+ MarkAsPreparingGuts(gxact, gxact->fxid, gid,
hdr->prepared_at,
hdr->owner, hdr->database);
@@ -2137,7 +2143,7 @@ RecoverPreparedTransactions(void)
/*
* Recover other state (notably locks) using resource managers.
*/
- ProcessRecords(bufptr, xid, twophase_recover_callbacks);
+ ProcessRecords(bufptr, fxid, twophase_recover_callbacks);
/*
* Release locks held by the standby process after we process each
@@ -2145,7 +2151,7 @@ RecoverPreparedTransactions(void)
* additional locks at any one time.
*/
if (InHotStandby)
- StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids);
+ StandbyReleaseLockTree(hdr->xid, hdr->nsubxacts, subxids);
/*
* We're done with recovering this transaction. Clear MyLockedGxact,
@@ -2164,7 +2170,7 @@ RecoverPreparedTransactions(void)
/*
* ProcessTwoPhaseBuffer
*
- * Given a transaction id, read it either from disk or read it directly
+ * Given a FullTransactionId, read it either from disk or read it directly
* via shmem xlog record pointer using the provided "prepare_start_lsn".
*
* If setParent is true, set up subtransaction parent linkages.
@@ -2173,13 +2179,12 @@ RecoverPreparedTransactions(void)
* value scanned.
*/
static char *
-ProcessTwoPhaseBuffer(TransactionId xid,
+ProcessTwoPhaseBuffer(FullTransactionId fxid,
XLogRecPtr prepare_start_lsn,
bool fromdisk,
bool setParent, bool setNextXid)
{
FullTransactionId nextXid = TransamVariables->nextXid;
- TransactionId origNextXid = XidFromFullTransactionId(nextXid);
TransactionId *subxids;
char *buf;
TwoPhaseFileHeader *hdr;
@@ -2191,41 +2196,46 @@ ProcessTwoPhaseBuffer(TransactionId xid,
Assert(prepare_start_lsn != InvalidXLogRecPtr);
/* Already processed? */
- if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+ if (TransactionIdDidCommit(XidFromFullTransactionId(fxid)) ||
+ TransactionIdDidAbort(XidFromFullTransactionId(fxid)))
{
if (fromdisk)
{
ereport(WARNING,
- (errmsg("removing stale two-phase state file for transaction %u",
- xid)));
- RemoveTwoPhaseFile(xid, true);
+ (errmsg("removing stale two-phase state file for transaction %u of epoch %u",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid))));
+ RemoveTwoPhaseFile(fxid, true);
}
else
{
ereport(WARNING,
- (errmsg("removing stale two-phase state from memory for transaction %u",
- xid)));
- PrepareRedoRemove(xid, true);
+ (errmsg("removing stale two-phase state from memory for transaction %u of epoch %u",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid))));
+ PrepareRedoRemoveFull(fxid, true);
}
return NULL;
}
/* Reject XID if too new */
- if (TransactionIdFollowsOrEquals(xid, origNextXid))
+ if (FullTransactionIdFollowsOrEquals(fxid, nextXid))
{
if (fromdisk)
{
ereport(WARNING,
- (errmsg("removing future two-phase state file for transaction %u",
- xid)));
- RemoveTwoPhaseFile(xid, true);
+ (errmsg("removing future two-phase state file for transaction %u of epoch %u",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid))));
+ RemoveTwoPhaseFile(fxid, true);
}
else
{
ereport(WARNING,
- (errmsg("removing future two-phase state from memory for transaction %u",
- xid)));
- PrepareRedoRemove(xid, true);
+ (errmsg("removing future two-phase state from memory for transaction %u of epoch %u",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid))));
+ PrepareRedoRemoveFull(fxid, true);
}
return NULL;
}
@@ -2233,7 +2243,7 @@ ProcessTwoPhaseBuffer(TransactionId xid,
if (fromdisk)
{
/* Read and validate file */
- buf = ReadTwoPhaseFile(xid, false);
+ buf = ReadTwoPhaseFile(fxid, false);
}
else
{
@@ -2243,18 +2253,20 @@ ProcessTwoPhaseBuffer(TransactionId xid,
/* Deconstruct header */
hdr = (TwoPhaseFileHeader *) buf;
- if (!TransactionIdEquals(hdr->xid, xid))
+ if (!TransactionIdEquals(hdr->xid, XidFromFullTransactionId(fxid)))
{
if (fromdisk)
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("corrupted two-phase state file for transaction %u",
- xid)));
+ errmsg("corrupted two-phase state file for transaction %u of epoch %u",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid))));
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("corrupted two-phase state in memory for transaction %u",
- xid)));
+ errmsg("corrupted two-phase state in memory for transaction %u of epoch %u",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid))));
}
/*
@@ -2268,14 +2280,14 @@ ProcessTwoPhaseBuffer(TransactionId xid,
{
TransactionId subxid = subxids[i];
- Assert(TransactionIdFollows(subxid, xid));
+ Assert(TransactionIdFollows(subxid, XidFromFullTransactionId(fxid)));
/* update nextXid if needed */
if (setNextXid)
AdvanceNextFullTransactionIdPastXid(subxid);
if (setParent)
- SubTransSetParent(subxid, xid);
+ SubTransSetParent(subxid, XidFromFullTransactionId(fxid));
}
return buf;
@@ -2466,8 +2478,9 @@ RecordTransactionAbortPrepared(TransactionId xid,
* data, the entry is marked as located on disk.
*/
void
-PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
- XLogRecPtr end_lsn, RepOriginId origin_id)
+PrepareRedoAdd(FullTransactionId fxid, char *buf,
+ XLogRecPtr start_lsn, XLogRecPtr end_lsn,
+ RepOriginId origin_id)
{
TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf;
char *bufptr;
@@ -2477,6 +2490,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
Assert(RecoveryInProgress());
+ if (!FullTransactionIdIsValid(fxid))
+ {
+ Assert(InRecovery);
+ fxid = FullTransactionIdFromAllowableAt(TransamVariables->nextXid,
+ hdr->xid);
+ }
+
bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
gid = (const char *) bufptr;
@@ -2505,14 +2525,15 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
{
char path[MAXPGPATH];
- TwoPhaseFilePath(path, hdr->xid);
+ Assert(InRecovery);
+ TwoPhaseFilePath(path, fxid);
if (access(path, F_OK) == 0)
{
ereport(reachedConsistency ? ERROR : WARNING,
(errmsg("could not recover two-phase state file for transaction %u",
hdr->xid),
- errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.",
+ errdetail("Two-phase state file has been found in WAL record %X/%08X, but this transaction has already been restored from disk.",
LSN_FORMAT_ARGS(start_lsn))));
return;
}
@@ -2536,7 +2557,7 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
gxact->prepared_at = hdr->prepared_at;
gxact->prepare_start_lsn = start_lsn;
gxact->prepare_end_lsn = end_lsn;
- gxact->xid = hdr->xid;
+ gxact->fxid = fxid;
gxact->owner = hdr->owner;
gxact->locking_backend = INVALID_PROC_NUMBER;
gxact->valid = false;
@@ -2555,11 +2576,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
false /* backward */ , false /* WAL */ );
}
- elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid);
+ elog(DEBUG2, "added 2PC data in shared memory for transaction %u of epoch %u",
+ XidFromFullTransactionId(gxact->fxid),
+ EpochFromFullTransactionId(gxact->fxid));
}
/*
- * PrepareRedoRemove
+ * PrepareRedoRemoveFull
*
* Remove the corresponding gxact entry from TwoPhaseState. Also remove
* the 2PC file if a prepared transaction was saved via an earlier checkpoint.
@@ -2567,8 +2590,8 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
* Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState
* is updated.
*/
-void
-PrepareRedoRemove(TransactionId xid, bool giveWarning)
+static void
+PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning)
{
GlobalTransaction gxact = NULL;
int i;
@@ -2581,7 +2604,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
{
gxact = TwoPhaseState->prepXacts[i];
- if (gxact->xid == xid)
+ if (FullTransactionIdEquals(gxact->fxid, fxid))
{
Assert(gxact->inredo);
found = true;
@@ -2598,13 +2621,29 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
/*
* And now we can clean up any files we may have left.
*/
- elog(DEBUG2, "removing 2PC data for transaction %u", xid);
+ elog(DEBUG2, "removing 2PC data for transaction %u of epoch %u ",
+ XidFromFullTransactionId(fxid),
+ EpochFromFullTransactionId(fxid));
+
if (gxact->ondisk)
- RemoveTwoPhaseFile(xid, giveWarning);
+ RemoveTwoPhaseFile(fxid, giveWarning);
+
RemoveGXact(gxact);
}
/*
+ * PrepareRedoRemove
+ *
+ * Wrapper of PrepareRedoRemoveFull(), for TransactionIds.
+ *
+ * Builds a FullTransactionId from "xid" by calling
+ * FullTransactionIdFromAllowableAt() with TransamVariables->nextXid as the
+ * reference point, then hands it to PrepareRedoRemoveFull() together with
+ * "giveWarning", which is passed through unchanged.
+ *
+ * NOTE(review): no lock is taken here, so the TwoPhaseStateLock requirement
+ * documented on PrepareRedoRemoveFull() presumably applies to callers of this
+ * wrapper as well -- confirm against the call sites.
+ */
+void
+PrepareRedoRemove(TransactionId xid, bool giveWarning)
+{
+ FullTransactionId fxid =
+ FullTransactionIdFromAllowableAt(TransamVariables->nextXid, xid);
+
+ PrepareRedoRemoveFull(fxid, giveWarning);
+}
+
+/*
* LookupGXact
* Check if the prepared transaction with the given GID, lsn and timestamp
* exists.
@@ -2648,7 +2687,7 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
* between publisher and subscriber.
*/
if (gxact->ondisk)
- buf = ReadTwoPhaseFile(gxact->xid, false);
+ buf = ReadTwoPhaseFile(gxact->fxid, false);
else
{
Assert(gxact->prepare_start_lsn);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index b885513f765..41601fcb280 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2515,7 +2515,7 @@ static void
PrepareTransaction(void)
{
TransactionState s = CurrentTransactionState;
- TransactionId xid = GetCurrentTransactionId();
+ FullTransactionId fxid = GetCurrentFullTransactionId();
GlobalTransaction gxact;
TimestampTz prepared_at;
@@ -2644,7 +2644,7 @@ PrepareTransaction(void)
* Reserve the GID for this transaction. This could fail if the requested
* GID is invalid or already in use.
*/
- gxact = MarkAsPreparing(xid, prepareGID, prepared_at,
+ gxact = MarkAsPreparing(fxid, prepareGID, prepared_at,
GetUserId(), MyDatabaseId);
prepareGID = NULL;
@@ -2694,7 +2694,7 @@ PrepareTransaction(void)
* ProcArrayClearTransaction(). Otherwise, a GetLockConflicts() would
* conclude "xact already committed or aborted" for our locks.
*/
- PostPrepare_Locks(xid);
+ PostPrepare_Locks(fxid);
/*
* Let others know about no transaction in progress by me. This has to be
@@ -2738,9 +2738,9 @@ PrepareTransaction(void)
PostPrepare_smgr();
- PostPrepare_MultiXact(xid);
+ PostPrepare_MultiXact(fxid);
- PostPrepare_PredicateLocks(xid);
+ PostPrepare_PredicateLocks(fxid);
ResourceOwnerRelease(TopTransactionResourceOwner,
RESOURCE_RELEASE_LOCKS,
@@ -6420,7 +6420,8 @@ xact_redo(XLogReaderState *record)
* gxact entry.
*/
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
- PrepareRedoAdd(XLogRecGetData(record),
+ PrepareRedoAdd(InvalidFullTransactionId,
+ XLogRecGetData(record),
record->ReadRecPtr,
record->EndRecPtr,
XLogRecGetOrigin(record));
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1914859b2ee..a8cc6402d62 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1028,7 +1028,7 @@ XLogInsertRecord(XLogRecData *rdata,
oldCxt = MemoryContextSwitchTo(walDebugCxt);
initStringInfo(&buf);
- appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
+ appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos));
/*
* We have to piece together the WAL record data from the XLogRecData
@@ -1549,8 +1549,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
if (upto > reservedUpto)
{
ereport(LOG,
- (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
- LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
+ errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X",
+ LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)));
upto = reservedUpto;
}
@@ -1716,7 +1716,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
if (expectedEndPtr != endptr)
- elog(PANIC, "could not find WAL buffer for %X/%X",
+ elog(PANIC, "could not find WAL buffer for %X/%08X",
LSN_FORMAT_ARGS(ptr));
}
else
@@ -1776,7 +1776,7 @@ WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
if (startptr + count > inserted)
ereport(ERROR,
- errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X",
+ errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X",
LSN_FORMAT_ARGS(startptr + count),
LSN_FORMAT_ARGS(inserted)));
@@ -2281,7 +2281,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
#ifdef WAL_DEBUG
if (XLOG_DEBUG && npages > 0)
{
- elog(DEBUG1, "initialized %d pages, up to %X/%X",
+ elog(DEBUG1, "initialized %d pages, up to %X/%08X",
npages, LSN_FORMAT_ARGS(NewPageEndPtr));
}
#endif
@@ -2492,7 +2492,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
XLogRecPtr EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
if (LogwrtResult.Write >= EndPtr)
- elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
+ elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X",
LSN_FORMAT_ARGS(LogwrtResult.Write),
LSN_FORMAT_ARGS(EndPtr));
@@ -2892,7 +2892,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
if (!force && newMinRecoveryPoint < lsn)
elog(WARNING,
- "xlog min recovery request %X/%X is past current point %X/%X",
+ "xlog min recovery request %X/%08X is past current point %X/%08X",
LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
/* update control file */
@@ -2905,9 +2905,9 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
ereport(DEBUG2,
- (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
- LSN_FORMAT_ARGS(newMinRecoveryPoint),
- newMinRecoveryPointTLI)));
+ errmsg_internal("updated min recovery point to %X/%08X on timeline %u",
+ LSN_FORMAT_ARGS(newMinRecoveryPoint),
+ newMinRecoveryPointTLI));
}
}
LWLockRelease(ControlFileLock);
@@ -2945,7 +2945,7 @@ XLogFlush(XLogRecPtr record)
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
- elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
+ elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X",
LSN_FORMAT_ARGS(record),
LSN_FORMAT_ARGS(LogwrtResult.Write),
LSN_FORMAT_ARGS(LogwrtResult.Flush));
@@ -3078,7 +3078,7 @@ XLogFlush(XLogRecPtr record)
*/
if (LogwrtResult.Flush < record)
elog(ERROR,
- "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
+ "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X",
LSN_FORMAT_ARGS(record),
LSN_FORMAT_ARGS(LogwrtResult.Flush));
}
@@ -3205,7 +3205,7 @@ XLogBackgroundFlush(void)
#ifdef WAL_DEBUG
if (XLOG_DEBUG)
- elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
+ elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X",
LSN_FORMAT_ARGS(WriteRqst.Write),
LSN_FORMAT_ARGS(WriteRqst.Flush),
LSN_FORMAT_ARGS(LogwrtResult.Write),
@@ -6921,7 +6921,7 @@ LogCheckpointEnd(bool restartpoint)
"%d removed, %d recycled; write=%ld.%03d s, "
"sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
"longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
- "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
+ "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_slru_written,
@@ -6945,7 +6945,7 @@ LogCheckpointEnd(bool restartpoint)
"%d removed, %d recycled; write=%ld.%03d s, "
"sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
"longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
- "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
+ "estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
CheckpointStats.ckpt_bufs_written,
(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
CheckpointStats.ckpt_slru_written,
@@ -7498,6 +7498,10 @@ CreateCheckPoint(int flags)
if (PriorRedoPtr != InvalidXLogRecPtr)
UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
+#ifdef USE_INJECTION_POINTS
+ INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
+#endif
+
/*
* Delete old log files, those no longer needed for last checkpoint to
* prevent the disk holding the xlog from growing full.
@@ -7637,7 +7641,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
if (!RecoveryInProgress())
elog(ERROR, "can only be used at end of recovery");
if (pagePtr % XLOG_BLCKSZ != 0)
- elog(ERROR, "invalid position for missing continuation record %X/%X",
+ elog(ERROR, "invalid position for missing continuation record %X/%08X",
LSN_FORMAT_ARGS(pagePtr));
/* The current WAL insert position should be right after the page header */
@@ -7648,7 +7652,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
startPos += SizeOfXLogShortPHD;
recptr = GetXLogInsertRecPtr();
if (recptr != startPos)
- elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
+ elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD",
LSN_FORMAT_ARGS(recptr));
START_CRIT_SECTION();
@@ -7678,7 +7682,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
/* check that the record was inserted to the right place */
if (ProcLastRecPtr != startPos)
- elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
+ elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X",
LSN_FORMAT_ARGS(ProcLastRecPtr));
XLogFlush(recptr);
@@ -7747,8 +7751,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
if (XLogHaveInvalidPages())
{
elog(DEBUG2,
- "could not record restart point at %X/%X because there "
- "are unresolved references to invalid pages",
+ "could not record restart point at %X/%08X because there are unresolved references to invalid pages",
LSN_FORMAT_ARGS(checkPoint->redo));
return;
}
@@ -7828,8 +7831,8 @@ CreateRestartPoint(int flags)
lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
{
ereport(DEBUG2,
- (errmsg_internal("skipping restartpoint, already performed at %X/%X",
- LSN_FORMAT_ARGS(lastCheckPoint.redo))));
+ errmsg_internal("skipping restartpoint, already performed at %X/%08X",
+ LSN_FORMAT_ARGS(lastCheckPoint.redo)));
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
if (flags & CHECKPOINT_IS_SHUTDOWN)
@@ -8013,10 +8016,10 @@ CreateRestartPoint(int flags)
xtime = GetLatestXTime();
ereport((log_checkpoints ? LOG : DEBUG2),
- (errmsg("recovery restart point at %X/%X",
- LSN_FORMAT_ARGS(lastCheckPoint.redo)),
- xtime ? errdetail("Last completed transaction was at log time %s.",
- timestamptz_to_str(xtime)) : 0));
+ errmsg("recovery restart point at %X/%08X",
+ LSN_FORMAT_ARGS(lastCheckPoint.redo)),
+ xtime ? errdetail("Last completed transaction was at log time %s.",
+ timestamptz_to_str(xtime)) : 0);
/*
* Finally, execute archive_cleanup_command, if any.
@@ -8277,8 +8280,8 @@ XLogRestorePoint(const char *rpName)
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
ereport(LOG,
- (errmsg("restore point \"%s\" created at %X/%X",
- rpName, LSN_FORMAT_ARGS(RecPtr))));
+ errmsg("restore point \"%s\" created at %X/%08X",
+ rpName, LSN_FORMAT_ARGS(RecPtr)));
return RecPtr;
}
diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c
index 342590e0a46..cda4b38b7d6 100644
--- a/src/backend/access/transam/xlogbackup.c
+++ b/src/backend/access/transam/xlogbackup.c
@@ -42,7 +42,7 @@ build_backup_content(BackupState *state, bool ishistoryfile)
XLByteToSeg(state->startpoint, startsegno, wal_segment_size);
XLogFileName(startxlogfile, state->starttli, startsegno, wal_segment_size);
- appendStringInfo(result, "START WAL LOCATION: %X/%X (file %s)\n",
+ appendStringInfo(result, "START WAL LOCATION: %X/%08X (file %s)\n",
LSN_FORMAT_ARGS(state->startpoint), startxlogfile);
if (ishistoryfile)
@@ -52,11 +52,11 @@ build_backup_content(BackupState *state, bool ishistoryfile)
XLByteToSeg(state->stoppoint, stopsegno, wal_segment_size);
XLogFileName(stopxlogfile, state->stoptli, stopsegno, wal_segment_size);
- appendStringInfo(result, "STOP WAL LOCATION: %X/%X (file %s)\n",
+ appendStringInfo(result, "STOP WAL LOCATION: %X/%08X (file %s)\n",
LSN_FORMAT_ARGS(state->stoppoint), stopxlogfile);
}
- appendStringInfo(result, "CHECKPOINT LOCATION: %X/%X\n",
+ appendStringInfo(result, "CHECKPOINT LOCATION: %X/%08X\n",
LSN_FORMAT_ARGS(state->checkpointloc));
appendStringInfoString(result, "BACKUP METHOD: streamed\n");
appendStringInfo(result, "BACKUP FROM: %s\n",
@@ -81,7 +81,7 @@ build_backup_content(BackupState *state, bool ishistoryfile)
Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0));
if (!XLogRecPtrIsInvalid(state->istartpoint))
{
- appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n",
+ appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%08X\n",
LSN_FORMAT_ARGS(state->istartpoint));
appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n",
state->istarttli);
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 7735562db01..ed3aacabc98 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -546,7 +546,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing all readahead until %X/%X is replayed due to possible TLI change",
+ "suppressing all readahead until %X/%08X is replayed due to possible TLI change",
LSN_FORMAT_ARGS(record->lsn));
#endif
@@ -579,7 +579,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy",
+ "suppressing prefetch in database %u until %X/%08X is replayed due to raw file copy",
rlocator.dbOid,
LSN_FORMAT_ARGS(record->lsn));
#endif
@@ -607,7 +607,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation",
+ "suppressing prefetch in relation %u/%u/%u until %X/%08X is replayed, which creates the relation",
xlrec->rlocator.spcOid,
xlrec->rlocator.dbOid,
xlrec->rlocator.relNumber,
@@ -630,7 +630,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation",
+ "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, which truncates the relation",
xlrec->rlocator.spcOid,
xlrec->rlocator.dbOid,
xlrec->rlocator.relNumber,
@@ -729,7 +729,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk",
+ "suppressing all prefetch in relation %u/%u/%u until %X/%08X is replayed, because the relation does not exist on disk",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
@@ -750,7 +750,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small",
+ "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, because the relation is too small",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
@@ -928,7 +928,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
+ "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (blocks >= %u filtered)",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
LSN_FORMAT_ARGS(filter->filter_until_replayed),
filter->filter_from_block);
@@ -944,7 +944,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
+ "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (whole database)",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
LSN_FORMAT_ARGS(filter->filter_until_replayed));
#endif
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 2790ade1f91..ac1f801b1eb 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -617,7 +617,7 @@ restart:
}
else if (targetRecOff < pageHeaderSize)
{
- report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u",
+ report_invalid_record(state, "invalid record offset at %X/%08X: expected at least %u, got %u",
LSN_FORMAT_ARGS(RecPtr),
pageHeaderSize, targetRecOff);
goto err;
@@ -626,7 +626,7 @@ restart:
if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
targetRecOff == pageHeaderSize)
{
- report_invalid_record(state, "contrecord is requested by %X/%X",
+ report_invalid_record(state, "contrecord is requested by %X/%08X",
LSN_FORMAT_ARGS(RecPtr));
goto err;
}
@@ -667,7 +667,7 @@ restart:
if (total_len < SizeOfXLogRecord)
{
report_invalid_record(state,
- "invalid record length at %X/%X: expected at least %u, got %u",
+ "invalid record length at %X/%08X: expected at least %u, got %u",
LSN_FORMAT_ARGS(RecPtr),
(uint32) SizeOfXLogRecord, total_len);
goto err;
@@ -756,7 +756,7 @@ restart:
if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD))
{
report_invalid_record(state,
- "there is no contrecord flag at %X/%X",
+ "there is no contrecord flag at %X/%08X",
LSN_FORMAT_ARGS(RecPtr));
goto err;
}
@@ -769,7 +769,7 @@ restart:
total_len != (pageHeader->xlp_rem_len + gotlen))
{
report_invalid_record(state,
- "invalid contrecord length %u (expected %lld) at %X/%X",
+ "invalid contrecord length %u (expected %lld) at %X/%08X",
pageHeader->xlp_rem_len,
((long long) total_len) - gotlen,
LSN_FORMAT_ARGS(RecPtr));
@@ -1132,7 +1132,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
if (record->xl_tot_len < SizeOfXLogRecord)
{
report_invalid_record(state,
- "invalid record length at %X/%X: expected at least %u, got %u",
+ "invalid record length at %X/%08X: expected at least %u, got %u",
LSN_FORMAT_ARGS(RecPtr),
(uint32) SizeOfXLogRecord, record->xl_tot_len);
return false;
@@ -1140,7 +1140,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
if (!RmgrIdIsValid(record->xl_rmid))
{
report_invalid_record(state,
- "invalid resource manager ID %u at %X/%X",
+ "invalid resource manager ID %u at %X/%08X",
record->xl_rmid, LSN_FORMAT_ARGS(RecPtr));
return false;
}
@@ -1153,7 +1153,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
if (!(record->xl_prev < RecPtr))
{
report_invalid_record(state,
- "record with incorrect prev-link %X/%X at %X/%X",
+ "record with incorrect prev-link %X/%08X at %X/%08X",
LSN_FORMAT_ARGS(record->xl_prev),
LSN_FORMAT_ARGS(RecPtr));
return false;
@@ -1169,7 +1169,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
if (record->xl_prev != PrevRecPtr)
{
report_invalid_record(state,
- "record with incorrect prev-link %X/%X at %X/%X",
+ "record with incorrect prev-link %X/%08X at %X/%08X",
LSN_FORMAT_ARGS(record->xl_prev),
LSN_FORMAT_ARGS(RecPtr));
return false;
@@ -1207,7 +1207,7 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr)
if (!EQ_CRC32C(record->xl_crc, crc))
{
report_invalid_record(state,
- "incorrect resource manager data checksum in record at %X/%X",
+ "incorrect resource manager data checksum in record at %X/%08X",
LSN_FORMAT_ARGS(recptr));
return false;
}
@@ -1241,7 +1241,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
report_invalid_record(state,
- "invalid magic number %04X in WAL segment %s, LSN %X/%X, offset %u",
+ "invalid magic number %04X in WAL segment %s, LSN %X/%08X, offset %u",
hdr->xlp_magic,
fname,
LSN_FORMAT_ARGS(recptr),
@@ -1256,7 +1256,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
report_invalid_record(state,
- "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u",
+ "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u",
hdr->xlp_info,
fname,
LSN_FORMAT_ARGS(recptr),
@@ -1298,7 +1298,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
/* hmm, first page of file doesn't have a long header? */
report_invalid_record(state,
- "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u",
+ "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u",
hdr->xlp_info,
fname,
LSN_FORMAT_ARGS(recptr),
@@ -1318,7 +1318,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
report_invalid_record(state,
- "unexpected pageaddr %X/%X in WAL segment %s, LSN %X/%X, offset %u",
+ "unexpected pageaddr %X/%08X in WAL segment %s, LSN %X/%08X, offset %u",
LSN_FORMAT_ARGS(hdr->xlp_pageaddr),
fname,
LSN_FORMAT_ARGS(recptr),
@@ -1344,7 +1344,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
report_invalid_record(state,
- "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%X, offset %u",
+ "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%08X, offset %u",
hdr->xlp_tli,
state->latestPageTLI,
fname,
@@ -1756,7 +1756,7 @@ DecodeXLogRecord(XLogReaderState *state,
if (block_id <= decoded->max_block_id)
{
report_invalid_record(state,
- "out-of-order block_id %u at %X/%X",
+ "out-of-order block_id %u at %X/%08X",
block_id,
LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
@@ -1780,14 +1780,14 @@ DecodeXLogRecord(XLogReaderState *state,
if (blk->has_data && blk->data_len == 0)
{
report_invalid_record(state,
- "BKPBLOCK_HAS_DATA set, but no data included at %X/%X",
+ "BKPBLOCK_HAS_DATA set, but no data included at %X/%08X",
LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
}
if (!blk->has_data && blk->data_len != 0)
{
report_invalid_record(state,
- "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
+ "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%08X",
(unsigned int) blk->data_len,
LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
@@ -1823,7 +1823,7 @@ DecodeXLogRecord(XLogReaderState *state,
blk->bimg_len == BLCKSZ))
{
report_invalid_record(state,
- "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
+ "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%08X",
(unsigned int) blk->hole_offset,
(unsigned int) blk->hole_length,
(unsigned int) blk->bimg_len,
@@ -1839,7 +1839,7 @@ DecodeXLogRecord(XLogReaderState *state,
(blk->hole_offset != 0 || blk->hole_length != 0))
{
report_invalid_record(state,
- "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
+ "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%08X",
(unsigned int) blk->hole_offset,
(unsigned int) blk->hole_length,
LSN_FORMAT_ARGS(state->ReadRecPtr));
@@ -1853,7 +1853,7 @@ DecodeXLogRecord(XLogReaderState *state,
blk->bimg_len == BLCKSZ)
{
report_invalid_record(state,
- "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X",
+ "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%08X",
(unsigned int) blk->bimg_len,
LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
@@ -1868,7 +1868,7 @@ DecodeXLogRecord(XLogReaderState *state,
blk->bimg_len != BLCKSZ)
{
report_invalid_record(state,
- "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X",
+ "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%08X",
(unsigned int) blk->data_len,
LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
@@ -1884,7 +1884,7 @@ DecodeXLogRecord(XLogReaderState *state,
if (rlocator == NULL)
{
report_invalid_record(state,
- "BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
+ "BKPBLOCK_SAME_REL set but no previous rel at %X/%08X",
LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
}
@@ -1896,7 +1896,7 @@ DecodeXLogRecord(XLogReaderState *state,
else
{
report_invalid_record(state,
- "invalid block_id %u at %X/%X",
+ "invalid block_id %u at %X/%08X",
block_id, LSN_FORMAT_ARGS(state->ReadRecPtr));
goto err;
}
@@ -1963,7 +1963,7 @@ DecodeXLogRecord(XLogReaderState *state,
shortdata_err:
report_invalid_record(state,
- "record with invalid length at %X/%X",
+ "record with invalid length at %X/%08X",
LSN_FORMAT_ARGS(state->ReadRecPtr));
err:
*errormsg = state->errormsg_buf;
@@ -2073,14 +2073,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
!record->record->blocks[block_id].in_use)
{
report_invalid_record(record,
- "could not restore image at %X/%X with invalid block %d specified",
+ "could not restore image at %X/%08X with invalid block %d specified",
LSN_FORMAT_ARGS(record->ReadRecPtr),
block_id);
return false;
}
if (!record->record->blocks[block_id].has_image)
{
- report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d",
+ report_invalid_record(record, "could not restore image at %X/%08X with invalid state, block %d",
LSN_FORMAT_ARGS(record->ReadRecPtr),
block_id);
return false;
@@ -2107,7 +2107,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0)
decomp_success = false;
#else
- report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d",
+ report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d",
LSN_FORMAT_ARGS(record->ReadRecPtr),
"LZ4",
block_id);
@@ -2124,7 +2124,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
if (ZSTD_isError(decomp_result))
decomp_success = false;
#else
- report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d",
+ report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d",
LSN_FORMAT_ARGS(record->ReadRecPtr),
"zstd",
block_id);
@@ -2133,7 +2133,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
}
else
{
- report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d",
+ report_invalid_record(record, "could not restore image at %X/%08X compressed with unknown method, block %d",
LSN_FORMAT_ARGS(record->ReadRecPtr),
block_id);
return false;
@@ -2141,7 +2141,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
if (!decomp_success)
{
- report_invalid_record(record, "could not decompress image at %X/%X, block %d",
+ report_invalid_record(record, "could not decompress image at %X/%08X, block %d",
LSN_FORMAT_ARGS(record->ReadRecPtr),
block_id);
return false;
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 6ce979f2d8b..23878b2dd91 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -620,10 +620,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* than ControlFile->checkPoint is used.
*/
ereport(LOG,
- (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
- LSN_FORMAT_ARGS(RedoStartLSN),
- LSN_FORMAT_ARGS(CheckPointLoc),
- CheckPointTLI)));
+ errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
+ LSN_FORMAT_ARGS(RedoStartLSN),
+ LSN_FORMAT_ARGS(CheckPointLoc),
+ CheckPointTLI));
/*
* When a backup_label file is present, we want to roll forward from
@@ -636,8 +636,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
ereport(DEBUG1,
- (errmsg_internal("checkpoint record is at %X/%X",
- LSN_FORMAT_ARGS(CheckPointLoc))));
+ errmsg_internal("checkpoint record is at %X/%08X",
+ LSN_FORMAT_ARGS(CheckPointLoc)));
InRecovery = true; /* force recovery even if SHUTDOWNED */
/*
@@ -652,23 +652,23 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
if (!ReadRecord(xlogprefetcher, LOG, false,
checkPoint.ThisTimeLineID))
ereport(FATAL,
- (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
- LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
- errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
- "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
- "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
- DataDir, DataDir, DataDir, DataDir)));
+ errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
+ LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
+ errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
+ "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+ "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+ DataDir, DataDir, DataDir, DataDir));
}
}
else
{
ereport(FATAL,
- (errmsg("could not locate required checkpoint record at %X/%X",
- LSN_FORMAT_ARGS(CheckPointLoc)),
- errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
- "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
- "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
- DataDir, DataDir, DataDir, DataDir)));
+ errmsg("could not locate required checkpoint record at %X/%08X",
+ LSN_FORMAT_ARGS(CheckPointLoc)),
+ errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
+ "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+ "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+ DataDir, DataDir, DataDir, DataDir));
wasShutdown = false; /* keep compiler quiet */
}
@@ -773,8 +773,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
*/
if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
ereport(LOG,
- (errmsg("restarting backup recovery with redo LSN %X/%X",
- LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
+ errmsg("restarting backup recovery with redo LSN %X/%08X",
+ LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
/* Get the last valid checkpoint record. */
CheckPointLoc = ControlFile->checkPoint;
@@ -786,8 +786,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
if (record != NULL)
{
ereport(DEBUG1,
- (errmsg_internal("checkpoint record is at %X/%X",
- LSN_FORMAT_ARGS(CheckPointLoc))));
+ errmsg_internal("checkpoint record is at %X/%08X",
+ LSN_FORMAT_ARGS(CheckPointLoc)));
}
else
{
@@ -798,8 +798,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* simplify processing around checkpoints.
*/
ereport(PANIC,
- (errmsg("could not locate a valid checkpoint record at %X/%X",
- LSN_FORMAT_ARGS(CheckPointLoc))));
+ errmsg("could not locate a valid checkpoint record at %X/%08X",
+ LSN_FORMAT_ARGS(CheckPointLoc)));
}
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
@@ -824,8 +824,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
recoveryTargetName)));
else if (recoveryTarget == RECOVERY_TARGET_LSN)
ereport(LOG,
- (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
- LSN_FORMAT_ARGS(recoveryTargetLSN))));
+ errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
+ LSN_FORMAT_ARGS(recoveryTargetLSN)));
else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
ereport(LOG,
(errmsg("starting point-in-time recovery to earliest consistent point")));
@@ -855,7 +855,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
(errmsg("requested timeline %u is not a child of this server's history",
recoveryTargetTLI),
/* translator: %s is a backup_label file or a pg_control file */
- errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
+ errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
haveBackupLabel ? "backup_label" : "pg_control",
LSN_FORMAT_ARGS(CheckPointLoc),
CheckPointTLI,
@@ -870,15 +870,15 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
ControlFile->minRecoveryPointTLI)
ereport(FATAL,
- (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
- recoveryTargetTLI,
- LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
- ControlFile->minRecoveryPointTLI)));
+ errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
+ recoveryTargetTLI,
+ LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
+ ControlFile->minRecoveryPointTLI));
ereport(DEBUG1,
- (errmsg_internal("redo record is at %X/%X; shutdown %s",
- LSN_FORMAT_ARGS(checkPoint.redo),
- wasShutdown ? "true" : "false")));
+ errmsg_internal("redo record is at %X/%08X; shutdown %s",
+ LSN_FORMAT_ARGS(checkPoint.redo),
+ wasShutdown ? "true" : "false"));
ereport(DEBUG1,
(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
U64FromFullTransactionId(checkPoint.nextXid),
@@ -1253,14 +1253,14 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
* is pretty crude, but we are not expecting any variability in the file
* format).
*/
- if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
+ if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
&hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
ereport(FATAL,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
RedoStartLSN = ((uint64) hi) << 32 | lo;
RedoStartTLI = tli_from_walseg;
- if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
+ if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
&hi, &lo, &ch) != 3 || ch != '\n')
ereport(FATAL,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -1332,7 +1332,7 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
tli_from_file, BACKUP_LABEL_FILE)));
}
- if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
+ if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
ereport(FATAL,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("this is an incremental backup, not a data directory"),
@@ -1722,8 +1722,8 @@ PerformWalRecovery(void)
if (record->xl_rmid != RM_XLOG_ID ||
(record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
ereport(FATAL,
- (errmsg("unexpected record type found at redo point %X/%X",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+ errmsg("unexpected record type found at redo point %X/%08X",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
}
else
{
@@ -1745,8 +1745,8 @@ PerformWalRecovery(void)
RmgrStartup();
ereport(LOG,
- (errmsg("redo starts at %X/%X",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+ errmsg("redo starts at %X/%08X",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
/* Prepare to report progress of the redo phase. */
if (!StandbyMode)
@@ -1758,7 +1758,7 @@ PerformWalRecovery(void)
do
{
if (!StandbyMode)
- ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
+ ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
#ifdef WAL_DEBUG
@@ -1767,7 +1767,7 @@ PerformWalRecovery(void)
StringInfoData buf;
initStringInfo(&buf);
- appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+ appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
xlog_outrec(&buf, xlogreader);
@@ -1880,9 +1880,9 @@ PerformWalRecovery(void)
RmgrCleanup();
ereport(LOG,
- (errmsg("redo done at %X/%X system usage: %s",
- LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
- pg_rusage_show(&ru0))));
+ errmsg("redo done at %X/%08X system usage: %s",
+ LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
+ pg_rusage_show(&ru0)));
xtime = GetLatestXTime();
if (xtime)
ereport(LOG,
@@ -2092,7 +2092,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
- elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
+ elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
LSN_FORMAT_ARGS(record->overwrittenRecPtr));
@@ -2101,9 +2101,9 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
missingContrecPtr = InvalidXLogRecPtr;
ereport(LOG,
- (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
- LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
- timestamptz_to_str(xlrec.overwrite_time))));
+ errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
+ LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
+ timestamptz_to_str(xlrec.overwrite_time)));
/* Verifying the record should only happen once */
record->overwrittenRecPtr = InvalidXLogRecPtr;
@@ -2129,7 +2129,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
backupEndPoint = lsn;
}
else
- elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
+ elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
}
}
@@ -2224,9 +2224,9 @@ CheckRecoveryConsistency(void)
backupEndRequired = false;
ereport(LOG,
- (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
- LSN_FORMAT_ARGS(saveBackupStartPoint),
- LSN_FORMAT_ARGS(saveBackupEndPoint))));
+ errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
+ LSN_FORMAT_ARGS(saveBackupStartPoint),
+ LSN_FORMAT_ARGS(saveBackupEndPoint)));
}
/*
@@ -2255,8 +2255,8 @@ CheckRecoveryConsistency(void)
reachedConsistency = true;
SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
ereport(LOG,
- (errmsg("consistent recovery state reached at %X/%X",
- LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
+ errmsg("consistent recovery state reached at %X/%08X",
+ LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
}
/*
@@ -2293,7 +2293,7 @@ rm_redo_error_callback(void *arg)
xlog_block_info(&buf, record);
/* translator: %s is a WAL record description */
- errcontext("WAL redo at %X/%X for %s",
+ errcontext("WAL redo at %X/%08X for %s",
LSN_FORMAT_ARGS(record->ReadRecPtr),
buf.data);
@@ -2328,7 +2328,7 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record)
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
- appendStringInfo(buf, "prev %X/%X; xid %u",
+ appendStringInfo(buf, "prev %X/%08X; xid %u",
LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
XLogRecGetXid(record));
@@ -2416,10 +2416,10 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
lsn < minRecoveryPoint &&
newTLI > minRecoveryPointTLI)
ereport(PANIC,
- (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
- newTLI,
- LSN_FORMAT_ARGS(minRecoveryPoint),
- minRecoveryPointTLI)));
+ errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
+ newTLI,
+ LSN_FORMAT_ARGS(minRecoveryPoint),
+ minRecoveryPointTLI));
/* Looks good */
}
@@ -2621,8 +2621,8 @@ recoveryStopsBefore(XLogReaderState *record)
recoveryStopTime = 0;
recoveryStopName[0] = '\0';
ereport(LOG,
- (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
- LSN_FORMAT_ARGS(recoveryStopLSN))));
+ errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
+ LSN_FORMAT_ARGS(recoveryStopLSN)));
return true;
}
@@ -2789,8 +2789,8 @@ recoveryStopsAfter(XLogReaderState *record)
recoveryStopTime = 0;
recoveryStopName[0] = '\0';
ereport(LOG,
- (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
- LSN_FORMAT_ARGS(recoveryStopLSN))));
+ errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
+ LSN_FORMAT_ARGS(recoveryStopLSN)));
return true;
}
@@ -2910,7 +2910,7 @@ getRecoveryStopReason(void)
timestamptz_to_str(recoveryStopTime));
else if (recoveryTarget == RECOVERY_TARGET_LSN)
snprintf(reason, sizeof(reason),
- "%s LSN %X/%X\n",
+ "%s LSN %X/%08X\n",
recoveryStopAfter ? "after" : "before",
LSN_FORMAT_ARGS(recoveryStopLSN));
else if (recoveryTarget == RECOVERY_TARGET_NAME)
@@ -3213,11 +3213,11 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
XLogFileName(fname, xlogreader->seg.ws_tli, segno,
wal_segment_size);
ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
- (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
- xlogreader->latestPageTLI,
- fname,
- LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
- offset)));
+ errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
+ xlogreader->latestPageTLI,
+ fname,
+ LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
+ offset));
record = NULL;
}
@@ -3429,14 +3429,14 @@ retry:
errno = save_errno;
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode_for_file_access(),
- errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
+ errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
fname, LSN_FORMAT_ARGS(targetPagePtr),
readOff)));
}
else
ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
(errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
+ errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
fname, LSN_FORMAT_ARGS(targetPagePtr),
readOff, r, (Size) XLOG_BLCKSZ)));
goto next_record_is_invalid;
@@ -3718,7 +3718,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
wait_time = wal_retrieve_retry_interval -
TimestampDifferenceMilliseconds(last_fail_time, now);
- elog(LOG, "waiting for WAL to become available at %X/%X",
+ elog(LOG, "waiting for WAL to become available at %X/%08X",
LSN_FORMAT_ARGS(RecPtr));
/* Do background tasks that might benefit us later. */
@@ -3864,7 +3864,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
if (curFileTLI > 0 && tli < curFileTLI)
- elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
+ elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
LSN_FORMAT_ARGS(tliRecPtr),
tli, curFileTLI);
}
@@ -4177,10 +4177,10 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
if (currentTle->end < replayLSN)
{
ereport(LOG,
- (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
- newtarget,
- replayTLI,
- LSN_FORMAT_ARGS(replayLSN))));
+ errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
+ newtarget,
+ replayTLI,
+ LSN_FORMAT_ARGS(replayLSN)));
return false;
}
@@ -4994,13 +4994,25 @@ check_recovery_target_timeline(char **newval, void **extra, GucSource source)
rttg = RECOVERY_TARGET_TIMELINE_LATEST;
else
{
+ char *endp;
+ uint64 timeline;
+
rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
errno = 0;
- strtoul(*newval, NULL, 0);
- if (errno == EINVAL || errno == ERANGE)
+ timeline = strtou64(*newval, &endp, 0);
+
+ if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
+ {
+ GUC_check_errdetail("\"%s\" is not a valid number.",
+ "recovery_target_timeline");
+ return false;
+ }
+
+ if (timeline < 1 || timeline > PG_UINT32_MAX)
{
- GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
+ GUC_check_errdetail("\"%s\" must be between %u and %u.",
+ "recovery_target_timeline", 1, UINT_MAX);
return false;
}
}
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index c389b27f77d..27ea52fdfee 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -795,7 +795,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage,
list_free_deep(timelineHistory);
- elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+ elog(DEBUG3, "switched to timeline %u valid until %X/%08X",
state->currTLI,
LSN_FORMAT_ARGS(state->currTLIValidUntil));
}