aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/time
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2020-08-12 16:03:49 -0700
committerAndres Freund <andres@anarazel.de>2020-08-12 16:03:49 -0700
commitdc7420c2c9274a283779ec19718d2d16323640c0 (patch)
tree1ec40b9eebbf7913780ac6a7d6193605c25f1aa2 /src/backend/utils/time
parent1f42d35a1d6144a23602b2c0bc7f97f3046cf890 (diff)
downloadpostgresql-dc7420c2c9274a283779ec19718d2d16323640c0.tar.gz
postgresql-dc7420c2c9274a283779ec19718d2d16323640c0.zip
snapshot scalability: Don't compute global horizons while building snapshots.
To make GetSnapshotData() more scalable, it cannot look at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until their use, and to use horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaced with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed. GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession. 
As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further tests can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) are now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
Diffstat (limited to 'src/backend/utils/time')
-rw-r--r--src/backend/utils/time/snapmgr.c250
1 files changed, 134 insertions, 116 deletions
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 6b6c8571e23..604d823f686 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -157,16 +157,9 @@ static Snapshot HistoricSnapshot = NULL;
* These are updated by GetSnapshotData. We initialize them this way
* for the convenience of TransactionIdIsInProgress: even in bootstrap
* mode, we don't want it to say that BootstrapTransactionId is in progress.
- *
- * RecentGlobalXmin and RecentGlobalDataXmin are initialized to
- * InvalidTransactionId, to ensure that no one tries to use a stale
- * value. Readers should ensure that it has been set to something else
- * before using it.
*/
TransactionId TransactionXmin = FirstNormalTransactionId;
TransactionId RecentXmin = FirstNormalTransactionId;
-TransactionId RecentGlobalXmin = InvalidTransactionId;
-TransactionId RecentGlobalDataXmin = InvalidTransactionId;
/* (table, ctid) => (cmin, cmax) mapping during timetravel */
static HTAB *tuplecid_data = NULL;
@@ -581,9 +574,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid,
* Even though we are not going to use the snapshot it computes, we must
* call GetSnapshotData, for two reasons: (1) to be sure that
* CurrentSnapshotData's XID arrays have been allocated, and (2) to update
- * RecentXmin and RecentGlobalXmin. (We could alternatively include those
- * two variables in exported snapshot files, but it seems better to have
- * snapshot importers compute reasonably up-to-date values for them.)
+ * the state for GlobalVis*.
*/
CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
@@ -957,36 +948,6 @@ xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg)
}
/*
- * Get current RecentGlobalXmin value, as a FullTransactionId.
- */
-FullTransactionId
-GetFullRecentGlobalXmin(void)
-{
- FullTransactionId nextxid_full;
- uint32 nextxid_epoch;
- TransactionId nextxid_xid;
- uint32 epoch;
-
- Assert(TransactionIdIsNormal(RecentGlobalXmin));
-
- /*
- * Compute the epoch from the next XID's epoch. This relies on the fact
- * that RecentGlobalXmin must be within the 2 billion XID horizon from the
- * next XID.
- */
- nextxid_full = ReadNextFullTransactionId();
- nextxid_epoch = EpochFromFullTransactionId(nextxid_full);
- nextxid_xid = XidFromFullTransactionId(nextxid_full);
-
- if (RecentGlobalXmin > nextxid_xid)
- epoch = nextxid_epoch - 1;
- else
- epoch = nextxid_epoch;
-
- return FullTransactionIdFromEpochAndXid(epoch, RecentGlobalXmin);
-}
-
-/*
* SnapshotResetXmin
*
* If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
@@ -1753,106 +1714,157 @@ GetOldSnapshotThresholdTimestamp(void)
return threshold_timestamp;
}
-static void
+void
SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit)
{
SpinLockAcquire(&oldSnapshotControl->mutex_threshold);
+ Assert(oldSnapshotControl->threshold_timestamp <= ts);
+ Assert(TransactionIdPrecedesOrEquals(oldSnapshotControl->threshold_xid, xlimit));
oldSnapshotControl->threshold_timestamp = ts;
oldSnapshotControl->threshold_xid = xlimit;
SpinLockRelease(&oldSnapshotControl->mutex_threshold);
}
/*
+ * XXX: Magic to make old_snapshot_threshold tests appear "working". They
+ * currently are broken, and discussion of what to do about them is
+ * ongoing. See
+ * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
+ */
+void
+SnapshotTooOldMagicForTest(void)
+{
+ TimestampTz ts = GetSnapshotCurrentTimestamp();
+
+ Assert(old_snapshot_threshold == 0);
+
+ ts -= 5 * USECS_PER_SEC;
+
+ SpinLockAcquire(&oldSnapshotControl->mutex_threshold);
+ oldSnapshotControl->threshold_timestamp = ts;
+ SpinLockRelease(&oldSnapshotControl->mutex_threshold);
+}
+
+/*
+ * If there is a valid mapping for the timestamp, set *xlimitp to
+ * that. Returns whether there is such a mapping.
+ */
+static bool
+GetOldSnapshotFromTimeMapping(TimestampTz ts, TransactionId *xlimitp)
+{
+ bool in_mapping = false;
+
+ Assert(ts == AlignTimestampToMinuteBoundary(ts));
+
+ LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED);
+
+ if (oldSnapshotControl->count_used > 0
+ && ts >= oldSnapshotControl->head_timestamp)
+ {
+ int offset;
+
+ offset = ((ts - oldSnapshotControl->head_timestamp)
+ / USECS_PER_MINUTE);
+ if (offset > oldSnapshotControl->count_used - 1)
+ offset = oldSnapshotControl->count_used - 1;
+ offset = (oldSnapshotControl->head_offset + offset)
+ % OLD_SNAPSHOT_TIME_MAP_ENTRIES;
+
+ *xlimitp = oldSnapshotControl->xid_by_minute[offset];
+
+ in_mapping = true;
+ }
+
+ LWLockRelease(OldSnapshotTimeMapLock);
+
+ return in_mapping;
+}
+
+/*
* TransactionIdLimitedForOldSnapshots
*
- * Apply old snapshot limit, if any. This is intended to be called for page
- * pruning and table vacuuming, to allow old_snapshot_threshold to override
- * the normal global xmin value. Actual testing for snapshot too old will be
- * based on whether a snapshot timestamp is prior to the threshold timestamp
- * set in this function.
+ * Apply old snapshot limit. This is intended to be called for page pruning
+ * and table vacuuming, to allow old_snapshot_threshold to override the normal
+ * global xmin value. Actual testing for snapshot too old will be based on
+ * whether a snapshot timestamp is prior to the threshold timestamp set in
+ * this function.
+ *
+ * If the limited horizon allows a cleanup action that otherwise would not be
+ * possible, SetOldSnapshotThresholdTimestamp(*limit_ts, *limit_xid) needs to
+ * be called before that cleanup action.
*/
-TransactionId
+bool
TransactionIdLimitedForOldSnapshots(TransactionId recentXmin,
- Relation relation)
+ Relation relation,
+ TransactionId *limit_xid,
+ TimestampTz *limit_ts)
{
- if (TransactionIdIsNormal(recentXmin)
- && old_snapshot_threshold >= 0
- && RelationAllowsEarlyPruning(relation))
- {
- TimestampTz ts = GetSnapshotCurrentTimestamp();
- TransactionId xlimit = recentXmin;
- TransactionId latest_xmin;
- TimestampTz update_ts;
- bool same_ts_as_threshold = false;
+ TimestampTz ts;
+ TransactionId xlimit = recentXmin;
+ TransactionId latest_xmin;
+ TimestampTz next_map_update_ts;
+ TransactionId threshold_timestamp;
+ TransactionId threshold_xid;
- SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin);
- latest_xmin = oldSnapshotControl->latest_xmin;
- update_ts = oldSnapshotControl->next_map_update;
- SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin);
+ Assert(TransactionIdIsNormal(recentXmin));
+ Assert(OldSnapshotThresholdActive());
+ Assert(limit_ts != NULL && limit_xid != NULL);
- /*
- * Zero threshold always overrides to latest xmin, if valid. Without
- * some heuristic it will find its own snapshot too old on, for
- * example, a simple UPDATE -- which would make it useless for most
- * testing, but there is no principled way to ensure that it doesn't
- * fail in this way. Use a five-second delay to try to get useful
- * testing behavior, but this may need adjustment.
- */
- if (old_snapshot_threshold == 0)
- {
- if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin)
- && TransactionIdFollows(latest_xmin, xlimit))
- xlimit = latest_xmin;
+ if (!RelationAllowsEarlyPruning(relation))
+ return false;
- ts -= 5 * USECS_PER_SEC;
- SetOldSnapshotThresholdTimestamp(ts, xlimit);
+ ts = GetSnapshotCurrentTimestamp();
- return xlimit;
- }
+ SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin);
+ latest_xmin = oldSnapshotControl->latest_xmin;
+ next_map_update_ts = oldSnapshotControl->next_map_update;
+ SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin);
+ /*
+ * Zero threshold always overrides to latest xmin, if valid. Without some
+ * heuristic it will find its own snapshot too old on, for example, a
+ * simple UPDATE -- which would make it useless for most testing, but
+ * there is no principled way to ensure that it doesn't fail in this way.
+ * Use a five-second delay to try to get useful testing behavior, but this
+ * may need adjustment.
+ */
+ if (old_snapshot_threshold == 0)
+ {
+ if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin)
+ && TransactionIdFollows(latest_xmin, xlimit))
+ xlimit = latest_xmin;
+
+ ts -= 5 * USECS_PER_SEC;
+ }
+ else
+ {
ts = AlignTimestampToMinuteBoundary(ts)
- (old_snapshot_threshold * USECS_PER_MINUTE);
/* Check for fast exit without LW locking. */
SpinLockAcquire(&oldSnapshotControl->mutex_threshold);
- if (ts == oldSnapshotControl->threshold_timestamp)
- {
- xlimit = oldSnapshotControl->threshold_xid;
- same_ts_as_threshold = true;
- }
+ threshold_timestamp = oldSnapshotControl->threshold_timestamp;
+ threshold_xid = oldSnapshotControl->threshold_xid;
SpinLockRelease(&oldSnapshotControl->mutex_threshold);
- if (!same_ts_as_threshold)
+ if (ts == threshold_timestamp)
+ {
+ /*
+ * Current timestamp is in same bucket as the last limit that
+ * was applied. Reuse.
+ */
+ xlimit = threshold_xid;
+ }
+ else if (ts == next_map_update_ts)
+ {
+ /*
+ * FIXME: This branch is super iffy - but that should probably be
+ * fixed separately.
+ */
+ xlimit = latest_xmin;
+ }
+ else if (GetOldSnapshotFromTimeMapping(ts, &xlimit))
{
- if (ts == update_ts)
- {
- xlimit = latest_xmin;
- if (NormalTransactionIdFollows(xlimit, recentXmin))
- SetOldSnapshotThresholdTimestamp(ts, xlimit);
- }
- else
- {
- LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED);
-
- if (oldSnapshotControl->count_used > 0
- && ts >= oldSnapshotControl->head_timestamp)
- {
- int offset;
-
- offset = ((ts - oldSnapshotControl->head_timestamp)
- / USECS_PER_MINUTE);
- if (offset > oldSnapshotControl->count_used - 1)
- offset = oldSnapshotControl->count_used - 1;
- offset = (oldSnapshotControl->head_offset + offset)
- % OLD_SNAPSHOT_TIME_MAP_ENTRIES;
- xlimit = oldSnapshotControl->xid_by_minute[offset];
-
- if (NormalTransactionIdFollows(xlimit, recentXmin))
- SetOldSnapshotThresholdTimestamp(ts, xlimit);
- }
-
- LWLockRelease(OldSnapshotTimeMapLock);
- }
}
/*
@@ -1867,12 +1879,18 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin,
if (TransactionIdIsNormal(latest_xmin)
&& TransactionIdPrecedes(latest_xmin, xlimit))
xlimit = latest_xmin;
+ }
+
+ if (TransactionIdIsValid(xlimit) &&
+ TransactionIdFollowsOrEquals(xlimit, recentXmin))
+ {
+ *limit_ts = ts;
+ *limit_xid = xlimit;
- if (NormalTransactionIdFollows(xlimit, recentXmin))
- return xlimit;
+ return true;
}
- return recentXmin;
+ return false;
}
/*