Redesign tablesample method API, and do extensive code review.

The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
author: Tom Lane <tgl@sss.pgh.pa.us> 2015-07-25 14:39:00 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2015-07-25 14:39:00 -0400
commit: dd7a8f66ed278eef2f001a98e2312336c61ee527 (patch)
tree: 4daf4c4b1daddc8fc31d7448522b9e66f4369fd7 /src/backend/access/tablesample/system.c
parent: b26e3d660df51a088d14c3c2cfce5990c13c1195 (diff)
download: postgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.tar.gz
postgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.zip
1 files changed, 193 insertions, 119 deletions
diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c
index 1d834369a4b..43c5dab7161 100644
--- a/src/backend/access/tablesample/system.c
+++ b/src/backend/access/tablesample/system.c
@@ -1,186 +1,260 @@
 /*-------------------------------------------------------------------------
  *
  * system.c
- *	  interface routines for system tablesample method
+ *	  support routines for SYSTEM tablesample method
  *
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
  *
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * To achieve that, we proceed by hashing each candidate block number together
+ * with the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  src/backend/utils/tablesample/system.c
+ *	  src/backend/access/tablesample/system.c
  *
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
-#include "fmgr.h"
+#ifdef _MSC_VER
+#include <float.h>				/* for _isnan */
+#endif
+#include <math.h>
 
-#include "access/tablesample.h"
+#include "access/hash.h"
 #include "access/relscan.h"
-#include "nodes/execnodes.h"
-#include "nodes/relation.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
 #include "optimizer/clauses.h"
-#include "storage/bufmgr.h"
-#include "utils/sampling.h"
+#include "optimizer/cost.h"
+#include "utils/builtins.h"
 
 
-/*
- * State
- */
+/* Private state */
 typedef struct
 {
-	BlockSamplerData bs;
+	uint64		cutoff;			/* select blocks with hash less than this */
 	uint32		seed;			/* random seed */
-	BlockNumber nblocks;		/* number of block in relation */
-	int			samplesize;		/* number of blocks to return */
+	BlockNumber nextblock;		/* next block to consider sampling */
 	OffsetNumber lt;			/* last tuple returned from current block */
 } SystemSamplerData;
 
 
-/*
- * Initializes the state.
- */
-Datum
-tsm_system_init(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	uint32		seed = PG_GETARG_UINT32(1);
-	float4		percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
-	HeapScanDesc scan = tsdesc->heapScan;
-	SystemSamplerData *sampler;
+static void system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples);
+static void system_initsamplescan(SampleScanState *node,
+					  int eflags);
+static void system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed);
+static BlockNumber system_nextsampleblock(SampleScanState *node);
+static OffsetNumber system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset);
 
-	if (percent < 0 || percent > 100)
-		ereport(ERROR,
-				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-				 errmsg("invalid sample size"),
-				 errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
-
-	sampler = palloc0(sizeof(SystemSamplerData));
-
-	/* Remember initial values for reinit */
-	sampler->seed = seed;
-	sampler->nblocks = scan->rs_nblocks;
-	sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0));
-	sampler->lt = InvalidOffsetNumber;
-
-	BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
-					  sampler->seed);
-
-	tsdesc->tsmdata = (void *) sampler;
-
-	PG_RETURN_VOID();
-}
 
 /*
- * Get next block number or InvalidBlockNumber when we're done.
- *
- * Uses the same logic as ANALYZE for picking the random blocks.
+ * Create a TsmRoutine descriptor for the SYSTEM method.
  */
 Datum
-tsm_system_nextblock(PG_FUNCTION_ARGS)
+tsm_system_handler(PG_FUNCTION_ARGS)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
-	BlockNumber blockno;
-
-	if (!BlockSampler_HasMore(&sampler->bs))
-		PG_RETURN_UINT32(InvalidBlockNumber);
-
-	blockno = BlockSampler_Next(&sampler->bs);
-
-	PG_RETURN_UINT32(blockno);
+	TsmRoutine *tsm = makeNode(TsmRoutine);
+
+	tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+	tsm->repeatable_across_queries = true;
+	tsm->repeatable_across_scans = true;
+	tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
+	tsm->InitSampleScan = system_initsamplescan;
+	tsm->BeginSampleScan = system_beginsamplescan;
+	tsm->NextSampleBlock = system_nextsampleblock;
+	tsm->NextSampleTuple = system_nextsampletuple;
+	tsm->EndSampleScan = NULL;
+
+	PG_RETURN_POINTER(tsm);
 }
 
 /*
- * Get next tuple offset in current block or InvalidOffsetNumber if we are done
- * with this block.
+ * Sample size estimation.
  */
-Datum
-tsm_system_nexttuple(PG_FUNCTION_ARGS)
+static void
+system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	OffsetNumber maxoffset = PG_GETARG_UINT16(2);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
-	OffsetNumber tupoffset = sampler->lt;
+	Node	   *pctnode;
+	float4		samplefract;
 
-	if (tupoffset == InvalidOffsetNumber)
-		tupoffset = FirstOffsetNumber;
-	else
-		tupoffset++;
+	/* Try to extract an estimate for the sample percentage */
+	pctnode = (Node *) linitial(paramexprs);
+	pctnode = estimate_expression_value(root, pctnode);
 
-	if (tupoffset > maxoffset)
-		tupoffset = InvalidOffsetNumber;
+	if (IsA(pctnode, Const) &&
+		!((Const *) pctnode)->constisnull)
+	{
+		samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+		if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+			samplefract /= 100.0f;
+		else
+		{
+			/* Default samplefract if the value is bogus */
+			samplefract = 0.1f;
+		}
+	}
+	else
+	{
+		/* Default samplefract if we didn't obtain a non-null Const */
+		samplefract = 0.1f;
+	}
 
-	sampler->lt = tupoffset;
+	/* We'll visit a sample of the pages ... */
+	*pages = clamp_row_est(baserel->pages * samplefract);
 
-	PG_RETURN_UINT16(tupoffset);
+	/* ... and hopefully get a representative number of tuples from them */
+	*tuples = clamp_row_est(baserel->tuples * samplefract);
 }
 
 /*
- * Cleanup method.
+ * Initialize during executor setup.
  */
-Datum
-tsm_system_end(PG_FUNCTION_ARGS)
+static void
+system_initsamplescan(SampleScanState *node, int eflags)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-
-	pfree(tsdesc->tsmdata);
-
-	PG_RETURN_VOID();
+	node->tsm_state = palloc0(sizeof(SystemSamplerData));
 }
 
 /*
- * Reset state (called by ReScan).
+ * Examine parameters and prepare for a sample scan.
  */
-Datum
-tsm_system_reset(PG_FUNCTION_ARGS)
+static void
+system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	double		percent = DatumGetFloat4(params[0]);
 
+	if (percent < 0 || percent > 100 || isnan(percent))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+				 errmsg("sample percentage must be between 0 and 100")));
+
+	/*
+	 * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+	 * store that as a uint64, of course.  Note that this gives strictly
+	 * correct behavior at the limits of zero or one probability.
+	 */
+	sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+	sampler->seed = seed;
+	sampler->nextblock = 0;
 	sampler->lt = InvalidOffsetNumber;
-	BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
-					  sampler->seed);
 
-	PG_RETURN_VOID();
+	/*
+	 * Bulkread buffer access strategy probably makes sense unless we're
+	 * scanning a very small fraction of the table.  The 1% cutoff here is a
+	 * guess.  We should use pagemode visibility checking, since we scan all
+	 * tuples on each selected page.
+	 */
+	node->use_bulkread = (percent >= 1);
+	node->use_pagemode = true;
 }
 
 /*
- * Costing function.
+ * Select next block to sample.
  */
-Datum
-tsm_system_cost(PG_FUNCTION_ARGS)
+static BlockNumber
+system_nextsampleblock(SampleScanState *node)
 {
-	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-	Path	   *path = (Path *) PG_GETARG_POINTER(1);
-	RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
-	List	   *args = (List *) PG_GETARG_POINTER(3);
-	BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
-	double	   *tuples = (double *) PG_GETARG_POINTER(5);
-	Node	   *pctnode;
-	float4		samplesize;
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	HeapScanDesc scan = node->ss.ss_currentScanDesc;
+	BlockNumber nextblock = sampler->nextblock;
+	uint32		hashinput[2];
+
+	/*
+	 * We compute the hash by applying hash_any to an array of 2 uint32's
+	 * containing the block number and seed.  This is efficient to set up, and
+	 * with the current implementation of hash_any, it gives
+	 * machine-independent results, which is a nice property for regression
+	 * testing.
+	 *
+	 * These words in the hash input are the same throughout the block:
+	 */
+	hashinput[1] = sampler->seed;
+
+	/*
+	 * Loop over block numbers until finding suitable block or reaching end of
+	 * relation.
+	 */
+	for (; nextblock < scan->rs_nblocks; nextblock++)
+	{
+		uint32		hash;
 
-	pctnode = linitial(args);
-	pctnode = estimate_expression_value(root, pctnode);
+		hashinput[0] = nextblock;
 
-	if (IsA(pctnode, RelabelType))
-		pctnode = (Node *) ((RelabelType *) pctnode)->arg;
+		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+									   (int) sizeof(hashinput)));
+		if (hash < sampler->cutoff)
+			break;
+	}
 
-	if (IsA(pctnode, Const))
+	if (nextblock < scan->rs_nblocks)
 	{
-		samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
-		samplesize /= 100.0;
+		/* Found a suitable block; remember where we should start next time */
+		sampler->nextblock = nextblock + 1;
+		return nextblock;
 	}
+
+	/* Done, but let's reset nextblock to 0 for safety. */
+	sampler->nextblock = 0;
+	return InvalidBlockNumber;
+}
+
+/*
+ * Select next sampled tuple in current block.
+ *
+ * In block sampling, we just want to sample all the tuples in each selected
+ * block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists); nodeSamplescan.c will deal with that.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
+ */
+static OffsetNumber
+system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	OffsetNumber tupoffset = sampler->lt;
+
+	/* Advance to next possible offset on page */
+	if (tupoffset == InvalidOffsetNumber)
+		tupoffset = FirstOffsetNumber;
 	else
-	{
-		/* Default samplesize if the estimation didn't return Const. */
-		samplesize = 0.1f;
-	}
+		tupoffset++;
 
-	*pages = baserel->pages * samplesize;
-	*tuples = path->rows * samplesize;
-	path->rows = *tuples;
+	/* Done? */
+	if (tupoffset > maxoffset)
+		tupoffset = InvalidOffsetNumber;
+
+	sampler->lt = tupoffset;
 
-	PG_RETURN_VOID();
+	return tupoffset;
 }
author	Tom Lane <tgl@sss.pgh.pa.us>	2015-07-25 14:39:00 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2015-07-25 14:39:00 -0400
commit	dd7a8f66ed278eef2f001a98e2312336c61ee527 (patch)
tree	4daf4c4b1daddc8fc31d7448522b9e66f4369fd7 /src/backend/access/tablesample/system.c
parent	b26e3d660df51a088d14c3c2cfce5990c13c1195 (diff)
download	postgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.tar.gz postgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.zip