diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2015-07-25 14:39:00 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2015-07-25 14:39:00 -0400 |
commit | dd7a8f66ed278eef2f001a98e2312336c61ee527 (patch) | |
tree | 4daf4c4b1daddc8fc31d7448522b9e66f4369fd7 /src/backend/access/tablesample/tablesample.c | |
parent | b26e3d660df51a088d14c3c2cfce5990c13c1195 (diff) | |
download | postgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.tar.gz postgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.zip |
Redesign tablesample method API, and do extensive code review.
The original implementation of TABLESAMPLE modeled the tablesample method
API on index access methods, which wasn't a good choice because, without
specialized DDL commands, there's no way to build an extension that can
implement a TSM. (Raw inserts into system catalogs are not an acceptable
thing to do, because we can't undo them during DROP EXTENSION, nor will
pg_upgrade behave sanely.) Instead adopt an API more like procedural
language handlers or foreign data wrappers, wherein the only SQL-level
support object needed is a single handler function identified by having
a special return type. This lets us get rid of the supporting catalog
altogether, so that no custom DDL support is needed for the feature.
Adjust the API so that it can support non-constant tablesample arguments
(the original coding assumed we could evaluate the argument expressions at
ExecInitSampleScan time, which is undesirable even if it weren't outright
unsafe), and discourage sampling methods from looking at invisible tuples.
Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable
within and across queries, as required by the SQL standard, and deal more
honestly with methods that can't support that requirement.
Make a full code-review pass over the tablesample additions, and fix
assorted bugs, omissions, infelicities, and cosmetic issues (such as
failure to put the added code stanzas in a consistent ordering).
Improve EXPLAIN's output of tablesample plans, too.
Back-patch to 9.5 so that we don't have to support the original API
in production.
Diffstat (limited to 'src/backend/access/tablesample/tablesample.c')
-rw-r--r-- | src/backend/access/tablesample/tablesample.c | 355 |
1 file changed, 15 insertions, 340 deletions
diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c index f21d42c8e38..b8ad7ced743 100644 --- a/src/backend/access/tablesample/tablesample.c +++ b/src/backend/access/tablesample/tablesample.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablesample.c - * TABLESAMPLE internal API + * Support functions for TABLESAMPLE feature * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -10,356 +10,31 @@ * IDENTIFICATION * src/backend/access/tablesample/tablesample.c * - * TABLESAMPLE is the SQL standard clause for sampling the relations. - * - * The API is interface between the Executor and the TABLESAMPLE Methods. - * - * TABLESAMPLE Methods are implementations of actual sampling algorithms which - * can be used for returning a sample of the source relation. - * Methods don't read the table directly but are asked for block number and - * tuple offset which they want to examine (or return) and the tablesample - * interface implemented here does the reading for them. - * - * We currently only support sampling of the physical relations, but in the - * future we might extend the API to support subqueries as well. - * * ------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/tablesample.h" - -#include "catalog/pg_tablesample_method.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "storage/bufmgr.h" -#include "storage/predicate.h" -#include "utils/rel.h" -#include "utils/tqual.h" - - -static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); - - -/* - * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. 
- */ -TableSampleDesc * -tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) -{ - FunctionCallInfoData fcinfo; - int i; - List *args = tablesample->args; - ListCell *arg; - ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; - TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); - - /* Load functions */ - fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); - fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); - fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); - if (OidIsValid(tablesample->tsmexaminetuple)) - fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); - else - tsdesc->tsmexaminetuple.fn_oid = InvalidOid; - fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); - fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); - - InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, - list_length(args) + 2, - InvalidOid, NULL, NULL); - - tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; - tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; - - /* First argument for init function is always TableSampleDesc */ - fcinfo.arg[0] = PointerGetDatum(tsdesc); - fcinfo.argnull[0] = false; +#include "access/tsmapi.h" - /* - * Second arg for init function is always REPEATABLE. - * - * If tablesample->repeatable is NULL then REPEATABLE clause was not - * specified, and we insert a random value as default. - * - * When specified, the expression cannot evaluate to NULL. - */ - if (tablesample->repeatable) - { - ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, - (PlanState *) scanstate); - - fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, - &fcinfo.argnull[1], NULL); - if (fcinfo.argnull[1]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("REPEATABLE clause must be NOT NULL numeric value"))); - } - else - { - fcinfo.arg[1] = UInt32GetDatum(random()); - fcinfo.argnull[1] = false; - } - - /* Rest of the arguments come from user. 
*/ - i = 2; - foreach(arg, args) - { - Expr *argexpr = (Expr *) lfirst(arg); - ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); - - fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, - &fcinfo.argnull[i], NULL); - i++; - } - Assert(i == fcinfo.nargs); - - (void) FunctionCallInvoke(&fcinfo); - - return tsdesc; -} /* - * Get next tuple from TABLESAMPLE Method. - */ -HeapTuple -tablesample_getnext(TableSampleDesc *desc) -{ - HeapScanDesc scan = desc->heapScan; - HeapTuple tuple = &(scan->rs_ctup); - bool pagemode = scan->rs_pageatatime; - BlockNumber blockno; - Page page; - bool page_all_visible; - ItemId itemid; - OffsetNumber tupoffset, - maxoffset; - - if (!scan->rs_inited) - { - /* - * return null immediately if relation is empty - */ - if (scan->rs_nblocks == 0) - { - Assert(!BufferIsValid(scan->rs_cbuf)); - tuple->t_data = NULL; - return NULL; - } - blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, - PointerGetDatum(desc))); - if (!BlockNumberIsValid(blockno)) - { - tuple->t_data = NULL; - return NULL; - } - - heapgetpage(scan, blockno); - scan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - blockno = scan->rs_cblock; /* current page */ - } - - /* - * When pagemode is disabled, the scan will do visibility checks for each - * tuple it finds so the buffer needs to be locked. - */ - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(scan->rs_cbuf); - page_all_visible = PageIsAllVisible(page); - maxoffset = PageGetMaxOffsetNumber(page); - - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, - PointerGetDatum(desc), - UInt32GetDatum(blockno), - UInt16GetDatum(maxoffset))); - - if (OffsetNumberIsValid(tupoffset)) - { - bool visible; - bool found; - - /* Skip invalid tuple pointers. 
*/ - itemid = PageGetItemId(page, tupoffset); - if (!ItemIdIsNormal(itemid)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); - tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); - - if (page_all_visible) - visible = true; - else - visible = SampleTupleVisible(tuple, tupoffset, scan); - - /* - * Let the sampling method examine the actual tuple and decide if - * we should return it. - * - * Note that we let it examine even invisible tuples for - * statistical purposes, but not return them since user should - * never see invisible tuples. - */ - if (OidIsValid(desc->tsmexaminetuple.fn_oid)) - { - found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, - PointerGetDatum(desc), - UInt32GetDatum(blockno), - PointerGetDatum(tuple), - BoolGetDatum(visible))); - /* Should not happen if sampling method is well written. */ - if (found && !visible) - elog(ERROR, "Sampling method wanted to return invisible tuple"); - } - else - found = visible; - - /* Found visible tuple, return it. */ - if (found) - { - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - break; - } - else - { - /* Try next tuple from same page. */ - continue; - } - } - - - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, - PointerGetDatum(desc))); - - /* - * Report our new scan position for synchronization purposes. We don't - * do that when moving backwards, however. That would just mess up any - * other forward-moving scanners. - * - * Note: we do this before checking for end of scan so that the final - * state of the position hint is back at the start of the rel. That's - * not strictly necessary, but otherwise when you run the same query - * multiple times the starting position would shift a little bit - * backwards on every invocation, which is confusing. We don't - * guarantee any specific ordering in general, though. 
- */ - if (scan->rs_syncscan) - ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? - blockno : scan->rs_startblock); - - /* - * Reached end of scan. - */ - if (!BlockNumberIsValid(blockno)) - { - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - scan->rs_cbuf = InvalidBuffer; - scan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - scan->rs_inited = false; - return NULL; - } - - heapgetpage(scan, blockno); - - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(scan->rs_cbuf); - page_all_visible = PageIsAllVisible(page); - maxoffset = PageGetMaxOffsetNumber(page); - } - - pgstat_count_heap_getnext(scan->rs_rd); - - return &(scan->rs_ctup); -} - -/* - * Reset the sampling to starting state - */ -void -tablesample_reset(TableSampleDesc *desc) -{ - (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); -} - -/* - * Signal the sampling method that the scan has finished. - */ -void -tablesample_end(TableSampleDesc *desc) -{ - (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); -} - -/* - * Check visibility of the tuple. + * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler. + * + * This is a convenience routine that's just meant to check for errors. */ -static bool -SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +TsmRoutine * +GetTsmRoutine(Oid tsmhandler) { - /* - * If this scan is reading whole pages at a time, there is already - * visibility info present in rs_vistuples so we can just search it for - * the tupoffset. - */ - if (scan->rs_pageatatime) - { - int start = 0, - end = scan->rs_ntuples - 1; - - /* - * Do the binary search over rs_vistuples, it's already sorted by - * OffsetNumber so we don't need to do any sorting ourselves here. 
- * - * We could use bsearch() here but it's slower for integers because of - * the function call overhead and because it needs boiler plate code - * it would not save us anything code-wise anyway. - */ - while (start <= end) - { - int mid = start + (end - start) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (curoffset == tupoffset) - return true; - else if (curoffset > tupoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; - } - else - { - /* No pagemode, we have to check the tuple itself. */ - Snapshot snapshot = scan->rs_snapshot; - Buffer buffer = scan->rs_cbuf; + Datum datum; + TsmRoutine *routine; - bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL)); + routine = (TsmRoutine *) DatumGetPointer(datum); - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, - snapshot); + if (routine == NULL || !IsA(routine, TsmRoutine)) + elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct", + tsmhandler); - return visible; - } + return routine; } |