diff options
author | David Rowley <drowley@postgresql.org> | 2021-02-27 22:59:36 +1300 |
---|---|---|
committer | David Rowley <drowley@postgresql.org> | 2021-02-27 22:59:36 +1300 |
commit | bb437f995d47405ecd92cf66df71f7f7e40ed460 (patch) | |
tree | 0ee50f8a501e1ecc30d5cfd0eeb6ed0bcd41e2b2 /src/backend/executor | |
parent | f4adc41c4f92cc91d507b19e397140c35bb9fd71 (diff) | |
download | postgresql-bb437f995d47405ecd92cf66df71f7f7e40ed460.tar.gz postgresql-bb437f995d47405ecd92cf66df71f7f7e40ed460.zip |
Add TID Range Scans to support efficient scanning of ranges of TIDs
This adds a new executor node named TID Range Scan. The query planner
will generate paths for TID Range scans when quals are discovered on base
relations which search for ranges on the table's ctid column. These
ranges may be open at either end. For example, WHERE ctid >= '(10,0)';
will return all tuples on page 10 and over.
To support this, two new optional callback functions have been added to
table AM. scan_set_tidrange is used to set the scan range to just the
given range of TIDs. scan_getnextslot_tidrange fetches the next tuple
in the given range.
For AMs where scanning ranges of TIDs would not make sense, these functions
can be set to NULL in the TableAmRoutine. The query planner won't
generate TID Range Scan Paths in that case.
Author: Edmund Horner, David Rowley
Reviewed-by: David Rowley, Tomas Vondra, Tom Lane, Andres Freund, Zhihong Yu
Discussion: https://postgr.es/m/CAMyN-kB-nFTkF=VA_JPwFNo08S0d-Yk0F741S2B7LDmYAi8eyA@mail.gmail.com
Diffstat (limited to 'src/backend/executor')
-rw-r--r-- | src/backend/executor/Makefile | 1 | ||||
-rw-r--r-- | src/backend/executor/execAmi.c | 6 | ||||
-rw-r--r-- | src/backend/executor/execCurrent.c | 1 | ||||
-rw-r--r-- | src/backend/executor/execProcnode.c | 10 | ||||
-rw-r--r-- | src/backend/executor/nodeTidrangescan.c | 413 |
5 files changed, 431 insertions, 0 deletions
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index f990c6473a3..74ac59faa13 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -67,6 +67,7 @@ OBJS = \ nodeSubplan.o \ nodeSubqueryscan.o \ nodeTableFuncscan.o \ + nodeTidrangescan.o \ nodeTidscan.o \ nodeUnique.o \ nodeValuesscan.o \ diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 23bdb53cd10..4543ac79edf 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -51,6 +51,7 @@ #include "executor/nodeSubplan.h" #include "executor/nodeSubqueryscan.h" #include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" #include "executor/nodeTidscan.h" #include "executor/nodeUnique.h" #include "executor/nodeValuesscan.h" @@ -197,6 +198,10 @@ ExecReScan(PlanState *node) ExecReScanTidScan((TidScanState *) node); break; + case T_TidRangeScanState: + ExecReScanTidRangeScan((TidRangeScanState *) node); + break; + case T_SubqueryScanState: ExecReScanSubqueryScan((SubqueryScanState *) node); break; @@ -562,6 +567,7 @@ ExecSupportsBackwardScan(Plan *node) case T_SeqScan: case T_TidScan: + case T_TidRangeScan: case T_FunctionScan: case T_ValuesScan: case T_CteScan: diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c index 33221a4d6ce..4f430fb1603 100644 --- a/src/backend/executor/execCurrent.c +++ b/src/backend/executor/execCurrent.c @@ -336,6 +336,7 @@ search_plan_tree(PlanState *node, Oid table_oid, case T_IndexOnlyScanState: case T_BitmapHeapScanState: case T_TidScanState: + case T_TidRangeScanState: case T_ForeignScanState: case T_CustomScanState: { diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 414df50a054..29766d8196f 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -109,6 +109,7 @@ #include "executor/nodeSubplan.h" #include "executor/nodeSubqueryscan.h" #include 
"executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" #include "executor/nodeTidscan.h" #include "executor/nodeUnique.h" #include "executor/nodeValuesscan.h" @@ -238,6 +239,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; + case T_TidRangeScan: + result = (PlanState *) ExecInitTidRangeScan((TidRangeScan *) node, + estate, eflags); + break; + case T_SubqueryScan: result = (PlanState *) ExecInitSubqueryScan((SubqueryScan *) node, estate, eflags); @@ -637,6 +643,10 @@ ExecEndNode(PlanState *node) ExecEndTidScan((TidScanState *) node); break; + case T_TidRangeScanState: + ExecEndTidRangeScan((TidRangeScanState *) node); + break; + case T_SubqueryScanState: ExecEndSubqueryScan((SubqueryScanState *) node); break; diff --git a/src/backend/executor/nodeTidrangescan.c b/src/backend/executor/nodeTidrangescan.c new file mode 100644 index 00000000000..2b0d205d7dd --- /dev/null +++ b/src/backend/executor/nodeTidrangescan.c @@ -0,0 +1,413 @@ +/*------------------------------------------------------------------------- + * + * nodeTidrangescan.c + * Routines to support TID range scans of relations + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTidrangescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "catalog/pg_operator.h" +#include "executor/execdebug.h" +#include "executor/nodeTidrangescan.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + + +#define IsCTIDVar(node) \ + ((node) != NULL && \ + IsA((node), Var) && \ + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ + ((Var *) (node))->varlevelsup == 0) + +typedef enum +{ + TIDEXPR_UPPER_BOUND, + 
TIDEXPR_LOWER_BOUND +} TidExprType; + +/* Upper or lower range bound for scan */ +typedef struct TidOpExpr +{ + TidExprType exprtype; /* type of op; lower or upper */ + ExprState *exprstate; /* ExprState for a TID-yielding subexpr */ + bool inclusive; /* whether op is inclusive */ +} TidOpExpr; + +/* + * For the given 'expr', build and return an appropriate TidOpExpr taking into + * account the expr's operator and operand order. + */ +static TidOpExpr * +MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) +{ + Node *arg1 = get_leftop((Expr *) expr); + Node *arg2 = get_rightop((Expr *) expr); + ExprState *exprstate = NULL; + bool invert = false; + TidOpExpr *tidopexpr; + + if (IsCTIDVar(arg1)) + exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps); + else if (IsCTIDVar(arg2)) + { + exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps); + invert = true; + } + else + elog(ERROR, "could not identify CTID variable"); + + tidopexpr = (TidOpExpr *) palloc(sizeof(TidOpExpr)); + tidopexpr->inclusive = false; /* for now */ + + switch (expr->opno) + { + case TIDLessEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDLessOperator: + tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND; + break; + case TIDGreaterEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDGreaterOperator: + tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND; + break; + default: + elog(ERROR, "could not identify CTID operator"); + } + + tidopexpr->exprstate = exprstate; + + return tidopexpr; +} + +/* + * Extract the qual subexpressions that yield TIDs to search for, + * and compile them into ExprStates if they're ordinary expressions. 
+ */ +static void +TidExprListCreate(TidRangeScanState *tidrangestate) +{ + TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan; + List *tidexprs = NIL; + ListCell *l; + + foreach(l, node->tidrangequals) + { + OpExpr *opexpr = lfirst(l); + TidOpExpr *tidopexpr; + + if (!IsA(opexpr, OpExpr)) + elog(ERROR, "could not identify CTID expression"); + + tidopexpr = MakeTidOpExpr(opexpr, tidrangestate); + tidexprs = lappend(tidexprs, tidopexpr); + } + + tidrangestate->trss_tidexprs = tidexprs; +} + +/* ---------------------------------------------------------------- + * TidRangeEval + * + * Compute and set node's block and offset range to scan by evaluating + * the trss_tidexprs. Returns false if we detect the range cannot + * contain any tuples. Returns true if it's possible for the range to + * contain tuples. + * ---------------------------------------------------------------- + */ +static bool +TidRangeEval(TidRangeScanState *node) +{ + ExprContext *econtext = node->ss.ps.ps_ExprContext; + ItemPointerData lowerBound; + ItemPointerData upperBound; + ListCell *l; + + /* + * Set the upper and lower bounds to the absolute limits of the range of + * the ItemPointer type. Below we'll try to narrow this range on either + * side by looking at the TidOpExprs. + */ + ItemPointerSet(&lowerBound, 0, 0); + ItemPointerSet(&upperBound, InvalidBlockNumber, PG_UINT16_MAX); + + foreach(l, node->trss_tidexprs) + { + TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l); + ItemPointer itemptr; + bool isNull; + + /* Evaluate this bound. */ + itemptr = (ItemPointer) + DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate, + econtext, + &isNull)); + + /* If the bound is NULL, *nothing* matches the qual. */ + if (isNull) + return false; + + if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND) + { + ItemPointerData lb; + + ItemPointerCopy(itemptr, &lb); + + /* + * Normalize non-inclusive ranges to become inclusive. The + * resulting ItemPointer here may not be a valid item pointer. 
+ */ + if (!tidopexpr->inclusive) + ItemPointerInc(&lb); + + /* Check if we can narrow the range using this qual */ + if (ItemPointerCompare(&lb, &lowerBound) > 0) + ItemPointerCopy(&lb, &lowerBound); + } + + else if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND) + { + ItemPointerData ub; + + ItemPointerCopy(itemptr, &ub); + + /* + * Normalize non-inclusive ranges to become inclusive. The + * resulting ItemPointer here may not be a valid item pointer. + */ + if (!tidopexpr->inclusive) + ItemPointerDec(&ub); + + /* Check if we can narrow the range using this qual */ + if (ItemPointerCompare(&ub, &upperBound) < 0) + ItemPointerCopy(&ub, &upperBound); + } + } + + ItemPointerCopy(&lowerBound, &node->trss_mintid); + ItemPointerCopy(&upperBound, &node->trss_maxtid); + + return true; +} + +/* ---------------------------------------------------------------- + * TidRangeNext + * + * Retrieve a tuple from the TidRangeScan node's currentRelation + * using the TIDs in the TidRangeScanState information. + * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TidRangeNext(TidRangeScanState *node) +{ + TableScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * extract necessary information from TID scan node + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + slot = node->ss.ss_ScanTupleSlot; + direction = estate->es_direction; + + if (!node->trss_inScan) + { + /* First time through, compute TID range to scan */ + if (!TidRangeEval(node)) + return NULL; + + if (scandesc == NULL) + { + scandesc = table_beginscan_tidrange(node->ss.ss_currentRelation, + estate->es_snapshot, + &node->trss_mintid, + &node->trss_maxtid); + node->ss.ss_currentScanDesc = scandesc; + } + else + { + /* rescan with the updated TID range */ + table_rescan_tidrange(scandesc, &node->trss_mintid, + &node->trss_maxtid); + } + + node->trss_inScan = true; + } + + /* Fetch the next tuple. 
*/ + if (!table_scan_getnextslot_tidrange(scandesc, direction, slot)) + { + node->trss_inScan = false; + ExecClearTuple(slot); + } + + return slot; +} + +/* + * TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot) +{ + return true; +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScan(node) + * + * Scans the relation using tids and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for TID range scanning. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTidRangeScan(PlanState *pstate) +{ + TidRangeScanState *node = castNode(TidRangeScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TidRangeNext, + (ExecScanRecheckMtd) TidRangeRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanTidRangeScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanTidRangeScan(TidRangeScanState *node) +{ + /* mark scan as not in progress, and tid range list as not computed yet */ + node->trss_inScan = false; + + /* + * We must wait until TidRangeNext before calling table_rescan_tidrange. + */ + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecEndTidRangeScan + * + * Releases any storage allocated through C routines. + * Returns nothing. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndTidRangeScan(TidRangeScanState *node) +{ + TableScanDesc scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_endscan(scan); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecInitTidRangeScan + * + * Initializes the tid range scan's state information, creates + * scan keys, and opens the scan relation. + * + * Parameters: + * node: TidRangeScan node produced by the planner. + * estate: the execution state initialized in InitPlan. + * ---------------------------------------------------------------- + */ +TidRangeScanState * +ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags) +{ + TidRangeScanState *tidrangestate; + Relation currentRelation; + + /* + * create state structure + */ + tidrangestate = makeNode(TidRangeScanState); + tidrangestate->ss.ps.plan = (Plan *) node; + tidrangestate->ss.ps.state = estate; + tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &tidrangestate->ss.ps); + + /* + * mark scan as not in progress, and TID range as not computed yet + */ + tidrangestate->trss_inScan = false; + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + tidrangestate->ss.ss_currentRelation = currentRelation; + tidrangestate->ss.ss_currentScanDesc = NULL; /* no table scan here */ + + /* + * get the scan type from the relation descriptor. 
+ */ + ExecInitScanTupleSlot(estate, &tidrangestate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&tidrangestate->ss.ps); + ExecAssignScanProjectionInfo(&tidrangestate->ss); + + /* + * initialize child expressions + */ + tidrangestate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate); + + TidExprListCreate(tidrangestate); + + /* + * all done. + */ + return tidrangestate; +} |