diff options
author | Andres Freund <andres@anarazel.de> | 2015-05-16 03:40:59 +0200 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2015-05-16 03:46:31 +0200 |
commit | f3d3118532175541a9a96ed78881a3b04a057128 (patch) | |
tree | d06e7177843c563491f3a132d29fec0f60f69bd3 /src/backend/parser/parse_clause.c | |
parent | 6e4415c6aa428132dd41c8bf23a0885fca0f2271 (diff) | |
download | postgresql-f3d3118532175541a9a96ed78881a3b04a057128.tar.gz postgresql-f3d3118532175541a9a96ed78881a3b04a057128.zip |
Support GROUPING SETS, CUBE and ROLLUP.
This SQL standard functionality allows to aggregate data by different
GROUP BY clauses at once. Each grouping set returns rows with columns
grouped by in other sets set to NULL.
This could previously be achieved by doing each grouping as a separate
query, conjoined by UNION ALLs. Besides being considerably more concise,
grouping sets will in many cases be faster, requiring only one scan over
the underlying data.
The current implementation of grouping sets only supports using sorting
for input. Individual sets that share a sort order are computed in one
pass. If there are sets that don't share a sort order, additional sort &
aggregation steps are performed. These additional passes are sourced by
the previous sort step; thus avoiding repeated scans of the source data.
The code is structured in a way that adding support for purely using
hash aggregation or a mix of hashing and sorting is possible. Sorting
was chosen to be supported first, as it is the most generic method of
implementation.
Instead of, as in an earlier versions of the patch, representing the
chain of sort and aggregation steps as full blown planner and executor
nodes, all but the first sort are performed inside the aggregation node
itself. This avoids the need to do some unusual gymnastics to handle
having to return aggregated and non-aggregated tuples from underlying
nodes, as well as having to shut down underlying nodes early to limit
memory usage. The optimizer still builds Sort/Agg node to describe each
phase, but they're not part of the plan tree, but instead additional
data for the aggregation node. They're a convenient and preexisting way
to describe aggregation and sorting. The first (and possibly only) sort
step is still performed as a separate execution step. That retains
similarity with existing group by plans, makes rescans fairly simple,
avoids very deep plans (leading to slow explains) and easily allows to
avoid the sorting step if the underlying data is sorted by other means.
A somewhat ugly side of this patch is having to deal with a grammar
ambiguity between the new CUBE keyword and the cube extension/functions
named cube (and rollup). To avoid breaking existing deployments of the
cube extension it has not been renamed, neither has cube been made a
reserved keyword. Instead precedence hacking is used to make GROUP BY
cube(..) refer to the CUBE grouping sets feature, and not the function
cube(). To actually group by a function cube(), unlikely as that might
be, the function name has to be quoted.
Needs a catversion bump because stored rules may change.
Author: Andrew Gierth and Atri Sharma, with contributions from Andres Freund
Reviewed-By: Andres Freund, Noah Misch, Tom Lane, Svenne Krap, Tomas
Vondra, Erik Rijkers, Marti Raudsepp, Pavel Stehule
Discussion: CAOeZVidmVRe2jU6aMk_5qkxnB7dfmPROzM7Ur8JPW5j8Y5X-Lw@mail.gmail.com
Diffstat (limited to 'src/backend/parser/parse_clause.c')
-rw-r--r-- | src/backend/parser/parse_clause.c | 504 |
1 files changed, 460 insertions, 44 deletions
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index 6b1bbe57d0e..a90bcf40c9d 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -15,6 +15,8 @@ #include "postgres.h" +#include "miscadmin.h" + #include "access/heapam.h" #include "catalog/catalog.h" #include "access/htup_details.h" @@ -43,7 +45,6 @@ #include "utils/rel.h" #include "utils/syscache.h" - /* Convenience macro for the most common makeNamespaceItem() case */ #define makeDefaultNSItem(rte) makeNamespaceItem(rte, true, true, false, true) @@ -1725,40 +1726,181 @@ findTargetlistEntrySQL99(ParseState *pstate, Node *node, List **tlist, return target_result; } +/*------------------------------------------------------------------------- + * Flatten out parenthesized sublists in grouping lists, and some cases + * of nested grouping sets. + * + * Inside a grouping set (ROLLUP, CUBE, or GROUPING SETS), we expect the + * content to be nested no more than 2 deep: i.e. ROLLUP((a,b),(c,d)) is + * ok, but ROLLUP((a,(b,c)),d) is flattened to ((a,b,c),d), which we then + * normalize to ((a,b,c),(d)). + * + * CUBE or ROLLUP can be nested inside GROUPING SETS (but not the reverse), + * and we leave that alone if we find it. But if we see GROUPING SETS inside + * GROUPING SETS, we can flatten and normalize as follows: + * GROUPING SETS (a, (b,c), GROUPING SETS ((c,d),(e)), (f,g)) + * becomes + * GROUPING SETS ((a), (b,c), (c,d), (e), (f,g)) + * + * This is per the spec's syntax transformations, but these are the only such + * transformations we do in parse analysis, so that queries retain the + * originally specified grouping set syntax for CUBE and ROLLUP as much as + * possible when deparsed. (Full expansion of the result into a list of + * grouping sets is left to the planner.) + * + * When we're done, the resulting list should contain only these possible + * elements: + * - an expression + * - a CUBE or ROLLUP with a list of expressions nested 2 deep + * - a GROUPING SET containing any of: + * - expression lists + * - empty grouping sets + * - CUBE or ROLLUP nodes with lists nested 2 deep + * The return is a new list, but doesn't deep-copy the old nodes except for + * GroupingSet nodes. + * + * As a side effect, flag whether the list has any GroupingSet nodes. + *------------------------------------------------------------------------- + */ +static Node * +flatten_grouping_sets(Node *expr, bool toplevel, bool *hasGroupingSets) +{ + /* just in case of pathological input */ + check_stack_depth(); + + if (expr == (Node *) NIL) + return (Node *) NIL; + + switch (expr->type) + { + case T_RowExpr: + { + RowExpr *r = (RowExpr *) expr; + if (r->row_format == COERCE_IMPLICIT_CAST) + return flatten_grouping_sets((Node *) r->args, + false, NULL); + } + break; + case T_GroupingSet: + { + GroupingSet *gset = (GroupingSet *) expr; + ListCell *l2; + List *result_set = NIL; + + if (hasGroupingSets) + *hasGroupingSets = true; + + /* + * at the top level, we skip over all empty grouping sets; the + * caller can supply the canonical GROUP BY () if nothing is left. + */ + + if (toplevel && gset->kind == GROUPING_SET_EMPTY) + return (Node *) NIL; + + foreach(l2, gset->content) + { + Node *n2 = flatten_grouping_sets(lfirst(l2), false, NULL); + + result_set = lappend(result_set, n2); + } + + /* + * At top level, keep the grouping set node; but if we're in a nested + * grouping set, then we need to concat the flattened result into the + * outer list if it's simply nested. + */ + + if (toplevel || (gset->kind != GROUPING_SET_SETS)) + { + return (Node *) makeGroupingSet(gset->kind, result_set, gset->location); + } + else + return (Node *) result_set; + } + case T_List: + { + List *result = NIL; + ListCell *l; + + foreach(l, (List *)expr) + { + Node *n = flatten_grouping_sets(lfirst(l), toplevel, hasGroupingSets); + if (n != (Node *) NIL) + { + if (IsA(n,List)) + result = list_concat(result, (List *) n); + else + result = lappend(result, n); + } + } + + return (Node *) result; + } + default: + break; + } + + return expr; +} + /* - * transformGroupClause - - * transform a GROUP BY clause + * Transform a single expression within a GROUP BY clause or grouping set. * - * GROUP BY items will be added to the targetlist (as resjunk columns) - * if not already present, so the targetlist must be passed by reference. + * The expression is added to the targetlist if not already present, and to the + * flatresult list (which will become the groupClause) if not already present + * there. The sortClause is consulted for operator and sort order hints. * - * This is also used for window PARTITION BY clauses (which act almost the - * same, but are always interpreted per SQL99 rules). + * Returns the ressortgroupref of the expression. + * + * flatresult reference to flat list of SortGroupClause nodes + * seen_local bitmapset of sortgrouprefs already seen at the local level + * pstate ParseState + * gexpr node to transform + * targetlist reference to TargetEntry list + * sortClause ORDER BY clause (SortGroupClause nodes) + * exprKind expression kind + * useSQL99 SQL99 rather than SQL92 syntax + * toplevel false if within any grouping set */ -List * -transformGroupClause(ParseState *pstate, List *grouplist, - List **targetlist, List *sortClause, - ParseExprKind exprKind, bool useSQL99) +static Index +transformGroupClauseExpr(List **flatresult, Bitmapset *seen_local, + ParseState *pstate, Node *gexpr, + List **targetlist, List *sortClause, + ParseExprKind exprKind, bool useSQL99, bool toplevel) { - List *result = NIL; - ListCell *gl; + TargetEntry *tle; + bool found = false; + + if (useSQL99) + tle = findTargetlistEntrySQL99(pstate, gexpr, + targetlist, exprKind); + else + tle = findTargetlistEntrySQL92(pstate, gexpr, + targetlist, exprKind); - foreach(gl, grouplist) + if (tle->ressortgroupref > 0) { - Node *gexpr = (Node *) lfirst(gl); - TargetEntry *tle; - bool found = false; + ListCell *sl; - if (useSQL99) - tle = findTargetlistEntrySQL99(pstate, gexpr, - targetlist, exprKind); - else - tle = findTargetlistEntrySQL92(pstate, gexpr, - targetlist, exprKind); + /* + * Eliminate duplicates (GROUP BY x, x) but only at local level. + * (Duplicates in grouping sets can affect the number of returned + * rows, so can't be dropped indiscriminately.) + * + * Since we don't care about anything except the sortgroupref, + * we can use a bitmapset rather than scanning lists. + */ + if (bms_is_member(tle->ressortgroupref,seen_local)) + return 0; - /* Eliminate duplicates (GROUP BY x, x) */ - if (targetIsInSortList(tle, InvalidOid, result)) - continue; + /* + * If we're already in the flat clause list, we don't need + * to consider adding ourselves again. + */ + found = targetIsInSortList(tle, InvalidOid, *flatresult); + if (found) + return tle->ressortgroupref; /* * If the GROUP BY tlist entry also appears in ORDER BY, copy operator @@ -1770,35 +1912,308 @@ transformGroupClause(ParseState *pstate, List *grouplist, * sort step, and it allows the user to choose the equality semantics * used by GROUP BY, should she be working with a datatype that has * more than one equality operator. + * + * If we're in a grouping set, though, we force our requested ordering + * to be NULLS LAST, because if we have any hope of using a sorted agg + * for the job, we're going to be tacking on generated NULL values + * after the corresponding groups. If the user demands nulls first, + * another sort step is going to be inevitable, but that's the + * planner's problem. */ - if (tle->ressortgroupref > 0) + + foreach(sl, sortClause) { - ListCell *sl; + SortGroupClause *sc = (SortGroupClause *) lfirst(sl); - foreach(sl, sortClause) + if (sc->tleSortGroupRef == tle->ressortgroupref) { - SortGroupClause *sc = (SortGroupClause *) lfirst(sl); + SortGroupClause *grpc = copyObject(sc); + if (!toplevel) + grpc->nulls_first = false; + *flatresult = lappend(*flatresult, grpc); + found = true; + break; + } + } + } - if (sc->tleSortGroupRef == tle->ressortgroupref) - { - result = lappend(result, copyObject(sc)); - found = true; + /* + * If no match in ORDER BY, just add it to the result using default + * sort/group semantics. + */ + if (!found) + *flatresult = addTargetToGroupList(pstate, tle, + *flatresult, *targetlist, + exprLocation(gexpr), + true); + + /* + * _something_ must have assigned us a sortgroupref by now... + */ + + return tle->ressortgroupref; +} + +/* + * Transform a list of expressions within a GROUP BY clause or grouping set. + * + * The list of expressions belongs to a single clause within which duplicates + * can be safely eliminated. + * + * Returns an integer list of ressortgroupref values. + * + * flatresult reference to flat list of SortGroupClause nodes + * pstate ParseState + * list nodes to transform + * targetlist reference to TargetEntry list + * sortClause ORDER BY clause (SortGroupClause nodes) + * exprKind expression kind + * useSQL99 SQL99 rather than SQL92 syntax + * toplevel false if within any grouping set + */ +static List * +transformGroupClauseList(List **flatresult, + ParseState *pstate, List *list, + List **targetlist, List *sortClause, + ParseExprKind exprKind, bool useSQL99, bool toplevel) +{ + Bitmapset *seen_local = NULL; + List *result = NIL; + ListCell *gl; + + foreach(gl, list) + { + Node *gexpr = (Node *) lfirst(gl); + + Index ref = transformGroupClauseExpr(flatresult, + seen_local, + pstate, + gexpr, + targetlist, + sortClause, + exprKind, + useSQL99, + toplevel); + if (ref > 0) + { + seen_local = bms_add_member(seen_local, ref); + result = lappend_int(result, ref); + } + } + + return result; +} + +/* + * Transform a grouping set and (recursively) its content. + * + * The grouping set might be a GROUPING SETS node with other grouping sets + * inside it, but SETS within SETS have already been flattened out before + * reaching here. + * + * Returns the transformed node, which now contains SIMPLE nodes with lists + * of ressortgrouprefs rather than expressions. + * + * flatresult reference to flat list of SortGroupClause nodes + * pstate ParseState + * gset grouping set to transform + * targetlist reference to TargetEntry list + * sortClause ORDER BY clause (SortGroupClause nodes) + * exprKind expression kind + * useSQL99 SQL99 rather than SQL92 syntax + * toplevel false if within any grouping set + */ +static Node * +transformGroupingSet(List **flatresult, + ParseState *pstate, GroupingSet *gset, + List **targetlist, List *sortClause, + ParseExprKind exprKind, bool useSQL99, bool toplevel) +{ + ListCell *gl; + List *content = NIL; + + Assert(toplevel || gset->kind != GROUPING_SET_SETS); + + foreach(gl, gset->content) + { + Node *n = lfirst(gl); + + if (IsA(n, List)) + { + List *l = transformGroupClauseList(flatresult, + pstate, (List *) n, + targetlist, sortClause, + exprKind, useSQL99, false); + + content = lappend(content, makeGroupingSet(GROUPING_SET_SIMPLE, + l, + exprLocation(n))); + } + else if (IsA(n, GroupingSet)) + { + GroupingSet *gset2 = (GroupingSet *) lfirst(gl); + + content = lappend(content, transformGroupingSet(flatresult, + pstate, gset2, + targetlist, sortClause, + exprKind, useSQL99, false)); + } + else + { + Index ref = transformGroupClauseExpr(flatresult, + NULL, + pstate, + n, + targetlist, + sortClause, + exprKind, + useSQL99, + false); + + content = lappend(content, makeGroupingSet(GROUPING_SET_SIMPLE, + list_make1_int(ref), + exprLocation(n))); + } + } + + /* Arbitrarily cap the size of CUBE, which has exponential growth */ + if (gset->kind == GROUPING_SET_CUBE) + { + if (list_length(content) > 12) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("CUBE is limited to 12 elements"), + parser_errposition(pstate, gset->location))); + } + + return (Node *) makeGroupingSet(gset->kind, content, gset->location); +} + + +/* + * transformGroupClause - + * transform a GROUP BY clause + * + * GROUP BY items will be added to the targetlist (as resjunk columns) + * if not already present, so the targetlist must be passed by reference. + * + * This is also used for window PARTITION BY clauses (which act almost the + * same, but are always interpreted per SQL99 rules). + * + * Grouping sets make this a lot more complex than it was. Our goal here is + * twofold: we make a flat list of SortGroupClause nodes referencing each + * distinct expression used for grouping, with those expressions added to the + * targetlist if needed. At the same time, we build the groupingSets tree, + * which stores only ressortgrouprefs as integer lists inside GroupingSet nodes + * (possibly nested, but limited in depth: a GROUPING_SET_SETS node can contain + * nested SIMPLE, CUBE or ROLLUP nodes, but not more sets - we flatten that + * out; while CUBE and ROLLUP can contain only SIMPLE nodes). + * + * We skip much of the hard work if there are no grouping sets. + * + * One subtlety is that the groupClause list can end up empty while the + * groupingSets list is not; this happens if there are only empty grouping + * sets, or an explicit GROUP BY (). This has the same effect as specifying + * aggregates or a HAVING clause with no GROUP BY; the output is one row per + * grouping set even if the input is empty. + * + * Returns the transformed (flat) groupClause. + * + * pstate ParseState + * grouplist clause to transform + * groupingSets reference to list to contain the grouping set tree + * targetlist reference to TargetEntry list + * sortClause ORDER BY clause (SortGroupClause nodes) + * exprKind expression kind + * useSQL99 SQL99 rather than SQL92 syntax + */ +List * +transformGroupClause(ParseState *pstate, List *grouplist, List **groupingSets, + List **targetlist, List *sortClause, + ParseExprKind exprKind, bool useSQL99) +{ + List *result = NIL; + List *flat_grouplist; + List *gsets = NIL; + ListCell *gl; + bool hasGroupingSets = false; + Bitmapset *seen_local = NULL; + + /* + * Recursively flatten implicit RowExprs. (Technically this is only + * needed for GROUP BY, per the syntax rules for grouping sets, but + * we do it anyway.) + */ + flat_grouplist = (List *) flatten_grouping_sets((Node *) grouplist, + true, + &hasGroupingSets); + + /* + * If the list is now empty, but hasGroupingSets is true, it's because + * we elided redundant empty grouping sets. Restore a single empty + * grouping set to leave a canonical form: GROUP BY () + */ + + if (flat_grouplist == NIL && hasGroupingSets) + { + flat_grouplist = list_make1(makeGroupingSet(GROUPING_SET_EMPTY, + NIL, + exprLocation((Node *) grouplist))); + } + + foreach(gl, flat_grouplist) + { + Node *gexpr = (Node *) lfirst(gl); + + if (IsA(gexpr, GroupingSet)) + { + GroupingSet *gset = (GroupingSet *) gexpr; + + switch (gset->kind) + { + case GROUPING_SET_EMPTY: + gsets = lappend(gsets, gset); + break; + case GROUPING_SET_SIMPLE: + /* can't happen */ + Assert(false); + break; + case GROUPING_SET_SETS: + case GROUPING_SET_CUBE: + case GROUPING_SET_ROLLUP: + gsets = lappend(gsets, + transformGroupingSet(&result, + pstate, gset, + targetlist, sortClause, + exprKind, useSQL99, true)); break; - } } } + else + { + Index ref = transformGroupClauseExpr(&result, seen_local, + pstate, gexpr, + targetlist, sortClause, + exprKind, useSQL99, true); - /* - * If no match in ORDER BY, just add it to the result using default - * sort/group semantics. - */ - if (!found) - result = addTargetToGroupList(pstate, tle, - result, *targetlist, - exprLocation(gexpr), - true); + if (ref > 0) + { + seen_local = bms_add_member(seen_local, ref); + if (hasGroupingSets) + gsets = lappend(gsets, + makeGroupingSet(GROUPING_SET_SIMPLE, + list_make1_int(ref), + exprLocation(gexpr))); + } + } } + /* parser should prevent this */ + Assert(gsets == NIL || groupingSets != NULL); + + if (groupingSets) + *groupingSets = gsets; + return result; } @@ -1903,6 +2318,7 @@ transformWindowDefinitions(ParseState *pstate, true /* force SQL99 rules */ ); partitionClause = transformGroupClause(pstate, windef->partitionClause, + NULL, targetlist, orderClause, EXPR_KIND_WINDOW_PARTITION, |