/*-------------------------------------------------------------------------
*
* nodeAppend.c
* routines to handle append nodes.
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/executor/nodeAppend.c
*
*-------------------------------------------------------------------------
*/
/* INTERFACE ROUTINES
* ExecInitAppend - initialize the append node
* ExecAppend - retrieve the next tuple from the node
* ExecEndAppend - shut down the append node
* ExecReScanAppend - rescan the append node
*
* NOTES
* Each append node contains a list of one or more subplans which
* must be iteratively processed (forwards or backwards).
* Tuples are retrieved by executing the 'whichplan'th subplan
* until the subplan stops returning tuples, at which point that
* plan is shut down and the next started up.
*
* Append nodes don't make use of their left and right
* subtrees, rather they maintain a list of subplans so
* a typical append node looks like this in the plan tree:
*
* ...
* /
* Append -------+------+------+--- nil
* / \ | | |
* nil nil ... ... ...
* subplans
*
* Append nodes are currently used for unions, and to support
* inheritance queries, where several relations need to be scanned.
* For example, in our standard person/student/employee/student-emp
* example, where student and employee inherit from person
* and student-emp inherits from student and employee, the
* query:
*
* select name from person
*
* generates the plan:
*
* |
* Append -------+-------+--------+--------+
* / \ | | | |
* nil nil Scan Scan Scan Scan
* | | | |
* person employee student student-emp
*/
#include "postgres.h"
#include "executor/execdebug.h"
#include "executor/nodeAppend.h"
#include "miscadmin.h"
/* Shared state for parallel-aware Append. */
struct ParallelAppendState
{
LWLock pa_lock; /* mutual exclusion to choose next subplan */
int pa_next_plan; /* next plan to choose by any worker */
/*
* pa_finished[i] should be true if no more workers should select subplan
* i.  For a non-partial plan, this should be set to true as soon as a
* worker selects the plan; for a partial plan, it remains false until
* some worker executes the plan to completion.
*/
bool pa_finished[FLEXIBLE_ARRAY_MEMBER];
};
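
/*
 * Illustrative example (values assumed for illustration, not taken from the
 * original comments): with four subplans and first_partial_plan = 2,
 * pa_finished[0] and pa_finished[1] become true the moment some process
 * claims those non-partial subplans, so no second process ever runs them;
 * pa_finished[2] and pa_finished[3] remain false until those partial
 * subplans have been executed to completion, so several processes may
 * co-operate on them in the meantime.
 */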
#define INVALID_SUBPLAN_INDEX -1
static TupleTableSlot *ExecAppend(PlanState *pstate);
static bool choose_next_subplan_locally(AppendState *node);
static bool choose_next_subplan_for_leader(AppendState *node);
static bool choose_next_subplan_for_worker(AppendState *node);
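
/*
 * The three choose_next_subplan_* callbacks below share one contract:
 * advance as_whichplan to the next subplan this process should run and
 * return true, or return false when nothing remains.  Which callback is
 * installed depends on whether the Append is parallel-aware and whether
 * the current process is the leader or a parallel worker.
 */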
/* ----------------------------------------------------------------
* ExecInitAppend
*
* Begin all of the subscans of the append node.
*
* (This is potentially wasteful, since the entire result of the
* append node may not be scanned, but this way all of the
* structures get allocated in the executor's top level memory
* block instead of that of the call to ExecAppend.)
* ----------------------------------------------------------------
*/
AppendState *
ExecInitAppend(Append *node, EState *estate, int eflags)
{
AppendState *appendstate = makeNode(AppendState);
PlanState **appendplanstates;
int nplans;
int i;
ListCell *lc;
/* check for unsupported flags */
Assert(!(eflags & EXEC_FLAG_MARK));
/*
* Lock the non-leaf tables in the partition tree controlled by this node.
* It's a no-op for non-partitioned parent tables.
*/
ExecLockNonLeafAppendTables(node->partitioned_rels, estate);
/*
* Set up empty vector of subplan states
*/
nplans = list_length(node->appendplans);
appendplanstates = (PlanState **) palloc0(nplans * sizeof(PlanState *));
/*
* create new AppendState for our append node
*/
appendstate->ps.plan = (Plan *) node;
appendstate->ps.state = estate;
appendstate->ps.ExecProcNode = ExecAppend;
appendstate->appendplans = appendplanstates;
appendstate->as_nplans = nplans;
/*
* Miscellaneous initialization
*
* Append plans don't have expression contexts because they never call
* ExecQual or ExecProject.
*/
/*
* append nodes still have Result slots, which hold pointers to tuples, so
* we have to initialize them.
*/
ExecInitResultTupleSlot(estate, &appendstate->ps);
/*
* call ExecInitNode on each of the plans to be executed and save the
* results into the array "appendplans".
*/
i = 0;
foreach(lc, node->appendplans)
{
Plan *initNode = (Plan *) lfirst(lc);
appendplanstates[i] = ExecInitNode(initNode, estate, eflags);
i++;
}
/*
* initialize output tuple type
*/
ExecAssignResultTypeFromTL(&appendstate->ps);
appendstate->ps.ps_ProjInfo = NULL;
/*
* Parallel-aware append plans must choose the first subplan to execute by
* looking at shared memory, but non-parallel-aware append plans can
* always start with the first subplan.
*/
appendstate->as_whichplan =
appendstate->ps.plan->parallel_aware ? INVALID_SUBPLAN_INDEX : 0;
/* If parallel-aware, this will be overridden later. */
appendstate->choose_next_subplan = choose_next_subplan_locally;
return appendstate;
}
/* ----------------------------------------------------------------
* ExecAppend
*
* Handles iteration over multiple subplans.
* ----------------------------------------------------------------
*/
static TupleTableSlot *
ExecAppend(PlanState *pstate)
{
AppendState *node = castNode(AppendState, pstate);
/* If no subplan has been chosen, we must choose one before proceeding. */
if (node->as_whichplan == INVALID_SUBPLAN_INDEX &&
!node->choose_next_subplan(node))
return ExecClearTuple(node->ps.ps_ResultTupleSlot);
for (;;)
{
PlanState *subnode;
TupleTableSlot *result;
CHECK_FOR_INTERRUPTS();
/*
* figure out which subplan we are currently processing
*/
Assert(node->as_whichplan >= 0 && node->as_whichplan < node->as_nplans);
subnode = node->appendplans[node->as_whichplan];
/*
* get a tuple from the subplan
*/
result = ExecProcNode(subnode);
if (!TupIsNull(result))
{
/*
* If the subplan gave us something then return it as-is. We do
* NOT make use of the result slot that was set up in
* ExecInitAppend; there's no need for it.
*/
return result;
}
/* choose new subplan; if none, we're done */
if (!node->choose_next_subplan(node))
return ExecClearTuple(node->ps.ps_ResultTupleSlot);
}
}
/* ----------------------------------------------------------------
* ExecEndAppend
*
* Shuts down the subscans of the append node.
*
* Returns nothing of interest.
* ----------------------------------------------------------------
*/
void
ExecEndAppend(AppendState *node)
{
PlanState **appendplans;
int nplans;
int i;
/*
* get information from the node
*/
appendplans = node->appendplans;
nplans = node->as_nplans;
/*
* shut down each of the subscans
*/
for (i = 0; i < nplans; i++)
ExecEndNode(appendplans[i]);
}
void
ExecReScanAppend(AppendState *node)
{
int i;
for (i = 0; i < node->as_nplans; i++)
{
PlanState *subnode = node->appendplans[i];
/*
* ExecReScan doesn't know about my subplans, so I have to do
* changed-parameter signaling myself.
*/
if (node->ps.chgParam != NULL)
UpdateChangedParamSet(subnode, node->ps.chgParam);
/*
* If chgParam of the subnode is not null then the subplan will be
* re-scanned by the first ExecProcNode call.
*/
if (subnode->chgParam == NULL)
ExecReScan(subnode);
}
node->as_whichplan =
node->ps.plan->parallel_aware ? INVALID_SUBPLAN_INDEX : 0;
}
/* ----------------------------------------------------------------
* Parallel Append Support
* ----------------------------------------------------------------
*/
/* ----------------------------------------------------------------
* ExecAppendEstimate
*
* Compute the amount of space we'll need in the parallel
* query DSM, and inform pcxt->estimator about our needs.
* ----------------------------------------------------------------
*/
void
ExecAppendEstimate(AppendState *node,
ParallelContext *pcxt)
{
node->pstate_len =
add_size(offsetof(ParallelAppendState, pa_finished),
sizeof(bool) * node->as_nplans);
shm_toc_estimate_chunk(&pcxt->estimator, node->pstate_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
}
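
/*
 * Sizing sketch (illustrative; exact byte counts are platform-dependent):
 * the shared state is the fixed-size header (pa_lock and pa_next_plan)
 * followed by one bool flag per subplan, so for as_nplans = 8 the estimate
 * is offsetof(ParallelAppendState, pa_finished) + 8 * sizeof(bool) bytes,
 * before any alignment padding added by the shm_toc machinery.
 */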
/* ----------------------------------------------------------------
* ExecAppendInitializeDSM
*
* Set up shared state for Parallel Append.
* ----------------------------------------------------------------
*/
void
ExecAppendInitializeDSM(AppendState *node,
ParallelContext *pcxt)
{
ParallelAppendState *pstate;
pstate = shm_toc_allocate(pcxt->toc, node->pstate_len);
memset(pstate, 0, node->pstate_len);
LWLockInitialize(&pstate->pa_lock, LWTRANCHE_PARALLEL_APPEND);
shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, pstate);
node->as_pstate = pstate;
node->choose_next_subplan = choose_next_subplan_for_leader;
}
/* ----------------------------------------------------------------
* ExecAppendReInitializeDSM
*
* Reset shared state before beginning a fresh scan.
* ----------------------------------------------------------------
*/
void
ExecAppendReInitializeDSM(AppendState *node, ParallelContext *pcxt)
{
ParallelAppendState *pstate = node->as_pstate;
pstate->pa_next_plan = 0;
memset(pstate->pa_finished, 0, sizeof(bool) * node->as_nplans);
}
/* ----------------------------------------------------------------
* ExecAppendInitializeWorker
*
* Copy relevant information from TOC into planstate, and initialize
* whatever is required to choose and execute the optimal subplan.
* ----------------------------------------------------------------
*/
void
ExecAppendInitializeWorker(AppendState *node, ParallelWorkerContext *pwcxt)
{
node->as_pstate = shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);
node->choose_next_subplan = choose_next_subplan_for_worker;
}
/* ----------------------------------------------------------------
* choose_next_subplan_locally
*
* Choose next subplan for a non-parallel-aware Append,
* returning false if there are no more.
* ----------------------------------------------------------------
*/
static bool
choose_next_subplan_locally(AppendState *node)
{
int whichplan = node->as_whichplan;
/* We should never see INVALID_SUBPLAN_INDEX in this case. */
Assert(whichplan >= 0 && whichplan <= node->as_nplans);
if (ScanDirectionIsForward(node->ps.state->es_direction))
{
if (whichplan >= node->as_nplans - 1)
return false;
node->as_whichplan++;
}
else
{
if (whichplan <= 0)
return false;
node->as_whichplan--;
}
return true;
}
/* ----------------------------------------------------------------
* choose_next_subplan_for_leader
*
* Try to pick a plan which doesn't commit us to doing much
* work locally, so that as much work as possible is done in
* the workers. Cheapest subplans are at the end.
* ----------------------------------------------------------------
*/
static bool
choose_next_subplan_for_leader(AppendState *node)
{
ParallelAppendState *pstate = node->as_pstate;
Append *append = (Append *) node->ps.plan;
/* Backward scan is not supported by parallel-aware plans */
Assert(ScanDirectionIsForward(node->ps.state->es_direction));
LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE);
if (node->as_whichplan != INVALID_SUBPLAN_INDEX)
{
/* Mark just-completed subplan as finished. */
node->as_pstate->pa_finished[node->as_whichplan] = true;
}
else
{
/* Start with last subplan. */
node->as_whichplan = node->as_nplans - 1;
}
/* Loop until we find a subplan to execute. */
while (pstate->pa_finished[node->as_whichplan])
{
if (node->as_whichplan == 0)
{
pstate->pa_next_plan = INVALID_SUBPLAN_INDEX;
node->as_whichplan = INVALID_SUBPLAN_INDEX;
LWLockRelease(&pstate->pa_lock);
return false;
}
node->as_whichplan--;
}
/* If non-partial, immediately mark as finished. */
if (node->as_whichplan < append->first_partial_plan)
node->as_pstate->pa_finished[node->as_whichplan] = true;
LWLockRelease(&pstate->pa_lock);
return true;
}
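
/*
 * Illustrative walk-through (example values assumed, not from the original
 * source): with subplans 0..3 and first_partial_plan = 2, the leader's
 * first call picks subplan 3, the cheapest one.  Each later call marks the
 * just-completed subplan finished and walks backwards past any entries
 * other processes have already marked finished; a non-partial subplan it
 * lands on is marked finished immediately so no worker duplicates it.
 * When it steps back to subplan 0 and finds that finished as well, it sets
 * pa_next_plan to INVALID_SUBPLAN_INDEX and reports that no subplan
 * remains.
 */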
/* ----------------------------------------------------------------
* choose_next_subplan_for_worker
*
* Choose next subplan for a parallel-aware Append, returning
* false if there are no more.
*
* We start from the first plan and advance through the list;
* when we get back to the end, we loop back to the first
* partial plan. This assigns the non-partial plans first in
* order of descending cost and then spreads out the workers
* as evenly as possible across the remaining partial plans.
* ----------------------------------------------------------------
*/
static bool
choose_next_subplan_for_worker(AppendState *node)
{
ParallelAppendState *pstate = node->as_pstate;
Append *append = (Append *) node->ps.plan;
/* Backward scan is not supported by parallel-aware plans */
Assert(ScanDirectionIsForward(node->ps.state->es_direction));
LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE);
/* Mark just-completed subplan as finished. */
if (node->as_whichplan != INVALID_SUBPLAN_INDEX)
node->as_pstate->pa_finished[node->as_whichplan] = true;
/* If all the plans are already done, we have nothing to do */
if (pstate->pa_next_plan == INVALID_SUBPLAN_INDEX)
{
LWLockRelease(&pstate->pa_lock);
return false;
}
/* Save the plan from which we are starting the search. */
node->as_whichplan = pstate->pa_next_plan;
/* Loop until we find a subplan to execute. */
while (pstate->pa_finished[pstate->pa_next_plan])
{
if (pstate->pa_next_plan < node->as_nplans - 1)
{
/* Advance to next plan. */
pstate->pa_next_plan++;
}
else if (node->as_whichplan > append->first_partial_plan)
{
/* Loop back to first partial plan. */
pstate->pa_next_plan = append->first_partial_plan;
}
else
{
/*
* At last plan, and either there are no partial plans or we've
* tried them all. Arrange to bail out.
*/
pstate->pa_next_plan = node->as_whichplan;
}
if (pstate->pa_next_plan == node->as_whichplan)
{
/* We've tried everything! */
pstate->pa_next_plan = INVALID_SUBPLAN_INDEX;
LWLockRelease(&pstate->pa_lock);
return false;
}
}
/* Pick the plan we found, and advance pa_next_plan one more time. */
node->as_whichplan = pstate->pa_next_plan++;
if (pstate->pa_next_plan >= node->as_nplans)
{
if (append->first_partial_plan < node->as_nplans)
pstate->pa_next_plan = append->first_partial_plan;
else
{
/*
* We have only non-partial plans, and we already chose the last
* one; so arrange for the other workers to immediately bail out.
*/
pstate->pa_next_plan = INVALID_SUBPLAN_INDEX;
}
}
/* If non-partial, immediately mark as finished. */
if (node->as_whichplan < append->first_partial_plan)
node->as_pstate->pa_finished[node->as_whichplan] = true;
LWLockRelease(&pstate->pa_lock);
return true;
}
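
/*
 * Illustrative walk-through (example values assumed, not from the original
 * source): with subplans 0..3 and first_partial_plan = 2, the first
 * arrivals claim the non-partial subplans 0 and 1; each is marked finished
 * as soon as it is claimed, so exactly one process runs it.  Later calls
 * rotate over the partial subplans 2 and 3, with pa_next_plan wrapping
 * from the end of the list back to first_partial_plan so workers spread
 * evenly across them.  When a search comes back around to its starting
 * point with every candidate already finished, pa_next_plan is set to
 * INVALID_SUBPLAN_INDEX and subsequent calls bail out immediately.
 */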