 doc/src/sgml/ref/create_trigger.sgml   | 112
 doc/src/sgml/trigger.sgml              |  54
 src/backend/commands/copy.c            |  10
 src/backend/commands/trigger.c         | 820
 src/backend/executor/README            |   2
 src/backend/executor/execMain.c        |  11
 src/backend/executor/nodeModifyTable.c |  57
 src/include/commands/trigger.h         |  29
 src/include/nodes/execnodes.h          |   4
 src/test/regress/expected/triggers.out |  52
 src/test/regress/sql/triggers.sql      |  42
 11 files changed, 803 insertions(+), 390 deletions(-)
diff --git a/doc/src/sgml/ref/create_trigger.sgml b/doc/src/sgml/ref/create_trigger.sgml
index 18efe6a9ed7..065c8272710 100644
--- a/doc/src/sgml/ref/create_trigger.sgml
+++ b/doc/src/sgml/ref/create_trigger.sgml
@@ -52,7 +52,7 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
trigger will be associated with the specified table, view, or foreign table
and will execute the specified
function <replaceable class="parameter">function_name</replaceable> when
- certain events occur.
+ certain operations are performed on that table.
</para>
<para>
@@ -82,10 +82,7 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
executes once for any given operation, regardless of how many rows
it modifies (in particular, an operation that modifies zero rows
will still result in the execution of any applicable <literal>FOR
- EACH STATEMENT</literal> triggers). Note that with an
- <command>INSERT</command> with an <literal>ON CONFLICT DO UPDATE</>
- clause, both <command>INSERT</command> and
- <command>UPDATE</command> statement level trigger will be fired.
+ EACH STATEMENT</literal> triggers).
</para>
<para>
@@ -174,7 +171,8 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
<firstterm>constraint trigger</>. This is the same as a regular trigger
except that the timing of the trigger firing can be adjusted using
<xref linkend="SQL-SET-CONSTRAINTS">.
- Constraint triggers must be <literal>AFTER ROW</> triggers on tables. They
+ Constraint triggers must be <literal>AFTER ROW</> triggers on plain
+ tables (not foreign tables). They
can be fired either at the end of the statement causing the triggering
event, or at the end of the containing transaction; in the latter case they
are said to be <firstterm>deferred</>. A pending deferred-trigger firing
@@ -184,18 +182,29 @@ CREATE [ CONSTRAINT ] TRIGGER <replaceable class="PARAMETER">name</replaceable>
</para>
<para>
- The <literal>REFERENCING</> option is only allowed for an <literal>AFTER</>
- trigger which is not a constraint trigger. <literal>OLD TABLE</> may only
- be specified once, and only on a trigger which can fire on
- <literal>UPDATE</> or <literal>DELETE</>. <literal>NEW TABLE</> may only
- be specified once, and only on a trigger which can fire on
- <literal>UPDATE</> or <literal>INSERT</>.
+ The <literal>REFERENCING</> option enables collection
+ of <firstterm>transition relations</>, which are row sets that include all
+ of the rows inserted, deleted, or modified by the current SQL statement.
+ This feature lets the trigger see a global view of what the statement did,
+ not just one row at a time. This option is only allowed for
+ an <literal>AFTER</> trigger that is not a constraint trigger; also, if
+ the trigger is an <literal>UPDATE</> trigger, it must not specify
+ a <replaceable class="parameter">column_name</replaceable> list.
+ <literal>OLD TABLE</> may only be specified once, and only for a trigger
+ that can fire on <literal>UPDATE</> or <literal>DELETE</>; it creates a
+ transition relation containing the <firstterm>before-images</> of all rows
+ updated or deleted by the statement.
+ Similarly, <literal>NEW TABLE</> may only be specified once, and only for
+ a trigger that can fire on <literal>UPDATE</> or <literal>INSERT</>;
+ it creates a transition relation containing the <firstterm>after-images</>
+ of all rows updated or inserted by the statement.
</para>
<para>
<command>SELECT</command> does not modify any rows so you cannot
- create <command>SELECT</command> triggers. Rules and views are more
- appropriate in such cases.
+ create <command>SELECT</command> triggers. Rules and views may provide
+ workable solutions to problems that seem to need <command>SELECT</command>
+ triggers.
</para>
<para>
@@ -300,12 +309,9 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
<term><literal>REFERENCING</literal></term>
<listitem>
<para>
- This immediately precedes the declaration of one or two relations which
- can be used to read the before and/or after images of all rows directly
- affected by the triggering statement. An <literal>AFTER EACH ROW</>
- trigger is allowed to use both these transition relation names and the
- row names (<literal>OLD</> and <literal>NEW</>) which reference each
- individual row for which the trigger fires.
+ This keyword immediately precedes the declaration of one or two
+ relation names that provide access to the transition relations of the
+ triggering statement.
</para>
</listitem>
</varlistentry>
@@ -315,8 +321,9 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
<term><literal>NEW TABLE</literal></term>
<listitem>
<para>
- This specifies whether the named relation contains the before or after
- images for rows affected by the statement which fired the trigger.
+ This clause indicates whether the following relation name is for the
+ before-image transition relation or the after-image transition
+ relation.
</para>
</listitem>
</varlistentry>
@@ -325,7 +332,8 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
<term><replaceable class="PARAMETER">transition_relation_name</replaceable></term>
<listitem>
<para>
- The (unqualified) name to be used within the trigger for this relation.
+ The (unqualified) name to be used within the trigger for this
+ transition relation.
</para>
</listitem>
</varlistentry>
@@ -459,6 +467,35 @@ UPDATE OF <replaceable>column_name1</replaceable> [, <replaceable>column_name2</
</para>
<para>
+ In some cases it is possible for a single SQL command to fire more than
+ one kind of trigger. For instance an <command>INSERT</command> with
+ an <literal>ON CONFLICT DO UPDATE</> clause may cause both insert and
+ update operations, so it will fire both kinds of triggers as needed.
+ The transition relations supplied to triggers are
+ specific to their event type; thus an <command>INSERT</command> trigger
+ will see only the inserted rows, while an <command>UPDATE</command>
+ trigger will see only the updated rows.
+ </para>
+
+ <para>
+ Row updates or deletions caused by foreign-key enforcement actions, such
+ as <literal>ON UPDATE CASCADE</> or <literal>ON DELETE SET NULL</>, are
+ treated as part of the SQL command that caused them (note that such
+ actions are never deferred). Relevant triggers on the affected table will
+ be fired, so that this provides another way in which a SQL command might
+ fire triggers not directly matching its type. In simple cases, triggers
+ that request transition relations will see all changes caused in their
+ table by a single original SQL command as a single transition relation.
+ However, there are cases in which the presence of an <literal>AFTER ROW</>
+ trigger that requests transition relations will cause the foreign-key
+ enforcement actions triggered by a single SQL command to be split into
+ multiple steps, each with its own transition relation(s). In such cases,
+ any <literal>AFTER STATEMENT</> triggers that are present will be fired
+ once per creation of a transition relation, ensuring that the triggers see
+ each affected row once and only once.
+ </para>
+
+ <para>
Modifying a partitioned table or a table with inheritance children fires
statement-level triggers directly attached to that table, but not
statement-level triggers for its partitions or child tables. In contrast,
@@ -589,19 +626,30 @@ CREATE TRIGGER paired_items_update
<itemizedlist>
<listitem>
<para>
- While transition tables for <literal>AFTER</> triggers are specified
- using the <literal>REFERENCING</> clause in the standard way, the row
- variables used in <literal>FOR EACH ROW</> triggers may not be
- specified in <literal>REFERENCING</> clause. They are available in a
- manner which is dependent on the language in which the trigger function
- is written. Some languages effectively behave as though there is a
- <literal>REFERENCING</> clause containing <literal>OLD ROW AS OLD NEW
- ROW AS NEW</>.
+ While transition table names for <literal>AFTER</> triggers are
+ specified using the <literal>REFERENCING</> clause in the standard way,
+ the row variables used in <literal>FOR EACH ROW</> triggers may not be
+ specified in a <literal>REFERENCING</> clause. They are available in a
+ manner that is dependent on the language in which the trigger function
+ is written, but is fixed for any one language. Some languages
+ effectively behave as though there is a <literal>REFERENCING</> clause
+ containing <literal>OLD ROW AS OLD NEW ROW AS NEW</>.
</para>
</listitem>
<listitem>
- <para><productname>PostgreSQL</productname> only allows the execution
+ <para>
+ The standard allows transition tables to be used with
+ column-specific <literal>UPDATE</> triggers, but then the set of rows
+ that should be visible in the transition tables depends on the
+ trigger's column list. This is not currently implemented by
+ <productname>PostgreSQL</productname>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <productname>PostgreSQL</productname> only allows the execution
of a user-defined function for the triggered action. The standard
allows the execution of a number of other SQL commands, such as
<command>CREATE TABLE</command>, as the triggered action. This
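As a hypothetical illustration of the REFERENCING syntax documented above
(the names audited_tab, audit_count, and new_rows are invented and not part
of this patch), an AFTER STATEMENT trigger can inspect every row the
statement inserted through its transition table:

    CREATE TABLE audited_tab (id int PRIMARY KEY, val text);

    CREATE FUNCTION audit_count() RETURNS trigger
    LANGUAGE plpgsql AS $$
    BEGIN
        -- new_rows is the after-image transition relation declared below
        RAISE NOTICE 'rows inserted: %', (SELECT count(*) FROM new_rows);
        RETURN NULL;  -- return value is ignored for AFTER STATEMENT triggers
    END;
    $$;

    CREATE TRIGGER audited_tab_ins
        AFTER INSERT ON audited_tab
        REFERENCING NEW TABLE AS new_rows
        FOR EACH STATEMENT
        EXECUTE PROCEDURE audit_count();
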
diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml
index 950245d19a2..a16256056f0 100644
--- a/doc/src/sgml/trigger.sgml
+++ b/doc/src/sgml/trigger.sgml
@@ -41,17 +41,13 @@
On tables and foreign tables, triggers can be defined to execute either
before or after any <command>INSERT</command>, <command>UPDATE</command>,
or <command>DELETE</command> operation, either once per modified row,
- or once per <acronym>SQL</acronym> statement. If an
- <command>INSERT</command> contains an <literal>ON CONFLICT DO UPDATE</>
- clause, it is possible that the effects of a BEFORE insert trigger and
- a BEFORE update trigger can both be applied together, if a reference to
- an <varname>EXCLUDED</> column appears. <command>UPDATE</command>
- triggers can moreover be set to fire only if certain columns are
- mentioned in the <literal>SET</literal> clause of the
- <command>UPDATE</command> statement. Triggers can also fire for
- <command>TRUNCATE</command> statements. If a trigger event occurs,
+ or once per <acronym>SQL</acronym> statement.
+ <command>UPDATE</command> triggers can moreover be set to fire only if
+ certain columns are mentioned in the <literal>SET</literal> clause of
+ the <command>UPDATE</command> statement. Triggers can also fire
+ for <command>TRUNCATE</command> statements. If a trigger event occurs,
the trigger's function is called at the appropriate time to handle the
- event. Foreign tables do not support the TRUNCATE statement at all.
+ event.
</para>
<para>
@@ -97,10 +93,7 @@
two types of triggers are sometimes called <firstterm>row-level</>
triggers and <firstterm>statement-level</> triggers,
respectively. Triggers on <command>TRUNCATE</command> may only be
- defined at statement level. On views, triggers that fire before or
- after may only be defined at statement level, while triggers that fire
- instead of an <command>INSERT</command>, <command>UPDATE</command>,
- or <command>DELETE</command> may only be defined at row level.
+ defined at statement level, not per-row.
</para>
<para>
@@ -117,9 +110,9 @@
operated on, while row-level <literal>AFTER</> triggers fire at the end of
the statement (but before any statement-level <literal>AFTER</> triggers).
These types of triggers may only be defined on non-partitioned tables and
- foreign tables. Row-level <literal>INSTEAD OF</> triggers may only be
- defined on views, and fire immediately as each row in the view is
- identified as needing to be operated on.
+ foreign tables, not views. <literal>INSTEAD OF</> triggers may only be
+ defined on views, and only at row level; they fire immediately as each
+ row in the view is identified as needing to be operated on.
</para>
<para>
@@ -132,18 +125,19 @@
<para>
If an <command>INSERT</command> contains an <literal>ON CONFLICT
- DO UPDATE</> clause, it is possible that the effects of all
- row-level <literal>BEFORE</> <command>INSERT</command> triggers
- and all row-level <literal>BEFORE</literal> <command>UPDATE</command> triggers can
+ DO UPDATE</> clause, it is possible that the effects of
+ row-level <literal>BEFORE</> <command>INSERT</command> triggers and
+ row-level <literal>BEFORE</literal> <command>UPDATE</command> triggers can
both be applied in a way that is apparent from the final state of
the updated row, if an <varname>EXCLUDED</> column is referenced.
There need not be an <varname>EXCLUDED</> column reference for
- both sets of row-level <literal>BEFORE</literal> triggers to execute, though. The
+ both sets of row-level <literal>BEFORE</literal> triggers to execute,
+ though. The
possibility of surprising outcomes should be considered when there
are both <literal>BEFORE</> <command>INSERT</command> and
<literal>BEFORE</> <command>UPDATE</command> row-level triggers
- that both affect a row being inserted/updated (this can still be
- problematic if the modifications are more or less equivalent if
+ that change a row being inserted/updated (this can be
+ problematic even if the modifications are more or less equivalent, if
they're not also idempotent). Note that statement-level
<command>UPDATE</command> triggers are executed when <literal>ON
CONFLICT DO UPDATE</> is specified, regardless of whether or not
@@ -314,8 +308,18 @@
<varname>NEW</varname> row for <command>INSERT</command> and
<command>UPDATE</command> triggers, and/or the <varname>OLD</varname> row
for <command>UPDATE</command> and <command>DELETE</command> triggers.
- Statement-level triggers do not currently have any way to examine the
- individual row(s) modified by the statement.
+ </para>
+
+ <para>
+ By default, statement-level triggers do not have any way to examine the
+ individual row(s) modified by the statement. But an <literal>AFTER
+ STATEMENT</> trigger can request that <firstterm>transition tables</>
+ be created to make the sets of affected rows available to the trigger.
+ <literal>AFTER ROW</> triggers can also request transition tables, so
+ that they can see the total changes in the table as well as the change in
+ the individual row they are currently being fired for. The syntax for
+ examining the transition tables again depends on the programming language
+ that is being used.
</para>
</sect1>
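Continuing the hypothetical example above (audit_delta and the transition
relation names are likewise invented), an UPDATE trigger can request both
transition tables and compare before-images with after-images:

    CREATE FUNCTION audit_delta() RETURNS trigger
    LANGUAGE plpgsql AS $$
    BEGIN
        RAISE NOTICE 'changes: %',
            (SELECT string_agg(o.val || ' -> ' || n.val, ', ')
               FROM old_rows o JOIN new_rows n USING (id));
        RETURN NULL;
    END;
    $$;

    CREATE TRIGGER audited_tab_upd
        AFTER UPDATE ON audited_tab
        REFERENCING OLD TABLE AS old_rows NEW TABLE AS new_rows
        FOR EACH STATEMENT
        EXECUTE PROCEDURE audit_delta();

Per the wording added above, if the UPDATE arises from an INSERT with an
ON CONFLICT DO UPDATE clause, this trigger's transition tables contain only
the rows that were actually updated.
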
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 375a25fbcf8..ad1fcd8d77b 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2429,12 +2429,17 @@ CopyFrom(CopyState cstate)
/* Triggers might need a slot as well */
estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
+ /* Prepare to catch AFTER triggers. */
+ AfterTriggerBeginQuery();
+
/*
* If there are any triggers with transition tables on the named relation,
* we need to be prepared to capture transition tuples.
*/
cstate->transition_capture =
- MakeTransitionCaptureState(cstate->rel->trigdesc);
+ MakeTransitionCaptureState(cstate->rel->trigdesc,
+ RelationGetRelid(cstate->rel),
+ CMD_INSERT);
/*
* If the named relation is a partitioned table, initialize state for
@@ -2510,9 +2515,6 @@ CopyFrom(CopyState cstate)
bufferedTuples = palloc(MAX_BUFFERED_TUPLES * sizeof(HeapTuple));
}
- /* Prepare to catch AFTER triggers. */
- AfterTriggerBeginQuery();
-
/*
* Check BEFORE STATEMENT insertion triggers. It's debatable whether we
* should do this for COPY, since it's not really an "INSERT" statement as
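The CopyFrom() hunks above set up transition-tuple capture before any rows
are loaded, so statement-level AFTER triggers with transition tables also
see rows inserted by COPY. A psql-level sketch, reusing the hypothetical
trigger defined earlier (data values invented):

    COPY audited_tab (id, val) FROM stdin WITH (FORMAT csv);
    3,three
    4,four
    \.
    -- would report: NOTICE:  rows inserted: 2
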
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 269c9e17dd1..7e391a10921 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -234,6 +234,11 @@ CreateTrigger(CreateTrigStmt *stmt, const char *queryString,
RelationGetRelationName(rel)),
errdetail("Foreign tables cannot have TRUNCATE triggers.")));
+ /*
+ * We disallow constraint triggers to protect the assumption that
+ * triggers on FKs can't be deferred. See notes with AfterTriggers
+ * data structures, below.
+ */
if (stmt->isconstraint)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -418,6 +423,26 @@ CreateTrigger(CreateTrigStmt *stmt, const char *queryString,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("transition tables cannot be specified for triggers with more than one event")));
+ /*
+ * We currently don't allow column-specific triggers with
+ * transition tables. Per spec, that seems to require
+ * accumulating separate transition tables for each combination of
+ * columns, which is a lot of work for a rather marginal feature.
+ */
+ if (stmt->columns != NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("transition tables cannot be specified for triggers with column lists")));
+
+ /*
+ * We disallow constraint triggers with transition tables, to
+ * protect the assumption that such triggers can't be deferred.
+ * See notes with AfterTriggers data structures, below.
+ *
+ * Currently this is enforced by the grammar, so just Assert here.
+ */
+ Assert(!stmt->isconstraint);
+
if (tt->isNew)
{
if (!(TRIGGER_FOR_INSERT(tgtype) ||
@@ -2086,96 +2111,6 @@ FindTriggerIncompatibleWithInheritance(TriggerDesc *trigdesc)
}
/*
- * Make a TransitionCaptureState object from a given TriggerDesc. The
- * resulting object holds the flags which control whether transition tuples
- * are collected when tables are modified, and the tuplestores themselves.
- * Note that we copy the flags from a parent table into this struct (rather
- * than using each relation's TriggerDesc directly) so that we can use it to
- * control the collection of transition tuples from child tables.
- *
- * If there are no triggers with transition tables configured for 'trigdesc',
- * then return NULL.
- *
- * The resulting object can be passed to the ExecAR* functions. The caller
- * should set tcs_map or tcs_original_insert_tuple as appropriate when dealing
- * with child tables.
- */
-TransitionCaptureState *
-MakeTransitionCaptureState(TriggerDesc *trigdesc)
-{
- TransitionCaptureState *state = NULL;
-
- if (trigdesc != NULL &&
- (trigdesc->trig_delete_old_table || trigdesc->trig_update_old_table ||
- trigdesc->trig_update_new_table || trigdesc->trig_insert_new_table))
- {
- MemoryContext oldcxt;
- ResourceOwner saveResourceOwner;
-
- /*
- * Normally DestroyTransitionCaptureState should be called after
- * executing all AFTER triggers for the current statement.
- *
- * To handle error cleanup, TransitionCaptureState and the tuplestores
- * it contains will live in the current [sub]transaction's memory
- * context. Likewise for the current resource owner, because we also
- * want to clean up temporary files spilled to disk by the tuplestore
- * in that scenario. This scope is sufficient, because AFTER triggers
- * with transition tables cannot be deferred (only constraint triggers
- * can be deferred, and constraint triggers cannot have transition
- * tables). The AFTER trigger queue may contain pointers to this
- * TransitionCaptureState, but any such entries will be processed or
- * discarded before the end of the current [sub]transaction.
- *
- * If a future release allows deferred triggers with transition
- * tables, we'll need to reconsider the scope of the
- * TransitionCaptureState object.
- */
- oldcxt = MemoryContextSwitchTo(CurTransactionContext);
- saveResourceOwner = CurrentResourceOwner;
-
- state = (TransitionCaptureState *)
- palloc0(sizeof(TransitionCaptureState));
- state->tcs_delete_old_table = trigdesc->trig_delete_old_table;
- state->tcs_update_old_table = trigdesc->trig_update_old_table;
- state->tcs_update_new_table = trigdesc->trig_update_new_table;
- state->tcs_insert_new_table = trigdesc->trig_insert_new_table;
- PG_TRY();
- {
- CurrentResourceOwner = CurTransactionResourceOwner;
- if (trigdesc->trig_delete_old_table || trigdesc->trig_update_old_table)
- state->tcs_old_tuplestore = tuplestore_begin_heap(false, false, work_mem);
- if (trigdesc->trig_insert_new_table)
- state->tcs_insert_tuplestore = tuplestore_begin_heap(false, false, work_mem);
- if (trigdesc->trig_update_new_table)
- state->tcs_update_tuplestore = tuplestore_begin_heap(false, false, work_mem);
- }
- PG_CATCH();
- {
- CurrentResourceOwner = saveResourceOwner;
- PG_RE_THROW();
- }
- PG_END_TRY();
- CurrentResourceOwner = saveResourceOwner;
- MemoryContextSwitchTo(oldcxt);
- }
-
- return state;
-}
-
-void
-DestroyTransitionCaptureState(TransitionCaptureState *tcs)
-{
- if (tcs->tcs_insert_tuplestore != NULL)
- tuplestore_end(tcs->tcs_insert_tuplestore);
- if (tcs->tcs_update_tuplestore != NULL)
- tuplestore_end(tcs->tcs_update_tuplestore);
- if (tcs->tcs_old_tuplestore != NULL)
- tuplestore_end(tcs->tcs_old_tuplestore);
- pfree(tcs);
-}
-
-/*
* Call a trigger function.
*
* trigdata: trigger descriptor.
@@ -3338,9 +3273,11 @@ TriggerEnabled(EState *estate, ResultRelInfo *relinfo,
* during the current transaction tree. (BEFORE triggers are fired
* immediately so we don't need any persistent state about them.) The struct
* and most of its subsidiary data are kept in TopTransactionContext; however
- * the individual event records are kept in a separate sub-context. This is
- * done mainly so that it's easy to tell from a memory context dump how much
- * space is being eaten by trigger events.
+ * some data that can be discarded sooner appears in the CurTransactionContext
+ * of the relevant subtransaction. Also, the individual event records are
+ * kept in a separate sub-context of TopTransactionContext. This is done
+ * mainly so that it's easy to tell from a memory context dump how much space
+ * is being eaten by trigger events.
*
* Because the list of pending events can grow large, we go to some
* considerable effort to minimize per-event memory consumption. The event
@@ -3400,6 +3337,13 @@ typedef SetConstraintStateData *SetConstraintState;
* tuple(s). This permits storing tuples once regardless of the number of
* row-level triggers on a foreign table.
*
+ * Note that we need triggers on foreign tables to be fired in exactly the
+ * order they were queued, so that the tuples come out of the tuplestore in
+ * the right order. To ensure that, we forbid deferrable (constraint)
+ * triggers on foreign tables. This also ensures that such triggers do not
+ * get deferred into outer trigger query levels, meaning that it's okay to
+ * destroy the tuplestore at the end of the query level.
+ *
* Statement-level triggers always bear AFTER_TRIGGER_1CTID, though they
* require no ctid field. We lack the flag bit space to neatly represent that
* distinct case, and it seems unlikely to be worth much trouble.
@@ -3433,7 +3377,7 @@ typedef struct AfterTriggerSharedData
Oid ats_tgoid; /* the trigger's ID */
Oid ats_relid; /* the relation it's on */
CommandId ats_firing_id; /* ID for firing cycle */
- TransitionCaptureState *ats_transition_capture;
+ struct AfterTriggersTableData *ats_table; /* transition table access */
} AfterTriggerSharedData;
typedef struct AfterTriggerEventData *AfterTriggerEvent;
@@ -3505,6 +3449,14 @@ typedef struct AfterTriggerEventList
#define for_each_event_chunk(eptr, cptr, evtlist) \
for_each_chunk(cptr, evtlist) for_each_event(eptr, cptr)
+/* Macros for iterating from a start point that might not be list start */
+#define for_each_chunk_from(cptr) \
+ for (; cptr != NULL; cptr = cptr->next)
+#define for_each_event_from(eptr, cptr) \
+ for (; \
+ (char *) eptr < (cptr)->freeptr; \
+ eptr = (AfterTriggerEvent) (((char *) eptr) + SizeofTriggerEvent(eptr)))
+
/*
* All per-transaction data for the AFTER TRIGGERS module.
@@ -3529,60 +3481,107 @@ typedef struct AfterTriggerEventList
* query_depth is the current depth of nested AfterTriggerBeginQuery calls
* (-1 when the stack is empty).
*
- * query_stack[query_depth] is a list of AFTER trigger events queued by the
- * current query (and the query_stack entries below it are lists of trigger
- * events queued by calling queries). None of these are valid until the
- * matching AfterTriggerEndQuery call occurs. At that point we fire
- * immediate-mode triggers, and append any deferred events to the main events
- * list.
+ * query_stack[query_depth] is the per-query-level data, including these fields:
+ *
+ * events is a list of AFTER trigger events queued by the current query.
+ * None of these are valid until the matching AfterTriggerEndQuery call
+ * occurs. At that point we fire immediate-mode triggers, and append any
+ * deferred events to the main events list.
*
- * fdw_tuplestores[query_depth] is a tuplestore containing the foreign tuples
- * needed for the current query.
+ * fdw_tuplestore is a tuplestore containing the foreign-table tuples
+ * needed by events queued by the current query. (Note: we use just one
+ * tuplestore even though more than one foreign table might be involved.
+ * This is okay because tuplestores don't really care what's in the tuples
+ * they store; but it's possible that someday it'd break.)
*
- * maxquerydepth is just the allocated length of query_stack and the
- * tuplestores.
+ * tables is a List of AfterTriggersTableData structs for target tables
+ * of the current query (see below).
*
- * state_stack is a stack of pointers to saved copies of the SET CONSTRAINTS
- * state data; each subtransaction level that modifies that state first
+ * maxquerydepth is just the allocated length of query_stack.
+ *
+ * trans_stack holds per-subtransaction data, including these fields:
+ *
+ * state is NULL or a pointer to a saved copy of the SET CONSTRAINTS
+ * state data. Each subtransaction level that modifies that state first
* saves a copy, which we use to restore the state if we abort.
*
- * events_stack is a stack of copies of the events head/tail pointers,
+ * events is a copy of the events head/tail pointers,
* which we use to restore those values during subtransaction abort.
*
- * depth_stack is a stack of copies of subtransaction-start-time query_depth,
+ * query_depth is the subtransaction-start-time value of query_depth,
* which we similarly use to clean up at subtransaction abort.
*
- * firing_stack is a stack of copies of subtransaction-start-time
- * firing_counter. We use this to recognize which deferred triggers were
- * fired (or marked for firing) within an aborted subtransaction.
+ * firing_counter is the subtransaction-start-time value of firing_counter.
+ * We use this to recognize which deferred triggers were fired (or marked
+ * for firing) within an aborted subtransaction.
*
* We use GetCurrentTransactionNestLevel() to determine the correct array
- * index in these stacks. maxtransdepth is the number of allocated entries in
- * each stack. (By not keeping our own stack pointer, we can avoid trouble
+ * index in trans_stack. maxtransdepth is the number of allocated entries in
+ * trans_stack. (By not keeping our own stack pointer, we can avoid trouble
* in cases where errors during subxact abort cause multiple invocations
* of AfterTriggerEndSubXact() at the same nesting depth.)
+ *
+ * We create an AfterTriggersTableData struct for each target table of the
+ * current query, and each operation mode (INSERT/UPDATE/DELETE), that has
+ * either transition tables or AFTER STATEMENT triggers. This is used to
+ * hold the relevant transition tables, as well as info tracking whether
+ * we already queued the AFTER STATEMENT triggers. (We use that info to
+ * prevent, as much as possible, firing the same AFTER STATEMENT trigger
+ * more than once per statement.) These structs, along with the transition
+ * table tuplestores, live in the (sub)transaction's CurTransactionContext.
+ * That's sufficient lifespan because we don't allow transition tables to be
+ * used by deferrable triggers, so they only need to survive until
+ * AfterTriggerEndQuery.
*/
+typedef struct AfterTriggersQueryData AfterTriggersQueryData;
+typedef struct AfterTriggersTransData AfterTriggersTransData;
+typedef struct AfterTriggersTableData AfterTriggersTableData;
+
typedef struct AfterTriggersData
{
CommandId firing_counter; /* next firing ID to assign */
SetConstraintState state; /* the active S C state */
AfterTriggerEventList events; /* deferred-event list */
- int query_depth; /* current query list index */
- AfterTriggerEventList *query_stack; /* events pending from each query */
- Tuplestorestate **fdw_tuplestores; /* foreign tuples for one row from
- * each query */
- int maxquerydepth; /* allocated len of above array */
MemoryContext event_cxt; /* memory context for events, if any */
- /* these fields are just for resetting at subtrans abort: */
+ /* per-query-level data: */
+ AfterTriggersQueryData *query_stack; /* array of structs shown below */
+ int query_depth; /* current index in above array */
+ int maxquerydepth; /* allocated len of above array */
- SetConstraintState *state_stack; /* stacked S C states */
- AfterTriggerEventList *events_stack; /* stacked list pointers */
- int *depth_stack; /* stacked query_depths */
- CommandId *firing_stack; /* stacked firing_counters */
- int maxtransdepth; /* allocated len of above arrays */
+ /* per-subtransaction-level data: */
+ AfterTriggersTransData *trans_stack; /* array of structs shown below */
+ int maxtransdepth; /* allocated len of above array */
} AfterTriggersData;
+struct AfterTriggersQueryData
+{
+ AfterTriggerEventList events; /* events pending from this query */
+ Tuplestorestate *fdw_tuplestore; /* foreign tuples for said events */
+ List *tables; /* list of AfterTriggersTableData, see below */
+};
+
+struct AfterTriggersTransData
+{
+ /* these fields are just for resetting at subtrans abort: */
+ SetConstraintState state; /* saved S C state, or NULL if not yet saved */
+ AfterTriggerEventList events; /* saved list pointer */
+ int query_depth; /* saved query_depth */
+ CommandId firing_counter; /* saved firing_counter */
+};
+
+struct AfterTriggersTableData
+{
+ /* relid + cmdType form the lookup key for these structs: */
+ Oid relid; /* target table's OID */
+ CmdType cmdType; /* event type, CMD_INSERT/UPDATE/DELETE */
+ bool closed; /* true when no longer OK to add tuples */
+ bool stmt_trig_done; /* did we already queue stmt-level triggers? */
+ AfterTriggerEventList stmt_trig_events; /* if so, saved list pointer */
+ Tuplestorestate *old_tuplestore; /* "old" transition table, if any */
+ Tuplestorestate *new_tuplestore; /* "new" transition table, if any */
+};
+
static AfterTriggersData afterTriggers;
static void AfterTriggerExecute(AfterTriggerEvent event,
@@ -3591,38 +3590,41 @@ static void AfterTriggerExecute(AfterTriggerEvent event,
Instrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
- TupleTableSlot *trig_tuple_slot2,
- TransitionCaptureState *transition_capture);
+ TupleTableSlot *trig_tuple_slot2);
+static AfterTriggersTableData *GetAfterTriggersTableData(Oid relid,
+ CmdType cmdType);
+static void AfterTriggerFreeQuery(AfterTriggersQueryData *qs);
static SetConstraintState SetConstraintStateCreate(int numalloc);
static SetConstraintState SetConstraintStateCopy(SetConstraintState state);
static SetConstraintState SetConstraintStateAddItem(SetConstraintState state,
Oid tgoid, bool tgisdeferred);
+static void cancel_prior_stmt_triggers(Oid relid, CmdType cmdType, int tgevent);
/*
- * Gets a current query transition tuplestore and initializes it if necessary.
+ * Get the FDW tuplestore for the current trigger query level, creating it
+ * if necessary.
*/
static Tuplestorestate *
-GetTriggerTransitionTuplestore(Tuplestorestate **tss)
+GetCurrentFDWTuplestore(void)
{
Tuplestorestate *ret;
- ret = tss[afterTriggers.query_depth];
+ ret = afterTriggers.query_stack[afterTriggers.query_depth].fdw_tuplestore;
if (ret == NULL)
{
MemoryContext oldcxt;
ResourceOwner saveResourceOwner;
/*
- * Make the tuplestore valid until end of transaction. This is the
- * allocation lifespan of the associated events list, but we really
+ * Make the tuplestore valid until end of subtransaction. We really
* only need it until AfterTriggerEndQuery().
*/
- oldcxt = MemoryContextSwitchTo(TopTransactionContext);
+ oldcxt = MemoryContextSwitchTo(CurTransactionContext);
saveResourceOwner = CurrentResourceOwner;
PG_TRY();
{
- CurrentResourceOwner = TopTransactionResourceOwner;
+ CurrentResourceOwner = CurTransactionResourceOwner;
ret = tuplestore_begin_heap(false, false, work_mem);
}
PG_CATCH();
@@ -3634,7 +3636,7 @@ GetTriggerTransitionTuplestore(Tuplestorestate **tss)
CurrentResourceOwner = saveResourceOwner;
MemoryContextSwitchTo(oldcxt);
- tss[afterTriggers.query_depth] = ret;
+ afterTriggers.query_stack[afterTriggers.query_depth].fdw_tuplestore = ret;
}
return ret;
@@ -3780,7 +3782,7 @@ afterTriggerAddEvent(AfterTriggerEventList *events,
if (newshared->ats_tgoid == evtshared->ats_tgoid &&
newshared->ats_relid == evtshared->ats_relid &&
newshared->ats_event == evtshared->ats_event &&
- newshared->ats_transition_capture == evtshared->ats_transition_capture &&
+ newshared->ats_table == evtshared->ats_table &&
newshared->ats_firing_id == 0)
break;
}
@@ -3892,8 +3894,7 @@ AfterTriggerExecute(AfterTriggerEvent event,
FmgrInfo *finfo, Instrumentation *instr,
MemoryContext per_tuple_context,
TupleTableSlot *trig_tuple_slot1,
- TupleTableSlot *trig_tuple_slot2,
- TransitionCaptureState *transition_capture)
+ TupleTableSlot *trig_tuple_slot2)
{
AfterTriggerShared evtshared = GetTriggerSharedData(event);
Oid tgoid = evtshared->ats_tgoid;
@@ -3934,9 +3935,7 @@ AfterTriggerExecute(AfterTriggerEvent event,
{
case AFTER_TRIGGER_FDW_FETCH:
{
- Tuplestorestate *fdw_tuplestore =
- GetTriggerTransitionTuplestore
- (afterTriggers.fdw_tuplestores);
+ Tuplestorestate *fdw_tuplestore = GetCurrentFDWTuplestore();
if (!tuplestore_gettupleslot(fdw_tuplestore, true, false,
trig_tuple_slot1))
@@ -4006,36 +4005,25 @@ AfterTriggerExecute(AfterTriggerEvent event,
}
/*
- * Set up the tuplestore information.
+ * Set up the tuplestore information to let the trigger have access to
+ * transition tables. When we first make a transition table available to
+ * a trigger, mark it "closed" so that it cannot change anymore. If any
+ * additional events of the same type get queued in the current trigger
+ * query level, they'll go into new transition tables.
*/
LocTriggerData.tg_oldtable = LocTriggerData.tg_newtable = NULL;
- if (transition_capture != NULL)
+ if (evtshared->ats_table)
{
if (LocTriggerData.tg_trigger->tgoldtable)
- LocTriggerData.tg_oldtable = transition_capture->tcs_old_tuplestore;
- if (LocTriggerData.tg_trigger->tgnewtable)
{
- /*
- * Currently a trigger with transition tables may only be defined
- * for a single event type (here AFTER INSERT or AFTER UPDATE, but
- * not AFTER INSERT OR ...).
- */
- Assert((TRIGGER_FOR_INSERT(LocTriggerData.tg_trigger->tgtype) != 0) ^
- (TRIGGER_FOR_UPDATE(LocTriggerData.tg_trigger->tgtype) != 0));
+ LocTriggerData.tg_oldtable = evtshared->ats_table->old_tuplestore;
+ evtshared->ats_table->closed = true;
+ }
- /*
- * Show either the insert or update new tuple images, depending on
- * which event type the trigger was registered for. A single
- * statement may have produced both in the case of INSERT ... ON
- * CONFLICT ... DO UPDATE, and in that case the event determines
- * which tuplestore the trigger sees as the NEW TABLE.
- */
- if (TRIGGER_FOR_INSERT(LocTriggerData.tg_trigger->tgtype))
- LocTriggerData.tg_newtable =
- transition_capture->tcs_insert_tuplestore;
- else
- LocTriggerData.tg_newtable =
- transition_capture->tcs_update_tuplestore;
+ if (LocTriggerData.tg_trigger->tgnewtable)
+ {
+ LocTriggerData.tg_newtable = evtshared->ats_table->new_tuplestore;
+ evtshared->ats_table->closed = true;
}
}
@@ -4245,8 +4233,7 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
* won't try to re-fire it.
*/
AfterTriggerExecute(event, rel, trigdesc, finfo, instr,
- per_tuple_context, slot1, slot2,
- evtshared->ats_transition_capture);
+ per_tuple_context, slot1, slot2);
/*
* Mark the event as done.
@@ -4296,6 +4283,166 @@ afterTriggerInvokeEvents(AfterTriggerEventList *events,
}
+/*
+ * GetAfterTriggersTableData
+ *
+ * Find or create an AfterTriggersTableData struct for the specified
+ * trigger event (relation + operation type). Ignore existing structs
+ * marked "closed"; we don't want to put any additional tuples into them,
+ * nor change their stmt-triggers-fired state.
+ *
+ * Note: the AfterTriggersTableData list is allocated in the current
+ * (sub)transaction's CurTransactionContext. This is OK because
+ * we don't need it to live past AfterTriggerEndQuery.
+ */
+static AfterTriggersTableData *
+GetAfterTriggersTableData(Oid relid, CmdType cmdType)
+{
+ AfterTriggersTableData *table;
+ AfterTriggersQueryData *qs;
+ MemoryContext oldcxt;
+ ListCell *lc;
+
+ /* Caller should have ensured query_depth is OK. */
+ Assert(afterTriggers.query_depth >= 0 &&
+ afterTriggers.query_depth < afterTriggers.maxquerydepth);
+ qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+
+ foreach(lc, qs->tables)
+ {
+ table = (AfterTriggersTableData *) lfirst(lc);
+ if (table->relid == relid && table->cmdType == cmdType &&
+ !table->closed)
+ return table;
+ }
+
+ oldcxt = MemoryContextSwitchTo(CurTransactionContext);
+
+ table = (AfterTriggersTableData *) palloc0(sizeof(AfterTriggersTableData));
+ table->relid = relid;
+ table->cmdType = cmdType;
+ qs->tables = lappend(qs->tables, table);
+
+ MemoryContextSwitchTo(oldcxt);
+
+ return table;
+}
+
+
+/*
+ * MakeTransitionCaptureState
+ *
+ * Make a TransitionCaptureState object for the given TriggerDesc, target
+ * relation, and operation type. The TCS object holds all the state needed
+ * to decide whether to capture tuples in transition tables.
+ *
+ * If there are no triggers in 'trigdesc' that request relevant transition
+ * tables, then return NULL.
+ *
+ * The resulting object can be passed to the ExecAR* functions. The caller
+ * should set tcs_map or tcs_original_insert_tuple as appropriate when dealing
+ * with child tables.
+ *
+ * Note that we copy the flags from a parent table into this struct (rather
+ * than subsequently using the relation's TriggerDesc directly) so that we can
+ * use it to control collection of transition tuples from child tables.
+ *
+ * Per SQL spec, all operations of the same kind (INSERT/UPDATE/DELETE)
+ * on the same table during one query should share one transition table.
+ * Therefore, the Tuplestores are owned by an AfterTriggersTableData struct
+ * looked up using the table OID + CmdType, and are merely referenced by
+ * the TransitionCaptureState objects we hand out to callers.
+ */
+TransitionCaptureState *
+MakeTransitionCaptureState(TriggerDesc *trigdesc, Oid relid, CmdType cmdType)
+{
+ TransitionCaptureState *state;
+ bool need_old,
+ need_new;
+ AfterTriggersTableData *table;
+ MemoryContext oldcxt;
+ ResourceOwner saveResourceOwner;
+
+ if (trigdesc == NULL)
+ return NULL;
+
+ /* Detect which table(s) we need. */
+ switch (cmdType)
+ {
+ case CMD_INSERT:
+ need_old = false;
+ need_new = trigdesc->trig_insert_new_table;
+ break;
+ case CMD_UPDATE:
+ need_old = trigdesc->trig_update_old_table;
+ need_new = trigdesc->trig_update_new_table;
+ break;
+ case CMD_DELETE:
+ need_old = trigdesc->trig_delete_old_table;
+ need_new = false;
+ break;
+ default:
+ elog(ERROR, "unexpected CmdType: %d", (int) cmdType);
+ need_old = need_new = false; /* keep compiler quiet */
+ break;
+ }
+ if (!need_old && !need_new)
+ return NULL;
+
+ /* Check state, like AfterTriggerSaveEvent. */
+ if (afterTriggers.query_depth < 0)
+ elog(ERROR, "MakeTransitionCaptureState() called outside of query");
+
+ /* Be sure we have enough space to record events at this query depth. */
+ if (afterTriggers.query_depth >= afterTriggers.maxquerydepth)
+ AfterTriggerEnlargeQueryState();
+
+ /*
+ * Find or create an AfterTriggersTableData struct to hold the
+ * tuplestore(s). If there's a matching struct but it's marked closed,
+ * ignore it; we need a newer one.
+ *
+ * Note: the AfterTriggersTableData list, as well as the tuplestores, are
+ * allocated in the current (sub)transaction's CurTransactionContext, and
+ * the tuplestores are managed by the (sub)transaction's resource owner.
+ * This is sufficient lifespan because we do not allow triggers using
+ * transition tables to be deferrable; they will be fired during
+ * AfterTriggerEndQuery, after which it's okay to delete the data.
+ */
+ table = GetAfterTriggersTableData(relid, cmdType);
+
+ /* Now create required tuplestore(s), if we don't have them already. */
+ oldcxt = MemoryContextSwitchTo(CurTransactionContext);
+ saveResourceOwner = CurrentResourceOwner;
+ PG_TRY();
+ {
+ CurrentResourceOwner = CurTransactionResourceOwner;
+ if (need_old && table->old_tuplestore == NULL)
+ table->old_tuplestore = tuplestore_begin_heap(false, false, work_mem);
+ if (need_new && table->new_tuplestore == NULL)
+ table->new_tuplestore = tuplestore_begin_heap(false, false, work_mem);
+ }
+ PG_CATCH();
+ {
+ CurrentResourceOwner = saveResourceOwner;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+ CurrentResourceOwner = saveResourceOwner;
+ MemoryContextSwitchTo(oldcxt);
+
+ /* Now build the TransitionCaptureState struct, in caller's context */
+ state = (TransitionCaptureState *) palloc0(sizeof(TransitionCaptureState));
+ state->tcs_delete_old_table = trigdesc->trig_delete_old_table;
+ state->tcs_update_old_table = trigdesc->trig_update_old_table;
+ state->tcs_update_new_table = trigdesc->trig_update_new_table;
+ state->tcs_insert_new_table = trigdesc->trig_insert_new_table;
+ state->tcs_private = table;
+
+ return state;
+}
+
+
/* ----------
* AfterTriggerBeginXact()
*
@@ -4319,14 +4466,10 @@ AfterTriggerBeginXact(void)
*/
Assert(afterTriggers.state == NULL);
Assert(afterTriggers.query_stack == NULL);
- Assert(afterTriggers.fdw_tuplestores == NULL);
Assert(afterTriggers.maxquerydepth == 0);
Assert(afterTriggers.event_cxt == NULL);
Assert(afterTriggers.events.head == NULL);
- Assert(afterTriggers.state_stack == NULL);
- Assert(afterTriggers.events_stack == NULL);
- Assert(afterTriggers.depth_stack == NULL);
- Assert(afterTriggers.firing_stack == NULL);
+ Assert(afterTriggers.trans_stack == NULL);
Assert(afterTriggers.maxtransdepth == 0);
}
@@ -4362,9 +4505,6 @@ AfterTriggerBeginQuery(void)
void
AfterTriggerEndQuery(EState *estate)
{
- AfterTriggerEventList *events;
- Tuplestorestate *fdw_tuplestore;
-
/* Must be inside a query, too */
Assert(afterTriggers.query_depth >= 0);
@@ -4393,38 +4533,86 @@ AfterTriggerEndQuery(EState *estate)
* will instead fire any triggers in a dedicated query level. Foreign key
* enforcement triggers do add to the current query level, thanks to their
* passing fire_triggers = false to SPI_execute_snapshot(). Other
- * C-language triggers might do likewise. Be careful here: firing a
- * trigger could result in query_stack being repalloc'd, so we can't save
- * its address across afterTriggerInvokeEvents calls.
+ * C-language triggers might do likewise.
*
* If we find no firable events, we don't have to increment
* firing_counter.
*/
for (;;)
{
- events = &afterTriggers.query_stack[afterTriggers.query_depth];
- if (afterTriggerMarkEvents(events, &afterTriggers.events, true))
+ AfterTriggersQueryData *qs;
+
+ /*
+ * Firing a trigger could result in query_stack being repalloc'd, so
+ * we must recalculate qs after each afterTriggerInvokeEvents call.
+ */
+ qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+
+ if (afterTriggerMarkEvents(&qs->events, &afterTriggers.events, true))
{
CommandId firing_id = afterTriggers.firing_counter++;
/* OK to delete the immediate events after processing them */
- if (afterTriggerInvokeEvents(events, firing_id, estate, true))
+ if (afterTriggerInvokeEvents(&qs->events, firing_id, estate, true))
break; /* all fired */
}
else
break;
}
- /* Release query-local storage for events, including tuplestore if any */
- fdw_tuplestore = afterTriggers.fdw_tuplestores[afterTriggers.query_depth];
- if (fdw_tuplestore)
+ /* Release query-level-local storage, including tuplestores if any */
+ AfterTriggerFreeQuery(&afterTriggers.query_stack[afterTriggers.query_depth]);
+
+ afterTriggers.query_depth--;
+}
+
+
+/*
+ * AfterTriggerFreeQuery
+ * Release subsidiary storage for a trigger query level.
+ * This includes closing down tuplestores.
+ * Note: it's important for this to be safe if interrupted by an error
+ * and then called again for the same query level.
+ */
+static void
+AfterTriggerFreeQuery(AfterTriggersQueryData *qs)
+{
+ Tuplestorestate *ts;
+ List *tables;
+ ListCell *lc;
+
+ /* Drop the trigger events */
+ afterTriggerFreeEventList(&qs->events);
+
+ /* Drop FDW tuplestore if any */
+ ts = qs->fdw_tuplestore;
+ qs->fdw_tuplestore = NULL;
+ if (ts)
+ tuplestore_end(ts);
+
+ /* Release per-table subsidiary storage */
+ tables = qs->tables;
+ foreach(lc, tables)
{
- tuplestore_end(fdw_tuplestore);
- afterTriggers.fdw_tuplestores[afterTriggers.query_depth] = NULL;
+ AfterTriggersTableData *table = (AfterTriggersTableData *) lfirst(lc);
+
+ ts = table->old_tuplestore;
+ table->old_tuplestore = NULL;
+ if (ts)
+ tuplestore_end(ts);
+ ts = table->new_tuplestore;
+ table->new_tuplestore = NULL;
+ if (ts)
+ tuplestore_end(ts);
}
- afterTriggerFreeEventList(&afterTriggers.query_stack[afterTriggers.query_depth]);
- afterTriggers.query_depth--;
+ /*
+ * Now free the AfterTriggersTableData structs and list cells. Reset list
+ * pointer first; if list_free_deep somehow gets an error, better to leak
+ * that storage than have an infinite loop.
+ */
+ qs->tables = NIL;
+ list_free_deep(tables);
}
@@ -4521,10 +4709,7 @@ AfterTriggerEndXact(bool isCommit)
* large, we let the eventual reset of TopTransactionContext free the
* memory instead of doing it here.
*/
- afterTriggers.state_stack = NULL;
- afterTriggers.events_stack = NULL;
- afterTriggers.depth_stack = NULL;
- afterTriggers.firing_stack = NULL;
+ afterTriggers.trans_stack = NULL;
afterTriggers.maxtransdepth = 0;
@@ -4534,7 +4719,6 @@ AfterTriggerEndXact(bool isCommit)
* memory here.
*/
afterTriggers.query_stack = NULL;
- afterTriggers.fdw_tuplestores = NULL;
afterTriggers.maxquerydepth = 0;
afterTriggers.state = NULL;
@@ -4553,48 +4737,28 @@ AfterTriggerBeginSubXact(void)
int my_level = GetCurrentTransactionNestLevel();
/*
- * Allocate more space in the stacks if needed. (Note: because the
+ * Allocate more space in the trans_stack if needed. (Note: because the
* minimum nest level of a subtransaction is 2, we waste the first couple
- * entries of each array; not worth the notational effort to avoid it.)
+ * entries of the array; not worth the notational effort to avoid it.)
*/
while (my_level >= afterTriggers.maxtransdepth)
{
if (afterTriggers.maxtransdepth == 0)
{
- MemoryContext old_cxt;
-
- old_cxt = MemoryContextSwitchTo(TopTransactionContext);
-
-#define DEFTRIG_INITALLOC 8
- afterTriggers.state_stack = (SetConstraintState *)
- palloc(DEFTRIG_INITALLOC * sizeof(SetConstraintState));
- afterTriggers.events_stack = (AfterTriggerEventList *)
- palloc(DEFTRIG_INITALLOC * sizeof(AfterTriggerEventList));
- afterTriggers.depth_stack = (int *)
- palloc(DEFTRIG_INITALLOC * sizeof(int));
- afterTriggers.firing_stack = (CommandId *)
- palloc(DEFTRIG_INITALLOC * sizeof(CommandId));
- afterTriggers.maxtransdepth = DEFTRIG_INITALLOC;
-
- MemoryContextSwitchTo(old_cxt);
+ /* Arbitrarily initialize for max of 8 subtransaction levels */
+ afterTriggers.trans_stack = (AfterTriggersTransData *)
+ MemoryContextAlloc(TopTransactionContext,
+ 8 * sizeof(AfterTriggersTransData));
+ afterTriggers.maxtransdepth = 8;
}
else
{
- /* repalloc will keep the stacks in the same context */
+ /* repalloc will keep the stack in the same context */
int new_alloc = afterTriggers.maxtransdepth * 2;
- afterTriggers.state_stack = (SetConstraintState *)
- repalloc(afterTriggers.state_stack,
- new_alloc * sizeof(SetConstraintState));
- afterTriggers.events_stack = (AfterTriggerEventList *)
- repalloc(afterTriggers.events_stack,
- new_alloc * sizeof(AfterTriggerEventList));
- afterTriggers.depth_stack = (int *)
- repalloc(afterTriggers.depth_stack,
- new_alloc * sizeof(int));
- afterTriggers.firing_stack = (CommandId *)
- repalloc(afterTriggers.firing_stack,
- new_alloc * sizeof(CommandId));
+ afterTriggers.trans_stack = (AfterTriggersTransData *)
+ repalloc(afterTriggers.trans_stack,
+ new_alloc * sizeof(AfterTriggersTransData));
afterTriggers.maxtransdepth = new_alloc;
}
}
@@ -4604,10 +4768,10 @@ AfterTriggerBeginSubXact(void)
* is not saved until/unless changed. Likewise, we don't make a
* per-subtransaction event context until needed.
*/
- afterTriggers.state_stack[my_level] = NULL;
- afterTriggers.events_stack[my_level] = afterTriggers.events;
- afterTriggers.depth_stack[my_level] = afterTriggers.query_depth;
- afterTriggers.firing_stack[my_level] = afterTriggers.firing_counter;
+ afterTriggers.trans_stack[my_level].state = NULL;
+ afterTriggers.trans_stack[my_level].events = afterTriggers.events;
+ afterTriggers.trans_stack[my_level].query_depth = afterTriggers.query_depth;
+ afterTriggers.trans_stack[my_level].firing_counter = afterTriggers.firing_counter;
}
/*
@@ -4631,70 +4795,58 @@ AfterTriggerEndSubXact(bool isCommit)
{
Assert(my_level < afterTriggers.maxtransdepth);
/* If we saved a prior state, we don't need it anymore */
- state = afterTriggers.state_stack[my_level];
+ state = afterTriggers.trans_stack[my_level].state;
if (state != NULL)
pfree(state);
/* this avoids double pfree if error later: */
- afterTriggers.state_stack[my_level] = NULL;
+ afterTriggers.trans_stack[my_level].state = NULL;
Assert(afterTriggers.query_depth ==
- afterTriggers.depth_stack[my_level]);
+ afterTriggers.trans_stack[my_level].query_depth);
}
else
{
/*
* Aborting. It is possible subxact start failed before calling
* AfterTriggerBeginSubXact, in which case we mustn't risk touching
- * stack levels that aren't there.
+ * trans_stack levels that aren't there.
*/
if (my_level >= afterTriggers.maxtransdepth)
return;
/*
- * Release any event lists from queries being aborted, and restore
+ * Release query-level storage for queries being aborted, and restore
* query_depth to its pre-subxact value. This assumes that a
* subtransaction will not add events to query levels started in a
* earlier transaction state.
*/
- while (afterTriggers.query_depth > afterTriggers.depth_stack[my_level])
+ while (afterTriggers.query_depth > afterTriggers.trans_stack[my_level].query_depth)
{
if (afterTriggers.query_depth < afterTriggers.maxquerydepth)
- {
- Tuplestorestate *ts;
-
- ts = afterTriggers.fdw_tuplestores[afterTriggers.query_depth];
- if (ts)
- {
- tuplestore_end(ts);
- afterTriggers.fdw_tuplestores[afterTriggers.query_depth] = NULL;
- }
-
- afterTriggerFreeEventList(&afterTriggers.query_stack[afterTriggers.query_depth]);
- }
-
+ AfterTriggerFreeQuery(&afterTriggers.query_stack[afterTriggers.query_depth]);
afterTriggers.query_depth--;
}
Assert(afterTriggers.query_depth ==
- afterTriggers.depth_stack[my_level]);
+ afterTriggers.trans_stack[my_level].query_depth);
/*
* Restore the global deferred-event list to its former length,
* discarding any events queued by the subxact.
*/
afterTriggerRestoreEventList(&afterTriggers.events,
- &afterTriggers.events_stack[my_level]);
+ &afterTriggers.trans_stack[my_level].events);
/*
* Restore the trigger state. If the saved state is NULL, then this
* subxact didn't save it, so it doesn't need restoring.
*/
- state = afterTriggers.state_stack[my_level];
+ state = afterTriggers.trans_stack[my_level].state;
if (state != NULL)
{
pfree(afterTriggers.state);
afterTriggers.state = state;
}
/* this avoids double pfree if error later: */
- afterTriggers.state_stack[my_level] = NULL;
+ afterTriggers.trans_stack[my_level].state = NULL;
/*
* Scan for any remaining deferred events that were marked DONE or IN
@@ -4704,7 +4856,7 @@ AfterTriggerEndSubXact(bool isCommit)
* (This essentially assumes that the current subxact includes all
* subxacts started after it.)
*/
- subxact_firing_id = afterTriggers.firing_stack[my_level];
+ subxact_firing_id = afterTriggers.trans_stack[my_level].firing_counter;
for_each_event_chunk(event, chunk, afterTriggers.events)
{
AfterTriggerShared evtshared = GetTriggerSharedData(event);
@@ -4740,12 +4892,9 @@ AfterTriggerEnlargeQueryState(void)
{
int new_alloc = Max(afterTriggers.query_depth + 1, 8);
- afterTriggers.query_stack = (AfterTriggerEventList *)
+ afterTriggers.query_stack = (AfterTriggersQueryData *)
MemoryContextAlloc(TopTransactionContext,
- new_alloc * sizeof(AfterTriggerEventList));
- afterTriggers.fdw_tuplestores = (Tuplestorestate **)
- MemoryContextAllocZero(TopTransactionContext,
- new_alloc * sizeof(Tuplestorestate *));
+ new_alloc * sizeof(AfterTriggersQueryData));
afterTriggers.maxquerydepth = new_alloc;
}
else
@@ -4755,27 +4904,22 @@ AfterTriggerEnlargeQueryState(void)
int new_alloc = Max(afterTriggers.query_depth + 1,
old_alloc * 2);
- afterTriggers.query_stack = (AfterTriggerEventList *)
+ afterTriggers.query_stack = (AfterTriggersQueryData *)
repalloc(afterTriggers.query_stack,
- new_alloc * sizeof(AfterTriggerEventList));
- afterTriggers.fdw_tuplestores = (Tuplestorestate **)
- repalloc(afterTriggers.fdw_tuplestores,
- new_alloc * sizeof(Tuplestorestate *));
- /* Clear newly-allocated slots for subsequent lazy initialization. */
- memset(afterTriggers.fdw_tuplestores + old_alloc,
- 0, (new_alloc - old_alloc) * sizeof(Tuplestorestate *));
+ new_alloc * sizeof(AfterTriggersQueryData));
afterTriggers.maxquerydepth = new_alloc;
}
- /* Initialize new query lists to empty */
+ /* Initialize new array entries to empty */
while (init_depth < afterTriggers.maxquerydepth)
{
- AfterTriggerEventList *events;
+ AfterTriggersQueryData *qs = &afterTriggers.query_stack[init_depth];
- events = &afterTriggers.query_stack[init_depth];
- events->head = NULL;
- events->tail = NULL;
- events->tailfree = NULL;
+ qs->events.head = NULL;
+ qs->events.tail = NULL;
+ qs->events.tailfree = NULL;
+ qs->fdw_tuplestore = NULL;
+ qs->tables = NIL;
++init_depth;
}
@@ -4873,9 +5017,9 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt)
* save it so it can be restored if the subtransaction aborts.
*/
if (my_level > 1 &&
- afterTriggers.state_stack[my_level] == NULL)
+ afterTriggers.trans_stack[my_level].state == NULL)
{
- afterTriggers.state_stack[my_level] =
+ afterTriggers.trans_stack[my_level].state =
SetConstraintStateCopy(afterTriggers.state);
}
@@ -5184,7 +5328,7 @@ AfterTriggerPendingOnRel(Oid relid)
*/
for (depth = 0; depth <= afterTriggers.query_depth && depth < afterTriggers.maxquerydepth; depth++)
{
- for_each_event_chunk(event, chunk, afterTriggers.query_stack[depth])
+ for_each_event_chunk(event, chunk, afterTriggers.query_stack[depth].events)
{
AfterTriggerShared evtshared = GetTriggerSharedData(event);
@@ -5229,7 +5373,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
TriggerDesc *trigdesc = relinfo->ri_TrigDesc;
AfterTriggerEventData new_event;
AfterTriggerSharedData new_shared;
- char relkind = relinfo->ri_RelationDesc->rd_rel->relkind;
+ char relkind = rel->rd_rel->relkind;
int tgtype_event;
int tgtype_level;
int i;
@@ -5266,7 +5410,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
Tuplestorestate *old_tuplestore;
Assert(oldtup != NULL);
- old_tuplestore = transition_capture->tcs_old_tuplestore;
+ old_tuplestore = transition_capture->tcs_private->old_tuplestore;
if (map != NULL)
{
@@ -5284,10 +5428,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
Tuplestorestate *new_tuplestore;
Assert(newtup != NULL);
- if (event == TRIGGER_EVENT_INSERT)
- new_tuplestore = transition_capture->tcs_insert_tuplestore;
- else
- new_tuplestore = transition_capture->tcs_update_tuplestore;
+ new_tuplestore = transition_capture->tcs_private->new_tuplestore;
if (original_insert_tuple != NULL)
tuplestore_puttuple(new_tuplestore, original_insert_tuple);
@@ -5316,6 +5457,11 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
* The event code will be used both as a bitmask and an array offset, so
* validation is important to make sure we don't walk off the edge of our
* arrays.
+ *
+ * Also, if we're considering statement-level triggers, check whether we
+ * already queued a set of them for this event, and cancel the prior set
+ * if so. This preserves the behavior that statement-level triggers fire
+ * just once per statement and fire after row-level triggers.
*/
switch (event)
{
@@ -5334,6 +5480,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
+ cancel_prior_stmt_triggers(RelationGetRelid(rel),
+ CMD_INSERT, event);
}
break;
case TRIGGER_EVENT_DELETE:
@@ -5351,6 +5499,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
+ cancel_prior_stmt_triggers(RelationGetRelid(rel),
+ CMD_DELETE, event);
}
break;
case TRIGGER_EVENT_UPDATE:
@@ -5368,6 +5518,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
Assert(newtup == NULL);
ItemPointerSetInvalid(&(new_event.ate_ctid1));
ItemPointerSetInvalid(&(new_event.ate_ctid2));
+ cancel_prior_stmt_triggers(RelationGetRelid(rel),
+ CMD_UPDATE, event);
}
break;
case TRIGGER_EVENT_TRUNCATE:
@@ -5407,9 +5559,7 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
{
if (fdw_tuplestore == NULL)
{
- fdw_tuplestore =
- GetTriggerTransitionTuplestore
- (afterTriggers.fdw_tuplestores);
+ fdw_tuplestore = GetCurrentFDWTuplestore();
new_event.ate_flags = AFTER_TRIGGER_FDW_FETCH;
}
else
@@ -5465,6 +5615,8 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
/*
* Fill in event structure and add it to the current query's queue.
+ * Note we set ats_table to NULL whenever this trigger doesn't use
+ * transition tables, to improve sharability of the shared event data.
*/
new_shared.ats_event =
(event & TRIGGER_EVENT_OPMASK) |
@@ -5474,11 +5626,13 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
new_shared.ats_tgoid = trigger->tgoid;
new_shared.ats_relid = RelationGetRelid(rel);
new_shared.ats_firing_id = 0;
- /* deferrable triggers cannot access transition data */
- new_shared.ats_transition_capture =
- trigger->tgdeferrable ? NULL : transition_capture;
+ if ((trigger->tgoldtable || trigger->tgnewtable) &&
+ transition_capture != NULL)
+ new_shared.ats_table = transition_capture->tcs_private;
+ else
+ new_shared.ats_table = NULL;
- afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth],
+ afterTriggerAddEvent(&afterTriggers.query_stack[afterTriggers.query_depth].events,
&new_event, &new_shared);
}
@@ -5496,6 +5650,100 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
}
}
+/*
+ * If we previously queued a set of AFTER STATEMENT triggers for the given
+ * relation + operation, and they've not been fired yet, cancel them. The
+ * caller will queue a fresh set that's after any row-level triggers that may
+ * have been queued by the current sub-statement, preserving (as much as
+ * possible) the property that AFTER ROW triggers fire before AFTER STATEMENT
+ * triggers, and that the latter only fire once. This deals with the
+ * situation where several FK enforcement triggers sequentially queue triggers
+ * for the same table into the same trigger query level. We can't fully
+ * prevent odd behavior though: if there are AFTER ROW triggers taking
+ * transition tables, we don't want to change the transition tables once the
+ * first such trigger has seen them. In such a case, any additional events
+ * will result in creating new transition tables and allowing new firings of
+ * statement triggers.
+ *
+ * This also saves the current event list location so that a later invocation
+ * of this function can cheaply find the triggers we're about to queue and
+ * cancel them.
+ */
+static void
+cancel_prior_stmt_triggers(Oid relid, CmdType cmdType, int tgevent)
+{
+ AfterTriggersTableData *table;
+ AfterTriggersQueryData *qs = &afterTriggers.query_stack[afterTriggers.query_depth];
+
+ /*
+ * We keep this state in the AfterTriggersTableData that also holds
+ * transition tables for the relation + operation. In this way, if we are
+ * forced to make a new set of transition tables because more tuples get
+ * entered after we've already fired triggers, we will allow a new set of
+ * statement triggers to get queued without canceling the old ones.
+ */
+ table = GetAfterTriggersTableData(relid, cmdType);
+
+ if (table->stmt_trig_done)
+ {
+ /*
+ * We want to start scanning from the tail location that existed just
+ * before we inserted any statement triggers. But the events list
+ * might've been entirely empty then, in which case scan from the
+ * current head.
+ */
+ AfterTriggerEvent event;
+ AfterTriggerEventChunk *chunk;
+
+ if (table->stmt_trig_events.tail)
+ {
+ chunk = table->stmt_trig_events.tail;
+ event = (AfterTriggerEvent) table->stmt_trig_events.tailfree;
+ }
+ else
+ {
+ chunk = qs->events.head;
+ event = NULL;
+ }
+
+ for_each_chunk_from(chunk)
+ {
+ if (event == NULL)
+ event = (AfterTriggerEvent) CHUNK_DATA_START(chunk);
+ for_each_event_from(event, chunk)
+ {
+ AfterTriggerShared evtshared = GetTriggerSharedData(event);
+
+ /*
+ * Exit loop when we reach events that aren't AS triggers for
+ * the target relation.
+ */
+ if (evtshared->ats_relid != relid)
+ goto done;
+ if ((evtshared->ats_event & TRIGGER_EVENT_OPMASK) != tgevent)
+ goto done;
+ if (!TRIGGER_FIRED_FOR_STATEMENT(evtshared->ats_event))
+ goto done;
+ if (!TRIGGER_FIRED_AFTER(evtshared->ats_event))
+ goto done;
+ /* OK, mark it DONE */
+ event->ate_flags &= ~AFTER_TRIGGER_IN_PROGRESS;
+ event->ate_flags |= AFTER_TRIGGER_DONE;
+ }
+ /* signal we must reinitialize event ptr for next chunk */
+ event = NULL;
+ }
+ }
+done:
+
+ /* In any case, save current insertion point for next time */
+ table->stmt_trig_done = true;
+ table->stmt_trig_events = qs->events;
+}
+
+/*
+ * SQL function pg_trigger_depth()
+ */
Datum
pg_trigger_depth(PG_FUNCTION_ARGS)
{
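
As a rough SQL sketch of the behavior cancel_prior_stmt_triggers() is preserving (hypothetical table and trigger names, modeled on the refd_table/trig_table regression test further below): when a single statement causes several foreign-key cascade actions against the same table, the AFTER STATEMENT trigger should still fire only once, after the row-level work, and should see all affected rows in one transition table.

    create table parent (a int primary key);
    create table child  (a int references parent on delete cascade);
    create function count_deleted() returns trigger language plpgsql as
    $$ begin raise notice 'deleted % child rows', (select count(*) from old_rows); return null; end $$;
    create trigger child_del_stmt
      after delete on child referencing old table as old_rows
      for each statement execute procedure count_deleted();
    insert into parent values (1), (2);
    insert into child values (1), (1), (2);
    -- the cascaded deletes for both parent rows queue events for child,
    -- but the statement-level trigger is expected to fire just once here
    delete from parent;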
diff --git a/src/backend/executor/README b/src/backend/executor/README
index a0045067fb8..b3e74aa1a54 100644
--- a/src/backend/executor/README
+++ b/src/backend/executor/README
@@ -241,11 +241,11 @@ This is a sketch of control flow for full query processing:
CreateExecutorState
creates per-query context
switch to per-query context to run ExecInitNode
+ AfterTriggerBeginQuery
ExecInitNode --- recursively scans plan tree
CreateExprContext
creates per-tuple context
ExecInitExpr
- AfterTriggerBeginQuery
ExecutorRun
ExecProcNode --- recursively called in per-query context
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 9dcc358ec27..396b7a1e83f 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -252,17 +252,17 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
estate->es_instrument = queryDesc->instrument_options;
/*
- * Initialize the plan state tree
- */
- InitPlan(queryDesc, eflags);
-
- /*
* Set up an AFTER-trigger statement context, unless told not to, or
* unless it's EXPLAIN-only mode (when ExecutorFinish won't be called).
*/
if (!(eflags & (EXEC_FLAG_SKIP_TRIGGERS | EXEC_FLAG_EXPLAIN_ONLY)))
AfterTriggerBeginQuery();
+ /*
+ * Initialize the plan state tree
+ */
+ InitPlan(queryDesc, eflags);
+
MemoryContextSwitchTo(oldcontext);
}
@@ -1174,6 +1174,7 @@ CheckValidResultRel(ResultRelInfo *resultRelInfo, CmdType operation)
switch (operation)
{
case CMD_INSERT:
+
/*
* If foreign partition to do tuple-routing for, skip the
* check; it's disallowed elsewhere.
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index d78e868154e..7b5214c9996 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -342,6 +342,9 @@ ExecInsert(ModifyTableState *mtstate,
mtstate->mt_transition_capture->tcs_map = NULL;
}
}
+ if (mtstate->mt_oc_transition_capture != NULL)
+ mtstate->mt_oc_transition_capture->tcs_map =
+ mtstate->mt_transition_tupconv_maps[leaf_part_index];
/*
* We might need to convert from the parent rowtype to the partition
@@ -1157,6 +1160,8 @@ lreplace:;
/* AFTER ROW UPDATE Triggers */
ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, tuple,
recheckIndexes,
+ mtstate->operation == CMD_INSERT ?
+ mtstate->mt_oc_transition_capture :
mtstate->mt_transition_capture);
list_free(recheckIndexes);
@@ -1443,7 +1448,7 @@ fireASTriggers(ModifyTableState *node)
if (node->mt_onconflict == ONCONFLICT_UPDATE)
ExecASUpdateTriggers(node->ps.state,
resultRelInfo,
- node->mt_transition_capture);
+ node->mt_oc_transition_capture);
ExecASInsertTriggers(node->ps.state, resultRelInfo,
node->mt_transition_capture);
break;
@@ -1473,14 +1478,24 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
/* Check for transition tables on the directly targeted relation. */
mtstate->mt_transition_capture =
- MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc);
+ MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc,
+ RelationGetRelid(targetRelInfo->ri_RelationDesc),
+ mtstate->operation);
+ if (mtstate->operation == CMD_INSERT &&
+ mtstate->mt_onconflict == ONCONFLICT_UPDATE)
+ mtstate->mt_oc_transition_capture =
+ MakeTransitionCaptureState(targetRelInfo->ri_TrigDesc,
+ RelationGetRelid(targetRelInfo->ri_RelationDesc),
+ CMD_UPDATE);
/*
* If we found that we need to collect transition tuples then we may also
* need tuple conversion maps for any children that have TupleDescs that
- * aren't compatible with the tuplestores.
+ * aren't compatible with the tuplestores. (We can share these maps
+ * between the regular and ON CONFLICT cases.)
*/
- if (mtstate->mt_transition_capture != NULL)
+ if (mtstate->mt_transition_capture != NULL ||
+ mtstate->mt_oc_transition_capture != NULL)
{
ResultRelInfo *resultRelInfos;
int numResultRelInfos;
@@ -1521,10 +1536,12 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
/*
* Install the conversion map for the first plan for UPDATE and DELETE
* operations. It will be advanced each time we switch to the next
- * plan. (INSERT operations set it every time.)
+ * plan. (INSERT operations set it every time, so we need not update
+ * mtstate->mt_oc_transition_capture here.)
*/
- mtstate->mt_transition_capture->tcs_map =
- mtstate->mt_transition_tupconv_maps[0];
+ if (mtstate->mt_transition_capture)
+ mtstate->mt_transition_capture->tcs_map =
+ mtstate->mt_transition_tupconv_maps[0];
}
}
@@ -1628,13 +1645,19 @@ ExecModifyTable(PlanState *pstate)
estate->es_result_relation_info = resultRelInfo;
EvalPlanQualSetPlan(&node->mt_epqstate, subplanstate->plan,
node->mt_arowmarks[node->mt_whichplan]);
+ /* Prepare to convert transition tuples from this child. */
if (node->mt_transition_capture != NULL)
{
- /* Prepare to convert transition tuples from this child. */
Assert(node->mt_transition_tupconv_maps != NULL);
node->mt_transition_capture->tcs_map =
node->mt_transition_tupconv_maps[node->mt_whichplan];
}
+ if (node->mt_oc_transition_capture != NULL)
+ {
+ Assert(node->mt_transition_tupconv_maps != NULL);
+ node->mt_oc_transition_capture->tcs_map =
+ node->mt_transition_tupconv_maps[node->mt_whichplan];
+ }
continue;
}
else
@@ -1933,8 +1956,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->mt_partition_tuple_slot = partition_tuple_slot;
}
- /* Build state for collecting transition tuples */
- ExecSetupTransitionCaptureState(mtstate, estate);
+ /*
+ * Build state for collecting transition tuples. This requires having a
+ * valid trigger query context, so skip it in explain-only mode.
+ */
+ if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ ExecSetupTransitionCaptureState(mtstate, estate);
/*
* Initialize any WITH CHECK OPTION constraints if needed.
@@ -2318,16 +2345,6 @@ ExecEndModifyTable(ModifyTableState *node)
int i;
/*
- * Free transition tables, unless this query is being run in
- * EXEC_FLAG_SKIP_TRIGGERS mode, which means that it may have queued AFTER
- * triggers that won't be run till later. In that case we'll just leak
- * the transition tables till end of (sub)transaction.
- */
- if (node->mt_transition_capture != NULL &&
- !(node->ps.state->es_top_eflags & EXEC_FLAG_SKIP_TRIGGERS))
- DestroyTransitionCaptureState(node->mt_transition_capture);
-
- /*
* Allow any FDWs to shut down
*/
for (i = 0; i < node->mt_nplans; i++)
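
A brief SQL sketch (hypothetical names, not taken from the regression tests) of why nodeModifyTable.c now carries a separate mt_oc_transition_capture: an INSERT ... ON CONFLICT DO UPDATE can fire both INSERT and UPDATE statement-level triggers, and each one should see only its own transition rows rather than a mixed tuplestore.

    create table t (k int primary key, v text);
    create function show_inserted() returns trigger language plpgsql as
    $$ begin raise notice 'inserted %', (select count(*) from new_ins); return null; end $$;
    create function show_updated() returns trigger language plpgsql as
    $$ begin raise notice 'updated %', (select count(*) from new_upd); return null; end $$;
    create trigger t_ins after insert on t referencing new table as new_ins
      for each statement execute procedure show_inserted();
    create trigger t_upd after update on t referencing new table as new_upd
      for each statement execute procedure show_updated();
    insert into t values (1, 'old');
    -- (1) conflicts and is updated, (2) is newly inserted; with separate
    -- capture state each statement trigger sees exactly one row
    insert into t values (1, 'new'), (2, 'new')
      on conflict (k) do update set v = excluded.v;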
diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h
index aeb363f13e8..adbcfa1297a 100644
--- a/src/include/commands/trigger.h
+++ b/src/include/commands/trigger.h
@@ -43,13 +43,21 @@ typedef struct TriggerData
/*
* The state for capturing old and new tuples into transition tables for a
- * single ModifyTable node.
+ * single ModifyTable node (or other operation source, e.g. copy.c).
+ *
+ * This is per-caller to avoid conflicts in setting tcs_map or
+ * tcs_original_insert_tuple. Note, however, that the pointed-to
+ * private data may be shared across multiple callers.
*/
+struct AfterTriggersTableData; /* private in trigger.c */
+
typedef struct TransitionCaptureState
{
/*
* Is there at least one trigger specifying each transition relation on
* the relation explicitly named in the DML statement or COPY command?
+ * Note: in current usage, these flags could be part of the private state,
+ * but it seems possibly useful to let callers see them.
*/
bool tcs_delete_old_table;
bool tcs_update_old_table;
@@ -60,7 +68,7 @@ typedef struct TransitionCaptureState
* For UPDATE and DELETE, AfterTriggerSaveEvent may need to convert the
* new and old tuples from a child table's format to the format of the
* relation named in a query so that it is compatible with the transition
- * tuplestores.
+ * tuplestores. The caller must store the conversion map here if so.
*/
TupleConversionMap *tcs_map;
@@ -74,17 +82,9 @@ typedef struct TransitionCaptureState
HeapTuple tcs_original_insert_tuple;
/*
- * The tuplestores backing the transition tables. We use separate
- * tuplestores for INSERT and UPDATE, because INSERT ... ON CONFLICT ...
- * DO UPDATE causes INSERT and UPDATE triggers to fire and needs a way to
- * keep track of the new tuple images resulting from the two cases
- * separately. We only need a single old image tuplestore, because there
- * is no statement that can both update and delete at the same time.
+ * Private data including the tuplestore(s) into which to insert tuples.
*/
- Tuplestorestate *tcs_old_tuplestore; /* for DELETE and UPDATE old
- * images */
- Tuplestorestate *tcs_insert_tuplestore; /* for INSERT new images */
- Tuplestorestate *tcs_update_tuplestore; /* for UPDATE new images */
+ struct AfterTriggersTableData *tcs_private;
} TransitionCaptureState;
/*
@@ -174,8 +174,9 @@ extern void RelationBuildTriggers(Relation relation);
extern TriggerDesc *CopyTriggerDesc(TriggerDesc *trigdesc);
extern const char *FindTriggerIncompatibleWithInheritance(TriggerDesc *trigdesc);
-extern TransitionCaptureState *MakeTransitionCaptureState(TriggerDesc *trigdesc);
-extern void DestroyTransitionCaptureState(TransitionCaptureState *tcs);
+
+extern TransitionCaptureState *MakeTransitionCaptureState(TriggerDesc *trigdesc,
+ Oid relid, CmdType cmdType);
extern void FreeTriggerDesc(TriggerDesc *trigdesc);
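
The revised comments in trigger.h note that copy.c is another source of captured tuples; a minimal SQL sketch (hypothetical names) of a transition table being filled by COPY rather than by a ModifyTable node:

    create table copied (a int, b text);
    create function show_copied() returns trigger language plpgsql as
    $$ begin raise notice 'copied % rows', (select count(*) from new_rows); return null; end $$;
    create trigger copied_ins
      after insert on copied referencing new table as new_rows
      for each statement execute procedure show_copied();
    copy copied from stdin (format csv);
    1,one
    2,two
    \.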
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index a46a56ebc10..0f5d47ba8e4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -983,7 +983,9 @@ typedef struct ModifyTableState
/* Per partition tuple conversion map */
TupleTableSlot *mt_partition_tuple_slot;
struct TransitionCaptureState *mt_transition_capture;
- /* controls transition table population */
+ /* controls transition table population for specified operation */
+ struct TransitionCaptureState *mt_oc_transition_capture;
+ /* controls transition table population for INSERT...ON CONFLICT UPDATE */
TupleConversionMap **mt_transition_tupconv_maps;
/* Per plan/partition tuple conversion */
} ModifyTableState;
diff --git a/src/test/regress/expected/triggers.out b/src/test/regress/expected/triggers.out
index 620fac1e2c5..3ab6be3421c 100644
--- a/src/test/regress/expected/triggers.out
+++ b/src/test/regress/expected/triggers.out
@@ -2217,6 +2217,23 @@ with wcte as (insert into table1 values (42))
insert into table2 values ('hello world');
NOTICE: trigger = table2_trig, new table = ("hello world")
NOTICE: trigger = table1_trig, new table = (42)
+with wcte as (insert into table1 values (43))
+ insert into table1 values (44);
+NOTICE: trigger = table1_trig, new table = (43), (44)
+select * from table1;
+ a
+----
+ 42
+ 44
+ 43
+(3 rows)
+
+select * from table2;
+ a
+-------------
+ hello world
+(1 row)
+
drop table table1;
drop table table2;
--
@@ -2256,6 +2273,14 @@ create trigger my_table_multievent_trig
after insert or update on my_table referencing new table as new_table
for each statement execute procedure dump_insert();
ERROR: transition tables cannot be specified for triggers with more than one event
+--
+-- Verify that you can't create a trigger with transition tables and
+-- a column list.
+--
+create trigger my_table_col_update_trig
+ after update of b on my_table referencing new table as new_table
+ for each statement execute procedure dump_insert();
+ERROR: transition tables cannot be specified for triggers with column lists
drop table my_table;
--
-- Test firing of triggers with transition tables by foreign key cascades
@@ -2299,8 +2324,7 @@ select * from trig_table;
(6 rows)
delete from refd_table where length(b) = 3;
-NOTICE: trigger = trig_table_delete_trig, old table = (2,"two a"), (2,"two b")
-NOTICE: trigger = trig_table_delete_trig, old table = (11,"one a"), (11,"one b")
+NOTICE: trigger = trig_table_delete_trig, old table = (2,"two a"), (2,"two b"), (11,"one a"), (11,"one b")
select * from trig_table;
a | b
---+---------
@@ -2309,6 +2333,30 @@ select * from trig_table;
(2 rows)
drop table refd_table, trig_table;
+--
+-- self-referential FKs are even more fun
+--
+create table self_ref (a int primary key,
+ b int references self_ref(a) on delete cascade);
+create trigger self_ref_r_trig
+ after delete on self_ref referencing old table as old_table
+ for each row execute procedure dump_delete();
+create trigger self_ref_s_trig
+ after delete on self_ref referencing old table as old_table
+ for each statement execute procedure dump_delete();
+insert into self_ref values (1, null), (2, 1), (3, 2);
+delete from self_ref where a = 1;
+NOTICE: trigger = self_ref_r_trig, old table = (1,), (2,1)
+NOTICE: trigger = self_ref_r_trig, old table = (1,), (2,1)
+NOTICE: trigger = self_ref_s_trig, old table = (1,), (2,1)
+NOTICE: trigger = self_ref_r_trig, old table = (3,2)
+NOTICE: trigger = self_ref_s_trig, old table = (3,2)
+-- without AR trigger, cascaded deletes all end up in one transition table
+drop trigger self_ref_r_trig on self_ref;
+insert into self_ref values (1, null), (2, 1), (3, 2), (4, 3);
+delete from self_ref where a = 1;
+NOTICE: trigger = self_ref_s_trig, old table = (1,), (2,1), (3,2), (4,3)
+drop table self_ref;
-- cleanup
drop function dump_insert();
drop function dump_update();
diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql
index c6deb56c507..30bb7d17b08 100644
--- a/src/test/regress/sql/triggers.sql
+++ b/src/test/regress/sql/triggers.sql
@@ -1729,6 +1729,12 @@ create trigger table2_trig
with wcte as (insert into table1 values (42))
insert into table2 values ('hello world');
+with wcte as (insert into table1 values (43))
+ insert into table1 values (44);
+
+select * from table1;
+select * from table2;
+
drop table table1;
drop table table2;
@@ -1769,6 +1775,15 @@ create trigger my_table_multievent_trig
after insert or update on my_table referencing new table as new_table
for each statement execute procedure dump_insert();
+--
+-- Verify that you can't create a trigger with transition tables and
+-- a column list.
+--
+
+create trigger my_table_col_update_trig
+ after update of b on my_table referencing new table as new_table
+ for each statement execute procedure dump_insert();
+
drop table my_table;
--
@@ -1812,6 +1827,33 @@ select * from trig_table;
drop table refd_table, trig_table;
+--
+-- self-referential FKs are even more fun
+--
+
+create table self_ref (a int primary key,
+ b int references self_ref(a) on delete cascade);
+
+create trigger self_ref_r_trig
+ after delete on self_ref referencing old table as old_table
+ for each row execute procedure dump_delete();
+create trigger self_ref_s_trig
+ after delete on self_ref referencing old table as old_table
+ for each statement execute procedure dump_delete();
+
+insert into self_ref values (1, null), (2, 1), (3, 2);
+
+delete from self_ref where a = 1;
+
+-- without AR trigger, cascaded deletes all end up in one transition table
+drop trigger self_ref_r_trig on self_ref;
+
+insert into self_ref values (1, null), (2, 1), (3, 2), (4, 3);
+
+delete from self_ref where a = 1;
+
+drop table self_ref;
+
-- cleanup
drop function dump_insert();
drop function dump_update();