7 files changed, 418 insertions, 20 deletions
diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c
index 39e2c07ad3d..86f938686c3 100644
--- a/src/backend/access/gin/ginbtree.c
+++ b/src/backend/access/gin/ginbtree.c
@@ -19,6 +19,7 @@
 #include "access/xloginsert.h"
 #include "miscadmin.h"
 #include "storage/predicate.h"
+#include "utils/injection_point.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -28,6 +29,8 @@ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack,
 						   Buffer childbuf, GinStatsData *buildStats);
 static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
 						   bool freestack, GinStatsData *buildStats);
+static void ginFinishOldSplit(GinBtree btree, GinBtreeStack *stack,
+							  GinStatsData *buildStats, int access);
 
 /*
  * Lock buffer by needed method for search.
@@ -108,7 +111,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode,
 		 * encounter on the way.
 		 */
 		if (!searchMode && GinPageIsIncompleteSplit(page))
-			ginFinishSplit(btree, stack, false, NULL);
+			ginFinishOldSplit(btree, stack, NULL, access);
 
 		/*
 		 * ok, page is correctly locked, we should check to move right ..,
@@ -128,7 +131,7 @@ ginFindLeafPage(GinBtree btree, bool searchMode,
 			page = BufferGetPage(stack->buffer);
 
 			if (!searchMode && GinPageIsIncompleteSplit(page))
-				ginFinishSplit(btree, stack, false, NULL);
+				ginFinishOldSplit(btree, stack, NULL, access);
 		}
 
 		if (GinPageIsLeaf(page))	/* we found, return locked page */
@@ -164,8 +167,11 @@ ginFindLeafPage(GinBtree btree, bool searchMode,
  * Step right from current page.
  *
  * The next page is locked first, before releasing the current page. This is
- * crucial to protect from concurrent page deletion (see comment in
- * ginDeletePage).
+ * crucial to prevent concurrent VACUUM from deleting a page that we are about
+ * to step to. (The lock-coupling isn't strictly necessary when we are
+ * traversing the tree to find an insert location, because page deletion grabs
+ * a cleanup lock on the root to prevent any concurrent inserts. See Page
+ * deletion section in the README. But there's no harm in doing it always.)
  */
 Buffer
 ginStepRight(Buffer buffer, Relation index, int lockmode)
@@ -262,7 +268,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack)
 			ptr->parent = root;
 			ptr->off = InvalidOffsetNumber;
 
-			ginFinishSplit(btree, ptr, false, NULL);
+			ginFinishOldSplit(btree, ptr, NULL, GIN_EXCLUSIVE);
 		}
 
 		leftmostBlkno = btree->getLeftMostChild(btree, page);
@@ -291,7 +297,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack)
 				ptr->parent = root;
 				ptr->off = InvalidOffsetNumber;
 
-				ginFinishSplit(btree, ptr, false, NULL);
+				ginFinishOldSplit(btree, ptr, NULL, GIN_EXCLUSIVE);
 			}
 		}
 
@@ -670,15 +676,6 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 	bool		done;
 	bool		first = true;
 
-	/*
-	 * freestack == false when we encounter an incompletely split page during
-	 * a scan, while freestack == true is used in the normal scenario that a
-	 * split is finished right after the initial insert.
-	 */
-	if (!freestack)
-		elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
-			 stack->blkno, RelationGetRelationName(btree->index));
-
 	/* this loop crawls up the stack until the insertion is complete */
 	do
 	{
@@ -686,6 +683,13 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 		void	   *insertdata;
 		BlockNumber updateblkno;
 
+#ifdef USE_INJECTION_POINTS
+		if (GinPageIsLeaf(BufferGetPage(stack->buffer)))
+			INJECTION_POINT("gin-leave-leaf-split-incomplete");
+		else
+			INJECTION_POINT("gin-leave-internal-split-incomplete");
+#endif
+
 		/* search parent to lock */
 		LockBuffer(parent->buffer, GIN_EXCLUSIVE);
 
@@ -699,7 +703,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 		 * would fail.
 		 */
 		if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
-			ginFinishSplit(btree, parent, false, buildStats);
+			ginFinishOldSplit(btree, parent, buildStats, GIN_EXCLUSIVE);
 
 		/* move right if it's needed */
 		page = BufferGetPage(parent->buffer);
@@ -723,7 +727,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 			page = BufferGetPage(parent->buffer);
 
 			if (GinPageIsIncompleteSplit(BufferGetPage(parent->buffer)))
-				ginFinishSplit(btree, parent, false, buildStats);
+				ginFinishOldSplit(btree, parent, buildStats, GIN_EXCLUSIVE);
 		}
 
 		/* insert the downlink */
@@ -760,6 +764,43 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack,
 }
 
 /*
+ * An entry point to ginFinishSplit() that is used when we stumble upon an
+ * existing incompletely split page in the tree, as opposed to completing a
+ * split that we just made outselves. The difference is that stack->buffer may
+ * be merely share-locked on entry, and will be upgraded to exclusive mode.
+ *
+ * Note: Upgrading the lock momentarily releases it. Doing that in a scan
+ * would not be OK, because a concurrent VACUUM might delete the page while
+ * we're not holding the lock. It's OK in an insert, though, because VACUUM
+ * has a different mechanism that prevents it from running concurrently with
+ * inserts. (Namely, it holds a cleanup lock on the root.)
+ */
+static void
+ginFinishOldSplit(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats, int access)
+{
+	INJECTION_POINT("gin-finish-incomplete-split");
+	elog(DEBUG1, "finishing incomplete split of block %u in gin index \"%s\"",
+		 stack->blkno, RelationGetRelationName(btree->index));
+
+	if (access == GIN_SHARE)
+	{
+		LockBuffer(stack->buffer, GIN_UNLOCK);
+		LockBuffer(stack->buffer, GIN_EXCLUSIVE);
+
+		if (!GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
+		{
+			/*
+			 * Someone else already completed the split while we were not
+			 * holding the lock.
+			 */
+			return;
+		}
+	}
+
+	ginFinishSplit(btree, stack, false, buildStats);
+}
+
+/*
  * Insert a value to tree described by stack.
  *
  * The value to be inserted is given in 'insertdata'. Its format depends
@@ -779,7 +820,7 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, void *insertdata,
 
 	/* If the leaf page was incompletely split, finish the split first */
 	if (GinPageIsIncompleteSplit(BufferGetPage(stack->buffer)))
-		ginFinishSplit(btree, stack, false, buildStats);
+		ginFinishOldSplit(btree, stack, buildStats, GIN_EXCLUSIVE);
 
 	done = ginPlaceToPage(btree, stack,
 						  insertdata, InvalidBlockNumber,
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index e32c8925f67..89aa41b5e31 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -40,9 +40,9 @@ SUBDIRS = \
 
 
 ifeq ($(enable_injection_points),yes)
-SUBDIRS += injection_points
+SUBDIRS += injection_points gin
 else
-ALWAYS_SUBDIRS += injection_points
+ALWAYS_SUBDIRS += injection_points gin
 endif
 
 ifeq ($(with_ssl),openssl)
diff --git a/src/test/modules/gin/Makefile b/src/test/modules/gin/Makefile
new file mode 100644
index 00000000000..a7731b7fa18
--- /dev/null
+++ b/src/test/modules/gin/Makefile
@@ -0,0 +1,16 @@
+# src/test/modules/gin/Makefile
+
+EXTRA_INSTALL = src/test/modules/injection_points
+
+REGRESS = gin_incomplete_splits
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/gin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/gin/expected/gin_incomplete_splits.out b/src/test/modules/gin/expected/gin_incomplete_splits.out
new file mode 100644
index 00000000000..9a60f8c128a
--- /dev/null
+++ b/src/test/modules/gin/expected/gin_incomplete_splits.out
@@ -0,0 +1,180 @@
+--
+-- The test uses a GIN index over int[].  The table contains arrays
+-- with integers from 1 to :next_i.  Each integer occurs exactly once,
+-- no gaps or duplicates, although the index does contain some
+-- duplicate elements because some of the inserting transactions are
+-- rolled back during the test. The exact contents of the table depend
+-- on the physical layout of the index, which in turn depends at least
+-- on the block size, so instead of check for the exact contents, we
+-- check those invariants.  :next_i psql variable is maintained at all
+-- times to hold the last inserted integer + 1.
+--
+-- This uses injection points to cause errors that leave some page
+-- splits in "incomplete" state
+create extension injection_points;
+-- Use the index for all the queries
+set enable_seqscan=off;
+-- Print a NOTICE whenever an incomplete split gets fixed
+SELECT injection_points_attach('gin-finish-incomplete-split', 'notice');
+ injection_points_attach 
+-------------------------
+ 
+(1 row)
+
+--
+-- First create the test table and some helper functions
+--
+create table gin_incomplete_splits(i int4[]) with (autovacuum_enabled = off);
+create index on gin_incomplete_splits using gin (i) with (fastupdate = off);
+-- Creates an array with all integers from $1 (inclusive) $2 (exclusive)
+create function range_array(int, int) returns int[] language sql immutable as $$
+  select array_agg(g) from generate_series($1, $2 - 1) g
+$$;
+-- Inserts an array with 'n' rows to the test table. Pass :next_i as
+-- the first argument, returns the new value for :next_i.
+create function insert_n(first_i int, n int) returns int language plpgsql as $$
+begin
+  insert into gin_incomplete_splits values (range_array(first_i, first_i+n));
+  return first_i + n;
+end;
+$$;
+-- Inserts to the table until an insert fails. Like insert_n(), returns the
+-- new value for :next_i.
+create function insert_until_fail(next_i int, step int default 1) returns int language plpgsql as $$
+declare
+  i integer;
+begin
+  -- Insert arrays with 'step' elements each, until an error occurs.
+  loop
+    begin
+      select insert_n(next_i, step) into next_i;
+    exception when others then
+      raise notice 'failed with: %', sqlerrm;
+      exit;
+    end;
+
+    -- The caller is expected to set an injection point that eventuall
+    -- causes an error. But bail out if still no error after 10000
+    -- attempts, so that we don't get stuck in an infinite loop.
+    i := i + 1;
+    if i = 10000 then
+      raise 'no error on inserts after ';
+    end if;
+  end loop;
+
+  return next_i;
+end;
+$$;
+-- Check the invariants.
+create function verify(next_i int) returns bool language plpgsql as $$
+declare
+  a integer[];
+  elem integer;
+  c integer;
+begin
+  -- Perform a scan over the trailing part of the index, where the
+  -- possible incomplete splits are. (We don't check the whole table,
+  -- because that'd be pretty slow.)
+  c := 0;
+  -- Find all arrays that overlap with the last 200 inserted integers. Or
+  -- the next 100, which shouldn't exist.
+  for a in select i from gin_incomplete_splits where i && range_array(next_i - 200, next_i + 100)
+  loop
+    -- count all elements in those arrays in the window.
+    foreach elem in ARRAY a loop
+      if elem >= next_i then
+        raise 'unexpected value % in array', elem;
+      end if;
+      if elem >= next_i - 200 then
+        c := c + 1;
+      end if;
+    end loop;
+  end loop;
+  if c <> 200 then
+    raise 'unexpected count % ', c;
+  end if;
+
+  return true;
+end;
+$$;
+-- Insert one array to get started.
+select insert_n(1, 1000) as next_i
+\gset
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+--
+-- Test incomplete leaf split
+--
+SELECT injection_points_attach('gin-leave-leaf-split-incomplete', 'error');
+ injection_points_attach 
+-------------------------
+ 
+(1 row)
+
+select insert_until_fail(:next_i) as next_i
+\gset
+NOTICE:  failed with: error triggered for injection point gin-leave-leaf-split-incomplete
+SELECT injection_points_detach('gin-leave-leaf-split-incomplete');
+ injection_points_detach 
+-------------------------
+ 
+(1 row)
+
+-- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+NOTICE:  notice triggered for injection point gin-finish-incomplete-split
+-- Verify that a scan still works
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+--
+-- Test incomplete internal page split
+--
+SELECT injection_points_attach('gin-leave-internal-split-incomplete', 'error');
+ injection_points_attach 
+-------------------------
+ 
+(1 row)
+
+select insert_until_fail(:next_i, 100) as next_i
+\gset
+NOTICE:  failed with: error triggered for injection point gin-leave-internal-split-incomplete
+SELECT injection_points_detach('gin-leave-internal-split-incomplete');
+ injection_points_detach 
+-------------------------
+ 
+(1 row)
+
+ -- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+NOTICE:  notice triggered for injection point gin-finish-incomplete-split
+-- Verify that a scan still works
+select verify(:next_i);
+ verify 
+--------
+ t
+(1 row)
+
diff --git a/src/test/modules/gin/meson.build b/src/test/modules/gin/meson.build
new file mode 100644
index 00000000000..9734b51de2d
--- /dev/null
+++ b/src/test/modules/gin/meson.build
@@ -0,0 +1,16 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+if not get_option('injection_points')
+  subdir_done()
+endif
+
+tests += {
+  'name': 'gin',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'gin_incomplete_splits',
+    ],
+  },
+}
diff --git a/src/test/modules/gin/sql/gin_incomplete_splits.sql b/src/test/modules/gin/sql/gin_incomplete_splits.sql
new file mode 100644
index 00000000000..4e180b25195
--- /dev/null
+++ b/src/test/modules/gin/sql/gin_incomplete_splits.sql
@@ -0,0 +1,144 @@
+--
+-- The test uses a GIN index over int[].  The table contains arrays
+-- with integers from 1 to :next_i.  Each integer occurs exactly once,
+-- no gaps or duplicates, although the index does contain some
+-- duplicate elements because some of the inserting transactions are
+-- rolled back during the test. The exact contents of the table depend
+-- on the physical layout of the index, which in turn depends at least
+-- on the block size, so instead of check for the exact contents, we
+-- check those invariants.  :next_i psql variable is maintained at all
+-- times to hold the last inserted integer + 1.
+--
+
+-- This uses injection points to cause errors that leave some page
+-- splits in "incomplete" state
+create extension injection_points;
+
+-- Use the index for all the queries
+set enable_seqscan=off;
+
+-- Print a NOTICE whenever an incomplete split gets fixed
+SELECT injection_points_attach('gin-finish-incomplete-split', 'notice');
+
+--
+-- First create the test table and some helper functions
+--
+create table gin_incomplete_splits(i int4[]) with (autovacuum_enabled = off);
+
+create index on gin_incomplete_splits using gin (i) with (fastupdate = off);
+
+-- Creates an array with all integers from $1 (inclusive) $2 (exclusive)
+create function range_array(int, int) returns int[] language sql immutable as $$
+  select array_agg(g) from generate_series($1, $2 - 1) g
+$$;
+
+-- Inserts an array with 'n' rows to the test table. Pass :next_i as
+-- the first argument, returns the new value for :next_i.
+create function insert_n(first_i int, n int) returns int language plpgsql as $$
+begin
+  insert into gin_incomplete_splits values (range_array(first_i, first_i+n));
+  return first_i + n;
+end;
+$$;
+
+-- Inserts to the table until an insert fails. Like insert_n(), returns the
+-- new value for :next_i.
+create function insert_until_fail(next_i int, step int default 1) returns int language plpgsql as $$
+declare
+  i integer;
+begin
+  -- Insert arrays with 'step' elements each, until an error occurs.
+  loop
+    begin
+      select insert_n(next_i, step) into next_i;
+    exception when others then
+      raise notice 'failed with: %', sqlerrm;
+      exit;
+    end;
+
+    -- The caller is expected to set an injection point that eventuall
+    -- causes an error. But bail out if still no error after 10000
+    -- attempts, so that we don't get stuck in an infinite loop.
+    i := i + 1;
+    if i = 10000 then
+      raise 'no error on inserts after ';
+    end if;
+  end loop;
+
+  return next_i;
+end;
+$$;
+
+-- Check the invariants.
+create function verify(next_i int) returns bool language plpgsql as $$
+declare
+  a integer[];
+  elem integer;
+  c integer;
+begin
+  -- Perform a scan over the trailing part of the index, where the
+  -- possible incomplete splits are. (We don't check the whole table,
+  -- because that'd be pretty slow.)
+  c := 0;
+  -- Find all arrays that overlap with the last 200 inserted integers. Or
+  -- the next 100, which shouldn't exist.
+  for a in select i from gin_incomplete_splits where i && range_array(next_i - 200, next_i + 100)
+  loop
+    -- count all elements in those arrays in the window.
+    foreach elem in ARRAY a loop
+      if elem >= next_i then
+        raise 'unexpected value % in array', elem;
+      end if;
+      if elem >= next_i - 200 then
+        c := c + 1;
+      end if;
+    end loop;
+  end loop;
+  if c <> 200 then
+    raise 'unexpected count % ', c;
+  end if;
+
+  return true;
+end;
+$$;
+
+-- Insert one array to get started.
+select insert_n(1, 1000) as next_i
+\gset
+select verify(:next_i);
+
+
+--
+-- Test incomplete leaf split
+--
+SELECT injection_points_attach('gin-leave-leaf-split-incomplete', 'error');
+select insert_until_fail(:next_i) as next_i
+\gset
+SELECT injection_points_detach('gin-leave-leaf-split-incomplete');
+
+-- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+-- Verify that a scan still works
+select verify(:next_i);
+
+
+--
+-- Test incomplete internal page split
+--
+SELECT injection_points_attach('gin-leave-internal-split-incomplete', 'error');
+select insert_until_fail(:next_i, 100) as next_i
+\gset
+SELECT injection_points_detach('gin-leave-internal-split-incomplete');
+
+ -- Verify that a scan works even though there's an incomplete split
+select verify(:next_i);
+
+-- Insert some more rows, finishing the split
+select insert_n(:next_i, 10) as next_i
+\gset
+-- Verify that a scan still works
+select verify(:next_i);
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 397e0906e6f..8fbe742d385 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -5,6 +5,7 @@ subdir('commit_ts')
 subdir('delay_execution')
 subdir('dummy_index_am')
 subdir('dummy_seclabel')
+subdir('gin')
 subdir('injection_points')
 subdir('ldap_password_func')
 subdir('libpq_pipeline')