 src/backend/utils/sort/tuplesort.c | 182 ++++++++++++------
 1 file changed, 109 insertions(+), 73 deletions(-)
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index aa8e0e42fc3..d600670d26d 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -69,25 +69,25 @@
  * using Algorithm D.
  *
  * When merging runs, we use a heap containing just the frontmost tuple from
- * each source run; we repeatedly output the smallest tuple and insert the
- * next tuple from its source tape (if any).  When the heap empties, the merge
- * is complete.  The basic merge algorithm thus needs very little memory ---
- * only M tuples for an M-way merge, and M is constrained to a small number.
- * However, we can still make good use of our full workMem allocation by
- * pre-reading additional tuples from each source tape.  Without prereading,
- * our access pattern to the temporary file would be very erratic; on average
- * we'd read one block from each of M source tapes during the same time that
- * we're writing M blocks to the output tape, so there is no sequentiality of
- * access at all, defeating the read-ahead methods used by most Unix kernels.
- * Worse, the output tape gets written into a very random sequence of blocks
- * of the temp file, ensuring that things will be even worse when it comes
- * time to read that tape.  A straightforward merge pass thus ends up doing a
- * lot of waiting for disk seeks.  We can improve matters by prereading from
- * each source tape sequentially, loading about workMem/M bytes from each tape
- * in turn.  Then we run the merge algorithm, writing but not reading until
- * one of the preloaded tuple series runs out.  Then we switch back to preread
- * mode, fill memory again, and repeat.  This approach helps to localize both
- * read and write accesses.
+ * each source run; we repeatedly output the smallest tuple and replace it
+ * with the next tuple from its source tape (if any).  When the heap empties,
+ * the merge is complete.  The basic merge algorithm thus needs very little
+ * memory --- only M tuples for an M-way merge, and M is constrained to a
+ * small number.  However, we can still make good use of our full workMem
+ * allocation by pre-reading additional tuples from each source tape.  Without
+ * prereading, our access pattern to the temporary file would be very erratic;
+ * on average we'd read one block from each of M source tapes during the same
+ * time that we're writing M blocks to the output tape, so there is no
+ * sequentiality of access at all, defeating the read-ahead methods used by
+ * most Unix kernels.  Worse, the output tape gets written into a very random
+ * sequence of blocks of the temp file, ensuring that things will be even
+ * worse when it comes time to read that tape.  A straightforward merge pass
+ * thus ends up doing a lot of waiting for disk seeks.  We can improve matters
+ * by prereading from each source tape sequentially, loading about workMem/M
+ * bytes from each tape in turn.  Then we run the merge algorithm, writing but
+ * not reading until one of the preloaded tuple series runs out.  Then we
+ * switch back to preread mode, fill memory again, and repeat.  This approach
+ * helps to localize both read and write accesses.
  *
  * When the caller requests random access to the sort result, we form
  * the final sorted run on a logical tape which is then "frozen", so
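The replace-top merge pattern described in the revised comment is easy to see in miniature. Below is a self-contained sketch, not the tuplesort.c code itself (which manages SortTuples, tape buffers, and prereading): the HeapItem struct and sift_down helper are hypothetical names, and three sorted int arrays stand in for source tapes. The loop outputs the heap's smallest element, then either replaces the top with the next element from the same source or, once that source is exhausted, deletes the top.

#include <stdio.h>

typedef struct
{
    int     value;      /* frontmost not-yet-output element of the source */
    int     src;        /* which source array it came from */
} HeapItem;

/* Sift the item at index i down into place in a min-heap of size n. */
static void
sift_down(HeapItem *heap, int n, int i)
{
    for (;;)
    {
        int     j = 2 * i + 1;
        HeapItem tmp;

        if (j >= n)
            break;
        if (j + 1 < n && heap[j + 1].value < heap[j].value)
            j++;                /* pick the smaller child */
        if (heap[i].value <= heap[j].value)
            break;
        tmp = heap[i];
        heap[i] = heap[j];
        heap[j] = tmp;
        i = j;
    }
}

int
main(void)
{
    int     a[] = {1, 4, 9};
    int     b[] = {2, 3, 8};
    int     c[] = {5, 6, 7};
    int    *srcs[3] = {a, b, c};
    int     lens[3] = {3, 3, 3};
    int     pos[3] = {0, 0, 0};
    HeapItem heap[3];
    int     n = 0;
    int     i;

    /* prime the heap with the frontmost element of each source, heapify */
    for (i = 0; i < 3; i++)
    {
        heap[n].value = srcs[i][pos[i]++];
        heap[n].src = i;
        n++;
    }
    for (i = n / 2 - 1; i >= 0; i--)
        sift_down(heap, n, i);

    while (n > 0)
    {
        int     s = heap[0].src;

        printf("%d ", heap[0].value);   /* output the smallest element */
        if (pos[s] < lens[s])
        {
            /* replace the top with the next element from the same source */
            heap[0].value = srcs[s][pos[s]++];
            sift_down(heap, n, 0);
        }
        else
        {
            /* source exhausted: delete the top instead */
            heap[0] = heap[--n];
            sift_down(heap, n, 0);
        }
    }
    printf("\n");               /* prints 1 2 3 4 5 6 7 8 9 */
    return 0;
}

The patch's point is exactly this shape: in the common case the merge touches the heap once per output tuple instead of twice.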
@@ -569,8 +569,10 @@ static void make_bounded_heap(Tuplesortstate *state);
 static void sort_bounded_heap(Tuplesortstate *state);
 static void tuplesort_sort_memtuples(Tuplesortstate *state);
 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
-                      int tupleindex, bool checkIndex);
-static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
+                      bool checkIndex);
+static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
+                           bool checkIndex);
+static void tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex);
 static void reversedirection(Tuplesortstate *state);
 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
 static void markrunend(Tuplesortstate *state, int tapenum);
@@ -1617,10 +1619,10 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
             }
             else
             {
-                /* discard top of heap, sift up, insert new tuple */
+                /* discard top of heap, replacing it with the new tuple */
                 free_sort_tuple(state, &state->memtuples[0]);
-                tuplesort_heap_siftup(state, false);
-                tuplesort_heap_insert(state, tuple, 0, false);
+                tuple->tupindex = 0;    /* not used */
+                tuplesort_heap_replace_top(state, tuple, false);
             }
             break;
 
@@ -1665,7 +1667,8 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
                  * initial COMPARETUP() call is required for the tuple, to
                  * determine that the tuple does not belong in RUN_FIRST).
                  */
-                tuplesort_heap_insert(state, tuple, state->currentRun, true);
+                tuple->tupindex = state->currentRun;
+                tuplesort_heap_insert(state, tuple, true);
             }
             else
             {
@@ -1987,7 +1990,6 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
                  * more generally.
                  */
                 *stup = state->memtuples[0];
-                tuplesort_heap_siftup(state, false);
                 if ((tupIndex = state->mergenext[srcTape]) == 0)
                 {
                     /*
@@ -2008,18 +2010,25 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
                      */
                     if ((tupIndex = state->mergenext[srcTape]) == 0)
                     {
+                        /* Remove the top node from the heap */
+                        tuplesort_heap_delete_top(state, false);
                         /* Free tape's buffer, avoiding dangling pointer */
                         if (state->batchUsed)
                             mergebatchfreetape(state, srcTape, stup, should_free);
                         return true;
                     }
                 }
-                /* pull next preread tuple from list, insert in heap */
+
+                /*
+                 * pull next preread tuple from list, and replace the returned
+                 * tuple at top of the heap with it.
+                 */
                 newtup = &state->memtuples[tupIndex];
                 state->mergenext[srcTape] = newtup->tupindex;
                 if (state->mergenext[srcTape] == 0)
                     state->mergelast[srcTape] = 0;
-                tuplesort_heap_insert(state, newtup, srcTape, false);
+                newtup->tupindex = srcTape;
+                tuplesort_heap_replace_top(state, newtup, false);
                 /* put the now-unused memtuples entry on the freelist */
                 newtup->tupindex = state->mergefreelist;
                 state->mergefreelist = tupIndex;
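What the new call sequence buys, roughly: the old code paid for a full sift-down (to delete the top) plus a bubble-up insert per returned tuple, while tuplesort_heap_replace_top needs only the single sift-down. The instrumented toy program below, with hypothetical names and plain int keys instead of SortTuples, counts comparisons for both patterns over the same pseudo-random input; expect the delete-then-insert variant to come out measurably more expensive.

#include <stdio.h>

static long ncompares = 0;

/* counting three-way comparison, standing in for COMPARETUP() */
static int
cmp(int a, int b)
{
    ncompares++;
    return (a > b) - (a < b);
}

/* Sift h[0] down into place in a min-heap of size n. */
static void
sift_down(int *h, int n)
{
    int     i = 0,
            j;

    while ((j = 2 * i + 1) < n)
    {
        int     tmp;

        if (j + 1 < n && cmp(h[j + 1], h[j]) < 0)
            j++;
        if (cmp(h[i], h[j]) <= 0)
            break;
        tmp = h[i];
        h[i] = h[j];
        h[j] = tmp;
        i = j;
    }
}

/* Classic insert: bubble the new element up from the bottom. */
static void
heap_insert(int *h, int *n, int v)
{
    int     i = (*n)++;

    h[i] = v;
    while (i > 0 && cmp(h[(i - 1) / 2], h[i]) > 0)
    {
        int     p = (i - 1) / 2,
                tmp = h[p];

        h[p] = h[i];
        h[i] = tmp;
        i = p;
    }
}

/* tiny LCG, so both runs consume an identical input stream */
static unsigned int rngstate;

static int
nextval(void)
{
    rngstate = rngstate * 1103515245u + 12345u;
    return (int) ((rngstate >> 16) & 0x7fff);
}

int
main(void)
{
    int     h[1024],
            n,
            i,
            mode;
    long    cost[2];

    for (mode = 0; mode < 2; mode++)
    {
        rngstate = 7;
        n = 0;
        while (n < 1023)
            heap_insert(h, &n, nextval());
        ncompares = 0;
        for (i = 0; i < 100000; i++)
        {
            if (mode == 0)
            {
                /* old pattern: delete the top, then insert anew */
                h[0] = h[--n];
                sift_down(h, n);
                heap_insert(h, &n, nextval());
            }
            else
            {
                /* new pattern: overwrite the top, sift down once */
                h[0] = nextval();
                sift_down(h, n);
            }
        }
        cost[mode] = ncompares;
    }
    printf("delete+insert: %ld compares, replace-top: %ld compares\n",
           cost[0], cost[1]);
    return 0;
}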
@@ -2394,7 +2403,8 @@ inittapes(Tuplesortstate *state)
             /* Must copy source tuple to avoid possible overwrite */
             SortTuple   stup = state->memtuples[j];
 
-            tuplesort_heap_insert(state, &stup, 0, false);
+            stup.tupindex = RUN_FIRST;
+            tuplesort_heap_insert(state, &stup, false);
         }
         Assert(state->memtupcount == ntuples);
     }
@@ -2642,22 +2652,29 @@ mergeonerun(Tuplesortstate *state)
         /* writetup adjusted total free space, now fix per-tape space */
         spaceFreed = state->availMem - priorAvail;
         state->mergeavailmem[srcTape] += spaceFreed;
-        /* compact the heap */
-        tuplesort_heap_siftup(state, false);
         if ((tupIndex = state->mergenext[srcTape]) == 0)
         {
             /* out of preloaded data on this tape, try to read more */
             mergepreread(state);
             /* if still no data, we've reached end of run on this tape */
             if ((tupIndex = state->mergenext[srcTape]) == 0)
+            {
+                /* remove the written-out tuple from the heap */
+                tuplesort_heap_delete_top(state, false);
                 continue;
+            }
         }
-        /* pull next preread tuple from list, insert in heap */
+
+        /*
+         * pull next preread tuple from list, and replace the written-out
+         * tuple in the heap with it.
+         */
         tup = &state->memtuples[tupIndex];
         state->mergenext[srcTape] = tup->tupindex;
         if (state->mergenext[srcTape] == 0)
             state->mergelast[srcTape] = 0;
-        tuplesort_heap_insert(state, tup, srcTape, false);
+        tup->tupindex = srcTape;
+        tuplesort_heap_replace_top(state, tup, false);
         /* put the now-unused memtuples entry on the freelist */
         tup->tupindex = state->mergefreelist;
         state->mergefreelist = tupIndex;
@@ -2793,7 +2810,8 @@ beginmerge(Tuplesortstate *state, bool finalMergeBatch)
         state->mergenext[srcTape] = tup->tupindex;
         if (state->mergenext[srcTape] == 0)
             state->mergelast[srcTape] = 0;
-        tuplesort_heap_insert(state, tup, srcTape, false);
+        tup->tupindex = srcTape;
+        tuplesort_heap_insert(state, tup, false);
         /* put the now-unused memtuples entry on the freelist */
         tup->tupindex = state->mergefreelist;
         state->mergefreelist = tupIndex;
@@ -3275,13 +3293,12 @@ dumptuples(Tuplesortstate *state, bool alltuples)
              * Still holding out for a case favorable to replacement
              * selection.  Still incrementally spilling using heap.
              *
-             * Dump the heap's frontmost entry, and sift up to remove it from
-             * the heap.
+             * Dump the heap's frontmost entry, and remove it from the heap.
              */
             Assert(state->memtupcount > 0);
             WRITETUP(state, state->tp_tapenum[state->destTape],
                      &state->memtuples[0]);
-            tuplesort_heap_siftup(state, true);
+            tuplesort_heap_delete_top(state, true);
         }
         else
         {
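A side detail worth noticing in these merge hunks: once a preread tuple moves into the heap, its memtuples[] slot is recycled by pushing it onto mergefreelist, reusing the tuple's tupindex field as the link (tup->tupindex = state->mergefreelist; state->mergefreelist = tupIndex). A minimal sketch of that index-threaded freelist idiom, with hypothetical names and a reserved slot 0 as the list terminator:

#include <stdio.h>

typedef struct
{
    int     value;
    int     link;       /* next-pointer; reused as the freelist chain */
} Slot;

#define NSLOTS 8

int
main(void)
{
    Slot    slots[NSLOTS];
    int     freehead;
    int     i,
            a,
            b;

    /* thread all slots except slot 0 onto the freelist; index 0 terminates
     * the chain, mirroring how tuplesort keeps slot 0 for the heap top */
    freehead = 0;
    for (i = NSLOTS - 1; i >= 1; i--)
    {
        slots[i].link = freehead;
        freehead = i;
    }

    /* pop two slots off the freelist */
    a = freehead;
    freehead = slots[a].link;
    b = freehead;
    freehead = slots[b].link;
    slots[a].value = 42;
    slots[b].value = 7;
    printf("allocated slots %d and %d\n", a, b);

    /* push slot a back, exactly like "tup->tupindex = state->mergefreelist;
     * state->mergefreelist = tupIndex;" in the hunks above */
    slots[a].link = freehead;
    freehead = a;
    printf("freelist head is now %d\n", freehead);
    return 0;
}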
@@ -3644,27 +3661,29 @@ make_bounded_heap(Tuplesortstate *state)
     state->memtupcount = 0;     /* make the heap empty */
     for (i = 0; i < tupcount; i++)
     {
-        if (state->memtupcount >= state->bound &&
-            COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
-        {
-            /* New tuple would just get thrown out, so skip it */
-            free_sort_tuple(state, &state->memtuples[i]);
-            CHECK_FOR_INTERRUPTS();
-        }
-        else
+        if (state->memtupcount < state->bound)
         {
             /* Insert next tuple into heap */
             /* Must copy source tuple to avoid possible overwrite */
             SortTuple   stup = state->memtuples[i];
 
-            tuplesort_heap_insert(state, &stup, 0, false);
-
-            /* If heap too full, discard largest entry */
-            if (state->memtupcount > state->bound)
+            stup.tupindex = 0;  /* not used */
+            tuplesort_heap_insert(state, &stup, false);
+        }
+        else
+        {
+            /*
+             * The heap is full.  Replace the largest entry with the new
+             * tuple, or just discard it, if it's larger than anything already
+             * in the heap.
+             */
+            if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
             {
-                free_sort_tuple(state, &state->memtuples[0]);
-                tuplesort_heap_siftup(state, false);
+                free_sort_tuple(state, &state->memtuples[i]);
+                CHECK_FOR_INTERRUPTS();
             }
+            else
+                tuplesort_heap_replace_top(state, &state->memtuples[i], false);
         }
     }
@@ -3685,16 +3704,16 @@ sort_bounded_heap(Tuplesortstate *state)
     Assert(tupcount == state->bound);
 
     /*
-     * We can unheapify in place because each sift-up will remove the largest
-     * entry, which we can promptly store in the newly freed slot at the end.
-     * Once we're down to a single-entry heap, we're done.
+     * We can unheapify in place because each delete-top call will remove the
+     * largest entry, which we can promptly store in the newly freed slot at
+     * the end.  Once we're down to a single-entry heap, we're done.
      */
     while (state->memtupcount > 1)
     {
         SortTuple   stup = state->memtuples[0];
 
         /* this sifts-up the next-largest entry and decreases memtupcount */
-        tuplesort_heap_siftup(state, false);
+        tuplesort_heap_delete_top(state, false);
         state->memtuples[state->memtupcount] = stup;
     }
     state->memtupcount = tupcount;
@@ -3738,30 +3757,21 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
  * Insert a new tuple into an empty or existing heap, maintaining the
  * heap invariant.  Caller is responsible for ensuring there's room.
  *
- * Note: we assume *tuple is a temporary variable that can be scribbled on.
- * For some callers, tuple actually points to a memtuples[] entry above the
+ * Note: For some callers, tuple points to a memtuples[] entry above the
  * end of the heap.  This is safe as long as it's not immediately adjacent
  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
  * is, it might get overwritten before being moved into the heap!
  */
 static void
 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
-                      int tupleindex, bool checkIndex)
+                      bool checkIndex)
 {
     SortTuple  *memtuples;
     int         j;
 
-    /*
-     * Save the tupleindex --- see notes above about writing on *tuple. It's a
-     * historical artifact that tupleindex is passed as a separate argument
-     * and not in *tuple, but it's notationally convenient so let's leave it
-     * that way.
-     */
-    tuple->tupindex = tupleindex;
-
     memtuples = state->memtuples;
     Assert(state->memtupcount < state->memtupsize);
-    Assert(!checkIndex || tupleindex == RUN_FIRST);
+    Assert(!checkIndex || tuple->tupindex == RUN_FIRST);
 
     CHECK_FOR_INTERRUPTS();
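The restructured make_bounded_heap loop is the standard bounded top-N pattern: fill the heap up to the bound, then for each further tuple either discard it or replace the heap's top with it. A standalone sketch keeping the three smallest values of a stream (hypothetical helpers; the real code instead reverses its comparator so the same min-heap routines act as a max-heap):

#include <stdio.h>

#define BOUND 3

/* Sift index 0 down in a MAX-heap of size n. */
static void
sift_down_max(int *h, int n)
{
    int     i = 0,
            j;

    while ((j = 2 * i + 1) < n)
    {
        int     tmp;

        if (j + 1 < n && h[j + 1] > h[j])
            j++;
        if (h[i] >= h[j])
            break;
        tmp = h[i];
        h[i] = h[j];
        h[j] = tmp;
        i = j;
    }
}

/* Append v and bubble it up, maintaining the max-heap invariant. */
static void
insert_max(int *h, int *n, int v)
{
    int     i = (*n)++;

    h[i] = v;
    while (i > 0 && h[(i - 1) / 2] < h[i])
    {
        int     p = (i - 1) / 2,
                tmp = h[p];

        h[p] = h[i];
        h[i] = tmp;
        i = p;
    }
}

int
main(void)
{
    int     input[] = {5, 1, 9, 3, 7, 2, 8};
    int     heap[BOUND];
    int     n = 0;
    int     i;

    for (i = 0; i < 7; i++)
    {
        if (n < BOUND)
            insert_max(heap, &n, input[i]);     /* heap not full yet */
        else if (input[i] >= heap[0])
            continue;           /* worse than the current worst: discard */
        else
        {
            heap[0] = input[i]; /* replace the largest kept value */
            sift_down_max(heap, n);
        }
    }
    for (i = 0; i < n; i++)
        printf("%d ", heap[i]); /* three smallest, in heap order: 3 1 2 */
    printf("\n");
    return 0;
}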
@@ -3783,25 +3793,51 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
 }
 
 /*
- * The tuple at state->memtuples[0] has been removed from the heap.
- * Decrement memtupcount, and sift up to maintain the heap invariant.
+ * Remove the tuple at state->memtuples[0] from the heap.  Decrement
+ * memtupcount, and sift up to maintain the heap invariant.
+ *
+ * The caller has already free'd the tuple the top node points to,
+ * if necessary.
  */
 static void
-tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
+tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex)
 {
     SortTuple  *memtuples = state->memtuples;
     SortTuple  *tuple;
-    int         i,
-                n;
 
     Assert(!checkIndex || state->currentRun == RUN_FIRST);
     if (--state->memtupcount <= 0)
         return;
 
+    /*
+     * Remove the last tuple in the heap, and re-insert it, by replacing the
+     * current top node with it.
+     */
+    tuple = &memtuples[state->memtupcount];
+    tuplesort_heap_replace_top(state, tuple, checkIndex);
+}
+
+/*
+ * Replace the tuple at state->memtuples[0] with a new tuple.  Sift up to
+ * maintain the heap invariant.
+ *
+ * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
+ * Heapsort, steps H3-H8).
+ */
+static void
+tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
+                           bool checkIndex)
+{
+    SortTuple  *memtuples = state->memtuples;
+    int         i,
+                n;
+
+    Assert(!checkIndex || state->currentRun == RUN_FIRST);
+    Assert(state->memtupcount >= 1);
+
     CHECK_FOR_INTERRUPTS();
 
     n = state->memtupcount;
-    tuple = &memtuples[n];      /* tuple that must be reinserted */
     i = 0;                      /* i is where the "hole" is */
     for (;;)
     {
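The replace_top loop that follows the hunk's last context lines uses the classic "hole" formulation: instead of swapping at every level, children are moved up into the hole until the displaced element fits, costing one move per level. A standalone int rendering of the delete-top/replace-top pair in that shape (hypothetical and simplified; the real functions compare via COMPARETUP and carry run numbers for the checkIndex assertions):

#include <stdio.h>

/* Put the value v at the root of a min-heap h of size n, sifting it down. */
static void
replace_top(int *h, int n, int v)
{
    int     i = 0;              /* i is where the "hole" is */

    for (;;)
    {
        int     j = 2 * i + 1;

        if (j >= n)
            break;
        if (j + 1 < n && h[j + 1] < h[j])
            j++;                /* pick the smaller child */
        if (v <= h[j])
            break;              /* v belongs in the hole */
        h[i] = h[j];            /* move child up; hole moves down */
        i = j;
    }
    h[i] = v;
}

/* Remove the root: shrink the heap and re-insert its last element on top. */
static int
delete_top(int *h, int *n)
{
    int     top = h[0];

    if (--(*n) > 0)
        replace_top(h, *n, h[*n]);
    return top;
}

int
main(void)
{
    int     h[] = {1, 3, 2, 7, 4, 6, 5};    /* already a valid min-heap */
    int     n = 7;

    replace_top(h, n, 8);       /* replace the 1 with 8 in one pass */
    while (n > 0)
        printf("%d ", delete_top(h, &n));   /* prints 2 3 4 5 6 7 8 */
    printf("\n");
    return 0;
}

Note how delete_top is just a thin wrapper over replace_top, exactly the relationship between tuplesort_heap_delete_top and tuplesort_heap_replace_top in the hunk above.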