author	Tom Lane <tgl@sss.pgh.pa.us>	2018-09-14 17:31:51 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2018-09-14 17:31:51 -0400
commit	548e50976ce721b5e927d42a105c2f05b51b52a6 (patch)
tree	9d492f63d5f8715d72c903448467623fb1dd737a /src/bin/pg_dump/pg_dump.c
parent	20bef2c3110af295501919bac463b87ac58876de (diff)
Improve parallel scheduling logic in pg_dump/pg_restore.
Previously, the way this worked was that a parallel pg_dump would re-order the TABLE_DATA items in the dump's TOC into decreasing size order, and separately re-order (some of) the INDEX items into decreasing size order. Then pg_dump would dump the items in that order. Later, parallel pg_restore just followed the TOC order.

This method had lots of deficiencies:

* TOC ordering randomly differed between parallel and non-parallel dumps, and was hard to predict in the former case, causing problems for building stable pg_dump test cases.

* Parallel restore only followed a well-chosen order if the dump had been done in parallel; in particular, this never happened for restore from custom-format dumps.

* The best order for restore isn't necessarily the same as for dump, and it's not really static either because of locking considerations.

* TABLE_DATA and INDEX items aren't the only things that might take a lot of work during restore. Scheduling was particularly stupid for the BLOBS item, which might require lots of work during dump as well as restore, but was left to the end in either case.

This patch removes the logic that changed the TOC order, fixing the test instability problem. Instead, we sort the parallelizable items just before processing them during a parallel dump. Independently of that, parallel restore prioritizes the ready-to-execute tasks based on the size of the underlying table. In the case of dependent tasks such as index, constraint, or foreign key creation, the largest relevant table is used as the metric for estimating the task length. (This is pretty crude, but it should be enough to avoid the case we want to avoid, which is ending the run with just a few large tasks such that we can't make use of all N workers.)

Patch by me, responding to a complaint from Peter Eisentraut, who also reviewed the patch.

Discussion: https://postgr.es/m/5137fe12-d0a2-4971-61b6-eb4e7e8875f8@2ndquadrant.com
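To make the dump-side scheduling idea concrete, here is a minimal, self-contained C sketch (not part of the patch, and not pg_dump's real TocEntry machinery) of the "largest jobs first" ordering: each parallelizable item carries a size estimate in table pages, and the list is sorted in descending order just before jobs are handed to workers. The toc_item struct, its field names, and the sample values are hypothetical.

#include <stdio.h>
#include <stdlib.h>

typedef struct toc_item
{
	const char *desc;			/* e.g. "TABLE DATA" or "BLOBS" */
	unsigned int dataLength;	/* size estimate, in table pages */
} toc_item;

/* qsort comparator: descending by dataLength, so big jobs start first */
static int
cmp_by_size_desc(const void *a, const void *b)
{
	const toc_item *ta = (const toc_item *) a;
	const toc_item *tb = (const toc_item *) b;

	if (ta->dataLength > tb->dataLength)
		return -1;
	if (ta->dataLength < tb->dataLength)
		return 1;
	return 0;
}

int
main(void)
{
	toc_item	items[] = {
		{"TABLE DATA small_table", 10},
		{"TABLE DATA big_table", 100000},
		{"BLOBS", 0xFFFFFFFE},	/* "largest" sentinel, like MaxBlockNumber */
	};

	qsort(items, 3, sizeof(toc_item), cmp_by_size_desc);

	for (int i = 0; i < 3; i++)
		printf("%s (%u pages)\n", items[i].desc, items[i].dataLength);
	return 0;
}

The sentinel value mirrors the patch's treatment of the BLOBS entry, which simply gets the largest possible size estimate so that it is dispatched first.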
Diffstat (limited to 'src/bin/pg_dump/pg_dump.c')
-rw-r--r--	src/bin/pg_dump/pg_dump.c	84
1 file changed, 57 insertions(+), 27 deletions(-)
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index f0ea83e6a97..0687a819147 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -54,6 +54,7 @@
#include "catalog/pg_trigger_d.h"
#include "catalog/pg_type_d.h"
#include "libpq/libpq-fs.h"
+#include "storage/block.h"
#include "dumputils.h"
#include "parallel.h"
@@ -845,10 +846,6 @@ main(int argc, char **argv)
*/
sortDumpableObjectsByTypeName(dobjs, numObjs);
- /* If we do a parallel dump, we want the largest tables to go first */
- if (archiveFormat == archDirectory && numWorkers > 1)
- sortDataAndIndexObjectsBySize(dobjs, numObjs);
-
sortDumpableObjects(dobjs, numObjs,
boundaryObjs[0].dumpId, boundaryObjs[1].dumpId);
@@ -2156,13 +2153,28 @@ dumpTableData(Archive *fout, TableDataInfo *tdinfo)
* See comments for BuildArchiveDependencies.
*/
if (tdinfo->dobj.dump & DUMP_COMPONENT_DATA)
- ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
- tbinfo->dobj.name, tbinfo->dobj.namespace->dobj.name,
- NULL, tbinfo->rolname,
- false, "TABLE DATA", SECTION_DATA,
- "", "", copyStmt,
- &(tbinfo->dobj.dumpId), 1,
- dumpFn, tdinfo);
+ {
+ TocEntry *te;
+
+ te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+ tbinfo->dobj.name, tbinfo->dobj.namespace->dobj.name,
+ NULL, tbinfo->rolname,
+ false, "TABLE DATA", SECTION_DATA,
+ "", "", copyStmt,
+ &(tbinfo->dobj.dumpId), 1,
+ dumpFn, tdinfo);
+
+ /*
+ * Set the TocEntry's dataLength in case we are doing a parallel dump
+ * and want to order dump jobs by table size. We choose to measure
+ * dataLength in table pages during dump, so no scaling is needed.
+ * However, relpages is declared as "integer" in pg_class, and hence
+ * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+ * Cast so that we get the right interpretation of table sizes
+ * exceeding INT_MAX pages.
+ */
+ te->dataLength = (BlockNumber) tbinfo->relpages;
+ }
destroyPQExpBuffer(copyBuf);
destroyPQExpBuffer(clistBuf);
@@ -6759,8 +6771,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
i_conoid,
i_condef,
i_tablespace,
- i_indreloptions,
- i_relpages;
+ i_indreloptions;
int ntups;
for (i = 0; i < numTables; i++)
@@ -6807,7 +6818,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
"i.indnkeyatts AS indnkeyatts, "
"i.indnatts AS indnatts, "
"i.indkey, i.indisclustered, "
- "i.indisreplident, t.relpages, "
+ "i.indisreplident, "
"c.contype, c.conname, "
"c.condeferrable, c.condeferred, "
"c.tableoid AS contableoid, "
@@ -6844,7 +6855,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
"i.indnatts AS indnkeyatts, "
"i.indnatts AS indnatts, "
"i.indkey, i.indisclustered, "
- "i.indisreplident, t.relpages, "
+ "i.indisreplident, "
"c.contype, c.conname, "
"c.condeferrable, c.condeferred, "
"c.tableoid AS contableoid, "
@@ -6877,7 +6888,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
"i.indnatts AS indnkeyatts, "
"i.indnatts AS indnatts, "
"i.indkey, i.indisclustered, "
- "false AS indisreplident, t.relpages, "
+ "false AS indisreplident, "
"c.contype, c.conname, "
"c.condeferrable, c.condeferred, "
"c.tableoid AS contableoid, "
@@ -6906,7 +6917,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
"i.indnatts AS indnkeyatts, "
"i.indnatts AS indnatts, "
"i.indkey, i.indisclustered, "
- "false AS indisreplident, t.relpages, "
+ "false AS indisreplident, "
"c.contype, c.conname, "
"c.condeferrable, c.condeferred, "
"c.tableoid AS contableoid, "
@@ -6938,7 +6949,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
"t.relnatts AS indnkeyatts, "
"t.relnatts AS indnatts, "
"i.indkey, i.indisclustered, "
- "false AS indisreplident, t.relpages, "
+ "false AS indisreplident, "
"c.contype, c.conname, "
"c.condeferrable, c.condeferred, "
"c.tableoid AS contableoid, "
@@ -6974,7 +6985,6 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
i_indkey = PQfnumber(res, "indkey");
i_indisclustered = PQfnumber(res, "indisclustered");
i_indisreplident = PQfnumber(res, "indisreplident");
- i_relpages = PQfnumber(res, "relpages");
i_contype = PQfnumber(res, "contype");
i_conname = PQfnumber(res, "conname");
i_condeferrable = PQfnumber(res, "condeferrable");
@@ -7013,7 +7023,6 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables)
indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't');
indxinfo[j].indisreplident = (PQgetvalue(res, j, i_indisreplident)[0] == 't');
indxinfo[j].parentidx = atooid(PQgetvalue(res, j, i_parentidx));
- indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages));
contype = *(PQgetvalue(res, j, i_contype));
if (contype == 'p' || contype == 'u' || contype == 'x')
@@ -8206,6 +8215,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
"'' AS attfdwoptions,\n");
if (fout->remoteVersion >= 90100)
+ {
/*
* Since we only want to dump COLLATE clauses for attributes whose
* collation is different from their type's default, we use a CASE
@@ -8214,6 +8224,7 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
appendPQExpBuffer(q,
"CASE WHEN a.attcollation <> t.typcollation "
"THEN a.attcollation ELSE 0 END AS attcollation,\n");
+ }
else
appendPQExpBuffer(q,
"0 AS attcollation,\n");
@@ -8225,8 +8236,8 @@ getTableAttrs(Archive *fout, TableInfo *tblinfo, int numTables)
appendPQExpBuffer(q,
"'' AS attoptions\n");
+ /* need left join here to not fail on dropped columns ... */
appendPQExpBuffer(q,
- /* need left join here to not fail on dropped columns ... */
"FROM pg_catalog.pg_attribute a LEFT JOIN pg_catalog.pg_type t "
"ON a.atttypid = t.oid\n"
"WHERE a.attrelid = '%u'::pg_catalog.oid "
@@ -9772,12 +9783,31 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj)
break;
case DO_BLOB_DATA:
if (dobj->dump & DUMP_COMPONENT_DATA)
- ArchiveEntry(fout, dobj->catId, dobj->dumpId,
- dobj->name, NULL, NULL, "",
- false, "BLOBS", SECTION_DATA,
- "", "", NULL,
- NULL, 0,
- dumpBlobs, NULL);
+ {
+ TocEntry *te;
+
+ te = ArchiveEntry(fout, dobj->catId, dobj->dumpId,
+ dobj->name, NULL, NULL, "",
+ false, "BLOBS", SECTION_DATA,
+ "", "", NULL,
+ NULL, 0,
+ dumpBlobs, NULL);
+
+ /*
+ * Set the TocEntry's dataLength in case we are doing a
+ * parallel dump and want to order dump jobs by table size.
+ * (We need some size estimate for every TocEntry with a
+ * DataDumper function.) We don't currently have any cheap
+ * way to estimate the size of blobs, but it doesn't matter;
+ * let's just set the size to a large value so parallel dumps
+ * will launch this job first. If there's lots of blobs, we
+ * win, and if there aren't, we don't lose much. (If you want
+ * to improve on this, really what you should be thinking
+ * about is allowing blob dumping to be parallelized, not just
+ * getting a smarter estimate for the single TOC entry.)
+ */
+ te->dataLength = MaxBlockNumber;
+ }
break;
case DO_POLICY:
dumpPolicy(fout, (PolicyInfo *) dobj);
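The restore-side half of the commit message (prioritizing ready-to-execute tasks by the size of the underlying table) lives in the parallel restore code rather than in pg_dump.c, so it is not visible in the diff above. As a rough illustration only, the idea is to hand an idle worker the ready item with the biggest size estimate; the ready_task struct and pick_largest_ready() below are hypothetical sketch code, not PostgreSQL source.

#include <stdio.h>
#include <stddef.h>

typedef struct ready_task
{
	const char *desc;			/* e.g. "INDEX" or "FK CONSTRAINT" */
	unsigned int size_estimate;	/* pages of the largest table it touches */
} ready_task;

/* Return the index of the ready task with the largest size estimate. */
static size_t
pick_largest_ready(const ready_task *tasks, size_t ntasks)
{
	size_t		best = 0;

	for (size_t i = 1; i < ntasks; i++)
		if (tasks[i].size_estimate > tasks[best].size_estimate)
			best = i;
	return best;
}

int
main(void)
{
	ready_task	ready[] = {
		{"INDEX ON small_table", 50},
		{"FK CONSTRAINT ON big_table", 400000},
		{"INDEX ON medium_table", 9000},
	};
	size_t		next = pick_largest_ready(ready, 3);

	/*
	 * The big FK job is dispatched first, so it cannot end up as the lone
	 * straggler that leaves the other workers idle at the end of the run.
	 */
	printf("next job: %s\n", ready[next].desc);
	return 0;
}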