Do COPY FROM encoding conversion/verification in larger chunks.

This gives a small performance gain, by reducing the number of calls to the conversion/verification function, and letting it work with larger inputs. Also, reorganizing the input pipeline makes it easier to parallelize the input parsing: after the input has been converted to the database encoding, the next stage of finding the newlines can be done in parallel, because there cannot be any newline chars "embedded" in multi-byte characters in the encodings that we support as server encodings.

This changes behavior in one corner case: if client and server encodings are the same single-byte encoding (e.g. latin1), previously the input would not be checked for zero bytes ('\0'). Any fields containing zero bytes would be truncated at the zero. But if encoding conversion was needed, the conversion routine would throw an error on the zero. After this commit, the input is always checked for zeros.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2021-04-01 12:23:40 +0300
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2021-04-01 12:23:40 +0300
commit: f82de5c46bdf8cd65812a7b04c9509c218e1545d (patch)
tree: f9d687f0e1f50666a4a4cf8fbe366a2cd7e43d1c /src/backend/commands/copyfrom.c
parent: ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (diff)
download: postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.tar.gz
postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.zip
1 files changed, 51 insertions, 29 deletions
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 74dbb709fe7..be2e3d7354f 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -3,6 +3,12 @@
  * copyfrom.c
  *		COPY <table> FROM file/program/client
  *
+ * This file contains routines needed to efficiently load tuples into a
+ * table.  That includes looking up the correct partition, firing triggers,
+ * calling the table AM function to insert the data, and updating indexes.
+ * Reading data from the input file or client and parsing it into Datums
+ * is handled in copyfromparse.c.
+ *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
@@ -23,6 +29,7 @@
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "catalog/namespace.h"
 #include "commands/copy.h"
 #include "commands/copyfrom_internal.h"
 #include "commands/progress.h"
@@ -87,7 +94,7 @@ typedef struct CopyMultiInsertInfo
 	List	   *multiInsertBuffers; /* List of tracked CopyMultiInsertBuffers */
 	int			bufferedTuples; /* number of tuples buffered over all buffers */
 	int			bufferedBytes;	/* number of bytes from all buffered tuples */
-	CopyFromState	cstate;			/* Copy state for this CopyMultiInsertInfo */
+	CopyFromState cstate;		/* Copy state for this CopyMultiInsertInfo */
 	EState	   *estate;			/* Executor state used for COPY */
 	CommandId	mycid;			/* Command Id used for COPY */
 	int			ti_options;		/* table insert options */
@@ -107,7 +114,7 @@ static void ClosePipeFromProgram(CopyFromState cstate);
 void
 CopyFromErrorCallback(void *arg)
 {
-	CopyFromState	cstate = (CopyFromState) arg;
+	CopyFromState cstate = (CopyFromState) arg;
 	char		curlineno_str[32];
 
 	snprintf(curlineno_str, sizeof(curlineno_str), UINT64_FORMAT,
@@ -149,15 +156,9 @@ CopyFromErrorCallback(void *arg)
 			/*
 			 * Error is relevant to a particular line.
 			 *
-			 * If line_buf still contains the correct line, and it's already
-			 * transcoded, print it. If it's still in a foreign encoding, it's
-			 * quite likely that the error is precisely a failure to do
-			 * encoding conversion (ie, bad data). We dare not try to convert
-			 * it, and at present there's no way to regurgitate it without
-			 * conversion. So we have to punt and just report the line number.
+			 * If line_buf still contains the correct line, print it.
 			 */
-			if (cstate->line_buf_valid &&
-				(cstate->line_buf_converted || !cstate->need_transcoding))
+			if (cstate->line_buf_valid)
 			{
 				char	   *lineval;
 
@@ -300,7 +301,7 @@ CopyMultiInsertBufferFlush(CopyMultiInsertInfo *miinfo,
 	MemoryContext oldcontext;
 	int			i;
 	uint64		save_cur_lineno;
-	CopyFromState	cstate = miinfo->cstate;
+	CopyFromState cstate = miinfo->cstate;
 	EState	   *estate = miinfo->estate;
 	CommandId	mycid = miinfo->mycid;
 	int			ti_options = miinfo->ti_options;
@@ -1191,7 +1192,7 @@ BeginCopyFrom(ParseState *pstate,
 			  List *attnamelist,
 			  List *options)
 {
-	CopyFromState	cstate;
+	CopyFromState cstate;
 	bool		pipe = (filename == NULL);
 	TupleDesc	tupDesc;
 	AttrNumber	num_phys_attrs,
@@ -1229,7 +1230,7 @@ BeginCopyFrom(ParseState *pstate,
 	oldcontext = MemoryContextSwitchTo(cstate->copycontext);
 
 	/* Extract options from the statement node tree */
-	ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */, options);
+	ProcessCopyOptions(pstate, &cstate->opts, true /* is_from */ , options);
 
 	/* Process the target relation */
 	cstate->rel = rel;
@@ -1320,15 +1321,20 @@ BeginCopyFrom(ParseState *pstate,
 		cstate->file_encoding = cstate->opts.file_encoding;
 
 	/*
-	 * Set up encoding conversion info.  Even if the file and server encodings
-	 * are the same, we must apply pg_any_to_server() to validate data in
-	 * multibyte encodings.
+	 * Look up encoding conversion function.
 	 */
-	cstate->need_transcoding =
-		(cstate->file_encoding != GetDatabaseEncoding() ||
-		 pg_database_encoding_max_length() > 1);
-	/* See Multibyte encoding comment above */
-	cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->file_encoding);
+	if (cstate->file_encoding == GetDatabaseEncoding() ||
+		cstate->file_encoding == PG_SQL_ASCII ||
+		GetDatabaseEncoding() == PG_SQL_ASCII)
+	{
+		cstate->need_transcoding = false;
+	}
+	else
+	{
+		cstate->need_transcoding = true;
+		cstate->conversion_proc = FindDefaultConversionProc(cstate->file_encoding,
+															GetDatabaseEncoding());
+	}
 
 	cstate->copy_src = COPY_FILE;	/* default */
 
@@ -1339,7 +1345,6 @@ BeginCopyFrom(ParseState *pstate,
 	oldcontext = MemoryContextSwitchTo(cstate->copycontext);
 
 	/* Initialize state variables */
-	cstate->reached_eof = false;
 	cstate->eol_type = EOL_UNKNOWN;
 	cstate->cur_relname = RelationGetRelationName(cstate->rel);
 	cstate->cur_lineno = 0;
@@ -1347,19 +1352,36 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->cur_attval = NULL;
 
 	/*
-	 * Set up variables to avoid per-attribute overhead.  attribute_buf and
-	 * raw_buf are used in both text and binary modes, but we use line_buf
-	 * only in text mode.
+	 * Allocate buffers for the input pipeline.
+	 *
+	 * attribute_buf and raw_buf are used in both text and binary modes, but
+	 * input_buf and line_buf only in text mode.
 	 */
-	initStringInfo(&cstate->attribute_buf);
-	cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
+	cstate->raw_buf = palloc(RAW_BUF_SIZE + 1);
 	cstate->raw_buf_index = cstate->raw_buf_len = 0;
+	cstate->raw_reached_eof = false;
+
 	if (!cstate->opts.binary)
 	{
+		/*
+		 * If encoding conversion is needed, we need another buffer to hold
+		 * the converted input data.  Otherwise, we can just point input_buf
+		 * to the same buffer as raw_buf.
+		 */
+		if (cstate->need_transcoding)
+		{
+			cstate->input_buf = (char *) palloc(INPUT_BUF_SIZE + 1);
+			cstate->input_buf_index = cstate->input_buf_len = 0;
+		}
+		else
+			cstate->input_buf = cstate->raw_buf;
+		cstate->input_reached_eof = false;
+
 		initStringInfo(&cstate->line_buf);
-		cstate->line_buf_converted = false;
 	}
 
+	initStringInfo(&cstate->attribute_buf);
+
 	/* Assign range table, we'll need it in CopyFrom. */
 	if (pstate)
 		cstate->range_table = pstate->p_rtable;
@@ -1584,7 +1606,7 @@ ClosePipeFromProgram(CopyFromState cstate)
 		 * should not report that as an error.  Otherwise, SIGPIPE indicates a
 		 * problem.
 		 */
-		if (!cstate->reached_eof &&
+		if (!cstate->raw_reached_eof &&
 			wait_result_is_signal(pclose_rc, SIGPIPE))
 			return;
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2021-04-01 12:23:40 +0300
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2021-04-01 12:23:40 +0300
commit	f82de5c46bdf8cd65812a7b04c9509c218e1545d (patch)
tree	f9d687f0e1f50666a4a4cf8fbe366a2cd7e43d1c /src/backend/commands/copyfrom.c
parent	ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (diff)
download	postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.tar.gz postgresql-f82de5c46bdf8cd65812a7b04c9509c218e1545d.zip