From 31595993901484d24c9ba62428c7abec207dd55e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Mon, 6 Aug 2012 13:02:15 +0300
Subject: Perform conversion from Python unicode to string/bytes object via
 UTF-8.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We used to convert the unicode object directly to a string in the server
encoding by calling Python's PyUnicode_AsEncodedString function. In other
words, we used Python's routines to do the encoding. However, that has a
few problems. First of all, it required keeping a mapping table of Python
encoding names and PostgreSQL encodings. But the real killer was that Python
doesn't support EUC_TW and MULE_INTERNAL encodings at all.

Instead, convert the Python unicode object to UTF-8, and use PostgreSQL's
encoding conversion functions to convert from UTF-8 to server encoding. We
were already doing the same in the other direction in PLyUnicode_FromString,
so this is more consistent, too.

Note: This makes SQL_ASCII behave more leniently. We used to map
SQL_ASCII to Python's 'ascii', which in Python means strict 7-bit ASCII
only, so you got an error if the python string contained anything but pure
ASCII. You no longer get an error; you get the UTF-8 representation of the
string instead.

Backpatch to 9.0, where these conversions were introduced.

Jan Urbański
---
 src/pl/plpython/plpython.c | 98 +++++++++++++++++++++-------------------------
 1 file changed, 44 insertions(+), 54 deletions(-)

(limited to 'src/pl/plpython/plpython.c')

diff --git a/src/pl/plpython/plpython.c b/src/pl/plpython/plpython.c
index c0030d05a84..c6f8c387a87 100644
--- a/src/pl/plpython/plpython.c
+++ b/src/pl/plpython/plpython.c
@@ -4869,66 +4869,56 @@ PLy_free(void *ptr)
 static PyObject *
 PLyUnicode_Bytes(PyObject *unicode)
 {
-	PyObject   *rv;
-	const char *serverenc;
+	PyObject	*bytes, *rv;
+	char		*utf8string, *encoded;
+
+	/* First encode the Python unicode object with UTF-8. */
+	bytes = PyUnicode_AsUTF8String(unicode);
+	if (bytes == NULL)
+		PLy_elog(ERROR, "could not convert Python Unicode object to bytes");
+
+	utf8string = PyBytes_AsString(bytes);
+	if (utf8string == NULL) {
+		Py_DECREF(bytes);
+		PLy_elog(ERROR, "could not extract bytes from encoded string");
+	}
 
 	/*
-	 * Map PostgreSQL encoding to a Python encoding name.
+	 * Then convert to server encoding if necessary.
+	 *
+	 * PyUnicode_AsEncodedString could be used to encode the object directly
+	 * in the server encoding, but Python doesn't support all the encodings
+	 * that PostgreSQL does (EUC_TW and MULE_INTERNAL). UTF-8 is used as an
+	 * intermediary in PLyUnicode_FromString as well.
 	 */
-	switch (GetDatabaseEncoding())
+	if (GetDatabaseEncoding() != PG_UTF8)
 	{
-		case PG_SQL_ASCII:
-			/*
-			 * Mapping SQL_ASCII to Python's 'ascii' is a bit bogus. Python's
-			 * 'ascii' means true 7-bit only ASCII, while PostgreSQL's
-			 * SQL_ASCII means that anything is allowed, and the system doesn't
-			 * try to interpret the bytes in any way. But not sure what else
-			 * to do, and we haven't heard any complaints...
-			 */
-			serverenc = "ascii";
-			break;
-		case PG_WIN1250:
-			serverenc = "cp1250";
-			break;
-		case PG_WIN1251:
-			serverenc = "cp1251";
-			break;
-		case PG_WIN1252:
-			serverenc = "cp1252";
-			break;
-		case PG_WIN1253:
-			serverenc = "cp1253";
-			break;
-		case PG_WIN1254:
-			serverenc = "cp1254";
-			break;
-		case PG_WIN1255:
-			serverenc = "cp1255";
-			break;
-		case PG_WIN1256:
-			serverenc = "cp1256";
-			break;
-		case PG_WIN1257:
-			serverenc = "cp1257";
-			break;
-		case PG_WIN1258:
-			serverenc = "cp1258";
-			break;
-		case PG_WIN866:
-			serverenc = "cp866";
-			break;
-		case PG_WIN874:
-			serverenc = "cp874";
-			break;
-		default:
-			/* Other encodings have the same name in Python. */
-			serverenc = GetDatabaseEncodingName();
-			break;
+		PG_TRY();
+		{
+			encoded = (char *) pg_do_encoding_conversion(
+				(unsigned char *) utf8string,
+				strlen(utf8string),
+				PG_UTF8,
+				GetDatabaseEncoding());
+		}
+		PG_CATCH();
+		{
+			Py_DECREF(bytes);
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
 	}
+	else
+		encoded = utf8string;
 
-	rv = PyUnicode_AsEncodedString(unicode, serverenc, "strict");
-	if (rv == NULL)
-		PLy_elog(ERROR, "could not convert Python Unicode object to PostgreSQL server encoding");
+	/* finally, build a bytes object in the server encoding */
+	rv = PyBytes_FromStringAndSize(encoded, strlen(encoded));
+
+	/* if pg_do_encoding_conversion allocated memory, free it now */
+	if (utf8string != encoded)
+		pfree(encoded);
+
+	Py_DECREF(bytes);
 	return rv;
 }
 
-- 
cgit v1.2.3