diff options
author | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2012-08-06 13:02:15 +0300 |
---|---|---|
committer | Heikki Linnakangas <heikki.linnakangas@iki.fi> | 2012-08-06 14:33:27 +0300 |
commit | 31595993901484d24c9ba62428c7abec207dd55e (patch) | |
tree | e528688bf4c50f09e26347691ea4c5162bdcd5d9 /src/pl/plpython/plpython.c | |
parent | c9c95202b036ea75dec68a4afc1802f35b77a966 (diff) | |
download | postgresql-31595993901484d24c9ba62428c7abec207dd55e.tar.gz postgresql-31595993901484d24c9ba62428c7abec207dd55e.zip |
Perform conversion from Python unicode to string/bytes object via UTF-8.
We used to convert the unicode object directly to a string in the server
encoding by calling Python's PyUnicode_AsEncodedString function. In other
words, we used Python's routines to do the encoding. However, that has a
few problems. First of all, it required keeping a mapping table of Python
encoding names and PostgreSQL encodings. But the real killer was that Python
doesn't support EUC_TW and MULE_INTERNAL encodings at all.
Instead, convert the Python unicode object to UTF-8, and use PostgreSQL's
encoding conversion functions to convert from UTF-8 to server encoding. We
were already doing the same in the other direction in PLyUnicode_FromString,
so this is more consistent, too.
Note: This makes SQL_ASCII behave more leniently. We used to map
SQL_ASCII to Python's 'ascii', which in Python means strict 7-bit ASCII
only, so you got an error if the python string contained anything but pure
ASCII. You no longer get an error; you get the UTF-8 representation of the
string instead.
Backpatch to 9.0, where these conversions were introduced.
Jan Urbański
Diffstat (limited to 'src/pl/plpython/plpython.c')
-rw-r--r-- | src/pl/plpython/plpython.c | 98 |
1 files changed, 44 insertions, 54 deletions
diff --git a/src/pl/plpython/plpython.c b/src/pl/plpython/plpython.c index c0030d05a84..c6f8c387a87 100644 --- a/src/pl/plpython/plpython.c +++ b/src/pl/plpython/plpython.c @@ -4869,66 +4869,56 @@ PLy_free(void *ptr) static PyObject * PLyUnicode_Bytes(PyObject *unicode) { - PyObject *rv; - const char *serverenc; + PyObject *bytes, *rv; + char *utf8string, *encoded; + + /* First encode the Python unicode object with UTF-8. */ + bytes = PyUnicode_AsUTF8String(unicode); + if (bytes == NULL) + PLy_elog(ERROR, "could not convert Python Unicode object to bytes"); + + utf8string = PyBytes_AsString(bytes); + if (utf8string == NULL) { + Py_DECREF(bytes); + PLy_elog(ERROR, "could not extract bytes from encoded string"); + } /* - * Map PostgreSQL encoding to a Python encoding name. + * Then convert to server encoding if necessary. + * + * PyUnicode_AsEncodedString could be used to encode the object directly + * in the server encoding, but Python doesn't support all the encodings + * that PostgreSQL does (EUC_TW and MULE_INTERNAL). UTF-8 is used as an + * intermediary in PLyUnicode_FromString as well. */ - switch (GetDatabaseEncoding()) + if (GetDatabaseEncoding() != PG_UTF8) { - case PG_SQL_ASCII: - /* - * Mapping SQL_ASCII to Python's 'ascii' is a bit bogus. Python's - * 'ascii' means true 7-bit only ASCII, while PostgreSQL's - * SQL_ASCII means that anything is allowed, and the system doesn't - * try to interpret the bytes in any way. But not sure what else - * to do, and we haven't heard any complaints... 
- */ - serverenc = "ascii"; - break; - case PG_WIN1250: - serverenc = "cp1250"; - break; - case PG_WIN1251: - serverenc = "cp1251"; - break; - case PG_WIN1252: - serverenc = "cp1252"; - break; - case PG_WIN1253: - serverenc = "cp1253"; - break; - case PG_WIN1254: - serverenc = "cp1254"; - break; - case PG_WIN1255: - serverenc = "cp1255"; - break; - case PG_WIN1256: - serverenc = "cp1256"; - break; - case PG_WIN1257: - serverenc = "cp1257"; - break; - case PG_WIN1258: - serverenc = "cp1258"; - break; - case PG_WIN866: - serverenc = "cp866"; - break; - case PG_WIN874: - serverenc = "cp874"; - break; - default: - /* Other encodings have the same name in Python. */ - serverenc = GetDatabaseEncodingName(); - break; + PG_TRY(); + { + encoded = (char *) pg_do_encoding_conversion( + (unsigned char *) utf8string, + strlen(utf8string), + PG_UTF8, + GetDatabaseEncoding()); + } + PG_CATCH(); + { + Py_DECREF(bytes); + PG_RE_THROW(); + } + PG_END_TRY(); } + else + encoded = utf8string; - rv = PyUnicode_AsEncodedString(unicode, serverenc, "strict"); - if (rv == NULL) - PLy_elog(ERROR, "could not convert Python Unicode object to PostgreSQL server encoding"); + /* finally, build a bytes object in the server encoding */ + rv = PyBytes_FromStringAndSize(encoded, strlen(encoded)); + + /* if pg_do_encoding_conversion allocated memory, free it now */ + if (utf8string != encoded) + pfree(encoded); + + Py_DECREF(bytes); return rv; } |