diff options
author | Noah Misch <noah@leadboat.com> | 2017-11-11 11:10:53 -0800 |
---|---|---|
committer | Noah Misch <noah@leadboat.com> | 2017-11-11 11:11:21 -0800 |
commit | e7083dfce5754e38063502c683b7b7fa57021893 (patch) | |
tree | b47df0cdb68a1678af339ea3f6ab4e87621cb716 | |
parent | 9efd83bfd32b38c675b263f532ebe9a6ec53149e (diff) | |
download | postgresql-e7083dfce5754e38063502c683b7b7fa57021893.tar.gz postgresql-e7083dfce5754e38063502c683b7b7fa57021893.zip |
Ignore XML declaration in xpath_internal(), for UTF8 databases.
When a value contained an XML declaration naming some other encoding,
this function interpreted UTF8 bytes as the named encoding, yielding
mojibake. xml_parse() already has similar logic. This would be
necessary but not sufficient for non-UTF8 databases, so preserve
behavior there until the xpath facility can support such databases
comprehensively. Back-patch to 9.3 (all supported versions).
Pavel Stehule and Noah Misch
Discussion: https://postgr.es/m/CAFj8pRC-dM=tT=QkGi+Achkm+gwPmjyOayGuUfXVumCxkDgYWg@mail.gmail.com
-rw-r--r-- | src/backend/utils/adt/xml.c | 14 | ||||
-rw-r--r-- | src/test/regress/expected/xml.out | 31 | ||||
-rw-r--r-- | src/test/regress/expected/xml_1.out | 35 | ||||
-rw-r--r-- | src/test/regress/expected/xml_2.out | 31 | ||||
-rw-r--r-- | src/test/regress/sql/xml.sql | 32 |
5 files changed, 142 insertions, 1 deletions
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 268d577b94b..96f187c6ef4 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -3780,6 +3780,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, int32 xpath_len; xmlChar *string; xmlChar *xpath_expr; + size_t xmldecl_len = 0; int i; int ndim; Datum *ns_names_uris; @@ -3840,6 +3841,16 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, memcpy(xpath_expr, VARDATA(xpath_expr_text), xpath_len); xpath_expr[xpath_len] = '\0'; + /* + * In a UTF8 database, skip any xml declaration, which might assert + * another encoding. Ignore parse_xml_decl() failure, letting + * xmlCtxtReadMemory() report parse errors. Documentation disclaims + * xpath() support for non-ASCII data in non-UTF8 databases, so leave + * those scenarios bug-compatible with historical behavior. + */ + if (GetDatabaseEncoding() == PG_UTF8) + parse_xml_decl(string, &xmldecl_len, NULL, NULL, NULL); + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); PG_TRY(); @@ -3854,7 +3865,8 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces, if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); - doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0); + doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len, + len - xmldecl_len, NULL, NULL, 0); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML document"); diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out index b451b63847e..580e401ff65 100644 --- a/src/test/regress/expected/xml.out +++ b/src/test/regress/expected/xml.out @@ -670,6 +670,37 @@ SELECT xpath('/nosuchtag', '<root/>'); {} (1 row) +-- Round-trip non-ASCII data through xpath(). +DO $$ +DECLARE + xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>'; + degree_symbol text; + res xml[]; +BEGIN + -- Per the documentation, xpath() doesn't work on non-ASCII data when + -- the server encoding is not UTF8. The EXCEPTION block below, + -- currently dead code, will be relevant if we remove this limitation. + IF current_setting('server_encoding') <> 'UTF8' THEN + RAISE LOG 'skip: encoding % unsupported for xml', + current_setting('server_encoding'); + RETURN; + END IF; + + degree_symbol := convert_from('\xc2b0', 'UTF8'); + res := xpath('text()', (xml_declaration || + '<x>' || degree_symbol || '</x>')::xml); + IF degree_symbol <> res[1]::text THEN + RAISE 'expected % (%), got % (%)', + degree_symbol, convert_to(degree_symbol, 'UTF8'), + res[1], convert_to(res[1]::text, 'UTF8'); + END IF; +EXCEPTION + -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8" + WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM; + -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist + WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM; +END +$$; -- Test xmlexists and xpath_exists SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); xmlexists diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out index d7027030c36..2cc4804e493 100644 --- a/src/test/regress/expected/xml_1.out +++ b/src/test/regress/expected/xml_1.out @@ -576,6 +576,41 @@ LINE 1: SELECT xpath('/nosuchtag', '<root/>'); ^ DETAIL: This functionality requires the server to be built with libxml support. HINT: You need to rebuild PostgreSQL using --with-libxml. +-- Round-trip non-ASCII data through xpath(). +DO $$ +DECLARE + xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>'; + degree_symbol text; + res xml[]; +BEGIN + -- Per the documentation, xpath() doesn't work on non-ASCII data when + -- the server encoding is not UTF8. The EXCEPTION block below, + -- currently dead code, will be relevant if we remove this limitation. + IF current_setting('server_encoding') <> 'UTF8' THEN + RAISE LOG 'skip: encoding % unsupported for xml', + current_setting('server_encoding'); + RETURN; + END IF; + + degree_symbol := convert_from('\xc2b0', 'UTF8'); + res := xpath('text()', (xml_declaration || + '<x>' || degree_symbol || '</x>')::xml); + IF degree_symbol <> res[1]::text THEN + RAISE 'expected % (%), got % (%)', + degree_symbol, convert_to(degree_symbol, 'UTF8'), + res[1], convert_to(res[1]::text, 'UTF8'); + END IF; +EXCEPTION + -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8" + WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM; + -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist + WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM; +END +$$; +ERROR: unsupported XML feature +DETAIL: This functionality requires the server to be built with libxml support. +HINT: You need to rebuild PostgreSQL using --with-libxml. +CONTEXT: PL/pgSQL function inline_code_block line 17 at assignment -- Test xmlexists and xpath_exists SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); ERROR: unsupported XML feature diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out index 23c4e767f0b..3a959480740 100644 --- a/src/test/regress/expected/xml_2.out +++ b/src/test/regress/expected/xml_2.out @@ -650,6 +650,37 @@ SELECT xpath('/nosuchtag', '<root/>'); {} (1 row) +-- Round-trip non-ASCII data through xpath(). +DO $$ +DECLARE + xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>'; + degree_symbol text; + res xml[]; +BEGIN + -- Per the documentation, xpath() doesn't work on non-ASCII data when + -- the server encoding is not UTF8. The EXCEPTION block below, + -- currently dead code, will be relevant if we remove this limitation. + IF current_setting('server_encoding') <> 'UTF8' THEN + RAISE LOG 'skip: encoding % unsupported for xml', + current_setting('server_encoding'); + RETURN; + END IF; + + degree_symbol := convert_from('\xc2b0', 'UTF8'); + res := xpath('text()', (xml_declaration || + '<x>' || degree_symbol || '</x>')::xml); + IF degree_symbol <> res[1]::text THEN + RAISE 'expected % (%), got % (%)', + degree_symbol, convert_to(degree_symbol, 'UTF8'), + res[1], convert_to(res[1]::text, 'UTF8'); + END IF; +EXCEPTION + -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8" + WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM; + -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist + WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM; +END +$$; -- Test xmlexists and xpath_exists SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); xmlexists diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql index 08a0b30067a..ac80d2f1a8c 100644 --- a/src/test/regress/sql/xml.sql +++ b/src/test/regress/sql/xml.sql @@ -189,6 +189,38 @@ SELECT xpath('count(//*)=3', '<root><sub/><sub/></root>'); SELECT xpath('name(/*)', '<root><sub/><sub/></root>'); SELECT xpath('/nosuchtag', '<root/>'); +-- Round-trip non-ASCII data through xpath(). +DO $$ +DECLARE + xml_declaration text := '<?xml version="1.0" encoding="ISO-8859-1"?>'; + degree_symbol text; + res xml[]; +BEGIN + -- Per the documentation, xpath() doesn't work on non-ASCII data when + -- the server encoding is not UTF8. The EXCEPTION block below, + -- currently dead code, will be relevant if we remove this limitation. + IF current_setting('server_encoding') <> 'UTF8' THEN + RAISE LOG 'skip: encoding % unsupported for xml', + current_setting('server_encoding'); + RETURN; + END IF; + + degree_symbol := convert_from('\xc2b0', 'UTF8'); + res := xpath('text()', (xml_declaration || + '<x>' || degree_symbol || '</x>')::xml); + IF degree_symbol <> res[1]::text THEN + RAISE 'expected % (%), got % (%)', + degree_symbol, convert_to(degree_symbol, 'UTF8'), + res[1], convert_to(res[1]::text, 'UTF8'); + END IF; +EXCEPTION + -- character with byte sequence 0xc2 0xb0 in encoding "UTF8" has no equivalent in encoding "LATIN8" + WHEN untranslatable_character THEN RAISE LOG 'skip: %', SQLERRM; + -- default conversion function for encoding "UTF8" to "MULE_INTERNAL" does not exist + WHEN undefined_function THEN RAISE LOG 'skip: %', SQLERRM; +END +$$; + -- Test xmlexists and xpath_exists SELECT xmlexists('//town[text() = ''Toronto'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); SELECT xmlexists('//town[text() = ''Cwmbran'']' PASSING BY REF '<towns><town>Bidford-on-Avon</town><town>Cwmbran</town><town>Bristol</town></towns>'); |