diff options
Diffstat (limited to 'src/backend/utils/adt/xml.c')
-rw-r--r-- | src/backend/utils/adt/xml.c | 141 |
1 files changed, 125 insertions, 16 deletions
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 1116b773427..f6bc499320f 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -141,6 +141,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp, xmlChar **version, xmlChar **encoding, int *standalone); static bool print_xml_decl(StringInfo buf, const xmlChar *version, pg_enc encoding, int standalone); +static bool xml_doctype_in_content(const xmlChar *str); static xmlDocPtr xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, int encoding); static text *xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt); @@ -1243,8 +1244,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp, if (xmlStrncmp(p, (xmlChar *) "<?xml", 5) != 0) goto finished; - /* if next char is name char, it's a PI like <?xml-stylesheet ...?> */ - utf8len = strlen((const char *) (p + 5)); + /* + * If next char is a name char, it's a PI like <?xml-stylesheet ...?> + * rather than an XMLDecl, so we have done what we came to do and found no + * XMLDecl. + * + * We need an input length value for xmlGetUTF8Char, but there's no need + * to count the whole document size, so use strnlen not strlen. + */ + utf8len = strnlen((const char *) (p + 5), MAX_MULTIBYTE_CHAR_LEN); utf8char = xmlGetUTF8Char(p + 5, &utf8len); if (PG_XMLISNAMECHAR(utf8char)) goto finished; @@ -1415,6 +1423,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version, return false; } +/* + * Test whether an input that is to be parsed as CONTENT contains a DTD. + * + * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not + * satisfied by a document with a DTD, which is a bit of a wart, as it means + * the CONTENT type is not a proper superset of DOCUMENT. SQL/XML:2006 and + * later fix that, by redefining content with reference to the "more + * permissive" Document Node of the XQuery/XPath Data Model, such that any + * DOCUMENT value is indeed also a CONTENT value. That definition is more + * useful, as CONTENT becomes usable for parsing input of unknown form (think + * pg_restore). + * + * As used below in parse_xml when parsing for CONTENT, libxml does not give + * us the 2006+ behavior, but only the 2003; it will choke if the input has + * a DTD. But we can provide the 2006+ definition of CONTENT easily enough, + * by detecting this case first and simply doing the parse as DOCUMENT. + * + * A DTD can be found arbitrarily far in, but that would be a contrived case; + * it will ordinarily start within a few dozen characters. The only things + * that can precede it are an XMLDecl (here, the caller will have called + * parse_xml_decl already), whitespace, comments, and processing instructions. + * This function need only return true if it sees a valid sequence of such + * things leading to <!DOCTYPE. It can simply return false in any other + * cases, including malformed input; that will mean the input gets parsed as + * CONTENT as originally planned, with libxml reporting any errors. + * + * This is only to be called from xml_parse, when pg_xml_init has already + * been called. The input is already in UTF8 encoding. + */ +static bool +xml_doctype_in_content(const xmlChar *str) +{ + const xmlChar *p = str; + + for (;;) + { + const xmlChar *e; + + SKIP_XML_SPACE(p); + if (*p != '<') + return false; + p++; + + if (*p == '!') + { + p++; + + /* if we see <!DOCTYPE, we can return true */ + if (xmlStrncmp(p, (xmlChar *) "DOCTYPE", 7) == 0) + return true; + + /* otherwise, if it's not a comment, fail */ + if (xmlStrncmp(p, (xmlChar *) "--", 2) != 0) + return false; + /* find end of comment: find -- and a > must follow */ + p = xmlStrstr(p + 2, (xmlChar *) "--"); + if (!p || p[2] != '>') + return false; + /* advance over comment, and keep scanning */ + p += 3; + continue; + } + + /* otherwise, if it's not a PI <?target something?>, fail */ + if (*p != '?') + return false; + p++; + + /* find end of PI (the string ?> is forbidden within a PI) */ + e = xmlStrstr(p, (xmlChar *) "?>"); + if (!e) + return false; + + /* we don't check PIs carefully, but do reject "xml" target */ + if (e - p >= 3 && xmlStrncasecmp(p, (xmlChar *) "xml", 3) == 0) + return false; + + /* advance over PI, keep scanning */ + p = e + 2; + } +} + /* * Convert a C string to XML internal representation @@ -1450,6 +1540,12 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, /* Use a TRY block to ensure we clean up correctly */ PG_TRY(); { + bool parse_as_document = false; + int res_code; + size_t count = 0; + xmlChar *version = NULL; + int standalone = 0; + xmlInitParser(); ctxt = xmlNewParserCtxt(); @@ -1457,7 +1553,25 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* Decide whether to parse as document or content */ if (xmloption_arg == XMLOPTION_DOCUMENT) + parse_as_document = true; + else + { + /* Parse and skip over the XML declaration, if any */ + res_code = parse_xml_decl(utf8string, + &count, &version, NULL, &standalone); + if (res_code != 0) + xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT, + "invalid XML content: invalid XML declaration", + res_code); + + /* Is there a DOCTYPE element? */ + if (xml_doctype_in_content(utf8string + count)) + parse_as_document = true; + } + + if (parse_as_document) { /* * Note, that here we try to apply DTD defaults @@ -1472,23 +1586,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, XML_PARSE_NOENT | XML_PARSE_DTDATTR | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS)); if (doc == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, - "invalid XML document"); + { + /* Use original option to decide which error code to throw */ + if (xmloption_arg == XMLOPTION_DOCUMENT) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, + "invalid XML document"); + else + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_CONTENT, + "invalid XML content"); + } } else { - int res_code; - size_t count; - xmlChar *version; - int standalone; - - res_code = parse_xml_decl(utf8string, - &count, &version, NULL, &standalone); - if (res_code != 0) - xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT, - "invalid XML content: invalid XML declaration", - res_code); - doc = xmlNewDoc(version); Assert(doc->encoding == NULL); doc->encoding = xmlStrdup((const xmlChar *) "UTF-8"); |