aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/xml.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/xml.c')
-rw-r--r-- src/backend/utils/adt/xml.c 141
1 files changed, 125 insertions, 16 deletions
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 1116b773427..f6bc499320f 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -141,6 +141,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp,
xmlChar **version, xmlChar **encoding, int *standalone);
static bool print_xml_decl(StringInfo buf, const xmlChar *version,
pg_enc encoding, int standalone);
+static bool xml_doctype_in_content(const xmlChar *str);
static xmlDocPtr xml_parse(text *data, XmlOptionType xmloption_arg,
bool preserve_whitespace, int encoding);
static text *xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt);
@@ -1243,8 +1244,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp,
if (xmlStrncmp(p, (xmlChar *) "<?xml", 5) != 0)
goto finished;
- /* if next char is name char, it's a PI like <?xml-stylesheet ...?> */
- utf8len = strlen((const char *) (p + 5));
+ /*
+ * If next char is a name char, it's a PI like <?xml-stylesheet ...?>
+ * rather than an XMLDecl, so we have done what we came to do and found no
+ * XMLDecl.
+ *
+ * We need an input length value for xmlGetUTF8Char, but there's no need
+ * to count the whole document size, so use strnlen not strlen.
+ */
+ utf8len = strnlen((const char *) (p + 5), MAX_MULTIBYTE_CHAR_LEN);
utf8char = xmlGetUTF8Char(p + 5, &utf8len);
if (PG_XMLISNAMECHAR(utf8char))
goto finished;
@@ -1415,6 +1423,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version,
return false;
}
+/*
+ * Test whether an input that is to be parsed as CONTENT contains a DTD.
+ *
+ * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not
+ * satisfied by a document with a DTD, which is a bit of a wart, as it means
+ * the CONTENT type is not a proper superset of DOCUMENT.  SQL/XML:2006 and
+ * later fix that, by redefining content with reference to the "more
+ * permissive" Document Node of the XQuery/XPath Data Model, such that any
+ * DOCUMENT value is indeed also a CONTENT value.  That definition is more
+ * useful, as CONTENT becomes usable for parsing input of unknown form (think
+ * pg_restore).
+ *
+ * As used below in xml_parse when parsing for CONTENT, libxml does not give
+ * us the 2006+ behavior, but only the 2003; it will choke if the input has
+ * a DTD.  But we can provide the 2006+ definition of CONTENT easily enough,
+ * by detecting this case first and simply doing the parse as DOCUMENT.
+ *
+ * A DTD can be found arbitrarily far in, but that would be a contrived case;
+ * it will ordinarily start within a few dozen characters.  The only things
+ * that can precede it are an XMLDecl (here, the caller will have called
+ * parse_xml_decl already), whitespace, comments, and processing instructions.
+ * This function need only return true if it sees a valid sequence of such
+ * things leading to <!DOCTYPE.  It can simply return false in any other
+ * cases, including malformed input; that will mean the input gets parsed as
+ * CONTENT as originally planned, with libxml reporting any errors.
+ *
+ * A PI whose target is exactly "xml" (in any letter case) is not a legal PI,
+ * so seeing one ends the scan with a false result.  Targets that merely begin
+ * with those three letters, such as xml-stylesheet, are ordinary PIs and are
+ * skipped over like any other.
+ *
+ * str: NUL-terminated input, positioned just past any XMLDecl.
+ *
+ * Returns true if <!DOCTYPE is reached after only whitespace, comments and
+ * PIs; false otherwise.
+ *
+ * This is only to be called from xml_parse, when pg_xml_init has already
+ * been called.  The input is already in UTF8 encoding.
+ */
+static bool
+xml_doctype_in_content(const xmlChar *str)
+{
+	const xmlChar *p = str;
+
+	for (;;)
+	{
+		const xmlChar *e;
+
+		SKIP_XML_SPACE(p);
+		if (*p != '<')
+			return false;
+		p++;
+
+		if (*p == '!')
+		{
+			p++;
+
+			/* if we see <!DOCTYPE, we can return true */
+			if (xmlStrncmp(p, (xmlChar *) "DOCTYPE", 7) == 0)
+				return true;
+
+			/* otherwise, if it's not a comment, fail */
+			if (xmlStrncmp(p, (xmlChar *) "--", 2) != 0)
+				return false;
+			/* find end of comment: find -- and a > must follow */
+			p = xmlStrstr(p + 2, (xmlChar *) "--");
+			if (!p || p[2] != '>')
+				return false;
+			/* advance over comment, and keep scanning */
+			p += 3;
+			continue;
+		}
+
+		/* otherwise, if it's not a PI <?target something?>, fail */
+		if (*p != '?')
+			return false;
+		p++;
+
+		/* find end of PI (the string ?> is forbidden within a PI) */
+		e = xmlStrstr(p, (xmlChar *) "?>");
+		if (!e)
+			return false;
+
+		/*
+		 * We don't check PIs carefully, but do reject the reserved target
+		 * "xml".  Only that exact name is forbidden; a longer name that
+		 * merely starts with it (e.g. xml-stylesheet) is a legitimate PI.
+		 * So look at the character following "xml": if it cannot continue a
+		 * name, the target is exactly "xml" and we fail.  At least the "?>"
+		 * terminator follows p + 3, so there is always a byte to examine.
+		 */
+		if (e - p >= 3 && xmlStrncasecmp(p, (xmlChar *) "xml", 3) == 0)
+		{
+			int			utf8char;
+			int			utf8len;
+
+			/* xmlGetUTF8Char needs a length, but one character's worth is enough */
+			utf8len = strnlen((const char *) (p + 3), MAX_MULTIBYTE_CHAR_LEN);
+			utf8char = xmlGetUTF8Char(p + 3, &utf8len);
+			if (!PG_XMLISNAMECHAR(utf8char))
+				return false;
+		}
+
+		/* advance over PI, keep scanning */
+		p = e + 2;
+	}
+}
+
/*
* Convert a C string to XML internal representation
@@ -1450,6 +1540,12 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
/* Use a TRY block to ensure we clean up correctly */
PG_TRY();
{
+ bool parse_as_document = false;
+ int res_code;
+ size_t count = 0;
+ xmlChar *version = NULL;
+ int standalone = 0;
+
xmlInitParser();
ctxt = xmlNewParserCtxt();
@@ -1457,7 +1553,25 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context");
+ /* Decide whether to parse as document or content */
if (xmloption_arg == XMLOPTION_DOCUMENT)
+ parse_as_document = true;
+ else
+ {
+ /* Parse and skip over the XML declaration, if any */
+ res_code = parse_xml_decl(utf8string,
+ &count, &version, NULL, &standalone);
+ if (res_code != 0)
+ xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
+ "invalid XML content: invalid XML declaration",
+ res_code);
+
+ /* Is there a DOCTYPE element? */
+ if (xml_doctype_in_content(utf8string + count))
+ parse_as_document = true;
+ }
+
+ if (parse_as_document)
{
/*
* Note, that here we try to apply DTD defaults
@@ -1472,23 +1586,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
XML_PARSE_NOENT | XML_PARSE_DTDATTR
| (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS));
if (doc == NULL || xmlerrcxt->err_occurred)
- xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
- "invalid XML document");
+ {
+ /* Use original option to decide which error code to throw */
+ if (xmloption_arg == XMLOPTION_DOCUMENT)
+ xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
+ "invalid XML document");
+ else
+ xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_CONTENT,
+ "invalid XML content");
+ }
}
else
{
- int res_code;
- size_t count;
- xmlChar *version;
- int standalone;
-
- res_code = parse_xml_decl(utf8string,
- &count, &version, NULL, &standalone);
- if (res_code != 0)
- xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
- "invalid XML content: invalid XML declaration",
- res_code);
-
doc = xmlNewDoc(version);
Assert(doc->encoding == NULL);
doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");