Add support for parsing of large XML data (>= 10MB)

This commit adds XML_PARSE_HUGE to the libxml2 functions used in core for the parsing of XML objects, raising the original limit of 10MB supported by libxml2. In most upstream code paths, XML_MAX_TEXT_LENGTH (10^7) is the historical limit, which gets upgraded to XML_MAX_HUGE_LENGTH (10^9) once XML_PARSE_HUGE is given to the parser calls. These are still limited by any palloc() calls for text, up to 1GB.

This offers the possibility to handle XML objects larger than 10MB within the backend in general, and also allows a higher depth limit.

This change affects the contrib module xml2, the xml data type and SQL/XML.

Author: Dmitry Koval
Reviewed-by: Tom Lane, Michael Paquier
Discussion: https://postgr.es/m/18274-98d16bc03520665f@postgresql.org
author: Michael Paquier <michael@paquier.xyz> 2024-01-17 14:03:55 +0900
committer: Michael Paquier <michael@paquier.xyz> 2024-01-17 14:03:55 +0900
commit: 2197d06224a14dba544e05e3df1efcca83bdd9ef (patch)
tree: 90ec4d973f0f11d068ce824efddcb054e343ec11
parent: 65c5864d7fac46516f17ee89085e349a87ee5bd7 (diff)
download: postgresql-2197d06224a14dba544e05e3df1efcca83bdd9ef.tar.gz
postgresql-2197d06224a14dba544e05e3df1efcca83bdd9ef.zip
3 files changed, 33 insertions, 13 deletions
diff --git a/contrib/xml2/xpath.c b/contrib/xml2/xpath.c
index a967257546a..a2cec95f3fa 100644
--- a/contrib/xml2/xpath.c
+++ b/contrib/xml2/xpath.c
@@ -381,7 +381,7 @@ pgxml_xpath(text *document, xmlChar *xpath, xpath_workspace *workspace)
 	{
 		workspace->doctree = xmlReadMemory((char *) VARDATA_ANY(document),
 										   docsize, NULL, NULL,
-										   XML_PARSE_NOENT);
+										   XML_PARSE_HUGE | XML_PARSE_NOENT);
 		if (workspace->doctree != NULL)
 		{
 			workspace->ctxt = xmlXPathNewContext(workspace->doctree);
@@ -626,7 +626,7 @@ xpath_table(PG_FUNCTION_ARGS)
 			if (xmldoc)
 				doctree = xmlReadMemory(xmldoc, strlen(xmldoc),
 										NULL, NULL,
-										XML_PARSE_NOENT);
+										XML_PARSE_HUGE | XML_PARSE_NOENT);
 			else				/* treat NULL as not well-formed */
 				doctree = NULL;
 
diff --git a/contrib/xml2/xslt_proc.c b/contrib/xml2/xslt_proc.c
index f30a3a42c03..9cbc05db1ab 100644
--- a/contrib/xml2/xslt_proc.c
+++ b/contrib/xml2/xslt_proc.c
@@ -87,7 +87,7 @@ xslt_process(PG_FUNCTION_ARGS)
 		/* Parse document */
 		doctree = xmlReadMemory((char *) VARDATA_ANY(doct),
 								VARSIZE_ANY_EXHDR(doct), NULL, NULL,
-								XML_PARSE_NOENT);
+								XML_PARSE_HUGE | XML_PARSE_NOENT);
 
 		if (doctree == NULL)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION,
@@ -96,7 +96,7 @@ xslt_process(PG_FUNCTION_ARGS)
 		/* Same for stylesheet */
 		ssdoc = xmlReadMemory((char *) VARDATA_ANY(ssheet),
 							  VARSIZE_ANY_EXHDR(ssheet), NULL, NULL,
-							  XML_PARSE_NOENT);
+							  XML_PARSE_HUGE | XML_PARSE_NOENT);
 
 		if (ssdoc == NULL)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION,
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index f869c680afd..d3db75eb877 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -1688,8 +1688,8 @@ xml_doctype_in_content(const xmlChar *str)
  * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode).
  *
  * If parsed_nodes isn't NULL and the input is not an XML document, the list
- * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned
- * to *parsed_nodes.
+ * of parsed nodes from the xmlParseInNodeContext call will be returned to
+ * *parsed_nodes.
  *
  * Errors normally result in ereport(ERROR), but if escontext is an
  * ErrorSaveContext, then "safe" errors are reported there instead, and the
@@ -1795,7 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
 			doc = xmlCtxtReadDoc(ctxt, utf8string,
 								 NULL,
 								 "UTF-8",
-								 XML_PARSE_NOENT | XML_PARSE_DTDATTR
+								 XML_PARSE_NOENT | XML_PARSE_DTDATTR | XML_PARSE_HUGE
 								 | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS));
 			if (doc == NULL || xmlerrcxt->err_occurred)
 			{
@@ -1828,10 +1828,30 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
 			/* allow empty content */
 			if (*(utf8string + count))
 			{
-				res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0,
-													   utf8string + count,
-													   parsed_nodes);
-				if (res_code != 0 || xmlerrcxt->err_occurred)
+				const char *data;
+				xmlNodePtr	root;
+				xmlNodePtr	lst;
+				xmlParserErrors xml_error;
+
+				data = (const char *) (utf8string + count);
+
+				/*
+				 * Create a fake root node.  The xmlNewDoc() function creates
+				 * an XML document without any nodes, but a node is required
+				 * as context for xmlParseInNodeContext(), which is able to
+				 * handle XML_PARSE_HUGE.
+				 */
+				root = xmlNewNode(NULL, (const xmlChar *) "content-root");
+				if (root == NULL || xmlerrcxt->err_occurred)
+					xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
+								"could not allocate xml node");
+				xmlDocSetRootElement(doc, root);
+
+				/* Try to parse the string using the root node as context. */
+				xml_error = xmlParseInNodeContext(root, data, strlen(data),
+												  XML_PARSE_HUGE,
+												  parsed_nodes ? parsed_nodes : &lst);
+				if (xml_error != XML_ERR_OK || xmlerrcxt->err_occurred)
 				{
 					xml_errsave(escontext, xmlerrcxt,
 								ERRCODE_INVALID_XML_CONTENT,
@@ -4344,7 +4364,7 @@ xpath_internal(text *xpath_expr_text, xmltype *data, ArrayType *namespaces,
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
 						"could not allocate parser context");
 		doc = xmlCtxtReadMemory(ctxt, (char *) string + xmldecl_len,
-								len - xmldecl_len, NULL, NULL, 0);
+								len - xmldecl_len, NULL, NULL, XML_PARSE_HUGE);
 		if (doc == NULL || xmlerrcxt->err_occurred)
 			xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
 						"could not parse XML document");
@@ -4675,7 +4695,7 @@ XmlTableSetDocument(TableFuncScanState *state, Datum value)
 
 	PG_TRY();
 	{
-		doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, 0);
+		doc = xmlCtxtReadMemory(xtCxt->ctxt, (char *) xstr, length, NULL, NULL, XML_PARSE_HUGE);
 		if (doc == NULL || xtCxt->xmlerrcxt->err_occurred)
 			xml_ereport(xtCxt->xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT,
 						"could not parse XML document");
author	Michael Paquier <michael@paquier.xyz>	2024-01-17 14:03:55 +0900
committer	Michael Paquier <michael@paquier.xyz>	2024-01-17 14:03:55 +0900
commit	2197d06224a14dba544e05e3df1efcca83bdd9ef (patch)
tree	90ec4d973f0f11d068ce824efddcb054e343ec11
parent	65c5864d7fac46516f17ee89085e349a87ee5bd7 (diff)
download	postgresql-2197d06224a14dba544e05e3df1efcca83bdd9ef.tar.gz postgresql-2197d06224a14dba544e05e3df1efcca83bdd9ef.zip