From 020841071bf401245b9ed012752d85fd7e500431 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter_e@gmx.net>
Date: Thu, 18 Jan 2007 13:59:11 +0000
Subject: [PATCH] Clean up encoding issues in the xml type: In text mode,
 encoding declarations are ignored and removed, in binary mode they are
 honored as specified by the XML standard.

---
 doc/src/sgml/datatype.sgml        | 105 +++++++++++++++-
 src/backend/utils/adt/xml.c       | 251 ++++++++++++++++++++++++++++++--------
 src/test/regress/expected/xml.out |   3 +-
 3 files changed, 300 insertions(+), 59 deletions(-)
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index 269998a3dd..47c2e5c74a 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/datatype.sgml,v 1.184 2007/01/14 22:37:59 neilc Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/datatype.sgml,v 1.185 2007/01/18 13:59:11 petere Exp $ -->
 
  <chapter id="datatype">
   <title id="datatype-title">Data Types</title>
@@ -3418,8 +3418,107 @@ SELECT * FROM pg_attribute
     advantage over storing XML data in a <type>text</type> field is that it
     checks the input values for well-formedness, and there are support
     functions to perform type-safe operations on it; see <xref
-    linkend="functions-xml">.  Currently, there is no support for
-    validation against a specific <acronym>XML</> schema.
+    linkend="functions-xml">.
+   </para>
+
+   <para>
+    In particular, the <type>xml</type> type can store well-formed
+    <quote>documents</quote>, as defined by the XML standard, as well
+    as <quote>content</quote> fragments, which are defined by the
+    production <literal>XMLDecl? content</literal> in the XML
+    standard.  Roughly, this means that content fragments can have
+    more than one top-level element or character node.  The expression
+    <literal><replaceable>xmlvalue</replaceable> IS DOCUMENT</literal>
+    can be used to evaluate whether a particular <type>xml</type>
+    value is a full document or only a content fragment.
+   </para>
+
+   <para>
+    To produce a value of type <type>xml</type> from character data,
+    use the function <function>xmlparse</function>:
+<synopsis>
+XMLPARSE ( { DOCUMENT | CONTENT } <replaceable>value</replaceable>)
+</synopsis>
+    Examples:
+<programlisting><![CDATA[
+XMLPARSE (DOCUMENT '<?xml version="1.0"?><book><title>Manual</title><chapter>...</chapter></book>')
+XMLPARSE (CONTENT 'abc<foo>bar</foo><bar>foo</bar>')
+]]></programlisting>
+    While this is the only way to convert character strings into XML
+    values according to the SQL standard, the PostgreSQL-specific
+    syntaxes
+<programlisting><![CDATA[
+xml '<foo>bar</foo>'
+'<foo>bar</foo>'::xml
+]]></programlisting>
+    can also be used.
+   </para>
+
+   <para>
+    The <type>xml</type> type does not validate its input values
+    against a possibly included document type declaration (DTD).
+   </para>
+
+   <para>
+    The inverse operation, producing character string type values from
+    <type>xml</type>, uses the function
+    <function>xmlserialize</function>:
+<synopsis>
+XMLSERIALIZE ( { DOCUMENT | CONTENT } <replaceable>value</replaceable> AS <replaceable>type</replaceable> )
+</synopsis>
+    <replaceable>type</replaceable> can be one of
+    <type>character</type>, <type>character varying</type>, or
+    <type>text</type> (or an alias name for those).  Again, according
+    to the SQL standard, this is the only way to convert between type
+    <type>xml</type> and character types, but PostgreSQL also allows
+    you to simply cast the value.
+   </para>
+
+   <para>
+    Care must be taken when dealing with multiple character encodings
+    on the client, server, and in the XML data passed through them.
+    When using the text mode to pass queries to the server and query
+    results to the client (which is the normal mode), PostgreSQL
+    converts all character data passed between the client and the
+    server and vice versa to the character encoding of the respective
+    end; see <xref linkend="multibyte">.  This includes string
+    representations of XML values, such as in the above examples.
+    This would ordinarily mean that encoding declarations contained in
+    XML data might become invalid as the character data is converted
+    to other encodings while travelling between client and server,
+    while the embedded encoding declaration is not changed.  To cope
+    with this behavior, an encoding declaration contained in a
+    character string presented for input to the <type>xml</type> type
+    is <emphasis>ignored</emphasis>, and the content is always assumed
+    to be in the current server encoding.  Consequently, for correct
+    processing, such character strings of XML data must be sent off
+    from the client in the current client encoding.  It is the
+    responsibility of the client to either convert the document to the
+    current client encoding before sending it off to the server or to
+    adjust the client encoding appropriately.  On output, values of
+    type <type>xml</type> will not have an encoding declaration, and
+    clients must assume that the data is in the current client
+    encoding.
+   </para>
+
+   <para>
+    When using the binary mode to pass query parameters to the server
+    and query results back to the client, no character set conversion
+    is performed, so the situation is different.  In this case, an
+    encoding declaration in the XML data will be observed, and if it
+    is absent, the data will be assumed to be in UTF-8 (as required by
+    the XML standard; note that PostgreSQL does not support UTF-16 at
+    all).  On output, data will have an encoding declaration
+    specifying the client encoding, unless the client encoding is
+    UTF-8, in which case it will be omitted.
+   </para>
+
+   <para>
+    Needless to say, processing XML data with PostgreSQL will be less
+    error-prone and more efficient if data encoding, client encoding,
+    and server encoding are the same.  Since XML data is internally
+    processed in UTF-8, computations will be most efficient if the
+    server encoding is also UTF-8.
    </para>
 
    <para>
diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 87cb5b0d64..03bbda97dd 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.17 2007/01/14 13:11:54 petere Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.18 2007/01/18 13:59:11 petere Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -68,7 +68,8 @@ static void 	xml_errorHandler(void *ctxt, const char *msg, ...);
 static void 	xml_ereport_by_code(int level, int sqlcode,
 									const char *msg, int errcode);
 static xmlChar *xml_text2xmlChar(text *in);
-static xmlDocPtr xml_parse(text *data, bool is_document, bool preserve_whitespace);
+static int		parse_xml_decl(const xmlChar *str, size_t *lenp, xmlChar **version, xmlChar **encoding, int *standalone);
+static xmlDocPtr xml_parse(text *data, bool is_document, bool preserve_whitespace, xmlChar *encoding);
 
 #endif /* USE_LIBXML */
 
@@ -96,7 +97,7 @@ xml_in(PG_FUNCTION_ARGS)
 	 * Parse the data to check if it is well-formed XML data.  Assume
 	 * that ERROR occurred if parsing failed.
 	 */
-	doc = xml_parse(vardata, false, true);
+	doc = xml_parse(vardata, false, true, NULL);
 	xmlFreeDoc(doc);
 
 	PG_RETURN_XML_P(vardata);
@@ -107,19 +108,102 @@ xml_in(PG_FUNCTION_ARGS)
 }
 
 
+#define PG_XML_DEFAULT_VERSION "1.0"
+
+
+static char *
+xml_out_internal(xmltype *x, pg_enc target_encoding)
+{
+	char		*str;	/* NUL-terminated copy of the stored value */
+	size_t		len;	/* value length; parse_xml_decl then sets it to the offset just past the leading XML declaration */
+#ifdef USE_LIBXML
+	xmlChar		*version;
+	xmlChar		*encoding;	/* parsed but never read here: the output encoding comes from target_encoding */
+	int			standalone;	/* -1 = not specified, 1 = "yes", 0 = "no" */
+	int			res_code;
+#endif
+	/* Returns x as a newly allocated C string whose XML declaration is regenerated for target_encoding (0 or PG_UTF8: no encoding attribute). */
+	len = VARSIZE(x) - VARHDRSZ;
+	str = palloc(len + 1);
+	memcpy(str, VARDATA(x), len);
+	str[len] = '\0';
+
+#ifdef USE_LIBXML
+	/*
+	 * On output, we adjust the XML declaration as follows.  (These
+	 * rules are the moral equivalent of the clause "Serialization of
+	 * an XML value" in the SQL standard.)
+	 *
+	 * We try to avoid generating an XML declaration if possible.
+	 * This is so that you don't get trivial things like xml '<foo/>'
+	 * resulting in '<?xml version="1.0"?><foo/>', which would surely
+	 * be annoying.  We must provide a declaration if the standalone
+	 * property is specified or if we include an encoding
+	 * specification.  If we have a declaration, we must specify a
+	 * version (XML requires this).  Otherwise we only make a
+	 * declaration if the version is not "1.0", which is the default
+	 * version specified in SQL:2003.
+	 */
+	if ((res_code = parse_xml_decl((xmlChar *) str, &len, &version, &encoding, &standalone)) == 0)
+	{
+		StringInfoData buf;
+
+		initStringInfo(&buf);
+
+		if ((version && strcmp((char *) version, PG_XML_DEFAULT_VERSION) != 0)
+			|| (target_encoding && target_encoding != PG_UTF8)
+			|| standalone != -1)
+		{
+			appendStringInfoString(&buf, "<?xml");
+			if (version)
+				appendStringInfo(&buf, " version=\"%s\"", version);
+			else
+				appendStringInfo(&buf, " version=\"%s\"", PG_XML_DEFAULT_VERSION);
+			if (target_encoding && target_encoding != PG_UTF8)
+				/* XXX might be useful to convert this to IANA names
+				 * (ISO-8859-1 instead of LATIN1 etc.); needs field
+				 * experience */
+				appendStringInfo(&buf, " encoding=\"%s\"", pg_encoding_to_char(target_encoding));
+			if (standalone == 1)
+				appendStringInfoString(&buf, " standalone=\"yes\"");
+			else if (standalone == 0)
+				appendStringInfoString(&buf, " standalone=\"no\"");
+			appendStringInfoString(&buf, "?>");
+		}
+		else
+		{
+			/*
+			 * If we are not going to produce an XML declaration, eat
+			 * a single newline in the original string to prevent
+			 * empty first lines in the output.
+			 */
+			if (*(str + len) == '\n')
+				len += 1;
+		}
+		appendStringInfoString(&buf, str + len);	/* remainder of the value, past the original declaration */
+
+		return buf.data;
+	}
+	/* The stored declaration could not be parsed: warn and fall back to the unmodified copy. */
+	xml_ereport_by_code(WARNING, ERRCODE_INTERNAL_ERROR,
+						"could not parse XML declaration in stored value", res_code);
+#endif
+	return str;	/* also the only return path when built without USE_LIBXML */
+}
+
+
 Datum
 xml_out(PG_FUNCTION_ARGS)
 {
-	xmltype		*s = PG_GETARG_XML_P(0);
-	char		*result;
-	int32		len;
-
-	len = VARSIZE(s) - VARHDRSZ;
-	result = palloc(len + 1);
-	memcpy(result, VARDATA(s), len);
-	result[len] = '\0';
+	xmltype	   *x = PG_GETARG_XML_P(0);
 
-	PG_RETURN_CSTRING(result);
+	/*
+	 * xml_out removes the encoding property in all cases.  This is
+	 * because we cannot control from here whether the datum will be
+	 * converted to a different client encoding, so we'd do more harm
+	 * than good by including it.
+	 */
+	PG_RETURN_CSTRING(xml_out_internal(x, 0));
 }
 
 
@@ -130,23 +214,44 @@ xml_recv(PG_FUNCTION_ARGS)
 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
 	xmltype	   *result;
 	char	   *str;
+	char	   *newstr;
 	int			nbytes;
 	xmlDocPtr	doc;
+	xmlChar	   *encoding = NULL;
 
 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 
-	result = (xmltype *) palloc(nbytes + VARHDRSZ);
+	result = palloc(nbytes + VARHDRSZ);
 	VARATT_SIZEP(result) = nbytes + VARHDRSZ;
 	memcpy(VARDATA(result), str, nbytes);
-	pfree(str);
+
+	parse_xml_decl((xmlChar *) str, NULL, NULL, &encoding, NULL);
 
 	/*
 	 * Parse the data to check if it is well-formed XML data.  Assume
 	 * that ERROR occurred if parsing failed.
 	 */
-	doc = xml_parse(result, false, true);
+	doc = xml_parse(result, false, true, encoding);
 	xmlFreeDoc(doc);
 
+	newstr = (char *) pg_do_encoding_conversion((unsigned char *) str,
+												nbytes,
+												encoding ? pg_char_to_encoding((char *) encoding) : PG_UTF8,
+												GetDatabaseEncoding());
+
+	pfree(str);
+
+	if (newstr != str)	/* a different pointer means the data was actually converted */
+	{
+		pfree(result);	/* result was obtained from palloc, so it must be released with pfree, never libc free() */
+
+		nbytes = strlen(newstr);	/* the converted text may differ in length from the input */
+		/* Rebuild the datum from the converted string (payload only, no NUL). */
+		result = palloc(nbytes + VARHDRSZ);
+		VARATT_SIZEP(result) = nbytes + VARHDRSZ;
+		memcpy(VARDATA(result), newstr, nbytes);
+	}
+
 	PG_RETURN_XML_P(result);
 #else
 	NO_XML_SUPPORT();
@@ -159,10 +264,11 @@ Datum
 xml_send(PG_FUNCTION_ARGS)
 {
 	xmltype	   *x = PG_GETARG_XML_P(0);
+	char	   *outval = xml_out_internal(x, pg_get_client_encoding());
 	StringInfoData buf;
 
 	pq_begintypsend(&buf);
-	pq_sendbytes(&buf, VARDATA(x), VARSIZE(x) - VARHDRSZ);
+	pq_sendtext(&buf, outval, strlen(outval));	/* counted send: pq_sendstring would also transmit the terminating NUL as part of the value */
 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 }
 
@@ -191,6 +297,21 @@ stringinfo_to_xmltype(StringInfo buf)
 
 
 static xmltype *
+cstring_to_xmltype(const char *string)
+{
+	size_t		payload = strlen(string);	/* bytes to copy; the NUL is not stored */
+	int32		total = payload + VARHDRSZ;	/* payload plus the varlena header */
+	xmltype	   *value = palloc(total);
+
+	/* Record the total size in the header, then copy the bytes after it. */
+	VARATT_SIZEP(value) = total;
+	memcpy(VARDATA(value), string, payload);
+
+	return value;
+}
+
+
+static xmltype *
 xmlBuffer_to_xmltype(xmlBufferPtr buf)
 {
 	int32		len;
@@ -211,7 +332,7 @@ xmlcomment(PG_FUNCTION_ARGS)
 {
 #ifdef USE_LIBXML
 	text *arg = PG_GETARG_TEXT_P(0);
-	int len =  VARATT_SIZEP(arg) - VARHDRSZ;
+	int len =  VARSIZE(arg) - VARHDRSZ;
 	StringInfoData buf;
 	int i;
 
@@ -310,7 +431,7 @@ xmlparse(text *data, bool is_document, bool preserve_whitespace)
 #ifdef USE_LIBXML
 	xmlDocPtr	doc;
 
-	doc = xml_parse(data, is_document, preserve_whitespace);
+	doc = xml_parse(data, is_document, preserve_whitespace, NULL);
 	xmlFreeDoc(doc);
 
 	return (xmltype *) data;
@@ -383,7 +504,7 @@ xmlroot(xmltype *data, text *version, int standalone)
 	xmlBufferPtr buffer;
 	xmlSaveCtxtPtr save;
 
-	doc = xml_parse((text *) data, true, true);
+	doc = xml_parse((text *) data, true, true, NULL);
 
 	if (version)
 		doc->version = xmlStrdup(xml_text2xmlChar(version));
@@ -404,13 +525,16 @@ xmlroot(xmltype *data, text *version, int standalone)
 	}
 
 	buffer = xmlBufferCreate();
-	save = xmlSaveToBuffer(buffer, NULL, 0);
+	save = xmlSaveToBuffer(buffer, "UTF-8", 0);
 	xmlSaveDoc(save, doc);
 	xmlSaveClose(save);
 
 	xmlFreeDoc(doc);
 
-	result = xmlBuffer_to_xmltype(buffer);
+	result = cstring_to_xmltype((char *) pg_do_encoding_conversion((unsigned char *) xmlBufferContent(buffer),
+																   xmlBufferLength(buffer),
+																   PG_UTF8,
+																   GetDatabaseEncoding()));
 	xmlBufferFree(buffer);
 	return result;
 #else
@@ -525,7 +649,7 @@ xml_is_document(xmltype *arg)
 
 	PG_TRY();
 	{
-		doc = xml_parse((text *) arg, true, true);
+		doc = xml_parse((text *) arg, true, true, NULL);
 		result = true;
 	}
 	PG_CATCH();
@@ -622,13 +746,21 @@ xml_init(void)
 #define SKIP_XML_SPACE(p) while (xmlIsBlank_ch(*(p))) (p)++
 
 static int
-parse_xml_decl(const xmlChar *str, size_t *len, xmlChar **encoding, int *standalone)
+parse_xml_decl(const xmlChar *str, size_t *lenp, xmlChar **version, xmlChar **encoding, int *standalone)
 {
 	const xmlChar *p;
 	const xmlChar *save_p;
+	size_t		len;
 
 	p = str;
 
+	if (version)
+		*version = NULL;
+	if (encoding)
+		*encoding = NULL;
+	if (standalone)
+		*standalone = -1;
+
 	if (xmlStrncmp(p, (xmlChar *)"<?xml", 5) != 0)
 		goto finished;
 
@@ -645,9 +777,21 @@ parse_xml_decl(const xmlChar *str, size_t *len, xmlChar **encoding, int *standal
 		return XML_ERR_VERSION_MISSING;
 	p += 1;
 	SKIP_XML_SPACE(p);
-	if (xmlStrncmp(p, (xmlChar *)"'1.0'", 5) != 0 && xmlStrncmp(p, (xmlChar *)"\"1.0\"", 5) != 0)
+
+	if (*p == '\'' || *p == '"')
+	{
+		const xmlChar *q;
+
+		q = xmlStrchr(p + 1, *p);
+		if (!q)
+			return XML_ERR_VERSION_MISSING;
+
+		if (version)
+			*version = xmlStrndup(p + 1, q - p - 1);
+		p = q + 1;
+	}
+	else
 		return XML_ERR_VERSION_MISSING;
-	p += 5;
 
 	/* encoding */
 	save_p = p;
@@ -670,6 +814,7 @@ parse_xml_decl(const xmlChar *str, size_t *len, xmlChar **encoding, int *standal
 			if (!q)
 				return XML_ERR_MISSING_ENCODING;
 
+			if (encoding)
 			*encoding = xmlStrndup(p + 1, q - p - 1);
 			p = q + 1;
 		}
@@ -679,7 +824,6 @@ parse_xml_decl(const xmlChar *str, size_t *len, xmlChar **encoding, int *standal
 	else
 	{
 		p = save_p;
-		*encoding = NULL;
 	}
 
 	/* standalone */
@@ -710,7 +854,6 @@ parse_xml_decl(const xmlChar *str, size_t *len, xmlChar **encoding, int *standal
 	else
 	{
 		p = save_p;
-		*standalone = -1;
 	}
 
 	SKIP_XML_SPACE(p);
@@ -719,8 +862,15 @@ parse_xml_decl(const xmlChar *str, size_t *len, xmlChar **encoding, int *standal
 	p += 2;
 
 finished:
-	if (len)
-		*len = (p - str);
+	len = p - str;
+
+	for (p = str; p < str + len; p++)
+		if (*p > 127)
+			return XML_ERR_INVALID_CHAR;
+
+	if (lenp)
+		*lenp = len;
+
 	return XML_ERR_OK;
 }
 
@@ -732,17 +882,24 @@ finished:
  * TODO what about internal URI for docs? (see PG_XML_DEFAULT_URI below)
  */
 static xmlDocPtr
-xml_parse(text *data, bool is_document, bool preserve_whitespace)
+xml_parse(text *data, bool is_document, bool preserve_whitespace, xmlChar *encoding)
 {
-	int					res_code;
 	int32				len;
 	xmlChar				*string;
+	xmlChar				*utf8string;
 	xmlParserCtxtPtr 	ctxt = NULL;
 	xmlDocPtr 			doc = NULL;
 
 	len = VARSIZE(data) - VARHDRSZ; /* will be useful later */
 	string = xml_text2xmlChar(data);
 
+	utf8string = pg_do_encoding_conversion(string,
+										   len,
+										   encoding
+										   ? pg_char_to_encoding((char *) encoding)
+										   : GetDatabaseEncoding(),
+										   PG_UTF8);
+
 	xml_init();
 
 	/* We use a PG_TRY block to ensure libxml is cleaned up on error */
@@ -762,8 +919,9 @@ xml_parse(text *data, bool is_document, bool preserve_whitespace)
 			 * As for external DTDs, we try to support them too, (see
 			 * SQL/XML:10.16.7.e)
 			 */
-			doc = xmlCtxtReadMemory(ctxt, (char *) string, len,
-									PG_XML_DEFAULT_URI, NULL,
+			doc = xmlCtxtReadDoc(ctxt, utf8string,
+								 PG_XML_DEFAULT_URI,
+								 "UTF-8",
 									XML_PARSE_NOENT | XML_PARSE_DTDATTR
 									| (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS));
 			if (doc == NULL)
@@ -772,41 +930,26 @@ xml_parse(text *data, bool is_document, bool preserve_whitespace)
 		}
 		else
 		{
+			int			res_code;
 			size_t count;
-			xmlChar *encoding = NULL;
+			xmlChar	   *version = NULL;
 			int standalone = -1;
 
 			doc = xmlNewDoc(NULL);
 
-			res_code = parse_xml_decl(string, &count, &encoding, &standalone);
+			res_code = parse_xml_decl(utf8string, &count, &version, NULL, &standalone);
 
-			/* TODO resolve: xmlParseBalancedChunkMemory assumes that string is UTF8 encoded! */
 			if (res_code == 0)
-				res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string + count, NULL);
+				res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, utf8string + count, NULL);
 			if (res_code != 0)
 				xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
 									"invalid XML content", res_code);
 
-			doc->encoding = encoding;
+			doc->version = xmlStrdup(version);
+			doc->encoding = xmlStrdup((xmlChar *) "UTF-8");
 			doc->standalone = standalone;
 		}
 
-		/* TODO encoding issues
-		 * (thoughts:
-		 * 		CASE:
-		 *   		- XML data has explicit encoding attribute in its prolog
-		 *   		- if not, assume that enc. of XML data is the same as client's one
-		 *
-		 * 		The common rule is to accept the XML data only if its encoding
-		 * 		is the same as encoding of the storage (server's). The other possible
-		 * 		option is to accept all the docs, but DO TRANSFORMATION and, if needed,
-		 * 		change the prolog.
-		 *
-		 * 		I think I'd stick the first way (for the 1st version),
-		 * 		it's much simplier (less errors...)
-		 * ) */
-		/* ... */
-
 		if (ctxt)
 			xmlFreeParserCtxt(ctxt);
 		xmlCleanupParser();
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index c33fd8e414..db03e43f2a 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -177,8 +177,7 @@ SELECT xmlpi(name foo, '   bar');
 
 SELECT xmlroot(xml '<foo/>', version no value, standalone no value);
         xmlroot        
------------------------
- <?xml version="1.0"?>
+---------
  <foo/>
  
 (1 row)
-- 
2.11.0