1 /*-------------------------------------------------------------------------
4 * XML data type support.
7 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
10 * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.3 2006/12/24 00:29:19 tgl Exp $
12 *-------------------------------------------------------------------------
16 * Generally, XML type support is only available when libxml use was
17 * configured during the build. But even if that is not done, the
18 * type and all the functions are available, but most of them will
19 * fail. For one thing, this avoids having to manage variant catalog
20 * installations. But it also has nice effects such as that you can
21 * dump a database containing XML type data even if the server is not linked with libxml.
28 #include <libxml/chvalid.h>
29 #include <libxml/parser.h>
30 #include <libxml/tree.h>
31 #include <libxml/uri.h>
32 #include <libxml/xmlerror.h>
33 #endif /* USE_LIBXML */
36 #include "mb/pg_wchar.h"
37 #include "nodes/execnodes.h"
38 #include "utils/builtins.h"
39 #include "utils/xml.h"
44 #define PG_XML_DEFAULT_URI "dummy.xml"
45 #define XML_ERRBUF_SIZE 200
48 static void xml_init(void);
49 static void *xml_palloc(size_t size);
50 static void *xml_repalloc(void *ptr, size_t size);
51 static void xml_pfree(void *ptr);
52 static char *xml_pstrdup(const char *string);
53 static void xml_ereport(int level, char *msg, void *ctxt);
54 static void xml_errorHandler(void *ctxt, const char *msg, ...);
55 static void xml_ereport_by_code(int level, char *msg, int errcode);
56 static xmlChar *xml_text2xmlChar(text *in);
57 static xmlDocPtr xml_parse(text *data, int opts, bool is_document);
60 /* Global variables */
61 /* taken from contrib/xml2 */
62 /* FIXME: DO NOT USE global vars !!! */
63 char *xml_errbuf; /* per line error buffer */
64 char *xml_errmsg = NULL; /* overall error message */
66 #endif /* USE_LIBXML */
69 #define NO_XML_SUPPORT() ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("no XML support in this installation")))
73 xml_in(PG_FUNCTION_ARGS)
76 char *s = PG_GETARG_CSTRING(0);
81 vardata = palloc(len + VARHDRSZ);
82 VARATT_SIZEP(vardata) = len + VARHDRSZ;
83 memcpy(VARDATA(vardata), s, len);
86 * Parse the data to check if it is well-formed XML data. Assume
87 * that ERROR occurred if parsing failed. Do we need DTD
88 * validation (if DTD exists)?
90 xml_parse(vardata, XML_PARSE_DTDATTR | XML_PARSE_DTDVALID, false);
92 PG_RETURN_XML_P(vardata);
101 xml_out(PG_FUNCTION_ARGS)
103 xmltype *s = PG_GETARG_XML_P(0);
107 len = VARSIZE(s) - VARHDRSZ;
108 result = palloc(len + 1);
109 memcpy(result, VARDATA(s), len);
112 PG_RETURN_CSTRING(result);
118 appendStringInfoText(StringInfo str, const text *t)
120 appendBinaryStringInfo(str, VARDATA(t), VARSIZE(t) - VARHDRSZ);
125 stringinfo_to_xmltype(StringInfo buf)
130 len = buf->len + VARHDRSZ;
131 result = palloc(len);
132 VARATT_SIZEP(result) = len;
133 memcpy(VARDATA(result), buf->data, buf->len);
141 xmlcomment(PG_FUNCTION_ARGS)
144 text *arg = PG_GETARG_TEXT_P(0);
145 int len = VARATT_SIZEP(arg) - VARHDRSZ;
149 /* check for "--" in string or "-" at the end */
150 for (i = 1; i < len; i++)
151 if ((VARDATA(arg)[i] == '-' && VARDATA(arg)[i - 1] == '-')
152 || (VARDATA(arg)[i] == '-' && i == len - 1))
154 (errcode(ERRCODE_INVALID_XML_COMMENT),
155 errmsg("invalid XML comment")));
157 initStringInfo(&buf);
158 appendStringInfo(&buf, "<!--");
159 appendStringInfoText(&buf, arg);
160 appendStringInfo(&buf, "-->");
162 PG_RETURN_XML_P(stringinfo_to_xmltype(&buf));
171 texttoxml(PG_FUNCTION_ARGS)
173 text *data = PG_GETARG_TEXT_P(0);
175 PG_RETURN_XML_P(xmlparse(data, false, true));
180 xmlparse(text *data, bool is_document, bool preserve_whitespace)
183 if (!preserve_whitespace)
185 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
186 errmsg("XMLPARSE with STRIP WHITESPACE is not implemented")));
189 * Note, that here we try to apply DTD defaults
190 * (XML_PARSE_DTDATTR) according to SQL/XML:10.16.7.d: 'Default
191 * valies defined by internal DTD are applied'. As for external
192 * DTDs, we try to support them too, (see SQL/XML:10.16.7.e)
194 xml_parse(data, XML_PARSE_DTDATTR, is_document);
196 return (xmltype *) data;
205 xmlpi(char *target, text *arg)
211 if (pg_strncasecmp(target, "xml", 3) == 0)
213 (errcode(ERRCODE_INVALID_XML_PROCESSING_INSTRUCTION),
214 errmsg("invalid XML processing instruction"),
215 errdetail("XML processing instruction target name cannot start with \"xml\".")));
217 initStringInfo(&buf);
219 appendStringInfo(&buf, "<?%s", target);
225 string = DatumGetCString(DirectFunctionCall1(textout,
226 PointerGetDatum(arg)));
227 if (strstr(string, "?>") != NULL)
229 (errcode(ERRCODE_INVALID_XML_PROCESSING_INSTRUCTION),
230 errmsg("invalid XML processing instruction"),
231 errdetail("XML processing instruction cannot contain \"?>\".")));
233 appendStringInfoChar(&buf, ' ');
234 appendStringInfoString(&buf, string);
237 appendStringInfoString(&buf, "?>");
239 result = stringinfo_to_xmltype(&buf);
250 xmlroot(xmltype *data, text *version, int standalone)
256 initStringInfo(&buf);
259 * FIXME: This is probably supposed to be cleverer if there
260 * already is an XML preamble.
262 appendStringInfo(&buf,"<?xml");
265 appendStringInfo(&buf, " version=\"");
266 appendStringInfoText(&buf, version);
267 appendStringInfo(&buf, "\"");
270 appendStringInfo(&buf, " standalone=\"%s\"",
271 (standalone == 1 ? "yes" : "no"));
272 appendStringInfo(&buf, "?>");
273 appendStringInfoText(&buf, (text *) data);
275 result = stringinfo_to_xmltype(&buf);
286 * Validate document (given as string) against DTD (given as external link)
287 * TODO !!! use text instead of cstring for second arg
288 * TODO allow passing DTD as a string value (not only as an URI)
289 * TODO redesign (see comment with '!!!' below)
292 xmlvalidate(PG_FUNCTION_ARGS)
295 text *data = PG_GETARG_TEXT_P(0);
296 text *dtdOrUri = PG_GETARG_TEXT_P(1);
298 xmlParserCtxtPtr ctxt; /* the parser context */
299 xmlDocPtr doc; /* the resulting document tree */
304 ctxt = xmlNewParserCtxt();
306 xml_ereport(ERROR, "could not allocate parser context", ctxt);
307 doc = xmlCtxtReadMemory(ctxt, (char *) VARDATA(data),
308 VARSIZE(data) - VARHDRSZ, PG_XML_DEFAULT_URI, NULL, 0);
310 xml_ereport(ERROR, "could not parse XML data", ctxt);
313 uri = xmlCreateURI();
314 ereport(NOTICE, (errcode(0),errmsg(" dtd - %s", dtdOrUri)));
315 dtd = palloc(sizeof(xmlDtdPtr));
316 uri = xmlParseURI(dtdOrUri);
318 xml_ereport(ERROR, "not implemented yet... (TODO)", ctxt);
321 dtd = xmlParseDTD(NULL, xml_text2xmlChar(dtdOrUri));
327 xmlFreeParserCtxt(ctxt);
329 xml_ereport(ERROR, "could not load DTD", ctxt);
332 if (xmlValidateDtd(xmlNewValidCtxt(), doc, dtd) == 1)
339 xmlFreeParserCtxt(ctxt);
344 xml_ereport(NOTICE, "validation against DTD failed", ctxt);
346 PG_RETURN_BOOL(result);
347 #else /* not USE_LIBXML */
350 #endif /* not USE_LIBXML */
357 * Container for some init stuff (not good design!)
358 * TODO xmlChar is utf8-char, make proper tuning (initdb with enc!=utf8 and check)
364 * Currently, we have no pure UTF-8 support for internals -- check
367 if (sizeof (char) != sizeof (xmlChar))
369 (errmsg("could not initialize XML library"),
370 errdetail("libxml2 has incompatible char type: sizeof(char)=%u, sizeof(xmlChar)=%u.",
371 (int) sizeof(char), (int) sizeof(xmlChar))));
373 xmlMemSetup(xml_pfree, xml_palloc, xml_repalloc, xml_pstrdup);
376 /* do not flood PG's logfile with libxml error messages - reset error handler*/
377 xmlSetGenericErrorFunc(NULL, xml_errorHandler);
379 xml_errbuf = palloc(XML_ERRBUF_SIZE);
380 memset(xml_errbuf, 0, XML_ERRBUF_SIZE);
385 * Convert a C string to XML internal representation
386 * (same things as for TEXT, but with checking the data for well-formedness
387 * and, moreover, validation against DTD, if needed).
388 * NOTICE: We use TEXT type as internal storage type. In the future,
389 * we plan to create own storage type (maybe several types/strategies)
390 * TODO predefined DTDs / XSDs and validation
391 * TODO validation against XML Schema
392 * TODO maybe, libxml2's xmlreader is better? (do not construct DOM, yet do not use SAX - see xml_reader.c)
393 * TODO what about internal URI for docs? (see PG_XML_DEFAULT_URI below)
396 xml_parse(text *data, int opts, bool is_document)
398 bool validationFailed = FALSE;
399 xmlParserCtxtPtr ctxt; /* the parser context */
400 xmlDocPtr doc; /* the resulting document tree */
404 #ifdef XML_DEBUG_DTD_CONST
405 xmlDtdPtr dtd; /* pointer to DTD */
410 len = VARSIZE(data) - VARHDRSZ; /* will be useful later */
411 string = xml_text2xmlChar(data);
413 ctxt = xmlNewParserCtxt();
415 xml_ereport(ERROR, "could not allocate parser context", ctxt);
417 /* first, we try to parse the string as XML doc, then, as XML chunk */
418 ereport(DEBUG3, (errmsg("string to parse: %s", string)));
419 if (len >= 5 && strncmp((char *) string, "<?xml", 5) == 0)
421 /* consider it as DOCUMENT */
422 doc = xmlCtxtReadMemory(ctxt, (char *) string, len,
423 PG_XML_DEFAULT_URI, NULL, opts);
426 xml_ereport(ERROR, "could not parse XML data", ctxt);
428 xmlFreeParserCtxt(ctxt);
430 ereport(ERROR, (errmsg("could not parse XML data")));
436 /* attempt to parse the string as if it is an XML fragment */
437 ereport(DEBUG3, (errmsg("the string is not an XML doc, trying to parse as a CHUNK")));
438 doc = xmlNewDoc(NULL);
439 /* TODO resolve: xmlParseBalancedChunkMemory assumes that string is UTF8 encoded! */
440 res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string, NULL);
443 xmlFreeParserCtxt(ctxt);
445 xml_ereport_by_code(ERROR, "could not parse XML data", res_code);
449 #ifdef XML_DEBUG_DTD_CONST
450 dtd = xmlParseDTD(NULL, (xmlChar *) XML_DEBUG_DTD_CONST);
451 xml_ereport(DEBUG3, "solid path to DTD was defined for debugging purposes", ctxt);
454 xml_ereport(ERROR, "could not parse DTD data", ctxt);
458 /* if dtd for our xml data is detected... */
459 if ((doc->intSubset != NULL) || (doc->extSubset != NULL))
462 /* assume that inline DTD exists - validation should be performed */
463 #ifdef XML_DEBUG_DTD_CONST
464 if (xmlValidateDtd(xmlNewValidCtxt(), doc, dtd) != 1)
466 if (ctxt->valid == 0)
469 /* DTD exists, but validator reported 'validation failed' */
470 validationFailed = TRUE;
474 if (validationFailed)
475 xml_ereport(WARNING, "validation against DTD failed", ctxt);
477 /* TODO encoding issues
480 * - XML data has explicit encoding attribute in its prolog
481 * - if not, assume that enc. of XML data is the same as client's one
483 * The common rule is to accept the XML data only if its encoding
484 * is the same as encoding of the storage (server's). The other possible
485 * option is to accept all the docs, but DO TRANSFORMATION and, if needed,
488 * I think I'd stick the first way (for the 1st version),
489 * it's much simplier (less errors...)
493 xmlFreeParserCtxt(ctxt);
496 ereport(DEBUG3, (errmsg("XML data successfully parsed, encoding: %s",
497 (char *) doc->encoding)));
504 * xmlChar<->text convertions
507 xml_text2xmlChar(text *in)
509 int32 len = VARSIZE(in) - VARHDRSZ;
512 res = palloc(len + 1);
513 memcpy(res, VARDATA(in), len);
/*
 * Wrappers for memory management functions
 *
 * These are the callbacks registered with libxml2 (see xmlMemSetup), so
 * they have to tolerate the arguments malloc/realloc/free tolerate.
 */

/* malloc replacement */
static void *
xml_palloc(size_t size)
{
	return palloc(size);
}


/*
 * realloc replacement.  realloc(NULL, n) behaves like malloc(n), whereas
 * repalloc() does not accept a NULL pointer, so map that case to palloc.
 */
static void *
xml_repalloc(void *ptr, size_t size)
{
	if (ptr == NULL)
		return palloc(size);
	return repalloc(ptr, size);
}


/*
 * free replacement.  free(NULL) is a no-op, whereas pfree() does not
 * accept a NULL pointer.
 */
static void
xml_pfree(void *ptr)
{
	if (ptr != NULL)
		pfree(ptr);
}


/* strdup replacement */
static char *
xml_pstrdup(const char *string)
{
	return pstrdup(string);
}
552 * Wrapper for "ereport" function.
553 * Adds detail - libxml's native error message, if any.
556 xml_ereport(int level, char *msg, void *ctxt)
560 xmlErrorPtr libxmlErr = NULL;
562 if (xml_errmsg != NULL)
564 ereport(DEBUG1, (errmsg("%s", xml_errmsg)));
569 libxmlErr = xmlCtxtGetLastError(ctxt);
571 if (libxmlErr == NULL)
575 xmlFreeParserCtxt(ctxt);
578 ereport(level, (errmsg(msg)));
582 /* as usual, libxml error message contains '\n'; get rid of it */
583 xmlErrLen = strlen(libxmlErr->message); /* - 1; */
584 xmlErrDetail = (char *) palloc(xmlErrLen);
585 for (i = 0; i < xmlErrLen; i++)
587 if (libxmlErr->message[i] == '\n')
588 xmlErrDetail[i] = '.';
590 xmlErrDetail[i] = libxmlErr->message[i];
594 xmlFreeParserCtxt(ctxt);
597 ereport(level, (errmsg(msg), errdetail("%s", xmlErrDetail)));
603 * Error handler for libxml error messages
606 xml_errorHandler(void *ctxt, const char *msg,...)
611 vsnprintf(xml_errbuf, XML_ERRBUF_SIZE, msg, args);
613 /* Now copy the argument across */
614 if (xml_errmsg == NULL)
615 xml_errmsg = pstrdup(xml_errbuf);
618 int32 xsize = strlen(xml_errmsg);
620 xml_errmsg = repalloc(xml_errmsg, (size_t) (xsize + strlen(xml_errbuf) + 1));
621 strncpy(&xml_errmsg[xsize - 1], xml_errbuf, strlen(xml_errbuf));
622 xml_errmsg[xsize + strlen(xml_errbuf) - 1] = '\0';
624 memset(xml_errbuf, 0, XML_ERRBUF_SIZE);
629 * Return error message by libxml error code
630 * TODO make them closer to recommendations from Postgres manual
633 xml_ereport_by_code(int level, char *msg, int code)
639 ereport(level, (errmsg(msg)));
644 case XML_ERR_INTERNAL_ERROR:
645 det = "libxml internal error";
647 case XML_ERR_ENTITY_LOOP:
648 det = "Detected an entity reference loop";
650 case XML_ERR_ENTITY_NOT_STARTED:
651 det = "EntityValue: \" or ' expected";
653 case XML_ERR_ENTITY_NOT_FINISHED:
654 det = "EntityValue: \" or ' expected";
656 case XML_ERR_ATTRIBUTE_NOT_STARTED:
657 det = "AttValue: \" or ' expected";
659 case XML_ERR_LT_IN_ATTRIBUTE:
660 det = "Unescaped '<' not allowed in attributes values";
662 case XML_ERR_LITERAL_NOT_STARTED:
663 det = "SystemLiteral \" or ' expected";
665 case XML_ERR_LITERAL_NOT_FINISHED:
666 det = "Unfinished System or Public ID \" or ' expected";
668 case XML_ERR_MISPLACED_CDATA_END:
669 det = "Sequence ']]>' not allowed in content";
671 case XML_ERR_URI_REQUIRED:
672 det = "SYSTEM or PUBLIC, the URI is missing";
674 case XML_ERR_PUBID_REQUIRED:
675 det = "PUBLIC, the Public Identifier is missing";
677 case XML_ERR_HYPHEN_IN_COMMENT:
678 det = "Comment must not contain '--' (double-hyphen)";
680 case XML_ERR_PI_NOT_STARTED:
681 det = "xmlParsePI : no target name";
683 case XML_ERR_RESERVED_XML_NAME:
684 det = "Invalid PI name";
686 case XML_ERR_NOTATION_NOT_STARTED:
687 det = "NOTATION: Name expected here";
689 case XML_ERR_NOTATION_NOT_FINISHED:
690 det = "'>' required to close NOTATION declaration";
692 case XML_ERR_VALUE_REQUIRED:
693 det = "Entity value required";
695 case XML_ERR_URI_FRAGMENT:
696 det = "Fragment not allowed";
698 case XML_ERR_ATTLIST_NOT_STARTED:
699 det = "'(' required to start ATTLIST enumeration";
701 case XML_ERR_NMTOKEN_REQUIRED:
702 det = "NmToken expected in ATTLIST enumeration";
704 case XML_ERR_ATTLIST_NOT_FINISHED:
705 det = "')' required to finish ATTLIST enumeration";
707 case XML_ERR_MIXED_NOT_STARTED:
708 det = "MixedContentDecl : '|' or ')*' expected";
710 case XML_ERR_PCDATA_REQUIRED:
711 det = "MixedContentDecl : '#PCDATA' expected";
713 case XML_ERR_ELEMCONTENT_NOT_STARTED:
714 det = "ContentDecl : Name or '(' expected";
716 case XML_ERR_ELEMCONTENT_NOT_FINISHED:
717 det = "ContentDecl : ',' '|' or ')' expected";
719 case XML_ERR_PEREF_IN_INT_SUBSET:
720 det = "PEReference: forbidden within markup decl in internal subset";
722 case XML_ERR_GT_REQUIRED:
723 det = "Expected '>'";
725 case XML_ERR_CONDSEC_INVALID:
726 det = "XML conditional section '[' expected";
728 case XML_ERR_EXT_SUBSET_NOT_FINISHED:
729 det = "Content error in the external subset";
731 case XML_ERR_CONDSEC_INVALID_KEYWORD:
732 det = "conditional section INCLUDE or IGNORE keyword expected";
734 case XML_ERR_CONDSEC_NOT_FINISHED:
735 det = "XML conditional section not closed";
737 case XML_ERR_XMLDECL_NOT_STARTED:
738 det = "Text declaration '<?xml' required";
740 case XML_ERR_XMLDECL_NOT_FINISHED:
741 det = "parsing XML declaration: '?>' expected";
743 case XML_ERR_EXT_ENTITY_STANDALONE:
744 det = "external parsed entities cannot be standalone";
746 case XML_ERR_ENTITYREF_SEMICOL_MISSING:
747 det = "EntityRef: expecting ';'";
749 case XML_ERR_DOCTYPE_NOT_FINISHED:
750 det = "DOCTYPE improperly terminated";
752 case XML_ERR_LTSLASH_REQUIRED:
753 det = "EndTag: '</' not found";
755 case XML_ERR_EQUAL_REQUIRED:
756 det = "Expected '='";
758 case XML_ERR_STRING_NOT_CLOSED:
759 det = "String not closed expecting \" or '";
761 case XML_ERR_STRING_NOT_STARTED:
762 det = "String not started expecting ' or \"";
764 case XML_ERR_ENCODING_NAME:
765 det = "Invalid XML encoding name";
767 case XML_ERR_STANDALONE_VALUE:
768 det = "Standalone accepts only 'yes' or 'no'";
770 case XML_ERR_DOCUMENT_EMPTY:
771 det = "Document is empty";
773 case XML_ERR_DOCUMENT_END:
774 det = "Extra content at the end of the document";
776 case XML_ERR_NOT_WELL_BALANCED:
777 det = "Chunk is not well balanced";
779 case XML_ERR_EXTRA_CONTENT:
780 det = "Extra content at the end of well balanced chunk";
782 case XML_ERR_VERSION_MISSING:
783 det = "Malformed declaration expecting version";
785 /* more err codes... Please, keep the order! */
786 case XML_ERR_ATTRIBUTE_WITHOUT_VALUE: /* 41 */
787 det ="Attribute without value";
789 case XML_ERR_ATTRIBUTE_REDEFINED:
790 det ="Attribute defined more than once in the same element";
792 case XML_ERR_COMMENT_NOT_FINISHED: /* 45 */
793 det = "Comment is not finished";
795 case XML_ERR_NAME_REQUIRED: /* 68 */
796 det = "Element name not found";
798 case XML_ERR_TAG_NOT_FINISHED: /* 77 */
799 det = "Closing tag not found";
802 det = "Unregistered error (libxml error code: %d)";
803 ereport(DEBUG1, (errmsg("Check out \"libxml/xmlerror.h\" and bring errcode \"%d\" processing to \"xml.c\".", code)));
806 if (xml_errmsg != NULL)
808 ereport(DEBUG1, (errmsg("%s", xml_errmsg)));
812 ereport(level, (errmsg(msg), errdetail(det, code)));
817 * Convert one char in the current server encoding to a Unicode
821 sqlchar_to_unicode(char *s)
827 utf8string = (char *) pg_do_encoding_conversion((unsigned char *) s,
829 GetDatabaseEncoding(),
832 save_enc = GetDatabaseEncoding();
833 SetDatabaseEncoding(PG_UTF8);
834 pg_mb2wchar_with_len(utf8string, &ret, pg_mblen(s));
835 SetDatabaseEncoding(save_enc);
842 is_valid_xml_namefirst(pg_wchar c)
844 /* (Letter | '_' | ':') */
845 return (xmlIsBaseCharQ(c) || xmlIsIdeographicQ(c)
846 || c == '_' || c == ':');
851 is_valid_xml_namechar(pg_wchar c)
853 /* Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender */
854 return (xmlIsBaseCharQ(c) || xmlIsIdeographicQ(c)
856 || c == '.' || c == '-' || c == '_' || c == ':'
857 || xmlIsCombiningQ(c)
858 || xmlIsExtenderQ(c));
860 #endif /* USE_LIBXML */
864 * Map SQL identifier to XML name; see SQL/XML:2003 section 9.1.
867 map_sql_identifier_to_xml_name(char *ident, bool fully_escaped)
873 initStringInfo(&buf);
875 for (p = ident; *p; p += pg_mblen(p))
877 if (*p == ':' && (p == ident || fully_escaped))
878 appendStringInfo(&buf, "_x003A_");
879 else if (*p == '_' && *(p+1) == 'x')
880 appendStringInfo(&buf, "_x005F_");
881 else if (fully_escaped && p == ident &&
882 pg_strncasecmp(p, "xml", 3) == 0)
885 appendStringInfo(&buf, "_x0078_");
887 appendStringInfo(&buf, "_x0058_");
891 pg_wchar u = sqlchar_to_unicode(p);
894 ? !is_valid_xml_namefirst(u)
895 : !is_valid_xml_namechar(u))
896 appendStringInfo(&buf, "_x%04X_", (unsigned int) u);
898 appendBinaryStringInfo(&buf, p, pg_mblen(p));
903 #else /* not USE_LIBXML */
906 #endif /* not USE_LIBXML */