1 /* Parser interface for DOM-based parser (libxml) rather than
2 stream-based SAX-type parser */
6 #include "executor/spi.h"
9 #include "lib/stringinfo.h"
13 #include <libxml/xpath.h>
14 #include <libxml/tree.h>
15 #include <libxml/xmlmemory.h>
16 #include <libxml/xmlerror.h>
17 #include <libxml/parserInternals.h>
24 static void *pgxml_palloc(size_t size);
25 static void *pgxml_repalloc(void *ptr, size_t size);
26 static void pgxml_pfree(void *ptr);
27 static char *pgxml_pstrdup(const char *string);
28 static void pgxml_errorHandler(void *ctxt, const char *msg,...);
30 void elog_error(int level, char *explain, int force);
31 void pgxml_parser_init(void);
33 static xmlChar *pgxmlNodeSetToText(xmlNodeSetPtr nodeset,
34 xmlChar * toptagname, xmlChar * septagname,
37 text *pgxml_result_to_text(xmlXPathObjectPtr res, xmlChar * toptag,
38 xmlChar * septag, xmlChar * plainsep);
40 xmlChar *pgxml_texttoxmlchar(text *textstring);
42 static xmlXPathObjectPtr pgxml_xpath(text *document, xmlChar * xpath);
45 Datum xml_is_well_formed(PG_FUNCTION_ARGS);
46 Datum xml_encode_special_chars(PG_FUNCTION_ARGS);
47 Datum xpath_nodeset(PG_FUNCTION_ARGS);
48 Datum xpath_string(PG_FUNCTION_ARGS);
49 Datum xpath_number(PG_FUNCTION_ARGS);
50 Datum xpath_bool(PG_FUNCTION_ARGS);
51 Datum xpath_list(PG_FUNCTION_ARGS);
52 Datum xpath_table(PG_FUNCTION_ARGS);
54 /* Global variables */
55 char *errbuf; /* per line error buffer */
56 char *pgxml_errorMsg = NULL; /* overall error message */
58 /* Convenience macros */
60 #define GET_TEXT(cstrp) DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(cstrp)))
61 #define GET_STR(textp) DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(textp)))
63 #define ERRBUF_SIZE 200
65 /* memory handling passthrough functions (e.g. palloc, pstrdup are
66 currently macros, and the others might become so...) */
69 pgxml_palloc(size_t size)
71 /* elog(DEBUG1,"Alloc %d in CMC %p",size,CurrentMemoryContext); */
76 pgxml_repalloc(void *ptr, size_t size)
78 /* elog(DEBUG1,"ReAlloc in CMC %p",CurrentMemoryContext);*/
79 return repalloc(ptr, size);
83 pgxml_pfree(void *ptr)
85 /* elog(DEBUG1,"Free in CMC %p",CurrentMemoryContext); */
90 pgxml_pstrdup(const char *string)
92 return pstrdup(string);
95 /* The error handling function. This formats an error message and sets
96 * a flag - an ereport will be issued prior to return
100 pgxml_errorHandler(void *ctxt, const char *msg,...)
105 vsnprintf(errbuf, ERRBUF_SIZE, msg, args);
107 /* Now copy the argument across */
108 if (pgxml_errorMsg == NULL)
109 pgxml_errorMsg = pstrdup(errbuf);
112 int32 xsize = strlen(pgxml_errorMsg);
114 pgxml_errorMsg = repalloc(pgxml_errorMsg,
115 (size_t) (xsize + strlen(errbuf) + 1));
116 strncpy(&pgxml_errorMsg[xsize - 1], errbuf, strlen(errbuf));
117 pgxml_errorMsg[xsize + strlen(errbuf) - 1] = '\0';
120 memset(errbuf, 0, ERRBUF_SIZE);
123 /* This function reports the current message at the level specified */
125 elog_error(int level, char *explain, int force)
127 if (force || (pgxml_errorMsg != NULL))
129 if (pgxml_errorMsg == NULL)
131 ereport(level, (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
136 ereport(level, (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
137 errmsg("%s:%s", explain, pgxml_errorMsg)));
138 pfree(pgxml_errorMsg);
147 * This code could also set parser settings from user-supplied info.
148 * Quite how these settings are made is another matter :)
151 xmlMemSetup(pgxml_pfree, pgxml_palloc, pgxml_repalloc, pgxml_pstrdup);
154 xmlSetGenericErrorFunc(NULL, pgxml_errorHandler);
156 xmlSubstituteEntitiesDefault(1);
157 xmlLoadExtDtdDefaultValue = 1;
159 pgxml_errorMsg = NULL;
161 errbuf = palloc(200);
162 memset(errbuf, 0, 200);
167 /* Returns true if document is well-formed */
169 PG_FUNCTION_INFO_V1(xml_is_well_formed);
172 xml_is_well_formed(PG_FUNCTION_ARGS)
174 /* called as xml_is_well_formed(document) */
176 text *t = PG_GETARG_TEXT_P(0); /* document buffer */
177 int32 docsize = VARSIZE(t) - VARHDRSZ;
181 doctree = xmlParseMemory((char *) VARDATA(t), docsize);
185 PG_RETURN_BOOL(false); /* i.e. not well-formed */
189 PG_RETURN_BOOL(true);
193 /* Encodes special characters (<, >, &, " and \r) as XML entities */
195 PG_FUNCTION_INFO_V1(xml_encode_special_chars);
198 xml_encode_special_chars(PG_FUNCTION_ARGS)
200 text *tin = PG_GETARG_TEXT_P(0);
206 ts = pgxml_texttoxmlchar(tin);
208 tt = xmlEncodeSpecialChars(NULL, ts);
212 ressize = strlen(tt);
213 tout = (text *) palloc(ressize + VARHDRSZ);
214 memcpy(VARDATA(tout), tt, ressize);
215 VARATT_SIZEP(tout) = ressize + VARHDRSZ;
219 PG_RETURN_TEXT_P(tout);
224 pgxmlNodeSetToText(xmlNodeSetPtr nodeset,
225 xmlChar * toptagname,
226 xmlChar * septagname,
229 /* Function translates a nodeset into a text representation */
232 * iterates over each node in the set and calls xmlNodeDump to write it to
233 * an xmlBuffer -from which an xmlChar * string is returned.
236 /* each representation is surrounded by <tagname> ... </tagname> */
239 * plainsep is an ordinary (not tag) separator - if used, then nodes are
240 * cast to string as output method
248 buf = xmlBufferCreate();
250 if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0))
252 xmlBufferWriteChar(buf, "<");
253 xmlBufferWriteCHAR(buf, toptagname);
254 xmlBufferWriteChar(buf, ">");
258 for (i = 0; i < nodeset->nodeNr; i++)
261 if (plainsep != NULL)
263 xmlBufferWriteCHAR(buf,
264 xmlXPathCastNodeToString(nodeset->nodeTab[i]));
266 /* If this isn't the last entry, write the plain sep. */
267 if (i < (nodeset->nodeNr) - 1)
268 xmlBufferWriteChar(buf, plainsep);
274 if ((septagname != NULL) && (xmlStrlen(septagname) > 0))
276 xmlBufferWriteChar(buf, "<");
277 xmlBufferWriteCHAR(buf, septagname);
278 xmlBufferWriteChar(buf, ">");
281 nodeset->nodeTab[i]->doc,
285 if ((septagname != NULL) && (xmlStrlen(septagname) > 0))
287 xmlBufferWriteChar(buf, "</");
288 xmlBufferWriteCHAR(buf, septagname);
289 xmlBufferWriteChar(buf, ">");
295 if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0))
297 xmlBufferWriteChar(buf, "</");
298 xmlBufferWriteCHAR(buf, toptagname);
299 xmlBufferWriteChar(buf, ">");
301 result = xmlStrdup(buf->content);
307 /* Translate a PostgreSQL "varlena" -i.e. a variable length parameter
308 * into the libxml2 representation
312 pgxml_texttoxmlchar(text *textstring)
317 txsize = VARSIZE(textstring) - VARHDRSZ;
318 res = (xmlChar *) palloc(txsize + 1);
319 memcpy((char *) res, VARDATA(textstring), txsize);
324 /* Public visible XPath functions */
326 /* This is a "raw" xpath function. Check that it returns child elements
330 PG_FUNCTION_INFO_V1(xpath_nodeset);
333 xpath_nodeset(PG_FUNCTION_ARGS)
343 /* PG_GETARG_TEXT_P(0) is document buffer */
344 xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */
346 toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2));
347 septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3));
349 pathsize = VARSIZE(xpathsupp) - VARHDRSZ;
351 xpath = pgxml_texttoxmlchar(xpathsupp);
353 xpres = pgxml_result_to_text(
354 pgxml_xpath(PG_GETARG_TEXT_P(0), xpath),
355 toptag, septag, NULL);
357 /* xmlCleanupParser(); done by result_to_text routine */
362 PG_RETURN_TEXT_P(xpres);
365 /* The following function is almost identical, but returns the elements in */
368 PG_FUNCTION_INFO_V1(xpath_list);
371 xpath_list(PG_FUNCTION_ARGS)
380 /* PG_GETARG_TEXT_P(0) is document buffer */
381 xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */
383 plainsep = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2));
385 pathsize = VARSIZE(xpathsupp) - VARHDRSZ;
387 xpath = pgxml_texttoxmlchar(xpathsupp);
389 xpres = pgxml_result_to_text(
390 pgxml_xpath(PG_GETARG_TEXT_P(0), xpath),
391 NULL, NULL, plainsep);
393 /* xmlCleanupParser(); done by result_to_text routine */
398 PG_RETURN_TEXT_P(xpres);
402 PG_FUNCTION_INFO_V1(xpath_string);
405 xpath_string(PG_FUNCTION_ARGS)
413 /* PG_GETARG_TEXT_P(0) is document buffer */
414 xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */
416 pathsize = VARSIZE(xpathsupp) - VARHDRSZ;
419 * We encapsulate the supplied path with "string()" = 8 chars + 1 for NUL
422 /* We could try casting to string using the libxml function? */
424 xpath = (xmlChar *) palloc(pathsize + 9);
425 memcpy((char *) (xpath + 7), VARDATA(xpathsupp), pathsize);
426 strncpy((char *) xpath, "string(", 7);
427 xpath[pathsize + 7] = ')';
428 xpath[pathsize + 8] = '\0';
430 xpres = pgxml_result_to_text(
431 pgxml_xpath(PG_GETARG_TEXT_P(0), xpath),
439 PG_RETURN_TEXT_P(xpres);
443 PG_FUNCTION_INFO_V1(xpath_number);
446 xpath_number(PG_FUNCTION_ARGS)
455 xmlXPathObjectPtr res;
457 /* PG_GETARG_TEXT_P(0) is document buffer */
458 xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */
460 pathsize = VARSIZE(xpathsupp) - VARHDRSZ;
462 xpath = pgxml_texttoxmlchar(xpathsupp);
464 res = pgxml_xpath(PG_GETARG_TEXT_P(0), xpath);
473 fRes = xmlXPathCastToNumber(res);
475 if (xmlXPathIsNaN(fRes))
478 PG_RETURN_FLOAT4(fRes);
483 PG_FUNCTION_INFO_V1(xpath_bool);
486 xpath_bool(PG_FUNCTION_ARGS)
495 xmlXPathObjectPtr res;
497 /* PG_GETARG_TEXT_P(0) is document buffer */
498 xpathsupp = PG_GETARG_TEXT_P(1); /* XPath expression */
500 pathsize = VARSIZE(xpathsupp) - VARHDRSZ;
502 xpath = pgxml_texttoxmlchar(xpathsupp);
504 res = pgxml_xpath(PG_GETARG_TEXT_P(0), xpath);
510 PG_RETURN_BOOL(false);
513 bRes = xmlXPathCastToBoolean(res);
515 PG_RETURN_BOOL(bRes);
521 /* Core function to evaluate XPath query */
524 pgxml_xpath(text *document, xmlChar * xpath)
528 xmlXPathContextPtr ctxt;
529 xmlXPathObjectPtr res;
531 xmlXPathCompExprPtr comppath;
536 docsize = VARSIZE(document) - VARHDRSZ;
540 doctree = xmlParseMemory((char *) VARDATA(document), docsize);
542 { /* not well-formed */
546 ctxt = xmlXPathNewContext(doctree);
547 ctxt->node = xmlDocGetRootElement(doctree);
550 /* compile the path */
551 comppath = xmlXPathCompile(xpath);
552 if (comppath == NULL)
556 elog_error(ERROR, "XPath Syntax Error", 1);
561 /* Now evaluate the path expression. */
562 res = xmlXPathCompiledEval(comppath, ctxt);
563 xmlXPathFreeCompExpr(comppath);
567 xmlXPathFreeContext(ctxt);
568 /* xmlCleanupParser(); */
573 /* xmlFreeDoc(doctree); */
579 pgxml_result_to_text(xmlXPathObjectPtr res,
596 xpresstr = pgxmlNodeSetToText(res->nodesetval,
602 xpresstr = xmlStrdup(res->stringval);
606 elog(NOTICE, "unsupported XQuery result: %d", res->type);
607 xpresstr = xmlStrdup("<unsupported/>");
611 /* Now convert this result back to text */
612 ressize = strlen(xpresstr);
613 xpres = (text *) palloc(ressize + VARHDRSZ);
614 memcpy(VARDATA(xpres), xpresstr, ressize);
615 VARATT_SIZEP(xpres) = ressize + VARHDRSZ;
617 /* Free various storage */
619 /* xmlFreeDoc(doctree); -- will die at end of tuple anyway */
623 elog_error(ERROR, "XPath error", 0);
629 /* xpath_table is a table function. It needs some tidying (as do the
630 * other functions here!)
633 PG_FUNCTION_INFO_V1(xpath_table);
636 xpath_table(PG_FUNCTION_ARGS)
638 /* SPI (input tuple) support */
639 SPITupleTable *tuptable;
641 TupleDesc spi_tupdesc;
643 /* Output tuple (tuplestore) support */
644 Tuplestorestate *tupstore = NULL;
645 TupleDesc ret_tupdesc;
648 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
649 AttInMetadata *attinmeta;
650 MemoryContext per_query_ctx;
651 MemoryContext oldcontext;
653 /* Function parameters */
654 char *pkeyfield = GET_STR(PG_GETARG_TEXT_P(0));
655 char *xmlfield = GET_STR(PG_GETARG_TEXT_P(1));
656 char *relname = GET_STR(PG_GETARG_TEXT_P(2));
657 char *xpathset = GET_STR(PG_GETARG_TEXT_P(3));
658 char *condition = GET_STR(PG_GETARG_TEXT_P(4));
663 xmlChar *pathsep = "|";
670 int rownr; /* For issuing multiple rows from one original
672 int had_values; /* To determine end of nodeset results */
674 StringInfoData query_buf;
676 /* We only have a valid tuple description in table function mode */
677 if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
679 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
680 errmsg("set-valued function called in context that cannot accept a set")));
681 if (rsinfo->expectedDesc == NULL)
683 (errcode(ERRCODE_SYNTAX_ERROR),
684 errmsg("xpath_table must be called as a table function")));
687 * We want to materialise because it means that we don't have to carry
688 * libxml2 parser state between invocations of this function
690 if (!(rsinfo->allowedModes & SFRM_Materialize))
692 (errcode(ERRCODE_SYNTAX_ERROR),
693 errmsg("xpath_table requires Materialize mode, but it is not "
694 "allowed in this context")));
697 * The tuplestore must exist in a higher context than this function call
698 * (per_query_ctx is used)
701 per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
702 oldcontext = MemoryContextSwitchTo(per_query_ctx);
705 * Create the tuplestore - work_mem is the max in-memory size before a
706 * file is created on disk to hold it.
708 tupstore = tuplestore_begin_heap(true, false, work_mem);
710 MemoryContextSwitchTo(oldcontext);
712 /* get the requested return tuple description */
713 ret_tupdesc = CreateTupleDescCopy(rsinfo->expectedDesc);
716 * At the moment we assume that the returned attributes make sense for the
717 XPath specified (i.e. we trust the caller). It's not fatal if they get
718 * it wrong - the input function for the column type will raise an error
719 * if the path result can't be converted into the correct binary
723 attinmeta = TupleDescGetAttInMetadata(ret_tupdesc);
725 /* Set return mode and allocate value space. */
726 rsinfo->returnMode = SFRM_Materialize;
727 rsinfo->setDesc = ret_tupdesc;
729 values = (char **) palloc(ret_tupdesc->natts * sizeof(char *));
731 xpaths = (xmlChar **) palloc(ret_tupdesc->natts * sizeof(xmlChar *));
733 /* Split XPaths. xpathset is a writable CString. */
735 /* Note that we stop splitting once we've done all needed for tupdesc */
741 xpaths[numpaths] = pos;
742 pos = strstr(pos, pathsep);
749 } while ((pos != NULL) && (numpaths < (ret_tupdesc->natts - 1)));
751 /* Now build query */
752 initStringInfo(&query_buf);
754 /* Build initial sql statement */
755 appendStringInfo(&query_buf, "SELECT %s, %s FROM %s WHERE %s",
763 if ((ret = SPI_connect()) < 0)
764 elog(ERROR, "xpath_table: SPI_connect returned %d", ret);
766 if ((ret = SPI_exec(query_buf.data, 0)) != SPI_OK_SELECT)
767 elog(ERROR, "xpath_table: SPI execution failed for query %s", query_buf.data);
769 proc = SPI_processed;
770 /* elog(DEBUG1,"xpath_table: SPI returned %d rows",proc); */
771 tuptable = SPI_tuptable;
772 spi_tupdesc = tuptable->tupdesc;
774 /* Switch out of SPI context */
775 MemoryContextSwitchTo(oldcontext);
778 /* Check that SPI returned correct result. If you put a comma into one of
779 * the function parameters, this will catch it when the SPI query returns
783 if (spi_tupdesc->natts != 2)
785 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
786 errmsg("expression returning multiple columns is not valid in parameter list"),
787 errdetail("Expected two columns in SPI result, got %d.", spi_tupdesc->natts)));
790 /* Setup the parser. Beware that this must happen in the same context as the
791 * cleanup - which means that any error from here on must do cleanup to
792 * ensure that the entity table doesn't get freed by being out of context.
796 /* For each row i.e. document returned from SPI */
797 for (i = 0; i < proc; i++)
803 xmlXPathContextPtr ctxt;
804 xmlXPathObjectPtr res;
808 xmlXPathCompExprPtr comppath;
810 /* Extract the row data as C Strings */
812 spi_tuple = tuptable->vals[i];
813 pkey = SPI_getvalue(spi_tuple, spi_tupdesc, 1);
814 xmldoc = SPI_getvalue(spi_tuple, spi_tupdesc, 2);
818 * Clear the values array, so that not-well-formed documents return
819 * NULL in all columns.
822 /* Note that this also means that spare columns will be NULL. */
823 for (j = 0; j < ret_tupdesc->natts; j++)
826 /* Insert primary key */
829 /* Parse the document */
830 doctree = xmlParseMemory(xmldoc, strlen(xmldoc));
833 { /* not well-formed, so output all-NULL tuple */
835 ret_tuple = BuildTupleFromCStrings(attinmeta, values);
836 oldcontext = MemoryContextSwitchTo(per_query_ctx);
837 tuplestore_puttuple(tupstore, ret_tuple);
838 MemoryContextSwitchTo(oldcontext);
839 heap_freetuple(ret_tuple);
843 /* New loop here - we have to deal with nodeset results */
848 /* Now evaluate the set of xpaths. */
850 for (j = 0; j < numpaths; j++)
853 ctxt = xmlXPathNewContext(doctree);
854 ctxt->node = xmlDocGetRootElement(doctree);
855 xmlSetGenericErrorFunc(ctxt, pgxml_errorHandler);
857 /* compile the path */
858 comppath = xmlXPathCompile(xpaths[j]);
859 if (comppath == NULL)
864 elog_error(ERROR, "XPath Syntax Error", 1);
866 PG_RETURN_NULL(); /* Keep compiler happy */
869 /* Now evaluate the path expression. */
870 res = xmlXPathCompiledEval(comppath, ctxt);
871 xmlXPathFreeCompExpr(comppath);
878 /* We see if this nodeset has enough nodes */
879 if ((res->nodesetval != NULL) && (rownr < res->nodesetval->nodeNr))
882 xmlXPathCastNodeToString(res->nodesetval->nodeTab[rownr]);
891 resstr = xmlStrdup(res->stringval);
895 elog(NOTICE, "unsupported XQuery result: %d", res->type);
896 resstr = xmlStrdup("<unsupported/>");
901 * Insert this into the appropriate column in the
904 values[j + 1] = resstr;
906 xmlXPathFreeContext(ctxt);
908 /* Now add the tuple to the output, if there is one. */
911 ret_tuple = BuildTupleFromCStrings(attinmeta, values);
912 oldcontext = MemoryContextSwitchTo(per_query_ctx);
913 tuplestore_puttuple(tupstore, ret_tuple);
914 MemoryContextSwitchTo(oldcontext);
915 heap_freetuple(ret_tuple);
920 } while (had_values);
931 /* Needed to flag completeness in 7.3.1. 7.4 defines it as a no-op. */
932 tuplestore_donestoring(tupstore);
936 rsinfo->setResult = tupstore;
939 * SFRM_Materialize mode expects us to return a NULL Datum. The actual
940 * tuples are in our tuplestore and passed back through rsinfo->setResult.
941 * rsinfo->setDesc is set to the tuple description that we actually used
942 * to build our tuples with, so the caller can verify we did what it was