From 113bb9b5ac45354583fd37d6be9e51f24afc5f62 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Mon, 30 Jul 2001 14:59:02 +0000 Subject: [PATCH] XML conversion utility, requires expat library. John Gray --- contrib/README | 6 +- contrib/xml/Makefile | 43 +++++++ contrib/xml/README | 78 ++++++++++++ contrib/xml/TODO | 83 +++++++++++++ contrib/xml/pgxml.c | 310 +++++++++++++++++++++++++++++++++++++++++++++++ contrib/xml/pgxml.h | 60 +++++++++ contrib/xml/pgxml.source | 7 ++ contrib/xml/xpath-yacc | 178 +++++++++++++++++++++++++++ 8 files changed, 764 insertions(+), 1 deletion(-) create mode 100644 contrib/xml/Makefile create mode 100644 contrib/xml/README create mode 100644 contrib/xml/TODO create mode 100644 contrib/xml/pgxml.c create mode 100644 contrib/xml/pgxml.h create mode 100644 contrib/xml/pgxml.source create mode 100644 contrib/xml/xpath-yacc diff --git a/contrib/README b/contrib/README index 6c46010b57..2e071e2f83 100644 --- a/contrib/README +++ b/contrib/README @@ -1,6 +1,6 @@ The PostgreSQL contrib tree -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +--------------------------- This subtree contains tools, modules, and examples that are not maintained as part of the core PostgreSQL system, mainly because @@ -177,3 +177,7 @@ userlock - vacuumlo - Remove orphaned large objects by Peter T Mount + +xml - + Storing XML in PostgreSQL + by John Gray diff --git a/contrib/xml/Makefile b/contrib/xml/Makefile new file mode 100644 index 0000000000..39e012dd1f --- /dev/null +++ b/contrib/xml/Makefile @@ -0,0 +1,43 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Adapted from tutorial makefile +#------------------------------------------------------------------------- + +subdir = contrib/xml +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global + +override CFLAGS+= $(CFLAGS_SL) + + +# +# DLOBJS is the dynamically-loaded object files. The "funcs" queries +# include CREATE FUNCTIONs that load routines from these files. 
+# +DLOBJS= pgxml$(DLSUFFIX) + + +QUERIES= pgxml.sql + +all: $(DLOBJS) $(QUERIES) + +# Requires the expat library + +%.so: %.o + $(CC) -shared -lexpat -o $@ $< + + +%.sql: %.source + if [ -z "$$USER" ]; then USER=$$LOGNAME; fi; \ + if [ -z "$$USER" ]; then USER=`whoami`; fi; \ + if [ -z "$$USER" ]; then echo 'Cannot deduce $$USER.'; exit 1; fi; \ + rm -f $@; \ + C=`pwd`; \ + sed -e "s:_CWD_:$$C:g" \ + -e "s:_OBJWD_:$$C:g" \ + -e "s:_DLSUFFIX_:$(DLSUFFIX):g" \ + -e "s/_USER_/$$USER/g" < $< > $@ + +clean: + rm -f $(DLOBJS) $(QUERIES) diff --git a/contrib/xml/README b/contrib/xml/README new file mode 100644 index 0000000000..068615eaa8 --- /dev/null +++ b/contrib/xml/README @@ -0,0 +1,78 @@ +This package contains a couple of simple routines for hooking the +expat XML parser up to PostgreSQL. This is a work-in-progress and all +very basic at the moment (see the file TODO for some outline of what +remains to be done). + +At present, two functions are defined, one which checks +well-formedness, and the other which performs very simple XPath-type +queries. + +Prerequisite: + +expat parser 1.95.0 or newer (http://expat.sourceforge.net) + +I used a shared library version -I'm sure you could use a static +library if you wished though. I had no problems compiling from source. + +Function documentation and usage: +--------------------------------- + +pgxml_parse(text) returns bool + parses the provided text and returns true or false if it is +well-formed or not. It returns NULL if the parser couldn't be +created for any reason. + +pgxml_xpath(text doc, text xpath, int n) returns text + parses doc and returns the cdata of the nth occurence of +the "XPath" listed. See below for details on the syntax. 
+ + +Example: + +Given a table docstore: + + Attribute | Type | Modifier +-----------+---------+---------- + docid | integer | + document | text | + +containing documents such as (these are archaeological site +descriptions, in case anyone is wondering): + + + + Church Farm, Ashton Keynes + watching brief + SU04209424 + + +one can type: + +select docid, +pgxml_xpath(document,'/site/name',1) as sitename, +pgxml_xpath(document,'/site/location',1) as location + from docstore; + +and get as output: + + docid | sitename | location +-------+-----------------------------+------------ + 1 | Church Farm, Ashton Keynes | SU04209424 + 2 | Glebe Farm, Long Itchington | SP41506500 +(2 rows) + + +"XPath" syntax supported +------------------------ + +At present it only supports paths of the form: +'tag1/tag2' or '/tag1/tag2' + +The first case will find any <tag2> within a <tag1>, the second will +find any <tag2> within a <tag1> at the top level of the document. + +The real XPath is much more complex (see TODO file). + + +John Gray 26 July 2001 + diff --git a/contrib/xml/TODO b/contrib/xml/TODO new file mode 100644 index 0000000000..5bec69b4a7 --- /dev/null +++ b/contrib/xml/TODO @@ -0,0 +1,83 @@ +PGXML TODO List +=============== + +Some of these items still require much more thought! The data model +for XML documents and the parsing model of expat don't really fit so +well with a standard SQL model. + +1. Generalised XML parsing support + +Allow a user to specify handlers (in any PL) to be used by the parser. +This must permit distinct sets of parser settings -user may want some +documents in a database to be parsed with one set of handlers, others +with a different set. + +i.e. the pgxml_parse function would take as parameters (document, +parsername) where parsername was the identifier for a collection of +handler etc. settings. + +"Stub" handlers in the pgxml code would invoke the functions through +the standard fmgr interface. The parser interface would define the +prototype for these functions. 
How does the handler function know +which document/context has resulted in it being called? + +Mechanism for defining collection of parser settings (in a table? -but +maybe copied for efficiency into a structure when first required by a +query?) + +2. Support for other parsers + +Expat may not be the best choice as a parser because a new parser +instance is needed for each document i.e. all the handlers must be set +again for each document. Another parser may have a more efficient way +of parsing a set of documents identically. + +3. XPath support + +Proper XPath support. I really need to sit down and plough +through the specification... + +The very simple text comparison system currently used is too +basic. Need to convert the path to an ordered list of nodes. Each node +is an element qualifier, and may have a list of attribute +qualifications attached. This probably requires a lex/yacc combination. +(James Clark has written a yacc grammar for XPath). Not all the +features of XPath are necessarily relevant. + +An option to return subdocuments (i.e. subelements AND cdata, not just +cdata). This should maybe be the default. + +4. Multiple occurrences of elements. + +This section is all very sketchy, and has various weaknesses. + +Is there a good way to optimise/index the results of certain XPath +operations to make them faster?: + +select docid, pgxml_xpath(document,'/site/location',1) as location +where pgxml_xpath(document,'/site/name',1) = 'Church Farm'; + +and with multiple element occurrences in a document? + +select d.docid, pgxml_xpath(d.document,'/site/location',1) +from docstore d, +pgxml_xpaths('docstore','document','feature/type','docid') ft +where ft.key = d.docid and ft.value ='Limekiln'; + +pgxml_xpaths params are relname, attrname, xpath, returnkey. It would +return a set of two-element tuples (key,value) consisting of the value of +returnkey, and the cdata value of the xpath. The XML document would be +defined by relname and attrname. 
+ +The pgxml_xpaths function could be the basis of a functional index, +which could speed up the above query very substantially, working +through the normal query planner mechanism. Syntax above is fragile +through using names rather than OID. + +John Gray + + + + + + diff --git a/contrib/xml/pgxml.c b/contrib/xml/pgxml.c new file mode 100644 index 0000000000..4728903157 --- /dev/null +++ b/contrib/xml/pgxml.c @@ -0,0 +1,310 @@ +/******************************************************** + * Interface code to parse an XML document using expat + ********************************************************/ + +#include "postgres.h" +#include "fmgr.h" + +#include "expat.h" +#include "pgxml.h" + +/* Memory management - we make expat use standard pg MM */ + +XML_Memory_Handling_Suite mhs; + +/* passthrough functions (palloc is a macro) */ + +static void *pgxml_palloc(size_t size) +{ + return palloc(size); +} + +static void *pgxml_repalloc(void *ptr, size_t size) +{ + return repalloc(ptr,size); +} + +static void pgxml_pfree(void *ptr) +{ + return pfree(ptr); +} + +static void pgxml_mhs_init() +{ + mhs.malloc_fcn = pgxml_palloc; + mhs.realloc_fcn = pgxml_repalloc; + mhs.free_fcn = pgxml_pfree; +} + +static void pgxml_handler_init() +{ + /* This code should set up the relevant handlers from user-supplied + settings. Quite how these settings are made is another matter :) */ +} + +/* Returns true if document is well-formed */ + +PG_FUNCTION_INFO_V1(pgxml_parse); + +Datum +pgxml_parse(PG_FUNCTION_ARGS) +{ + /* called as pgxml_parse(document) */ + XML_Parser p; + text *t = PG_GETARG_TEXT_P(0); /*document buffer */ + int32 docsize = VARSIZE(t) - VARHDRSZ; + + pgxml_mhs_init(); + + pgxml_handler_init(); + + p = XML_ParserCreate_MM(NULL,&mhs,NULL); + if (! p) { + elog(ERROR, "pgxml: Could not create expat parser"); + PG_RETURN_NULL(); /* seems appropriate if we couldn't parse */ + } + + if (! 
XML_Parse(p, (char *)VARDATA(t) , docsize, 1)) { + /* elog(NOTICE, "Parse error at line %d:%s", + XML_GetCurrentLineNumber(p), + XML_ErrorString(XML_GetErrorCode(p))); */ + XML_ParserFree(p); + PG_RETURN_BOOL(false); + } + + XML_ParserFree(p); + PG_RETURN_BOOL(true); +} + +/* XPath handling functions */ + +/* XPath support here is for a very skeletal kind of XPath! + It was easy to program though... */ + +/* This first is the core function that builds a result set. The + actual functions called by the user manipulate that result set + in various ways. +*/ + +static XPath_Results *build_xpath_results(text *doc, text *pathstr) +{ + XPath_Results *xpr; + char *res; + pgxml_udata *udata; + XML_Parser p; + int32 docsize; + + xpr = (XPath_Results *) palloc((sizeof(XPath_Results))); + memset((void *)xpr, 0, sizeof(XPath_Results)); + xpr->rescount=0; + + docsize=VARSIZE(doc)-VARHDRSZ; + + /* res isn't going to be the real return type, it is just a buffer */ + + res = (char *) palloc(docsize); + memset((void *)res, 0, docsize); + + xpr->resbuf = res; + + udata = (pgxml_udata *) palloc((sizeof(pgxml_udata))); + memset((void *)udata,0,sizeof(pgxml_udata)); + + udata->currentpath[0]='\0'; + udata->textgrab=0; + + udata->path= (char *) palloc(VARSIZE(pathstr)); + memcpy(udata->path, VARDATA(pathstr), VARSIZE(pathstr)-VARHDRSZ); + + udata->path[VARSIZE(pathstr)-VARHDRSZ]='\0'; + + udata->resptr = res; + udata->reslen = 0; + + udata->xpres = xpr; + + /* Now fire up the parser */ + pgxml_mhs_init(); + + p = XML_ParserCreate_MM(NULL,&mhs,NULL); + if (! p) { + elog(ERROR, "pgxml: Could not create expat parser"); + pfree(xpr); + pfree(udata->path); + pfree(udata); + pfree(res); + return NULL; + } + XML_SetUserData(p, (void *)udata); + + /* Set the handlers */ + + XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler); + XML_SetCharacterDataHandler(p, pgxml_charhandler); + + if (! 
XML_Parse(p, (char *)VARDATA(doc) , docsize, 1)) { + /* elog(NOTICE, "Parse error at line %d:%s", + XML_GetCurrentLineNumber(p), + XML_ErrorString(XML_GetErrorCode(p))); */ + XML_ParserFree(p); + pfree(xpr); + pfree(udata->path); + pfree(udata); + + return NULL; + } + + pfree(udata->path); + pfree(udata); + XML_ParserFree(p); + return xpr; +} + + +PG_FUNCTION_INFO_V1(pgxml_xpath); + +Datum +pgxml_xpath(PG_FUNCTION_ARGS) +{ + /* called as pgxml_xpath(document,pathstr, index) for the moment*/ + + XPath_Results *xpresults; + text *restext; + + text *t = PG_GETARG_TEXT_P(0); /*document buffer */ + text *t2= PG_GETARG_TEXT_P(1); + int32 ind = PG_GETARG_INT32(2) - 1; + + xpresults = build_xpath_results(t,t2); + + /* This needs to be changed depending on the mechanism for returning + our set of results. */ + + if (xpresults==NULL) /*parse error (not WF or parser failure) */ + { + PG_RETURN_NULL(); + } + + if (ind >= (xpresults->rescount)) + { + PG_RETURN_NULL(); + } + + restext = (text *) palloc(xpresults->reslens[ind]+VARHDRSZ); + memcpy(VARDATA(restext),xpresults->results[ind],xpresults->reslens[ind]); + + VARATT_SIZEP(restext) = xpresults->reslens[ind]+VARHDRSZ; + + pfree(xpresults->resbuf); + pfree(xpresults); + + PG_RETURN_TEXT_P(restext); +} + + +static void pgxml_pathcompare(void *userData) +{ + char *matchpos; + + matchpos=strstr(UD->currentpath, UD->path); + + if (matchpos == NULL) { /* Should we have more logic here ? */ + if (UD->textgrab) { + UD->textgrab=0; + pgxml_finalisegrabbedtext(userData); + } + return; + } + /* OK, we have a match of some sort. Now we need to check that + our match is anchored to the *end* of the string AND + that it is immediately preceded by a '/'*/ + /* This test wouldn't work if strlen (UD->path) overran the length + of the currentpath, but that's not possible because we got a match! 
*/ + + if ((matchpos + strlen(UD->path))[0]=='\0') + { + if ((UD->path)[0]=='/') { + if (matchpos == UD->currentpath) { + UD->textgrab=1; + } + } else { + if ((matchpos-1)[0]=='/') { + UD->textgrab=1; + } + } + } +} + +static void pgxml_starthandler(void *userData, const XML_Char *name, + const XML_Char **atts) +{ + + char sepstr[]="/"; + + if ((strlen(name)+strlen(UD->currentpath))>MAXPATHLENGTH-2) { + elog(NOTICE,"Path too long"); + } else { + strncat(UD->currentpath,sepstr,1); + strcat(UD->currentpath, name); + } + if (UD->textgrab) + { + /* Depending on user preference, should we "reconstitute" + the element into the result text? + */ + } else { + pgxml_pathcompare(userData); + } +} + +static void pgxml_endhandler(void *userData, const XML_Char *name) +{ + /* Start by removing the current element off the end of the + currentpath */ + + char *sepptr; + + sepptr=strrchr(UD->currentpath,'/'); + if (sepptr==NULL) { + elog(ERROR,"There's a problem..."); + sepptr=UD->currentpath; + } + if (strcmp(name, sepptr+1) !=0) { + elog(NOTICE,"Wanted [%s], got [%s]",sepptr,name); + /* unmatched entry, so do nothing */ + } else { + sepptr[0]='\0'; /* Chop that element off the end */ + } + + if (UD->textgrab) { + pgxml_pathcompare(userData); + } + +} + +static void pgxml_charhandler(void *userData, const XML_Char *s, int len) +{ + if (UD->textgrab) { + if (len>0) { + memcpy(UD->resptr,s,len); + UD->resptr += len; + UD->reslen += len; + } + } +} +/* Should I be using PG list types here? */ + +static void pgxml_finalisegrabbedtext(void *userData) +{ + /* In res/reslen, we have a single result. 
*/ + UD->xpres->results[UD->xpres->rescount]= UD->resptr - UD->reslen; + UD->xpres->reslens[UD->xpres->rescount]= UD->reslen; + UD->reslen=0; + UD->xpres->rescount++; + + /* This effectively concatenates all the results together but we + do know where one ends and the next begins */ +} + + + diff --git a/contrib/xml/pgxml.h b/contrib/xml/pgxml.h new file mode 100644 index 0000000000..848264c23d --- /dev/null +++ b/contrib/xml/pgxml.h @@ -0,0 +1,60 @@ +/* Header for pg xml parser interface */ + +static void *pgxml_palloc(size_t size); +static void *pgxml_repalloc(void *ptr, size_t size); +static void pgxml_pfree(void *ptr); +static void pgxml_mhs_init(); +static void pgxml_handler_init(); +Datum pgxml_parse(PG_FUNCTION_ARGS); +Datum pgxml_xpath(PG_FUNCTION_ARGS); +static void pgxml_starthandler(void *userData, const XML_Char *name, + const XML_Char **atts); +static void pgxml_endhandler(void *userData, const XML_Char *name); +static void pgxml_charhandler(void *userData, const XML_Char *s, int len); +static void pgxml_pathcompare(void *userData); +static void pgxml_finalisegrabbedtext(void *userData); + +#define MAXPATHLENGTH 512 +#define MAXRESULTS 100 + + +typedef struct { + int rescount; + char *results[MAXRESULTS]; + int32 reslens[MAXRESULTS]; + char *resbuf; /* pointer to the result buffer for pfree */ +} XPath_Results; + + + +typedef struct { + char currentpath[MAXPATHLENGTH]; + char *path; + int textgrab; + char *resptr; + int32 reslen; + XPath_Results *xpres; +} pgxml_udata; + + +#define UD ((pgxml_udata *) userData) + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/xml/pgxml.source b/contrib/xml/pgxml.source new file mode 100644 index 0000000000..6f425077c1 --- /dev/null +++ b/contrib/xml/pgxml.source @@ -0,0 +1,7 @@ +--SQL for XML parser + +CREATE FUNCTION pgxml_parse(text) RETURNS bool + AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict); + +CREATE FUNCTION pgxml_xpath(text,text,int) RETURNS text + AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 
'c' WITH (isStrict); \ No newline at end of file diff --git a/contrib/xml/xpath-yacc b/contrib/xml/xpath-yacc new file mode 100644 index 0000000000..0732bbba93 --- /dev/null +++ b/contrib/xml/xpath-yacc @@ -0,0 +1,178 @@ + +%token QNAME +%token NAME_COLON_STAR +%token DOT +%token DOT_DOT +%token AT +%token AXIS_NAME +%token FUNCTION_NAME +%token COMMENT +%token PI +%token TEXT +%token NODE +%token STAR +%token LPAR +%token RPAR +%token LSQB +%token RSQB +%token LITERAL +%token NUMBER +%token COLON_COLON +%token DOLLAR_QNAME +%token SLASH +%token SLASH_SLASH +%token VBAR +%token COMMA +%token PLUS +%token MINUS +%token EQUALS +%token GT +%token LT +%token GTE +%token LTE +%token MULTIPLY +%token AND +%token OR +%token MOD +%token DIV +%token QUO + +%% + +expr : + or_expr + ; + +or_expr : + and_expr + | or_expr OR and_expr + ; + +and_expr : + equality_expr + | and_expr AND equality_expr + ; + +equality_expr : + relational_expr + | equality_expr EQUALS relational_expr + ; + +relational_expr : + additive_expr + | relational_expr LT additive_expr + | relational_expr GT additive_expr + | relational_expr LTE additive_expr + | relational_expr GTE additive_expr + ; + +additive_expr : + multiplicative_expr + | additive_expr PLUS multiplicative_expr + | additive_expr MINUS multiplicative_expr + ; + +multiplicative_expr : + unary_expr + | multiplicative_expr MULTIPLY unary_expr + | multiplicative_expr DIV unary_expr + | multiplicative_expr MOD unary_expr + ; + +unary_expr : + union_expr + | '-' unary_expr + ; + +union_expr : + path_expr + | union_expr VBAR path_expr + ; + +path_expr : + location_path + | primary_expr predicates segment + ; + +segment : + /* empty */ + | SLASH relative_location_path + | SLASH_SLASH relative_location_path + ; + +location_path : + relative_location_path + | absolute_location_path + ; + +absolute_location_path : + SLASH + | SLASH relative_location_path + | SLASH_SLASH relative_location_path + ; + +relative_location_path : + step + | 
relative_location_path SLASH step + | relative_location_path SLASH_SLASH step + ; + +step : + axis node_test predicates + | DOT + | DOT_DOT + ; + +axis: + /* empty */ + | AXIS_NAME COLON_COLON + | AT + ; + +predicates : + /* empty */ + | predicates LSQB expr RSQB + ; + +primary_expr : + DOLLAR_QNAME + | LPAR expr RPAR + | LITERAL + | NUMBER + | function_call + ; + +function_call : + FUNCTION_NAME LPAR opt_args RPAR + ; + +opt_args : + /* empty */ + | args + ; + +args : + expr + | args COMMA expr + ; + +node_test : + QNAME + | STAR + | NAME_COLON_STAR + | PI LPAR opt_literal RPAR + | COMMENT LPAR RPAR + | TEXT LPAR RPAR + | NODE LPAR RPAR + ; + +opt_literal : + /* empty */ + | LITERAL + ; + + + + + + -- 2.11.0