2 * Copyright (C) 2005-2008 Atsushi Konno All rights reserved.
3 * Copyright (C) 2005 QSDN,Inc. All rights reserved.
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
21 #include "chxj_apache.h"
22 #include "qs_parse_string.h"
23 #include "qs_parse_tag.h"
/* Number of newlines after which the encoding pre-read in qs_parse_string() stops looking for an <?xml ...> declaration. */
30 #define NL_COUNT_MAX (10)
/* One entry of a node stack, linked through list_insert()/list_remove(). */
/* NOTE(review): the closing line of this typedef and at least one member are not visible in this view; */
/* code elsewhere in this file reads elem->node and uses the pointer type name NodeStackElement -- confirm. */
32 typedef struct node_stack_element {
/* Following element in the list. */
34 struct node_stack_element *next;
/* Address of the pointer that refers to this element: list_insert() copies it from the insertion point and list_remove() writes through it. */
35 struct node_stack_element **ref;
/* A stack of parsed nodes. */
/* NOTE(review): the closing line of this typedef is not visible here; the rest of the file uses the pointer type name NodeStack -- confirm. */
38 typedef struct node_stack {
/* Head element; NULL until the first qs_push_node(), which allocates it and links it to itself. */
39 NodeStackElement head;
/* Element that qs_pop_node() examines first; NULL or equal to head means there is nothing to pop. */
40 NodeStackElement tail;
/* Forward declarations of the file-local helpers. */
/* Scanners called on the unparsed remainder of the input; their result is used as a byte count by qs_parse_string(). */
43 static int s_cut_tag (const char *s, int len);
44 static int s_cut_text(const char *s, int len, int script);
/* Debug dump of a node subtree. */
45 static void qs_dump_node(Doc *doc, Node *node, int indent);
/* Node stack primitives. */
46 static void qs_push_node(Doc *doc, Node *node, NodeStack stack);
47 static Node *qs_pop_node(Doc *doc, NodeStack stack);
/* Stack dump, only declared when DUMP_NODE_STACK is defined. */
48 #ifdef DUMP_NODE_STACK
49 static void qs_dump_node_stack(Doc *doc, NodeStack stack);
51 static void qs_free_node_stack(Doc *doc, NodeStack stack);
/* Matches a closing tag name against the stack of open tags and reports mismatches. */
52 static void s_error_check(Doc *doc, const char *name, int line, NodeStack node_stack, NodeStack err_stack);
/*
 * Parses the markup in src (srclen bytes) into a tree of nodes below a
 * freshly initialised root node and returns doc->root_node.
 *
 * Visible phases:
 *   1. a pre-read that looks for an <?xml ...> declaration and maps its
 *      "encoding" attribute to "NONE", "EUC-JP" or "UTF-8";
 *   2. for any encoding other than "NONE", an iconv conversion to CP932;
 *   3. the main loop, which turns '<'-introduced spans into tag nodes and
 *      everything else into "text" nodes;
 *   4. a final pass that reports tags still left on the node stack.
 *
 * The root node is returned untouched when src is NULL or srclen <= 0.
 *
 * NOTE(review): many lines of this function (declarations, braces, else
 * branches, short statements) are not visible in this view; the comments
 * below describe only the statements that are.
 */
56 qs_parse_string(Doc *doc, const char *src, int srclen)
73 memset(encoding, 0, 256);
/* The root node also becomes the current parent for the first children. */
75 doc->now_parent_node = qs_init_root_node(doc);
76 if (! src || srclen <= 0) {
77 return doc->root_node;
/* The two stacks are taken either from the request pool or from the heap. */
/* NOTE(review): the condition selecting between the two is not visible; presumably doc->r != NULL as in qs_push_node() -- confirm. */
80 node_stack = apr_palloc(doc->r->pool, sizeof(struct node_stack));
81 memset(node_stack, 0, sizeof(struct node_stack));
82 err_stack = apr_palloc(doc->r->pool, sizeof(struct node_stack));
83 memset(err_stack, 0, sizeof(struct node_stack));
/* NOTE(review): no NULL check on the calloc() results is visible here -- confirm. */
86 node_stack = calloc(sizeof(struct node_stack), 1);
87 err_stack = calloc(sizeof(struct node_stack), 1);
91 * It is the pre reading.
92 * Because I want to specify encoding.
/* Pre-read: scan at most NL_COUNT_MAX newlines of input for the XML declaration. */
94 for (ii=0; ii<srclen; ii++) {
95 if (src[ii] == '\n') nl_cnt++;
96 if (nl_cnt >= NL_COUNT_MAX) break; /* not found <?xml ...> */
98 if (is_white_space(src[ii]))
101 if ((unsigned char)'<' == src[ii]) {
102 int endpoint = s_cut_tag(&src[ii], srclen - ii);
104 node = qs_parse_tag(doc, &src[ii], endpoint);
/* Stop at the first tag whose name does not start with '?'. */
107 if (node->name[0] != '?') break;
109 if (strcasecmp(node->name, "?xml") == 0) {
/* Walk the attributes until an encoding has been recorded. */
111 for(parse_attr = node->attr;
112 parse_attr && *encoding == '\0';
113 parse_attr = parse_attr->next) {
114 if (STRCASEEQ('e','E',"encoding",parse_attr->name)) {
115 switch (*parse_attr->value) {
/* The Shift_JIS spellings are recorded as "NONE". */
118 if (strcasecmp(parse_attr->value, "x-sjis" ) == 0) {
119 strcpy((char*)encoding, (char*)"NONE");
125 if ((strcasecmp(parse_attr->value, "Shift_JIS") == 0)
126 || (strcasecmp(parse_attr->value, "SJIS" ) == 0)
127 || (strcasecmp(parse_attr->value, "Shift-JIS") == 0)) {
128 strcpy((char*)encoding, (char*)"NONE");
134 if ((strcasecmp(parse_attr->value, "EUC_JP") == 0)
135 || (strcasecmp(parse_attr->value, "EUC-JP") == 0)
136 || (strcasecmp(parse_attr->value, "EUCJP" ) == 0)) {
137 strcpy((char*)encoding, "EUC-JP");
143 if ((strcasecmp(parse_attr->value, "UTF-8") == 0)
144 || (strcasecmp(parse_attr->value, "UTF8") == 0)) {
145 strcpy((char*)encoding, "UTF-8");
150 strcpy((char*)encoding, "NONE");
/* Convert only when an encoding other than "NONE" was recorded. */
161 if (strcasecmp(encoding, "NONE") != 0 && strlen(encoding) != 0) {
/* Output buffer: four bytes per input byte plus a terminator, zero-filled. */
163 olen = srclen * 4 + 1;
164 sv_osrc = osrc =(char *)apr_palloc(doc->pool, olen);
165 memset((char*)osrc, 0, olen);
/* Conversion target is CP932; the source encoding is the one recorded above. */
166 if ((cd = iconv_open("CP932", encoding)) != (iconv_t) -1) {
/* iconv() advances its input pointer, so it works on a zero-terminated copy of src. */
168 ibuf = apr_palloc(doc->pool, ilen+1);
169 memset(ibuf, 0, ilen+1);
170 memcpy(ibuf, src, ilen);
172 size_t result = iconv(cd, &ibuf, &ilen, &osrc, &olen);
173 if (result == (size_t)(-1)) {
/* NOTE(review): how the converted buffer replaces src/srclen, and the iconv_close() call, are not visible here -- confirm. */
184 * Now, true parsing is done here.
/* Main loop over the input bytes. */
187 for (ii=0; ii<srclen; ii++) {
188 if (src[ii] == '\n') nl_cnt++;
/* Whitespace test that applies outside no-parse mode and outside a <pre> parent. */
/* NOTE(review): the statement guarded by this condition is not visible; presumably the byte is skipped -- confirm. */
189 if (doc->parse_mode != PARSE_MODE_NO_PARSE
190 && is_white_space(src[ii])
191 && (doc->now_parent_node == NULL || !STRCASEEQ('p','P',"pre",doc->now_parent_node->name))) {
/* A '<' starts a tag: measure it with s_cut_tag() and hand it to qs_parse_tag(). */
194 if ((unsigned char)'<' == src[ii]) {
195 int endpoint = s_cut_tag(&src[ii], srclen - ii);
197 node = qs_parse_tag(doc, &src[ii], endpoint);
/* Closing tag. */
201 if (node->name[0] == '/' ) {
202 if (doc->parse_mode == PARSE_MODE_CHTML) {
/* Closing a container tag moves the current parent one level up, but never above the root. */
203 if (has_child(&(node->name[1]))) {
204 if (doc->now_parent_node->parent != NULL) {
205 doc->now_parent_node = doc->now_parent_node->parent;
206 doc->parse_mode = PARSE_MODE_CHTML;
208 if (STRCASEEQ('s','S',"script",&node->name[1])) {
/* Check the closing name against the stack of open tags. */
211 s_error_check(doc, &node->name[1], node->line, node_stack, err_stack);
/* In no-parse mode only </chxj:if> and </plaintext> are acted upon; they switch back to CHTML parsing. */
219 if (doc->parse_mode == PARSE_MODE_NO_PARSE) {
220 if (STRCASEEQ('c','C',"chxj:if",&node->name[1]) || STRCASEEQ('p','P',"plaintext",&node->name[1])) {
221 if (doc->now_parent_node->parent != NULL) {
222 doc->now_parent_node = doc->now_parent_node->parent;
223 doc->parse_mode = PARSE_MODE_CHTML;
224 s_error_check(doc, &node->name[1], node->line, node_stack, err_stack);
229 if (doc->parse_mode != PARSE_MODE_NO_PARSE) {
/* Names beginning with "!--" are comments. */
/* NOTE(review): the statements guarded by this test are not visible -- confirm how comments are handled. */
233 if (*node->name == '!' && strncmp(node->name, "!--", 3) == 0) {
/* Attach the tag below the current parent. */
237 qs_add_child_node(doc,node);
/* Container tags (outside no-parse mode) and <plaintext> are remembered on the stack of open tags. */
238 if ((has_child(node->name) && doc->parse_mode != PARSE_MODE_NO_PARSE) || STRCASEEQ('p','P',"plaintext",node->name)) {
239 qs_push_node(doc, node, node_stack);
242 if (doc->parse_mode == PARSE_MODE_NO_PARSE) {
243 if (node->name[0] == '/')
/* <chxj:if> switches to no-parse mode and becomes the parent; an attribute parse="true" switches back to CHTML parsing. */
247 if (doc->parse_mode == PARSE_MODE_CHTML && STRCASEEQ('c','C',"chxj:if", node->name)) {
249 doc->parse_mode = PARSE_MODE_NO_PARSE;
250 doc->now_parent_node = node;
251 for(parse_attr = node->attr;
253 parse_attr = parse_attr->next) {
254 if (STRCASEEQ('p','P',"parse",parse_attr->name)) {
255 if (STRCASEEQ('t','T',"true",parse_attr->value)) {
256 doc->parse_mode = PARSE_MODE_CHTML;
/* <plaintext> always switches to no-parse mode. */
261 else if (doc->parse_mode == PARSE_MODE_CHTML && STRCASEEQ('p','P',"plaintext",node->name)) {
262 doc->parse_mode = PARSE_MODE_NO_PARSE;
263 doc->now_parent_node = node;
/* An opening container tag becomes the parent of what follows. */
266 if (doc->parse_mode == PARSE_MODE_CHTML && has_child(node->name)) {
267 doc->now_parent_node = node;
269 if (STRCASEEQ('s','S',"script", node->name)) {
/* A self-closed container tag undoes the descent immediately and is checked like a closing tag. */
272 if (doc->parse_mode == PARSE_MODE_CHTML && node->closed_by_itself) {
273 if (has_child(node->name)) {
274 if (doc->now_parent_node->parent != NULL) {
275 doc->now_parent_node = doc->now_parent_node->parent;
276 doc->parse_mode = PARSE_MODE_CHTML;
278 if (STRCASEEQ('s','S',"script",node->name)) {
281 s_error_check(doc, node->name, node->line, node_stack, err_stack);
/* Text: build a node named "text" whose value and otext both hold a zero-terminated copy of the endpoint bytes. */
291 int endpoint = s_cut_text(&src[ii], srclen - ii, script_flag);
292 Node *node = qs_new_tag(doc);
293 node->value = (char*)apr_palloc(doc->pool,endpoint+1);
294 node->name = (char*)apr_palloc(doc->pool,4+1);
295 node->otext = (char*)apr_palloc(doc->pool,endpoint+1);
296 node->size = endpoint;
298 memset(node->value, 0, endpoint+1);
299 memset(node->otext, 0, endpoint+1);
300 memset(node->name, 0, 4+1 );
301 memcpy(node->value, &src[ii], endpoint);
302 memcpy(node->name, "text", 4);
303 memcpy(node->otext,node->value, endpoint);
305 qs_add_child_node(doc,node);
/* One less than the consumed length, because the loop header adds the final increment. */
306 ii += (endpoint - 1);
310 QX_LOGGER_DEBUG("parse_string end");
313 if (doc->r != NULL) {
314 qs_dump_node(doc, doc->root_node, 0);
317 #ifdef DUMP_NODE_STACK
318 qs_dump_node_stack(doc, node_stack);
/* Whatever is still on the stack was never closed; container tags among them are reported. */
322 for (prevNode = qs_pop_node(doc,node_stack);
324 prevNode = qs_pop_node(doc, node_stack)) {
325 if (has_child(prevNode->name)) {
327 ERR(doc->r, "tag parse error (perhaps, not close). tag_name:[%s] line:[%d]", prevNode->name, prevNode->line);
329 fprintf(stderr, "error :tag parse error (perhaps, not close). tag_name:[%s] line:[%d]\n", prevNode->name, prevNode->line);
/* Release both stacks and clear the local pointers. */
333 qs_free_node_stack(doc, node_stack); node_stack = NULL;
334 qs_free_node_stack(doc, err_stack); err_stack = NULL;
335 return doc->root_node;
/*
 * Checks a closing (or self-closed) tag called name, found on source line
 * line, against the stack of currently open tags.
 *
 * node_stack holds the open tags; err_stack is scratch space for entries
 * popped while searching for a match.
 *
 * NOTE(review): several lines (declarations, loop conditions, else branches)
 * are not visible in this view; the branch structure described below is
 * inferred from the visible statements only -- confirm.
 */
340 s_error_check(Doc *doc, const char *name, int line, NodeStack node_stack, NodeStack err_stack)
/* Pop open tags; entries whose name differs from name (case-insensitively) are parked on err_stack. */
344 for (prevNode = qs_pop_node(doc,node_stack);
346 prevNode = qs_pop_node(doc, node_stack)) {
347 if (prevNode && strcasecmp(prevNode->name, name) != 0) {
348 qs_push_node(doc, prevNode, err_stack);
355 Node *tmpNode = qs_pop_node(doc,node_stack);
/* Report a probable misspelling of the closing tag when nothing is left on node_stack and err is not 1. */
356 if (tmpNode == NULL && err != 1) {
/* The same message goes through ERR() and through stderr. */
/* NOTE(review): the condition choosing between the two is not visible -- confirm. */
358 ERR(doc->r, "tag parse error (perhaps, miss spell). tag_name:[%s] line:[%d]", name, line);
360 fprintf(stderr, "error :tag parse error (perhaps, miss spell). tag_name:[%s] line:[%d]\n", name, line);
/* Put the parked entries back on node_stack. */
361 for (prevNode = qs_pop_node(doc,err_stack);
363 prevNode = qs_pop_node(doc, err_stack)) {
364 qs_push_node(doc, prevNode, node_stack);
/* Otherwise every parked entry is reported as a tag that was probably never closed. */
368 for (prevNode = qs_pop_node(doc,err_stack);
370 prevNode = qs_pop_node(doc, err_stack)) {
372 ERR(doc->r, "tag parse error (perhaps, not close). tag_name:[%s] line:[%d]", prevNode->name, prevNode->line);
374 fprintf(stderr, "error :tag parse error (perhaps, not close). tag_name:[%s] line:[%d]\n", prevNode->name, prevNode->line);
/* The extra element popped above goes back onto node_stack. */
376 qs_push_node(doc, tmpNode, node_stack);
384 qs_dump_node(Doc* doc, Node* node, int indent)
386 Node* p = (Node*)qs_get_child_node(doc,node);
388 for (;p;p = (Node*)qs_get_next_node(doc,p)) {
390 if ((char*)qs_get_node_value(doc,p) != NULL) {
391 DBG(doc->r,"%*.*sNode:[%s][%s]\n", indent,indent," ",
392 (char*)qs_get_node_name(doc,p),
393 (char*)qs_get_node_value(doc,p));
396 DBG(doc->r,"%*.*sNode:[%s]\n", indent,indent," ", qs_get_node_name(doc,p));
398 for (attr = (Attr*)qs_get_attr(doc,p); attr; attr = (Attr*)qs_get_next_attr(doc,attr)) {
399 DBG(doc->r,"%*.*s ATTR:[%s]\n", indent,indent," ", (char *)qs_get_attr_name(doc,attr));
400 DBG(doc->r,"%*.*s VAL :[%s]\n", indent,indent," ", (char *)qs_get_attr_value(doc,attr));
402 qs_dump_node(doc,p, indent+4);
/*
 * Scans the tag that starts at s, looking at no more than len bytes.
 * qs_parse_string() passes the result to qs_parse_tag() as the tag length.
 *
 * NOTE(review): the declarations, the bodies of most branches and the return
 * statement are not visible in this view; the exact value returned (and
 * whether it includes the terminator) must be confirmed against the full file.
 */
409 s_cut_tag(const char* s, int len)
/* Comments and CDATA sections are recognised by their opening sequence and end only at their own terminator. */
/* NOTE(review): these comparisons read up to 4 and 9 bytes without consulting len -- confirm the input is always zero-terminated. */
416 if (strncmp("<!--", s, 4) == 0) {
419 else if (strncasecmp("<![CDATA[", s, sizeof("<![CDATA[")-1) == 0) {
423 for (ii=0;ii<len; ii++) {
/* Shift_JIS kanji and kana bytes and whitespace get their own handling before the terminator tests. */
/* NOTE(review): the guarded statements are not visible; presumably they step over the byte(s) -- confirm. */
424 if (is_sjis_kanji(s[ii])) {
428 if (is_sjis_kana(s[ii]))
431 if (is_white_space(s[ii]))
/* Inside a comment only "-->" is a terminator. */
438 if (comment && s[ii] == '-') {
439 if (strncmp(&s[ii], "-->", 3) == 0) {
/* Inside CDATA only "]]>" is a terminator. */
444 if (cdata && s[ii] == ']') {
445 if (strncmp(&s[ii], "]]>", 3) == 0) {
/* An ordinary tag ends at the first '>'. */
451 if (!cdata && !comment && s[ii] == '>') {
/*
 * Scans the text run that starts at s, looking at no more than len bytes.
 * qs_parse_string() uses the result as the number of bytes copied into a
 * "text" node and passes its script flag as script.
 *
 * NOTE(review): the declarations, most branch bodies, every use of script
 * and the return statement are not visible in this view -- confirm against
 * the full file.
 */
462 s_cut_text(const char* s, int len, int script)
468 for (ii=0;ii<len; ii++) {
/* Shift_JIS kanji and kana bytes and whitespace get their own handling first. */
/* NOTE(review): the guarded statements are not visible; presumably they step over the byte(s) -- confirm. */
469 if (is_sjis_kanji(s[ii])) {
473 if (is_sjis_kana(s[ii]))
476 if (is_white_space(s[ii]))
/* A single quote is only significant outside a double-quoted run, and vice versa. */
485 if (s[ii] == '\'' && !dq) {
490 if (s[ii] == '"' && !sq) {
/* A '<' outside any quoted run is tested last. */
/* NOTE(review): the guarded statement is not visible; presumably the scan ends here -- confirm. */
496 if (!sq && !dq && s[ii] == '<')
506 qs_init_root_node(Doc *doc)
508 doc->root_node = (Node*)apr_palloc(doc->pool,sizeof(struct Node));
509 if (doc->root_node == NULL) {
510 QX_LOGGER_FATAL("Out Of Memory");
513 doc->root_node->next = NULL;
514 doc->root_node->parent = NULL;
515 doc->root_node->child = NULL;
516 doc->root_node->attr = NULL;
518 doc->root_node->name = (char*)apr_palloc(doc->pool, 5);
519 if (doc->root_node->name == NULL) {
520 QX_LOGGER_FATAL("Out Of Memory");
522 memset(doc->root_node->name, 0, 5);
523 strcpy(doc->root_node->name, "ROOT");
525 return doc->root_node;
/*
 * Appends node as the last child of doc->now_parent_node.
 *
 * The parent's child_tail pointer is used, so the existing sibling list is
 * not walked. doc->now_parent_node must not be NULL.
 *
 * NOTE(review): two statements at the start of the body are not visible in
 * this view; presumably they reset further links of node -- confirm.
 */
531 qs_add_child_node(Doc *doc,Node *node)
/* The new node starts without children of its own and points back at its parent. */
535 node->child_tail = NULL;
536 node->parent = doc->now_parent_node;
/* First child: it is both the head and the tail of the child list. */
537 if (doc->now_parent_node->child == NULL) {
538 doc->now_parent_node->child = node;
539 doc->now_parent_node->child_tail = node;
543 QX_LOGGER_DEBUG("search child free node");
/* Otherwise link it behind the current tail and make it the new tail. */
545 doc->now_parent_node->child_tail->next = node;
546 doc->now_parent_node->child_tail = node;
554 qs_get_root(Doc *doc) {
555 return doc->root_node;
/* Accessor for the value of node; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() treats the result as a char* that may be NULL -- confirm. */
562 qs_get_node_value(Doc *UNUSED(doc), Node *node) {
/* Accessor for the name of node; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() prints the result with %s -- confirm it is never NULL. */
570 qs_get_node_name(Doc *UNUSED(doc), Node *node) {
/* Accessor for the first child of node; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() starts its walk from the result and stops on NULL -- confirm. */
577 qs_get_child_node(Doc *UNUSED(doc), Node *node) {
/* Accessor for the node that follows node; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() uses the result to advance its walk and stops on NULL -- confirm. */
585 qs_get_next_node(Doc* UNUSED(doc), Node *node) {
/* Accessor for the first attribute of node; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() starts its attribute walk from the result and stops on NULL -- confirm. */
592 qs_get_attr(Doc *UNUSED(doc), Node *node) {
/* Accessor for the attribute that follows attr; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() uses the result to advance its attribute walk and stops on NULL -- confirm. */
600 qs_get_next_attr(Doc *UNUSED(doc), Attr *attr) {
/* Accessor for the name of attr; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() prints the result with %s -- confirm it is never NULL. */
607 qs_get_attr_name(Doc *UNUSED(doc), Attr *attr) {
/* Accessor for the value of attr; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_dump_node() prints the result with %s -- confirm it is never NULL. */
614 qs_get_attr_value(Doc *UNUSED(doc), Attr *attr) {
/* Accessor for the size of node; the doc parameter is declared through UNUSED(). */
/* NOTE(review): the body is not visible in this view. qs_parse_string() stores the byte length of a text run in node->size; presumably that is what is returned -- confirm. */
619 qs_get_node_size(Doc *UNUSED(doc), Node *node) {
/*
 * Links node into a list immediately before point.
 *
 * Both arguments are pointers to structures that have a `next` member
 * (pointer to the following element) and a `ref` member (address of the
 * pointer that refers to the element). With a head element that is linked
 * to itself, inserting before the head appends at the end of the list.
 *
 * The arguments are parenthesised so that pointer expressions such as
 * `array + 1` expand correctly. Each argument is evaluated more than once:
 * pass expressions without side effects.
 */
#define list_insert(node, point) do {           \
    (node)->ref = (point)->ref;                 \
    *(node)->ref = (node);                      \
    (node)->next = (point);                     \
    (point)->ref = &(node)->next;               \
} while (0)
/*
 * Unlinks node from the list it is in.
 *
 * The pointer that referred to node is redirected to node's successor, and
 * the successor's `ref` is updated to that pointer. node's own members are
 * left unchanged.
 *
 * The argument is parenthesised so that pointer expressions such as
 * `array + 1` expand correctly. It is evaluated more than once: pass an
 * expression without side effects.
 */
#define list_remove(node) do {                  \
    *(node)->ref = (node)->next;                \
    (node)->next->ref = (node)->ref;            \
} while (0)
638 qs_push_node(Doc* doc, Node *node, NodeStack stack)
640 NodeStackElement elem;
641 if (doc->r != NULL) {
642 elem = apr_palloc(doc->r->pool, sizeof(struct node_stack_element));
643 memset(elem, 0, sizeof(struct node_stack_element));
646 elem = malloc(sizeof(struct node_stack_element));
647 memset(elem, 0, sizeof(struct node_stack_element));
650 if (stack->head == NULL) {
652 if (doc->r != NULL) {
653 stack->head = apr_palloc(doc->r->pool, sizeof(struct node_stack_element));
654 memset(stack->head, 0, sizeof(struct node_stack_element));
657 stack->head = malloc(sizeof(struct node_stack_element));
658 memset(stack->head, 0, sizeof(struct node_stack_element));
660 stack->head->next = stack->head;
661 stack->head->ref = &stack->head->next;
663 list_insert(elem, stack->head);
667 #include "apr_ring.h"
/*
 * Takes the most recently pushed element off stack.
 *
 * Returns NULL when nothing has ever been pushed (stack->tail is NULL) or
 * when only the head element is left.
 *
 * NOTE(review): the statements between the two guards and the tail update,
 * and the return statement, are not visible in this view; presumably the
 * element is unlinked and its node returned -- confirm.
 */
670 qs_pop_node(Doc *doc, NodeStack stack)
672 NodeStackElement tail = stack->tail;
675 if (tail == NULL) return NULL;
676 if (tail == stack->head) return NULL;
/* head->ref holds the address of the `next` member of the last element; subtracting that member's offset gives the element itself, which becomes the new tail. */
681 stack->tail = (NodeStackElement)((apr_size_t)stack->head->ref - (apr_size_t)APR_OFFSETOF(struct node_stack_element, next));
#ifdef DUMP_NODE_STACK
/**
 * Logs the name of every node on stack, oldest entry first.
 *
 * With a request (doc->r set) the names go through DBG(); without one they
 * are written to stderr together with the element addresses.
 *
 * @param doc   document being parsed.
 * @param stack stack to dump; a NULL stack or a stack that has never been
 *              pushed to produces no output.
 */
static void
qs_dump_node_stack(Doc *doc, NodeStack stack)
{
  NodeStackElement elm;

  /* The head element only exists after the first qs_push_node(). */
  if (stack == NULL || stack->head == NULL) {
    return;
  }

  for (elm = stack->head->next; elm != stack->head; elm = elm->next) {
    if (doc->r) {
      DBG(doc->r, "name:[%s]", elm->node->name);
    }
    else {
      /* %p with void* is the only portable way to print an address. */
      fprintf(stderr, "[%p] name:[%s] next:[%p]\n", (void *)elm, elm->node->name, (void *)elm->next);
    }
  }
}
#endif
701 qs_free_node_stack(Doc *doc, NodeStack stack)
703 if (doc->r == NULL && stack != NULL) {
705 for (elm = qs_pop_node(doc, stack);elm; elm = qs_pop_node(doc,stack))