1 /* lexer.c -- Lexer for html parser
3 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
9 Given a file stream fp it returns a sequence of tokens.
11 GetToken(fp) gets the next token
12 UngetToken(fp) provides one level undo
14 The tags include an attribute list:
16 - linked list of attribute/value nodes
17 - each node has 2 NULL-terminated strings.
18 - entities are replaced in attribute values
20 white space is compacted if not in preformatted mode
21 If not in preformatted mode then leading white space
22 is discarded and subsequent white space sequences
23 compacted to single space characters.
25 If XmlTags is no then Tag names are folded to upper
26 case and attribute names to lower case.
29 - Doctype subset and marked sections
50 #if !defined(NDEBUG) && defined(_MSC_VER)
51 static Bool show_attrs = yes;
53 static char buffer[MX_TXT+8]; /* NOTE extra for '...'\0 tail */
54 static tmbstr get_text_string(Lexer* lexer, Node *node)
56 uint len = node->end - node->start;
57 tmbstr cp = lexer->lexbuf + node->start;
58 tmbstr end = lexer->lexbuf + node->end;
79 static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
81 Lexer* lexer = doc->lexer;
82 Bool lex = ((msg[0] == 'l')&&(msg[1] == 'e')) ? yes : no;
83 int line = ( doc->lexer ? doc->lexer->lines : 0 );
84 int col = ( doc->lexer ? doc->lexer->columns : 0 );
85 SPRTF("R=%d C=%d: ", line, col );
86 if (lexer && lexer->token && (lexer->token->type == TextNode)) {
88 uint len = node->end - node->start;
89 tmbstr cp = get_text_string( lexer, node );
90 SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len,
91 lex ? "lexer" : "stream");
93 SPRTF("Returning %s TextNode %p... %s\n", msg, node,
94 lex ? "lexer" : "stream");
99 tmbstr name = node->element ? node->element : "blank";
100 SPRTF("Returning %s node <%s", msg, name);
101 for (av = node->attributes; av; av = av->next) {
102 name = av->attribute;
106 SPRTF("=\"%s\"", av->value);
110 SPRTF("> %s\n", lex ? "lexer" : "stream");
112 SPRTF("Returning %s node %p <%s>... %s\n", msg, node,
113 node->element ? node->element : "blank",
114 lex ? "lexer" : "stream");
118 #define GTDBG(a,b,c) Show_Node(a,b,c)
123 /* Forward references
125 /* swallows closing '>' */
126 static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
128 static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
129 Node **asp, Node **php );
131 static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
132 Bool *isempty, int *pdelim );
134 static Node *ParseDocTypeDecl(TidyDocImpl* doc);
136 static void AddAttrToList( AttVal** list, AttVal* av );
138 /* used to classify characters for lexical purposes */
/* MAP(c) yields the class bits stored in lexmap for c, or 0 when c
** lies outside the 128-entry table. The argument is parenthesised
** so that an expression argument is cast as a whole rather than
** only its first operand. c is still evaluated twice, so it must
** not have side effects.
*/
139 #define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
140 static uint lexmap[128];
/* Attribute names and element names share one validity rule: both
** macros expand to the same IsValidXMLID check.
*/
142 #define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
143 #define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
145 static struct _doctypes
152 } const W3C_Doctypes[] =
154 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
155 { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
156 { 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
157 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
158 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
159 { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
160 { 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
161 { 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
162 { 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
163 { 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
164 { 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
165 { 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
166 { 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
167 { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
168 { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
169 { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
170 { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
172 { 20, HT50, "HTML5", NULL, NULL },
173 { 21, XH50, "XHTML5", NULL, NULL },
175 /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
177 { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
178 { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
181 { 0, 0, NULL, NULL, NULL }
184 int TY_(HTMLVersion)(TidyDocImpl* doc)
189 uint vers = doc->lexer->versions;
190 uint dtver = doc->lexer->doctype;
191 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
192 Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
193 !cfgBool(doc, TidyHtmlOut);
194 Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
196 if (xhtml && dtver == VERS_UNKNOWN) return XH50;
197 if (dtver == VERS_UNKNOWN) return HT50;
198 /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
199 if (!xhtml && (dtver == VERS_HTML5)) return HT50;
201 for (i = 0; W3C_Doctypes[i].name; ++i)
203 if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
204 (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
207 if (vers & W3C_Doctypes[i].vers &&
208 (W3C_Doctypes[i].score < score || !score))
210 score = W3C_Doctypes[i].score;
216 return W3C_Doctypes[j].vers;
221 static ctmbstr GetFPIFromVers(uint vers)
225 for (i = 0; W3C_Doctypes[i].name; ++i)
226 if (W3C_Doctypes[i].vers == vers)
227 return W3C_Doctypes[i].fpi;
232 static ctmbstr GetSIFromVers(uint vers)
236 for (i = 0; W3C_Doctypes[i].name; ++i)
237 if (W3C_Doctypes[i].vers == vers)
238 return W3C_Doctypes[i].si;
243 static ctmbstr GetNameFromVers(uint vers)
247 for (i = 0; W3C_Doctypes[i].name; ++i)
248 if (W3C_Doctypes[i].vers == vers)
249 return W3C_Doctypes[i].name;
254 static uint GetVersFromFPI(ctmbstr fpi)
258 for (i = 0; W3C_Doctypes[i].name; ++i)
259 if (W3C_Doctypes[i].fpi != NULL && TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
260 return W3C_Doctypes[i].vers;
265 /* everything is allowed in proprietary version of HTML */
266 /* this is handled here rather than in the tag/attr dicts */
/* Restricts doc->lexer->versions to the bits present in vers.
** VERS_PROPRIETARY is OR-ed into the mask, so that bit is never
** cleared by this call.
*/
267 void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
269 doc->lexer->versions &= (vers | VERS_PROPRIETARY);
272 Bool TY_(IsWhite)(uint c)
276 return (map & white)!=0;
279 Bool TY_(IsNewline)(uint c)
282 return (map & newline)!=0;
285 Bool TY_(IsDigit)(uint c)
291 return (map & digit)!=0;
294 static Bool IsDigitHex(uint c)
300 return (map & digithex)!=0;
303 Bool TY_(IsLetter)(uint c)
309 return (map & letter)!=0;
/* Returns true when c is one of the five code points tested below:
** space (0x20), tab (0x09), line feed (0x0a), form feed (0x0c) or
** carriage return (0x0d). Unlike the other classifiers in this file
** it compares c directly and does not consult lexmap.
*/
312 Bool TY_(IsHTMLSpace)(uint c)
314 return c == 0x020 || c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d;
317 Bool TY_(IsNamechar)(uint c)
320 return (map & namechar)!=0;
323 Bool TY_(IsXMLLetter)(uint c)
325 return ((c >= 0x41 && c <= 0x5a) ||
326 (c >= 0x61 && c <= 0x7a) ||
327 (c >= 0xc0 && c <= 0xd6) ||
328 (c >= 0xd8 && c <= 0xf6) ||
329 (c >= 0xf8 && c <= 0xff) ||
330 (c >= 0x100 && c <= 0x131) ||
331 (c >= 0x134 && c <= 0x13e) ||
332 (c >= 0x141 && c <= 0x148) ||
333 (c >= 0x14a && c <= 0x17e) ||
334 (c >= 0x180 && c <= 0x1c3) ||
335 (c >= 0x1cd && c <= 0x1f0) ||
336 (c >= 0x1f4 && c <= 0x1f5) ||
337 (c >= 0x1fa && c <= 0x217) ||
338 (c >= 0x250 && c <= 0x2a8) ||
339 (c >= 0x2bb && c <= 0x2c1) ||
341 (c >= 0x388 && c <= 0x38a) ||
343 (c >= 0x38e && c <= 0x3a1) ||
344 (c >= 0x3a3 && c <= 0x3ce) ||
345 (c >= 0x3d0 && c <= 0x3d6) ||
350 (c >= 0x3e2 && c <= 0x3f3) ||
351 (c >= 0x401 && c <= 0x40c) ||
352 (c >= 0x40e && c <= 0x44f) ||
353 (c >= 0x451 && c <= 0x45c) ||
354 (c >= 0x45e && c <= 0x481) ||
355 (c >= 0x490 && c <= 0x4c4) ||
356 (c >= 0x4c7 && c <= 0x4c8) ||
357 (c >= 0x4cb && c <= 0x4cc) ||
358 (c >= 0x4d0 && c <= 0x4eb) ||
359 (c >= 0x4ee && c <= 0x4f5) ||
360 (c >= 0x4f8 && c <= 0x4f9) ||
361 (c >= 0x531 && c <= 0x556) ||
363 (c >= 0x561 && c <= 0x586) ||
364 (c >= 0x5d0 && c <= 0x5ea) ||
365 (c >= 0x5f0 && c <= 0x5f2) ||
366 (c >= 0x621 && c <= 0x63a) ||
367 (c >= 0x641 && c <= 0x64a) ||
368 (c >= 0x671 && c <= 0x6b7) ||
369 (c >= 0x6ba && c <= 0x6be) ||
370 (c >= 0x6c0 && c <= 0x6ce) ||
371 (c >= 0x6d0 && c <= 0x6d3) ||
373 (c >= 0x6e5 && c <= 0x6e6) ||
374 (c >= 0x905 && c <= 0x939) ||
376 (c >= 0x958 && c <= 0x961) ||
377 (c >= 0x985 && c <= 0x98c) ||
378 (c >= 0x98f && c <= 0x990) ||
379 (c >= 0x993 && c <= 0x9a8) ||
380 (c >= 0x9aa && c <= 0x9b0) ||
382 (c >= 0x9b6 && c <= 0x9b9) ||
383 (c >= 0x9dc && c <= 0x9dd) ||
384 (c >= 0x9df && c <= 0x9e1) ||
385 (c >= 0x9f0 && c <= 0x9f1) ||
386 (c >= 0xa05 && c <= 0xa0a) ||
387 (c >= 0xa0f && c <= 0xa10) ||
388 (c >= 0xa13 && c <= 0xa28) ||
389 (c >= 0xa2a && c <= 0xa30) ||
390 (c >= 0xa32 && c <= 0xa33) ||
391 (c >= 0xa35 && c <= 0xa36) ||
392 (c >= 0xa38 && c <= 0xa39) ||
393 (c >= 0xa59 && c <= 0xa5c) ||
395 (c >= 0xa72 && c <= 0xa74) ||
396 (c >= 0xa85 && c <= 0xa8b) ||
398 (c >= 0xa8f && c <= 0xa91) ||
399 (c >= 0xa93 && c <= 0xaa8) ||
400 (c >= 0xaaa && c <= 0xab0) ||
401 (c >= 0xab2 && c <= 0xab3) ||
402 (c >= 0xab5 && c <= 0xab9) ||
405 (c >= 0xb05 && c <= 0xb0c) ||
406 (c >= 0xb0f && c <= 0xb10) ||
407 (c >= 0xb13 && c <= 0xb28) ||
408 (c >= 0xb2a && c <= 0xb30) ||
409 (c >= 0xb32 && c <= 0xb33) ||
410 (c >= 0xb36 && c <= 0xb39) ||
412 (c >= 0xb5c && c <= 0xb5d) ||
413 (c >= 0xb5f && c <= 0xb61) ||
414 (c >= 0xb85 && c <= 0xb8a) ||
415 (c >= 0xb8e && c <= 0xb90) ||
416 (c >= 0xb92 && c <= 0xb95) ||
417 (c >= 0xb99 && c <= 0xb9a) ||
419 (c >= 0xb9e && c <= 0xb9f) ||
420 (c >= 0xba3 && c <= 0xba4) ||
421 (c >= 0xba8 && c <= 0xbaa) ||
422 (c >= 0xbae && c <= 0xbb5) ||
423 (c >= 0xbb7 && c <= 0xbb9) ||
424 (c >= 0xc05 && c <= 0xc0c) ||
425 (c >= 0xc0e && c <= 0xc10) ||
426 (c >= 0xc12 && c <= 0xc28) ||
427 (c >= 0xc2a && c <= 0xc33) ||
428 (c >= 0xc35 && c <= 0xc39) ||
429 (c >= 0xc60 && c <= 0xc61) ||
430 (c >= 0xc85 && c <= 0xc8c) ||
431 (c >= 0xc8e && c <= 0xc90) ||
432 (c >= 0xc92 && c <= 0xca8) ||
433 (c >= 0xcaa && c <= 0xcb3) ||
434 (c >= 0xcb5 && c <= 0xcb9) ||
436 (c >= 0xce0 && c <= 0xce1) ||
437 (c >= 0xd05 && c <= 0xd0c) ||
438 (c >= 0xd0e && c <= 0xd10) ||
439 (c >= 0xd12 && c <= 0xd28) ||
440 (c >= 0xd2a && c <= 0xd39) ||
441 (c >= 0xd60 && c <= 0xd61) ||
442 (c >= 0xe01 && c <= 0xe2e) ||
444 (c >= 0xe32 && c <= 0xe33) ||
445 (c >= 0xe40 && c <= 0xe45) ||
446 (c >= 0xe81 && c <= 0xe82) ||
448 (c >= 0xe87 && c <= 0xe88) ||
451 (c >= 0xe94 && c <= 0xe97) ||
452 (c >= 0xe99 && c <= 0xe9f) ||
453 (c >= 0xea1 && c <= 0xea3) ||
456 (c >= 0xeaa && c <= 0xeab) ||
457 (c >= 0xead && c <= 0xeae) ||
459 (c >= 0xeb2 && c <= 0xeb3) ||
461 (c >= 0xec0 && c <= 0xec4) ||
462 (c >= 0xf40 && c <= 0xf47) ||
463 (c >= 0xf49 && c <= 0xf69) ||
464 (c >= 0x10a0 && c <= 0x10c5) ||
465 (c >= 0x10d0 && c <= 0x10f6) ||
467 (c >= 0x1102 && c <= 0x1103) ||
468 (c >= 0x1105 && c <= 0x1107) ||
470 (c >= 0x110b && c <= 0x110c) ||
471 (c >= 0x110e && c <= 0x1112) ||
478 (c >= 0x1154 && c <= 0x1155) ||
480 (c >= 0x115f && c <= 0x1161) ||
485 (c >= 0x116d && c <= 0x116e) ||
486 (c >= 0x1172 && c <= 0x1173) ||
491 (c >= 0x11ae && c <= 0x11af) ||
492 (c >= 0x11b7 && c <= 0x11b8) ||
494 (c >= 0x11bc && c <= 0x11c2) ||
498 (c >= 0x1e00 && c <= 0x1e9b) ||
499 (c >= 0x1ea0 && c <= 0x1ef9) ||
500 (c >= 0x1f00 && c <= 0x1f15) ||
501 (c >= 0x1f18 && c <= 0x1f1d) ||
502 (c >= 0x1f20 && c <= 0x1f45) ||
503 (c >= 0x1f48 && c <= 0x1f4d) ||
504 (c >= 0x1f50 && c <= 0x1f57) ||
508 (c >= 0x1f5f && c <= 0x1f7d) ||
509 (c >= 0x1f80 && c <= 0x1fb4) ||
510 (c >= 0x1fb6 && c <= 0x1fbc) ||
512 (c >= 0x1fc2 && c <= 0x1fc4) ||
513 (c >= 0x1fc6 && c <= 0x1fcc) ||
514 (c >= 0x1fd0 && c <= 0x1fd3) ||
515 (c >= 0x1fd6 && c <= 0x1fdb) ||
516 (c >= 0x1fe0 && c <= 0x1fec) ||
517 (c >= 0x1ff2 && c <= 0x1ff4) ||
518 (c >= 0x1ff6 && c <= 0x1ffc) ||
520 (c >= 0x212a && c <= 0x212b) ||
522 (c >= 0x2180 && c <= 0x2182) ||
523 (c >= 0x3041 && c <= 0x3094) ||
524 (c >= 0x30a1 && c <= 0x30fa) ||
525 (c >= 0x3105 && c <= 0x312c) ||
526 (c >= 0xac00 && c <= 0xd7a3) ||
527 (c >= 0x4e00 && c <= 0x9fa5) ||
529 (c >= 0x3021 && c <= 0x3029) ||
530 (c >= 0x4e00 && c <= 0x9fa5) ||
532 (c >= 0x3021 && c <= 0x3029));
535 Bool TY_(IsXMLNamechar)(uint c)
537 return (TY_(IsXMLLetter)(c) ||
538 c == '.' || c == '_' ||
539 c == ':' || c == '-' ||
540 (c >= 0x300 && c <= 0x345) ||
541 (c >= 0x360 && c <= 0x361) ||
542 (c >= 0x483 && c <= 0x486) ||
543 (c >= 0x591 && c <= 0x5a1) ||
544 (c >= 0x5a3 && c <= 0x5b9) ||
545 (c >= 0x5bb && c <= 0x5bd) ||
547 (c >= 0x5c1 && c <= 0x5c2) ||
549 (c >= 0x64b && c <= 0x652) ||
551 (c >= 0x6d6 && c <= 0x6dc) ||
552 (c >= 0x6dd && c <= 0x6df) ||
553 (c >= 0x6e0 && c <= 0x6e4) ||
554 (c >= 0x6e7 && c <= 0x6e8) ||
555 (c >= 0x6ea && c <= 0x6ed) ||
556 (c >= 0x901 && c <= 0x903) ||
558 (c >= 0x93e && c <= 0x94c) ||
560 (c >= 0x951 && c <= 0x954) ||
561 (c >= 0x962 && c <= 0x963) ||
562 (c >= 0x981 && c <= 0x983) ||
566 (c >= 0x9c0 && c <= 0x9c4) ||
567 (c >= 0x9c7 && c <= 0x9c8) ||
568 (c >= 0x9cb && c <= 0x9cd) ||
570 (c >= 0x9e2 && c <= 0x9e3) ||
575 (c >= 0xa40 && c <= 0xa42) ||
576 (c >= 0xa47 && c <= 0xa48) ||
577 (c >= 0xa4b && c <= 0xa4d) ||
578 (c >= 0xa70 && c <= 0xa71) ||
579 (c >= 0xa81 && c <= 0xa83) ||
581 (c >= 0xabe && c <= 0xac5) ||
582 (c >= 0xac7 && c <= 0xac9) ||
583 (c >= 0xacb && c <= 0xacd) ||
584 (c >= 0xb01 && c <= 0xb03) ||
586 (c >= 0xb3e && c <= 0xb43) ||
587 (c >= 0xb47 && c <= 0xb48) ||
588 (c >= 0xb4b && c <= 0xb4d) ||
589 (c >= 0xb56 && c <= 0xb57) ||
590 (c >= 0xb82 && c <= 0xb83) ||
591 (c >= 0xbbe && c <= 0xbc2) ||
592 (c >= 0xbc6 && c <= 0xbc8) ||
593 (c >= 0xbca && c <= 0xbcd) ||
595 (c >= 0xc01 && c <= 0xc03) ||
596 (c >= 0xc3e && c <= 0xc44) ||
597 (c >= 0xc46 && c <= 0xc48) ||
598 (c >= 0xc4a && c <= 0xc4d) ||
599 (c >= 0xc55 && c <= 0xc56) ||
600 (c >= 0xc82 && c <= 0xc83) ||
601 (c >= 0xcbe && c <= 0xcc4) ||
602 (c >= 0xcc6 && c <= 0xcc8) ||
603 (c >= 0xcca && c <= 0xccd) ||
604 (c >= 0xcd5 && c <= 0xcd6) ||
605 (c >= 0xd02 && c <= 0xd03) ||
606 (c >= 0xd3e && c <= 0xd43) ||
607 (c >= 0xd46 && c <= 0xd48) ||
608 (c >= 0xd4a && c <= 0xd4d) ||
611 (c >= 0xe34 && c <= 0xe3a) ||
612 (c >= 0xe47 && c <= 0xe4e) ||
614 (c >= 0xeb4 && c <= 0xeb9) ||
615 (c >= 0xebb && c <= 0xebc) ||
616 (c >= 0xec8 && c <= 0xecd) ||
617 (c >= 0xf18 && c <= 0xf19) ||
623 (c >= 0xf71 && c <= 0xf84) ||
624 (c >= 0xf86 && c <= 0xf8b) ||
625 (c >= 0xf90 && c <= 0xf95) ||
627 (c >= 0xf99 && c <= 0xfad) ||
628 (c >= 0xfb1 && c <= 0xfb7) ||
630 (c >= 0x20d0 && c <= 0x20dc) ||
632 (c >= 0x302a && c <= 0x302f) ||
635 (c >= 0x30 && c <= 0x39) ||
636 (c >= 0x660 && c <= 0x669) ||
637 (c >= 0x6f0 && c <= 0x6f9) ||
638 (c >= 0x966 && c <= 0x96f) ||
639 (c >= 0x9e6 && c <= 0x9ef) ||
640 (c >= 0xa66 && c <= 0xa6f) ||
641 (c >= 0xae6 && c <= 0xaef) ||
642 (c >= 0xb66 && c <= 0xb6f) ||
643 (c >= 0xbe7 && c <= 0xbef) ||
644 (c >= 0xc66 && c <= 0xc6f) ||
645 (c >= 0xce6 && c <= 0xcef) ||
646 (c >= 0xd66 && c <= 0xd6f) ||
647 (c >= 0xe50 && c <= 0xe59) ||
648 (c >= 0xed0 && c <= 0xed9) ||
649 (c >= 0xf20 && c <= 0xf29) ||
658 (c >= 0x3031 && c <= 0x3035) ||
659 (c >= 0x309d && c <= 0x309e) ||
660 (c >= 0x30fc && c <= 0x30fe));
668 return (map & lowercase)!=0;
672 Bool TY_(IsUpper)(uint c)
676 return (map & uppercase)!=0;
679 uint TY_(ToLower)(uint c)
689 uint TY_(ToUpper)(uint c)
694 c += (uint) ('A' - 'a' );
700 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
702 if ( !cfgBool(doc, TidyXmlTags) )
706 c = (tmbchar) ToUpper(c);
708 else /* force to lower case */
710 c = (tmbchar) ToLower(c);
718 return last character in string
719 this is useful when trailing quotemark
720 is missing on an attribute
722 static tmbchar LastChar( tmbstr str )
726 int n = TY_(tmbstrlen)(str);
733 node->type is one of these:
738 #define StartEndTag 4
741 Lexer* TY_(NewLexer)( TidyDocImpl* doc )
743 Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
747 TidyClearMemory( lexer, sizeof(Lexer) );
749 lexer->allocator = doc->allocator;
752 lexer->state = LEX_CONTENT;
754 lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
755 lexer->doctype = VERS_UNKNOWN;
756 lexer->root = &doc->root;
/* Tells whether the document input is exhausted: true only when
** docIn->pushed is zero (presumably meaning no pushed-back
** characters are waiting - confirm against the stream code) and
** IsEOF reports end of file. doc->docIn must be non-NULL; this is
** asserted.
*/
761 static Bool EndOfInput( TidyDocImpl* doc )
763 assert( doc->docIn != NULL );
764 return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) );
767 void TY_(FreeLexer)( TidyDocImpl* doc )
769 Lexer *lexer = doc->lexer;
772 TY_(FreeStyles)( doc );
775 if ( lexer->pushed || lexer->itoken )
778 TY_(FreeNode)( doc, lexer->itoken );
779 TY_(FreeNode)( doc, lexer->token );
782 while ( lexer->istacksize > 0 )
783 TY_(PopInline)( doc, NULL );
785 TidyDocFree( doc, lexer->istack );
786 TidyDocFree( doc, lexer->lexbuf );
787 TidyDocFree( doc, lexer );
792 /* Lexer uses bigger memory chunks than pprint as
793 ** it must hold the entire input document, not just
794 ** the last line or three.
796 static void AddByte( Lexer *lexer, tmbchar ch )
798 if ( lexer->lexsize + 2 >= lexer->lexlength )
801 uint allocAmt = lexer->lexlength;
802 while ( lexer->lexsize + 2 >= allocAmt )
809 buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
812 TidyClearMemory( buf + lexer->lexlength,
813 allocAmt - lexer->lexlength );
815 lexer->lexlength = allocAmt;
819 lexer->lexbuf[ lexer->lexsize++ ] = ch;
820 lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */
/* Replaces the last byte stored in the lexer buffer
** (lexbuf[lexsize-1]) with c. Does nothing when the buffer is
** empty (lexsize == 0).
*/
823 static void ChangeChar( Lexer *lexer, tmbchar c )
825 if ( lexer->lexsize > 0 )
827 lexer->lexbuf[ lexer->lexsize-1 ] = c;
831 /* store character c as UTF-8 encoded byte stream */
832 void TY_(AddCharToLexer)( Lexer *lexer, uint c )
834 int i, err, count = 0;
835 tmbchar buf[10] = {0};
837 err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
840 #if 0 && defined(_DEBUG)
841 fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );
843 /* replacement character 0xFFFD encoded as UTF-8 */
844 buf[0] = (byte) 0xEF;
845 buf[1] = (byte) 0xBF;
846 buf[2] = (byte) 0xBD;
850 for ( i = 0; i < count; ++i )
851 AddByte( lexer, buf[i] );
854 static void AddStringToLexer( Lexer *lexer, ctmbstr str )
858 /* Many (all?) compilers will sign-extend signed chars (the default) when
859 ** converting them to unsigned integer values. We must cast our char to
860 ** unsigned char before assigning it to prevent this from happening.
862 while( 0 != (c = (unsigned char) *str++ ))
863 TY_(AddCharToLexer)( lexer, c );
/* Records the input stream's current position in the lexer: copies
** docIn->curline and docIn->curcol into lexer->lines and
** lexer->columns.
*/
867 static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
869 lexer->lines = doc->docIn->curline;
870 lexer->columns = doc->docIn->curcol;
874 No longer attempts to insert missing ';' for unknown
875 entities unless one was present already, since this
876 gives unexpected results.
878 For example: <a href="something.htm?foo&bar&fred">
879 was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
880 rather than: <a href="something.htm?foo&amp;bar&amp;fred">
882 My thanks to Maurice Buxton for spotting this.
884 Also Randy Waki pointed out the following case for the
885 04 Aug 00 version (bug #433012):
887 For example: <a href="something.htm?id=1&lang=en">
888 was tidied to: <a href="something.htm?id=1&lang;=en">
889 rather than: <a href="something.htm?id=1&amp;lang=en">
891 where "lang" is a known entity (#9001), but browsers would
892 misinterpret "&lang;" because it had a value > 256.
894 So the case of an apparently known entity with a value > 256 and
895 missing a semicolon is handled specially.
897 "ParseEntity" is also a bit of a misnomer - it handles entities and
898 numeric character references. Invalid NCR's are now reported.
900 static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
909 typedef Bool (*ENTfn)(uint);
910 const ENTfn entFn[] = {
916 ENTState entState = ENT_default;
918 Bool semicolon = no, found = no;
919 Bool isXml = cfgBool( doc, TidyXmlTags );
920 Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
921 uint c, ch, startcol, entver = 0;
922 Lexer* lexer = doc->lexer;
924 start = lexer->lexsize - 1; /* to start at "&" */
925 startcol = doc->docIn->curcol - 1;
927 while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
936 if (charRead == 1 && c == '#')
938 #if SUPPORT_ASIAN_ENCODINGS
939 if ( !cfgBool(doc, TidyNCR) ||
940 cfg(doc, TidyInCharEncoding) == BIG5 ||
941 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
943 TY_(UngetChar)('#', doc->docIn);
947 TY_(AddCharToLexer)( lexer, c );
948 entState = ENT_numdec;
951 else if (charRead == 2 && entState == ENT_numdec
952 && (c == 'x' || (!isXml && c == 'X')) )
954 TY_(AddCharToLexer)( lexer, c );
955 entState = ENT_numhex;
959 if ( entFn[entState](c) )
961 TY_(AddCharToLexer)( lexer, c );
965 /* otherwise put it back */
966 TY_(UngetChar)( c, doc->docIn );
970 /* make sure entity is NULL terminated */
971 lexer->lexbuf[lexer->lexsize] = '\0';
973 /* Should constrain version to XML/XHTML if &apos;
974 ** is encountered. But this is not possible with
975 ** Tidy's content model bit mask.
977 if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0
978 && !cfgBool(doc, TidyXmlOut)
980 && !cfgBool(doc, TidyXhtmlOut) )
981 TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
983 if (( mode == OtherNamespace ) && ( c == ';' ))
985 /* #130 MathML attr and entity fix! */
989 preserveEntities = yes;
993 /* Lookup entity code and version
995 found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
998 /* deal with unrecognized or invalid entities */
999 /* #433012 - fix by Randy Waki 17 Feb 01 */
1000 /* report invalid NCR's - Terry Teague 01 Sep 01 */
1001 if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
1003 /* set error position just before offending character */
1004 SetLexerLocus( doc, lexer );
1005 lexer->columns = startcol;
1007 if (lexer->lexsize > start + 1)
1009 if (ch >= 128 && ch <= 159)
1011 /* invalid numeric character reference */
1014 int replaceMode = DISCARDED_CHAR;
1016 if ( TY_(ReplacementCharEncoding) == WIN1252 )
1017 c1 = TY_(DecodeWin1252)( ch );
1018 else if ( TY_(ReplacementCharEncoding) == MACROMAN )
1019 c1 = TY_(DecodeMacRoman)( ch );
1022 replaceMode = REPLACED_CHAR;
1024 if ( c != ';' ) /* issue warning if not terminated by ';' */
1025 TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
1026 lexer->lexbuf+start, c );
1028 TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
1032 /* make the replacement */
1033 lexer->lexsize = start;
1034 TY_(AddCharToLexer)( lexer, c1 );
1040 lexer->lexsize = start;
1046 TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
1047 lexer->lexbuf+start, ch );
1050 TY_(AddCharToLexer)( lexer, ';' );
1053 TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
1054 lexer->lexbuf+start, ch );
1058 if ( c != ';' ) /* issue warning if not terminated by ';' */
1060 /* set error position just before offending character */
1061 SetLexerLocus( doc, lexer );
1062 lexer->columns = startcol;
1063 TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
1066 if (preserveEntities)
1067 TY_(AddCharToLexer)( lexer, ';' );
1070 lexer->lexsize = start;
1071 if ( ch == 160 && (mode == Preformatted) )
1073 TY_(AddCharToLexer)( lexer, ch );
1075 if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
1076 AddStringToLexer( lexer, "amp;" );
1079 /* Detect extended vs. basic entities */
1080 TY_(ConstrainVersion)( doc, entver );
1084 static tmbchar ParseTagName( TidyDocImpl* doc )
1086 Lexer *lexer = doc->lexer;
1087 uint c = lexer->lexbuf[ lexer->txtstart ];
1088 Bool xml = cfgBool(doc, TidyXmlTags);
1090 /* fold case of first character in buffer */
1091 if (!xml && TY_(IsUpper)(c))
1092 lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c);
1094 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
1096 if ((!xml && !TY_(IsNamechar)(c)) ||
1097 (xml && !TY_(IsXMLNamechar)(c)))
1100 /* fold case of subsequent characters */
1101 if (!xml && TY_(IsUpper)(c))
1102 c = TY_(ToLower)(c);
1104 TY_(AddCharToLexer)(lexer, c);
1107 lexer->txtend = lexer->lexsize;
1112 Used for elements and text nodes
1113 element name is NULL for text nodes
1114 start and end are offsets into lexbuf
1115 which contains the textual content of
1116 all elements in the parse tree.
1118 parent and content allow traversal
1119 of the parse tree in any direction.
1120 attributes are represented as a linked
1121 list of AttVal nodes which hold the
1122 strings for attribute/value pairs.
1126 Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer)
1128 Node* node = (Node*) TidyAlloc( allocator, sizeof(Node) );
1129 TidyClearMemory( node, sizeof(Node) );
1132 node->line = lexer->lines;
1133 node->column = lexer->columns;
1135 node->type = TextNode;
1136 #if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_ALLOCATION)
1137 SPRTF("Allocated node %p\n", node );
1142 /* used to clone heading nodes when split by an <HR> */
1143 Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element )
1145 Lexer* lexer = doc->lexer;
1146 Node *node = TY_(NewNode)( lexer->allocator, lexer );
1148 node->start = lexer->lexsize;
1149 node->end = lexer->lexsize;
1153 node->parent = element->parent;
1154 node->type = element->type;
1155 node->closed = element->closed;
1156 node->implicit = element->implicit;
1157 node->tag = element->tag;
1158 node->element = TY_(tmbstrdup)( doc->allocator, element->element );
1159 node->attributes = TY_(DupAttrs)( doc, element->attributes );
1164 /* free node's attributes */
/* Repeatedly unlinks the head of node->attributes and releases it
** with FreeAttribute until the list is empty. Before an attribute
** is released: if it has a name, is an ID or NAME attribute, and
** node is an anchor element, RemoveAnchorByNode is called with the
** attribute's value and node.
*/
1165 void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
1167 while ( node->attributes )
1169 AttVal *av = node->attributes;
1171 if ( av->attribute )
1173 if ( (attrIsID(av) || attrIsNAME(av)) &&
1174 TY_(IsAnchorElement)(doc, node) )
1176 TY_(RemoveAnchorByNode)( doc, av->value, node );
1180 node->attributes = av->next;
1181 TY_(FreeAttribute)( doc, av );
1185 /* doesn't repair attribute list linkage */
/* Releases everything attached to av: the asp and php nodes (via
** FreeNode), the attribute and value strings, and finally av
** itself. av->next is not touched, so the caller has to unlink av
** from its list.
*/
1186 void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av )
1188 TY_(FreeNode)( doc, av->asp );
1189 TY_(FreeNode)( doc, av->php );
1190 TidyDocFree( doc, av->attribute );
1191 TidyDocFree( doc, av->value );
1192 TidyDocFree( doc, av );
1195 /* detach attribute from node
1197 void TY_(DetachAttribute)( Node *node, AttVal *attr )
1199 AttVal *av, *prev = NULL;
1201 for ( av = node->attributes; av; av = av->next )
1206 prev->next = attr->next;
1208 node->attributes = attr->next;
1215 /* detach attribute from node then free it
1217 void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr )
1219 TY_(DetachAttribute)( node, attr );
1220 TY_(FreeAttribute)( doc, attr );
1224 Free document nodes by iterating through peers and recursing
1225 through children. Set next to NULL before calling TY_(FreeNode)()
1226 to avoid freeing peer nodes. Doesn't patch up prev/next links.
1228 void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
1230 #if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_ALLOCATION)
1231 if (node) SPRTF("Free node %p\n", node );
1233 /* this is no good ;=((
1234 if (node && doc && doc->lexer) {
1235 if (node == doc->lexer->token) {
1236 doc->lexer->token = NULL; // TY_(NewNode)( doc->lexer->allocator, doc->lexer );
1239 ----------------- */
1242 Node* next = node->next;
1244 TY_(FreeAttrs)( doc, node );
1245 TY_(FreeNode)( doc, node->content );
1246 TidyDocFree( doc, node->element );
1247 #ifdef TIDY_STORE_ORIGINAL_TEXT
1249 TidyDocFree(doc, node->otext);
1251 if (RootNode != node->type)
1252 TidyDocFree( doc, node );
1254 node->content = NULL;
1260 #ifdef TIDY_STORE_ORIGINAL_TEXT
1261 void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
1263 if (!doc->storeText)
1266 if (count >= doc->docIn->otextlen)
1269 if (!doc->docIn->otextsize)
1274 node->otext = doc->docIn->otextbuf;
1275 doc->docIn->otextbuf = NULL;
1276 doc->docIn->otextlen = 0;
1277 doc->docIn->otextsize = 0;
1281 uint len = doc->docIn->otextlen;
1282 tmbstr buf1 = (tmbstr)TidyDocAlloc(doc, len - count + 1);
1283 tmbstr buf2 = (tmbstr)TidyDocAlloc(doc, count + 1);
1288 for (i = 0; i < len - count; ++i)
1289 buf1[i] = doc->docIn->otextbuf[i];
1293 for (j = 0; j + i < len; ++j)
1294 buf2[j] = doc->docIn->otextbuf[j + i];
1298 TidyDocFree(doc, doc->docIn->otextbuf);
1300 doc->docIn->otextbuf = buf2;
1301 doc->docIn->otextlen = count;
1302 doc->docIn->otextsize = count + 1;
1307 Node* TY_(TextToken)( Lexer *lexer )
1309 Node *node = TY_(NewNode)( lexer->allocator, lexer );
1310 node->start = lexer->txtstart;
1311 node->end = lexer->txtend;
1315 /* used for creating preformatted text from Word2000 */
1316 Node *TY_(NewLineNode)( Lexer *lexer )
1318 Node *node = TY_(NewNode)( lexer->allocator, lexer );
1319 node->start = lexer->lexsize;
1320 TY_(AddCharToLexer)( lexer, (uint)'\n' );
1321 node->end = lexer->lexsize;
1325 /* used for adding a &nbsp; for Word2000 */
1326 Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt )
1328 Node *node = TY_(NewNode)( lexer->allocator, lexer );
1329 node->start = lexer->lexsize;
1330 AddStringToLexer( lexer, txt );
1331 node->end = lexer->lexsize;
1335 static Node* TagToken( TidyDocImpl* doc, NodeType type )
1337 Lexer* lexer = doc->lexer;
1338 Node* node = TY_(NewNode)( lexer->allocator, lexer );
1340 node->element = TY_(tmbstrndup)( doc->allocator,
1341 lexer->lexbuf + lexer->txtstart,
1342 lexer->txtend - lexer->txtstart );
1343 node->start = lexer->txtstart;
1344 node->end = lexer->txtstart;
1346 if ( type == StartTag || type == StartEndTag || type == EndTag )
1347 TY_(FindTag)(doc, node);
1352 static Node* NewToken(TidyDocImpl* doc, NodeType type)
1354 Lexer* lexer = doc->lexer;
1355 Node* node = TY_(NewNode)(lexer->allocator, lexer);
1357 node->start = lexer->txtstart;
1358 node->end = lexer->txtend;
1359 #ifdef TIDY_STORE_ORIGINAL_TEXT
1360 StoreOriginalTextInToken(doc, node, 0);
/* Shorthand constructors: each macro calls NewToken with the node
** type fixed to the kind named by the macro.
*/
1365 #define CommentToken(doc) NewToken(doc, CommentTag)
1366 #define DocTypeToken(doc) NewToken(doc, DocTypeTag)
1367 #define PIToken(doc) NewToken(doc, ProcInsTag)
1368 #define AspToken(doc) NewToken(doc, AspTag)
1369 #define JsteToken(doc) NewToken(doc, JsteTag)
1370 #define PhpToken(doc) NewToken(doc, PhpTag)
1371 #define XmlDeclToken(doc) NewToken(doc, XmlDecl)
1372 #define SectionToken(doc) NewToken(doc, SectionTag)
1373 #define CDATAToken(doc) NewToken(doc, CDATATag)
1375 void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str )
1378 while(0 != (c = *str++) )
1379 TY_(AddCharToLexer)( lexer, c );
1383 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1388 for ( ix=0; ix < len && (c = *str++); ++ix )
1389 TY_(AddCharToLexer)(lexer, c);
1393 /* find doctype element */
1394 Node *TY_(FindDocType)( TidyDocImpl* doc )
1397 for ( node = (doc ? doc->root.content : NULL);
1398 node && node->type != DocTypeTag;
1404 /* find parent container element */
1405 Node* TY_(FindContainer)( Node* node )
1407 for ( node = (node ? node->parent : NULL);
1408 node && TY_(nodeHasCM)(node, CM_INLINE);
1409 node = node->parent )
1416 /* find html element */
1417 Node *TY_(FindHTML)( TidyDocImpl* doc )
1420 for ( node = (doc ? doc->root.content : NULL);
1421 node && !nodeIsHTML(node);
1428 /* find XML Declaration */
1429 Node *TY_(FindXmlDecl)(TidyDocImpl* doc)
1432 for ( node = (doc ? doc->root.content : NULL);
1433 node && !(node->type == XmlDecl);
1441 Node *TY_(FindHEAD)( TidyDocImpl* doc )
1443 Node *node = TY_(FindHTML)( doc );
1447 for ( node = node->content;
1448 node && !nodeIsHEAD(node);
/* Find the title element: starts from the node FindHEAD() yields and walks
   its content via ->next until nodeIsTITLE() holds or no node remains. */
1456 Node *TY_(FindTITLE)(TidyDocImpl* doc)
1458 Node *node = TY_(FindHEAD)(doc);
1461 for (node = node->content;
1462 node && !nodeIsTITLE(node);
1463 node = node->next) {}
/* Locate the body element. Starting at doc->root.content (NULL when doc is
   NULL) the search first looks for the html element, then searches its
   content for BODY or FRAMESET. When a FRAMESET is what was found, the search
   descends into it for NOFRAMES and then into that for a BODY. */
1468 Node *TY_(FindBody)( TidyDocImpl* doc )
1470 Node *node = ( doc ? doc->root.content : NULL );
/* step 1: find <html> among the root's children */
1472 while ( node && !nodeIsHTML(node) )
1478 node = node->content;
/* step 2: inside <html>, stop at the first BODY or FRAMESET */
1479 while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
/* step 3: frameset documents keep their body inside NOFRAMES */
1482 if ( node && nodeIsFRAMESET(node) )
1484 node = node->content;
1485 while ( node && !nodeIsNOFRAMES(node) )
1490 node = node->content;
1491 while ( node && !nodeIsBODY(node) )
1499 /* Add (or refresh) the generator meta element for Tidy.
   The generator text is formatted into buf: "HTML Tidy for HTML5 ... version
   <tidyLibraryVersion()>", including PLATFORM_NAME when that macro is defined.
   The children of the head element are then searched for a META whose name
   attribute is "generator"; if its content starts with "HTML Tidy" (first 9
   characters, compared case-insensitively) the stored value is replaced by a
   copy of buf. When the TidyAccessibilityCheckLevel option is 0, a new
   inferred META carrying name="generator" and content=buf is inserted at the
   start of head. */
1500 Bool TY_(AddGenerator)( TidyDocImpl* doc )
1504 Node *head = TY_(FindHEAD)( doc );
1509 #ifdef PLATFORM_NAME
1510 TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 for "PLATFORM_NAME" version %s",
1511 tidyLibraryVersion());
1513 TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 version %s", tidyLibraryVersion());
/* look for an existing generator META among head's children */
1516 for ( node = head->content; node; node = node->next )
1518 if ( nodeIsMETA(node) )
1520 attval = TY_(AttrGetById)(node, TidyAttr_NAME);
1522 if (AttrValueIs(attval, "generator"))
1524 attval = TY_(AttrGetById)(node, TidyAttr_CONTENT);
1526 if (AttrHasValue(attval) &&
1527 TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0)
1529 /* update the existing content to reflect the */
1530 /* actual version of Tidy currently being used */
1532 TidyDocFree(doc, attval->value);
1533 attval->value = TY_(tmbstrdup)(doc->allocator, buf);
/* accessibility check level 0: build a fresh generator META and put it first in head */
1540 if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1542 node = TY_(InferredTag)(doc, TidyTag_META);
1543 TY_(AddAttribute)( doc, node, "name", "generator" );
1544 TY_(AddAttribute)( doc, node, "content", buf );
1545 TY_(InsertNodeAtStart)( head, node );
1553 /*\ examine <!DOCTYPE ...> to identify version
1554 * Issue #167 and #169
1557 * <!DOCTYPE html SYSTEM "about:legacy-compat">
/* Work out which version a DOCTYPE node names.
   doctype: the DOCTYPE node; its "PUBLIC" attribute holds the FPI.
   Without a PUBLIC value, a doctype whose element name is exactly "html" is
   reported as VERS_HTML5 and anything else as VERS_UNKNOWN. Otherwise the
   version comes from GetVersFromFPI(). For XHTML versions the TidyXmlOut and
   TidyXhtmlOut options are switched on and the lexer is flagged isvoyager.
   The stored FPI string is freed and replaced by a copy of
   GetFPIFromVers(vers). */
1560 static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
1562 AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC");
/* no public identifier: only a bare <!DOCTYPE html ...> is recognised */
1565 if (!fpi || !fpi->value)
1567 if (doctype->element && (TY_(tmbstrcmp)(doctype->element,"html") == 0))
1569 return VERS_HTML5; /* TODO: do we need to check MORE? */
1571 /* TODO: Consider warning, error message */
1572 return VERS_UNKNOWN;
1574 vers = GetVersFromFPI(fpi->value);
/* an XHTML doctype forces XML/XHTML output */
1576 if (VERS_XHTML & vers)
1578 TY_(SetOptionBool)(doc, TidyXmlOut, yes);
1579 TY_(SetOptionBool)(doc, TidyXhtmlOut, yes);
1580 doc->lexer->isvoyager = yes;
1583 /* todo: add a warning if case does not match? */
1584 TidyDocFree(doc, fpi->value);
1585 fpi->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(vers));
1590 /* Return the guessed version: the declared doctype when it is XH11 or
   XB10 and is still among the versions the lexer permits
   (lexer->versions), otherwise whatever HTMLVersion(doc) reports. */
1591 uint TY_(ApparentVersion)( TidyDocImpl* doc )
1593 if ((doc->lexer->doctype == XH11 ||
1594 doc->lexer->doctype == XB10) &&
1595 (doc->lexer->versions & doc->lexer->doctype))
1596 return doc->lexer->doctype;
1598 return TY_(HTMLVersion)(doc);
/* Map a version code to its display name via GetNameFromVers(vers).
   The isXhtml parameter is declared ARG_UNUSED and plays no part. */
1601 ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) )
1603 ctmbstr name = GetNameFromVers(vers);
1605 /* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */
1608 name = "HTML Proprietary";
/* Decide whether to warn that the emitted DOCTYPE lacks a system identifier.
   Per the inline notes, no warning is given in XHTML mode, when the emitted
   version has no name (proprietary), or when no system identifier exists for
   that version. The final test looks for a doctype node that has no "SYSTEM"
   attribute. */
1614 Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc )
1616 Bool isXhtml = doc->lexer->isvoyager;
1619 /* Do not warn in XHTML mode */
1623 /* Do not warn if emitted doctype is proprietary */
1624 if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL )
1627 /* Do not warn if no SI is possible */
1628 if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL )
1631 if ( (doctype = TY_(FindDocType)( doc )) != NULL
1632 && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL )
1639 /* Put DOCTYPE declaration between the
1640 ** <?xml version "1.0" ... ?> declaration, if any,
1641 ** and the <html> tag. Should also work for any comments,
1642 ** etc. that may precede the <html> tag.
/* Create a fresh node of type DocTypeTag and insert it immediately before the
   html element located by FindHTML(). */
1645 static Node* NewDocTypeNode( TidyDocImpl* doc )
1647 Node* doctype = NULL;
1648 Node* html = TY_(FindHTML)( doc );
1653 doctype = TY_(NewNode)( doc->allocator, NULL );
1654 doctype->type = DocTypeTag;
1655 TY_(InsertNodeBeforeElement)(html, doctype);
/* Set up the DOCTYPE for XHTML output according to the TidyDoctypeMode option.
   lexer->versionEmitted is first set to ApparentVersion(doc). Omit mode
   discards the doctype. A missing doctype is created with element name
   "html"; an existing one has its element name lower-cased.
   The PUBLIC/SYSTEM identifiers are then repaired per mode:
     html5  - both cleared, versionEmitted = XH50
     strict - X10S identifiers
     loose  - X10T identifiers
     user   - PUBLIC taken from the TidyDoctype option, SYSTEM set to ""
     auto   - chosen from lexer->doctype and lexer->versions, testing in
              order: unknown doctype (XH50), XH11, XB10, HTML 4.0 strict
              (X10S), frameset (X10F), loose (X10T). */
1659 Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
1661 Lexer *lexer = doc->lexer;
1662 Node *doctype = TY_(FindDocType)( doc );
1663 TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
1664 ctmbstr pub = "PUBLIC";
1665 ctmbstr sys = "SYSTEM";
1667 lexer->versionEmitted = TY_(ApparentVersion)( doc );
1669 if (dtmode == TidyDoctypeOmit)
1672 TY_(DiscardElement)(doc, doctype);
/* user mode is only meaningful when a doctype string was configured */
1676 if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
1681 doctype = NewDocTypeNode(doc);
1682 doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
1686 doctype->element = TY_(tmbstrtolower)(doctype->element);
1691 case TidyDoctypeHtml5:
1693 TY_(RepairAttrValue)(doc, doctype, pub, NULL);
1694 TY_(RepairAttrValue)(doc, doctype, sys, NULL);
1695 lexer->versionEmitted = XH50;
1697 case TidyDoctypeStrict:
1698 /* XHTML 1.0 Strict */
1699 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1700 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1701 lexer->versionEmitted = X10S;
1703 case TidyDoctypeLoose:
1704 /* XHTML 1.0 Transitional */
1705 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1706 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1707 lexer->versionEmitted = X10T;
1709 case TidyDoctypeUser:
1710 /* user defined document type declaration */
1711 TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype));
1712 TY_(RepairAttrValue)(doc, doctype, sys, "");
/* auto: pick identifiers from the declared doctype and the versions the content still allows */
1714 case TidyDoctypeAuto:
1715 if (lexer->doctype == VERS_UNKNOWN) {
1716 lexer->versionEmitted = XH50;
1719 else if (lexer->versions & XH11 && lexer->doctype == XH11)
1721 if (!TY_(GetAttrByName)(doctype, sys))
1722 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1723 lexer->versionEmitted = XH11;
1726 else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
1728 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11));
1729 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1730 lexer->versionEmitted = XH11;
1732 else if (lexer->versions & XB10 && lexer->doctype == XB10)
1734 if (!TY_(GetAttrByName)(doctype, sys))
1735 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10));
1736 lexer->versionEmitted = XB10;
1739 else if (lexer->versions & VERS_HTML40_STRICT)
1741 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1742 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1743 lexer->versionEmitted = X10S;
1745 else if (lexer->versions & VERS_FRAMESET)
1747 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F));
1748 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F));
1749 lexer->versionEmitted = X10F;
1751 else if (lexer->versions & VERS_LOOSE)
1753 TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1754 TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1755 lexer->versionEmitted = X10T;
1760 TY_(DiscardElement)(doc, doctype);
1764 case TidyDoctypeOmit:
1772 /* Fix up the doctype if it is missing or needs replacing (HTML output).
   Driven by the TidyDoctypeMode option:
   - auto mode with an existing doctype that was recognised as VERS_HTML5, or
     whose declared version is still permitted by lexer->versions, keeps the
     declared version as versionEmitted;
   - omit mode discards the doctype and records ApparentVersion(doc);
   - strict and loose modes discard an existing doctype before a new one is
     produced;
   - auto mode guesses the version with HTMLVersion(doc).
   The guessed version becomes versionEmitted. An existing doctype has its
   element name lower-cased, otherwise a new "html" doctype node is created;
   PUBLIC and SYSTEM are then repaired from GetFPIFromVers/GetSIFromVers. */
1773 Bool TY_(FixDocType)( TidyDocImpl* doc )
1775 Lexer* lexer = doc->lexer;
1776 Node* doctype = TY_(FindDocType)( doc );
1777 uint dtmode = cfg( doc, TidyDoctypeMode );
1778 uint guessed = VERS_UNKNOWN;
1781 /* Issue #167 - found doctype, and doctype is default VERS_HTML5, set VERS_HTML5 and return yes */
1782 if (doctype && (dtmode == TidyDoctypeAuto) &&
1783 (lexer->doctype == VERS_HTML5) )
1785 lexer->versionEmitted = lexer->doctype;
/* auto mode: a declared doctype that still matches the content is kept,
   except an XHTML doctype on a document not flagged isvoyager */
1788 if (dtmode == TidyDoctypeAuto &&
1789 lexer->versions & lexer->doctype &&
1790 !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
1791 && TY_(FindDocType)(doc))
1793 lexer->versionEmitted = lexer->doctype;
1797 if (dtmode == TidyDoctypeOmit)
1800 TY_(DiscardElement)( doc, doctype );
1801 lexer->versionEmitted = TY_(ApparentVersion)( doc );
1805 if (cfgBool(doc, TidyXmlOut))
/* remember whether the original doctype carried a system identifier */
1809 hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL;
1811 if ((dtmode == TidyDoctypeStrict ||
1812 dtmode == TidyDoctypeLoose) && doctype)
1814 TY_(DiscardElement)(doc, doctype);
1820 case TidyDoctypeHtml5:
1823 case TidyDoctypeStrict:
1826 case TidyDoctypeLoose:
1829 case TidyDoctypeAuto:
1830 guessed = TY_(HTMLVersion)(doc);
1834 lexer->versionEmitted = guessed;
1835 if (guessed == VERS_UNKNOWN)
1840 doctype->element = TY_(tmbstrtolower)(doctype->element);
1844 doctype = NewDocTypeNode(doc);
1845 doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
1848 TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
1851 TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
1856 /* ensure XML document starts with <?xml version="1.0"?> */
1857 /* add encoding attribute if not using ASCII or UTF-8 output */
/* An XmlDecl node that is already the first child of the root is reused;
   otherwise a new one is created and placed ahead of any existing root
   content (or becomes the root's content when there is none). An "encoding"
   attribute is added when none exists and the TidyOutCharEncoding option is
   not UTF8; a "version" attribute of "1.0" is added when none exists. */
1858 Bool TY_(FixXmlDecl)( TidyDocImpl* doc )
1861 AttVal *version, *encoding;
1862 Lexer*lexer = doc->lexer;
1863 Node* root = &doc->root;
1865 if ( root->content && root->content->type == XmlDecl )
1867 xml = root->content;
1871 xml = TY_(NewNode)(lexer->allocator, lexer);
1872 xml->type = XmlDecl;
1873 if ( root->content )
1874 TY_(InsertNodeBeforeElement)(root->content, xml);
1876 root->content = xml;
1879 version = TY_(GetAttrByName)(xml, "version");
1880 encoding = TY_(GetAttrByName)(xml, "encoding");
1883 We need to insert a check if declared encoding
1884 and output encoding mismatch and fix the XML
1885 declaration accordingly!!!
1888 if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
1890 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
1892 TY_(AddAttribute)( doc, xml, "encoding", enc );
1895 if ( version == NULL )
1896 TY_(AddAttribute)( doc, xml, "version", "1.0" );
/* Create an implicit start tag for the tag identified by id.
   The tag definition is looked up with LookupTagDef (asserted non-NULL); the
   new node gets type StartTag, implicit = yes, a copy of the definition's
   name as its element name, and the lexer's current txtstart/txtend as its
   start/end. */
1900 Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id)
1902 Lexer *lexer = doc->lexer;
1903 Node *node = TY_(NewNode)( lexer->allocator, lexer );
1904 const Dict* dict = TY_(LookupTagDef)(id);
1906 assert( dict != NULL );
1908 node->type = StartTag;
1909 node->implicit = yes;
1910 node->element = TY_(tmbstrdup)(doc->allocator, dict->name);
1912 node->start = lexer->txtstart;
1913 node->end = lexer->txtend;
/* Tell whether a node is a start tag that may have content. Three things are
   examined in order: the node type (must be StartTag), whether the element is
   known (node->tag != NULL), and whether the tag's content model has CM_EMPTY.
   Used by the start-tag handling in GetTokenFromStream when deciding whether
   to swallow a newline after the tag. */
1918 static Bool ExpectsContent(Node *node)
1920 if (node->type != StartTag)
1923 /* unknown element? */
1924 if (node->tag == NULL)
1927 if (node->tag->model & CM_EMPTY)
1934 create a text node for the contents of
1935 a CDATA element like style or script
1936 which ends with </foo> for some foo.
/* Read the raw content of a CDATA-style container element and return it as a
   text token.
   container: the element whose content is being read; its element name is
   compared case-insensitively against tag names seen in the content.
   Characters are copied from doc->docIn into the lexer buffer while a small
   state machine (CDATA_INTERMEDIATE, CDATA_STARTTAG, CDATA_ENDTAG) tracks
   "<letter" and "</letter" sequences. When the matching end tag is found its
   characters are pushed back onto the input and removed from the buffer.
   A non-matching "</" that is not preceded by a backslash is reported as
   BAD_CDATA_CONTENT and, for JavaScript containers, gets a backslash inserted
   before the '/'. Reaching EndOfStream reports MISSING_ENDTAG_FOR. */
1946 static Node *GetCDATA( TidyDocImpl* doc, Node *container )
1948 Lexer* lexer = doc->lexer;
1951 CDATAState state = CDATA_INTERMEDIATE;
1956 Bool hasSrc = TY_(AttrGetById)(container, TidyAttr_SRC) != NULL;
1958 SetLexerLocus( doc, lexer );
1959 lexer->waswhite = no;
/* the text span starts out empty at the current end of the lexer buffer */
1960 lexer->txtstart = lexer->txtend = lexer->lexsize;
1962 /* seen start tag, look for matching end tag */
1963 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
1965 TY_(AddCharToLexer)(lexer, c);
1966 lexer->txtend = lexer->lexsize;
1968 if (state == CDATA_INTERMEDIATE)
1972 if (isEmpty && !TY_(IsWhite)(c))
1977 c = TY_(ReadChar)(doc->docIn);
1979 if (TY_(IsLetter)(c))
1981 /* <head><script src=foo><meta name=foo content=bar>*/
1982 if (hasSrc && isEmpty && nodeIsSCRIPT(container))
1984 /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
/* drop what was buffered and hand "<" + letter back to the input */
1985 lexer->lexsize = lexer->txtstart;
1986 TY_(UngetChar)(c, doc->docIn);
1987 TY_(UngetChar)('<', doc->docIn);
1990 TY_(AddCharToLexer)(lexer, c);
1991 start = lexer->lexsize - 1;
1992 state = CDATA_STARTTAG;
1996 TY_(AddCharToLexer)(lexer, c);
1998 c = TY_(ReadChar)(doc->docIn);
2000 if (!TY_(IsLetter)(c))
2002 TY_(UngetChar)(c, doc->docIn);
2005 TY_(UngetChar)(c, doc->docIn);
2007 start = lexer->lexsize;
2008 state = CDATA_ENDTAG;
2012 /* recognize document.write("<script><\/script>") */
2013 TY_(AddCharToLexer)(lexer, c);
2015 c = TY_(ReadChar)(doc->docIn);
2019 TY_(UngetChar)(c, doc->docIn);
2023 TY_(AddCharToLexer)(lexer, c);
2024 c = TY_(ReadChar)(doc->docIn);
2026 if (!TY_(IsLetter)(c))
2028 TY_(UngetChar)(c, doc->docIn);
2031 TY_(UngetChar)(c, doc->docIn);
2033 start = lexer->lexsize;
2034 state = CDATA_ENDTAG;
2038 TY_(UngetChar)(c, doc->docIn);
2041 /* '<' + Letter found */
2042 else if (state == CDATA_STARTTAG)
2044 if (TY_(IsLetter)(c))
/* compare the name collected since 'start' with the container's name */
2047 matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2048 TY_(tmbstrlen)(container->element)) == 0;
2052 state = CDATA_INTERMEDIATE;
2054 /* '<' + '/' + Letter found */
2055 else if (state == CDATA_ENDTAG)
2057 if (TY_(IsLetter)(c))
2060 matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2061 TY_(tmbstrlen)(container->element)) == 0;
2063 if (isEmpty && !matches)
2065 /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
/* push the end tag back, last character first, then the "</" prefix */
2067 for (i = lexer->lexsize - 1; i >= start; --i)
2068 TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2069 TY_(UngetChar)('/', doc->docIn);
2070 TY_(UngetChar)('<', doc->docIn);
/* matching end tag with no nested start tags still open */
2074 if (matches && nested-- <= 0)
2076 for (i = lexer->lexsize - 1; i >= start; --i)
2077 TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2078 TY_(UngetChar)('/', doc->docIn);
2079 TY_(UngetChar)('<', doc->docIn);
/* remove the tag name plus the two characters "</" from the buffer */
2080 lexer->lexsize -= (lexer->lexsize - start) + 2;
2083 else if (lexer->lexbuf[start - 2] != '\\')
2085 /* if the end tag is not already escaped using backslash */
2086 SetLexerLocus( doc, lexer );
2087 lexer->columns -= 3;
2088 TY_(ReportError)(doc, NULL, NULL, BAD_CDATA_CONTENT);
2090 /* if javascript insert backslash before / */
2091 if (TY_(IsJavaScript)(container))
/* shift the tail of the buffer right by one to make room at start-1 */
2093 for (i = lexer->lexsize; i > start-1; --i)
2094 lexer->lexbuf[i] = lexer->lexbuf[i-1];
2096 lexer->lexbuf[start-1] = '\\';
2100 state = CDATA_INTERMEDIATE;
2104 lexer->lexsize = lexer->txtstart = lexer->txtend;
2106 lexer->txtend = lexer->lexsize;
2108 if (c == EndOfStream)
2109 TY_(ReportError)(doc, container, NULL, MISSING_ENDTAG_FOR );
2111 /* this was disabled for some reason... */
2113 if (lexer->txtend > lexer->txtstart)
2114 return TextToken(lexer);
2118 return TY_(TextToken)(lexer);
/* Push the current token back: sets lexer->pushed, which GetToken() checks
   before reading anything new from the input. */
2122 void TY_(UngetToken)( TidyDocImpl* doc )
2124 doc->lexer->pushed = yes;
/* CondReturnTextNode(doc, skip): when text has accumulated in the lexer
   (txtend > txtstart), build a text token, store it in lexer->token and
   return it from the enclosing function. Expects a variable named lexer in
   the calling scope. Three alternative definitions follow: the
   TIDY_STORE_ORIGINAL_TEXT one also calls StoreOriginalTextInToken with
   skip, the non-NDEBUG MSVC one traces the node through GTDBG, and the last
   one is the plain form. */
2127 #ifdef TIDY_STORE_ORIGINAL_TEXT
2128 #define CondReturnTextNode(doc, skip) \
2129 if (lexer->txtend > lexer->txtstart) \
2131 lexer->token = TY_(TextToken)(lexer); \
2132 StoreOriginalTextInToken(doc, lexer->token, skip); \
2133 return lexer->token; \
2136 #if !defined(NDEBUG) && defined(_MSC_VER)
2137 #define CondReturnTextNode(doc, skip) \
2138 if (lexer->txtend > lexer->txtstart) { \
2139 Node *_node = TY_(TextToken)(lexer); \
2140 lexer->token = _node; \
2141 GTDBG(doc,"text_node",_node); \
2146 #define CondReturnTextNode(doc, skip) \
2147 if (lexer->txtend > lexer->txtstart) \
2149 lexer->token = TY_(TextToken)(lexer); \
2150 return lexer->token; \
2156 modes for GetToken()
2158 MixedContent -- for elements which don't accept PCDATA
2159 Preformatted -- white space preserved as is
2160 IgnoreMarkup -- for CDATA elements such as script, style
2162 static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
/* Return the next token for the parser.
   mode: how content is to be read (see the GetTokenMode values); CdataContent
   routes to GetCDATA() using lexer->parent as the container.
   A token that was pushed back (lexer->pushed) or an inserted inline token
   still pending (lexer->itoken) is dealt with first. Failing that, pending
   inline elements (lexer->insert / lexer->inode) are emitted through
   InsertedToken(); otherwise the token comes from GetCDATA() or
   GetTokenFromStream(). */
2164 Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
2167 Lexer* lexer = doc->lexer;
2169 if (lexer->pushed || lexer->itoken)
2171 /* Deal with previously returned duplicate inline token */
2174 /* itoken rejected */
2178 node = lexer->itoken;
2179 GTDBG(doc,"lex-itoken", node);
2182 /* itoken has been accepted */
2183 lexer->itoken = NULL;
2186 /* duplicate inlines in preference to pushed text nodes when appropriate */
/* the pushed token is reused unless it is text and inline elements are waiting to be inserted */
2188 if (lexer->token->type != TextNode
2189 || !(lexer->insert || lexer->inode)) {
2190 node = lexer->token;
2191 GTDBG(doc,"lex-token", node);
2194 lexer->itoken = TY_(InsertedToken)( doc );
2195 node = lexer->itoken;
2196 GTDBG(doc,"lex-inserted", node);
2200 assert( !(lexer->pushed || lexer->itoken) );
2202 /* at start of block elements, unclosed inline
2203 elements are inserted into the token stream */
2204 if (lexer->insert || lexer->inode) {
2205 /*\ Issue #92: could fix by the following, but instead chose not to stack these 2
2206 * if ( !(lexer->insert && (nodeIsINS(lexer->insert) || nodeIsDEL(lexer->insert))) ) {
2208 lexer->token = TY_(InsertedToken)( doc );
2209 node = lexer->token;
2210 GTDBG(doc,"lex-inserted2", node);
2214 if (mode == CdataContent)
2216 assert( lexer->parent != NULL );
2217 node = GetCDATA(doc, lexer->parent);
2218 GTDBG(doc,"lex-cdata", node);
2222 return GetTokenFromStream( doc, mode );
/* Debug-only helper (non-NDEBUG MSVC builds): prints "Have node <name>"
   through SPRTF. */
2225 #if !defined(NDEBUG) && defined(_MSC_VER)
2226 static void check_me(char *name)
2228 SPRTF("Have node %s\n", name);
/* Core tokenizer: read characters from doc->docIn and run the lexer state
   machine (lexer->state) until a complete token can be returned.
   mode: the GetTokenMode in force; it governs white-space compaction
   (Preformatted / IgnoreMarkup keep white space as is), leading white-space
   removal (IgnoreWhitespace) and whether '&' starts an entity.
   lexer->token is cleared on entry and holds the returned node on exit.
   States handled by the switch: LEX_CONTENT, LEX_GT, LEX_ENDTAG,
   LEX_STARTTAG, LEX_COMMENT, LEX_DOCTYPE, LEX_PROCINSTR, LEX_ASP, LEX_JSTE,
   LEX_PHP, LEX_XMLDECL, LEX_SECTION and LEX_CDATA. Text gathered before a
   markup construct is returned first (CondReturnTextNode) and the construct
   itself is tokenized on a later call. */
2232 static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
2234 Lexer* lexer = doc->lexer;
2235 uint c, lexdump, badcomment = 0;
2237 AttVal *attributes = NULL;
2240 /* Lexer->token must be set on return. Nullify it for safety. */
2241 lexer->token = NULL;
2243 SetLexerLocus( doc, lexer );
2244 lexer->waswhite = no;
/* the text span starts out empty at the current end of the lexer buffer */
2246 lexer->txtstart = lexer->txtend = lexer->lexsize;
/* main read loop: one input character per pass, dispatched on lexer->state */
2248 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2250 if (lexer->insertspace)
2252 TY_(AddCharToLexer)(lexer, ' ');
2253 lexer->waswhite = yes;
2254 lexer->insertspace = no;
2257 if (c == 160 && (mode == Preformatted))
2260 TY_(AddCharToLexer)(lexer, c);
2262 switch (lexer->state)
2264 case LEX_CONTENT: /* element content */
2267 Discard white space if appropriate. Its cheaper
2268 to do this here rather than in parser methods
2269 for elements that don't have mixed content.
2271 if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
2272 && lexer->lexsize == lexer->txtstart + 1)
2275 lexer->waswhite = no;
2276 SetLexerLocus( doc, lexer );
2282 lexer->state = LEX_GT;
2286 if (TY_(IsWhite)(c))
2288 /* was previous character white? */
2289 if (lexer->waswhite)
2291 if (mode != Preformatted && mode != IgnoreMarkup)
2294 SetLexerLocus( doc, lexer );
2297 else /* prev character wasn't white */
2299 lexer->waswhite = yes;
2301 if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2302 ChangeChar(lexer, ' ');
2307 else if (c == '&' && mode != IgnoreMarkup)
2308 ParseEntity( doc, mode );
2310 /* this is needed to avoid trimming trailing whitespace */
2311 if (mode == IgnoreWhitespace)
2312 mode = MixedContent;
2314 lexer->waswhite = no;
2317 case LEX_GT: /* < */
2319 /* check for endtag */
2322 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2324 TY_(UngetChar)(c, doc->docIn);
2328 TY_(AddCharToLexer)(lexer, c);
2330 if (TY_(IsLetter)(c))
/* discard the three buffered characters of "</" + letter; the letter goes back to the input */
2332 lexer->lexsize -= 3;
2333 lexer->txtend = lexer->lexsize;
2334 TY_(UngetChar)(c, doc->docIn);
2335 lexer->state = LEX_ENDTAG;
2336 lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */
2337 doc->docIn->curcol -= 2;
2339 /* if some text before the </ return it now */
2340 if (lexer->txtend > lexer->txtstart)
2342 /* trim space character before end tag */
2343 if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2345 lexer->lexsize -= 1;
2346 lexer->txtend = lexer->lexsize;
2348 lexer->token = TY_(TextToken)(lexer);
2349 #ifdef TIDY_STORE_ORIGINAL_TEXT
2350 StoreOriginalTextInToken(doc, lexer->token, 3);
2352 node = lexer->token;
2353 GTDBG(doc,"text", node);
2357 continue; /* no text so keep going */
2360 /* otherwise treat as CDATA */
2361 lexer->waswhite = no;
2362 lexer->state = LEX_CONTENT;
2366 if (mode == IgnoreMarkup)
2368 /* otherwise treat as CDATA */
2369 lexer->waswhite = no;
2370 lexer->state = LEX_CONTENT;
2375 look out for comments, doctype or marked sections
2376 this isn't quite right, but its getting there ...
2380 c = TY_(ReadChar)(doc->docIn);
2384 c = TY_(ReadChar)(doc->docIn);
2388 lexer->state = LEX_COMMENT; /* comment */
2389 lexer->lexsize -= 2;
2390 lexer->txtend = lexer->lexsize;
2392 CondReturnTextNode(doc, 4)
2394 lexer->txtstart = lexer->lexsize;
2398 TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2400 else if (c == 'd' || c == 'D')
2402 /* todo: check for complete "<!DOCTYPE" not just <!D */
2406 lexer->state = LEX_DOCTYPE; /* doctype */
2407 lexer->lexsize -= 2;
2408 lexer->txtend = lexer->lexsize;
2409 mode = IgnoreWhitespace;
2411 /* skip until white space or '>' */
2415 c = TY_(ReadChar)(doc->docIn);
2418 if (c == EndOfStream || c == '>')
2420 TY_(UngetChar)(c, doc->docIn);
2425 if (!TY_(IsWhite)(c))
2428 /* and skip to end of whitespace */
2432 c = TY_(ReadChar)(doc->docIn);
2435 if (c == EndOfStream || c == '>')
2437 TY_(UngetChar)(c, doc->docIn);
2442 if (TY_(IsWhite)(c))
2445 TY_(UngetChar)(c, doc->docIn);
2452 CondReturnTextNode(doc, (skip + 3))
2454 lexer->txtstart = lexer->lexsize;
2459 /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2460 lexer->lexsize -= 2;
2461 lexer->state = LEX_SECTION;
2462 lexer->txtend = lexer->lexsize;
2464 CondReturnTextNode(doc, 2)
2466 lexer->txtstart = lexer->lexsize;
2472 /* else swallow characters up to and including next '>' */
2473 while ((c = TY_(ReadChar)(doc->docIn)) != '>')
2475 if (c == EndOfStream)
2477 TY_(UngetChar)(c, doc->docIn);
2482 lexer->lexsize -= 2;
2483 lexer->lexbuf[lexer->lexsize] = '\0';
2484 lexer->state = LEX_CONTENT;
2489 processing instructions
2494 lexer->lexsize -= 2;
2495 lexer->state = LEX_PROCINSTR;
2496 lexer->txtend = lexer->lexsize;
2498 CondReturnTextNode(doc, 2)
2500 lexer->txtstart = lexer->lexsize;
2504 /* Microsoft ASP's e.g. <% ... server-code ... %> */
2507 lexer->lexsize -= 2;
2508 lexer->state = LEX_ASP;
2509 lexer->txtend = lexer->lexsize;
2511 CondReturnTextNode(doc, 2)
2513 lexer->txtstart = lexer->lexsize;
2517 /* Netscape's JSTE e.g. <# ... server-code ... #> */
2520 lexer->lexsize -= 2;
2521 lexer->state = LEX_JSTE;
2522 lexer->txtend = lexer->lexsize;
2524 CondReturnTextNode(doc, 2)
2526 lexer->txtstart = lexer->lexsize;
2530 /* check for start tag */
2531 if (TY_(IsLetter)(c))
2533 TY_(UngetChar)(c, doc->docIn); /* push back letter */
2534 TY_(UngetChar)('<', doc->docIn);
2535 lexer->lexsize -= 2; /* discard "<" + letter */
2536 lexer->txtend = lexer->lexsize;
2537 lexer->state = LEX_STARTTAG; /* ready to read tag name */
2539 CondReturnTextNode(doc, 2)
2541 /* lexer->txtstart = lexer->lexsize; missing here? */
2542 continue; /* no text so keep going */
2545 /* fix for bug 762102 */
2548 TY_(UngetChar)(c, doc->docIn);
2552 /* otherwise treat as CDATA */
2553 lexer->state = LEX_CONTENT;
2554 lexer->waswhite = no;
2557 case LEX_ENDTAG: /* </letter */
2558 lexer->txtstart = lexer->lexsize - 1;
2559 doc->docIn->curcol += 2;
2560 c = ParseTagName( doc );
2561 lexer->token = TagToken( doc, EndTag ); /* create endtag token */
2562 lexer->lexsize = lexer->txtend = lexer->txtstart;
/* skip anything else up to the closing '>' */
2565 while ( c != '>' && c != EndOfStream )
2567 c = TY_(ReadChar)(doc->docIn);
2570 if (c == EndOfStream)
2572 TY_(FreeNode)( doc, lexer->token );
2576 lexer->state = LEX_CONTENT;
2577 lexer->waswhite = no;
2578 #ifdef TIDY_STORE_ORIGINAL_TEXT
2579 StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */
2581 node = lexer->token;
2582 GTDBG(doc,"endtag", node);
2583 return node; /* the endtag token */
2585 case LEX_STARTTAG: /* first letter of tagname */
2586 c = TY_(ReadChar)(doc->docIn);
2587 ChangeChar(lexer, (tmbchar)c);
2588 lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2589 c = ParseTagName( doc );
2592 lexer->token = TagToken( doc, StartTag ); /* [i_a]2 'isempty' is always false, thanks to code 2 lines above */
2594 /* parse attributes, consuming closing ">" */
2598 TY_(UngetChar)(c, doc->docIn);
2600 attributes = ParseAttrs( doc, &isempty );
2604 lexer->token->type = StartEndTag;
2606 lexer->token->attributes = attributes;
2607 lexer->lexsize = lexer->txtend = lexer->txtstart;
2609 /* swallow newline following start tag */
2610 /* special check needed for CRLF sequence */
2611 /* this doesn't apply to empty elements */
2612 /* nor to preformatted content that needs escaping */
2614 if ((mode != Preformatted && ExpectsContent(lexer->token))
2615 || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2617 c = TY_(ReadChar)(doc->docIn);
2619 if (c != '\n' && c != '\f')
2620 TY_(UngetChar)(c, doc->docIn);
2622 lexer->waswhite = yes; /* to swallow leading whitespace */
2625 lexer->waswhite = no;
2627 lexer->state = LEX_CONTENT;
2628 if (lexer->token->tag == NULL)
2630 if (mode != OtherNamespace) /* [i_a]2 only issue warning if NOT 'OtherNamespace', and tag null */
2631 TY_(ReportFatal)( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2633 else if ( !cfgBool(doc, TidyXmlTags) )
2635 Node* curr = lexer->token;
2636 TY_(ConstrainVersion)( doc, curr->tag->versions );
2638 if ( curr->tag->versions & VERS_PROPRIETARY )
2640 if ( !cfgBool(doc, TidyMakeClean) ||
2641 ( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) )
2643 TY_(ReportError)(doc, NULL, curr, PROPRIETARY_ELEMENT );
2645 if ( nodeIsLAYER(curr) )
2646 doc->badLayout |= USING_LAYER;
2647 else if ( nodeIsSPACER(curr) )
2648 doc->badLayout |= USING_SPACER;
2649 else if ( nodeIsNOBR(curr) )
2650 doc->badLayout |= USING_NOBR;
2654 TY_(RepairDuplicateAttributes)( doc, curr, no );
2656 TY_(RepairDuplicateAttributes)( doc, lexer->token, yes );
2657 #ifdef TIDY_STORE_ORIGINAL_TEXT
2658 StoreOriginalTextInToken(doc, lexer->token, 0);
2660 node = lexer->token;
2661 GTDBG(doc,"starttag", node);
2662 return node; /* return start tag */
2664 case LEX_COMMENT: /* seen <!-- so look for --> */
2669 c = TY_(ReadChar)(doc->docIn);
2670 TY_(AddCharToLexer)(lexer, c);
2676 c = TY_(ReadChar)(doc->docIn);
2681 TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2683 /* do not store closing -- in lexbuf */
2684 lexer->lexsize -= 2;
2685 lexer->txtend = lexer->lexsize;
2686 lexer->lexbuf[lexer->lexsize] = '\0';
2687 lexer->state = LEX_CONTENT;
2688 lexer->waswhite = no;
2689 lexer->token = CommentToken(doc);
2691 /* now look for a line break */
2693 c = TY_(ReadChar)(doc->docIn);
2696 lexer->token->linebreak = yes;
2698 TY_(UngetChar)(c, doc->docIn);
2700 node = lexer->token;
2701 GTDBG(doc,"comment", node);
2705 /* note position of first such error in the comment */
2708 SetLexerLocus( doc, lexer );
2709 lexer->columns -= 3;
2714 if ( cfgBool(doc, TidyFixComments) )
2715 lexer->lexbuf[lexer->lexsize - 2] = '=';
2717 /* if '-' then look for '>' to end the comment */
2720 TY_(AddCharToLexer)(lexer, c);
2724 /* otherwise continue to look for --> */
2725 lexer->lexbuf[lexer->lexsize - 1] = '=';
2727 /* http://tidy.sf.net/bug/1266647 */
2728 TY_(AddCharToLexer)(lexer, c);
2732 case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
2734 /* use ParseDocTypeDecl() to tokenize doctype declaration */
2735 TY_(UngetChar)(c, doc->docIn);
2736 lexer->lexsize -= 1;
2737 lexer->token = ParseDocTypeDecl(doc);
2739 lexer->txtend = lexer->lexsize;
2740 lexer->lexbuf[lexer->lexsize] = '\0';
2741 lexer->state = LEX_CONTENT;
2742 lexer->waswhite = no;
2744 /* make a note of the version named by the 1st doctype */
2745 if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
2747 lexer->doctype = FindGivenVersion(doc, lexer->token);
2748 if (lexer->doctype != VERS_HTML5)
2751 * Back to legacy HTML4 mode for -
2752 * Issue #167 & #169 - TidyTag_A
2753 * Issue #196 - TidyTag_CAPTION
2756 TY_(AdjustTags)(doc); /* Dynamically modify the tags table */
2759 node = lexer->token;
2760 GTDBG(doc,"doctype", node);
2763 case LEX_PROCINSTR: /* seen <? so look for '>' */
2764 /* check for PHP preprocessor instructions <?php ... ?> */
2766 if (lexer->lexsize - lexer->txtstart == 3)
2768 if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
2770 lexer->state = LEX_PHP;
/* "xml" followed by white space switches to XML declaration parsing */
2775 if (lexer->lexsize - lexer->txtstart == 4)
2777 if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
2778 TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3]))
2780 lexer->state = LEX_XMLDECL;
2786 if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
2791 /* now look for '>' */
2792 c = TY_(ReadChar)(doc->docIn);
2794 if (c == EndOfStream)
2796 TY_(ReportError)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
2797 TY_(UngetChar)(c, doc->docIn);
2801 TY_(AddCharToLexer)(lexer, c);
2808 lexer->lexsize -= 1;
/* i counts the leading non-white characters, i.e. the PI target name */
2815 for (i = 0; i < lexer->lexsize - lexer->txtstart &&
2816 !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i)
2819 closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
2822 lexer->lexsize -= 1;
2824 lexer->txtstart += i;
2825 lexer->txtend = lexer->lexsize;
2826 lexer->lexbuf[lexer->lexsize] = '\0';
2828 lexer->token = PIToken(doc);
2829 lexer->token->closed = closed;
2830 lexer->token->element = TY_(tmbstrndup)(doc->allocator,
2832 lexer->txtstart - i, i);
2836 lexer->txtend = lexer->lexsize;
2837 lexer->lexbuf[lexer->lexsize] = '\0';
2838 lexer->token = PIToken(doc);
2841 lexer->state = LEX_CONTENT;
2842 lexer->waswhite = no;
2843 node = lexer->token;
2844 GTDBG(doc,"procinstr", node);
2847 case LEX_ASP: /* seen <% so look for "%>" */
2851 /* now look for '>' */
2852 c = TY_(ReadChar)(doc->docIn);
2857 TY_(UngetChar)(c, doc->docIn);
2861 lexer->lexsize -= 1;
2862 lexer->txtend = lexer->lexsize;
2863 lexer->lexbuf[lexer->lexsize] = '\0';
2864 lexer->state = LEX_CONTENT;
2865 lexer->waswhite = no;
2866 lexer->token = AspToken(doc);
2867 node = lexer->token;
2868 GTDBG(doc,"ASP", node);
2869 return node; /* the endtag token */
2873 case LEX_JSTE: /* seen <# so look for "#>" */
2877 /* now look for '>' */
2878 c = TY_(ReadChar)(doc->docIn);
2883 TY_(UngetChar)(c, doc->docIn);
2887 lexer->lexsize -= 1;
2888 lexer->txtend = lexer->lexsize;
2889 lexer->lexbuf[lexer->lexsize] = '\0';
2890 lexer->state = LEX_CONTENT;
2891 lexer->waswhite = no;
2892 lexer->token = JsteToken(doc);
2893 node = lexer->token;
2894 GTDBG(doc,"JSTE", node);
2895 return node; /* the JSTE token */
2898 case LEX_PHP: /* seen "<?php" so look for "?>" */
2902 /* now look for '>' */
2903 c = TY_(ReadChar)(doc->docIn);
2907 TY_(UngetChar)(c, doc->docIn);
2911 lexer->lexsize -= 1;
2912 lexer->txtend = lexer->lexsize;
2913 lexer->lexbuf[lexer->lexsize] = '\0';
2914 lexer->state = LEX_CONTENT;
2915 lexer->waswhite = no;
2916 lexer->token = PhpToken(doc);
2917 node = lexer->token;
2918 GTDBG(doc,"PHP", node);
2919 return node; /* the PHP token */
2921 case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
2923 if (TY_(IsWhite)(c) && c != '?')
2926 /* get pseudo-attribute */
2935 TY_(UngetChar)(c, doc->docIn);
2937 name = ParseAttribute( doc, &isempty, &asp, &php );
2941 /* fix for http://tidy.sf.net/bug/788031 */
2942 lexer->lexsize -= 1;
2943 lexer->txtend = lexer->txtstart;
2944 lexer->lexbuf[lexer->txtend] = '\0';
2945 lexer->state = LEX_CONTENT;
2946 lexer->waswhite = no;
2947 lexer->token = XmlDeclToken(doc);
2948 lexer->token->attributes = attributes;
2949 node = lexer->token;
2950 GTDBG(doc,"xml", node);
2951 return node; /* the xml token */
/* collect one pseudo-attribute (name plus parsed value) onto the attribute list */
2954 av = TY_(NewAttribute)(doc);
2955 av->attribute = name;
2956 av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
2958 av->dict = TY_(FindAttribute)( doc, av );
2960 AddAttrToList( &attributes, av );
2964 /* now look for '>' */
2965 c = TY_(ReadChar)(doc->docIn);
2969 TY_(UngetChar)(c, doc->docIn);
2972 lexer->lexsize -= 1;
2973 lexer->txtend = lexer->txtstart;
2974 lexer->lexbuf[lexer->txtend] = '\0';
2975 lexer->state = LEX_CONTENT;
2976 lexer->waswhite = no;
2977 lexer->token = XmlDeclToken(doc);
2978 lexer->token->attributes = attributes;
2979 node = lexer->token;
2980 GTDBG(doc,"XML", node);
2981 return node; /* the XML token */
2983 case LEX_SECTION: /* seen "<![" so look for "]>" */
/* exactly "CDATA[" buffered so far: this is a CDATA section; drop those six characters */
2986 if (lexer->lexsize == (lexer->txtstart + 6) &&
2987 TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
2989 lexer->state = LEX_CDATA;
2990 lexer->lexsize -= 6;
2998 /* now look for '>' */
2999 c = TY_(ReadChar)(doc->docIn);
3004 /* Issue #153 - can also be ]'-->' */
3007 c = TY_(ReadChar)(doc->docIn);
3010 c = TY_(ReadChar)(doc->docIn);
3013 TY_(UngetChar)(c, doc->docIn);
3014 TY_(UngetChar)('-', doc->docIn);
3015 TY_(UngetChar)('-', doc->docIn);
3019 TY_(AddCharToLexer)(lexer, '-'); TY_(AddCharToLexer)(lexer, '-'); lexdump = 0;
3020 got output <![endif]--]> - needs furhter fix in pprint section output
3025 TY_(UngetChar)(c, doc->docIn);
3026 TY_(UngetChar)('-', doc->docIn);
3032 TY_(UngetChar)(c, doc->docIn);
3037 lexer->lexsize -= lexdump;
3038 lexer->txtend = lexer->lexsize;
3039 lexer->lexbuf[lexer->lexsize] = '\0';
3040 lexer->state = LEX_CONTENT;
3041 lexer->waswhite = no;
3042 lexer->token = SectionToken(doc);
3043 node = lexer->token;
3044 GTDBG(doc,"SECTION", node);
3045 return node; /* the SECTION token */
3047 case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
3051 /* now look for ']' */
3052 c = TY_(ReadChar)(doc->docIn);
3056 TY_(UngetChar)(c, doc->docIn);
3060 /* now look for '>' */
3061 c = TY_(ReadChar)(doc->docIn);
3065 TY_(UngetChar)(c, doc->docIn);
3066 TY_(UngetChar)(']', doc->docIn);
3070 lexer->lexsize -= 1;
3071 lexer->txtend = lexer->lexsize;
3072 lexer->lexbuf[lexer->lexsize] = '\0';
3073 lexer->state = LEX_CONTENT;
3074 lexer->waswhite = no;
3075 lexer->token = CDATAToken(doc);
3076 node = lexer->token;
3077 GTDBG(doc,"CDATA", node);
3078 return node; /* the CDATA token */
/* Tail handling (NOTE(review): appears to run once the read loop has ended;
   confirm): pending text is returned as a text token, and a comment still
   open at EndOfStream is reported and returned as a comment token. */
3082 if (lexer->state == LEX_CONTENT) /* text string */
3084 lexer->txtend = lexer->lexsize;
3086 if (lexer->txtend > lexer->txtstart)
3088 TY_(UngetChar)(c, doc->docIn);
/* drop a single trailing space from the pending text */
3090 if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
3092 lexer->lexsize -= 1;
3093 lexer->txtend = lexer->lexsize;
3095 lexer->token = TY_(TextToken)(lexer);
3096 #ifdef TIDY_STORE_ORIGINAL_TEXT
3097 StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */
3099 node = lexer->token;
3100 GTDBG(doc,"textstring", node);
3101 return node; /* the textstring token */
3104 else if (lexer->state == LEX_COMMENT) /* comment */
3106 if (c == EndOfStream)
3107 TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
3109 lexer->txtend = lexer->lexsize;
3110 lexer->lexbuf[lexer->lexsize] = '\0';
3111 lexer->state = LEX_CONTENT;
3112 lexer->waswhite = no;
3113 lexer->token = CommentToken(doc);
3114 node = lexer->token;
3115 GTDBG(doc,"COMMENT", node);
3116 return node; /* the COMMENT token */
3119 #if !defined(NDEBUG) && defined(_MSC_VER)
3120 SPRTF("Returning NULL...\n");
/*
 Reads a byte of `str` (advancing the pointer) as an unsigned index `i`.
 Judging by the calls made from InitMap, `code` carries the
 character-class flags meant for every character of `str`; the
 statement that combines `i` and `code` is not part of this
 listing -- TODO confirm.
*/
3125 static void MapStr( ctmbstr str, uint code )
3129 uint i = (byte) *str++;
/*
 Declares the character classes used by the lexer by handing MapStr
 one string of characters per flag combination.
*/
3134 void TY_(InitMap)(void)
/* CR, LF and form feed are flagged both as line breaks and as white space */
3136 MapStr("\r\n\f", newline|white);
3137 MapStr(" \t", white);
/* punctuation that is flagged as a name character */
3138 MapStr("-.:_", namechar);
3139 MapStr("0123456789", digit|digithex|namechar);
3140 MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
3141 MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
/* a-f and A-F were already mapped as letters above; this call gives them
   digithex as well (assumes MapStr accumulates flags -- TODO confirm) */
3142 MapStr("abcdefABCDEF", digithex);
3146 parser for ASP within start tags
3148 Some people use ASP to customize attributes
3149 Tidy isn't really well suited to dealing with ASP
3150 This is a workaround for attributes, but won't
3151 deal with the case where the ASP is used to tailor
3152 the attribute value. Here is an example of a work
3153 around for using ASP in attribute values:
3155 href='<%=rsSchool.Fields("ID").Value%>'
3157 where the ASP that generates the attribute value
3158 is masked from Tidy by the quotemarks.
/*
 Collects the text of an ASP section into the lexer buffer and, when
 there is any, wraps it in a token created by AspToken.

 Text is gathered from the current end of the buffer (txtstart is moved
 to lexsize first) by reading doc->docIn one character at a time; each
 read is checked against EndOfStream.  The test that recognises the end
 of the section is not part of this listing.
*/
3162 static Node *ParseAsp( TidyDocImpl* doc )
3164 Lexer* lexer = doc->lexer;
3168 lexer->txtstart = lexer->lexsize;
3172 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3175 TY_(AddCharToLexer)(lexer, c);
3181 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3184 TY_(AddCharToLexer)(lexer, c);
/* drop the last two buffered characters -- presumably the two-character
   terminator that closed the section (TODO confirm) */
3188 lexer->lexsize -= 2;
3193 lexer->txtend = lexer->lexsize;
/* a token is only built when some text was actually collected */
3194 if (lexer->txtend > lexer->txtstart)
3195 asp = AspToken(doc);
/* leave an empty text range behind for whatever is lexed next */
3197 lexer->txtstart = lexer->txtend;
3203 PHP is like ASP but is based upon XML
3204 processing instructions, e.g. <?php ... ?>
/*
 Collects the text of a PHP section into the lexer buffer and, when
 there is any, wraps it in a token created by PhpToken.  The visible
 statements mirror ParseAsp line for line.

 Text is gathered from the current end of the buffer (txtstart is moved
 to lexsize first) by reading doc->docIn one character at a time; each
 read is checked against EndOfStream.  The test that recognises the end
 of the section is not part of this listing.
*/
3206 static Node *ParsePhp( TidyDocImpl* doc )
3208 Lexer* lexer = doc->lexer;
3212 lexer->txtstart = lexer->lexsize;
3216 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3219 TY_(AddCharToLexer)(lexer, c);
3225 if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3228 TY_(AddCharToLexer)(lexer, c);
/* drop the last two buffered characters -- presumably the two-character
   terminator that closed the section (TODO confirm) */
3232 lexer->lexsize -= 2;
3237 lexer->txtend = lexer->lexsize;
/* a token is only built when some text was actually collected */
3238 if (lexer->txtend > lexer->txtstart)
3239 php = PhpToken(doc);
/* leave an empty text range behind for whatever is lexed next */
3241 lexer->txtstart = lexer->txtend;
/*
 Reads the next attribute name of a start tag from doc->docIn.

 asp, php - both are cleared on entry; one of them receives the node
            returned by ParseAsp / ParsePhp when such a section is met
            (the conditions selecting each case are not part of this
            listing)
 isempty  - not referenced in the lines visible here

 The name is accumulated in the lexer buffer from `start`; upper-case
 characters are lowered unless TidyXmlTags is set.  The result is a
 copy made with tmbstrndup, or NULL when no name characters were
 collected, and lexsize is rewound to `start` afterwards.

 Stray '>', '=', quote marks and end of input are reported through
 ReportAttrError (UNEXPECTED_GT, UNEXPECTED_EQUALSIGN,
 UNEXPECTED_QUOTEMARK, UNEXPECTED_END_OF_FILE_ATTR).
*/
3245 /* consumes the '>' terminating start tags */
3246 static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty,
3247 Node **asp, Node **php)
3249 Lexer* lexer = doc->lexer;
3254 *asp = NULL; /* clear asp pointer */
3255 *php = NULL; /* clear php pointer */
3257 /* skip white space before the attribute */
3261 c = TY_(ReadChar)( doc->docIn );
3266 c = TY_(ReadChar)( doc->docIn );
3274 TY_(UngetChar)(c, doc->docIn);
3284 c = TY_(ReadChar)(doc->docIn);
3288 *asp = ParseAsp( doc );
3293 *php = ParsePhp( doc );
/* neither case applied: push back the character and the '<' that
   preceded it, then report */
3297 TY_(UngetChar)(c, doc->docIn);
3298 TY_(UngetChar)('<', doc->docIn);
3299 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3305 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
3309 if (c == '"' || c == '\'')
3311 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3315 if (c == EndOfStream)
3317 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3318 TY_(UngetChar)(c, doc->docIn);
3323 if (!TY_(IsWhite)(c))
/* the name is collected in the lexer buffer starting at this offset */
3327 start = lexer->lexsize;
3332 /* but push back '=' for ParseValue() */
3333 if (c == '=' || c == '>')
3335 TY_(UngetChar)(c, doc->docIn);
3339 if (c == '<' || c == EndOfStream)
3341 TY_(UngetChar)(c, doc->docIn);
3345 if (lastc == '-' && (c == '"' || c == '\''))
3349 TY_(UngetChar)(c, doc->docIn);
3353 if (TY_(IsWhite)(c))
3356 /* what should be done about non-namechar characters? */
3357 /* currently these are incorporated into the attr name */
/* attribute names are folded to lower case unless XML tags are in use */
3359 if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
3360 c = TY_(ToLower)(c);
3362 TY_(AddCharToLexer)( lexer, c );
3364 c = TY_(ReadChar)(doc->docIn);
3367 /* handle attribute names with multibyte chars */
3368 len = lexer->lexsize - start;
3369 attr = (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3370 lexer->lexbuf+start, len) : NULL);
/* the name has been copied out, so give its buffer space back */
3371 lexer->lexsize = start;
3376 invoked when < is seen in place of attribute value
3377 but terminates on whitespace if not ASP, PHP or Tango
3378 this routine recognizes ' and " quoted strings
/*
 Copies a server-side instruction found in place of an attribute value
 into the lexer buffer.

 The first character read decides the mode: '%', '?' or '@' mark ASP,
 PHP or Tango; anything else is additionally ended by white space.
 In the two later read sequences (presumably the ' and " quoted-string
 cases -- TODO confirm), running into end of input or '>' is reported
 (UNEXPECTED_END_OF_FILE_ATTR / UNEXPECTED_GT) and the character is
 pushed back.

 Returns an int that ParseValue stores in *pdelim; how that value is
 chosen is not visible in this listing.
*/
3380 static int ParseServerInstruction( TidyDocImpl* doc )
3382 Lexer* lexer = doc->lexer;
3387 c = TY_(ReadChar)(doc->docIn);
3388 TY_(AddCharToLexer)(lexer, c);
3390 /* check for ASP, PHP or Tango */
3391 if (c == '%' || c == '?' || c == '@')
3396 c = TY_(ReadChar)(doc->docIn);
3398 if (c == EndOfStream)
3404 TY_(AddCharToLexer)(lexer, c);
3406 TY_(UngetChar)(c, doc->docIn);
3411 /* if not recognized as ASP, PHP or Tango */
3412 /* then also finish value on whitespace */
3415 if (TY_(IsWhite)(c))
3419 TY_(AddCharToLexer)(lexer, c);
3425 c = TY_(ReadChar)(doc->docIn);
3426 if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3428 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3429 TY_(UngetChar)(c, doc->docIn);
3432 if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3434 TY_(UngetChar)(c, doc->docIn);
3435 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3438 TY_(AddCharToLexer)(lexer, c);
3449 c = TY_(ReadChar)(doc->docIn);
3450 if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3452 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3453 TY_(UngetChar)(c, doc->docIn);
3456 if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3458 TY_(UngetChar)(c, doc->docIn);
3459 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3462 TY_(AddCharToLexer)(lexer, c);
/*
 Reads the value part of an attribute from doc->docIn.

 name     - attribute name; consulted through IsUrl / IsScript and
            compared against alt, title, value and prompt when deciding
            how white space is treated
 foldCase - when set, upper-case characters of the value are lowered
 isempty  - not referenced in the lines visible here
 pdelim   - receives the delimiter: the result of
            ParseServerInstruction for a server instruction, otherwise
            the recorded `delim`, or '"' when `delim` is zero

 The value is gathered in the lexer buffer from `start`, copied out
 with tmbstrndup, and lexsize is rewound to `start` afterwards.
*/
3471 /* values start with "=" or " = " etc. */
3472 /* doesn't consume the ">" at end of start tag */
3474 static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3475 Bool foldCase, Bool *isempty, int *pdelim)
3477 Lexer* lexer = doc->lexer;
3481 uint c, lastc, delim, quotewarning;
3484 delim = (tmbchar) 0;
3488 Henry Zrepa reports that some folk are using the
3489 embed element with script attributes where newlines
3490 are significant and must be preserved
3492 if ( cfgBool(doc, TidyLiteralAttribs) )
3495 /* skip white space before the '=' */
3499 c = TY_(ReadChar)(doc->docIn);
3501 if (c == EndOfStream)
3503 TY_(UngetChar)(c, doc->docIn);
3507 if (!TY_(IsWhite)(c))
3512 c should be '=' if there is a value
3513 other legal possibilities are white
3517 if (c != '=' && c != '"' && c != '\'')
3519 TY_(UngetChar)(c, doc->docIn);
3523 /* skip white space after '=' */
3527 c = TY_(ReadChar)(doc->docIn);
3529 if (c == EndOfStream)
3531 TY_(UngetChar)(c, doc->docIn);
3535 if (!TY_(IsWhite)(c))
3539 /* check for quote marks */
3541 if (c == '"' || c == '\'')
/* server instruction in place of a value: buffer it, hand its result to
   *pdelim, and return a copy of the buffered text (NULL when empty) */
3545 start = lexer->lexsize;
3546 TY_(AddCharToLexer)(lexer, c);
3547 *pdelim = ParseServerInstruction( doc );
3548 len = lexer->lexsize - start;
3549 lexer->lexsize = start;
3550 return (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3551 lexer->lexbuf+start, len) : NULL);
3554 TY_(UngetChar)(c, doc->docIn);
3557 and read the value string
3558 check for quote mark if needed
/* the value is collected in the lexer buffer starting at this offset */
3562 start = lexer->lexsize;
3567 lastc = c; /* track last character */
3568 c = TY_(ReadChar)(doc->docIn);
3570 if (c == EndOfStream)
3572 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3573 TY_(UngetChar)(c, doc->docIn);
/* no delimiter recorded, i.e. the unquoted-value case */
3577 if (delim == (tmbchar)0)
3581 TY_(UngetChar)(c, doc->docIn);
3585 if (c == '"' || c == '\'')
3589 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3591 /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3592 /* this doesn't handle <a title=foo"/> which browsers treat as */
3593 /* 'foo"/' nor <a title=foo" /> which browsers treat as 'foo"' */
3595 c = TY_(ReadChar)(doc->docIn);
3598 TY_(AddCharToLexer)(lexer, q);
3599 TY_(UngetChar)(c, doc->docIn);
3604 TY_(UngetChar)(c, doc->docIn);
3611 TY_(UngetChar)(c, doc->docIn);
3613 TY_(UngetChar)(c, doc->docIn);
3614 TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3619 For cases like <br clear=all/> need to avoid treating /> as
3620 part of the attribute value, however care is needed to avoid
3621 so treating <a href=http://www.acme.com/> in this way, which
3622 would map the <a> tag to <a href="http://www.acme.com"/>
3626 /* peek ahead in case of /> */
3627 c = TY_(ReadChar)(doc->docIn);
3629 if ( c == '>' && !TY_(IsUrl)(doc, name) )
3632 TY_(UngetChar)(c, doc->docIn);
3636 /* unget peeked character */
3637 TY_(UngetChar)(c, doc->docIn);
3641 else /* delim is '\'' or '"' */
3646 if (c == '\n' || c == '<' || c == '>')
3655 TY_(AddCharToLexer)(lexer, c);
3656 ParseEntity( doc, IgnoreWhitespace );
/* when `munge` is set, a newline left at the end of the buffer is
   replaced by a space */
3657 if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
3658 ChangeChar(lexer, ' ');
3663 kludge for JavaScript attribute values
3664 with line continuations in string literals
3668 c = TY_(ReadChar)(doc->docIn);
3672 TY_(UngetChar)(c, doc->docIn);
3677 if (TY_(IsWhite)(c))
3684 /* discard line breaks in quoted URLs */
3685 /* #438650 - fix by Randy Waki */
3686 if ( c == '\n' && TY_(IsUrl)(doc, name) )
3688 /* warn that we discard this newline */
3689 TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI);
3697 if (TY_(IsUrl)(doc, name) )
3698 TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI);
3703 else if (foldCase && TY_(IsUpper)(c))
3704 c = TY_(ToLower)(c);
3706 TY_(AddCharToLexer)(lexer, c);
3709 if (quotewarning > 10 && seen_gt && munge)
3712 there is almost certainly a missing trailing quote mark
3713 as we have see too many newlines, < or > characters.
3715 an exception is made for Javascript attributes and the
3716 javascript URL scheme which may legitimately include < and >,
3717 and for attributes starting with "<xml " as generated by
3720 if ( !TY_(IsScript)(doc, name) &&
3721 !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) &&
3722 !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0)
3724 TY_(ReportFatal)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE );
/* measure what was collected, then give the buffer space back */
3727 len = lexer->lexsize - start;
3728 lexer->lexsize = start;
3731 if (len > 0 || delim)
3733 /* ignore leading and trailing white space for all but title, alt, value */
3734 /* and prompts attributes unless --literal-attributes is set to yes */
3735 /* #994841 - Whitespace is removed from value attributes */
3738 TY_(tmbstrcasecmp)(name, "alt") &&
3739 TY_(tmbstrcasecmp)(name, "title") &&
3740 TY_(tmbstrcasecmp)(name, "value") &&
3741 TY_(tmbstrcasecmp)(name, "prompt"))
/* NOTE(review): this block is entered with len == 0 when only `delim`
   is set, and the condition below has no len > 0 guard, so it can index
   lexbuf[start-1]; the loop body is not in this listing -- confirm */
3743 while (TY_(IsWhite)(lexer->lexbuf[start+len-1]))
/* NOTE(review): `start < len` compares a buffer offset with a length,
   which looks unintended -- confirm against the loop body */
3746 while (TY_(IsWhite)(lexer->lexbuf[start]) && start < len)
3753 value = TY_(tmbstrndup)(doc->allocator, lexer->lexbuf + start, len);
3758 /* note delimiter if given */
3759 *pdelim = (delim ? delim : '"');
/*
 Checks an attribute name: the first character is tested with IsLetter
 and the following ones with IsNamechar.  The statements that produce
 the Bool result are not part of this listing.
*/
3764 /* attr must be non-NULL */
3765 static Bool IsValidAttrName( ctmbstr attr )
3767 uint i, c = attr[0];
3769 /* first character should be a letter */
3770 if (!TY_(IsLetter)(c))
3773 /* remaining characters should be namechars */
/* NOTE(review): tmbstrlen(attr) is re-evaluated on every iteration */
3774 for( i = 1; i < TY_(tmbstrlen)(attr); i++)
3778 if (TY_(IsNamechar)(c))
3787 /* create a new attribute */
/* The AttVal is obtained from TidyDocAlloc and then passed to
   TidyClearMemory over its full size before use. */
3788 AttVal *TY_(NewAttribute)( TidyDocImpl* doc )
3790 AttVal *av = (AttVal*) TidyDocAlloc( doc, sizeof(AttVal) );
3791 TidyClearMemory( av, sizeof(AttVal) );
3795 /* create a new attribute with given name and value */
/* `name` and `value` are duplicated with tmbstrdup using doc->allocator,
   and the attribute's dict entry is looked up with FindAttribute.  The
   rest of the parameter list is not part of this listing. */
3796 AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
3799 AttVal *av = TY_(NewAttribute)(doc);
3800 av->attribute = TY_(tmbstrdup)(doc->allocator, name);
3801 av->value = TY_(tmbstrdup)(doc->allocator, value);
3803 av->dict = TY_(FindAttribute)( doc, av );
/*
 Adds `av` to the list whose head pointer is `list`.  An empty list is
 handled as a separate case; otherwise the list is walked from the head
 for as long as a next node exists, i.e. up to its last node.  The
 assignments that store `av` are not part of this listing.
*/
3807 static void AddAttrToList( AttVal** list, AttVal* av )
3809 if ( *list == NULL )
3813 AttVal* here = *list;
3814 while ( here->next )
/* Adds `av` to node's attribute list by passing the list head to
   AddAttrToList. */
3820 void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av )
3822 AddAttrToList(&node->attributes, av);
/* Makes `av` the new head of node's attribute list; the previous head
   becomes av->next. */
3825 void TY_(InsertAttributeAtStart)( Node *node, AttVal *av )
3827 av->next = node->attributes;
3828 node->attributes = av;
/*
 Builds the attribute list of a start tag.

 Loops until EndOfInput(doc); each round asks ParseAttribute for a name.
 The NULL-name branch is where fresh attributes are created for ASP and
 PHP markup and appended to the list; the conditions guarding those
 steps and the loop exit are not part of this listing.  Otherwise
 ParseValue reads the value (with foldCase off) and a new attribute is
 appended when the name passes IsValidAttrName, or IsValidXMLAttrName
 while TidyXmlTags is set.  A rejected name is reported as
 MISSING_QUOTEMARK (name ends in '"'), MISSING_ATTR_VALUE (no value) or
 INVALID_ATTRIBUTE, and the temporary attribute is freed.
*/
3831 /* swallows closing '>' */
3833 static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
3835 Lexer* lexer = doc->lexer;
3843 while ( !EndOfInput(doc) )
3845 tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
3847 if (attribute == NULL)
3849 /* check if attributes are created by ASP markup */
3852 av = TY_(NewAttribute)(doc);
3854 AddAttrToList( &list, av );
3858 /* check if attributes are created by PHP markup */
3861 av = TY_(NewAttribute)(doc);
3863 AddAttrToList( &list, av );
3870 value = ParseValue( doc, attribute, no, isempty, &delim );
3872 if (attribute && (IsValidAttrName(attribute) ||
3873 (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
3875 av = TY_(NewAttribute)(doc);
3877 av->attribute = attribute;
3879 av->dict = TY_(FindAttribute)( doc, av );
3880 AddAttrToList( &list, av );
/* rejected name: build a throw-away attribute so the report can refer
   to it, then free it again */
3884 av = TY_(NewAttribute)(doc);
3885 av->attribute = attribute;
3888 if (LastChar(attribute) == '"')
3889 TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
3890 else if (value == NULL)
3891 TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
3893 TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);
3895 TY_(FreeAttribute)( doc, av );
3903 Returns document type declarations like
3905 <!DOCTYPE foo PUBLIC "fpi" "sysid">
3906 <!DOCTYPE bar SYSTEM "sysid">
3907 <!DOCTYPE baz [ <!ENTITY ouml "ö"> ]>
3911 <foo PUBLIC="fpi" SYSTEM="sysid" />
3912 <bar SYSTEM="sysid" />
3913 <baz> <!ENTITY ouml "&#246"> </baz>
/*
 Parses the remainder of a document type declaration into a DocTypeTag
 node.

 Characters are read until EndOfStream and appended to the lexer
 buffer; outside the internal subset, newlines become spaces and a run
 of white space is stored only once.  The states DT_DOCTYPENAME,
 DT_INTERMEDIATE, DT_PUBLICSYSTEM, DT_QUOTEDSTRING and DT_INTSUBSET
 decide how each character is used:
   - the doctype name becomes node->element
   - a quoted string is added as a PUBLIC or SYSTEM attribute,
     depending on `hasfpi`, which is set from the keyword read in
     DT_PUBLICSYSTEM
   - the internal subset is attached to the node as a text token

 MALFORMED_DOCTYPE is reported and the node freed when the element name
 is missing or fails IsValidXMLElemName, and again at the end of the
 function for a declaration that was not finished.
*/
3915 static Node *ParseDocTypeDecl(TidyDocImpl* doc)
3917 Lexer *lexer = doc->lexer;
3918 int start = lexer->lexsize;
3919 ParseDocTypeDeclState state = DT_DOCTYPENAME;
3924 Node* node = TY_(NewNode)(lexer->allocator, lexer);
3925 node->type = DocTypeTag;
3926 node->start = lexer->txtstart;
3927 node->end = lexer->txtend;
3929 lexer->waswhite = no;
3931 /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */
3933 while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
3935 /* convert newlines to spaces */
3936 if (state != DT_INTSUBSET)
3937 c = c == '\n' ? ' ' : c;
3939 /* convert white-space sequences to single space character */
3940 if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
3942 if (!lexer->waswhite)
3944 TY_(AddCharToLexer)(lexer, c);
3945 lexer->waswhite = yes;
3955 TY_(AddCharToLexer)(lexer, c);
3956 lexer->waswhite = no;
3961 case DT_INTERMEDIATE:
3962 /* determine what's next */
3963 if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
/* start one position back -- presumably because the keyword's first
   letter has already been appended to the buffer (TODO confirm) */
3965 start = lexer->lexsize - 1;
3966 state = DT_PUBLICSYSTEM;
3971 start = lexer->lexsize;
3972 state = DT_INTSUBSET;
3975 else if (c == '\'' || c == '"')
3977 start = lexer->lexsize;
3979 state = DT_QUOTEDSTRING;
3986 node->end = --(lexer->lexsize);
/* a SYSTEM attribute, when looked up here, is passed to CheckUrl */
3988 si = TY_(GetAttrByName)(node, "SYSTEM");
3990 TY_(CheckUrl)(doc, node, si);
3992 if (!node->element || !IsValidXMLElemName(node->element))
3994 TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
3995 TY_(FreeNode)(doc, node);
3998 #ifdef TIDY_STORE_ORIGINAL_TEXT
3999 StoreOriginalTextInToken(doc, node, 0);
4008 case DT_DOCTYPENAME:
4009 /* read document type name */
4010 if (TY_(IsWhite)(c) || c == '>' || c == '[')
/* copy the buffered name; the "- 1" presumably leaves out the
   terminating character that was just appended (TODO confirm) */
4012 node->element = TY_(tmbstrndup)(doc->allocator,
4013 lexer->lexbuf + start,
4014 lexer->lexsize - start - 1);
4015 if (c == '>' || c == '[')
4018 TY_(UngetChar)(c, doc->docIn);
4021 state = DT_INTERMEDIATE;
4025 case DT_PUBLICSYSTEM:
4026 /* read PUBLIC/SYSTEM */
4027 if (TY_(IsWhite)(c) || c == '>')
4029 char *attname = TY_(tmbstrndup)(doc->allocator,
4030 lexer->lexbuf + start,
4031 lexer->lexsize - start - 1);
/* any keyword other than SYSTEM (compared ignoring case) sets hasfpi */
4032 hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);
4034 TidyDocFree(doc, attname);
4036 /* todo: report an error if SYSTEM/PUBLIC not uppercase */
4041 TY_(UngetChar)(c, doc->docIn);
4044 state = DT_INTERMEDIATE;
4048 case DT_QUOTEDSTRING:
4049 /* read quoted string */
4052 char *value = TY_(tmbstrndup)(doc->allocator,
4053 lexer->lexbuf + start,
4054 lexer->lexsize - start - 1);
4055 AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
/* the temporary copy is released right after AddAttribute returns */
4056 TidyDocFree(doc, value);
4059 state = DT_INTERMEDIATE;
4065 /* read internal subset */
/* the subset text becomes a text token appended to the doctype node */
4069 lexer->txtstart = start;
4070 lexer->txtend = lexer->lexsize - 1;
4071 subset = TY_(TextToken)(lexer);
4072 TY_(InsertNodeAtEnd)(node, subset);
4073 state = DT_INTERMEDIATE;
4079 /* document type declaration not finished */
4080 TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4081 TY_(FreeNode)(doc, node);
4088 * indent-tabs-mode: nil
4090 * eval: (c-set-offset 'substatement-open 0)