2 clean.c -- clean up misuse of presentation markup
4 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5 See tidy.h for the copyright notice.
7 Filters from other formats such as Microsoft Word
8 often make excessive use of presentation markup such
9 as font tags, B, I, and the align attribute. By applying
10 a set of production rules, it is straightforward to
11 transform this to use CSS.
13 Some rules replace some of the children of an element by
14 style properties on the element, e.g.
16 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
18 Such rules are applied to the element's content and then
19 to the element itself until no more rules apply.
20 Having applied all the rules to an element, it will have
21 a style attribute with one or more properties.
23 Other rules strip the element they apply to, replacing
24 it by style properties on the contents, e.g.
26 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
28 These rules are applied to an element before processing
29 its content and replace the current element by the first
30 element in the exposed content.
32 After applying both sets of rules, you can replace the
33 style attribute by a class value and style rule in the
34 document head. To support this, an association of styles
35 and class names is built.
37 A naive approach is to rely on string matching to test
38 when two property lists are the same. A better approach
39 would be to first sort the properties before matching.
/* Forward declaration: CleanNode is defined further down but is already called from NestedList. */
56 static Node* CleanNode( TidyDocImpl* doc, Node *node );
/*
  Rename an element in place: look up the tag definition for
  tid, release the node's current element name and store a
  fresh copy of the definition's name.
*/
58 static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
60 const Dict* dict = TY_(LookupTagDef)( tid );
61 TidyDocFree( doc, node->element );
62 node->element = TY_(tmbstrdup)( doc->allocator, dict->name );
/*
  Release style property storage: the name and value strings
  are freed before the StyleProp record that holds them.
*/
66 static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
73 TidyDocFree(doc, props->name);
74 TidyDocFree(doc, props->value);
75 TidyDocFree(doc, props);
/*
  Insert a name/value pair into the property list props.
  Names are compared with tmbstrcmp to find the position; a
  name that is already present keeps its existing value and
  the new value is ignored. A newly created entry owns
  duplicated copies of name and value.
  CreateProps assigns the return value back to its list
  pointer, so the result is presumably the list head.
*/
80 static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
82 StyleProp *first, *prev, *prop;
90 cmp = TY_(tmbstrcmp)(props->name, name);
94 /* this property is already defined, ignore new value */
100 /* insert before this */
102 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
103 prop->name = TY_(tmbstrdup)(doc->allocator, name);
104 prop->value = TY_(tmbstrdup)(doc->allocator, value);
/* second allocation site, for a position not handled by the insert-before case above */
119 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
120 prop->name = TY_(tmbstrdup)(doc->allocator, name);
121 prop->value = TY_(tmbstrdup)(doc->allocator, value);
133 Create sorted linked list of properties from style string
134 It temporarily places nulls in place of ':' and ';' to
135 delimit the strings for the property name and value.
136 Some systems don't allow you to NULL literal strings,
137 so to avoid this, a copy is made first.
/*
  Parse the style string and add each name/value pair to the
  list prop through InsertProperty, keeping the updated list
  pointer in prop. All scanning is done on a private
  duplicate of style, which is released afterwards.
*/
139 static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
141 tmbstr name, value = NULL, name_end, value_end, line;
/* work on a copy so the caller's string is never written to */
144 line = TY_(tmbstrdup)(doc->allocator, style);
/* a ':' marks the end of the property name; the value starts right behind it */
156 if (*name_end == ':')
158 value = name_end + 1;
165 if (*name_end != ':')
/* skip blanks in front of the value */
168 while ( value && *value == ' ')
/* a ';' marks the end of the value */
176 if (*value_end == ';')
188 prop = InsertProperty(doc, prop, name, value);
/* scanning for the next name resumes one past value_end */
194 name = value_end + 1;
201 TidyDocFree(doc, line); /* free temporary copy */
/*
  Serialise a property list into a newly allocated string.
  The first loop adds up the space needed, budgeting two
  extra characters for every name and for every value; the
  buffer is allocated with one more byte, and the second loop
  writes the properties into it.
*/
205 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
213 for (len = 0, prop = props; prop; prop = prop->next)
215 len += TY_(tmbstrlen)(prop->name) + 2;
217 len += TY_(tmbstrlen)(prop->value) + 2;
220 style = (tmbstr) TidyDocAlloc(doc, len+1);
223 for (p = style, prop = props; prop; prop = prop->next)
/* the last property is singled out here (presumably to avoid a trailing separator - confirm) */
240 if (prop->next == NULL)
252 create string with merged properties
253 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
258 prop = CreateProps(doc, NULL, style);
259 prop = CreateProps(doc, prop, property);
260 line = CreatePropString(doc, prop);
261 FreeStyleProps(doc, prop);
/*
  Free the TagStyle records on the lexer's style list. For
  each record the tag name, class name and property string
  are released, followed by the record itself.
*/
266 void TY_(FreeStyles)( TidyDocImpl* doc )
268 Lexer* lexer = doc->lexer;
271 TagStyle *style, *next;
/* the loop advances through next, not style->next, because style is freed in the body */
272 for ( style = lexer->styles; style; style = next )
275 TidyDocFree( doc, style->tag );
276 TidyDocFree( doc, style->tag_class );
277 TidyDocFree( doc, style->properties );
278 TidyDocFree( doc, style );
/*
  Generate a new class name: the configured TidyCSSPrefix
  followed by the next value of the document's class counter
  (doc->nClassId is incremented first). An unset or empty
  prefix is special-cased before formatting. The name is
  returned as a newly duplicated string.
*/
283 static tmbstr GensymClass( TidyDocImpl* doc )
285 tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
286 ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
287 if ( pfx == NULL || *pfx == 0 )
290 TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
291 return TY_(tmbstrdup)(doc->allocator, buf);
/*
  Return the class name that stands for the tag/properties
  pair. The lexer's style list is searched for an entry whose
  tag and properties both compare equal with tmbstrcmp. If
  none is found, a new entry holding copies of tag and
  properties and a generated class name is pushed onto the
  front of the list. The returned string is the entry's own
  tag_class, not a copy.
*/
294 static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
296 Lexer* lexer = doc->lexer;
299 for (style = lexer->styles; style; style=style->next)
301 if (TY_(tmbstrcmp)(style->tag, tag) == 0 &&
302 TY_(tmbstrcmp)(style->properties, properties) == 0)
303 return style->tag_class;
/* no match: record a new association and hand back its generated class name */
306 style = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
307 style->tag = TY_(tmbstrdup)(doc->allocator, tag);
308 style->tag_class = GensymClass( doc );
309 style->properties = TY_(tmbstrdup)( doc->allocator, properties );
310 style->next = lexer->styles;
311 lexer->styles = style;
312 return style->tag_class;
316 Add class="foo" to node
/*
  Give node the class classname: an existing class attribute
  is extended through AppendToClassAttr, otherwise a new
  class attribute is added.
  NOTE(review): the declaration of classattr ends in a stray
  second ';' (harmless empty statement).
*/
318 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
320 AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
323 if there already is a class attribute
324 then append class name after a space.
327 TY_(AppendToClassAttr)( doc, classattr, classname );
328 else /* create new class attribute */
329 TY_(AddAttribute)( doc, node, "class", classname );
/*
  Map stylevalue to a class name for this node's element
  name via FindStyle and attach that class to node with
  AddClass.
*/
332 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
336 classname = FindStyle( doc, node->element, stylevalue );
337 AddClass( doc, node, classname);
341 Find style attribute in node, and replace it
342 by corresponding class attribute. Search for
343 class in style dictionary otherwise gensym
344 new class and add to dictionary.
346 Assumes that node doesn't have a class attribute
/*
  Turn node's style attribute into a class reference. A style
  attribute without a value is removed. Otherwise FindStyle
  supplies the class name for the element/style pair; it is
  appended to an existing class attribute (and the style
  attribute removed), or the style attribute record itself is
  recycled as the class attribute.
  NOTE(review): the remark inside the body says the name is
  appended "after an underscore", while AddClass describes
  the same AppendToClassAttr call as appending after a
  space - confirm which is right.
*/
348 static void Style2Rule( TidyDocImpl* doc, Node *node)
350 AttVal *styleattr, *classattr;
353 styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
357 /* fix for http://tidy.sf.net/bug/850215 */
358 if (!styleattr->value)
360 TY_(RemoveAttribute)(doc, node, styleattr);
364 classname = FindStyle( doc, node->element, styleattr->value );
365 classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
368 if there already is a class attribute
369 then append class name after an underscore
373 TY_(AppendToClassAttr)( doc, classattr, classname );
374 TY_(RemoveAttribute)( doc, node, styleattr );
376 else /* reuse style attribute for class attribute */
378 TidyDocFree(doc, styleattr->attribute);
379 TidyDocFree(doc, styleattr->value);
380 styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
381 styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
/*
  Append the CSS rule "<selector> { color: <color> }" and a
  newline to the lexer buffer. Nothing is written unless both
  selector and color are non-NULL.
*/
386 static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
388 if ( selector && color )
390 TY_(AddStringLiteral)(lexer, selector);
391 TY_(AddStringLiteral)(lexer, " { color: ");
392 TY_(AddStringLiteral)(lexer, color);
393 TY_(AddStringLiteral)(lexer, " }\n");
398 move presentation attribs from body to style element
400 background="foo" -> body { background-image: url(foo) }
401 bgcolor="foo" -> body { background-color: foo }
402 text="foo" -> body { color: foo }
403 link="foo" -> :link { color: foo }
404 vlink="foo" -> :visited { color: foo }
405 alink="foo" -> :active { color: foo }
/*
  Write CSS for the presentational attributes of body into
  the lexer buffer and remove those attributes from the
  element. background, bgcolor and text are collected first
  and emitted together as one "body { ... }" rule; link,
  vlink and alink each produce their own rule through
  AddColorRule.
  NOTE(review): bgurl, bgcolor and color are freed here after
  their attribute has been removed, so ownership of the value
  must have been taken from the attribute beforehand -
  confirm for all three.
*/
407 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
409 Lexer* lexer = doc->lexer;
411 tmbstr bgcolor = NULL;
415 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
419 TY_(RemoveAttribute)( doc, body, attr );
422 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
424 bgcolor = attr->value;
426 TY_(RemoveAttribute)( doc, body, attr );
429 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
433 TY_(RemoveAttribute)( doc, body, attr );
/* open a body rule only when at least one declaration will go into it */
436 if ( bgurl || bgcolor || color )
438 TY_(AddStringLiteral)(lexer, " body {\n");
441 TY_(AddStringLiteral)(lexer, " background-image: url(");
442 TY_(AddStringLiteral)(lexer, bgurl);
443 TY_(AddStringLiteral)(lexer, ");\n");
444 TidyDocFree(doc, bgurl);
448 TY_(AddStringLiteral)(lexer, " background-color: ");
449 TY_(AddStringLiteral)(lexer, bgcolor);
450 TY_(AddStringLiteral)(lexer, ";\n");
451 TidyDocFree(doc, bgcolor);
455 TY_(AddStringLiteral)(lexer, " color: ");
456 TY_(AddStringLiteral)(lexer, color);
457 TY_(AddStringLiteral)(lexer, ";\n");
458 TidyDocFree(doc, color);
461 TY_(AddStringLiteral)(lexer, " }\n");
/* link colours: the rule is written while attr->value is still attached, then the attribute is dropped */
464 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
466 AddColorRule(lexer, " :link", attr->value);
467 TY_(RemoveAttribute)( doc, body, attr );
470 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
472 AddColorRule(lexer, " :visited", attr->value);
473 TY_(RemoveAttribute)( doc, body, attr );
476 if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
478 AddColorRule(lexer, " :active", attr->value);
479 TY_(RemoveAttribute)( doc, body, attr );
/*
  Inspect the document's body element for presentational
  attributes (background, bgcolor, text, link, vlink, alink).
  When any of them is present the USING_BODY bit is set in
  doc->badLayout.
  CreateStyleElement combines the result with an empty style
  list to decide whether there is anything to emit.
*/
483 static Bool NiceBody( TidyDocImpl* doc )
485 Node* node = TY_(FindBody)(doc);
488 if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
489 TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
490 TY_(AttrGetById)(node, TidyAttr_TEXT) ||
491 TY_(AttrGetById)(node, TidyAttr_LINK) ||
492 TY_(AttrGetById)(node, TidyAttr_VLINK) ||
493 TY_(AttrGetById)(node, TidyAttr_ALINK))
495 doc->badLayout |= USING_BODY;
503 /* create style element using rules from dictionary */
/*
  Build an implicit <style type="text/css"> element and add
  it to the end of the document head (when a head is found).
  Its text is assembled in the lexer buffer between txtstart
  and txtend: first the rules produced by CleanBodyAttrs,
  then one " tag.class {properties}" line per entry of the
  lexer's style list. The text is attached to the element as
  a text token.
*/
504 static void CreateStyleElement( TidyDocImpl* doc )
506 Lexer* lexer = doc->lexer;
507 Node *node, *head, *body;
/* early check: no recorded styles and NiceBody(doc) true */
511 if ( lexer->styles == NULL && NiceBody(doc) )
514 node = TY_(NewNode)( doc->allocator, lexer );
515 node->type = StartTag;
516 node->implicit = yes;
517 node->element = TY_(tmbstrdup)(doc->allocator, "style");
518 TY_(FindTag)( doc, node );
520 /* insert type attribute */
521 av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
522 TY_(InsertAttributeAtStart)( node, av );
524 body = TY_(FindBody)( doc );
/* everything appended to the lexer buffer from here up to txtend becomes the style text */
525 lexer->txtstart = lexer->lexsize;
527 CleanBodyAttrs( doc, body );
529 for (style = lexer->styles; style; style = style->next)
531 TY_(AddCharToLexer)(lexer, ' ');
532 TY_(AddStringLiteral)(lexer, style->tag);
533 TY_(AddCharToLexer)(lexer, '.');
534 TY_(AddStringLiteral)(lexer, style->tag_class);
535 TY_(AddCharToLexer)(lexer, ' ');
536 TY_(AddCharToLexer)(lexer, '{');
537 TY_(AddStringLiteral)(lexer, style->properties);
538 TY_(AddCharToLexer)(lexer, '}');
539 TY_(AddCharToLexer)(lexer, '\n');
542 lexer->txtend = lexer->lexsize;
544 TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
547 now insert style element into document head
549 doc is root node. search its children for html node
550 the head node should be first child of html node
552 if ( NULL != (head = TY_(FindHEAD)( doc )) )
553 TY_(InsertNodeAtEnd)( head, node );
557 /* ensure bidirectional links are consistent */
/*
  Make the nodes around node point back at it. On the left
  side either the previous sibling's next or the parent's
  content is set to node; on the right side either the next
  sibling's prev or the parent's last (the guarding
  conditions pick one of each pair). Finally every child of
  node gets node as its parent.
*/
558 void TY_(FixNodeLinks)(Node *node)
563 node->prev->next = node;
565 node->parent->content = node;
568 node->next->prev = node;
570 node->parent->last = node;
572 for (child = node->content; child; child = child->next)
573 child->parent = node;
577 used to strip child of node when
578 the node has one and only one child
/*
  Remove the single child of node and adopt its children:
  node takes over the child's content and last pointers, the
  emptied child is freed, and the adopted nodes are
  re-parented to node.
*/
580 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
584 child = node->content;
585 node->content = child->content;
586 node->last = child->last;
/* clear the link first so the child is freed without its former content (presumably FreeNode would otherwise follow it) */
587 child->content = NULL;
588 TY_(FreeNode)(doc, child);
590 for (child = node->content; child; child = child->next)
591 child->parent = node;
595 used to strip font start and end tags.
596 Extricate "element", replace it by its content and delete it.
/*
  Remove element but keep its children, splicing them into
  the parent's child list at the place element occupied.
  The sibling and parent links on both ends are patched, the
  children are re-parented, and *pnode receives the first
  spliced child. element is then unlinked from its content
  and freed. An element without content is handed to
  DiscardElement instead, and *pnode receives that result.
*/
598 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
600 if (element->content)
602 Node *node, *parent = element->parent;
604 element->last->next = element->next;
608 element->next->prev = element->last;
611 parent->last = element->last;
615 element->content->prev = element->prev;
616 element->prev->next = element->content;
619 parent->content = element->content;
621 for (node = element->content; node; node = node->next)
622 node->parent = parent;
624 *pnode = element->content;
/* cut the links so that freeing element does not touch the spliced nodes */
626 element->next = element->content = NULL;
627 TY_(FreeNode)(doc, element);
631 *pnode = TY_(DiscardElement)(doc, element);
636 Create new string that consists of the
637 combined style properties in s1 and s2
639 To merge property lists, we build a linked
640 list of property/values and insert properties
641 into the list in order, merging values for
642 the same property name.
/*
  Build the merged property string for s1 and s2: both are
  parsed into one property list (s1 first), the list is
  serialised with CreatePropString and then freed.
  Because InsertProperty ignores a new value for a name that
  is already defined, a property present in both strings
  keeps the value from s1.
*/
644 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
649 prop = CreateProps(doc, NULL, s1);
650 prop = CreateProps(doc, prop, s2);
651 s = CreatePropString(doc, prop);
652 FreeStyleProps(doc, prop);
657 Add style property to element, creating style
658 attribute as needed and adding ; delimiter
/*
  Add property to node's style attribute.
  - style attribute with a value: the value is replaced by
    the merge of the old value and property;
  - style attribute without a value: it receives a copy of
    property;
  - no style attribute: a new one is created and inserted at
    the start of the attribute list.
*/
660 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
662 AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
664 /* if style attribute already exists then insert property */
668 if (av->value != NULL)
670 tmbstr s = MergeProperties( doc, av->value, property );
671 TidyDocFree( doc, av->value );
676 av->value = TY_(tmbstrdup)( doc->allocator, property );
679 else /* else create new style attribute */
681 av = TY_(NewAttributeEx)( doc, "style", property, '"' );
682 TY_(InsertAttributeAtStart)( node, av );
/*
  Combine the class names of child into node. s2 is looked
  up among child's attributes and s1 among node's. When both
  exist a new string of l1 + l2 + 2 bytes is built: s1 is
  copied to the start and s2 one position past the end of
  s1, which leaves room for a single separator character.
  The old attribute value is then freed. When only child has
  class names, a class attribute holding s2 is inserted at
  the start of node's attributes.
*/
686 static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
689 tmbstr s1, s2, names;
691 for (s2 = NULL, av = child->attributes; av; av = av->next)
700 for (s1 = NULL, av = node->attributes; av; av = av->next)
711 if (s2) /* merge class names from both */
714 l1 = TY_(tmbstrlen)(s1);
715 l2 = TY_(tmbstrlen)(s2);
716 names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
717 TY_(tmbstrcpy)(names, s1);
719 TY_(tmbstrcpy)(names+l1+1, s2);
720 TidyDocFree(doc, av->value);
724 else if (s2) /* copy class names from child */
726 av = TY_(NewAttributeEx)( doc, "class", s2, '"' );
727 TY_(InsertAttributeAtStart)( node, av );
/*
  Merge the class and style information of child into node.
  Classes are handled first through MergeClasses. For styles,
  s2 is looked up among child's attributes and s1 among
  node's: when both exist the old value is replaced by
  MergeProperties(s1, s2); when only child has a style, a
  style attribute holding s2 is inserted at the start of
  node's attributes.
*/
731 static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
734 tmbstr s1, s2, style;
737 the child may have a class attribute used
738 for attaching styles, if so the class name
739 needs to be copied to node's class
741 MergeClasses(doc, node, child);
743 for (s2 = NULL, av = child->attributes; av; av = av->next)
752 for (s1 = NULL, av = node->attributes; av; av = av->next)
763 if (s2) /* merge styles from both */
765 style = MergeProperties(doc, s1, s2);
766 TidyDocFree(doc, av->value);
770 else if (s2) /* copy style of child */
772 av = TY_(NewAttributeEx)( doc, "style", s2, '"' );
773 TY_(InsertAttributeAtStart)( node, av );
/*
  Translate an HTML font size value into a CSS font-size
  string using three lookup tables: sizes for a first
  character in '0'..'6', and minussizes / plussizes (steps of
  0.8 and 1.2) when the second character is in '0'..'6' -
  presumably for values with a leading sign. "smaller" and
  "larger" are the fallbacks of the two relative branches.
  NOTE(review): sizes[3] is NULL, so if that table is indexed
  with n a NULL result is possible; callers have to allow
  for it.
*/
777 static ctmbstr FontSize2Name(ctmbstr size)
779 static const ctmbstr sizes[7] =
781 "60%", "70%", "80%", NULL,
782 "120%", "150%", "200%"
785 /* increment of 0.8 */
786 static const ctmbstr minussizes[] =
788 "100%", "80%", "64%", "51%",
792 /* increment of 1.2 */
793 static const ctmbstr plussizes[] =
795 "100%", "120%", "144%", "172%",
796 "207%", "248%", "298%"
802 if ('0' <= size[0] && size[0] <= '6')
804 int n = size[0] - '0';
810 if ('0' <= size[1] && size[1] <= '6')
812 int n = size[1] - '0';
813 return minussizes[n];
815 return "smaller"; /*"70%"; */
818 if ('0' <= size[1] && size[1] <= '6')
820 int n = size[1] - '0';
824 return "larger"; /* "140%" */
/* Format "font-family: <face>" into buf (bounded by sizeof(buf)) and add it to node as a style property. */
827 static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
830 TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face );
831 TY_(AddStyleProperty)( doc, node, buf );
/*
  Apply a font size to node. The sizes "6", "5" and "4" are
  compared explicitly and select a replacement element name
  in value; on that path node's element name is replaced by
  a copy of value and its tag looked up again. Otherwise the
  size is mapped through FontSize2Name and added as a
  "font-size: <value>" style property.
  NOTE(review): the condition selecting the renaming path is
  not documented here - confirm which nodes it applies to.
*/
834 static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
836 ctmbstr value = NULL;
840 if (TY_(tmbstrcmp)(size, "6") == 0)
842 else if (TY_(tmbstrcmp)(size, "5") == 0)
844 else if (TY_(tmbstrcmp)(size, "4") == 0)
849 TidyDocFree(doc, node->element);
850 node->element = TY_(tmbstrdup)(doc->allocator, value);
851 TY_(FindTag)(doc, node);
856 value = FontSize2Name(size);
861 TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value);
862 TY_(AddStyleProperty)( doc, node, buf );
/* Format "color: <color>" into buf (bounded by sizeof(buf)) and add it to node as a style property. */
866 static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
869 TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color);
870 TY_(AddStyleProperty)( doc, node, buf );
873 /* force alignment value to lower case */
/*
  Add "text-align: <align>" to node with the alignment value
  converted to lower case. The value is copied character by
  character behind the 12-character "text-align: " prefix;
  the loop ends at the terminating NUL or one element short
  of the buffer's capacity, whichever comes first.
*/
874 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
879 TY_(tmbstrcpy)( buf, "text-align: " );
880 for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
882 if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
886 TY_(AddStyleProperty)( doc, node, buf );
890 add style properties to node corresponding to
891 the font face, size and color attributes
/*
  Translate font attributes into style properties on node.
  Only attributes that have a value are considered; they are
  dispatched to AddFontFace, AddFontSize (size attribute) or
  AddFontColor (color attribute). Callers pass the head of
  an attribute list as av.
*/
893 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
897 if (AttrHasValue(av))
900 AddFontFace( doc, node, av->value );
901 else if (attrIsSIZE(av))
902 AddFontSize( doc, node, av->value );
903 else if (attrIsCOLOR(av))
904 AddFontColor( doc, node, av->value );
911 Symptom: <p align=center>
912 Action: <p style="text-align: center">
/*
  Scan node's attributes for the one to convert. The
  matching attribute is unlinked from the list (through prev
  or, at the head, through node->attributes), its value is
  turned into a text-align style property by AddAlign, and
  the attribute is freed.
*/
914 static void TextAlign( TidyDocImpl* doc, Node* node )
920 for (av = node->attributes; av; av = av->next)
925 prev->next = av->next;
927 node->attributes = av->next;
930 AddAlign( doc, node, av->value );
932 TY_(FreeAttribute)(doc, av);
941 Symptom: <table bgcolor="red">
942 Action: <table style="background-color: red">
/*
  Replace a bgcolor attribute on node by the style property
  "background-color: <value>". The property text is
  formatted before the attribute is removed, because it
  reads attr->value.
*/
944 static void TableBgColor( TidyDocImpl* doc, Node* node )
949 if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
951 TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
952 TY_(RemoveAttribute)( doc, node, attr );
953 TY_(AddStyleProperty)( doc, node, buf );
958 The clean up rules use the pnode argument to return the
959 next node when the original node has been deleted
963 Symptom: <dir> <li> where <li> is only child
964 Action: coerce <dir> <li> to <div> with indent.
/*
  Clean-up rule for dir, ul and ol elements. The child is
  checked for having no peers, for being an li and for being
  implicit; a node that gets through is coerced to a div (tag
  and element name are both replaced), given the style
  property "margin-left: 2em", and its only child is
  stripped. pnode is not used by this rule.
*/
967 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
971 if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
973 child = node->content;
978 /* check child has no peers */
983 if ( !nodeIsLI(child) )
986 if ( !child->implicit )
989 /* coerce dir to div */
990 node->tag = TY_(LookupTagDef)( TidyTag_DIV );
991 TidyDocFree( doc, node->element );
992 node->element = TY_(tmbstrdup)(doc->allocator, "div");
993 TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
994 StripOnlyChild( doc, node );
1003 Action: replace <center> by <div style="text-align: center">
/*
  Clean-up rule for center elements: the element is renamed
  to div and given the style property "text-align: center".
  The section between #if 0 and #endif is compiled out; it
  would discard the container when TidyDropFontTags is set
  and insert inferred br elements around the exposed content.
*/
1006 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1008 if ( nodeIsCENTER(node) )
1010 #if 0 // 00000000 what is this doing inside an nodeIsCENTER(node)??? 0000000
1011 if ( cfgBool(doc, TidyDropFontTags) )
1015 Node *last = node->last;
1016 DiscardContainer( doc, node, pnode );
1018 node = TY_(InferredTag)(doc, TidyTag_BR);
1019 TY_(InsertNodeAfterElement)(last, node);
1023 Node *prev = node->prev, *next = node->next,
1024 *parent = node->parent;
1025 DiscardContainer( doc, node, pnode );
1027 node = TY_(InferredTag)(doc, TidyTag_BR);
1029 TY_(InsertNodeBeforeElement)(next, node);
1031 TY_(InsertNodeAfterElement)(prev, node);
1033 TY_(InsertNodeAtStart)(parent, node);
1038 #endif // 00000000 what is this doing inside an nodeIsCENTER(node)??? 0000000
1039 RenameElem( doc, node, TidyTag_DIV );
1040 TY_(AddStyleProperty)( doc, node, "text-align: center" );
1047 /* Copy child attributes to node. Duplicate attributes are overwritten.
1048 Unique attributes (such as ID) disable the action.
1049 Attributes style and class are not dealt with. A call to MergeStyles
/*
  Move the attributes of child over to node. style and class
  attributes are skipped because MergeStyles deals with them.
  The case where both child and node carry an id attribute is
  detected up front as one that cannot be merged. For every
  other attribute with a known id, an attribute of the same
  id already on node is removed first; the child's attribute
  is then detached and appended to node's attribute list.
  MergeNestedElements abandons the merge when this function
  returns no.
*/
1052 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1057 /* Detect attributes that cannot be merged or overwritten. */
1058 if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1059 && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1062 /* Move child attributes to node. Attributes in node
1063 can be overwritten or merged. */
1064 for (av2 = child->attributes; av2; )
1066 /* Dealt by MergeStyles. */
1067 if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1072 /* Avoid duplicates in node */
1073 if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1074 && (av1=TY_(AttrGetById)(node, id))!= NULL)
1075 TY_(RemoveAttribute)( doc, node, av1 );
1077 /* Move attribute from child to node */
1078 TY_(DetachAttribute)( child, av2 );
1082 TY_(InsertAttributeAtEnd)( node, av1 );
1089 Symptom <XX><XX>...</XX></XX>
1090 Action: merge the two XXs
1092 For instance, this is useful after nested <dir>s used by Word
1093 for indenting have been converted to <div>s
1095 If state is "no", no merging.
1096 If state is "yes", inner element is discarded. Only Style and Class
1097 attributes are merged using MergeStyles().
1098 If state is "auto", attributes are merged as described in CopyAttrs().
1099 Style and Class attributes are merged using MergeStyles().
/*
  Clean-up rule that merges an element with tag Id into a
  parent of the same tag. The rule does not apply when state
  is TidyNoState or node is not an Id element, and the child
  must be the only child and an Id element as well. In
  TidyAutoState the other attributes are moved with
  CopyAttrs, and the merge is abandoned if that returns no.
  Otherwise class and style are merged with MergeStyles and
  the child is stripped. pnode is not used by this rule.
*/
1101 static Bool MergeNestedElements( TidyDocImpl* doc,
1102 TidyTagId Id, TidyTriState state, Node *node,
1103 Node **ARG_UNUSED(pnode))
1107 if ( state == TidyNoState
1108 || !TagIsId(node, Id) )
1111 child = node->content;
1114 || child->next != NULL
1115 || !TagIsId(child, Id) )
1118 if ( state == TidyAutoState
1119 && CopyAttrs(doc, node, child) == no )
1122 MergeStyles( doc, node, child );
1123 StripOnlyChild( doc, node );
1128 Symptom: <ul><li><ul>...</ul></li></ul>
1129 Action: discard outer list
/*
  Clean-up rule for a ul/ol (node) whose single child (child)
  contains nothing but a list of the same tag (list). The
  inner list is moved into the position of the outer one and
  *pnode is set to it so the caller resumes there; the outer
  list and its item are freed after their content links have
  been cleared. If the node in front of the moved list is
  itself a ul/ol with a last child, the list is re-linked
  underneath that last item and cleaned with CleanNode.
*/
1132 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1136 if ( nodeIsUL(node) || nodeIsOL(node) )
1138 child = node->content;
1143 /* check child has no peers */
1148 list = child->content;
1153 if (list->tag != node->tag)
1156 /* check list has no peers */
1160 *pnode = list; /* Set node to resume iteration */
1162 /* move inner list node into position of outer node */
1163 list->prev = node->prev;
1164 list->next = node->next;
1165 list->parent = node->parent;
1166 TY_(FixNodeLinks)(list);
1168 /* get rid of outer ul and its li */
1169 child->content = NULL;
1170 TY_(FreeNode)( doc, child ); /* See test #427841. */
1172 node->content = NULL;
1174 TY_(FreeNode)( doc, node );
1178 If prev node was a list the chances are this node
1179 should be appended to that list. Word has no way of
1180 recognizing nested lists and just uses indents
1185 if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1186 && list->prev->last )
1191 child = list->last; /* <li> */
1193 list->next = node->next;
1194 TY_(FixNodeLinks)(list);
1196 node->parent = child;
1198 node->prev = child->last;
1199 TY_(FixNodeLinks)(node);
1200 CleanNode( doc, node );
1210 /* Find CSS equivalent in a SPAN element */
/*
  Look up the CSS equivalent of a presentational inline
  element in the CSS_SpanEq table (b, i, s, strike, u). With
  deprecatedOnly set, only entries flagged as deprecated
  (s, strike, u) are considered. On a match *s receives the
  CSS property string of the entry. The table ends with an
  entry whose CSSeq is NULL, which stops the loop.
*/
1212 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1220 const CSS_SpanEq[] =
1222 { TidyTag_B, "font-weight: bold", no },
1223 { TidyTag_I, "font-style: italic", no },
1224 { TidyTag_S, "text-decoration: line-through", yes},
1225 { TidyTag_STRIKE, "text-decoration: line-through", yes},
1226 { TidyTag_U, "text-decoration: underline", yes},
1227 { TidyTag_UNKNOWN, NULL, no }
1231 for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1232 if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1233 && TagIsId(node, CSS_SpanEq[i].id) )
1235 *s = CSS_SpanEq[i].CSSeq;
1241 /* Necessary conditions to apply BlockStyle(). */
/*
  Precondition test for BlockStyle. The condition selects
  nodes whose content model includes block, list, deflist or
  table, and that are not div, p, table, tr or li elements.
*/
1242 static Bool CanApplyBlockStyle( Node *node )
1244 if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1245 && !nodeIsDIV(node) && !nodeIsP(node)
1246 && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1254 Symptom: the only child of a block-level element is a
1255 presentation element such as B, I or FONT
1257 Action: add style "font-weight: bold" to the block and
1258 strip the <b> element, leaving its children.
1263 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1268 <p style="font-weight: bold; font-family: Arial; font-size: 6">
1269 Draft Recommended Practice
1272 This code also replaces the align attribute by a style attribute.
1273 However, to avoid CSS problems with Navigator 4, this isn't done
1274 for the elements: caption, tr and table
/*
  Clean-up rule for block-level elements.
  - table, td, th and tr first have a bgcolor attribute
    converted by TableBgColor.
  - If CanApplyBlockStyle accepts the node, its align
    attribute is converted by TextAlign (not for caption).
    A sole child with a CSS equivalent per FindCSSSpanEq
    (any entry, deprecated or not) is then folded into the
    node: styles are merged, the CSS property is added and
    the child is stripped. A sole font child is folded in the
    same way, with AddFontStyles translating its attributes.
  pnode is not used by this rule.
*/
1276 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1281 /* check for bgcolor */
1282 if ( nodeIsTABLE(node)
1283 || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1284 TableBgColor( doc, node );
1286 if (CanApplyBlockStyle(node))
1288 /* check for align attribute */
1289 if ( !nodeIsCAPTION(node) )
1290 TextAlign( doc, node );
1292 child = node->content;
1296 /* check child has no peers */
1300 if ( FindCSSSpanEq(child, &CSSeq, no) )
1302 MergeStyles( doc, node, child );
1303 TY_(AddStyleProperty)( doc, node, CSSeq );
1304 StripOnlyChild( doc, node );
1307 else if ( nodeIsFONT(child) )
1309 MergeStyles( doc, node, child );
1310 AddFontStyles( doc, node, child->attributes );
1311 StripOnlyChild( doc, node );
1319 /* Necessary conditions to apply InlineStyle(). */
/* Precondition test for InlineStyle: true for any node that is not a font element and whose content model includes inline or row. */
1320 static Bool CanApplyInlineStyle( Node *node )
1322 return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1325 /* the only child of table cell or an inline element such as em */
/*
  Clean-up rule for nodes accepted by CanApplyInlineStyle.
  A sole child with a CSS equivalent per FindCSSSpanEq (any
  entry, deprecated or not) is folded into the node: styles
  are merged, the CSS property is added and the child is
  stripped. A sole font child is folded in the same way, with
  AddFontStyles translating its attributes. pnode is not
  used by this rule.
*/
1326 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1331 if ( CanApplyInlineStyle(node) )
1333 child = node->content;
1338 /* check child has no peers */
1343 if ( FindCSSSpanEq(child, &CSSeq, no) )
1345 MergeStyles( doc, node, child );
1346 TY_(AddStyleProperty)( doc, node, CSSeq );
1347 StripOnlyChild( doc, node );
1350 else if ( nodeIsFONT(child) )
1352 MergeStyles( doc, node, child );
1353 AddFontStyles( doc, node, child->attributes );
1354 StripOnlyChild( doc, node );
1363 Transform element to equivalent CSS
/*
  Clean-up rule that turns a deprecated presentational
  element into a span with the equivalent CSS property.
  FindCSSSpanEq is called with deprecatedOnly = yes, so only
  the entries flagged deprecated qualify. A node that is the
  only child of a parent accepted by CanApplyBlockStyle or
  CanApplyInlineStyle is left to those rules. pnode is not
  used by this rule.
*/
1365 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1366 Node **ARG_UNUSED(pnode) )
1370 /* if node is the only child of parent element then leave alone
1371 Do so only if BlockStyle may be successful. */
1372 if ( node->parent->content == node && node->next == NULL &&
1373 (CanApplyBlockStyle(node->parent)
1374 || CanApplyInlineStyle(node->parent)) )
1377 if ( FindCSSSpanEq(node, &CSSeq, yes) )
1379 RenameElem( doc, node, TidyTag_SPAN );
1380 TY_(AddStyleProperty)( doc, node, CSSeq );
1387 Replace font elements by span elements, deleting
1388 the font element's attributes and replacing them
1389 by a single style attribute.
/*
  Clean-up rule for font elements.
  - With TidyDropFontTags set, the font container is
    discarded and its content kept (DiscardContainer sets
    *pnode).
  - A font that is the only child of a parent accepted by
    CanApplyBlockStyle is left to that rule.
  - Otherwise the font attributes are translated into style
    properties, the attributes are freed apart from the
    style attribute, which becomes the node's attribute
    list, and the element is renamed to span.
*/
1391 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1393 AttVal *av, *style, *next;
1395 if ( nodeIsFONT(node) )
1397 if ( cfgBool(doc, TidyDropFontTags) )
1399 DiscardContainer( doc, node, pnode );
1403 /* if node is the only child of parent element then leave alone
1404 Do so only if BlockStyle may be successful. */
1405 if ( node->parent->content == node && node->next == NULL &&
1406 CanApplyBlockStyle(node->parent) )
1409 AddFontStyles( doc, node, node->attributes );
1411 /* extract style attribute and free the rest */
1412 av = node->attributes;
1419 if (attrIsSTYLE(av))
1426 TY_(FreeAttribute)( doc, av );
1431 node->attributes = style;
1432 RenameElem( doc, node, TidyTag_SPAN );
1440 Applies all matching rules to a node.
/*
  Run the clean-up rules on node, in this fixed order:
  Dir2Div, NestedList, Center2Div, MergeNestedElements for
  div and then for span, BlockStyle, InlineStyle,
  InlineElementToCSS, Font2Span. Each rule receives &next so
  it can name the node to continue with; the loop goes on for
  as long as node is an element and sets node to next after
  every pass. The two merge rules are controlled by the
  TidyMergeDivs and TidyMergeSpans settings read at the top.
*/
1442 Node* CleanNode( TidyDocImpl* doc, Node *node )
1445 TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1446 TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1448 for (next = node; TY_(nodeIsElement)(node); node = next)
1450 if ( Dir2Div(doc, node, &next) )
1453 /* Special case: true result means
1454 ** that arg node and its parent no longer exist.
1455 ** So we must jump back up the CreateStyleProperties()
1456 ** call stack until we have a valid node reference.
1458 if ( NestedList(doc, node, &next) )
1461 if ( Center2Div(doc, node, &next) )
1464 if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1467 if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1470 if ( BlockStyle(doc, node, &next) )
1473 if ( InlineStyle(doc, node, &next) )
1476 if ( InlineElementToCSS(doc, node, &next) )
1479 if ( Font2Span(doc, node, &next) )
1488 /* Special case: if the current node is destroyed by
1489 ** CleanNode() lower in the tree, this node and its parent
1490 ** no longer exist. So we must jump back up the CleanTree()
1491 ** call stack until we have a valid node reference.
/*
  Clean the subtree rooted at node. Every child is processed
  recursively first; the loop variable is replaced by the
  node CleanTree returns, so iteration continues from a node
  that still exists. CleanNode is then applied to node
  itself and its result is returned.
*/
1494 static Node* CleanTree( TidyDocImpl* doc, Node *node )
1499 for (child = node->content; child != NULL; child = child->next)
1501 child = CleanTree( doc, child );
1507 return CleanNode( doc, node );
/*
  Convert style attributes into class references across the
  subtree: the children are visited recursively, then
  Style2Rule is applied to node itself.
*/
1510 static void DefineStyleRules( TidyDocImpl* doc, Node *node )
1516 for (child = node->content;
1517 child != NULL; child = child->next)
1519 DefineStyleRules( doc, child );
1523 Style2Rule( doc, node );
/*
  Entry point of the clean-up pass: CleanTree is run from the
  document root. When the TidyMakeClean option is set, style
  attributes are additionally turned into classes
  (DefineStyleRules) and the style element is generated
  (CreateStyleElement).
*/
1526 void TY_(CleanDocument)( TidyDocImpl* doc )
1528 /* placeholder. CleanTree()/CleanNode() will not
1531 CleanTree( doc, &doc->root );
1533 if ( cfgBool(doc, TidyMakeClean) )
1535 DefineStyleRules( doc, &doc->root );
1536 CreateStyleElement( doc );
1540 /* simplifies <b><b> ... </b> ...</b> etc. */
/*
  Remove redundant nested emphasis: a b or i element whose
  parent has the same tag is stripped with DiscardContainer,
  which keeps its content and reports the node to continue
  with in next. Other nodes with content are descended into
  recursively.
*/
1541 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1549 if ( (nodeIsB(node) || nodeIsI(node))
1550 && node->parent && node->parent->tag == node->tag)
1552 /* strip redundant inner element */
1553 DiscardContainer( doc, node, &next );
1558 if ( node->content )
1559 TY_(NestedEmphasis)( doc, node->content );
1567 /* replace i by em and b by strong */
/*
  Rename i elements to em and b elements to strong, then
  recurse into the content of any node that has children.
*/
1568 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1572 if ( nodeIsI(node) )
1573 RenameElem( doc, node, TidyTag_EM );
1574 else if ( nodeIsB(node) )
1575 RenameElem( doc, node, TidyTag_STRONG );
1577 if ( node->content )
1578 TY_(EmFromI)( doc, node->content );
/* True when node has a first child and that child has no following sibling, i.e. exactly one child. */
1584 static Bool HasOneChild(Node *node)
1586 return (node->content && node->content->next == NULL);
1590 Some people use dir or ul without an li
1591 to indent the content. The pattern to
1592 look for is a list with a single implicit
1593 li. This is recursively replaced by an
1594 implicit blockquote.
/*
  Replace lists that are used purely for indentation by
  implicit blockquotes. The content is processed recursively
  first. A node whose tag is parsed by ParseList, that has
  exactly one child, and whose child is implicit, then loses
  that child, is renamed to blockquote and is itself marked
  implicit.
*/
1596 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1601 TY_(List2BQ)( doc, node->content );
1603 if ( node->tag && node->tag->parser == TY_(ParseList) &&
1604 HasOneChild(node) && node->content->implicit )
1606 StripOnlyChild( doc, node );
1607 RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1608 node->implicit = yes;
1617 Replace implicit blockquote by div with an indent
1618 taking care to reduce nested blockquotes to a single
1619 div with the indent set to match the nesting depth
/*
  Replace an implicit blockquote by a div carrying a
  "margin-left: <n>em" style property. While the node has
  exactly one child and that child is a blockquote too (the
  loop condition has a further clause), the child is
  stripped, which collapses a chain of nested blockquotes
  into one element. The remaining content is processed
  recursively before the node is renamed. Nodes that are not
  implicit blockquotes are only descended into.
*/
1621 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1623 tmbchar indent_buf[ 32 ];
1628 if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1632 while( HasOneChild(node) &&
1633 nodeIsBLOCKQUOTE(node->content) &&
1637 StripOnlyChild( doc, node );
1641 TY_(BQ2Div)( doc, node->content );
1643 TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1646 RenameElem( doc, node, TidyTag_DIV );
1647 TY_(AddStyleProperty)(doc, node, indent_buf );
1649 else if (node->content)
1650 TY_(BQ2Div)( doc, node->content );
/*
  Walk from node upwards through the parent links, node
  itself included, testing each one for being a td element.
  doc is not used.
*/
1657 static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
1661 for ( check=node; check; check = check->parent )
1663 if ( nodeIsTD(check) )
1669 /* node is <![if ...]> prune up to <![endif]> */
/*
  Prune a conditional section starting at node. The section
  text is read from the lexer buffer at node->start.
  - For a section starting with "if !supportEmptyParas" the
    enclosing table cell is looked up, and a literal text
    node holding a non-breaking space (octal 240) is inserted
    in front of node.
  - Text nodes are singled out before a node is discarded.
  - A nested section tag starting with "if" is pruned
    recursively.
  - A section tag starting with "endif" is discarded.
*/
1670 static Node* PruneSection( TidyDocImpl* doc, Node *node )
1672 Lexer* lexer = doc->lexer;
1676 ctmbstr lexbuf = lexer->lexbuf + node->start;
1677 if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1679 Node* cell = FindEnclosingCell( doc, node );
1682 /* Need to put into cell so it doesn't look weird
1684 Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1685 assert( (byte)'\240' == (byte)160 );
1686 TY_(InsertNodeBeforeElement)( node, nbsp );
1690 /* discard node and returns next, unless it is a text node */
1691 if ( node->type == TextNode )
1694 node = TY_(DiscardElement)( doc, node );
1699 if (node->type == SectionTag)
1701 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1703 node = PruneSection( doc, node );
1707 if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1709 node = TY_(DiscardElement)( doc, node );
/*
  Remove section tags from the tree. A section whose text
  starts with "if", but not with "if !vml", is pruned with
  PruneSection up to its matching endif; other section tags
  are discarded individually. The content of the remaining
  nodes is processed recursively.
*/
1718 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1720 Lexer* lexer = doc->lexer;
1723 if (node->type == SectionTag)
1725 /* prune up to matching endif */
1726 if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1727 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1729 node = PruneSection( doc, node );
1733 /* discard others as well */
1734 node = TY_(DiscardElement)( doc, node );
1739 TY_(DropSections)( doc, node->content );
/*
 * PurgeWord2000Attributes: walk the attribute list of node and free the
 * attributes that the Word 2000 clean-up does not want to keep:
 * class, style, height/width on td, tr and th elements, and any attribute
 * whose name starts with the x: prefix.
 * A class attribute is special-cased first when its value is Code or does
 * not start with Mso (see the two comments below); presumably such
 * attributes are kept -- confirm against the branch body.
 * next/prev track the neighbours so the singly linked list can be
 * re-linked when an attribute is removed.
 */
1745 static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
1747 AttVal *attr, *next, *prev = NULL;
1749 for ( attr = node->attributes; attr; attr = next )
1753 /* special check for class="Code" denoting pre text */
1754 /* Pass thru user defined styles as HTML class names */
1755 if (attrIsCLASS(attr))
1757 if (AttrValueIs(attr, "Code") ||
1758 TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
/* attributes matching any of these tests are unlinked and freed */
1765 if (attrIsCLASS(attr) ||
1766 attrIsSTYLE(attr) ||
1768 ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
1769 (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
1770 (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
/* the list head moves on to the following attribute */
1775 node->attributes = next;
1777 TY_(FreeAttribute)( doc, attr );
/*
 * StripSpan: replace an element by its own children.
 * The content is first cleaned with TY_(CleanWord2000); each child is then
 * removed from the span and re-inserted next to it (the first one before
 * the span, later ones after the previously moved node, tracked in prev).
 * The parent last pointer is patched when the span was the last child,
 * and finally the now empty span is discarded.
 * Callers assign the result back to their cursor, so the return value is
 * presumably the node to continue with (confirm).
 */
1784 /* Word2000 uses span excessively, so we strip span out */
1785 static Node* StripSpan( TidyDocImpl* doc, Node* span )
1787 Node *node, *prev = NULL, *content;
1790 deal with span elements that have content
1791 by splicing the content in place of the span
1792 after having processed it
1795 TY_(CleanWord2000)( doc, span->content );
1796 content = span->content;
/* move one child in front of the span */
1803 content = content->next;
1804 TY_(RemoveNode)(node);
1805 TY_(InsertNodeBeforeElement)(span, node);
/* chain a child behind the previously placed node */
1812 content = content->next;
1813 TY_(RemoveNode)(node);
1814 TY_(InsertNodeAfterElement)(prev, node);
/* keep the parent last-child pointer valid once the span goes away */
1818 if (span->next == NULL)
1819 span->parent->last = prev;
/* the children now live elsewhere: detach them before discarding */
1822 span->content = NULL;
1823 TY_(DiscardElement)( doc, span );
/*
 * TY_(NormalizeSpaces): rewrite text nodes in place inside the lexer
 * buffer.  Children are processed first by recursion.  For a text node
 * every character between node->start and node->end is decoded
 * (multi-byte sequences through GetUTF8), written back through PutUTF8 at
 * the write cursor p, and node->end is finally set to the position the
 * cursor reached.
 */
1827 /* map non-breaking spaces to regular spaces */
1828 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1832 if ( node->content )
1833 TY_(NormalizeSpaces)( lexer, node->content );
1835 if (TY_(nodeIsText)(node))
/* p is the write position; it starts at the beginning of the text */
1838 tmbstr p = lexer->lexbuf + node->start;
1840 for (i = node->start; i < node->end; ++i)
1842 c = (byte) lexer->lexbuf[i];
1844 /* look for UTF-8 multibyte character */
/* GetUTF8 reports how many extra bytes were consumed */
1846 i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1851 p = TY_(PutUTF8)(p, c);
/* the text may have become shorter: record its new end offset */
1853 node->end = p - lexer->lexbuf;
/*
 * NoMargins: inspect the style attribute of node for both a zero top
 * margin and a zero bottom margin.  An element without a style value
 * fails the first test.
 * NOTE(review): these are plain substring searches, so a value that only
 * begins with 0 (for instance margin-top: 0.5em) would match as well --
 * confirm that this is acceptable.
 */
1860 /* used to hunt for hidden preformatted sections */
1861 static Bool NoMargins(Node *node)
1863 AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1865 if ( !AttrHasValue(attval) )
1868 /* search for substring "margin-top: 0" */
1869 if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1872 /* search for substring "margin-bottom: 0" */
1873 if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
/*
 * SingleSpace: examine the first child of node.  The tests below require
 * that it has no following sibling and is a text node, and then look at
 * its text in the lexer buffer: either one byte holding an ASCII space,
 * or two bytes that are decoded with GetUTF8 (presumably to detect a
 * non-breaking space -- confirm against the comparison on c).
 */
1879 /* does element have a single space as its content? */
1880 static Bool SingleSpace( Lexer* lexer, Node* node )
1882 if ( node->content )
1884 node = node->content;
/* more than one child */
1886 if ( node->next != NULL )
/* the only child is not text */
1889 if ( node->type != TextNode )
/* one byte long: plain space */
1892 if ( (node->end - node->start) == 1 &&
1893 lexer->lexbuf[node->start] == ' ' )
/* two bytes long: decode as a single UTF-8 character */
1896 if ( (node->end - node->start) == 2 )
1899 TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1909 This is a major clean up to strip out all the extra stuff you get
1910 when you save as web page from Word 2000. It doesn't yet know what
1911 to do with VML tags, but these will appear as errors unless you
1912 declare them as new tags, such as o:p which needs to be declared
/*
 * TY_(CleanWord2000): clean-up pass over a tree produced by saving a
 * document as a web page from Word 2000.  Steps visible in this routine:
 *   - on the html element, check for the xmlns:o attribute / the
 *     TidyMakeBare option and free the attributes of the element;
 *   - turn runs of p elements with zero top and bottom margins (see
 *     NoMargins) into a single pre element, one line per paragraph;
 *   - unwrap block elements that only hold a single space, and all span
 *     and font elements, through StripSpan;
 *   - discard style and meta elements, comments, link elements whose rel
 *     is File-List, and o:p containers;
 *   - trim empty paragraphs with TrimEmptyElement;
 *   - convert paragraphs of class MsoListBullet / MsoListNumber, or whose
 *     style mentions mso-list:, into li items gathered in an inferred ul
 *     or ol element (tracked in list);
 *   - gather paragraphs of class Code into an inferred pre element, with
 *     spaces normalized and a line node appended after each;
 *   - purge Word specific attributes from the remaining elements and
 *     recurse into their content.
 */
1915 void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1917 /* used to build a list from a sequence of bulleted p's */
1918 Lexer* lexer = doc->lexer;
1923 /* get rid of Word's xmlns attributes */
1924 if ( nodeIsHTML(node) )
1926 /* check that it's a Word 2000 document */
1927 if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
1928 !cfgBool(doc, TidyMakeBare) )
1931 TY_(FreeAttrs)( doc, node );
1934 /* fix up preformatted sections by looking for a
1935 ** sequence of paragraphs with zero top/bottom margin
1937 if ( nodeIsP(node) )
1939 if (NoMargins(node))
1942 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1944 PurgeWord2000Attributes( doc, node );
1947 TY_(CleanWord2000)( doc, node->content );
1952 /* continue to strip p's */
1954 while ( nodeIsP(node) && NoMargins(node) )
/* each following paragraph becomes one more line inside the pre:
   it is moved there after a line node and then unwrapped */
1957 TY_(RemoveNode)(node);
1958 TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1959 TY_(InsertNodeAtEnd)(pre, node);
1960 StripSpan( doc, node );
/* a block element holding nothing but a single space is unwrapped */
1969 if (node->tag && (node->tag->model & CM_BLOCK)
1970 && SingleSpace(lexer, node))
1972 node = StripSpan( doc, node );
1975 /* discard Word's style verbiage */
1976 if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1977 node->type == CommentTag )
1979 node = TY_(DiscardElement)( doc, node );
1983 /* strip out all span and font tags Word scatters so liberally! */
1984 if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1986 node = StripSpan( doc, node );
/* drop link elements whose rel attribute is File-List */
1990 if ( nodeIsLINK(node) )
1992 AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1994 if (AttrValueIs(attr, "File-List"))
1996 node = TY_(DiscardElement)( doc, node );
2001 /* discards <o:p> which encodes the paragraph mark */
2002 if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
2005 DiscardContainer( doc, node, &next );
2010 /* discard empty paragraphs */
2012 if ( node->content == NULL && nodeIsP(node) )
2014 /* Use the existing function to ensure consistency */
2015 Node *next = TY_(TrimEmptyElement)( doc, node );
/* remaining paragraphs: list items and Code blocks are recognised from
   the class and style attributes fetched here */
2020 if ( nodeIsP(node) )
2022 AttVal *attr, *atrStyle;
2024 attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2025 atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2027 (JES) Sometimes Word marks a list item with the following hokie syntax
2028 <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2029 translate these into <li>
2031 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2032 /* map <p class="MsoListNumber"> to <ol>...</ol> */
2033 if ( AttrValueIs(attr, "MsoListBullet") ||
2034 AttrValueIs(attr, "MsoListNumber") ||
2035 AttrContains(atrStyle, "mso-list:") )
2037 TidyTagId listType = TidyTag_UL;
2038 if (AttrValueIs(attr, "MsoListNumber"))
2039 listType = TidyTag_OL;
2041 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2043 if ( !list || TagId(list) != listType )
2045 const Dict* tag = TY_(LookupTagDef)( listType );
2046 list = TY_(InferredTag)(doc, tag->id);
2047 TY_(InsertNodeBeforeElement)(node, list);
2050 PurgeWord2000Attributes( doc, node );
2052 if ( node->content )
2053 TY_(CleanWord2000)( doc, node->content );
2055 /* remove node and append to contents of list */
2056 TY_(RemoveNode)(node);
2057 TY_(InsertNodeAtEnd)(list, node);
2060 /* map sequence of <p class="Code"> to <pre>...</pre> */
2061 else if (AttrValueIs(attr, "Code"))
2063 Node *br = TY_(NewLineNode)(lexer);
2064 TY_(NormalizeSpaces)(lexer, node->content);
2066 if ( !list || TagId(list) != TidyTag_PRE )
2068 list = TY_(InferredTag)(doc, TidyTag_PRE);
2069 TY_(InsertNodeBeforeElement)(node, list);
2072 /* remove node and append to contents of list */
2073 TY_(RemoveNode)(node);
2074 TY_(InsertNodeAtEnd)(list, node);
2075 StripSpan( doc, node );
2076 TY_(InsertNodeAtEnd)(list, br);
2088 /* strip out style and class attributes */
2089 if (TY_(nodeIsElement)(node))
2090 PurgeWord2000Attributes( doc, node );
2093 TY_(CleanWord2000)( doc, node->content );
/*
 * TY_(IsWord2000): heuristics for recognising a Word generated document.
 * Two signals are examined: an xmlns:o attribute on the html element, and
 * a meta element in the head whose name is generator and whose content
 * contains the word Microsoft.
 */
2099 Bool TY_(IsWord2000)( TidyDocImpl* doc )
2103 Node *html = TY_(FindHTML)( doc );
/* first signal: the Office namespace declared on the root element */
2105 if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2108 /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2109 head = TY_(FindHEAD)( doc );
2113 for (node = head->content; node; node = node->next)
/* only meta elements are of interest */
2115 if ( !nodeIsMETA(node) )
2118 attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2120 if ( !AttrValueIs(attval, "generator") )
2123 attval = TY_(AttrGetById)( node, TidyAttr_CONTENT );
2125 if ( AttrContains(attval, "Microsoft") )
/*
 * TY_(BumpObject): locate the head and body children of html; when both
 * exist, every object element found directly in the head has its children
 * examined, and an object that qualifies is removed from the head and
 * inserted at the start of the body.  The next pointer keeps the head
 * iteration valid while nodes are being moved.
 */
2133 /* where appropriate move object elements from head to body */
2134 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2136 Node *node, *next, *head = NULL, *body = NULL;
/* find the two top-level containers */
2141 for ( node = html->content; node != NULL; node = node->next )
2143 if ( nodeIsHEAD(node) )
2146 if ( nodeIsBODY(node) )
2150 if ( head != NULL && body != NULL )
2152 for (node = head->content; node != NULL; node = next)
2156 if ( nodeIsOBJECT(node) )
2161 for (child = node->content; child != NULL; child = child->next)
2163 /* bump to body unless content is param */
/* NOTE(review): IsBlank is handed node (the object element) although the
   text test just before it is about child -- child looks like the intended
   argument.  Also, a text child can never satisfy nodeIsPARAM, so the
   second clause alone already selects every text child, blank or not.
   Confirm the intended condition before changing it. */
2164 if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2165 || !nodeIsPARAM(child) )
/* relocate the object to the beginning of the body */
2174 TY_(RemoveNode)( node );
2175 TY_(InsertNodeAtStart)( body, node );
/*
 * FixBrakes: post-order pass dealing with br elements that end a block
 * level element.  Children are handled first by recursion.  Then, while
 * the last child of a block level parent is a br, that br is either
 * discarded (when it carries no attributes and bBRDeleted is still no) or
 * moved to directly after the parent.  Finally TrimEmptyElement is applied
 * to the parent.
 */
2182 /* This is disabled due to http://tidy.sf.net/bug/681116 */
2184 void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
2187 Bool bBRDeleted = no;
/* nothing to do without a parent */
2189 if (NULL == pParent)
2192 /* First, check the status of All My Children */
2193 pNode = pParent->content;
2194 while (NULL != pNode )
2196 /* The node may get trimmed, so save the next pointer, if any */
2197 Node *pNext = pNode->next;
2198 FixBrakes( pDoc, pNode );
2203 /* As long as my last child is a <br />, move it to my last peer */
2204 if ( nodeCMIsBlock( pParent ))
/* pParent->last is re-read on every pass because each iteration removes
   the current last child */
2206 for ( pNode = pParent->last;
2207 NULL != pNode && nodeIsBR( pNode );
2208 pNode = pParent->last )
/* attribute-free br, none deleted so far: drop it */
2210 if ( NULL == pNode->attributes && no == bBRDeleted )
2212 TY_(DiscardElement)( pDoc, pNode );
/* otherwise the br is kept, but placed after the parent */
2217 TY_(RemoveNode)( pNode );
2218 TY_(InsertNodeAfterElement)( pParent, pNode );
2221 TY_(TrimEmptyElement)( pDoc, pParent );
/*
 * TY_(VerifyHTTPEquiv): bring the charset announced by a
 * meta http-equiv Content-Type element in line with the configured output
 * encoding (TidyOutCharEncoding).
 * The content value is duplicated and split at semicolons into a list of
 * StyleProp entries (only the name member is filled in from the text);
 * the entry starting with charset (compared case-insensitively over 7
 * characters) gets its name replaced by charset= followed by the encoding
 * name, the list is turned back into a string with CreatePropString, and
 * that string replaces the content value of the attribute.
 * The temporary copy and the property list are freed again.
 * If head is not a head element, the head of the document is looked up.
 */
2226 void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2229 StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2230 tmbstr s, pszBegin, pszEnd;
2231 ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2236 if (!nodeIsHEAD(head))
2237 head = TY_(FindHEAD)(doc);
2242 /* Find any <meta http-equiv='Content-Type' content='...' /> */
2243 for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2245 AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2246 AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
/* skip everything that is not a Content-Type meta with a content attribute */
2248 if ( !nodeIsMETA(pNode) || !metaContent ||
2249 !AttrValueIs(httpEquiv, "Content-Type") )
/* work on a private copy; s remembers the allocation for the later free */
2252 pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2253 while (pszBegin && *pszBegin)
/* NOTE(review): isspace receives a plain character here; if tmbchar is a
   signed type, bytes above 0x7F become negative values, which the ctype
   functions do not accept -- consider a cast to unsigned char (confirm
   the definition of tmbchar) */
2255 while (isspace( *pszBegin ))
/* advance to the end of the current semicolon separated piece */
2258 while ('\0' != *pszEnd && ';' != *pszEnd)
2260 if (';' == *pszEnd )
/* non-empty piece: append it to the property list */
2262 if (pszEnd > pszBegin)
2264 prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2265 prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2269 if (NULL != pLastProp)
2270 pLastProp->next = prop;
2278 TidyDocFree( doc, s );
2280 /* find the charset property */
2281 for (prop = pFirstProp; NULL != prop; prop = prop->next)
2283 if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
/* 8 is the length of the charset= prefix, plus one for the terminator */
2286 TidyDocFree( doc, prop->name );
2287 prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2288 TY_(tmbstrcpy)(prop->name, "charset=");
2289 TY_(tmbstrcpy)(prop->name+8, enc);
/* the attribute takes ownership of the rebuilt string */
2290 s = CreatePropString( doc, pFirstProp );
2291 TidyDocFree( doc, metaContent->value );
2292 metaContent->value = s;
2295 /* #718127, prevent memory leakage */
2296 FreeStyleProps(doc, pFirstProp);
/*
 * TY_(DropComments): remove comment nodes from the tree.  A node of type
 * CommentTag is unlinked with RemoveNode and released with FreeNode;
 * the content of other nodes is processed recursively.
 */
2302 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2310 if (node->type == CommentTag)
2312 TY_(RemoveNode)(node);
2313 TY_(FreeNode)(doc, node);
2319 TY_(DropComments)(doc, node->content);
/*
 * TY_(DropFontElements): get rid of font elements.  A font element is
 * passed to DiscardContainer, which also reports the node to continue
 * with through next; other nodes have their content processed
 * recursively.  The pnode parameter is not used by this function.
 */
2325 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2333 if (nodeIsFONT(node))
2335 DiscardContainer(doc, node, &next);
2341 TY_(DropFontElements)(doc, node->content, &next);
/*
 * TY_(WbrToSpace): replace wbr elements by a space.  A literal text node
 * holding one space is inserted after the wbr, which is then unlinked and
 * freed.  The content of other nodes is processed recursively.
 */
2347 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2355 if (nodeIsWBR(node))
/* the replacement goes in first, while the wbr still marks the position */
2358 text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2359 TY_(InsertNodeAfterElement)(node, text);
2360 TY_(RemoveNode)(node);
2361 TY_(FreeNode)(doc, node);
2367 TY_(WbrToSpace)(doc, node->content);
2374 Filters from Word and PowerPoint often use smart
2375 quotes resulting in character codes between 128
2376 and 159. Unfortunately, the corresponding HTML 4.0
2377 entities for these are not widely supported. The
2378 following converts dashes and quotation marks to
2379 the nearest ASCII equivalent. My thanks to
2380 Andrzej Novosiolov for his help with this code.
2382 Note: The old code in the pretty printer applied
2383 this to all node types and attribute values while
2384 this routine applies it only to text nodes. First,
2385 Microsoft Office products rarely put the relevant
2386 characters into these tokens, second support for
2387 them is much better now and last but not least, it
2388 can be harmful to replace these characters since
2389 US-ASCII quote marks are often used as syntax
2390 characters, a simple
2392 <a onmouseover="alert('‘')">...</a>
2394 would be broken if the U+2018 is replaced by "'".
2395 Nor did the old code check whether the
2396 quote mark was already in use as a delimiter,
2398 <p title='‘'>...</p>
2402 <p title='''>...</p>
2404 Since browser support is much better nowadays and
2405 high-quality typography is better than ASCII, it would
2406 probably be a good idea to drop the feature...
/*
 * TY_(DowngradeTypography): rewrite text nodes in place inside the lexer
 * buffer.  Every character of a text node is decoded (multi-byte
 * sequences through GetUTF8); code points in the range U+2013..U+201E
 * that are listed in the switch below -- dashes and single/double
 * quotation marks -- are substituted, and the character is written back
 * through PutUTF8.  node->end is then set to where the write cursor
 * stopped.  The content of nodes is processed recursively.
 */
2408 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2411 Lexer* lexer = doc->lexer;
2417 if (TY_(nodeIsText)(node))
/* p is the write position; it starts at the beginning of the text */
2420 tmbstr p = lexer->lexbuf + node->start;
2422 for (i = node->start; i < node->end; ++i)
2424 c = (unsigned char) lexer->lexbuf[i];
/* GetUTF8 reports how many extra bytes were consumed */
2427 i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
/* cheap range test before the detailed switch */
2429 if (c >= 0x2013 && c <= 0x201E)
2433 case 0x2013: /* en dash */
2434 case 0x2014: /* em dash */
2437 case 0x2018: /* left single quotation mark */
2438 case 0x2019: /* right single quotation mark */
2439 case 0x201A: /* single low-9 quotation mark */
2442 case 0x201C: /* left double quotation mark */
2443 case 0x201D: /* right double quotation mark */
2444 case 0x201E: /* double low-9 quotation mark */
2450 p = TY_(PutUTF8)(p, c);
/* the text may have become shorter: record its new end offset */
2453 node->end = p - lexer->lexbuf;
2457 TY_(DowngradeTypography)(doc, node->content);
/*
 * TY_(ReplacePreformattedSpaces): apply TY_(NormalizeSpaces) to the
 * content of every element whose tag is parsed by ParsePre.  The
 * recursive call descends into node content; presumably it is the
 * alternative branch for nodes that are not preformatted (confirm).
 */
2463 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2471 if (node->tag && node->tag->parser == TY_(ParsePre))
2473 TY_(NormalizeSpaces)(doc->lexer, node->content);
2479 TY_(ReplacePreformattedSpaces)(doc, node->content);
/*
 * TY_(ConvertCDATANodes): retype nodes of type CDATATag as ordinary
 * TextNode nodes; only the type field is changed.  The content of nodes
 * is processed recursively.  The doc argument is only passed along.
 */
2485 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2493 if (node->type == CDATATag)
2494 node->type = TextNode;
2497 TY_(ConvertCDATANodes)(doc, node->content);
2504 FixLanguageInformation ensures that the document contains (only)
2505 the attributes for language information desired by the output
2506 document type. For example, for XHTML 1.0 documents both
2507 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2508 is desired and for HTML 4.01 only 'lang' is desired.
/*
 * TY_(FixLanguageInformation): make the lang / xml:lang attributes of
 * every element match what the output wants.
 *   wantXmlLang - xml:lang should be present in the output
 *   wantLang    - lang should be present in the output
 * When only one of the two attributes exists and the other one is wanted,
 * the missing one is created from the existing value, provided the
 * attribute is allowed for the element in the emitted document version.
 * Attributes that are not wanted are removed afterwards.  The content of
 * nodes is processed recursively.
 */
2510 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2518 /* todo: report modifications made here to the report system */
2520 if (TY_(nodeIsElement)(node))
2522 AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2523 AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
/* both present: nothing is created */
2525 if (lang && xmlLang)
2528 todo: check whether both attributes are in sync,
2529 here or elsewhere, where elsewhere is probably
2531 AD - March 2005: not mandatory according the standards.
/* only lang present: derive xml:lang from it */
2534 else if (lang && wantXmlLang)
2536 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2537 & doc->lexer->versionEmitted)
2538 TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
/* only xml:lang present: derive lang from it */
2540 else if (xmlLang && wantLang)
2542 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2543 & doc->lexer->versionEmitted)
2544 TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
/* finally drop whichever attribute the output does not want */
2547 if (lang && !wantLang)
2548 TY_(RemoveAttribute)(doc, node, lang);
2550 if (xmlLang && !wantXmlLang)
2551 TY_(RemoveAttribute)(doc, node, xmlLang);
2555 TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2562 Set/fix/remove <html xmlns='...'>
/*
 * TY_(FixXhtmlNamespace): look up the html element and its xmlns
 * attribute.  The attribute is either repaired to XHTML_NAMESPACE when
 * its current value differs, or removed; presumably wantXmlns selects
 * between the two actions (confirm against the branch conditions).
 */
2564 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2566 Node* html = TY_(FindHTML)(doc);
2572 xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
/* set or correct the namespace declaration */
2576 if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2577 TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
/* drop the namespace declaration */
2581 TY_(RemoveAttribute)(doc, html, xmlns);
/*
 * TY_(FixAnchors): make the name / id attributes of anchor elements (as
 * decided by IsAnchorElement) match what the output wants.
 *   wantName - name should be present in the output
 *   wantId   - id should be present in the output
 * Visible behaviour:
 *   - when the two attributes disagree (one has a value and the other has
 *     not, or both values differ) ID_NAME_MISMATCH is reported;
 *   - with only name present and id wanted, id is created from the name
 *     value if that is a valid HTML ID and the attribute is allowed in the
 *     emitted version; an invalid value is reported as INVALID_XML_ID;
 *   - with only id present and name wanted, name is created from the id
 *     value under the same version check;
 *   - an unwanted attribute is removed only if its counterpart already
 *     existed, is not wanted either, or has been emitted (hadName/hadId
 *     and the NameEmitted/IdEmitted flags); when neither attribute is
 *     wanted the anchor is also dropped through RemoveAnchorByNode.
 * The content of nodes is processed recursively.
 */
2588 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2596 if (TY_(IsAnchorElement)(doc, node))
2598 AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2599 AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
/* remember what the element had before any repair below */
2600 Bool hadName = name!=NULL;
2601 Bool hadId = id!=NULL;
2602 Bool IdEmitted = no;
2603 Bool NameEmitted = no;
2605 /* todo: how are empty name/id attributes handled? */
2609 Bool NameHasValue = AttrHasValue(name);
2610 Bool IdHasValue = AttrHasValue(id);
/* mismatch: exactly one has a value, or the two values differ */
2611 if ( (NameHasValue != IdHasValue) ||
2612 (NameHasValue && IdHasValue &&
2613 TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2614 TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
/* derive id from name */
2616 else if (name && wantId)
2618 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2619 & doc->lexer->versionEmitted)
2621 if (TY_(IsValidHTMLID)(name->value))
2623 TY_(RepairAttrValue)(doc, node, "id", name->value);
2627 TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
/* derive name from id */
2630 else if (id && wantName)
2632 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2633 & doc->lexer->versionEmitted)
2635 /* todo: do not assume id is valid */
2636 TY_(RepairAttrValue)(doc, node, "name", id->value);
2642 /* make sure that Name has been emitted if requested */
2643 && (hadName || !wantName || NameEmitted) ) {
2644 if (!wantId && !wantName)
2645 TY_(RemoveAnchorByNode)(doc, id->value, node);
2646 TY_(RemoveAttribute)(doc, node, id);
2649 if (name && !wantName
2650 /* make sure that Id has been emitted if requested */
2651 && (hadId || !wantId || IdEmitted) ) {
2652 if (!wantId && !wantName)
2653 TY_(RemoveAnchorByNode)(doc, name->value, node);
2654 TY_(RemoveAttribute)(doc, node, name);
2659 TY_(FixAnchors)(doc, node->content, wantName, wantId);
2668 * indent-tabs-mode: nil
2670 * eval: (c-set-offset 'substatement-open 0)