1 from __future__ import absolute_import, division, unicode_literals
class Filter(base.Filter):
    """Token-stream filter that drops the tags HTML allows authors to omit.

    Iterating the filter yields the tokens of ``self.source`` minus every
    start or end tag whose omission is permitted by the HTML "optional tags"
    rules, judged from the tokens immediately before and after it.
    """

    def slider(self):
        """Yield ``(previous, current, following)`` triples over the source.

        ``previous`` is ``None`` for the first token and ``following`` is
        ``None`` for the last one. An empty source yields nothing.
        """
        previous1 = previous2 = None
        for token in self.source:
            if previous1 is not None:
                yield previous2, previous1, token
            # Advance the window by one token.
            previous2 = previous1
            previous1 = token
        # Flush the final token, which has nothing following it.
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        """Yield every source token except omissible start and end tags."""
        for previous, token, following in self.slider():
            token_type = token["type"]
            if token_type == "StartTag":
                # A start tag that carries attributes is always kept.
                if (token["data"] or
                        not self.is_optional_start(token["name"], previous,
                                                   following)):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], following):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of ``tagname`` may be omitted.

        :arg tagname: name of the element whose start tag is examined
        :arg previous: token preceding the start tag, or ``None``
        :arg next: token following the start tag, or ``None``
        """
        next_type = next["type"] if next else None
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        if tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if next_type in ("StartTag", "EmptyTag"):
                return True
            if next_type == "EndTag":
                return next["name"] == "head"
            return False
        if tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            if next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never omit
                # the body element's start tag if it's followed by a script or
                # a style element.
                return next["name"] not in ('script', 'style')
            return True
        if tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first thing
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceded by another colgroup element whose
            # end tag has been omitted.
            if next_type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we
                # never omit the colgroup element's end tag when it is
                # immediately followed by another colgroup element.
                # See is_optional_end.
                return next["name"] == "col"
            return False
        if tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element is
            # not immediately preceded by a tbody, thead, or tfoot element
            # whose end tag has been omitted.
            if next_type == "StartTag":
                # is_optional_end may omit the end tag of a tbody, thead or
                # tfoot that is immediately followed by a tbody, so the start
                # tag must be kept whenever such an end tag precedes it.
                if (previous and previous['type'] == 'EndTag' and
                        previous['name'] in ('tbody', 'thead', 'tfoot')):
                    return False
                return next["name"] == 'tr'
            return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of ``tagname`` may be omitted.

        :arg tagname: name of the element whose end tag is examined
        :arg next: token following the end tag, or ``None`` at end of stream
        """
        next_type = next["type"] if next else None
        # "No more content in the parent element": the parent's end tag
        # follows, or the token stream is exhausted.
        at_parent_end = next_type == "EndTag" or next_type is None

        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        if tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            return at_parent_end
        if tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            if tagname == 'dd':
                return at_parent_end
            return False
        if tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if next_type in ("StartTag", "EmptyTag"):
                return next["name"] in ('address', 'article', 'aside',
                                        'blockquote', 'datagrid', 'dialog',
                                        'dir', 'div', 'dl', 'fieldset',
                                        'footer', 'form', 'h1', 'h2', 'h3',
                                        'h4', 'h5', 'h6', 'header', 'hr',
                                        'menu', 'nav', 'ol', 'p', 'pre',
                                        'section', 'table', 'ul')
            return at_parent_end
        if tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an optgroup element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            return at_parent_end
        if tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('rt', 'rp')
            return at_parent_end
        if tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            if next_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            return True
        if tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag because it sees this end tag before it.
            if next_type == "StartTag":
                return next["name"] in ('tbody', 'tfoot')
            if tagname == 'tbody':
                return at_parent_end
            return False
        if tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag because it sees this end tag before it.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            return at_parent_end
        if tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            return at_parent_end
        return False