self._recursive(child, texts)
elif elem.is_decl():
texts.append("<!" + elem.name + ">")
+ elif elem.is_comment():
+ texts.append("<!--" + elem.name + "-->")
class TEXTRenderer(Renderer):
TAG = 100
TEXT = 200
DECL = 300
+ COMMENT = 400
def __init__(self, type, name="", attrs={}):
"""
return "<TAG:%s %s>" % (self.name, self._attrs2str())
elif self.type == HTMLElement.DECL:
return "<DECL:'%s'>" % self.name
+ elif self.type == HTMLElement.COMMENT:
+ return "<COMMENT:'%s'>" % self.name
elif self.type == HTMLElement.TEXT:
return "<TEXT:'%s'>" % self._text
else:
for i in self:
i._r_get_elements_by_name(name, buf)
+    def get_comments(self):
+        """Collect every comment element found below this element.
+
+        Each direct child is asked to add itself and its descendants via
+        ``_r_get_comments``; this element itself is never included.
+        Returns the collected elements as a new list.
+        """
+        found = []
+        for child in self:
+            child._r_get_comments(found)
+        return found
+
+    def _r_get_comments(self, buf):
+        """Recursive helper for ``get_comments``.
+
+        Appends this element to *buf* when ``is_comment()`` is true, then
+        descends into every child so that nested comments are collected too.
+        *buf* is modified in place; nothing is returned.
+        """
+        if self.is_comment():
+            buf.append(self)
+        for child in self:
+            child._r_get_comments(buf)
+
def get_element_by_id(self, id):
for i in self:
if "id" in i.attrs and i.attrs["id"] == id:
def is_decl(self):
return self.type == HTMLElement.DECL
+    def is_comment(self):
+        """Tell whether this element's type is ``HTMLElement.COMMENT``."""
+        return HTMLElement.COMMENT == self.type
+
def is_descendant(self, tagname):
p = self.parent()
while p != None:
r = self.root()
self._r_validate(self, e)
+ # tools
+    def _text_encoder(self, text):
+        """Return *text* converted to ``unicode`` when a charset is set.
+
+        When ``self.charset`` is falsy, *text* is returned unchanged.
+        Otherwise ``unicode(text, self.charset)`` is attempted.  A
+        ``TypeError`` (raised, for example, when *text* is already a
+        ``unicode`` object or is ``None``) makes the method fall back to
+        returning *text* as it was given.  Other errors, such as a
+        ``UnicodeDecodeError`` for undecodable bytes or a ``LookupError``
+        for an unknown charset, are not caught here and reach the caller.
+        """
+        if not self.charset:
+            # no charset given: treat as unicode input
+            return text
+        try:
+            return unicode(text, self.charset)
+        except TypeError:
+            # text cannot be decoded (already unicode, or not a string
+            # at all): hand it back untouched.
+            return text
+
+    def _attr_encoder(self, attrs):
+        """Return the pairs of *attrs* with each value run through ``_text_encoder``.
+
+        *attrs* is iterated as ``(key, value)`` pairs; keys are kept as they
+        are and the result is a new list of 2-tuples in the same order.
+        """
+        converted = []
+        for key, value in attrs:
+            converted.append((key, self._text_encoder(value)))
+        return converted
+
# Handlers
def handle_starttag(self, tag, attrs):
# some tags treat as start-end tag.
if tag in self.UNCLOSABLE_TAGS:
return self.handle_startendtag(tag, attrs)
- elem = HTMLElement(HTMLElement.TAG, tag, attrs)
+ elem = HTMLElement(HTMLElement.TAG, tag, self._attr_encoder(attrs))
if self._option & HTMLTree.USE_VALIDATE > 0:
# try validation (experimental)
self._cursor = self._cursor.parent()
def handle_startendtag(self, tag, attrs):
- elem = HTMLElement(HTMLElement.TAG, tag, attrs)
+ elem = HTMLElement(HTMLElement.TAG, tag, self._attr_encoder(attrs))
elem._parent = self._cursor
self._cursor.append(elem)
elem = HTMLElement(HTMLElement.TEXT)
elem._parent = self._cursor
- # text encode check and convert.
- # if charset is given, convert text to unicode type.
- if self.charset:
- try:
- elem._text = unicode(data, self.charset)
- except TypeError:
- # self.charset is utf-8.
- elem._text = data
- else:
- # treat as unicode input
- elem._text = data
+ # convert text to unicode when a charset is given
+ elem._text = self._text_encoder(data)
+
self._cursor.append(elem)
def handle_entityref(self, name):
elem._parent = self._cursor
self._cursor.append(elem)
+    def handle_comment(self, data):
+        """Add a COMMENT element holding the comment text *data* to the tree.
+
+        The text is stored as the element's name.  It is passed through
+        ``self._text_encoder`` first so that comments are converted the same
+        way as text nodes and attribute values; otherwise a byte-string
+        comment would sit next to unicode text in the same tree.  The new
+        element is appended under the current cursor, which is left where
+        it is.
+        """
+        elem = HTMLElement(HTMLElement.COMMENT, self._text_encoder(data))
+        elem._parent = self._cursor
+        self._cursor.append(elem)
+
# Accessor
def root(self):
return self._htmlroot