class Renderer(object):
"""HTMLElement Render base class."""
def attrs2str(self, elem):
    """Serialize ``elem.attrs`` into an HTML attribute string.

    Every rendered attribute is preceded by a single space, so the result
    can be appended directly after a tag name; an element without
    attributes yields an empty string.

    * A value of ``None`` renders the bare attribute name.
    * A value without a single quote is wrapped in single quotes.
    * A value containing a single quote is wrapped in double quotes, and
      any double quotes inside it are escaped as ``&quot;`` so the value
      cannot terminate the attribute early.

    elem -- object whose ``attrs`` mapping holds attribute names and values.
    """
    # The leading empty item makes join() emit the separating space before
    # the first attribute (and nothing at all when there are no attributes).
    parts = [""]
    for name, value in elem.attrs.items():
        if value is None:
            parts.append(name)
        elif "'" not in value:
            parts.append("%s='%s'" % (name, value))
        else:
            parts.append('%s="%s"' % (name, value.replace('"', "&quot;")))
    return " ".join(parts)
elif elem.is_root():
for child in elem:
self._recursive(child, texts)
-
+ elif elem.is_decl():
+ texts.append("<!" + elem.name + ">")
+
+
class TEXTRenderer(Renderer):
    """Renderer that emits only the text content of an HTMLElement tree."""
    # TODO: determine more strictly which tags never need a closing tag.
    UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]

    def render_inner(self, elem):
        """Return the concatenated text of every child subtree of *elem*."""
        collected = []
        self._render_children(elem, collected)
        return "".join(collected)

    def render(self, elem):
        """Return the concatenated text of *elem* and everything below it."""
        collected = []
        self._recursive(elem, collected)
        return "".join(collected)

    def _render_children(self, elem, texts):
        """Visit each child of *elem* in order, appending text to *texts*."""
        for node in elem:
            self._recursive(node, texts)

    def _recursive(self, elem, texts):
        """Append the text of *elem*'s subtree to the list *texts*.

        Tag and root elements contribute only through their children; text
        elements contribute their own text when it is non-empty; any other
        kind of element adds nothing.
        """
        if elem.is_tag():
            self._render_children(elem, texts)
        elif elem.is_text():
            if elem.text():
                texts.append(elem.text())
        elif elem.is_root():
            self._render_children(elem, texts)
class HTMLElement(list):
"""HTML element object to use as tree nodes."""
ROOT = 0
TAG = 100
TEXT = 200
+ DECL = 300
def __init__(self, type, name="", attrs={}):
"""
def __repr__(self):
    """Return a short debugging label describing this element.

    Tags show their name and attributes, declarations their name, and text
    nodes their text; every other type (ROOT included) is "<UNKNOWN>".
    """
    kind = self.type
    if kind == HTMLElement.TAG:
        return "<TAG:%s %s>" % (self.name, self._attrs2str())
    if kind == HTMLElement.DECL:
        return "<DECL:'%s'>" % self.name
    if kind == HTMLElement.TEXT:
        return "<TEXT:'%s'>" % self._text
    return "<UNKNOWN>"
def __eq__(self, other):
    """Compare elements by identity rather than by list contents.

    Two distinct elements are never equal, even when they hold equal
    children, so ``==``-based list operations such as ``remove()`` act on
    this exact node.
    """
    return self is other

def __ne__(self, other):
    """Return the inverse of ``__eq__``.

    Defined explicitly because the ``__ne__`` inherited from ``list``
    compares contents, which would let ``!=`` disagree with the
    identity-based ``==``.
    """
    return self is not other
+
def _attrs2str(self):
str = []
f = lambda x,y: x if y == None else "%s='%s'" % (x,y)
"""returns given attribute's value."""
return self.attrs.get(attr, default)
def attr(self, attr, default=None):
    """Look up attribute *attr* of this element.

    Returns the stored value, or *default* when the element has no such
    attribute.
    """
    attributes = self.attrs
    return attributes.get(attr, default)
+
def has_attribute(self, attr):
    """Report whether this element carries an attribute named *attr*."""
    present = attr in self.attrs
    return present
rn = HTMLRenderer()
return rn.render_inner(self)
def inner_text(self):
    """Return the text beneath this element, as rendered by TEXTRenderer."""
    return TEXTRenderer().render_inner(self)
+
# navigation functions
def parent(self):
"""returns tag's parent element."""
for i in self:
i._r_get_elements_by_class(cls, buf)
def get_elements(self, name, attrs=None):
    """Return the elements named *name* whose attributes match *attrs*.

    Candidates come from ``get_elements_by_name(name)``.  A candidate is
    kept when, for every key in *attrs*, ``get_attribute(key, "")`` equals
    the requested value.  Because a missing attribute reads as ``""``, a
    requested value of ``""`` also matches elements lacking that attribute.

    name  -- element name passed to ``get_elements_by_name``.
    attrs -- mapping of attribute name to required value; ``None`` or an
             empty mapping applies no attribute filter.

    Returns a new list of the matching elements, in candidate order.
    """
    wanted = attrs or {}
    return [
        elem for elem in self.get_elements_by_name(name)
        if all(elem.get_attribute(key, "") == value
               for key, value in wanted.items())
    ]
+
# manipulation functions
def append_tag(self, tag, attrs):
    """Create a TAG element named *tag* with *attrs* and append it as a child.

    NOTE(review): the new element's ``_parent`` is not assigned here,
    unlike in HTMLTree.handle_decl -- confirm whether parent() needs it.
    """
    self.append(HTMLElement(HTMLElement.TAG, tag, attrs))
def remove_element(self, elem):
    """Detach *elem* from its parent element.

    The parent is obtained from ``elem.parent()``; *self* is not consulted.
    NOTE(review): presumably fails when *elem* has no parent -- confirm
    what parent() returns for a detached or root element.
    """
    elem.parent().remove(elem)
+
def delete(self):
    """Remove this element from the child list of its parent.

    NOTE(review): presumably fails when the element has no parent --
    confirm what parent() returns in that case.
    """
    self.parent().remove(self)
+
# query functions
# TODO: this function is still being implemented...
def select(self, expr):
def is_root(self):
    """Return True when this element's type is HTMLElement.ROOT."""
    return HTMLElement.ROOT == self.type
def is_decl(self):
    """Return True when this element's type is HTMLElement.DECL."""
    return HTMLElement.DECL == self.type
+
def is_descendant(self, tagname):
p = self.parent()
while p != None:
def __repr__(self):
    """Return the error message with its line number and character offset.

    Reads ``self.msg``, ``self.lineno`` and ``self.offset``; the latter two
    are formatted with ``%d`` and therefore must be numeric.
    """
    # Format and return directly instead of binding a local called ``str``,
    # which would shadow the builtin.
    return "HTML Parse Error: %s , line: %d, char: %d" % (
        self.msg, self.lineno, self.offset)
-
+
+
def parse(data, charset=None, option=0):
    """Parse HTML *data* and return the HTMLTree built from it.

    *charset* and *option* are forwarded unchanged to ``HTMLTree.parse``.
    """
    builder = HTMLTree()
    builder.parse(data, charset, option)
    return builder
+
class HTMLTree(HTMLParser.HTMLParser):
"HTML Tree Builder"
elem._text = data
self._cursor.append(elem)
def handle_entityref(self, name):
    """Pass a named entity reference through as ordinary text.

    Rebuilds the literal ``&name;`` form and hands it to ``handle_data``.
    """
    self.handle_data("".join(("&", name, ";")))
+
def handle_charref(self, ref):
    """Pass a numeric character reference through as ordinary text.

    Rebuilds the literal ``&#ref;`` form and hands it to ``handle_data``.
    """
    self.handle_data("".join(("&#", ref, ";")))
+
def handle_decl(self, decl):
    """Record a declaration as a DECL element under the current cursor.

    The element is named *decl*, its ``_parent`` is set to the cursor
    element, and it is appended to the cursor's children.
    """
    node = HTMLElement(HTMLElement.DECL, decl)
    cursor = self._cursor
    node._parent = cursor
    cursor.append(node)
+
# Accessor
def root(self):
    """Return the object stored as this tree's ``_htmlroot``."""
    top = self._htmlroot
    return top