2 # -*- coding: utf-8 -*-
4 """htmltree.py - HTML Element-Tree Builder
5 by hylom <hylomm@@single_at_mark@@gmail.com>
11 class HTMLElementError(Exception):
# Exception type for element-level errors; built from a message and the
# element concerned.
12 def __init__(self, msg, elem):
# Text form: "HTML Element Error: <msg> in <elem>", read from self.msg and
# self.elem (enclosing method header presumably __str__ -- confirm).
# NOTE(review): the local name `str` shadows the builtin in this scope.
17 str = "HTML Element Error: %s in %s" % (self.msg, self.elem)
20 class Renderer(object):
21 """HTMLElement Render base class."""
# Formats the attributes of `elem` (read from elem.attrs) as name/value
# strings collected in `strs`.
22 def attrs2str(self, elem):
# Visit every attribute name stored in elem.attrs.
24 for attr in elem.attrs:
# Attributes whose value is None get their own branch.
# NOTE(review): `is None` is the idiomatic identity test.
25 if elem.attrs[attr] == None:
# A value that contains a single quote is wrapped in double quotes.
27 elif "'" in elem.attrs[attr]:
28 strs.append('%s="%s"' % (attr, elem.attrs[attr]))
# Otherwise the value is wrapped in single quotes (presumably the else
# branch -- confirm).
# NOTE(review): the value is inserted without escaping in both forms, so a
# value containing both quote characters would produce broken markup.
30 strs.append("%s='%s'" % (attr, elem.attrs[attr]))
34 class HTMLRenderer(Renderer):
35 """Render HTMLElement as HTML."""
36 # TODO: check more strictly which tags do not need a closing tag.
# Tag names for which _recursive emits no closing tag.
# NOTE(review): HTMLTree.UNCLOSABLE_TAGS in this file additionally lists
# "input" -- confirm whether the two lists are meant to agree.
37 UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]
# Renders only the children of `elem`: each child is handed to _recursive
# together with the shared `texts` accumulator.
39 def render_inner(self, elem):
42 self._recursive(child, texts)
# Renders `elem` itself; _recursive then descends into its children.
45 def render(self, elem):
47 self._recursive(elem, texts)
# Appends the markup for `elem` to the `texts` list. The branch taken
# presumably depends on the element type -- confirm the conditions.
50 def _recursive(self, elem, texts):
# Opening tag: the name followed by the attribute string from attrs2str.
52 texts.append("<" + elem.name + self.attrs2str(elem) + ">")
54 self._recursive(child, texts)
# The closing tag is skipped for names listed in UNCLOSABLE_TAGS.
55 if not elem.name in self.UNCLOSABLE_TAGS:
56 texts.append("</" + elem.name + ">")
# Text content comes from elem.text() and is appended unchanged.
59 texts.append(elem.text())
62 self._recursive(child, texts)
# Written as "<!" + name + ">" (presumably the declaration branch -- confirm).
64 texts.append("<!" + elem.name + ">")
67 class TEXTRenderer(Renderer):
68 """Render HTMLElement as TEXT."""
69 # TODO: check more strictly which tags do not need a closing tag.
# Same four tag names as HTMLRenderer.UNCLOSABLE_TAGS.
70 UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]
# Renders only the children of `elem`: each child is handed to _recursive
# together with the shared `texts` accumulator.
72 def render_inner(self, elem):
75 self._recursive(child, texts)
# Renders `elem` itself; _recursive then descends into its children.
78 def render(self, elem):
80 self._recursive(elem, texts)
# Collects output for `elem` into the `texts` list; the only append visible
# here is the element's text, and children are visited recursively.
83 def _recursive(self, elem, texts):
86 self._recursive(child, texts)
# Text content comes from elem.text() and is appended unchanged.
89 texts.append(elem.text())
92 self._recursive(child, texts)
94 class HTMLElement(list):
95 """HTML element object to use as tree nodes."""
# The node is itself a list: HTMLTree appends child elements directly to
# their parent element, so the list items are the children.
# `attrs` is copied with dict() further down, so the shared default {} is
# never stored on an instance; dict() also accepts a sequence of
# (name, value) pairs.
101 def __init__(self, type, name="", attrs={}):
103 create HTMLElement object.
106 type -- element type. HTMLElement.(ROOT|TAG|TEXT)
107 name -- element name (default: "")
108 attrs -- dict of attributes (default:{})
111 attr = dict(href="http://example.com/", target="_blank")
112 e = HTMLElement(HTMLElement.TAG, "a", attr)
113 # 'e' means <a href="http://example.com/" target="_blank">
# Private copy of the attributes, keyed by attribute name.
118 self.attrs = dict(attrs)
# Sibling links start out empty; HTMLTree._r_finalize assigns them.
121 self._next_elem = None
122 self._prev_elem = None
# Debug-style text form selected by element type (enclosing method header
# presumably __repr__ or __str__ -- confirm).
125 if self.type == HTMLElement.TAG:
126 return "<TAG:%s %s>" % (self.name, self._attrs2str())
127 elif self.type == HTMLElement.DECL:
128 return "<DECL:'%s'>" % self.name
129 elif self.type == HTMLElement.TEXT:
130 return "<TEXT:'%s'>" % self._text
# Equality is object identity, replacing list's element-wise comparison.
# NOTE(review): Python 2 does not derive != from __eq__, so != still uses
# list's comparison unless __ne__ is defined elsewhere -- confirm.
134 def __eq__(self, other):
135 return id(self) == id(other)
# Space-separated attribute text used by the TAG text form above.
137 def _attrs2str(self):
# A None value yields the bare attribute name, anything else name='value'.
139 f = lambda x,y: x if y == None else "%s='%s'" % (x,y)
141 strs = [f(x,self.attrs[x]) for x in self.attrs]
142 return " ".join(strs)
144 # basic acquisition functions
145 def get_attribute(self, attr, default=None):
146 """returns given attribute's value."""
147 return self.attrs.get(attr, default)
# Same body as get_attribute; a shorter alias.
149 def attr(self, attr, default=None):
150 """returns given attribute's value."""
151 return self.attrs.get(attr, default)
153 def has_attribute(self, attr):
154 """returns True if element has "attr" attribute."""
155 return attr in self.attrs
158 """returns content in the tag."""
# Returns rn.render_inner(self); `rn` is presumably an HTMLRenderer
# instance -- confirm.
161 def inner_html(self):
164 return rn.render_inner(self)
# Returns rn.render_inner(self); `rn` is presumably a TEXTRenderer
# instance -- confirm.
166 def inner_text(self):
169 return rn.render_inner(self)
171 # navigation functions
173 """returns tag's parent element."""
177 """returns tag's next element."""
178 return self._next_elem
181 """returns tag's previous element."""
182 return self._prev_elem
184 # basic query functions
# Collects matches through the recursive helper, which fills `buf`.
185 def get_elements_by_name(self, name):
187 self._r_get_elements_by_name(name, buf)
# Compares this element's name with `name` (a match is presumably added
# to buf -- confirm), then repeats the search on each `i`.
190 def _r_get_elements_by_name(self, name, buf):
191 if self.name == name:
194 i._r_get_elements_by_name(name, buf)
# Checks this element's own "id" attribute first, then asks each `i` in
# turn; the result for "no match" is not visible here.
196 def get_element_by_id(self, id):
197 if "id" in self.attrs and self.attrs["id"] == id:
200 e = i.get_element_by_id(id)
203 #raise HTMLElementError("Element not found")
# Collects matches through the recursive helper, which fills `buf`.
206 def get_elements_by_class(self, cls):
208 self._r_get_elements_by_class(cls, buf)
# The whole "class" attribute string must equal `cls`.
# NOTE(review): a multi-class value such as "a b" therefore does not match
# "a" -- confirm this is intended.
211 def _r_get_elements_by_class(self, cls, buf):
212 if self.get_attribute("class") == cls:
215 i._r_get_elements_by_class(cls, buf)
# Starts from the elements named `name`, then compares attribute values
# against `attrs`; a missing attribute is compared as "".
217 def get_elements(self, name, attrs):
218 elems = self.get_elements_by_name(name)
222 if elem.get_attribute(name, "") != attrs[name]:
228 # manipulation functions
# Builds a new TAG element from the tag name and attributes.
229 def append_tag(self, tag, attrs):
230 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
# Looks up the parent of `elem` through elem.parent().
233 def remove_element(self, elem):
234 parent = elem.parent()
242 # TODO: this function is still being implemented.
# Splits `expr` on whitespace into terms; results of _select_pattern are
# accumulated into `t`.
243 def select(self, expr):
244 terms = expr.strip().split()
251 t.extend(self._select_pattern(pat, elem))
# pat[1:] drops the first character of the pattern before the id or class
# lookup (the branch conditions are not visible here).
255 def _select_pattern(self, pat, elem):
258 results = [elem.get_element_by_id(pat[1:]),]
260 results = elem.get_elements_by_class(pat[1:])
# Drops falsy entries from the results.
# NOTE(review): HTMLElement subclasses list, so an element without children
# is falsy and is dropped here too, unless truthiness is overridden
# elsewhere -- confirm.
261 return [x for x in results if x]
# Runs select(expr); presumably returns the first hit -- confirm.
263 def select_1st(self, expr):
264 r = self.select(expr)
# Two-level lookup: elements named term1, then elements named term2
# inside each of them.
270 def select_by_name2(self, term1, term2):
271 tbl = self.get_elements_by_name(term1)
274 st = elem.get_elements_by_name(term2)
# Type predicates: each compares self.type with one HTMLElement constant.
280 return self.type == HTMLElement.TEXT
283 return self.type == HTMLElement.TAG
286 return self.type == HTMLElement.ROOT
289 return self.type == HTMLElement.DECL
# Compares the name of `p` with tagname; `p` is presumably an ancestor
# reached by walking up the tree -- confirm.
291 def is_descendant(self, tagname):
294 if p.name == tagname:
# `tag` is compiled as a regular expression; names of `p` that it matches
# (re.search, so anywhere in the name) are collected in `result`.
300 def trace_back(self, tag):
301 """ regexp string => list"""
303 rex = re.compile(tag)
306 if rex.search(p.name):
307 result.append(p.name)
312 class HTMLTreeError(Exception):
# Exception type for parse failures; built from a message plus the line
# number and character offset of the failure.
313 def __init__(self, msg, lineno, offset):
# Text form: "HTML Parse Error: <msg> , line: <lineno>, char: <offset>".
# lineno and offset are formatted with %d, so they must be numeric.
# NOTE(review): the local name `str` shadows the builtin in this scope.
319 str = "HTML Parse Error: %s , line: %d, char: %d" % (self.msg, self.lineno, self.offset)
323 def parse(data, charset=None, option=0):
324 "parse HTML and returns HTMLTree object"
# Forwards all three arguments unchanged to the parse() method of `tree`
# (presumably a newly created HTMLTree -- confirm).
326 tree.parse(data, charset, option)
330 class HTMLTree(HTMLParser.HTMLParser):
# Option bit flags; they are tested against self._option with bitwise &.
332 USE_VALIDATE = 0x0001
334 IGNORE_BLANK = 0x0010
339 # TODO: check more strictly which tags do not need a closing tag.
# Tags in this list are routed to handle_startendtag by handle_starttag.
340 UNCLOSABLE_TAGS = ["br", "link", "meta", "img", "input"]
# Initialises the HTMLParser base class.
344 HTMLParser.HTMLParser.__init__(self)
346 def parse(self, data, charset=None, option=0):
351 data -- HTML to parse
352 charset -- charset of HTML (default: None)
353 option -- option (default: 0, meaning none)
# Reset the parser state: a new ROOT element, with the cursor (the element
# that receives new children) pointing at it.
357 self.charset = charset
358 self._htmlroot = HTMLElement(HTMLElement.ROOT)
359 self._cursor = self._htmlroot
360 self._option = option
# Python 2 "except X, e" syntax; a parser error is re-raised as
# HTMLTreeError with "HTML parse error: " prepended to its message.
363 except HTMLParser.HTMLParseError, e:
364 raise HTMLTreeError("HTML parse error: " + e.msg,
367 # if charset is not given, detect charset
368 if self.charset == None:
# Look for <meta http-equiv="Content-Type" content="...charset=X...">.
# NOTE(review): the "Content-Type" comparison is case-sensitive -- confirm
# that differently cased values do not need to match.
370 metas = r.get_elements_by_name("meta")
372 if meta.attrs.get("http-equiv", None) == "Content-Type":
373 ctype = meta.attrs.get("content", "")
# Captures everything after "charset=" up to the next ";" or end of string.
374 m = re.search(r"charset=([^;]+)", ctype)
376 self.charset = m.group(1)
# The tree is reset to a new ROOT here (presumably so the data can be
# parsed again with the detected charset -- confirm).
381 self._htmlroot = HTMLElement(HTMLElement.ROOT)
382 self._cursor = self._htmlroot
# Links the children of `elem` to their neighbours through _prev_elem and
# _next_elem, then recurses into every child.
391 def _r_finalize(self, elem):
# First child: only a next link is set here.
397 elem[0]._next_elem = elem[1]
# Middle children get both links (`l` is presumably len(elem) -- confirm).
398 for i in range(1, l-1):
399 elem[i]._prev_elem = elem[i-1]
400 elem[i]._next_elem = elem[i+1]
# Last child: only a previous link is set here.
402 elem[l-1]._prev_elem = elem[l-2]
404 for sub_elem in elem:
405 self._r_finalize(sub_elem)
# NOTE(review): `self` is passed explicitly in addition to the bound
# receiver -- confirm this matches the signature of _r_validate.
409 self._r_validate(self, e)
412 def handle_starttag(self, tag, attrs):
413 # some tags are treated as start-end tags.
414 if tag in self.UNCLOSABLE_TAGS:
415 return self.handle_startendtag(tag, attrs)
# HTMLElement copies `attrs` with dict(), so the attribute sequence the
# parser passes in becomes a dictionary on the element.
417 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
# & binds tighter than >, so this tests whether the USE_VALIDATE bit is set.
419 if self._option & HTMLTree.USE_VALIDATE > 0:
420 # try validation (experimental)
# An <li> opened while the cursor is still an <li> closes the open one first.
421 if tag == "li" and self._cursor.name == "li":
422 self.handle_endtag("li")
# Attach the new element as the last child of the cursor.
425 elem._parent = self._cursor
426 self._cursor.append(elem)
429 def handle_endtag(self, tag):
430 # some tags are treated as start-end tags.
431 if tag in self.UNCLOSABLE_TAGS:
# Closing a tag moves the cursor back up to its parent.
434 self._cursor = self._cursor.parent()
# Self-closing tag: the element is attached to the cursor, and the cursor
# itself is left where it is.
436 def handle_startendtag(self, tag, attrs):
437 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
438 elem._parent = self._cursor
439 self._cursor.append(elem)
441 def handle_data(self, data):
# With IGNORE_BLANK set, data consisting only of whitespace is tested for
# here (presumably to skip it -- confirm).
442 if self._option & HTMLTree.IGNORE_BLANK > 0:
443 if re.search(r"^\s*$", data):
# Text is stored in a TEXT element whose parent is the cursor.
446 elem = HTMLElement(HTMLElement.TEXT)
447 elem._parent = self._cursor
449 # text encode check and convert.
450 # if charset is given, convert text to unicode type.
# Python 2 unicode(): decodes the byte string using self.charset.
453 elem._text = unicode(data, self.charset)
455 # self.charset is utf-8.
458 # treat as unicode input
460 self._cursor.append(elem)
# Entity references are passed on as literal "&name;" text, not decoded.
462 def handle_entityref(self, name):
463 data = "&" + name + ";"
464 self.handle_data(data)
# Character references are passed on as literal "&#ref;" text, not decoded.
466 def handle_charref(self, ref):
467 data = "&#" + ref + ";"
468 self.handle_data(data)
# A declaration becomes a DECL element named after the declaration text and
# is attached to the cursor.
470 def handle_decl(self, decl):
471 elem = HTMLElement(HTMLElement.DECL, decl)
472 elem._parent = self._cursor
473 self._cursor.append(elem)
# Returns the ROOT element of the tree.
477 return self._htmlroot