2 # -*- coding: utf-8 -*-
4 """htmltree.py - HTML Element-Tree Builder
5 by hylom <hylomm@@single_at_mark@@gmail.com>
11 class HTMLElementError(Exception):
12 def __init__(self, msg, elem):
17 str = "HTML Element Error: %s in %s" % (self.msg, self.elem)
20 class Renderer(object):
21 """HTMLElement Render base class."""
22 def attrs2str(self, elem):
24 for attr in elem.attrs:
25 if elem.attrs[attr] == None:
27 elif "'" in elem.attrs[attr]:
28 strs.append('%s="%s"' % (attr, elem.attrs[attr]))
30 strs.append("%s='%s'" % (attr, elem.attrs[attr]))
34 class HTMLRenderer(Renderer):
35 """Render HTMLElement as HTML."""
36 # TODO: check more strictly which tags do not need a closing tag...
37 UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]
39 def render_inner(self, elem):
42 self._recursive(child, texts)
45 def render(self, elem):
47 self._recursive(elem, texts)
50 def _recursive(self, elem, texts):
52 texts.append("<" + elem.name + self.attrs2str(elem) + ">")
54 self._recursive(child, texts)
55 if not elem.name in self.UNCLOSABLE_TAGS:
56 texts.append("</" + elem.name + ">")
59 texts.append(elem.text())
62 self._recursive(child, texts)
65 class HTMLElement(list):
66 """HTML element object to use as tree nodes."""
71 def __init__(self, type, name="", attrs={}):
73 create HTMLElement object.
76 type -- element type. HTMLElement.(ROOT|TAG|TEXT)
77 name -- element name (default: "")
78 attrs -- dict of attributes (default:{})
81 attr = dict(href="http://example.com/", target="_blank")
82 e = HTMLElement(HTMLElement.TAG, "a", attr)
83 # 'e' means <a href="http://example.com/" target="_blank">
88 self.attrs = dict(attrs)
91 self._next_elem = None
92 self._prev_elem = None
95 if self.type == HTMLElement.TAG:
96 return "<TAG:%s %s>" % (self.name, self._attrs2str())
97 elif self.type == HTMLElement.TEXT:
98 return "<TEXT:'%s'>" % self._text
102 def _attrs2str(self):
104 f = lambda x,y: x if y == None else "%s='%s'" % (x,y)
106 strs = [f(x,self.attrs[x]) for x in self.attrs]
107 return " ".join(strs)
109 # basic acquisition functions
def get_attribute(self, attr, default=None):
    """Look up attribute ``attr`` on this element.

    Arguments:
    attr -- attribute name to look up in self.attrs
    default -- value handed back when the attribute is absent
               (default: None)
    """
    try:
        return self.attrs[attr]
    except KeyError:
        # attribute not present on this element
        return default
def has_attribute(self, attr):
    """Tell whether ``attr`` is a key of this element's attribute dict.

    An attribute stored with a None value still counts as present.
    """
    if attr in self.attrs:
        return True
    return False
119 """returns content in the tag."""
122 def inner_html(self):
125 return rn.render_inner(self)
127 # navigation functions
129 """returns tag's parent element."""
133 """returns tag's next element."""
134 return self._next_elem
137 """returns tag's previous element."""
138 return self._prev_elem
140 # basic query functions
141 def get_elements_by_name(self, name):
143 self._r_get_elements_by_name(name, buf)
146 def _r_get_elements_by_name(self, name, buf):
147 if self.name == name:
150 i._r_get_elements_by_name(name, buf)
152 def get_element_by_id(self, id):
153 if "id" in self.attrs and self.attrs["id"] == id:
156 e = i.get_element_by_id(id)
159 #raise HTMLElementError("Element not found")
162 def get_elements_by_class(self, cls):
164 self._r_get_elements_by_class(cls, buf)
167 def _r_get_elements_by_class(self, cls, buf):
168 if self.get_attribute("class") == cls:
171 i._r_get_elements_by_class(cls, buf)
173 # manipulation functions
174 def append_tag(self, tag, attrs):
175 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
178 def remove_element(self, elem):
179 parent = elem.parent()
183 # TODO: this function is under implementing...
184 def select(self, expr):
185 terms = expr.strip().split()
192 t.extend(self._select_pattern(pat, elem))
196 def _select_pattern(self, pat, elem):
199 results = [elem.get_element_by_id(pat[1:]),]
201 results = elem.get_elements_by_class(pat[1:])
202 return [x for x in results if x]
204 def select_1st(self, expr):
205 r = self.select(expr)
211 def select_by_name2(self, term1, term2):
212 tbl = self.get_elements_by_name(term1)
215 st = elem.get_elements_by_name(term2)
221 return self.type == HTMLElement.TEXT
224 return self.type == HTMLElement.TAG
227 return self.type == HTMLElement.ROOT
229 def is_descendant(self, tagname):
232 if p.name == tagname:
238 def trace_back(self, tag):
239 """ regexp string => list"""
241 rex = re.compile(tag)
244 if rex.search(p.name):
245 result.append(p.name)
250 class HTMLTreeError(Exception):
251 def __init__(self, msg, lineno, offset):
257 str = "HTML Parse Error: %s , line: %d, char: %d" % (self.msg, self.lineno, self.offset)
261 class HTMLTree(HTMLParser.HTMLParser):
263 USE_VALIDATE = 0x0001
265 IGNORE_BLANK = 0x0010
270 # TODO: check more strictly which tags do not need a closing tag...
271 UNCLOSABLE_TAGS = ["br", "link", "meta", "img", "input"]
275 HTMLParser.HTMLParser.__init__(self)
277 def parse(self, data, charset=None, option=0):
282 data -- HTML to parse
283 charset -- charset of HTML (default: None)
284 option -- option (default: 0, meaning none)
288 self.charset = charset
289 self._htmlroot = HTMLElement(HTMLElement.ROOT)
290 self._cursor = self._htmlroot
291 self._option = option
294 except HTMLParser.HTMLParseError, e:
295 raise HTMLTreeError("HTML parse error: " + e.msg,
298 # if charset is not given, detect charset
299 if self.charset == None:
301 metas = r.get_elements_by_name("meta")
303 if meta.attrs.get("http-equiv", None) == "Content-Type":
304 ctype = meta.attrs.get("content", "")
305 m = re.search(r"charset=([^;]+)", ctype)
307 self.charset = m.group(1)
312 self._htmlroot = HTMLElement(HTMLElement.ROOT)
313 self._cursor = self._htmlroot
322 def _r_finalize(self, elem):
328 elem[0]._next_elem = elem[1]
329 for i in range(1, l-1):
330 elem[i]._prev_elem = elem[i-1]
331 elem[i]._next_elem = elem[i+1]
333 elem[l-1]._prev_elem = elem[l-2]
335 for sub_elem in elem:
336 self._r_finalize(sub_elem)
340 self._r_validate(self, e)
343 def handle_starttag(self, tag, attrs):
344 # some tags treat as start-end tag.
345 if tag in self.UNCLOSABLE_TAGS:
346 return self.handle_startendtag(tag, attrs)
348 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
350 if self._option & HTMLTree.USE_VALIDATE > 0:
351 # try validation (experimental)
352 if tag == "li" and self._cursor.name == "li":
353 self.handle_endtag("li")
356 elem._parent = self._cursor
357 self._cursor.append(elem)
360 def handle_endtag(self, tag):
361 # some tags treat as start-end tag.
362 if tag in self.UNCLOSABLE_TAGS:
365 self._cursor = self._cursor.parent()
def handle_startendtag(self, tag, attrs):
    """Attach a childless TAG element under the current cursor.

    Arguments:
    tag -- tag name for the new element
    attrs -- attributes passed through to the HTMLElement constructor

    The cursor itself is left where it is, so later nodes become
    siblings of the new element rather than its children.
    """
    parent = self._cursor
    node = HTMLElement(HTMLElement.TAG, tag, attrs)
    node._parent = parent
    parent.append(node)
372 def handle_data(self, data):
373 if self._option & HTMLTree.IGNORE_BLANK > 0:
374 if re.search(r"^\s*$", data):
377 elem = HTMLElement(HTMLElement.TEXT)
378 elem._parent = self._cursor
380 # text encode check and convert.
381 # if charset is given, convert text to unicode type.
384 elem._text = unicode(data, self.charset)
386 # self.charset is utf-8.
389 # treat as unicode input
391 self._cursor.append(elem)
395 return self._htmlroot