2 # -*- coding: utf-8 -*-
4 """htmltree.py - HTML Element-Tree Builder
5 by hylom <hylomm@@single_at_mark@@gmail.com>
11 class HTMLElementError(Exception):
12 def __init__(self, msg, elem):
17 str = "HTML Element Error: %s in %s" % (self.msg, self.elem)
20 class Renderer(object):
21 """HTMLElement Render base class."""
22 def attrs2str(self, elem):
24 for attr in elem.attrs:
25 if elem.attrs[attr] == None:
27 elif "'" in elem.attrs[attr]:
28 strs.append('%s="%s"' % (attr, elem.attrs[attr]))
30 strs.append("%s='%s'" % (attr, elem.attrs[attr]))
34 class HTMLRenderer(Renderer):
35 """Render HTMLElement as HTML."""
36 # TODO: check more strictly which tags do not need a closing tag...
37 UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]
39 def render_inner(self, elem):
42 self._recursive(child, texts)
45 def render(self, elem):
47 self._recursive(elem, texts)
50 def _recursive(self, elem, texts):
52 texts.append("<" + elem.name + self.attrs2str(elem) + ">")
54 self._recursive(child, texts)
55 if not elem.name in self.UNCLOSABLE_TAGS:
56 texts.append("</" + elem.name + ">")
59 texts.append(elem.text())
62 self._recursive(child, texts)
65 class HTMLElement(list):
66 """HTML element object to use as tree nodes."""
71 def __init__(self, type, name="", attrs={}):
73 create HTMLElement object.
76 type -- element type. HTMLElement.(ROOT|TAG|TEXT)
77 name -- element name (default: "")
78 attrs -- dict of attributes (default:{})
81 attr = dict(href="http://example.com/", target="_blank")
82 e = HTMLElement(HTMLElement.TAG, "a", attr)
83 # 'e' means <a href="http://example.com/" target="_blank">
88 self.attrs = dict(attrs)
91 self._next_elem = None
92 self._prev_elem = None
95 if self.type == HTMLElement.TAG:
96 return "<TAG:%s %s>" % (self.name, self._attrs2str())
97 elif self.type == HTMLElement.TEXT:
98 return "<TEXT:'%s'>" % self._text
102 def _attrs2str(self):
104 f = lambda x,y: x if y == None else "%s='%s'" % (x,y)
106 strs = [f(x,self.attrs[x]) for x in self.attrs]
107 return " ".join(strs)
109 # basic acquisition functions
110 def get_attribute(self, attr, default=None):
111 """returns given attribute's value."""
112 return self.attrs.get(attr, default)
114 def has_attribute(self, attr):
115 """returns True if element has "attr" attribute."""
116 return attr in self.attrs
119 """returns content in the tag."""
122 def inner_html(self):
125 return rn.render_inner(self)
127 # navigation functions
129 """returns tag's parent element."""
133 """returns tag's next element."""
134 return self._next_elem
137 """returns tag's previous element."""
138 return self._prev_elem
140 # basic query functions
141 def get_elements_by_name(self, name):
143 self._r_get_elements_by_name(name, buf)
146 def _r_get_elements_by_name(self, name, buf):
147 if self.name == name:
150 i._r_get_elements_by_name(name, buf)
152 def get_element_by_id(self, id):
153 if "id" in self.attrs and self.attrs["id"] == id:
156 e = i.get_element_by_id(id)
159 #raise HTMLElementError("Element not found")
162 def get_elements_by_class(self, cls):
164 self._r_get_elements_by_class(cls, buf)
167 def _r_get_elements_by_class(self, cls, buf):
168 if self.get_attribute("class") == cls:
171 i._r_get_elements_by_class(cls, buf)
173 # manipulation functions
174 def append_tag(self, tag, attrs):
175 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
179 # TODO: this function is still under implementation...
180 def select(self, expr):
181 terms = expr.strip().split()
188 t.extend(self._select_pattern(pat, elem))
192 def _select_pattern(self, pat, elem):
195 results = [elem.get_element_by_id(pat[1:]),]
197 results = elem.get_elements_by_class(pat[1:])
198 return [x for x in results if x]
200 def select_1st(self, expr):
201 r = self.select(expr)
207 def select_by_name2(self, term1, term2):
208 tbl = self.get_elements_by_name(term1)
211 st = elem.get_elements_by_name(term2)
217 return self.type == HTMLElement.TEXT
220 return self.type == HTMLElement.TAG
223 return self.type == HTMLElement.ROOT
225 def is_descendant(self, tagname):
228 if p.name == tagname:
234 def trace_back(self, tag):
235 """ regexp string => list"""
237 rex = re.compile(tag)
240 if rex.search(p.name):
241 result.append(p.name)
246 class HTMLTreeError(Exception):
247 def __init__(self, msg, lineno, offset):
253 str = "HTML Parse Error: %s , line: %d, char: %d" % (self.msg, self.lineno, self.offset)
257 class HTMLTree(HTMLParser.HTMLParser):
259 USE_VALIDATE = 0x0001
261 IGNORE_BLANK = 0x0010
266 # TODO: check more strictly which tags do not need a closing tag...
267 UNCLOSABLE_TAGS = ["br", "link", "meta", "img", "input"]
271 HTMLParser.HTMLParser.__init__(self)
273 def parse(self, data, charset=None, option=0):
278 data -- HTML to parse
279 charset -- charset of HTML (default: None)
280 option -- option (default: 0, meaning none)
284 self.charset = charset
285 self._htmlroot = HTMLElement(HTMLElement.ROOT)
286 self._cursor = self._htmlroot
287 self._option = option
290 except HTMLParser.HTMLParseError, e:
291 raise HTMLTreeError("HTML parse error: " + e.msg,
294 # if charset is not given, detect charset
295 if self.charset == None:
297 metas = r.get_elements_by_name("meta")
299 if meta.attrs.get("http-equiv", None) == "Content-Type":
300 ctype = meta.attrs.get("content", "")
301 m = re.search(r"charset=([^;]+)", ctype)
303 self.charset = m.group(1)
308 self._htmlroot = HTMLElement(HTMLElement.ROOT)
309 self._cursor = self._htmlroot
318 def _r_finalize(self, elem):
324 elem[0]._next_elem = elem[1]
325 for i in range(1, l-1):
326 elem[i]._prev_elem = elem[i-1]
327 elem[i]._next_elem = elem[i+1]
329 elem[l-1]._prev_elem = elem[l-2]
331 for sub_elem in elem:
332 self._r_finalize(sub_elem)
336 self._r_validate(self, e)
339 def handle_starttag(self, tag, attrs):
340 # some tags are treated as start-end tags.
341 if tag in self.UNCLOSABLE_TAGS:
342 return self.handle_startendtag(tag, attrs)
344 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
346 if self._option & HTMLTree.USE_VALIDATE > 0:
347 # try validation (experimental)
348 if tag == "li" and self._cursor.name == "li":
349 self.handle_endtag("li")
352 elem._parent = self._cursor
353 self._cursor.append(elem)
356 def handle_endtag(self, tag):
357 # some tags are treated as start-end tags.
358 if tag in self.UNCLOSABLE_TAGS:
361 self._cursor = self._cursor.parent()
363 def handle_startendtag(self, tag, attrs):
364 elem = HTMLElement(HTMLElement.TAG, tag, attrs)
365 elem._parent = self._cursor
366 self._cursor.append(elem)
368 def handle_data(self, data):
369 if self._option & HTMLTree.IGNORE_BLANK > 0:
370 if re.search(r"^\s*$", data):
373 elem = HTMLElement(HTMLElement.TEXT)
374 elem._parent = self._cursor
376 # check the text encoding and convert.
377 # if charset is given, convert text to unicode type.
380 elem._text = unicode(data, self.charset)
382 # self.charset is utf-8.
385 # treat as unicode input
387 self._cursor.append(elem)
391 return self._htmlroot