OSDN Git Service

append TEXTRenderer and HTMLElement.inner_text()
[htmltree-py/htmltree.git] / htmltree.py
index 8afcfb4..fea5a7b 100644 (file)
@@ -20,9 +20,14 @@ class HTMLElementError(Exception):
 class Renderer(object):
     """HTMLElement Render base class."""
     def attrs2str(self, elem):
-        f = lambda x,y: x if y == None else "%s='%s'" % (x,y)
-
-        strs = [f(x,elem.attrs[x]) for x in elem.attrs]
+        strs = []
+        for attr in elem.attrs:
+            if elem.attrs[attr] == None:  # valueless attribute: emit the bare name
+                strs.append(attr)
+            elif "'" in elem.attrs[attr]:  # value contains ', so wrap it in " instead
+                strs.append('%s="%s"' % (attr, elem.attrs[attr]))  # NOTE(review): a " inside the value is not escaped
+            else:
+                strs.append("%s='%s'" % (attr, elem.attrs[attr]))
         strs.insert(0, "")
         return " ".join(strs)
 
@@ -55,13 +60,43 @@ class HTMLRenderer(Renderer):
         elif elem.is_root():
             for child in elem:
                 self._recursive(child, texts)
-        
+        elif elem.is_decl():
+            texts.append("<!" + elem.name + ">")
+
+
+class TEXTRenderer(Renderer):
+    """Render an HTMLElement tree as plain text (text nodes only, tags dropped)."""
+    # TODO: check more strictly which tags do not need a closing tag.
+    UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]  # not referenced by the methods of this class
+
+    def render_inner(self, elem):  # text of elem's children, visited in order
+        texts = []
+        for child in elem:
+            self._recursive(child, texts)
+        return "".join(texts)
+
+    def render(self, elem):  # text of elem itself and everything below it
+        texts = []
+        self._recursive(elem, texts)
+        return "".join(texts)
+
+    def _recursive(self, elem, texts):  # appends collected text fragments to "texts" in place
+        if elem.is_tag():  # a tag emits nothing itself; only its children are visited
+            for child in elem:
+                self._recursive(child, texts)
+        elif elem.is_text():
+            if elem.text():  # skip text nodes whose text is falsy
+                texts.append(elem.text())
+        elif elem.is_root():  # any other node type (e.g. DECL) contributes nothing
+            for child in elem:
+                self._recursive(child, texts)
 
 class HTMLElement(list):
     """HTML element object to use as tree nodes."""
     ROOT = 0
     TAG = 100
     TEXT = 200
+    DECL = 300  # node type used for declarations received by HTMLTree.handle_decl
 
     def __init__(self, type, name="", attrs={}):
         """
@@ -89,11 +124,16 @@ class HTMLElement(list):
     def __repr__(self):
         if self.type == HTMLElement.TAG:
             return "<TAG:%s %s>" % (self.name, self._attrs2str())
+        elif self.type == HTMLElement.DECL:  # for DECL nodes, self.name holds the declaration string
+            return "<DECL:'%s'>" % self.name
         elif self.type == HTMLElement.TEXT:
             return "<TEXT:'%s'>" % self._text
         else:
             return "<UNKNOWN>"
 
+    def __eq__(self, other):  # identity comparison; overrides list's element-wise equality
+        return id(self) == id(other)
+
     def _attrs2str(self):
         str = []
         f = lambda x,y: x if y == None else "%s='%s'" % (x,y)
@@ -106,6 +146,10 @@ class HTMLElement(list):
         """returns given attribute's value."""
         return self.attrs.get(attr, default)
 
+    def attr(self, attr, default=None):
+        """returns given attribute's value, or "default" if the attribute is absent."""
+        return self.attrs.get(attr, default)
+
     def has_attribute(self, attr):
         """returns True if element has "attr" attribute."""
         return attr in self.attrs
@@ -119,6 +163,11 @@ class HTMLElement(list):
         rn = HTMLRenderer()
         return rn.render_inner(self)
 
+    def inner_text(self):
+        "returns the text of this element's children, rendered with TEXTRenderer"
+        rn = TEXTRenderer()
+        return rn.render_inner(self)
+
     # navigation functions
     def parent(self):
         """returns tag's parent element."""
@@ -165,11 +214,30 @@ class HTMLElement(list):
         for i in self:
             i._r_get_elements_by_class(cls, buf)
 
+    def get_elements(self, name, attrs):  # elements found by name whose attributes all equal those in "attrs"
+        elems = self.get_elements_by_name(name)  # candidates are selected by name first
+        results = []
+        for elem in elems:
+            for name in attrs:  # "name" is rebound to each attribute key from here on
+                if elem.get_attribute(name, "") != attrs[name]:  # "" is the fallback passed for a missing attribute
+                    break
+            else:  # no break: every requested attribute matched
+                results.append(elem)
+        return results
+
     # manipulation functions
     def append_tag(self, tag, attrs):
         elem = HTMLElement(HTMLElement.TAG, tag, attrs)
         self.append(elem)
 
+    def remove_element(self, elem):
+        parent = elem.parent()  # uses elem's own parent; self is not consulted
+        parent.remove(elem)
+
+    def delete(self):  # removes this element from its parent
+        p = self.parent()
+        p.remove(self)  # NOTE(review): fails if parent() returns None - confirm callers never delete a parentless node
+
     # query functions
     # TODO: this function is under implementing...
     def select(self, expr):
@@ -217,6 +285,9 @@ class HTMLElement(list):
     def is_root(self):
         return self.type == HTMLElement.ROOT
 
+    def is_decl(self):  # True when this node's type is DECL
+        return self.type == HTMLElement.DECL
+
     def is_descendant(self, tagname):
         p = self.parent()
         while p != None:
@@ -247,7 +318,14 @@ class HTMLTreeError(Exception):
     def __repr__(self):
         str = "HTML Parse Error: %s , line: %d, char: %d" % (self.msg, self.lineno, self.offset)
         return str
-    
+
+
+def parse(data, charset=None, option=0):
+    "parse HTML data with a new HTMLTree and return that tree"
+    tree = HTMLTree()
+    tree.parse(data, charset, option)
+    return tree
+
 
 class HTMLTree(HTMLParser.HTMLParser):
     "HTML Tree Builder"
@@ -381,6 +459,19 @@ class HTMLTree(HTMLParser.HTMLParser):
             elem._text = data
         self._cursor.append(elem)
 
+    def handle_entityref(self, name):
+        data = "&" + name + ";"  # rebuild the reference text, e.g. name "amp" gives "&amp;"
+        self.handle_data(data)  # forwarded to handle_data as ordinary text, not decoded
+
+    def handle_charref(self, ref):
+        data = "&#" + ref + ";"  # rebuild the reference text, e.g. ref "160" gives "&#160;"
+        self.handle_data(data)  # forwarded to handle_data as ordinary text, not decoded
+
+    def handle_decl(self, decl):
+        elem = HTMLElement(HTMLElement.DECL, decl)  # the declaration string is stored as the node name
+        elem._parent = self._cursor
+        self._cursor.append(elem)  # added as a child of the cursor node; the cursor itself is not moved
+
     # Accessor
     def root(self):
         return self._htmlroot