OSDN Git Service

append TEXTRenderer and HTMLElement.inner_text()
author Hiromichi MATSUSHIMA <hirom@office-sv.osdn.jp>
Thu, 28 Jul 2011 11:27:56 +0000 (20:27 +0900)
committer Hiromichi MATSUSHIMA <hirom@office-sv.osdn.jp>
Thu, 28 Jul 2011 11:27:56 +0000 (20:27 +0900)
htmltree.py

index a9e57b8..fea5a7b 100644 (file)
@@ -63,6 +63,34 @@ class HTMLRenderer(Renderer):
         elif elem.is_decl():
             texts.append("<!" + elem.name + ">")
 
+
+class TEXTRenderer(Renderer):
+    """Render HTMLElement as TEXT."""
+    # TODO: check tags not need to close more strict...
+    UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]
+
+    def render_inner(self, elem):
+        texts = []
+        for child in elem:
+            self._recursive(child, texts)
+        return "".join(texts)
+
+    def render(self, elem):
+        texts = []
+        self._recursive(elem, texts)
+        return "".join(texts)
+
+    def _recursive(self, elem, texts):
+        if elem.is_tag():
+            for child in elem:
+                self._recursive(child, texts)
+        elif elem.is_text():
+            if elem.text():
+                texts.append(elem.text())
+        elif elem.is_root():
+            for child in elem:
+                self._recursive(child, texts)
+
 class HTMLElement(list):
     """HTML element object to use as tree nodes."""
     ROOT = 0
@@ -135,6 +163,11 @@ class HTMLElement(list):
         rn = HTMLRenderer()
         return rn.render_inner(self)
 
+    def inner_text(self):
+        "returns inner text"
+        rn = TEXTRenderer()
+        return rn.render_inner(self)
+
     # navigation functions
     def parent(self):
         """returns tag's parent element."""