OSDN Git Service

implementing html2sfjpwiki...
author hylom <hylom@users.sourceforge.jp>
Tue, 10 Nov 2009 11:53:46 +0000 (20:53 +0900)
committer hylom <hylom@users.sourceforge.jp>
Tue, 10 Nov 2009 11:53:46 +0000 (20:53 +0900)
html2sfjpwiki.py [new file with mode: 0644]
sfjpmag2wiki.py [new file with mode: 0644]
spyder.py
test_html2sfjpwiki.py [new file with mode: 0755]

diff --git a/html2sfjpwiki.py b/html2sfjpwiki.py
new file mode 100644 (file)
index 0000000..806e972
--- /dev/null
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""convert html to sfjpwiki-style text."""
+
+import urllib
+import HTMLParser
+import os.path
+import re
+import sys
+from urlparse import urlparse
+
+
+class Html2SfjpWiki(HTMLParser.HTMLParser):
+    "html to sfjpwiki-style text converter"
+    # Streams HTML through Python 2's HTMLParser and emits sfjpwiki markup
+    # for the subtree rooted at the element whose id matches the target id.
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+        # NOTE(review): _target_id is never read; parse() stores the id in
+        # self._id instead -- one of the two names looks vestigial.
+        self._target_id = ""
+        self._buf = []            # output fragments, joined by parse()
+        self._add_handlers()
+        self._thru = True         # True while skipping input outside the target element
+        self._rex_empty = re.compile(r"^\s*$")  # whitespace-only data nodes
+        self._div_counter = 0     # <div> nesting depth inside the target element
+        self._prevtag = ""        # tag seen before the current one
+        self._currenttag = ""
+        self._stack = []          # text collected between <a> and </a>
+        self._href = ""           # href of the currently open <a>, "" when none
+
+    def parse(self, html_string, target_id):
+        """Parse html_string with url, and return anchors"""
+        # NOTE(review): docstring is stale -- this returns the converted wiki
+        # text; _anchors/_imgs are initialized here but filled nowhere else.
+        # NOTE(review): _buf and _thru are not reset here, so a second
+        # parse() call on the same instance appends to the previous result.
+        self._anchors = []
+        self._imgs = []
+        self._id = target_id
+        if self._id == "":
+            # empty id means "convert the whole document"
+            self._thru = False
+        self.feed(html_string)
+        return "".join(self._buf)
+
+    def _put(self, str):
+        # Append one output fragment.
+        # NOTE(review): parameter name shadows the builtin str.
+        self._buf.append(str)
+
+    def _add_handlers(self):
+        # Build the dispatch tables: callables for tags that need logic
+        # (<a>, <img>), literal replacement strings for simple markup tags.
+        h_start = dict()
+        h_end = dict()
+
+        h_start["a"] = self._start_a
+        h_start["img"] = self._start_img
+        h_end["a"] = self._end_a
+
+        self._start_h = h_start
+        self._end_h = h_end
+        # sfjpwiki markup emitted in place of each opening tag
+        self._rep_starttag = dict(
+            p="",
+            i="''",
+            tt="`",
+            b="'''",
+            strong="'''",
+            big="'''",
+            small="__",
+            td="||",
+            tr="",
+            hr="----\n",
+            h3="=== ",
+            h4="==== " )
+        # sfjpwiki markup emitted in place of each closing tag
+        self._rep_endtag = dict(
+            p="\n\n",
+            i="''",
+            tt="`",
+            b="'''",
+            strong="'''",
+            big="'''",
+            small="__",
+            td="||",
+            tr="\n",
+            h3=" ===\n",
+            h4=" ====\n" )
+        # tags dropped entirely (their contents still pass through)
+        self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]
+        
+    def handle_starttag(self, tag, attrs):
+        # Remember the previous tag so _start_img can tell whether an image
+        # is wrapped in the <a> that was just opened.
+        self._prevtag = self._currenttag
+        self._currenttag = tag
+
+        if self._thru:
+            # Still outside the target element: start converting only when a
+            # tag carrying the target id appears.
+            d_attrs = dict(attrs)
+            if d_attrs.has_key("id"):
+                if d_attrs["id"] == self._id:
+                    self._thru = False
+                else:
+                    return
+            else:
+                return
+
+        if tag == "div":
+            # Track div nesting; handle_endtag uses this to detect the end
+            # of the target element.
+            self._div_counter += 1
+
+        if tag in self._tag_ignore:
+            # do nothing
+            return
+        if self._rep_starttag.has_key(tag):
+            # simple tag: emit its wiki-markup replacement
+            self._put(self._rep_starttag[tag])
+            return
+        if self._start_h.has_key(tag):
+            # tags with dedicated handlers (<a>, <img>)
+            self._start_h[tag](tag, attrs)
+            return
+
+    def handle_endtag(self, tag):
+        if self._thru:
+            return
+
+        if tag == "div":
+            self._div_counter -= 1
+            if self._div_counter == 0:
+                # Outermost div closed: treat the target element as finished.
+                # NOTE(review): this assumes the target element is itself a
+                # <div>; for any other target tag the counter reaches 0 on
+                # the first nested </div> and conversion stops early.
+                self._thru = True
+                return
+
+        if tag in self._tag_ignore:
+            # do nothing
+            return
+        if self._rep_endtag.has_key(tag):
+            self._put(self._rep_endtag[tag])
+            return
+        if self._end_h.has_key(tag):
+            self._end_h[tag](tag)
+            return
+
+    def handle_data(self, data):
+        if self._thru:
+            return
+        if self._rex_empty.search(data):
+            # drop whitespace-only text nodes
+            return
+
+        if self._href:
+            # inside <a>: defer the text so _end_a can wrap it in [url text]
+            self._stack.append(data)
+        else:
+            self._put(data.rstrip())
+
+    def handle_charref(self, ref):
+        # NOTE(review): numeric character references (&#nnn;) are silently
+        # dropped from the output.
+        pass
+
+    def handle_entityref(self, name):
+        # NOTE(review): named entities (&amp; etc.) are silently dropped.
+        pass
+
+    # tag specific handlers
+    def _start_img(self, tag, attrs):
+        # Emit a Thumb macro for the image, preferring the enclosing link's
+        # filename (a linked thumbnail) over the img src.
+        src = ""
+        title = ""
+        for (attr, val) in attrs:
+            if attr == "src":
+                src = val
+            elif attr == "alt":
+                title = val
+
+        if self._prevtag == "a":
+            # image wrapped in a link: use the link target's filename and
+            # consume the pending href so _end_a emits nothing for it
+            filename = self._href.split("/")[-1]
+            self._href = ""
+        else:
+            filename = src.split("/")[-1]
+
+        self._put("Thumb(%s, caption=%s)\n" % (filename, title))
+
+    def _start_a(self, tag, attrs):
+        # Remember the link target; the anchor text arrives via handle_data.
+        href = ""
+        for (attr, val) in attrs:
+            if attr == "href":
+                href = val
+                break
+        if href:
+            self._href = href
+
+    def _end_a(self, tag):
+        # Emit "[url text]" wiki link syntax for the most recent anchor.
+        # NOTE(review): only the last data chunk is popped, so an anchor
+        # whose text was split into several data events loses all but the
+        # final chunk.
+        if self._href:
+            if self._stack:
+                content = self._stack.pop()
+            else:
+                content = ""
+            self._put("[")
+            self._put(self._href)
+            self._put(" ")
+            self._put(content)
+            self._put("]")
+            self._href = ""
+        
+    def _regularize_url(self, url):
+        """regularize given url."""
+        # Resolve a possibly-relative URL against the parsed base URL and
+        # reassemble it into an absolute URL string.
+        # NOTE(review): self._base_url_items is never assigned in this
+        # class, so calling this method would raise AttributeError; it also
+        # appears to be unused in this file.
+        # urlparse.urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
+        #
+        #       0       1           2                      3    4      5      
+        #  -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
+        #
+        current_term = self._base_url_items
+        current_dir = os.path.dirname(current_term[2])
+        # NOTE(review): current_last is computed but never used.
+        current_last = os.path.basename(current_term[2])
+
+        result = urlparse(url)
+        term = list(result)
+        
+        if not term[0]:
+            # scheme missing: inherit from the base URL
+            term[0] = current_term[0] + "://"
+        else:
+            term[0] = term[0] + "://"
+        if not term[1]:
+            # netloc missing: inherit from the base URL
+            term[1] = current_term[1]
+        if term[2] and term[2][0] != "/":
+            # relative path: join with the base document's directory
+            term[2] = os.path.normpath(current_dir + "/" + term[2])
+        if term[3]:
+            term[3] = ";" + term[3]
+        if term[4]:
+            term[4] = "?" + term[4]
+        if term[5]:
+            term[5] = "#" + term[5]
+
+        url = "".join(term)
+        return url
+    
+
diff --git a/sfjpmag2wiki.py b/sfjpmag2wiki.py
new file mode 100644 (file)
index 0000000..f953e2e
--- /dev/null
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""convert sfjpmagazine's story to sfjpwiki-style text."""
+
+# NOTE(review): re, os and urlparse are imported but not yet used -- this
+# file is a stub at this commit.
+import sys
+import re
+import os
+import urlparse
+
+# usage string; %s is filled with the script name on error exit
+_USAGE = """%s <html> <wikitext>"""
+
+# EAFP argument handling: exit with the usage message when either positional
+# argument (input html path, output wikitext path) is missing.
+try:
+    html = sys.argv[1]
+    wikitext = sys.argv[2]
+except IndexError:
+    sys.exit(_USAGE % (sys.argv[0],))
+
+
index eedc8e8..b66366c 100644 (file)
--- a/spyder.py
+++ b/spyder.py
@@ -17,7 +17,6 @@ import HTMLParser
 import os.path
 import re
 import sys
-
 from urlparse import urlparse
 
 
diff --git a/test_html2sfjpwiki.py b/test_html2sfjpwiki.py
new file mode 100755 (executable)
index 0000000..0a72cb9
--- /dev/null
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Test suite for spyder.py."""
+# NOTE(review): docstring is stale -- the module under test below is
+# html2sfjpwiki, not spyder.
+
+import unittest
+import re
+import html2sfjpwiki
+
+class TestSequenceFunctions(unittest.TestCase):
+    def setUp(self):
+        # Load a captured article body fixture and strip <script>/<noscript>
+        # elements, which the converter is not expected to handle.
+        self.test_file = "fetch_test/09/09/10/1214252/1/body.html"
+        fh = open(self.test_file, "r")
+        html = fh.read()
+        fh.close()
+
+        # re.S lets the patterns span newlines inside the elements
+        rex = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S)
+        rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S)
+        tmp = rex.sub("", html)
+        self.test_data = rex2.sub("", tmp)
+
+    def test_convert(self):
+        """test for convert"""
+        # NOTE(review): smoke test only -- it prints the converted text
+        # instead of asserting on it, so it can only fail by raising.
+        c = html2sfjpwiki.Html2SfjpWiki()
+        r = c.parse(self.test_data, "article-body")
+        print r
+
+
+# do unittest
+suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceFunctions)
+unittest.TextTestRunner(verbosity=2).run(suite)
+