OSDN Git Service

update html2sfjpwiki.py
author hylom <hylom@users.sourceforge.jp>
Fri, 13 Nov 2009 10:06:36 +0000 (19:06 +0900)
committer hylom <hylom@users.sourceforge.jp>
Fri, 13 Nov 2009 10:06:36 +0000 (19:06 +0900)
fetch_sfjpmag.py [changed mode: 0644->0755]
html2sfjpwiki.py
html2wiki.py [new file with mode: 0755]
sfmag_fetch.py [new file with mode: 0644]
sfmag_html2wiki.py [new file with mode: 0755]
test_fetch.py

diff --git a/fetch_sfjpmag.py b/fetch_sfjpmag.py
old mode 100644 (file)
new mode 100755 (executable)
diff --git a/html2sfjpwiki.py b/html2sfjpwiki.py
index 806e972..377857c 100644 (file)
--- a/html2sfjpwiki.py
+++ b/html2sfjpwiki.py
@@ -16,7 +16,6 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
         HTMLParser.HTMLParser.__init__(self)
         self._target_id = ""
         self._buf = []
-        self._add_handlers()
         self._thru = True
         self._rex_empty = re.compile(r"^\s*$")
         self._div_counter = 0
@@ -24,6 +23,13 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
         self._currenttag = ""
         self._stack = []
         self._href = ""
+        self.title = ""
+        self._start_handlers = {}
+        self._end_handlers = {}
+
+        self._stacking = False
+        self._add_handlers()
+        self._block = False
 
     def parse(self, html_string, target_id):
         """Parse html_string with url, and return anchors"""
@@ -35,46 +41,6 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
         self.feed(html_string)
         return "".join(self._buf)
 
-    def _put(self, str):
-        self._buf.append(str)
-
-    def _add_handlers(self):
-        h_start = dict()
-        h_end = dict()
-
-        h_start["a"] = self._start_a
-        h_start["img"] = self._start_img
-        h_end["a"] = self._end_a
-
-        self._start_h = h_start
-        self._end_h = h_end
-        self._rep_starttag = dict(
-            p="",
-            i="''",
-            tt="`",
-            b="'''",
-            strong="'''",
-            big="'''",
-            small="__",
-            td="||",
-            tr="",
-            hr="----\n",
-            h3="=== ",
-            h4="==== " )
-        self._rep_endtag = dict(
-            p="\n\n",
-            i="''",
-            tt="`",
-            b="'''",
-            strong="'''",
-            big="'''",
-            small="__",
-            td="||",
-            tr="\n",
-            h3=" ===\n",
-            h4=" ====\n" )
-        self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]
-        
     def handle_starttag(self, tag, attrs):
         self._prevtag = self._currenttag
         self._currenttag = tag
@@ -92,17 +58,15 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
         if tag == "div":
             self._div_counter += 1
 
-        if tag in self._tag_ignore:
-            # do nothing
-            return
-        if self._rep_starttag.has_key(tag):
-            self._put(self._rep_starttag[tag])
-            return
-        if self._start_h.has_key(tag):
-            self._start_h[tag](tag, attrs)
-            return
+        if self._start_handlers.has_key(tag):
+            f = self._start_handlers[tag]
+            t = f(self, tag, attrs)
+            self._put(t)
 
     def handle_endtag(self, tag):
+        self._prevtag = self._currenttag
+        self._currenttag = ""
+
         if self._thru:
             return
 
@@ -112,26 +76,31 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
                 self._thru = True
                 return
 
-        if tag in self._tag_ignore:
-            # do nothing
-            return
-        if self._rep_endtag.has_key(tag):
-            self._put(self._rep_endtag[tag])
-            return
-        if self._end_h.has_key(tag):
-            self._end_h[tag](tag)
-            return
+        if self._end_handlers.has_key(tag):
+            f = self._end_handlers[tag]
+            t = f(self, tag)
+            self._put(t)
 
     def handle_data(self, data):
+        if self._currenttag == "title":
+            self.title = data.strip()
+
         if self._thru:
             return
         if self._rex_empty.search(data):
             return
 
+        output = self.wiki_escape(data)
+
         if self._href:
-            self._stack.append(data)
+            self._stack.append(output)
         else:
-            self._put(data.rstrip())
+            self._put(output.rstrip())
+
+    def wiki_escape(self, data):
+        if not self._currenttag == "pre":
+            data = data.replace("__", "!__")
+        return data
 
     def handle_charref(self, ref):
         pass
@@ -139,8 +108,111 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
     def handle_entityref(self, name):
         pass
 
+    def _put(self, str, force=False):
+        if force == False and (str == None or self._block):
+            return
+        self._buf.append(str)
+
+    def _add_handlers(self):
+        """add start/end handlers for each tag."""
+        # generate simple replace rule.
+        # prepare dictionary. key is tag. value is replaced string.
+        r_starttag = dict(
+            p="",
+            i="''",
+            tt="`",
+            b="'''",
+            strong="'''",
+            big="'''",
+            small="__",
+            hr="----\n",
+            h3="=== ",
+            h4="==== ",
+            pre="{{{" )
+        # generate function to replace tag to string.
+        for key in r_starttag:
+            self._start_handlers[key] = lambda s, t, attr: r_starttag[t]
+
+        # for end tag, do same process.
+        r_endtag = dict(
+            p="\n\n",
+            i="''",
+            tt="`",
+            b="'''",
+            strong="'''",
+            big="'''",
+            small="__",
+            h3=" ===\n",
+            h4=" ====\n",
+            pre="\n}}}\n\n" )
+        for key in r_endtag:
+            self._end_handlers[key] = lambda s, t: r_endtag[t]
+
+        # add class's "_h_start_<tagname>" function to _start_handlers[tagname],
+        # "_h_end_<tagname>" function to _end_handlers[tagname].
+        # __class__.__dict__ is a dictionary which contains class's member functions.
+        for func in self.__class__.__dict__:
+            if func.find("_h_start_") == 0:
+                # for example, if "func" is "_h_start_img", then
+                # assign func to  self._start_handlers["img"].
+                tagname = func[len("_h_start_"):]
+                self._start_handlers[tagname] = self.__class__.__dict__[func]
+            if func.find("_h_end_") == 0:
+                # for example, if "func" is "_h_end_a", then
+                # assign func to self._end_handlers["a"].
+                tagname = func[len("_h_end_"):]
+                self._end_handlers[tagname] = self.__class__.__dict__[func]
+
     # tag specific handlers
-    def _start_img(self, tag, attrs):
+    
+    def _h_start_table(self, tag, attrs):
+        # if the tag has a "class" attribute whose value is "table":
+        if ("class", "table") in attrs:
+            # save old handlers
+            #self._table_h_tr = (self._start_handlers["tr"], self._end_handlers["tr"])
+            #self._table_h_td = (self._start_handlers["td"], self._end_handlers["td"])
+            #self._table_h_th = (self._start_handlers["th"], self._end_handlers["th"])
+
+            # set new handlers
+            self._start_handlers["tr"] = lambda s, t, a: "|"
+            self._start_handlers["td"] = lambda s, t, a: "|"
+            self._start_handlers["th"] = lambda s, t, a: "|'''"
+            self._end_handlers["tr"] = lambda s, a: "|\n"
+            self._end_handlers["td"] = lambda s, a: "|"
+            self._end_handlers["th"] = lambda s, a: "'''|"
+            self._block = False
+            self._in_table = True
+        else:
+            self._block = True
+            self._in_table = False
+        
+
+    def _h_end_table(self, tag):
+        if self._in_table:
+            self._in_table = False
+            self._block = False
+            del self._start_handlers["tr"]
+            del self._end_handlers["tr"]
+            del self._start_handlers["td"]
+            del self._end_handlers["td"]
+            del self._start_handlers["th"]
+            del self._end_handlers["th"]
+            return "\n"
+        else:
+            self._in_table = False
+            self._block = False
+
+    def _h_start_caption(self, tag, attrs):
+        del self._start_handlers["b"]
+        del self._end_handlers["b"]
+        return "====== "
+
+    def _h_end_caption(self, tagd):
+        self._start_handlers["b"] = lambda s, t, a: "'''"
+        self._end_handlers["b"] = lambda s, t: "'''"
+        return " ======\n"
+        
+    def _h_start_img(self, tag, attrs):
         src = ""
         title = ""
         for (attr, val) in attrs:
@@ -149,15 +221,18 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
             elif attr == "alt":
                 title = val
 
-        if self._prevtag == "a":
+        if self._prevtag == "a" and self._href:
             filename = self._href.split("/")[-1]
             self._href = ""
         else:
             filename = src.split("/")[-1]
 
-        self._put("Thumb(%s, caption=%s)\n" % (filename, title))
+        if title:
+            self._put("[[Thumb(%s, caption=%s)]]\n\n" % (filename, title), True)
+        else:
+            self._put("[[Thumb(%s)]]\n\n" % (filename,), True)
 
-    def _start_a(self, tag, attrs):
+    def _h_start_a(self, tag, attrs):
         href = ""
         for (attr, val) in attrs:
             if attr == "href":
@@ -166,7 +241,7 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
         if href:
             self._href = href
 
-    def _end_a(self, tag):
+    def _h_end_a(self, tag):
         if self._href:
             if self._stack:
                 content = self._stack.pop()
@@ -178,37 +253,3 @@ class Html2SfjpWiki(HTMLParser.HTMLParser):
             self._put(content)
             self._put("]")
             self._href = ""
-        
-    def _regularize_url(self, url):
-        """regularize given url."""
-        # urlparse.urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
-        #
-        #       0       1           2                      3    4      5      
-        #  -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
-        #
-        current_term = self._base_url_items
-        current_dir = os.path.dirname(current_term[2])
-        current_last = os.path.basename(current_term[2])
-
-        result = urlparse(url)
-        term = list(result)
-        
-        if not term[0]:
-            term[0] = current_term[0] + "://"
-        else:
-            term[0] = term[0] + "://"
-        if not term[1]:
-            term[1] = current_term[1]
-        if term[2] and term[2][0] != "/":
-            term[2] = os.path.normpath(current_dir + "/" + term[2])
-        if term[3]:
-            term[3] = ";" + term[3]
-        if term[4]:
-            term[4] = "?" + term[4]
-        if term[5]:
-            term[5] = "#" + term[5]
-
-        url = "".join(term)
-        return url
-    
-
diff --git a/html2wiki.py b/html2wiki.py
new file mode 100755 (executable)
index 0000000..d26c388
--- /dev/null
+++ b/html2wiki.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Convert an HTML file to sfjpwiki-style text, written to wiki.txt beside it."""
+
+import sys
+import os.path
+import html2sfjpwiki
+import re 
+
+target = sys.argv[1]
+fh = open(target, "r")
+html = fh.read()
+fh.close()
+
+dir = os.path.dirname(target)
+output = os.path.join(dir, "wiki.txt")
+
+rex = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S)
+rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S)
+tmp = rex.sub("", html)
+html_r = rex2.sub("", tmp)
+
+c = html2sfjpwiki.Html2SfjpWiki()
+r = c.parse(html_r, "article-body")
+
+fh = open(output, "w")
+fh.write(c.title + "\n\n")
+fh.write(r)
+fh.close()
+
+#print r
diff --git a/sfmag_fetch.py b/sfmag_fetch.py
new file mode 100644 (file)
index 0000000..bca25bc
--- /dev/null
+++ b/sfmag_fetch.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import fetch_sfjpmag
+import sys
+import os.path
+
+usage = "%s <url_list> <output_dir>" % sys.argv[0]
+
+try:
+    urllist = sys.argv[1]
+    output_dir = sys.argv[2]
+except IndexError:
+    sys.exit(usage)
+
+if not os.path.isdir(output_dir):
+    sys.exit(usage)
+
+try:
+    f = open(urllist, "r")
+except IOError:
+    sys.exit(usage)
+
+for url in f:
+    if url[0] == "#":
+        continue
+
+    url = url.strip()
+    fetch_sfjpmag.fetch(url, output_dir)
+
diff --git a/sfmag_html2wiki.py b/sfmag_html2wiki.py
new file mode 100755 (executable)
index 0000000..b5d280c
--- /dev/null
+++ b/sfmag_html2wiki.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""convert sfjpmagazine's story to sfjpwiki-style text."""
+
+import sys
+import re
+import os
+import os.path
+import urlparse
+
+import html2sfjpwiki
+
+usage = """%s <html>""" % sys.argv[0]
+
+try:
+    html = sys.argv[1]
+except IndexError:
+    sys.exit(usage)
+
+
+try:
+    fh = open(html, "r")
+except IOError:
+    sys.exit(usage)
+
+body = fh.read()
+fh.close()
+
+rex = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S)
+rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S)
+tmp = rex.sub("", body)
+tmp = rex2.sub("", tmp)
+
+c = html2sfjpwiki.Html2SfjpWiki()
+r = c.parse(tmp, "article-body")
+
+output = os.path.join(os.path.dirname(html), "wiki.txt")
+
+try:
+    fo = open(output, "w")
+except IOError:
+    sys.exit("cannot open output file: %s." % output)
+
+fo.write("[[PageNavi(NavigationList)]]\n\n\n")
+fo.write(r)
+fo.write("\n\n[[PageNavi(NavigationList)]]\n\n")
+fo.close()
+
diff --git a/test_fetch.py b/test_fetch.py
index 56bee1e..312a00d 100755 (executable)
--- a/test_fetch.py
+++ b/test_fetch.py
@@ -12,8 +12,8 @@ import fetch_sfjpmag
 
 class TestSequenceFunctions(unittest.TestCase):
     def setUp(self):
-        self.test_url = "http://sourceforge.jp/magazine/09/09/10/1214252"
-        self.test_dir = "fetch_test"
+        self.test_url = "http://sourceforge.jp/magazine/09/09/07/0257257"
+        self.test_dir = "fetch_test2"
 
     def test_fetch(self):
         """test for fetch"""