--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""convert html to sfjpwiki-style text."""
+
+import urllib
+import HTMLParser
+import os.path
+import re
+import sys
+from urlparse import urlparse
+
+
class Html2SfjpWiki(HTMLParser.HTMLParser):
    """Convert an HTML fragment into SourceForge.JP wiki-style text.

    Feed a whole HTML document to parse() together with the ``id``
    attribute of the element to convert; only the subtree rooted at
    that element is translated.  Simple tag-to-markup mappings live in
    ``_rep_starttag`` / ``_rep_endtag``; tags needing logic (``a``,
    ``img``) get dedicated handlers registered by _add_handlers().
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._target_id = ""    # kept for backward compatibility; parse() uses _id
        self._buf = []          # output fragments; joined into the result by parse()
        self._add_handlers()
        self._thru = True       # True while we are still OUTSIDE the target element
        self._rex_empty = re.compile(r"^\s*$")  # matches whitespace-only data
        self._div_counter = 0   # <div> nesting depth inside the target element
        self._prevtag = ""      # tag seen before the current one (for <a><img> links)
        self._currenttag = ""
        self._stack = []        # text chunks collected between <a> and </a>
        self._href = ""         # href of the currently open <a>, "" when none

    def parse(self, html_string, target_id):
        """Parse html_string and return wiki text for the element target_id.

        An empty target_id converts the whole document.  The returned
        value is the concatenation of everything emitted into _buf.
        """
        self._anchors = []
        self._imgs = []
        self._id = target_id
        if self._id == "":
            # No target id given: start converting immediately.
            self._thru = False
        self.feed(html_string)
        return "".join(self._buf)

    def _put(self, text):
        """Append one output fragment to the buffer."""
        self._buf.append(text)

    def _add_handlers(self):
        """Register per-tag handlers and the literal replacement tables."""
        h_start = dict()
        h_end = dict()

        h_start["a"] = self._start_a
        h_start["img"] = self._start_img
        h_end["a"] = self._end_a

        self._start_h = h_start
        self._end_h = h_end
        # Markup emitted verbatim when the tag opens...
        self._rep_starttag = dict(
            p="",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            td="||",
            tr="",
            hr="----\n",
            h3="=== ",
            h4="==== " )
        # ...and when it closes.
        self._rep_endtag = dict(
            p="\n\n",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            td="||",
            tr="\n",
            h3=" ===\n",
            h4=" ====\n" )
        # Tags dropped entirely (their text content is still processed).
        self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]

    def handle_starttag(self, tag, attrs):
        self._prevtag = self._currenttag
        self._currenttag = tag

        if self._thru:
            # Still outside the target element: start converting only
            # once an element carrying the requested id shows up.
            d_attrs = dict(attrs)
            if d_attrs.get("id") != self._id:
                return
            self._thru = False

        if tag == "div":
            # Track nesting so handle_endtag() can tell when the
            # target element itself is closed.
            self._div_counter += 1

        if tag in self._tag_ignore:
            # do nothing
            return
        if tag in self._rep_starttag:
            self._put(self._rep_starttag[tag])
            return
        if tag in self._start_h:
            self._start_h[tag](tag, attrs)
            return

    def handle_endtag(self, tag):
        if self._thru:
            return

        if tag == "div":
            self._div_counter -= 1
            if self._div_counter == 0:
                # The target element closed; ignore everything after it.
                self._thru = True
            return

        if tag in self._tag_ignore:
            # do nothing
            return
        if tag in self._rep_endtag:
            self._put(self._rep_endtag[tag])
            return
        if tag in self._end_h:
            self._end_h[tag](tag)
            return

    def handle_data(self, data):
        if self._thru:
            return
        if self._rex_empty.search(data):
            # Skip whitespace-only runs between tags.
            return

        if self._href:
            # Inside <a>...</a>: defer the text until _end_a() builds
            # the link markup.
            self._stack.append(data)
        else:
            self._put(data.rstrip())

    def handle_charref(self, ref):
        # Numeric character references are intentionally dropped.
        pass

    def handle_entityref(self, name):
        # Named entities are intentionally dropped.
        pass

    # tag specific handlers
    def _start_img(self, tag, attrs):
        """Emit a Thumb(...) macro for an <img>.

        When the image is wrapped in <a>, the link target (usually the
        full-size image) supplies the filename and the pending anchor
        is cancelled; otherwise the img src is used.
        """
        src = ""
        title = ""
        for (attr, val) in attrs:
            if attr == "src":
                src = val
            elif attr == "alt":
                title = val

        if self._prevtag == "a":
            filename = self._href.split("/")[-1]
            self._href = ""  # cancel the anchor: _end_a() must not emit a link
        else:
            filename = src.split("/")[-1]

        self._put("Thumb(%s, caption=%s)\n" % (filename, title))

    def _start_a(self, tag, attrs):
        """Remember the href; the link is emitted by _end_a()."""
        href = ""
        for (attr, val) in attrs:
            if attr == "href":
                href = val
                break
        if href:
            self._href = href

    def _end_a(self, tag):
        """Emit '[href text]' for the anchor opened by _start_a()."""
        if self._href:
            # Join EVERY chunk seen inside the anchor -- the parser may
            # deliver the link text in several handle_data() calls
            # (e.g. when it is split by character/entity references).
            content = "".join(self._stack)
            del self._stack[:]
            self._put("[")
            self._put(self._href)
            self._put(" ")
            self._put(content)
            self._put("]")
            self._href = ""

    def _regularize_url(self, url):
        """Regularize given url against the current base url.

        NOTE(review): relies on self._base_url_items, which is never
        assigned in this class -- a caller (or subclass) must set it to
        a urlparse() 6-tuple of the base URL before using this method;
        verify before calling.
        """
        # urlparse.urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
        #
        # 0      1           2                      3    4      5
        # -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
        #
        current_term = self._base_url_items
        current_dir = os.path.dirname(current_term[2])
        current_last = os.path.basename(current_term[2])

        result = urlparse(url)
        term = list(result)

        if not term[0]:
            # Relative scheme: inherit it from the base url.
            term[0] = current_term[0] + "://"
        else:
            term[0] = term[0] + "://"
        if not term[1]:
            term[1] = current_term[1]
        if term[2] and term[2][0] != "/":
            # Relative path: resolve against the base url's directory.
            term[2] = os.path.normpath(current_dir + "/" + term[2])
        if term[3]:
            term[3] = ";" + term[3]
        if term[4]:
            term[4] = "?" + term[4]
        if term[5]:
            term[5] = "#" + term[5]

        url = "".join(term)
        return url
+
+
--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Test suite for spyder.py."""
+
+import unittest
+import re
+import html2sfjpwiki
+
class TestSequenceFunctions(unittest.TestCase):
    """Smoke test: convert a fetched article body into wiki text."""

    def setUp(self):
        """Load the fixture HTML and strip <script>/<noscript> blocks."""
        self.test_file = "fetch_test/09/09/10/1214252/1/body.html"
        fh = open(self.test_file, "r")
        try:
            html = fh.read()
        finally:
            # Close the handle even if read() raises.
            fh.close()

        # Scripts carry no article text and would confuse the
        # converter, so remove them up front.
        rex = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S)
        rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S)
        tmp = rex.sub("", html)
        self.test_data = rex2.sub("", tmp)

    def test_convert(self):
        """parse() should return a string for the fixture document."""
        c = html2sfjpwiki.Html2SfjpWiki()
        r = c.parse(self.test_data, "article-body")
        self.assertTrue(isinstance(r, str))
        print(r)
+
+
# do unittest -- guarded so importing this module does not run the suite
if __name__ == "__main__":
    suite = unittest.TestLoader().loadTestsFromTestCase(TestSequenceFunctions)
    unittest.TextTestRunner(verbosity=2).run(suite)
+