html2sfjpwiki.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 """convert html to sfjpwiki-style text."""
   4
   5 import urllib
   6 import HTMLParser
   7 import os.path
   8 import re
   9 import sys
  10 from urlparse import urlparse
  11
  12
  13 class Html2SfjpWiki(HTMLParser.HTMLParser):
  14     "html to sfjpwiki-style text converter"
  15     def __init__(self):
  16         HTMLParser.HTMLParser.__init__(self)
  17         self._target_id = ""
  18         self._buf = []
  19         self._add_handlers()
  20         self._thru = True
  21         self._rex_empty = re.compile(r"^\s*$")
  22         self._div_counter = 0
  23         self._prevtag = ""
  24         self._currenttag = ""
  25         self._stack = []
  26         self._href = ""
  27
  28     def parse(self, html_string, target_id):
  29         """Parse html_string with url, and return anchors"""
  30         self._anchors = []
  31         self._imgs = []
  32         self._id = target_id
  33         if self._id == "":
  34             self._thru = False
  35         self.feed(html_string)
  36         return "".join(self._buf)
  37
  38     def _put(self, str):
  39         self._buf.append(str)
  40
  41     def _add_handlers(self):
  42         h_start = dict()
  43         h_end = dict()
  44
  45         h_start["a"] = self._start_a
  46         h_start["img"] = self._start_img
  47         h_end["a"] = self._end_a
  48
  49         self._start_h = h_start
  50         self._end_h = h_end
  51         self._rep_starttag = dict(
  52             p="",
  53             i="''",
  54             tt="`",
  55             b="'''",
  56             strong="'''",
  57             big="'''",
  58             small="__",
  59             td="||",
  60             tr="",
  61             hr="----\n",
  62             h3="=== ",
  63             h4="==== " )
  64         self._rep_endtag = dict(
  65             p="\n\n",
  66             i="''",
  67             tt="`",
  68             b="'''",
  69             strong="'''",
  70             big="'''",
  71             small="__",
  72             td="||",
  73             tr="\n",
  74             h3=" ===\n",
  75             h4=" ====\n" )
  76         self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]
  77
  78     def handle_starttag(self, tag, attrs):
  79         self._prevtag = self._currenttag
  80         self._currenttag = tag
  81
  82         if self._thru:
  83             d_attrs = dict(attrs)
  84             if d_attrs.has_key("id"):
  85                 if d_attrs["id"] == self._id:
  86                     self._thru = False
  87                 else:
  88                     return
  89             else:
  90                 return
  91
  92         if tag == "div":
  93             self._div_counter += 1
  94
  95         if tag in self._tag_ignore:
  96             # do nothing
  97             return
  98         if self._rep_starttag.has_key(tag):
  99             self._put(self._rep_starttag[tag])
 100             return
 101         if self._start_h.has_key(tag):
 102             self._start_h[tag](tag, attrs)
 103             return
 104
 105     def handle_endtag(self, tag):
 106         if self._thru:
 107             return
 108
 109         if tag == "div":
 110             self._div_counter -= 1
 111             if self._div_counter == 0:
 112                 self._thru = True
 113                 return
 114
 115         if tag in self._tag_ignore:
 116             # do nothing
 117             return
 118         if self._rep_endtag.has_key(tag):
 119             self._put(self._rep_endtag[tag])
 120             return
 121         if self._end_h.has_key(tag):
 122             self._end_h[tag](tag)
 123             return
 124
 125     def handle_data(self, data):
 126         if self._thru:
 127             return
 128         if self._rex_empty.search(data):
 129             return
 130
 131         if self._href:
 132             self._stack.append(data)
 133         else:
 134             self._put(data.rstrip())
 135
 136     def handle_charref(self, ref):
 137         pass
 138
 139     def handle_entityref(self, name):
 140         pass
 141
 142     # tag specific handlers
 143     def _start_img(self, tag, attrs):
 144         src = ""
 145         title = ""
 146         for (attr, val) in attrs:
 147             if attr == "src":
 148                 src = val
 149             elif attr == "alt":
 150                 title = val
 151
 152         if self._prevtag == "a":
 153             filename = self._href.split("/")[-1]
 154             self._href = ""
 155         else:
 156             filename = src.split("/")[-1]
 157
 158         self._put("Thumb(%s, caption=%s)\n" % (filename, title))
 159
 160     def _start_a(self, tag, attrs):
 161         href = ""
 162         for (attr, val) in attrs:
 163             if attr == "href":
 164                 href = val
 165                 break
 166         if href:
 167             self._href = href
 168
 169     def _end_a(self, tag):
 170         if self._href:
 171             if self._stack:
 172                 content = self._stack.pop()
 173             else:
 174                 content = ""
 175             self._put("[")
 176             self._put(self._href)
 177             self._put(" ")
 178             self._put(content)
 179             self._put("]")
 180             self._href = ""
 181
 182     def _regularize_url(self, url):
 183         """regularize given url."""
 184         # urlparse.urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
 185         #
 186         #       0       1           2                      3    4      5
 187         #  -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
 188         #
 189         current_term = self._base_url_items
 190         current_dir = os.path.dirname(current_term[2])
 191         current_last = os.path.basename(current_term[2])
 192
 193         result = urlparse(url)
 194         term = list(result)
 195
 196         if not term[0]:
 197             term[0] = current_term[0] + "://"
 198         else:
 199             term[0] = term[0] + "://"
 200         if not term[1]:
 201             term[1] = current_term[1]
 202         if term[2] and term[2][0] != "/":
 203             term[2] = os.path.normpath(current_dir + "/" + term[2])
 204         if term[3]:
 205             term[3] = ";" + term[3]
 206         if term[4]:
 207             term[4] = "?" + term[4]
 208         if term[5]:
 209             term[5] = "#" + term[5]
 210
 211         url = "".join(term)
 212         return url
 213
 214