OSDN Git Service

806e9722286effa125fb35a19ed48b5fe54b8539
[otptools/otptools.git] / html2sfjpwiki.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """convert html to sfjpwiki-style text."""
4
5 import urllib
6 import HTMLParser
7 import os.path
8 import re
9 import sys
10 from urlparse import urlparse
11
12
class Html2SfjpWiki(HTMLParser.HTMLParser):
    """html to sfjpwiki-style text converter.

    The converter walks an HTML document and appends wiki markup to an
    internal buffer.  When a target element id is given, everything is
    skipped until a start tag carrying that id is seen; conversion stops
    again when the ``div`` nesting counted from that point returns to zero.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._add_handlers()
        self._rex_empty = re.compile(r"^\s*$")
        self._target_id = ""
        self._id = ""
        self._reset_state()

    def _reset_state(self):
        """Reset everything that must not survive from one parse() to the next."""
        self._buf = []          # output fragments, joined by parse()
        self._thru = True       # True while input is being skipped
        self._div_counter = 0   # <div> nesting depth inside the converted region
        self._prevtag = ""      # start tag seen before the current one
        self._currenttag = ""   # most recent start tag
        self._stack = []        # text chunks collected inside the current link
        self._href = ""         # href of the link being collected, "" if none
        self._anchors = []
        self._imgs = []

    def parse(self, html_string, target_id):
        """Convert html_string and return the resulting wiki text.

        html_string -- HTML source to convert.
        target_id   -- id attribute of the element whose content should be
                       converted.  An empty value converts the whole input.

        The converter may be reused: every call starts from a clean state
        and returns only the text produced by that call.
        """
        # Drop parser and converter state left over from a previous call so
        # that output does not accumulate across calls.
        self.reset()
        self._reset_state()
        self._id = target_id or ""
        self._target_id = self._id
        if not self._id:
            # No target element: convert from the very first tag.
            self._thru = False
        self.feed(html_string)
        # Flush whatever the parser is still buffering at end of input.
        self.close()
        return "".join(self._buf)

    def _put(self, text):
        """Append one output fragment to the result buffer."""
        self._buf.append(text)

    def _add_handlers(self):
        """Build the tag dispatch tables used by the start/end tag hooks."""
        # Tags that need real code rather than a fixed replacement string.
        self._start_h = {
            "a": self._start_a,
            "img": self._start_img,
        }
        self._end_h = {
            "a": self._end_a,
        }
        # Fixed text emitted for a start tag.
        self._rep_starttag = dict(
            p="",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            td="||",
            tr="",
            hr="----\n",
            h3="=== ",
            h4="==== ")
        # Fixed text emitted for an end tag.
        self._rep_endtag = dict(
            p="\n\n",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            td="||",
            tr="\n",
            h3=" ===\n",
            h4=" ====\n")
        # Tags that are dropped while their content is still converted.
        self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]

    def handle_starttag(self, tag, attrs):
        """Emit the wiki markup for a start tag, or open the target region."""
        self._prevtag = self._currenttag
        self._currenttag = tag

        if self._thru:
            # Still skipping: only a tag carrying the target id ends the skip.
            attr_map = dict(attrs)
            if "id" not in attr_map or attr_map["id"] != self._id:
                return
            self._thru = False

        if tag == "div":
            self._div_counter += 1

        if tag in self._tag_ignore:
            return
        if tag in self._rep_starttag:
            self._put(self._rep_starttag[tag])
            return
        if tag in self._start_h:
            self._start_h[tag](tag, attrs)

    def handle_endtag(self, tag):
        """Emit the wiki markup for an end tag, or close the target region."""
        if self._thru:
            return

        if tag == "div":
            self._div_counter -= 1
            # Only a region opened by a target id can be closed again.
            # Without a target id nothing could ever re-open it, so the
            # rest of the document would be silently dropped.
            if self._div_counter == 0 and self._id:
                self._thru = True
                return

        if tag in self._tag_ignore:
            return
        if tag in self._rep_endtag:
            self._put(self._rep_endtag[tag])
            return
        if tag in self._end_h:
            self._end_h[tag](tag)

    def handle_data(self, data):
        """Copy text to the output, or collect it as the label of a link."""
        if self._thru or self._rex_empty.search(data):
            return

        if self._href:
            # Inside <a href=...>: keep the text until the link is closed.
            self._stack.append(data)
        else:
            self._put(data.rstrip())

    def handle_charref(self, ref):
        """Numeric character references reported through this hook are dropped."""
        pass

    def handle_entityref(self, name):
        """Named entity references reported through this hook are dropped."""
        pass

    # tag specific handlers
    def _flush_link_text(self):
        """Emit text collected for a link as plain text and forget it."""
        if self._stack:
            self._put("".join(self._stack))
            self._stack = []

    def _start_img(self, tag, attrs):
        """Emit a ``Thumb(...)`` line for an <img> tag.

        When the image directly follows the start tag of a link that is
        still open, the file name is taken from the link target and the
        link itself is consumed; otherwise it comes from the src attribute.
        """
        attr_map = dict(attrs)
        # Valueless attributes are reported as None; treat them as empty.
        src = attr_map.get("src") or ""
        title = attr_map.get("alt") or ""

        # _prevtag stays "a" after the link has been closed, so the pending
        # href must be checked as well before using it as the file name.
        if self._prevtag == "a" and self._href:
            # Text seen before the image would otherwise be lost, because
            # the link is consumed here and _end_a will not emit it.
            self._flush_link_text()
            filename = self._href.split("/")[-1]
            self._href = ""
        else:
            filename = src.split("/")[-1]

        self._put("Thumb(%s, caption=%s)\n" % (filename, title))

    def _start_a(self, tag, attrs):
        """Remember the href of a link; its text is emitted by _end_a."""
        href = ""
        for (attr, val) in attrs:
            if attr == "href":
                href = val
                break
        if href:
            # Text left over from a link that was never closed must not
            # become part of this link's label.
            self._flush_link_text()
            self._href = href

    def _end_a(self, tag):
        """Emit ``[href text]`` for the link opened by _start_a."""
        if not self._href:
            return
        # A link may contain several text chunks (e.g. split by <br>);
        # use all of them and leave nothing behind for the next link.
        content = "".join(self._stack)
        self._stack = []
        self._put("".join(("[", self._href, " ", content, "]")))
        self._href = ""

    def _regularize_url(self, url):
        """Return url as an absolute URL, resolved against the base URL.

        The base is read from self._base_url_items, which is expected to be
        the 6-item result of urlparse() for the page URL:

            urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
              0       1           2                      3    4      5
             ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')

        If no base has been set, or url has a scheme but no host (such as
        "mailto:..."), url is returned unchanged.
        """
        # URL paths always use "/", so use posixpath rather than os.path.
        import posixpath

        base = getattr(self, "_base_url_items", None)
        if base is None:
            return url

        term = list(urlparse(url))
        if term[0] and not term[1]:
            # Scheme without host: not resolvable against an http-style base.
            return url

        same_host = not term[1]
        term[0] = (term[0] or base[0]) + "://"
        if same_host:
            term[1] = base[1]

        if term[2]:
            if not term[2].startswith("/"):
                relative = term[2]
                # An empty base directory means the document root.
                base_dir = posixpath.dirname(base[2]) or "/"
                resolved = posixpath.normpath(
                    posixpath.join(base_dir, relative))
                # normpath() strips a trailing slash, which is significant
                # in a URL.
                if relative.endswith("/") and not resolved.endswith("/"):
                    resolved += "/"
                term[2] = resolved
        elif same_host:
            # "#frag" or "?query" alone refers to the current document.
            term[2] = base[2]

        if term[3]:
            term[3] = ";" + term[3]
        if term[4]:
            term[4] = "?" + term[4]
        if term[5]:
            term[5] = "#" + term[5]

        return "".join(term)
213     
214