OSDN Git Service

create html2wiki
[otptools/otptools.git] / html2wiki / html2sfjpwiki.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """convert html to sfjpwiki-style text."""
4
5 import urllib
6 import urlparse
7 import HTMLParser
8 import os.path
9 import re
10 import sys
11 import urlparse
12
class Html2SfjpWiki(HTMLParser.HTMLParser):
    """HTML to sfjpwiki-style text converter.

    Feed a document to parse(); start tags, end tags and text are
    translated through two per-instance handler tables and collected in
    an output buffer.  A start handler is called as f(self, tag, attrs)
    and an end handler as f(self, tag); a return value other than None
    is appended to the output.
    """

    # CamelCase-looking words; matches get a "!" prefix.
    # NOTE(review): presumably this stops the wiki from auto-linking
    # them -- confirm against the wiki syntax.
    _REX_CAMEL = re.compile(r"([A-Z][a-z0-9]+[A-Z][a-z0-9]+)")
    # A line starting with '''bold''' text is rewritten as a level-6 heading.
    _REX_CAPTION = re.compile(r"^'''(.+?)'''", re.M)
    _REX_HEADER = re.compile(r"^(==+ )(.+?)( ==+)", re.M)
    _REX_SFWIKI = re.compile(r"\?sf[0-9][0-9][0-9][0-9] ")
    _REX_IMAGE = re.compile(r"\.(png|PNG|gif|GIF|jpg|JPG)$")
    _REX_ARTICLE = re.compile(r"^(/magazine/\d\d/\d\d/\d\d/\d+)")
    _REX_ARTICLE_PAGE = re.compile(r"^/magazine/\d\d/\d\d/\d\d/\d+/(\d+)")

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._target_id = ""
        self._id = ""            # id of the element to convert; set by parse()
        self._anchors = []
        self._imgs = []
        self._buf = []           # output fragments, joined by parse()
        self._thru = True        # True while input is skipped (outside target)
        self._rex_empty = re.compile(r"^\s*$")
        self._div_counter = 0    # <div> depth counted while converting
        self._prevtag = ""
        self._currenttag = ""
        self._stack = []         # text chunks collected inside the current <a href>
        self._href = ""          # href of the <a> currently being collected
        self.title = ""          # text of the <title> element, if seen
        self._start_handlers = {}
        self._end_handlers = {}

        self._stacking = False
        self._add_handlers()
        self._block = False      # True suppresses non-forced output in _put()
        self._list_mode = ""     # "ul" or "ol", set by the list start handlers
        self._in_div = 0         # depth inside a navigation/column <div>
        self._inner = {}         # tag name -> (start tags seen - end tags seen)
        self._url_r_map = {}     # article path -> replacement link target
        self._pre_data = ""      # text accumulated inside <pre>
        self._in_column = False
        self._in_table = False   # True inside a <table class="table">

    def set_url_replace_list(self, fname):
        """Load URL replacements from the file fname.

        Each non-blank line holds two whitespace-separated fields: an
        article path and the text that replaces links to it (see
        _replace_url).  Blank lines are ignored; any other line that does
        not have exactly two fields raises ValueError.
        """
        with open(fname, "r") as f:
            for item in f:
                fields = item.split()
                if not fields:
                    continue
                (url, repl) = fields
                self._url_r_map[url] = repl

    def _post_proc(self, str):
        """Apply whole-text fix-ups to the converted text and return it."""
        camel = self._REX_CAMEL

        def escape_header(match):
            # Prefix CamelCase words in the heading text with "!".
            return (match.group(1)
                    + camel.sub(r"!\1", match.group(2))
                    + match.group(3))

        # Order matters: captions become headings first, so they are then
        # covered by the heading escape as well.
        t = self._REX_CAPTION.sub(r"====== \1 ======\n", str)
        t = self._REX_HEADER.sub(escape_header, t)
        t = self._REX_SFWIKI.sub(r"?sfwiki ", t)
        return t

    def parse(self, html_string, target_id):
        """Convert html_string and return the resulting wiki text.

        target_id is the "id" attribute of the element whose content is
        converted; everything before that element is skipped.  With an
        empty target_id conversion starts at the beginning of the
        document.  Conversion stops once the <div> depth counted since
        conversion started returns to zero.
        """
        self._anchors = []
        self._imgs = []
        self._id = target_id
        if self._id == "":
            self._thru = False
        self.feed(html_string)
        ret = "".join(self._buf)
        return self._post_proc(ret)

    def _in_pre(self):
        """Return True while at least one <pre> element is open."""
        return self._inner.get("pre", 0) > 0

    def handle_starttag(self, tag, attrs):
        self._prevtag = self._currenttag
        self._currenttag = tag
        self._inner[tag] = self._inner.get(tag, 0) + 1

        if self._thru:
            # Still skipping: only the element carrying the target id
            # switches conversion on.
            d_attrs = dict(attrs)
            if "id" not in d_attrs or d_attrs["id"] != self._id:
                return
            self._thru = False

        if tag == "div":
            self._div_counter += 1

        if self._in_pre():
            self._pre_start_handler(tag, attrs)
            return

        handler = self._start_handlers.get(tag)
        if handler is not None:
            self._put(handler(self, tag, attrs))

    def handle_endtag(self, tag):
        self._prevtag = self._currenttag
        self._currenttag = ""
        # .get(): an end tag without a preceding start tag must not raise.
        self._inner[tag] = self._inner.get(tag, 0) - 1

        if self._thru:
            return

        if tag == "div":
            self._div_counter -= 1
            if self._div_counter == 0:
                # The outermost converted <div> is closed: stop converting.
                self._thru = True
                return

        if self._in_pre():
            self._pre_end_handler(tag)
            return

        handler = self._end_handlers.get(tag)
        if handler is not None:
            self._put(handler(self, tag))

    def handle_data(self, data):
        if self._currenttag == "title":
            self.title = data.strip()

        if self._thru:
            return
        if self._in_pre():
            self._pre_data_handler(data)
            return

        if self._rex_empty.search(data):
            return

        output = self.wiki_escape(data)

        if self._href:
            # Link text is held back until </a> so it can be wrapped.
            self._stack.append(output)
        else:
            self._put(output.rstrip())

    def wiki_escape(self, data):
        """Return data with wiki markup characters escaped.

        "__" gets a "!" prefix except directly inside <td>; CamelCase
        words get a "!" prefix inside <p> while no <a> element is open.
        """
        if self._currenttag not in ("td",):
            data = data.replace("__", "!__")
        # .get(): "a" is absent until the first <a> tag has been seen.
        if self._inner.get("p", 0) > 0 and self._inner.get("a", 0) == 0:
            data = self._REX_CAMEL.sub(r"!\1", data)
        return data

    def handle_charref(self, ref):
        # Character references are dropped.
        pass

    def handle_entityref(self, name):
        # Entity references are dropped.
        pass

    def _put(self, str, force=False):
        """Append str to the output buffer.

        None is ignored, and so is everything while output is blocked,
        unless force is true.
        """
        if not force and (str is None or self._block):
            return
        self._buf.append(str)

    def _add_handlers(self):
        """Add start/end handlers for each tag."""
        # Simple replace rules: key is the tag, value is the text emitted.
        r_starttag = dict(
            p="\n\n",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            hr="----\n",
            h3="\n=== ",
            h4="\n==== ",
            br="\n")
        for key in r_starttag:
            self._start_handlers[key] = lambda s, t, attr: r_starttag[t]

        # Same for end tags.
        r_endtag = dict(
            p="\n",
            i="''",
            tt="`",
            b="'''",
            strong="'''",
            big="'''",
            small="__",
            h3=" ===\n",
            h4=" ====\n")
        for key in r_endtag:
            self._end_handlers[key] = lambda s, t: r_endtag[t]

        # Register every "_h_start_<tagname>" method in
        # _start_handlers[tagname] and every "_h_end_<tagname>" method in
        # _end_handlers[tagname].  dir() is used instead of the class
        # __dict__ so that handlers inherited by a subclass are found too.
        cls = self.__class__
        for name in dir(cls):
            if name.startswith("_h_start_"):
                # e.g. "_h_start_img" -> self._start_handlers["img"]
                tagname = name[len("_h_start_"):]
                self._start_handlers[tagname] = getattr(cls, name)
            elif name.startswith("_h_end_"):
                # e.g. "_h_end_table" -> self._end_handlers["table"]
                tagname = name[len("_h_end_"):]
                self._end_handlers[tagname] = getattr(cls, name)

    # tag specific handlers

    def _expand_attrs(self, tag, attrs):
        """Rebuild the HTML start tag for tag with the given attrs.

        attrs is a list of (name, value) pairs; a value of None (an
        attribute written without a value) is emitted as the bare name.
        """
        if not attrs:
            return "<" + tag + ">"
        attrlist = []
        for (key, val) in attrs:
            if val is None:
                attrlist.append(key)
            else:
                attrlist.append("=".join((key, '"%s"' % val)))
        return "<" + " ".join((tag, " ".join(attrlist))) + ">"

    def _h_start_table(self, tag, attrs):
        """Start a table.

        A table with class="table" is passed through as HTML inside a
        "{{{ html" block, with pass-through handlers installed for
        tr/td/th.  The opening <table> tag itself is emitted by
        _h_end_caption.  NOTE(review): such tables therefore appear to be
        expected to carry a <caption> -- confirm.  Any other table
        blocks output until its end tag.
        """
        if ("class", "table") in attrs:
            for cell in ("tr", "td", "th"):
                self._start_handlers[cell] = lambda s, t, a: s._expand_attrs(t, a)
            self._end_handlers["tr"] = lambda s, t: "</" + t + ">\n"
            self._end_handlers["td"] = lambda s, t: "</" + t + ">"
            self._end_handlers["th"] = lambda s, t: "</" + t + ">"
            self._block = False
            self._in_table = True
            return "{{{ html\n"
        self._block = True
        self._in_table = False

    def _h_end_table(self, tag):
        """End a table; close the HTML block opened by _h_start_table."""
        was_passthrough = self._in_table
        self._in_table = False
        self._block = False
        if not was_passthrough:
            return None
        for cell in ("tr", "td", "th"):
            del self._start_handlers[cell]
            del self._end_handlers[cell]
        return "\n</table>\n}}}\n"

    def _h_start_ul(self, tag, attrs):
        self._list_mode = "ul"

    def _h_start_ol(self, tag, attrs):
        # Was "ul", which made the numbered branch of _h_start_li
        # unreachable.
        self._list_mode = "ol"

    def _h_start_li(self, tag, attrs):
        if self._list_mode == "ul":
            return " * "
        elif self._list_mode == "ol":
            return " 1. "

    def _h_end_li(self, tag):
        return "\n"

    def _h_end_ol(self, tag):
        return "\n"

    def _h_end_ul(self, tag):
        return "\n"

    def _h_start_caption(self, tag, attrs):
        """Open a caption as <h6>; <b> markup is disabled inside it."""
        # pop() rather than del: the handlers may already be removed if a
        # previous caption was never closed.
        self._start_handlers.pop("b", None)
        self._end_handlers.pop("b", None)
        return "<h6>"

    def _h_end_caption(self, tagd):
        """Close the caption, re-enable <b> and emit the <table> start tag."""
        self._start_handlers["b"] = lambda s, t, a: "'''"
        self._end_handlers["b"] = lambda s, t: "'''"
        return """</h6>\n<table class="wikitable" border="1">\n\n"""

    def _h_start_img(self, tag, attrs):
        """Emit a Thumb macro for an image.

        The file name is the last path component of the enclosing link's
        href when the image directly follows an <a> whose href ends in an
        image extension, otherwise of the image's own src.  The output is
        forced, so it is written even while output is blocked, and any
        pending link is dropped.
        """
        src = ""
        title = ""
        for (attr, val) in attrs:
            if attr == "src":
                src = val or ""
            elif attr == "alt":
                title = val

        linked = (self._prevtag == "a" and self._href
                  and self._REX_IMAGE.search(self._href))
        if linked:
            filename = self._href.split("/")[-1]
        else:
            filename = src.split("/")[-1]

        # The image replaces the link, so </a> must not emit one.
        self._href = ""

        if title:
            self._put("[[Thumb(%s, caption=%s)]]\n\n" % (filename, title), True)
        else:
            self._put("[[Thumb(%s)]]\n\n" % (filename,), True)

    def _h_start_a(self, tag, attrs):
        """Remember the href of a link; its text is collected until </a>."""
        href = ""
        for (attr, val) in attrs:
            if attr == "href":
                href = val
                break
        if href:
            self._href = href
            # Start from a clean slate so text left over from an earlier,
            # abandoned link cannot end up in this one.
            del self._stack[:]

    def _replace_url(self, url):
        """Return the replacement for url, or url itself if none applies.

        Only sourceforge.jp URLs whose path starts with
        /magazine/NN/NN/NN/<digits> and is present in the replacement map
        are rewritten.  When the path continues with /<digits>, a "_p1"
        in the replacement is removed and "_p<digits>" is appended.
        """
        parts = urlparse.urlparse(url)
        host, path = parts[1], parts[2]
        if host != "sourceforge.jp":
            return url
        article = self._REX_ARTICLE.search(path)
        if not article or article.group(1) not in self._url_r_map:
            return url
        repl = self._url_r_map[article.group(1)]
        page = self._REX_ARTICLE_PAGE.search(path)
        if page:
            return repl.replace("_p1", "") + "_p" + page.group(1)
        return repl

    def _h_end_a(self, tag):
        """Emit the pending link: HTML inside tables, wiki syntax elsewhere."""
        if not self._href:
            return
        # Link text may arrive in several chunks (e.g. around inline
        # tags); use all of them and leave nothing behind for later links.
        content = "".join(self._stack)
        del self._stack[:]

        rurl = self._replace_url(self._href)
        if self._inner.get("table", 0) > 0:
            pieces = ('<a href="', rurl, '">', content, "</a>")
        else:
            pieces = ("[", rurl, " ", content, "]")
        for piece in pieces:
            self._put(piece)
        self._href = ""

    def _h_start_div(self, tag, attrs):
        """Track navigation/column divs.

        A class="navigation" div blocks output; a class="column" div is
        wrapped in an HTML block.  Divs nested inside either are counted
        so that the matching end tag can be recognised.
        """
        if ("class", "navigation") in attrs:
            self._in_div = 1
            self._block = True
        elif ("class", "column") in attrs:
            self._in_div = 1
            self._in_column = True
            self._put("{{{ html\n<div class=\"column\">\n}}}\n")
        elif self._in_div > 0:
            self._in_div += 1

    def _h_end_div(self, tag):
        """Undo _h_start_div when the tracked div (not a nested one) ends."""
        if self._in_div > 0:
            self._in_div -= 1

        if self._in_div == 0:
            self._block = False

            # Close the column wrapper only at the column's own end tag,
            # not at the end of a div nested inside it.
            if self._in_column:
                self._in_column = False
                self._put("{{{ html\n</div>\n}}}\n")

    def _h_start_pre(self, tag, attrs):
        pass

    def _h_end_pre(self, tag):
        """Return the accumulated <pre> text as a {{{ }}} block and reset it."""
        t = "{{{%s}}}\n" % self._pre_data
        self._pre_data = ""
        return t

    def _pre_data_handler(self, data):
        """Collect text found inside <pre> verbatim."""
        self._pre_data = self._pre_data + data

    def _pre_start_handler(self, tag, attrs):
        """Handle a start tag seen while inside <pre>; other tags are dropped."""
        if tag == "pre":
            self._h_start_pre(tag, attrs)

    def _pre_end_handler(self, tag):
        """Handle an end tag seen while a <pre> is still open.

        The outermost </pre> does not come through here; it is dispatched
        through the regular end-handler table by handle_endtag.
        """
        if tag == "pre":
            # NOTE(review): this is only reached for a </pre> nested in
            # another <pre>; the text collected so far is discarded
            # because the return value is not used -- confirm intent.
            self._h_end_pre(tag)
423