2 # -*- coding: utf-8 -*-
3 """convert html to sfjpwiki-style text."""
13 class Html2SfjpWiki(HTMLParser.HTMLParser):
14 "html to sfjpwiki-style text converter"
16 HTMLParser.HTMLParser.__init__(self)
20 self._rex_empty = re.compile(r"^\s*$")
27 self._start_handlers = {}
28 self._end_handlers = {}
30 self._stacking = False
38 self._in_column = False
40 def set_url_replace_list(self, fname):
43 (url, repl) = item.strip().split()
44 self._url_r_map[url] = repl
47 def _post_proc(self, str):
48 rex_caption = re.compile(r"^'''(.+?)'''", re.M)
49 rex_header = re.compile(r"^(==+ )(.+?)( ==+)", re.M)
50 rex_sfwiki = re.compile(r"\?sf[0-9][0-9][0-9][0-9] ")
51 rex_anchor = re.compile(r"([A-Z][a-z0-9]+[A-Z][a-z0-9]+)")
53 f = lambda x: x.group(1) + re.sub(r"([A-Z][a-z0-9]+[A-Z][a-z0-9]+)", r"!\1", x.group(2)) + x.group(3)
56 t = rex_caption.sub(r"====== \1 ======\n", t)
57 t = rex_header.sub(f, t)
58 t = rex_sfwiki.sub(r"?sfwiki ", t)
61 def parse(self, html_string, target_id):
62 """Parse html_string with url, and return anchors"""
68 self.feed(html_string)
69 ret = "".join(self._buf)
70 return self._post_proc(ret)
72 def handle_starttag(self, tag, attrs):
73 self._prevtag = self._currenttag
74 self._currenttag = tag
75 if self._inner.has_key(tag):
82 if d_attrs.has_key("id"):
83 if d_attrs["id"] == self._id:
91 self._div_counter += 1
93 if self._inner.has_key("pre") and self._inner["pre"] > 0:
94 self._pre_start_handler(tag, attrs)
97 if self._start_handlers.has_key(tag):
98 f = self._start_handlers[tag]
99 t = f(self, tag, attrs)
102 def handle_endtag(self, tag):
103 self._prevtag = self._currenttag
104 self._currenttag = ""
105 self._inner[tag] -= 1
111 self._div_counter -= 1
112 if self._div_counter == 0:
116 if self._inner.has_key("pre") and self._inner["pre"] > 0:
117 self._pre_end_handler(tag)
120 if self._end_handlers.has_key(tag):
121 f = self._end_handlers[tag]
125 def handle_data(self, data):
126 if self._currenttag == "title":
127 self.title = data.strip()
131 if self._inner.has_key("pre") and self._inner["pre"] > 0:
132 self._pre_data_handler(data)
135 if self._rex_empty.search(data):
138 output = self.wiki_escape(data)
141 self._stack.append(output)
143 self._put(output.rstrip())
145 def wiki_escape(self, data):
146 if not self._currenttag in ("td",):
147 data = data.replace("__", "!__")
148 if self._inner.get("p", 0) > 0 and self._inner["a"] == 0:
149 data = re.sub(r"([A-Z][a-z0-9]+[A-Z][a-z0-9]+)", r"!\1", data)
152 def handle_charref(self, ref):
155 def handle_entityref(self, name):
158 def _put(self, str, force=False):
159 if force == False and (str == None or self._block):
161 self._buf.append(str)
163 def _add_handlers(self):
164 """add start/end handlers for each tag."""
165 # generate simple replace rule.
166 # prepare dictionary. key is tag. value is replaced string.
179 # generate function to replace tag to string.
180 for key in r_starttag:
181 self._start_handlers[key] = lambda s, t, attr: r_starttag[t]
183 # for end tag, do same process.
195 self._end_handlers[key] = lambda s, t: r_endtag[t]
197 # add class's "_h_start_<tagname>" function to _start_handlers[tagname],
198 # "_h_end_<tagname>" function to _end_handlers[tagname].
199 # __class__.__dict__ is a dictionary which contains class's member functions.
200 for func in self.__class__.__dict__:
201 if func.find("_h_start_") == 0:
202 # for example, if "func" is "_h_start_img", then
203 # assign func to self._start_handlers["img"].
204 tagname = func[len("_h_start_"):]
205 self._start_handlers[tagname] = self.__class__.__dict__[func]
206 if func.find("_h_end_") == 0:
207 # for example, if "func" is "_h_end_img", then
208 # assign func to self._end_handlers["img"].
209 tagname = func[len("_h_end_"):]
210 self._end_handlers[tagname] = self.__class__.__dict__[func]
212 # tag specific handlers
214 def _expand_attrs(self, tag, attrs):
216 attrlist = ["=".join((key, '"%s"' % val)) for (key,val) in attrs]
217 s = " ".join(attrlist)
218 return "<" + " ".join((tag,s)) + ">"
220 return "<" + tag + ">"
223 def _h_start_table(self, tag, attrs):
224 # if tag has "class" attribute, and those value is "table":
225 if ("class", "table") in attrs:
226 self._start_handlers["tr"] = lambda s, t, a: s._expand_attrs(t, a)
227 self._start_handlers["td"] = lambda s, t, a: s._expand_attrs(t, a)
228 self._start_handlers["th"] = lambda s, t, a: s._expand_attrs(t, a)
229 self._end_handlers["tr"] = lambda s, t: "</" + t + ">\n"
230 self._end_handlers["td"] = lambda s, t: "</" + t + ">"
231 self._end_handlers["th"] = lambda s, t: "</" + t + ">"
233 self._in_table = True
236 # elif ("class", "column") in attrs:
237 # self._start_handlers["tr"] = lambda s, t, a: s._expand_attrs(t, a)
238 # self._start_handlers["td"] = lambda s, t, a: s._expand_attrs(t, a)
239 # self._start_handlers["th"] = lambda s, t, a: s._expand_attrs(t, a)
240 # self._end_handlers["tr"] = lambda s, t: "</" + t + ">\n"
241 # self._end_handlers["td"] = lambda s, t: "</" + t + ">"
242 # self._end_handlers["th"] = lambda s, t: "</" + t + ">"
243 # self._block = False
244 # self._in_table = True
247 # """ % self._expand_attrs(tag, attrs)
250 self._in_table = False
253 def _h_end_table(self, tag):
255 self._in_table = False
257 del self._start_handlers["tr"]
258 del self._end_handlers["tr"]
259 del self._start_handlers["td"]
260 del self._end_handlers["td"]
261 del self._start_handlers["th"]
262 del self._end_handlers["th"]
268 self._in_table = False
271 def _h_start_ul(self, tag, attrs):
272 self._list_mode = "ul"
274 def _h_start_ol(self, tag, attrs):
275 self._list_mode = "ul"
277 def _h_start_li(self, tag, attrs):
278 if self._list_mode == "ul":
280 elif self._list_mode == "ol":
283 def _h_end_li(self, tag):
286 def _h_end_ol(self, tag):
289 def _h_end_ul(self, tag):
292 def _h_start_caption(self, tag, attrs):
293 del self._start_handlers["b"]
294 del self._end_handlers["b"]
297 def _h_end_caption(self, tagd):
298 self._start_handlers["b"] = lambda s, t, a: "'''"
299 self._end_handlers["b"] = lambda s, t: "'''"
300 return """</h6>\n<table class="wikitable" border="1">\n\n"""
302 def _h_start_img(self, tag, attrs):
305 for (attr, val) in attrs:
311 rex = re.compile(r"\.(png|PNG|gif|GIF|jpg|JPG)$")
313 if self._prevtag == "a" and self._href and rex.search(self._href):
314 filename = self._href.split("/")[-1]
317 filename = src.split("/")[-1]
323 self._put("[[Thumb(%s, caption=%s)]]\n\n" % (filename, title), True)
325 self._put("[[Thumb(%s)]]\n\n" % (filename,), True)
327 def _h_start_a(self, tag, attrs):
329 for (attr, val) in attrs:
336 def _replace_url(self, url):
337 t = urlparse.urlparse(url)
338 if t[1] == "sourceforge.jp":
339 m = re.search(r"^(/magazine/\d\d/\d\d/\d\d/\d+)", t[2])
340 if m and self._url_r_map.has_key(m.group(1)):
341 m2 = re.search(r"^/magazine/\d\d/\d\d/\d\d/\d+/(\d+)", t[2])
343 return self._url_r_map[m.group(1)].replace("_p1", "") + "_p" + m2.group(1)
345 return self._url_r_map[m.group(1)]
350 def _h_end_a(self, tag):
353 content = self._stack.pop()
357 rurl = self._replace_url(self._href)
358 if self._inner.has_key("table") and self._inner["table"] > 0:
359 self._put('<a href="')
373 def _h_start_div(self, tag, attrs):
374 if ("class", "navigation") in attrs:
377 elif ("class", "column") in attrs:
379 self._in_column = True
380 self._put("""{{{ html
384 elif self._in_div > 0:
387 def _h_end_div(self, tag):
391 if self._in_div == 0:
395 self._in_column = False
396 self._put("""{{{ html
402 def _h_start_pre(self, tag, attrs):
405 def _h_end_pre(self, tag):
411 def _pre_data_handler(self, data):
412 self._pre_data = self._pre_data + data
414 def _pre_start_handler(self, tag, attrs):
416 self._h_start_pre(tag, attrs)
417 # self._pre_data = self._pre_data + self._expand_attrs(tag, attrs)
419 def _pre_end_handler(self, tag):
422 # self._pre_data = self._pre_data + "</" + tag + ">"