2 # -*- coding: utf-8 -*-
3 """convert html to sfjpwiki-style text."""
10 from urlparse import urlparse
13 class Html2SfjpWiki(HTMLParser.HTMLParser):
14 "html to sfjpwiki-style text converter"
# Subclass of the standard-library HTMLParser: text passed to feed() is
# reported back through the handle_* callbacks defined on this class.
# NOTE(review): the constructor's `def` line and most of its body are not
# visible in this excerpt; only the two statements below can be described.
# Run the base-class initialiser.
16 HTMLParser.HTMLParser.__init__(self)
# Pattern matching strings that are empty or consist only of whitespace;
# handle_data() applies it to each text chunk.
21 self._rex_empty = re.compile(r"^\s*$")
28 def parse(self, html_string, target_id):
29 """Parse html_string with url, and return anchors"""
# NOTE(review): the docstring above mentions a url and anchors, but the
# visible code takes `target_id` and returns the concatenated output buffer.
# presumably `target_id` is stored as self._id, which handle_starttag()
# compares against each tag's "id" attribute -- the assignment is not
# visible in this excerpt; confirm.
# Run the parser over the input; this drives the handle_* callbacks.
35 self.feed(html_string)
# Return the pieces accumulated in self._buf joined into a single string.
36 return "".join(self._buf)
41 def _add_handlers(self):
# Build the per-tag lookup tables consulted by handle_starttag() and
# handle_endtag().
# NOTE(review): the creation of h_start / h_end and the contents of the two
# dict(...) literals are not visible in this excerpt.
# Tag-specific callbacks: <a> has start and end handlers, <img> only a start
# handler.
45 h_start["a"] = self._start_a
46 h_start["img"] = self._start_img
47 h_end["a"] = self._end_a
# Start-tag dispatch table, looked up by tag name in handle_starttag().
# presumably h_end is stored as self._end_h in the same way (it is read in
# handle_endtag()); that assignment is not visible here.
49 self._start_h = h_start
# Tag name -> text passed to self._put() when that tag opens.
51 self._rep_starttag = dict(
# Tag name -> text passed to self._put() when that tag closes.
64 self._rep_endtag = dict(
# Tags that both tag callbacks test for with `tag in self._tag_ignore`.
76 self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]
78 def handle_starttag(self, tag, attrs):
# HTMLParser callback for an opening tag; `attrs` is the list of
# (name, value) pairs supplied by the parser.
# NOTE(review): several lines of this method are not visible in this
# excerpt, so the exact branch structure cannot be described.
# Remember the previously seen tag before recording the new one;
# _start_img() checks self._prevtag to detect an <img> that follows an <a>.
79 self._prevtag = self._currenttag
80 self._currenttag = tag
# presumably d_attrs is a dict built from `attrs` -- its construction is not
# visible; confirm.
# NOTE(review): dict.has_key() exists only in Python 2; `key in d` is the
# equivalent spelling that also works in Python 3.
84 if d_attrs.has_key("id"):
# Compare the tag's id attribute with the id stored in self._id.
85 if d_attrs["id"] == self._id:
# Counterpart of the decrement in handle_endtag(); the condition guarding
# this increment is not visible here.
93 self._div_counter += 1
# Tags listed in self._tag_ignore are tested for here; the action taken is
# not visible in this excerpt.
95 if tag in self._tag_ignore:
# Emit the fixed replacement text registered for this opening tag, if any.
98 if self._rep_starttag.has_key(tag):
99 self._put(self._rep_starttag[tag])
# Dispatch to the tag-specific start handler registered in _add_handlers().
101 if self._start_h.has_key(tag):
102 self._start_h[tag](tag, attrs)
105 def handle_endtag(self, tag):
# HTMLParser callback for a closing tag.
# NOTE(review): several lines of this method are not visible in this
# excerpt, so the exact branch structure cannot be described.
# Counterpart of the increment in handle_starttag(); the condition guarding
# this decrement and the action taken when the counter reaches zero are not
# visible here.
110 self._div_counter -= 1
111 if self._div_counter == 0:
# Tags listed in self._tag_ignore are tested for here; the action taken is
# not visible in this excerpt.
115 if tag in self._tag_ignore:
# Emit the fixed replacement text registered for this closing tag, if any.
# NOTE(review): dict.has_key() exists only in Python 2; `key in d` is the
# equivalent spelling that also works in Python 3.
118 if self._rep_endtag.has_key(tag):
119 self._put(self._rep_endtag[tag])
# Dispatch to the tag-specific end handler (only "a" is registered in the
# visible part of _add_handlers()).
121 if self._end_h.has_key(tag):
122 self._end_h[tag](tag)
125 def handle_data(self, data):
# HTMLParser callback for text found between tags.
# NOTE(review): the conditions that choose between the statements below are
# not visible in this excerpt.
# Test whether the chunk is empty or whitespace-only.
128 if self._rex_empty.search(data):
# Push the raw text onto self._stack; _end_a() pops from the same stack.
132 self._stack.append(data)
# Emit the text with trailing whitespace removed.
134 self._put(data.rstrip())
136 def handle_charref(self, ref):
# HTMLParser callback invoked for numeric character references such as
# &#NNN; or &#xNNN;.
# NOTE(review): the body of this method is not visible in this excerpt.
139 def handle_entityref(self, name):
# HTMLParser callback invoked for named character references of the form
# &name;.
# NOTE(review): the body of this method is not visible in this excerpt.
142 # tag specific handlers
143 def _start_img(self, tag, attrs):
# Start handler for <img>, registered in _add_handlers(); emits a
# "Thumb(...)" line for the image.
# NOTE(review): the loop body and the branch keywords are not visible in
# this excerpt; presumably the loop collects `src` and `title` from the
# attributes -- confirm.
146 for (attr, val) in attrs:
# When the previously seen tag was <a>, take the file name from the last
# "/"-separated segment of the stored link target.
152 if self._prevtag == "a":
153 filename = self._href.split("/")[-1]
# presumably the alternative branch: use the last segment of the image's
# own src instead.
156 filename = src.split("/")[-1]
# Emit the thumbnail text followed by a newline.
158 self._put("Thumb(%s, caption=%s)\n" % (filename, title))
160 def _start_a(self, tag, attrs):
# Start handler for <a>, registered in _add_handlers().
# NOTE(review): the loop body is not visible in this excerpt; presumably it
# records the link target in self._href, which _start_img() and _end_a()
# read -- confirm.
162 for (attr, val) in attrs:
169 def _end_a(self, tag):
# End handler for <a>, registered in _add_handlers().
# NOTE(review): the lines around these two statements are not visible in
# this excerpt, so how `content` and the link target are combined cannot be
# described.
# Take the most recently pushed text off self._stack (handle_data() pushes
# onto it).
172 content = self._stack.pop()
# Emit the stored link target.
176 self._put(self._href)
182 def _regularize_url(self, url):
183 """regularize given url."""
# Fill in components of `url` from the stored base URL components
# (self._base_url_items) and re-attach the separator characters.
# NOTE(review): the creation of `term`, the conditions guarding most
# assignments, and the end of the method (including any return) are not
# visible in this excerpt.
184 # urlparse.urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
187 # -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
# Index 2 of the base components is the path (see the example above); split
# it into directory and last segment.
189 current_term = self._base_url_items
190 current_dir = os.path.dirname(current_term[2])
# NOTE(review): no use of current_last is visible in this excerpt.
191 current_last = os.path.basename(current_term[2])
# Split the given url into its six components.
193 result = urlparse(url)
# presumably `term` is a mutable copy of `result`; confirm.
# Scheme: taken from the base URL or kept from the url itself (the choosing
# condition is not visible), with "://" appended either way.
197 term[0] = current_term[0] + "://"
199 term[0] = term[0] + "://"
# Host part taken from the base URL; the guarding condition is not visible.
201 term[1] = current_term[1]
# A non-empty path that does not start with "/" is treated as relative: join
# it onto the base directory and collapse "." / ".." segments.
# NOTE(review): os.path.normpath is platform-dependent (on Windows it
# converts "/" to "\\"); posixpath.normpath would keep URL separators.
202 if term[2] and term[2][0] != "/":
203 term[2] = os.path.normpath(current_dir + "/" + term[2])
# Re-attach the separators that urlparse strips from params, query and
# fragment; the guarding conditions are not visible.
205 term[3] = ";" + term[3]
207 term[4] = "?" + term[4]
209 term[5] = "#" + term[5]