HTMLParser.HTMLParser.__init__(self)
self._target_id = ""
self._buf = []
- self._add_handlers()
self._thru = True
self._rex_empty = re.compile(r"^\s*$")
self._div_counter = 0
self._currenttag = ""
self._stack = []
self._href = ""
+ self.title = ""
+ self._start_handlers = {}
+ self._end_handlers = {}
+
+ self._stacking = False
+ self._add_handlers()
+ self._block = False
def parse(self, html_string, target_id):
"""Parse html_string with url, and return anchors"""
self.feed(html_string)
return "".join(self._buf)
- def _put(self, str):
- self._buf.append(str)
-
- def _add_handlers(self):
- h_start = dict()
- h_end = dict()
-
- h_start["a"] = self._start_a
- h_start["img"] = self._start_img
- h_end["a"] = self._end_a
-
- self._start_h = h_start
- self._end_h = h_end
- self._rep_starttag = dict(
- p="",
- i="''",
- tt="`",
- b="'''",
- strong="'''",
- big="'''",
- small="__",
- td="||",
- tr="",
- hr="----\n",
- h3="=== ",
- h4="==== " )
- self._rep_endtag = dict(
- p="\n\n",
- i="''",
- tt="`",
- b="'''",
- strong="'''",
- big="'''",
- small="__",
- td="||",
- tr="\n",
- h3=" ===\n",
- h4=" ====\n" )
- self._tag_ignore = ["br", "font", "table", "tbody", "tfoot", ]
-
def handle_starttag(self, tag, attrs):
self._prevtag = self._currenttag
self._currenttag = tag
if tag == "div":
self._div_counter += 1
- if tag in self._tag_ignore:
- # do nothing
- return
- if self._rep_starttag.has_key(tag):
- self._put(self._rep_starttag[tag])
- return
- if self._start_h.has_key(tag):
- self._start_h[tag](tag, attrs)
- return
+ if self._start_handlers.has_key(tag):
+ f = self._start_handlers[tag]
+ t = f(self, tag, attrs)
+ self._put(t)
def handle_endtag(self, tag):
+ self._prevtag = self._currenttag
+ self._currenttag = ""
+
if self._thru:
return
self._thru = True
return
- if tag in self._tag_ignore:
- # do nothing
- return
- if self._rep_endtag.has_key(tag):
- self._put(self._rep_endtag[tag])
- return
- if self._end_h.has_key(tag):
- self._end_h[tag](tag)
- return
+ if self._end_handlers.has_key(tag):
+ f = self._end_handlers[tag]
+ t = f(self, tag)
+ self._put(t)
def handle_data(self, data):
+ if self._currenttag == "title":
+ self.title = data.strip()
+
if self._thru:
return
if self._rex_empty.search(data):
return
+ output = self.wiki_escape(data)
+
if self._href:
- self._stack.append(data)
+ self._stack.append(output)
else:
- self._put(data.rstrip())
+ self._put(output.rstrip())
+
+ def wiki_escape(self, data):
+ if not self._currenttag == "pre":
+ data = data.replace("__", "!__")
+ return data
def handle_charref(self, ref):
pass
def handle_entityref(self, name):
pass
+ def _put(self, str, force=False):
+ if force == False and (str == None or self._block):
+ return
+ self._buf.append(str)
+
+ def _add_handlers(self):
+ """add start/end handlers for each tag."""
+ # build the simple replacement rules.
+ # each dictionary maps a tag name to the wiki markup emitted in its place.
+ r_starttag = dict(
+ p="",
+ i="''",
+ tt="`",
+ b="'''",
+ strong="'''",
+ big="'''",
+ small="__",
+ hr="----\n",
+ h3="=== ",
+ h4="==== ",
+ pre="{{{" )
+ # register, for each tag, a handler that returns its replacement string.
+ for key in r_starttag:
+ self._start_handlers[key] = lambda s, t, attr: r_starttag[t]
+
+ # do the same for end tags.
+ r_endtag = dict(
+ p="\n\n",
+ i="''",
+ tt="`",
+ b="'''",
+ strong="'''",
+ big="'''",
+ small="__",
+ h3=" ===\n",
+ h4=" ====\n",
+ pre="\n}}}\n\n" )
+ for key in r_endtag:
+ self._end_handlers[key] = lambda s, t: r_endtag[t]
+
+ # register the class's "_h_start_<tagname>" functions in _start_handlers[tagname]
+ # and its "_h_end_<tagname>" functions in _end_handlers[tagname].
+ # __class__.__dict__ is the dictionary of attributes defined directly on this class (inherited ones are not included).
+ for func in self.__class__.__dict__:
+ if func.find("_h_start_") == 0:
+ # for example, if "func" is "_h_start_img", then
+ # assign func to self._start_handlers["img"].
+ tagname = func[len("_h_start_"):]
+ self._start_handlers[tagname] = self.__class__.__dict__[func]
+ if func.find("_h_end_") == 0:
+ # for example, if "func" is "_h_end_img", then
+ # assign func to self._end_handlers["img"].
+ tagname = func[len("_h_end_"):]
+ self._end_handlers[tagname] = self.__class__.__dict__[func]
+
# tag specific handlers
- def _start_img(self, tag, attrs):
+
+ def _h_start_table(self, tag, attrs):
+ # if the tag has a "class" attribute whose value is exactly "table":
+ if ("class", "table") in attrs:
+ # save old handlers
+ #self._table_h_tr = (self._start_handlers["tr"], self._end_handlers["tr"])
+ #self._table_h_td = (self._start_handlers["td"], self._end_handlers["td"])
+ #self._table_h_th = (self._start_handlers["th"], self._end_handlers["th"])
+
+ # set new handlers
+ self._start_handlers["tr"] = lambda s, t, a: "|"
+ self._start_handlers["td"] = lambda s, t, a: "|"
+ self._start_handlers["th"] = lambda s, t, a: "|'''"
+ self._end_handlers["tr"] = lambda s, a: "|\n"
+ self._end_handlers["td"] = lambda s, a: "|"
+ self._end_handlers["th"] = lambda s, a: "'''|"
+ self._block = False
+ self._in_table = True
+ else:
+ self._block = True
+ self._in_table = False
+
+
+ def _h_end_table(self, tag):
+ if self._in_table:
+ self._in_table = False
+ self._block = False
+ del self._start_handlers["tr"]
+ del self._end_handlers["tr"]
+ del self._start_handlers["td"]
+ del self._end_handlers["td"]
+ del self._start_handlers["th"]
+ del self._end_handlers["th"]
+ return "\n"
+ else:
+ self._in_table = False
+ self._block = False
+
+ def _h_start_caption(self, tag, attrs):
+ del self._start_handlers["b"]
+ del self._end_handlers["b"]
+ return "====== "
+
+ def _h_end_caption(self, tagd):
+ self._start_handlers["b"] = lambda s, t, a: "'''"
+ self._end_handlers["b"] = lambda s, t: "'''"
+ return " ======\n"
+
+ def _h_start_img(self, tag, attrs):
src = ""
title = ""
for (attr, val) in attrs:
elif attr == "alt":
title = val
- if self._prevtag == "a":
+ if self._prevtag == "a" and self._href:
filename = self._href.split("/")[-1]
self._href = ""
else:
filename = src.split("/")[-1]
- self._put("Thumb(%s, caption=%s)\n" % (filename, title))
+ if title:
+ self._put("[[Thumb(%s, caption=%s)]]\n\n" % (filename, title), True)
+ else:
+ self._put("[[Thumb(%s)]]\n\n" % (filename,), True)
- def _start_a(self, tag, attrs):
+ def _h_start_a(self, tag, attrs):
href = ""
for (attr, val) in attrs:
if attr == "href":
if href:
self._href = href
- def _end_a(self, tag):
+ def _h_end_a(self, tag):
if self._href:
if self._stack:
content = self._stack.pop()
self._put(content)
self._put("]")
self._href = ""
-
- def _regularize_url(self, url):
- """regularize given url."""
- # urlparse.urlparse("http://hoge.net/foo/var/index.html;q?a=b#c")
- #
- # 0 1 2 3 4 5
- # -> ('http', 'hoge.net', '/foo/var/index.html', 'q', 'a=b', 'c')
- #
- current_term = self._base_url_items
- current_dir = os.path.dirname(current_term[2])
- current_last = os.path.basename(current_term[2])
-
- result = urlparse(url)
- term = list(result)
-
- if not term[0]:
- term[0] = current_term[0] + "://"
- else:
- term[0] = term[0] + "://"
- if not term[1]:
- term[1] = current_term[1]
- if term[2] and term[2][0] != "/":
- term[2] = os.path.normpath(current_dir + "/" + term[2])
- if term[3]:
- term[3] = ";" + term[3]
- if term[4]:
- term[4] = "?" + term[4]
- if term[5]:
- term[5] = "#" + term[5]
-
- url = "".join(term)
- return url
-
-