import os
import os.path
import urlparse
+import urllib
+import dircache
+import shutil
import html2sfjpwiki
-usage = """%s <html>""" % sys.argv[0]
+usage = """%s <html_pathname> <base_dir> <output_base_dir>""" % sys.argv[0]
+
try:
- html = sys.argv[1]
+ html_pathname = sys.argv[1]
+ base_dir = sys.argv[2]
+ output_base_dir = sys.argv[3]
except IndexError:
sys.exit(usage)
-
try:
- fh = open(html, "r")
+ fh = open(html_pathname, "r")
except IOError:
sys.exit(usage)
+print >> sys.stderr, "converting %s..." % html_pathname
+
body = fh.read()
fh.close()
tmp = rex.sub("", body)
tmp = rex2.sub("", tmp)
+# parse
c = html2sfjpwiki.Html2SfjpWiki()
r = c.parse(tmp, "article-body")
+title = c.title.replace(" - SourceForge.JP Magazine", "").strip()
+
+# calculate paths
+html_dir = os.path.dirname(html_pathname)
+rel_dir = os.path.relpath(html_dir, base_dir)
+output_dir = os.path.join(output_base_dir, rel_dir.replace(os.path.sep, "-"))
+
+# calculate pages
+pdir = os.path.dirname(html_dir)
+p = 1
+while os.path.isdir(os.path.join(pdir, str(p))):
+ p += 1
+last_page = p-1
+current_page = int(os.path.basename(html_dir))
-output = os.path.join(os.path.dirname(html), "wiki.txt")
+# calculate titles
+if last_page == 1:
+ page_title = title
+ page_filename = title
+else:
+ page_title = title + "(%d/%d)" % (current_page, last_page)
+ page_filename = title + "_p%d" % current_page
+page_filename_quoted = urllib.quote(page_filename.replace(" ", "_").replace("+", "_"))
+
+# output
+if not os.path.isdir(output_dir):
+ os.makedirs(output_dir)
+o_pathname = os.path.join(output_dir, "wiki.txt")
try:
- fo = open(output, "w")
+ fo = open(o_pathname, "w")
except IOError:
- sys.exit("cannot open output file: %s." % output)
+ sys.exit("cannot open output file: %s." % o_pathname)
-fo.write("[[PageNavi(NavigationList)]]\n\n\n")
+fo.write(page_filename_quoted + "\n")
+fo.write(page_title + "\n\n")
+fo.write("[[PageNavi(NavigationList)]]\n\n")
+if current_page == 1:
+ fo.write("== %s ==\n\n" % title)
+fo.write("\n")
fo.write(r)
fo.write("\n\n[[PageNavi(NavigationList)]]\n\n")
fo.close()
+def copy_attachments(from_dir, dest_dir):
+ d = dircache.listdir(from_dir)
+ rex = re.compile(r"\.(png|jpg|gif)$")
+ for item in d:
+ p = os.path.join(from_dir, item)
+ if rex.search(item) and (not os.path.isdir(p)):
+ shutil.copy2(p, dest_dir)
+
+copy_attachments(html_dir, output_dir)
+
+# generate index page
+def get_preface(text):
+    l = text.split("\n")
+ rex = re.compile(r"^\s*$")
+ preface = ""
+ break_cnt = 0
+ for p in l:
+ if rex.match(p):
+ continue
+ else:
+ break_cnt += 1
+ preface = preface + p + "\n\n"
+ if break_cnt == 2:
+ break
+ return preface
+
+if current_page == 1 and last_page != 1:
+ index_dir = re.sub(r"-[^-]*$", "", output_dir)
+ title_name = title + " PDF"
+ file_name = urllib.quote(title.replace(" ", "_").replace("+", "_"))
+ pre = get_preface(r)
+
+ # output
+ if not os.path.isdir(index_dir):
+ os.makedirs(index_dir)
+ o_pathname = os.path.join(index_dir, "wiki.txt")
+ try:
+ fo = open(o_pathname, "w")
+ except IOError:
+ sys.exit("cannot open output file: %s." % o_pathname)
+
+
+ fo.write(file_name + "\n")
+ fo.write(title_name + "\n")
+ fo.write("\n\n")
+ fo.write("== %s ==\n\n" % title)
+ fo.write(pre)
+
+ t = """ * [[http://sourceforge.jp/projects/test11/wiki/!pdf/%s_all.pdf 全ページをPDF形式でダウンロード]]
+
+"""
+ fo.write(t % title.replace(" ", "_").replace("+", "_"))
+
+ t = " * 記事個別ページ:\n"
+ fo.write(t)
+ for n in range(1, last_page+1):
+ t1 = "%s_p%d" % (title.replace(" ", "_").replace("+", "_"), n)
+ t2 = "%s(%d/%d)" % (title, n, last_page)
+ t = " * [%s %s]" % (t1, t2)
+ fo.write(t)
+ fo.write("\n")
+ fo.write("\n\n")
+ fo.close()
+
+
+ # generate _all page
+ index_dir = re.sub(r"-[^-]*$", "_all", output_dir)
+ title_name = title
+ file_name = urllib.quote(title.replace(" ", "_").replace("+", "_") + "_all")
+
+ # output
+ if not os.path.isdir(index_dir):
+ os.makedirs(index_dir)
+ o_pathname = os.path.join(index_dir, "wiki.txt")
+ try:
+ fo = open(o_pathname, "w")
+ except IOError:
+ sys.exit("cannot open output file: %s." % o_pathname)
+
+ fo.write(file_name + "\n")
+ fo.write(title_name + "\n")
+ fo.write("\n\n")
+ for n in range(1, last_page+1):
+ t = "%s_p%d" % (title.replace(" ", "_").replace("+", "_"), n)
+ fo.write("[[include(%s)]]" % t)
+ fo.write("\n")
+ fo.write("\n\n")
+ fo.close()