import os
import os.path
import urlparse
+import urllib
+import dircache
+import shutil
import html2sfjpwiki
-usage = """%s <html>""" % sys.argv[0]
+usage = """%s <html_pathname> <base_dir> <output_base_dir>""" % sys.argv[0]
+
try:
- html = sys.argv[1]
+ html_pathname = sys.argv[1]
+ base_dir = sys.argv[2]
+ output_base_dir = sys.argv[3]
except IndexError:
sys.exit(usage)
-
try:
- fh = open(html, "r")
+ fh = open(html_pathname, "r")
except IOError:
sys.exit(usage)
+print >> sys.stderr, "converting %s..." % html_pathname
+
body = fh.read()
fh.close()
tmp = rex.sub("", body)
tmp = rex2.sub("", tmp)
+# parse
c = html2sfjpwiki.Html2SfjpWiki()
r = c.parse(tmp, "article-body")
+title = c.title.replace(" - SourceForge.JP Magazine", "").strip()
+
+# calculate paths
+html_dir = os.path.dirname(html_pathname)
+rel_dir = os.path.relpath(html_dir, base_dir)
+output_dir = os.path.join(output_base_dir, rel_dir.replace(os.path.sep, "-"))
+
+# calculate pages
+pdir = os.path.dirname(html_dir)
+p = 1
+while os.path.isdir(os.path.join(pdir, str(p))):
+ p += 1
+last_page = p-1
+current_page = int(os.path.basename(html_dir))
-output = os.path.join(os.path.dirname(html), "wiki.txt")
+# calculate titles
+if last_page == 1:
+ page_title = title
+ page_filename = title
+else:
+ page_title = title + "(%d/%d)" % (current_page, last_page)
+ page_filename = title + "_p%d" % current_page
+page_filename_quoted = urllib.quote(page_filename.replace(" ", "_").replace("+", "_"))
+
+# output
+if not os.path.isdir(output_dir):
+ os.makedirs(output_dir)
+o_pathname = os.path.join(output_dir, "wiki.txt")
try:
- fo = open(output, "w")
+ fo = open(o_pathname, "w")
except IOError:
- sys.exit("cannot open output file: %s." % output)
+ sys.exit("cannot open output file: %s." % o_pathname)
-fo.write("[[PageNavi(NavigationList)]]\n\n\n")
+fo.write(page_filename_quoted + "\n")
+fo.write(page_title + "\n\n")
+fo.write("[[PageNavi(NavigationList)]]\n\n")
+if current_page == 1:
+ fo.write("== %s ==\n\n" % title)
+fo.write("\n")
fo.write(r)
fo.write("\n\n[[PageNavi(NavigationList)]]\n\n")
fo.close()
+def copy_attachments(from_dir, dest_dir):
+ d = dircache.listdir(from_dir)
+ rex = re.compile(r"\.(png|jpg|gif)$")
+ for item in d:
+ p = os.path.join(from_dir, item)
+ if rex.search(item) and (not os.path.isdir(p)):
+ shutil.copy2(p, dest_dir)
+
+copy_attachments(html_dir, output_dir)
+
+# generate index page
+def get_preface(text):
+    l = text.split("\n")
+ rex = re.compile(r"^\s*$")
+ preface = ""
+ break_cnt = 0
+ for p in l:
+ if rex.match(p):
+ continue
+ else:
+ break_cnt += 1
+ preface = preface + p + "\n\n"
+ if break_cnt == 2:
+ break
+ return preface
+
+if current_page == 1 and last_page != 1:
+ index_dir = re.sub(r"-[^-]*$", "", output_dir)
+ title_name = title + " PDF"
+ file_name = urllib.quote(title.replace(" ", "_").replace("+", "_"))
+ pre = get_preface(r)
+
+ # output
+ if not os.path.isdir(index_dir):
+ os.makedirs(index_dir)
+ o_pathname = os.path.join(index_dir, "wiki.txt")
+ try:
+ fo = open(o_pathname, "w")
+ except IOError:
+ sys.exit("cannot open output file: %s." % o_pathname)
+
+
+ fo.write(file_name + "\n")
+ fo.write(title_name + "\n")
+ fo.write("\n\n")
+ fo.write("== %s ==\n\n" % title)
+ fo.write(pre)
+
+ t = """ * [[http://sourceforge.jp/projects/test11/wiki/!pdf/%s_all.pdf 全ページをPDF形式でダウンロード]]
+
+"""
+ fo.write(t % title.replace(" ", "_").replace("+", "_"))
+
+ t = " * 記事個別ページ:\n"
+ fo.write(t)
+ for n in range(1, last_page+1):
+ t1 = "%s_p%d" % (title.replace(" ", "_").replace("+", "_"), n)
+ t2 = "%s(%d/%d)" % (title, n, last_page)
+ t = " * [%s %s]" % (t1, t2)
+ fo.write(t)
+ fo.write("\n")
+ fo.write("\n\n")
+ fo.close()
+
+
+ # generate _all page
+ index_dir = re.sub(r"-[^-]*$", "_all", output_dir)
+ title_name = title
+ file_name = urllib.quote(title.replace(" ", "_").replace("+", "_") + "_all")
+
+ # output
+ if not os.path.isdir(index_dir):
+ os.makedirs(index_dir)
+ o_pathname = os.path.join(index_dir, "wiki.txt")
+ try:
+ fo = open(o_pathname, "w")
+ except IOError:
+ sys.exit("cannot open output file: %s." % o_pathname)
+
+ fo.write(file_name + "\n")
+ fo.write(title_name + "\n")
+ fo.write("\n\n")
+ for n in range(1, last_page+1):
+ t = "%s_p%d" % (title.replace(" ", "_").replace("+", "_"), n)
+ fo.write("[[include(%s)]]" % t)
+ fo.write("\n")
+ fo.write("\n\n")
+ fo.close()