sfmag2wiki/sfmag_html2wiki.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 """convert sfjpmagazine's story to sfjpwiki-style text."""
   4
   5 import sys
   6 import re
   7 import os
   8 import os.path
   9 import urlparse
  10 import urllib
  11 import dircache
  12 import shutil
  13
  14 import html2sfjpwiki
  15
  16 usage = """%s <html_pathname> <base_dir> <output_base_dir>""" % sys.argv[0]
  17
  18
  19 try:
  20     html_pathname = sys.argv[1]
  21     base_dir = sys.argv[2]
  22     output_base_dir = sys.argv[3]
  23 except IndexError:
  24     sys.exit(usage)
  25
  26 try:
  27     fh = open(html_pathname, "r")
  28 except IOError:
  29     sys.exit(usage)
  30
  31 print >> sys.stderr, "converting %s..." % html_pathname
  32
  33 body = fh.read()
  34 fh.close()
  35
  36 rex = re.compile(r"<\s*script[^>]*?>.*?</script>", re.S)
  37 rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</noscript>", re.S)
  38 tmp = rex.sub("", body)
  39 tmp = rex2.sub("", tmp)
  40
  41 # parse
  42 c = html2sfjpwiki.Html2SfjpWiki()
  43 r = c.parse(tmp, "article-body")
  44 title = c.title.replace(" - SourceForge.JP Magazine", "").strip()
  45
  46 # calculate pathes
  47 html_dir = os.path.dirname(html_pathname)
  48 rel_dir = os.path.relpath(html_dir, base_dir)
  49 output_dir = os.path.join(output_base_dir, rel_dir.replace(os.path.sep, "-"))
  50
  51 # calculate pages
  52 pdir = os.path.dirname(html_dir)
  53 p = 1
  54 while os.path.isdir(os.path.join(pdir, str(p))):
  55     p += 1
  56 last_page = p-1
  57 current_page = int(os.path.basename(html_dir))
  58
  59 # calculate titles
  60 if last_page == 1:
  61     page_title = title
  62     page_filename = title
  63 else:
  64     page_title = title + "（%d/%d）" % (current_page, last_page)
  65     page_filename = title + "_p%d" % current_page
  66
  67 page_filename_quoted = urllib.quote(page_filename.replace(" ", "_").replace("+", "_"))
  68
  69 # output
  70 if not os.path.isdir(output_dir):
  71     os.makedirs(output_dir)
  72 o_pathname = os.path.join(output_dir, "wiki.txt")
  73 try:
  74     fo = open(o_pathname, "w")
  75 except IOError:
  76     sys.exit("cannot open output file: %s." % o_pathname)
  77
  78 fo.write(page_filename_quoted + "\n")
  79 fo.write(page_title + "\n\n")
  80 fo.write("[[PageNavi(NavigationList)]]\n\n")
  81 if current_page == 1:
  82     fo.write("== %s ==\n\n" % title)
  83 fo.write("\n")
  84 fo.write(r)
  85 fo.write("\n\n[[PageNavi(NavigationList)]]\n\n")
  86 fo.close()
  87
  88 def copy_attachments(from_dir, dest_dir):
  89     d = dircache.listdir(from_dir)
  90     rex = re.compile(r"\.(png|jpg|gif)$")
  91     for item in d:
  92         p = os.path.join(from_dir, item)
  93         if rex.search(item) and (not os.path.isdir(p)):
  94             shutil.copy2(p, dest_dir)
  95
# Copy the page's images (png/jpg/gif files sitting next to the HTML file)
# into the output directory, alongside the generated wiki.txt.
copy_attachments(html_dir, output_dir)
  97
  98 # generate index page
  99 def get_preface(str):
 100     l = r.split("\n")
 101     rex = re.compile(r"^\s*$")
 102     preface = ""
 103     break_cnt = 0
 104     for p in l:
 105         if rex.match(p):
 106             continue
 107         else:
 108             break_cnt += 1
 109             preface = preface + p + "\n\n"
 110             if break_cnt == 2:
 111                 break
 112     return preface
 113
 114 if current_page == 1 and last_page != 1:
 115     index_dir = re.sub(r"-[^-]*$", "", output_dir)
 116     title_name = title + " PDF"
 117     file_name = urllib.quote(title.replace(" ", "_").replace("+", "_"))
 118     pre = get_preface(r)
 119
 120     # output
 121     if not os.path.isdir(index_dir):
 122         os.makedirs(index_dir)
 123     o_pathname = os.path.join(index_dir, "wiki.txt")
 124     try:
 125         fo = open(o_pathname, "w")
 126     except IOError:
 127         sys.exit("cannot open output file: %s." % o_pathname)
 128
 129
 130     fo.write(file_name + "\n")
 131     fo.write(title_name + "\n")
 132     fo.write("\n\n")
 133     fo.write("== %s ==\n\n" % title)
 134     fo.write(pre)
 135
 136     t = """ * ［[http://sourceforge.jp/projects/test11/wiki/!pdf/%s_all.pdf 全ページをPDF形式でダウンロード]］
 137
 138 """
 139     fo.write(t % title.replace(" ", "_").replace("+", "_"))
 140
 141     t = " * 記事個別ページ：\n"
 142     fo.write(t)
 143     for n in range(1, last_page+1):
 144         t1 = "%s_p%d" % (title.replace(" ", "_").replace("+", "_"), n)
 145         t2 = "%s（%d/%d）" % (title, n, last_page)
 146         t = "   * [%s %s]" % (t1, t2)
 147         fo.write(t)
 148         fo.write("\n")
 149     fo.write("\n\n")
 150     fo.close()
 151
 152
 153     # generate _all page
 154     index_dir = re.sub(r"-[^-]*$", "_all", output_dir)
 155     title_name = title
 156     file_name = urllib.quote(title.replace(" ", "_").replace("+", "_") + "_all")
 157
 158     # output
 159     if not os.path.isdir(index_dir):
 160         os.makedirs(index_dir)
 161     o_pathname = os.path.join(index_dir, "wiki.txt")
 162     try:
 163         fo = open(o_pathname, "w")
 164     except IOError:
 165         sys.exit("cannot open output file: %s." % o_pathname)
 166
 167     fo.write(file_name + "\n")
 168     fo.write(title_name + "\n")
 169     fo.write("\n\n")
 170     for n in range(1, last_page+1):
 171         t = "%s_p%d" % (title.replace(" ", "_").replace("+", "_"), n)
 172         fo.write("[[include(%s)]]" % t)
 173         fo.write("\n")
 174     fo.write("\n\n")
 175     fo.close()