OSDN Git Service

update html2fsjpwiki.py
[otptools/otptools.git] / fetch_sfjpmag.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """fetch url"""
4
5 import os.path
6 import sys
7 import re
8 import os
9 import urlparse
10
11 import spyder
12
13
class SfjpMagFetcher(spyder.Spyder):
    """Web spider for SourceForge.JP Magazine stories.

    Crawls one story (all of its pages) starting from the given URL and
    writes each page body plus its image attachments below output_dir.
    """

    def __init__(self, url, output_dir):
        """url needs complete url like 'http://hogehoge.jp/foo/bar/boo'.

        url        -- complete URL of the story's first page
        output_dir -- root directory the fetched files are written under
        """
        spyder.Spyder.__init__(self)
        self._url = url
        self._output_dir = output_dir
        # Matches the story URL itself plus any trailing sub-path
        # (e.g. "/2" for page 2).  re.escape keeps dots/slashes literal.
        self._url_rex = re.compile("^" + re.escape(url) + r"(.*)$")
        # Matches attachment (image) URLs on the static file server.
        self._url_rex_img = re.compile(
            r"^http://static\.sourceforge\.jp/magazine/blob/.*$")
        self.append_url(url)

    def handle_url(self, url):
        """Return True if url should be traced (it is part of the story).

        Attachment URLs are downloaded immediately via _save_attachment()
        and are not traced further.  Overrides spyder.Spyder.handle_url.
        """
        if self._url_rex.search(url):
            return True
        if self._url_rex_img.search(url):
            self._save_attachment(url)
        return False

    def _save_attachment(self, url):
        """Download url and save it in the directory of the current page."""
        # scheme://netloc/path;parameters?query#fragment
        parts = urlparse.urlparse(url)
        filename = parts.path.split("/")[-1]
        data = self.grab_by_get(url)
        output_dir = self._url_to_path(self.current_url())
        output_file = os.path.join(output_dir, filename)

        self.prepare_output_dir(output_dir)

        sys.stderr.write("output to %s .\n" % (output_file,))
        # "wb": attachments are binary image data, never text.
        f = open(output_file, "wb")
        try:
            f.write(data)
        finally:
            f.close()

    def handle_start_fetch(self, url):
        """Called by the spider just before it starts fetching url."""
        sys.stderr.write("fetch %s ...\n" % (url,))

    def _url_to_path(self, url):
        """Map a story page URL to its output directory.

        Returns None when url does not belong to the story at all.
        Page 1 (empty suffix) is stored under "<story_id>/1"; page N
        ("/N" suffix) under "<story_id>" itself.
        """
        m = self._url_rex.search(url)
        if m is None:
            # something wrong: url is not below the story URL
            return None
        page = m.group(1)  # "" for page 1, "/N" for page N
        if page == "":
            pagenum = 1
        elif page.startswith("/"):
            pagenum = page[1:]
        else:
            # Unexpected suffix shape; treat it like a non-first page so
            # pagenum is always bound (the original code raised NameError).
            pagenum = None
        story_id = url.replace("http://sourceforge.jp/magazine/", "")

        if pagenum == 1:
            rel_dir = story_id + "/1"
        else:
            rel_dir = story_id
        return os.path.join(self._output_dir, rel_dir)

    def handle_data(self, url, level, data):
        """Called when a page body has been grabbed: save it and its images."""
        output_dir = self._url_to_path(url)
        output_file = os.path.join(output_dir, "body.html")
        self.prepare_output_dir(output_dir)

        sys.stderr.write("output to %s .\n" % (output_file,))
        f = open(output_file, "w")
        try:
            f.write(data)
        finally:
            f.close()

        # get attachments referenced from this page's <img> tags
        for src in self.extract_imgs(data, url):
            if self._url_rex_img.search(src):
                self._save_attachment(src)

    def prepare_output_dir(self, dir):
        """Create dir (and any missing parents) if it does not exist yet."""
        if not os.path.exists(dir):
            try:
                os.makedirs(dir)
            except OSError:
                # Directory may have appeared concurrently (race between the
                # exists() check and makedirs); best-effort, keep going.
                pass
96
def fetch(url, output_dir):
    """Crawl the SourceForge.JP Magazine story at url, following its
    pagination, and store the results under output_dir."""
    SfjpMagFetcher(url, output_dir).run()
101