2 # -*- coding: utf-8 -*-
class SfjpMagFetcher(spyder.Spyder):
    """Web Spider for SourceForge.JP Magazine.

    Crawls story pages under a given base URL and mirrors each page body
    (and its static-blob image attachments) into a local directory tree.

    NOTE(review): this excerpt is truncated — several statement lines are
    missing from view (file writes/closes, return statements, makedirs).
    Comments below mark each suspected gap; confirm against the full file.
    """

    def __init__(self, url, output_dir):
        """url needs complete url like 'http://hogehoge.jp/foo/bar/boo'.

        url        -- base URL; only URLs with this prefix are traced.
        output_dir -- root directory the mirrored pages are written under.
        """
        spyder.Spyder.__init__(self)
        self._output_dir = output_dir
        # Matches any URL under the base url; group(1) is the suffix.
        # NOTE(review): `url` is interpolated unescaped into the pattern,
        # so regex metacharacters in the base URL would alter matching.
        self._url_rex = re.compile("^" + url + r"(.*)$")
        # Matches image/attachment blobs on the static host.
        self._url_rex_img = re.compile(r"^http://static.sourceforge.jp/magazine/blob/.*$")

    def handle_url(self, url):
        """check url should be traced or not. if trace, return True. Normally, you should override this function."""
        if self._url_rex.search(url):
            # NOTE(review): a line is elided here — per the docstring this
            # branch presumably returns True to trace the URL; confirm.
            if self._url_rex_img.search(url):
                # Static-blob images are not traced as pages; they are
                # downloaded immediately as attachments instead.
                self._save_attachment(url)

    def _save_attachment(self, url):
        """save url as attachment.

        Downloads `url` and stores it, under the directory derived from the
        page currently being crawled, using the URL's last path component
        as the file name.
        """
        t = urlparse.urlparse(url)  # scheme://netloc/path;parameters?query#fragment
        filename = t.path.split("/")[-1]
        data = self.grab_by_get(url)
        # Attachment goes next to the page that referenced it, so the
        # directory is derived from the crawler's *current* page URL.
        output_dir = self._url_to_path(self.current_url())
        output_file = os.path.join(output_dir, filename)
        self.prepare_output_dir(output_dir)
        print >> sys.stderr, "output to %s ." % (output_file)
        f = open(output_file, "w")
        # NOTE(review): lines elided here — presumably f.write(data) and
        # f.close(); `data` is otherwise unused. Confirm in the full file.

    def handle_start_fetch(self, url):
        """this function is called when start to fetch url."""
        # Progress logging only; stdout is left free for real output.
        print >> sys.stderr, "fetch %s ..." % (url)

    def _url_to_path(self, url):
        # Map a story URL to the local directory its files are saved in.
        m = self._url_rex.search(url)
        # NOTE(review): lines elided here — `m` is unused in the visible
        # code; the missing lines likely extract a page number from the
        # match for the paging logic. Confirm against the full file.
        story_id = url.replace("http://sourceforge.jp/magazine/", "")
        # "/1" looks like the page number within the story — presumably
        # overridden by the elided paging code above; verify.
        rel_dir = story_id + "/1"
        return os.path.join(self._output_dir, rel_dir)

    def handle_data(self, url, level, data):
        """this function is called when data grabbed."""
        # Persist the grabbed page body as body.html in its story dir.
        output_dir = self._url_to_path(url)
        output_file = os.path.join(output_dir, "body.html")
        self.prepare_output_dir(output_dir)
        print >> sys.stderr, "output to %s ." % (output_file)
        f = open(output_file, "w")
        # NOTE(review): lines elided here — presumably f.write(data) and
        # f.close(). Confirm in the full file.
        # Pull down every static-blob image referenced by the page.
        for src in self.extract_imgs(data, url):
            if self._url_rex_img.search(src):
                self._save_attachment(src)

    def prepare_output_dir(self, dir):
        # Create `dir` (and parents) on first use.
        if not os.path.exists(dir):
            # NOTE(review): the suite is elided here — presumably
            # os.makedirs(dir). Confirm in the full file.
def fetch(url, output_dir):
    """Fetch SourceForge.JP Magazine's story selected by url with keep paging

    url        -- full story URL to start crawling from.
    output_dir -- root directory to mirror the story into.
    """
    f = SfjpMagFetcher(url, output_dir)
    # NOTE(review): the function appears to continue past this excerpt —
    # `f` is unused in the visible lines; presumably f.run() (or similar)
    # follows to start the crawl. Confirm against the full file.