+++ /dev/null
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import os.path
-import sys
-import re
-import os
-import urllib
-import HTMLParser
-
# Synopsis shown when the URL-list argument is missing.
usage = "%s <urllist>" % sys.argv[0]

# The first positional argument is the path of the URL-list file; without
# it there is nothing to process, so exit with the synopsis as the message.
if len(sys.argv) < 2:
    sys.exit(usage)
fname = sys.argv[1]
-
class TitleParser(HTMLParser.HTMLParser):
    """HTML parser that extracts the text of a document's first <title>.

    An instance is meant to handle a single document: pass the markup to
    ``parse`` and use the string it returns.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._capt = False  # True while inside the <title> being captured
        self._seen = False  # True once a <title> start tag has been captured
        self._chunks = []   # raw text pieces collected inside the title
        self._title = ""    # title assembled from _chunks by parse()

    def parse(self, html_string):
        """Feed html_string to the parser and return the page title.

        Returns the text of the first <title> element with leading and
        trailing whitespace removed, or "" when no title was found.
        """
        self.feed(html_string)
        # Strip once on the joined text: stripping every piece separately
        # would delete the spaces next to an entity or character reference.
        self._title = "".join(self._chunks).strip()
        return self._title

    def handle_starttag(self, tag, attrs):
        """Start-tag handler: begin capturing at the first <title>."""
        # Later <title> elements (for example inside inline SVG) are not the
        # page title, so only the first one is captured.
        if tag == "title" and not self._seen:
            self._capt = True
            self._seen = True

    def handle_endtag(self, tag):
        """End-tag handler: stop capturing when </title> is reached."""
        if tag == "title":
            self._capt = False

    def handle_data(self, data):
        """Text handler: collect text that appears inside the title."""
        if self._capt:
            self._chunks.append(data)

    def handle_entityref(self, name):
        """Entity handler: keep ``&name;`` verbatim instead of dropping it."""
        if self._capt:
            self._chunks.append("&%s;" % name)

    def handle_charref(self, name):
        """Character-reference handler: keep ``&#name;`` verbatim."""
        if self._capt:
            self._chunks.append("&#%s;" % name)
-
# Patterns that remove <script>/<noscript> elements, bodies included, before
# the title is extracted.  re.S lets "." span newlines; re.I also catches
# upper-case tags such as <SCRIPT>.
rex = re.compile(r"<\s*script[^>]*?>.*?</\s*script\s*>", re.S | re.I)
rex2 = re.compile(r"<\s*noscript[^>]*?>.*?</\s*noscript\s*>", re.S | re.I)
# Site-name suffix removed from titles; the dot is escaped so that it only
# matches a literal ".".  Compiled once instead of on every iteration.
suffix_rex = re.compile(r"\s*- SourceForge\.JP Magazine\s*$")
# Characters replaced by "_" in the emitted name.  Any whitespace is included
# so that a tab or newline inside a title cannot break the tab-separated,
# one-record-per-line output.
unsafe_rex = re.compile(r"[\s/+]")

# Reads one URL per line from the list file and writes "<url>\t<name>_p1"
# to standard output for each of them.
with open(fname, "r") as f:
    for url in f:
        url = url.strip()
        if not url:
            # Blank lines carry no URL to fetch.
            continue

        u = urllib.urlopen(url)
        try:
            data = u.read()
        finally:
            # The response object is not a context manager in Python 2, so
            # close it explicitly to avoid leaking one connection per URL.
            u.close()

        t = rex.sub("", data)
        t = rex2.sub("", t)

        p = TitleParser()
        title = p.parse(t)
        title = suffix_rex.sub("", title)
        title = unsafe_rex.sub("_", title)
        sys.stdout.write(url + "\t" + title + "_p1\n")
-
-