OSDN Git Service

add exception routine in A tag conversion.
[otptools/otptools.git] / StoryLists / otptools.py
1 #!/usr/bin/env python\r
2 # -*- coding: utf-8 -*-\r
3 """\r
4 otptools base module - retain session/login info\r
5 """\r
6 import sys\r
7 import os\r
8 import os.path\r
9 import copy\r
10 import urllib\r
11 import httplib\r
12 import re\r
13 \r
14 sys.path.append(os.path.abspath("../"))\r
15 \r
16 from BeautifulSoup import BeautifulSoup\r
17 \r
# Login endpoint of the OTP (Open Tech Press / magazine.sourceforge.jp) site.
OTP_LOGIN_URL = "http://magazine.sourceforge.jp/login.pl"
OTP_LOGIN_HOST = "magazine.sourceforge.jp"
OTP_LOGIN_PATH = "/login.pl"

# Template of the POST form for login.pl; "unickname"/"upasswd" are filled
# in by otptools.login() before submitting.
OTP_LOGIN_PARAM = {
        "op":"userlogin",
        "unickname":"",
        "upasswd":"",
#       "login_temp":0,
        "userlogin":"ログイン",  # submit-button label (Japanese for "Login")
        }

# Path of the admin story-list page, fetched by otptools.get_list().
OTP_LIST_PATH = "/admin.pl"

# User-Agent string sent with every request so the site treats us as a
# regular browser.
BROWSER = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7 (.NET CLR 3.5.30729) "
34 \r
class otptools(object):
    """
    Open Tech Press management library core module.

    Keeps login/session state (a cookie string) and provides helpers to
    fetch and parse the OTP admin story list.
    """

    def __init__(self, path_cookie, login_name="", login_password=""):
        """
        @param path_cookie: path of the file in which the cookie is stored.
        @type  path_cookie: string

        @param login_name: OTP login name.
        @type  login_name: string

        @param login_password: OTP login password.
        @type  login_password: string
        """
        self.path_cookie = path_cookie
        self.unickname = login_name
        self.upasswd = login_password
        self.cookie = ""

    def get_cookie(self):
        """Return the raw session cookie string ("" when not logged in)."""
        return self.cookie

    def login(self, user="", passwd=""):
        """
        POST the login form and remember the session cookie.

        @param user: login name; overrides the one given to __init__().
        @param passwd: password; overrides the one given to __init__().
        @return: 1 on success, -1 when the response had no set-cookie header.
        """
        login_param = copy.deepcopy(OTP_LOGIN_PARAM)

        if user != "":
            self.unickname = user
        if passwd != "":
            self.upasswd = passwd

        login_param["unickname"] = self.unickname
        login_param["upasswd"] = self.upasswd

        encoded_data = urllib.urlencode(login_param)

        headers = {
            "User-Agent": BROWSER,
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain",
            }

        conn = httplib.HTTPConnection(OTP_LOGIN_HOST)
        conn.request("POST", OTP_LOGIN_PATH, encoded_data, headers)
        resp = conn.getresponse()

        # getheaders() yields (name, value) tuples with lower-cased names.
        for name, value in resp.getheaders():
            if name == "set-cookie":
                self.cookie = value
                return 1
        return -1

    def save_cookie(self):
        """Write the current session cookie to self.path_cookie."""
        file_obj = open(self.path_cookie, "w")
        try:
            file_obj.write(self.cookie)
        finally:
            # BUGFIX: close the file even if write() raises.
            file_obj.close()

    def load_cookie(self):
        """Load a previously saved cookie (first line) from self.path_cookie."""
        file_obj = open(self.path_cookie, "r")
        try:
            self.cookie = file_obj.readline()
        finally:
            file_obj.close()

    def get_list(self, skips=0):
        """
        Fetch the OTP story-list page.

        @param skips: paging offset (number of entries to skip); 0 fetches
                      the first page.
        @type  skips: int
        @return: HTML of the list page, or "" on connection error.
        """
        path = OTP_LIST_PATH
        if skips != 0:
            path += "?section=&op=list&next=%s" % skips
        return self._retrieve_html(path)

    def _retrieve_html(self, path):
        """
        GET `path` from OTP_LOGIN_HOST using the stored session cookie.

        Follows 302 redirects recursively.
        @return: response body as a string, or "" on connection error.
        """
        headers = {
            "User-Agent": BROWSER,
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain",
            "Cookie": self.cookie,
            }
        conn = httplib.HTTPConnection(OTP_LOGIN_HOST)

        try:
            conn.request("GET", path, "", headers)
        except httplib.HTTPException:
            # BUGFIX: bare `HTTPException` was a NameError here -- only the
            # `httplib` module is imported.  (e.g. 10060: connection timed out)
            sys.stderr.write("socket error: %s" % (path,))
            return ""

        resp = conn.getresponse()
        resp_headers = dict(resp.getheaders())

        if resp.status == 302:  # redirect
            # NOTE(review): "location" may be an absolute URL while this
            # method expects a path on OTP_LOGIN_HOST; works for the
            # same-host redirects the site emits -- confirm if reused.
            return self._retrieve_html(resp_headers["location"])

        return resp.read()

    def get_tags(self, url):
        """
        Extract a story's tags (topics) from its meta-description tag.

        @param url: story page path to fetch.
        @return: list of tag strings, [] when no match, "" on fetch error.
        """
        html = self._retrieve_html(url)
        if html == "":
            # Keep the historical "" error sentinel for callers that test it.
            return ""

        # Example description: "... -- article related to A, B, C, and D."
        match = re.search(
            r'<meta name="description" content=".* -- article related to (.*)\..*">',
            html, re.M)
        if not match:
            return []
        tag_text = match.group(1).replace("and ", "")
        return tag_text.split(", ")

    def parse_list(self, html_content, story_infos=None):
        """
        Parse admin.pl's story-list HTML into a dict keyed by story URL.

        @param html_content: HTML of the admin story-list page (utf-8).
        @param story_infos: optional dict to merge results into.
        @return: dict mapping story URL -> info dict with keys
                 url, title, editor, page_views, comments, datetime.
        """
        # BUGFIX: the default used to be a shared mutable dict ({}), which
        # silently accumulated entries across successive calls.
        if story_infos is None:
            story_infos = {}

        bsp = BeautifulSoup(html_content, fromEncoding="utf_8")
        bsp_table = bsp.table

        for row in bsp_table.findAll('tr'):
            markup = unicode(row.prettify(), "utf_8")
            # Skip the column-header row.
            if markup.find(u"lt_tb_col") != -1:
                continue

            cells = row.findAll('td')

            # Story URL and title.
            markup = self._cell_markup(cells[1])
            match_obj = re.search(r'<a href="(.*)">(.*)</a>', markup, re.S)
            story_url = match_obj.group(1).strip()
            story_title = match_obj.group(2).strip()

            # Editor name.
            markup = self._cell_markup(cells[2])
            match_obj = re.search(r'<b>(.*)</b>', markup, re.S)
            editor = match_obj.group(1).strip()

            # Page views, comment count and last-update datetime cells.
            page_views = self._plain_cell(cells[5])
            comments = self._plain_cell(cells[6])
            date_time = self._plain_cell(cells[7])

            story_infos[story_url] = {
                "url": story_url,
                "title": story_title,
                "editor": editor,
                "page_views": page_views,
                "comments": comments,
                "datetime": date_time,
                }
        return story_infos

    def _cell_markup(self, cell):
        """Prettified unicode markup of a <td>, with &nbsp; runs collapsed."""
        markup = unicode(cell.prettify(), "utf_8")
        # BUGFIX: the 4th positional argument of re.sub is `count`, not
        # `flags`; the original passed re.S (== 16) as a replacement cap.
        # The (?s) inline flag keeps the intended DOTALL semantics.
        return re.sub(r"(?s)&nbsp;\s*", " ", markup)

    def _plain_cell(self, cell):
        """Stripped text content of a plain <td>...</td> cell."""
        match_obj = re.search(r'<td>\s*(.*)\s*</td>',
                              self._cell_markup(cell), re.S)
        return match_obj.group(1).strip()