#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""otptools base module - retain session/login info."""

import os
import sys

sys.path.append(os.path.abspath("../"))

import copy
import re
import urllib
import httplib

from BeautifulSoup import BeautifulSoup

# OTP (Open Tech Press / sourceforge.jp magazine) endpoints
OTP_LOGIN_URL = "http://magazine.sourceforge.jp/login.pl"
OTP_LOGIN_HOST = "magazine.sourceforge.jp"
OTP_LOGIN_PATH = "/login.pl"
OTP_LIST_PATH = "/admin.pl"

# NOTE(review): OTP_LOGIN_PARAM is referenced by otptools.login() but its
# definition was lost in this chunk of the file; reconstructed minimally.
# Confirm the full set of form fields against the original source.
OTP_LOGIN_PARAM = {
    "op": "userlogin",
    "unickname": "",
    "upasswd": "",
}

# User-Agent string sent with every request (mimics a desktop Firefox)
BROWSER = "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7 (.NET CLR 3.5.30729) "
class otptools(object):
    """
    Open Tech Press management library core module.

    Holds login credentials and the session cookie, and provides
    scraping helpers for the OTP admin story list.
    """

    def __init__(self, path_cookie, login_name="", login_password=""):
        """
        @param path_cookie: path of file in which the cookie is stored.
        @type path_cookie: string
        @param login_name: OTP's login name for use.
        @type login_name: string
        @param login_password: OTP's login password.
        @type login_password: string
        """
        self.path_cookie = path_cookie
        self.unickname = login_name
        self.upasswd = login_password
        # Session cookie string; populated by login() or load_cookie().
        # Initialized here so get_cookie()/save_cookie() cannot raise
        # AttributeError when called before a login.
        self.cookie = ""
56 def get_cookie(self):
\r
59 def login(self, user="", passwd=""):
\r
60 login_param = copy.deepcopy(OTP_LOGIN_PARAM)
\r
63 self.unickname = user
\r
65 self.upasswd = passwd
\r
67 login_param["unickname"] = self.unickname
\r
68 login_param["upasswd"] = self.upasswd
\r
70 # for item in login_param:
\r
71 # print "%s > %s" % (item, login_param[item] )
\r
73 encoded_data = urllib.urlencode(login_param)
\r
74 # print encoded_data
\r
77 "User-Agent": BROWSER,
\r
78 "Content-type": "application/x-www-form-urlencoded",
\r
79 "Accept": "text/plain",
\r
82 obj = httplib.HTTPConnection(OTP_LOGIN_HOST)
\r
83 obj.request("POST", OTP_LOGIN_PATH, encoded_data, headers)
\r
84 resp = obj.getresponse()
\r
85 headers = resp.getheaders()
\r
87 # for item in headers:
\r
90 for header in headers:
\r
91 if header[0] == "set-cookie":
\r
92 str_cookie = header[1]
\r
97 self.cookie = str_cookie
\r
100 def save_cookie(self):
\r
101 file_obj = open(self.path_cookie, "w")
\r
102 file_obj.write(self.cookie)
\r
105 def load_cookie(self):
\r
106 file_obj = open(self.path_cookie, "r")
\r
107 self.cookie = file_obj.readline()
\r
110 def get_list(self, skips=0):
\r
112 get otp story list.
\r
114 @param skips: index
\r
117 path = OTP_LIST_PATH
\r
119 path = path + "?section=&op=list&next=%s" % skips
\r
121 # sys.stderr.write( path + "\n" )
\r
122 return self._retrieve_html( path )
\r
124 def _retrieve_html(self, path):
\r
126 retrive html from url.
\r
129 "User-Agent": BROWSER,
\r
130 "Content-type": "application/x-www-form-urlencoded",
\r
131 "Accept": "text/plain",
\r
132 "Cookie": self.cookie,
\r
134 obj = httplib.HTTPConnection(OTP_LOGIN_HOST)
\r
137 obj.request("GET", path, "", headers)
\r
138 except HTTPException:
\r
140 sys.stderr.write( "socket error: %s" % (path) )
\r
143 resp = obj.getresponse()
\r
144 headers = dict(resp.getheaders())
\r
146 if resp.status == 302: # redirect
\r
147 return self._retrieve_html( headers["location"] )
\r
151 def get_tags(self, url):
\r
153 get story's tags (topics).
\r
155 html = self._retrieve_html(url)
\r
160 match = re.search( r'<meta name="description" content=".* -- article related to (.*)\..*">', html, re.M )
\r
161 # 分割されたWebページを1つにつなげる「AutoPager」拡張 -- article related to 森川拓男, プラグイン/機能拡張, Mozilla, Index, and デベロッパー.
\r
163 str = match.group(1)
\r
164 str = str.replace( "and ", "" )
\r
165 tags = str.split( ", ")
\r
170 def parse_list(self, html_content, story_infos={}):
\r
172 parse admin.pl's html.
\r
174 bsp = BeautifulSoup(html_content,fromEncoding="utf_8")
\r
175 bsp_table = bsp.table
\r
177 for row in bsp_table.findAll('tr'):
\r
178 str = unicode(row.prettify(), "utf_8" )
\r
179 if not str.find(ur"lt_tb_col") == -1:
\r
182 datas = row.findAll('td')
\r
184 # extract story's url and title
\r
185 str = unicode(datas[1].prettify(), "utf_8")
\r
186 str = re.sub( r" \s*", " ", str, re.S )
\r
188 match_obj = re.search( r'<a href="(.*)">(.*)</a>', str, re.S )
\r
189 story_url = match_obj.group(1).strip()
\r
190 story_title = match_obj.group(2).strip()
\r
193 str = unicode(datas[2].prettify(), "utf_8")
\r
194 str = re.sub( r" \s*", " ", str, re.S )
\r
195 match_obj = re.search( r'<b>(.*)</b>', str, re.S )
\r
196 editor = match_obj.group(1).strip()
\r
199 str = unicode(datas[5].prettify(), "utf_8")
\r
200 str = re.sub( r" \s*", " ", str, re.S )
\r
201 match_obj = re.search( r'<td>\s*(.*)\s*</td>', str, re.S )
\r
202 page_views = match_obj.group(1).strip()
\r
205 str = unicode(datas[6].prettify(), "utf_8")
\r
206 str = re.sub( r" \s*", " ", str, re.S )
\r
207 match_obj = re.search( r'<td>\s*(.*)\s*</td>', str, re.S )
\r
208 comments = match_obj.group(1).strip()
\r
211 str = unicode(datas[7].prettify(), "utf_8")
\r
212 str = re.sub( r" \s*", " ", str, re.S )
\r
213 match_obj = re.search( r'<td>\s*(.*)\s*</td>', str, re.S )
\r
214 date_time = match_obj.group(1).strip()
\r
218 "title":story_title,
\r
220 "page_views":page_views,
\r
221 "comments":comments,
\r
222 "datetime":date_time,
\r
224 story_infos[story_url] = story_info
\r