--- /dev/null
+# -*- coding: Windows-31J -*-\r
+\r
+require 'fileutils'\r
+require 'uri'\r
+require 'net/http'\r
+require 'net/https'\r
+require 'digest/sha2'\r
+require 'time'\r
+require 'cgi'\r
+\r
+$LOAD_PATH.unshift(File.dirname(__FILE__))\r
+require 'downtable.rb'\r
+$LOAD_PATH.shift\r
+Net::HTTP.version_1_2\r
+\r
+\r
+\r
+class ArcGET\r
+ \r
+ MEDIA_EXP = /\.(wav|wma|avi|asf|ogg|mp3|mp4|mpeg|mpg|mid|midi|smf|smaf|m4a|swf|flv|bmp|gif|tiff|tif|png|vob|ogm|mov|rm|divx)$/ni\r
+ \r
+ Waiting = URLtable::Waiting\r
+ WaitRetry = URLtable::WaitRetry\r
+ \r
+ SaveSuccess = URLtable::SaveSuccess\r
+ NewLocation = URLtable::NewLocation\r
+ NotFound = URLtable::NotFound\r
+ SkipScheme = URLtable::SkipScheme\r
+ \r
+ ConectTimeout = URLtable::ConectTimeout\r
+ ConnectError = URLtable::ConnectError\r
+ Forbidden = URLtable::Forbidden\r
+ AuthRequest = URLtable::AuthRequest\r
+ Error4xx = URLtable::Error4xx\r
+ Error5xx = URLtable::Error5xx\r
+ EOFreached = URLtable::EOFreached\r
+ CannotGetNetHTTP = URLtable::CannotGetNetHTTP # Page that cannot be retrieved with Net::HTTP (the site itself exists)\r
+ BadURI = URLtable::BadURI # URI correction will be attempted later\r
+ \r
+ \r
+ def initialize(param={:path => nil, :db => nil, :pendingURI => nil})\r
+ @rootExp = []\r
+ @rootMAXlink = 99999999\r
+ @cgirootExp = []\r
+ @cgiMAXlink = 10\r
+ @excludeWayback = false\r
+ @sleepTime = 20.0\r
+ @commitCount = 16\r
+ @savePath = if param[:path] then param[:path] else "./save" end\r
+ @saveDB = if param[:db] then param[:db] else "./url.db3" end\r
+ @pendingURI = param[:pendingURI]\r
+ @pendingURI = nil if @pendingURI.to_s == ""\r
+ @list = URLtable.new(param={:path => @savePath, :db => @saveDB})\r
+ @outStream = STDOUT\r
+ end\r
+ \r
+ def sleep_setting(set)\r
+ @sleepTime=set\r
+ end\r
+ \r
+ def exclude_wayback(set)\r
+ @excludeWayback = set\r
+ end\r
+ \r
+ def commit_count(set)\r
+ @commitCount = set\r
+ end\r
+ \r
+ def save_path(param={:path => nil, :db => nil, :pendingURI => nil})\r
+ path = param[:path]\r
+ db = param[:db]\r
+ pendingURI = param[:pendingURI]\r
+ \r
+ if path then\r
+ @savePath = File.expand_path(path)\r
+ @list.dataDir = @savePath\r
+ end\r
+ if db then\r
+ @saveDB = File.expand_path(db)\r
+ @list.saveDB = @saveDB\r
+ end\r
+ if pendingURI then\r
+ @pendingURI = pendingURI\r
+ @pendingURI = nil if @pendingURI.to_s.size == 0\r
+ end\r
+ end\r
+ \r
+ def set_echo(obj=nil)\r
+ tmp=@outStream\r
+ @outStream = obj\r
+ return tmp\r
+ end\r
+ \r
+ def add_root(root, max=nil)\r
+ if max then\r
+ @rootMAXlink = max.to_i\r
+ end\r
+ unless root then\r
+ @rootExp = []\r
+ return\r
+ end\r
+ @rootExp << root\r
+ @rootExp.uniq!\r
+ end\r
+ \r
+ def add_cgi_root(root, max=nil)\r
+ if max then\r
+ @cgiMAXlink = max.to_i\r
+ end\r
+ unless root then\r
+ @cgirootExp = []\r
+ return\r
+ end\r
+ @cgirootExp << root\r
+ @cgirootExp.uniq!\r
+ end\r
+ \r
+ def add_url(url)\r
+ return unless url\r
+ url = url.to_s\r
+ @list.transaction do\r
+ unless @list.exists(url) then\r
+ r = URLtable::ROW.new\r
+ r.url = url\r
+ r.priority = r.priority | 0x40000000\r
+ @list.update r\r
+ else\r
+ @outStream.write("Alerdy Exists #{url}\n") if @outStream\r
+ end\r
+ end\r
+ end\r
+ \r
+ \r
+ def save_param(dat,response)\r
+ uri = dat.uri\r
+ #p "@@",response.code.to_s\r
+ dat.statusCode = response.code.to_s\r
+ dat.bytes = response.body.size\r
+ dat.tryNow = dat.tryNow + 1\r
+ dat.timeStamp = Time.parse(response['Date']) rescue nil\r
+ dat.downloadTime = Time.now\r
+ digest = Digest::SHA256.new\r
+ digest << response.body\r
+ dat.checksum = digest.hexdigest\r
+ dat.contentType = response['Content-Type'].to_s\r
+ #p response\r
+ end\r
+ \r
+ def save_body(dat,response, cnt=0)\r
+ return if cnt > 9\r
+ \r
+ if cnt == 0 then\r
+ uri = dat.uri\r
+ dat.savePath = "/#{uri.host}#{uri.path}"\r
+ if(uri.query.to_s.size>0)then\r
+ dat.savePath = dat.savePath + "_"+URI.escape(uri.query,/[<>:\x27\x22\x2B\x2F\x3F\x5C]/n)\r
+ end\r
+ end\r
+ \r
+ path="#{@savePath}#{dat.savePath}"\r
+ if cnt > 0 then\r
+ path += ".#{cnt}"\r
+ end\r
+ \r
+ path = File.expand_path(path)\r
+ FileUtils.mkdir_p(File.dirname(path))\r
+ \r
+ if File.exist?(path) then\r
+ save_body(dat,response, cnt+1)\r
+ end\r
+ begin\r
+ open(path,'wb'){|f| f.write response.body }\r
+ rescue Errno::EISDIR\r
+ dat.savePath = dat.savePath + '/index.html'\r
+ save_body(dat,response)\r
+ end\r
+ end\r
+ \r
+ def check_root(add)\r
+ #p add.url.to_s\r
+ #p add.referrer.to_s\r
+ @rootExp.each do |exp|\r
+ # p exp\r
+ return true if exp.match(add.url.to_s)\r
+ return true if exp.match(add.referrer.to_s)\r
+ end\r
+ return false\r
+ end\r
+ \r
+ def check_cgiroot(add)\r
+ to = false\r
+ from = false\r
+ @cgirootExp.each do |exp|\r
+ to = true if exp.match(add.url.to_s)\r
+ from = true if exp.match(add.referrer.to_s)\r
+ return true if to and from\r
+ end\r
+ return false\r
+ end\r
+ \r
+ def add_nextpage_sub(path, dat)\r
+ return unless path\r
+ \r
+ #p path\r
+ path = CGI::unescapeHTML(path.to_s)\r
+ return if /^(?:javascript|mailto|data|file|tel):/ni.match(path)\r
+ #p "##"\r
+ uri = dat.uri\r
+ begin\r
+ # p path\r
+ path = URI.parse( path.gsub(/[\x00-\x1F\x80-\xFF]/n){|x| '%'+x.unpack('H2')[0] } )\r
+ # p path\r
+ rescue URI::InvalidURIError, URI::InvalidComponentError\r
+ # p "INV #{path}"\r
+ return\r
+ end\r
+ newuri = (uri+path)\r
+ newuri.fragment=nil\r
+ \r
+ add = URLtable::ROW.new\r
+ add.url = newuri.to_s\r
+ add.referrer = dat.url\r
+ add.linkCount = dat.linkCount+1\r
+ if check_cgiroot(add) then\r
+ add.linkCountCGI = dat.linkCountCGI+1\r
+ end\r
+ ##add.message = path #for debug\r
+ \r
+ #p "L #{newuri} #{add.linkCount}:#{add.linkCountCGI}"\r
+ return if add.linkCountCGI > @cgiMAXlink\r
+ \r
+ isroot = check_root(add)\r
+ #p isroot\r
+ if isroot and (add.linkCount <= @rootMAXlink) then\r
+ # p "U"\r
+ if @rootExp[0].match(add.url) or (@cgirootExp[0] and @cgirootExp[0].match(add.url)) then\r
+ add.priority = add.priority | 0x40000000\r
+ end\r
+ @list.update add,false\r
+ else\r
+ if @pendingURI then\r
+ open(@pendingURI,'a'){|f|\r
+ f.write "#{add.url}\n"\r
+ }\r
+ end\r
+ end\r
+ #p path\r
+ end\r
+ \r
+ def add_nextpage(dat,response)\r
+=begin\r
+ text = response.body\r
+ #p text\r
+ scriptmode = false\r
+ exp = /(<script|<\/script)\b|\b(?:href|src)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni\r
+ text.scan( exp ) do |t|\r
+ curr=t.shift\r
+ if curr=='<script'\r
+ scriptmode = true\r
+ end\r
+ if curr=='</script'\r
+ scriptmode = false\r
+ end\r
+ \r
+ if !scriptmode then\r
+ add_nextpage_sub(t[0], dat)\r
+ add_nextpage_sub(t[1], dat)\r
+ add_nextpage_sub(t[2], dat)\r
+ end\r
+ end\r
+=end\r
+ text = response.body\r
+ scriptmode = false\r
+ exp = /(<script|<\/script)\b|\b(href|src|value)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni\r
+ text.scan( exp ) do |t|\r
+ curr=t.shift.to_s.downcase\r
+ if curr=='<script'\r
+ scriptmode = true\r
+ end\r
+ if curr=='</script'\r
+ scriptmode = false\r
+ end\r
+ curr=t.shift.to_s.downcase\r
+ mediacheck = curr=='value'\r
+ add_nextpage_sub(t[0], dat) if t[0] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+ add_nextpage_sub(t[1], dat) if t[1] and ( (not mediacheck) or MEDIA_EXP.match(t[1].to_s))\r
+ add_nextpage_sub(t[2], dat) if t[2] and ( (not mediacheck) or MEDIA_EXP.match(t[2].to_s))\r
+ end\r
+ end\r
+ \r
+ def save_newlocation(dat,response)\r
+ add_nextpage_sub(response['location'], dat)\r
+ end\r
+ \r
+ def responseHead(response)\r
+ r=''\r
+ response.each do |name,value|\r
+ r += "#{name} : #{value}\n"\r
+ end\r
+ return r\r
+ end\r
+ \r
+ def httpget(dat)\r
+ begin\r
+ @outStream.write("Get: #{Time.now.to_s} [#{sprintf('%08X:%3d,%3d',dat.priority.to_i, dat.linkCount, dat.linkCountCGI)}] #{dat.uri.to_s} <- Referer:#{dat.referrer.to_s} ... ") if @outStream\r
+ rescue URI::InvalidURIError\r
+ @outStream.write("BAD: #{Time.now.to_s} #{dat.url} <- Referer:#{dat.referrer.to_s}\n") if @outStream\r
+ dat.execond = URLtable::BadURI\r
+ dat.status = ''\r
+ dat.tryNow = dat.tryNow + 1\r
+ dat.downloadTime = Time.now\r
+ @list.update dat\r
+ \r
+ open("badURI.txt","a"){|f| f.write "#{dat.inspect}\n" }\r
+ return\r
+ end\r
+ \r
+ uri = dat.uri\r
+ response = nil\r
+ rescode = nil\r
+ begin\r
+ case uri.normalize.scheme.tr('A-Z','a-z')\r
+ when 'https'\r
+ https = Net::HTTP.new(uri.host, uri.port)\r
+ https.open_timeout = 15\r
+ https.use_ssl = true\r
+ https.verify_mode = OpenSSL::SSL::VERIFY_NONE\r
+ https.verify_depth = 5\r
+ https.start do\r
+ request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>dat.referrer.to_s } )\r
+ response = https.request request\r
+ end\r
+ when 'http'\r
+ http = Net::HTTP.new(uri.host, uri.port)\r
+ http.open_timeout = 15\r
+ http.start do\r
+ request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>dat.referrer.to_s } )\r
+ response = http.request request\r
+ end\r
+ when 'ttp'\r
+ uri = URI.parse( 'h'+uri.to_s )\r
+ http = Net::HTTP.new(uri.host, uri.port)\r
+ http.open_timeout = 15\r
+ http.start do\r
+ request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>dat.referrer.to_s } )\r
+ response = http.request request\r
+ end\r
+ \r
+ when 'mailto','data','ftp','file','tel','mms','rtsp','shinsei'\r
+ response=false\r
+ rescode = "Skip"\r
+ dat.execond = URLtable::SkipScheme\r
+ \r
+ else\r
+ p "UN KNOWN SCHEME * #{uri.scheme}"\r
+ p uri,dat\r
+ exit\r
+ end\r
+ \r
+ rescue Errno::ETIMEDOUT,Timeout::Error\r
+ response = false\r
+ rescode = "TimeOut"\r
+ dat.execond = URLtable::ConectTimeout\r
+ \r
+ rescue SocketError,\r
+ Errno::EHOSTUNREACH,\r
+ Errno::ECONNREFUSED,\r
+ Errno::ECONNRESET\r
+ response = false\r
+ rescode = "NoHost"\r
+ dat.execond = URLtable::ConnectError\r
+ \r
+ rescue Net::HTTPBadResponse\r
+ response = false\r
+ rescode = "Can'tGet"\r
+ dat.execond = URLtable::CannotGetNetHTTP\r
+ \r
+ rescue EOFError\r
+ response = false\r
+ rescode = "EOFerr"\r
+ dat.execond = URLtable::EOFreached\r
+ \r
+ rescue Errno::EPIPE\r
+ response = false\r
+ rescode = "Retry"\r
+ dat.execond = URLtable::WaitRetry\r
+ dat.priority = rand(0x3fffffff)\r
+ \r
+ rescue Object\r
+ timeoutexp = /\btimeout\.rb:.*:in \x60timeout\x27/n\r
+ expline = $@[0]\r
+ #p timeoutexp, expline\r
+ \r
+ if timeoutexp.match( expline ) then\r
+ response = false\r
+ rescode = "TimeOut"\r
+ dat.execond = URLtable::ConectTimeout\r
+ else\r
+ p "----------------------------------"\r
+ p $1.class\r
+ p "----------------------------------"\r
+ p $!\r
+ p $@\r
+ raise\r
+ end\r
+ end\r
+ \r
+ \r
+ case response\r
+ when FalseClass\r
+ dat.status = rescode\r
+ dat.tryNow = dat.tryNow + 1\r
+ dat.downloadTime = Time.now\r
+ dat.body=''\r
+ @list.update dat\r
+ \r
+ when Net::HTTPSuccess\r
+ dat.execond = URLtable::SaveSuccess\r
+ dat.status = "Done"\r
+ dat.body = response.body.to_s\r
+ save_param(dat,response)\r
+ #save_body(dat,response)\r
+ add_nextpage(dat,response)\r
+ @list.update dat\r
+ \r
+ when Net::HTTPMovedPermanently,\r
+ Net::HTTPTemporaryRedirect,\r
+ Net::HTTPFound,\r
+ Net::HTTPSeeOther,\r
+ Net::HTTPMultipleChoice\r
+ location = response['location'].to_s\r
+ dat.execond = URLtable::NewLocation\r
+ dat.status = "Move"\r
+ dat.body = response.body.to_s\r
+ dat.message = "Location: "+location\r
+ save_param(dat,response)\r
+ #save_body(dat,response)\r
+ save_newlocation(dat,response)\r
+ @list.update dat\r
+ \r
+ when Net::HTTPBadRequest,\r
+ Net::HTTPClientError\r
+ dat.execond = URLtable::Error4xx\r
+ dat.status = "Err4xx"\r
+ dat.body = response.body.to_s\r
+ save_param(dat,response)\r
+ #save_body(dat,response)\r
+ add_nextpage(dat,response)\r
+ @list.update dat\r
+ \r
+ when Net::HTTPUnauthorized \r
+ dat.execond = URLtable::AuthRequest\r
+ dat.status = "AuthReq"\r
+ dat.body = response.body.to_s\r
+ save_param(dat,response)\r
+ dat.message = responseHead(response)\r
+ @outStream.write("-*- #{response.code} Authorization Requet -*-\n") if @outStream\r
+ @outStream.write("#{dat.message}\n") if @outStream\r
+ #save_body(dat,response)\r
+ add_nextpage(dat,response)\r
+ @list.update dat\r
+ \r
+ when Net::HTTPForbidden\r
+ dat.execond = URLtable::Forbidden\r
+ dat.status = "Forbidden"\r
+ dat.body = response.body.to_s\r
+ save_param(dat,response)\r
+ #save_body(dat,response)\r
+ add_nextpage(dat,response)\r
+ @list.update dat\r
+ \r
+ when Net::HTTPNotFound,\r
+ Net::HTTPGone\r
+ dat.execond = URLtable::NotFound\r
+ dat.status = "NotFound"\r
+ dat.body = response.body.to_s\r
+ save_param(dat,response)\r
+ #save_body(dat,response)\r
+ add_nextpage(dat,response)\r
+ @list.update dat\r
+ \r
+ when Net::HTTPServerError\r
+ dat.execond = URLtable::Error5xx\r
+ dat.status = "Err5xx"\r
+ dat.body = response.body.to_s\r
+ save_param(dat,response)\r
+ #save_body(dat,response)\r
+ add_nextpage(dat,response)\r
+ @list.update dat\r
+ \r
+ else\r
+ p response\r
+ p response.body\r
+ raise $!\r
+ end\r
+ \r
+ @outStream.write("#{dat.status}\n") if @outStream\r
+ \r
+ end\r
+ \r
+ def getpage(dat)\r
+ httpget(dat)\r
+ end\r
+ \r
+ def start(cond='',data=[])\r
+ \r
+ begin\r
+ @list.transaction do\r
+ @commitCount.times{\r
+ curr = if cond.size>0 then\r
+ @list.entry( cond,data )\r
+ elsif @excludeWayback then\r
+ @list.entry(\r
+ '(ExeCondition<?) and (NOT URL like ?;) order by priority desc limit 1;', [URLtable::SaveSuccess,'http://web.archive.org/%']\r
+ )\r
+ else\r
+ @list.entry\r
+ end\r
+ \r
+ if curr.url == '' then\r
+ # \91S\95\94\8f\84\89ñ\8dÏ\82Ý\r
+ return\r
+ end\r
+ getpage(curr)\r
+ sleep(@sleepTime)\r
+ }\r
+ end\r
+ end while true\r
+ end\r
+\r
+ def read(url)\r
+ @list[url]\r
+ end\r
+ \r
+ def check_link(row)\r
+ add_nextpage(row,row)\r
+ end\r
+ \r
+ def each\r
+ @list.each do |row|\r
+ yield row\r
+ end\r
+ end\r
+ \r
+ def rest\r
+ return @list.rest\r
+ end\r
+ \r
+ def my_query\r
+ @list.my_query do |db|\r
+ yield db\r
+ end\r
+ end\r
+ \r
+# def changeStatus\r
+# end\r
+=begin\r
+ def check_link_test(row)\r
+ text = row.body\r
+ #p text\r
+ scriptmode = false\r
+ exp = /(<script|<\/script)\b|\b(href|src|value)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni\r
+ text.scan( exp ) do |t|\r
+ curr=t.shift.to_s.downcase\r
+ if curr=='<script'\r
+ scriptmode = true\r
+ end\r
+ if curr=='</script'\r
+ scriptmode = false\r
+ end\r
+ \r
+ curr=t.shift.to_s.downcase\r
+ mediacheck = curr=='value'\r
+ # if !scriptmode then\r
+ puts "#{t[0]} is link" if t[0] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+ puts "#{t[1]} is link" if t[1] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+ puts "#{t[2]} is link" if t[2] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+ # add_nextpage_sub(t[0], dat)\r
+ # add_nextpage_sub(t[1], dat)\r
+ # add_nextpage_sub(t[2], dat)\r
+ # end\r
+ end\r
+ \r
+ end\r
+=end\r
+ \r
+ \r
+end\r
+\r
--- /dev/null
+# -*- coding: Windows-31J -*-\r
+\r
+require 'sqlite3'\r
+require 'fileutils'\r
+require 'uri'\r
+require 'pathname'\r
+\r
+#TABLE_PATH = "./url.db3"\r
+\r
+\r
+\r
+class URLtable\r
+ Waiting = 0\r
+ WaitRetry = 1\r
+ \r
+ SaveSuccess = 800\r
+ NewLocation = 801\r
+ NotFound = 802\r
+ SkipScheme = 804\r
+ \r
+ ConectTimeout = 900\r
+ ConnectError = 901\r
+ Forbidden = 902\r
+ AuthRequest = 903\r
+ Error4xx = 904\r
+ Error5xx = 905\r
+ EOFreached = 906\r
+ CannotGetNetHTTP = 907 # Page that cannot be retrieved with Net::HTTP (the site itself exists)\r
+ BadURI = 908 # URI correction will be attempted later\r
+ \r
+ \r
+ \r
+ def initialize(param={:path => nil, :db => nil})\r
+ @db = nil\r
+ @dbFile = nil\r
+ @dataDir = nil\r
+ \r
+ self.dataDir = if param[:path] then param[:path] else "./save" end\r
+ self.saveDB = if param[:db] then param[:db] else "./url.db3" end\r
+ end\r
+ \r
+ def dataDir=(path)\r
+ @dataDir = File.expand_path(path)\r
+ FileUtils.mkdir_p(@dataDir)\r
+ end\r
+ \r
+ def saveDB=(db)\r
+ db=db.to_s\r
+ raise ArgumentError if db.size == 0\r
+ if @dbFile != db then\r
+ @db.close if @db\r
+ @dbFile = db\r
+ dbinit(@dbFile)\r
+ end\r
+ end\r
+ \r
+ \r
+ class ROW\r
+ def initialize(dt={})\r
+ @url = dt[:url].to_s\r
+ @status = dt[:status].to_s\r
+ @statusCode = dt[:code].to_s\r
+ @bytes = dt[:bytes].to_i\r
+ @tryNow = dt[:try].to_i\r
+ @message = dt[:message].to_s\r
+ @referrer = dt[:referrer].to_s\r
+ @savePath = dt[:path].to_s\r
+ @linkCount = dt[:link].to_i\r
+ @linkCountCGI = dt[:cgilink].to_i\r
+ @timeStamp = dt[:filetime]\r
+ @downloadTime = dt[:downtime]\r
+ @checksum = dt[:checksum].to_s\r
+ @priority = if dt[:priority] then dt[:priority] else rand(0x3fffffff) end\r
+ @execond = dt[:execond].to_i\r
+ @body = dt[:body].to_s\r
+ @contentType = dt[:contentType].to_s\r
+ end\r
+ attr_accessor :url, :status, :statusCode, :bytes, :tryNow\r
+ attr_accessor :message, :referrer, :savePath, :linkCount, :linkCountCGI\r
+ attr_accessor :timeStamp, :downloadTime, :checksum, :priority, :execond\r
+ attr_accessor :body, :contentType\r
+ \r
+ def uri\r
+ return URI.parse(@url)\r
+ end\r
+ \r
+ def to_a\r
+ return [\r
+ @url.to_s,\r
+ @status.to_s,\r
+ @statusCode.to_s,\r
+ @bytes.to_i,\r
+ @tryNow.to_i,\r
+ @message.to_s,\r
+ @referrer.to_s,\r
+ @savePath.to_s,\r
+ @linkCount.to_i,\r
+ @linkCountCGI.to_i,\r
+ @timeStamp.to_i,\r
+ @downloadTime.to_i,\r
+ @checksum.to_s,\r
+ if @priority then @priority.to_i else rand(0x3fffffff) end,\r
+ @execond.to_i,\r
+ SQLite3::Blob.new( @body.to_s ),\r
+ @contentType.to_s,\r
+ ]\r
+ end\r
+ \r
+ def copy\r
+ return Marshal.load(Marshal.dump self)\r
+ end\r
+ end\r
+\r
+\r
+ def URLtable.create(fn)\r
+ FileUtils.rm_f(fn)\r
+ sql = <<'SQL'\r
+\r
+-- URL\83e\81[\83u\83\8b\r
+create table URLinfo (\r
+ URL text NOT NULL UNIQUE,\r
+ Status text,\r
+ StatusCode text,\r
+ Bytes integer,\r
+ TryNow integer,\r
+ Message text, -- Move\82Ì\8ds\82«\90æ\82È\82Ç\r
+ Referrer text, -- \83\8a\83t\83@\83\89\r
+ SavePath text, -- \95Û\91¶\83p\83X\81i\91\8a\91Î\81j\r
+ LinkCount integer, -- \83\8b\81[\83g\82©\82ç\82Ì\83\8a\83\93\83N\83J\83E\83\93\83g\r
+ LinkCountCGI integer, -- cgi\83\8b\81[\83g\82©\82ç\82Ì\83\8a\83\93\83N\83J\83E\83\93\83g\r
+ TimeStamp integer,\r
+ DownloadTime integer,\r
+ CheckSum text,\r
+ Priority integer, -- \97D\90æ\8f\87\88Ê\r
+ ExeCondition integer, -- 0:\8f\88\97\9d\91Ò\82¿ 1:\83\8a\83g\83\89\83C\91Ò\82¿ 800-:\90³\8fí\8fI\97¹ 900:\88Ù\8fí\8fI\97¹\r
+ Body blob, --\r
+ ContentType text,\r
+ \r
+ primary key(URL)\r
+);\r
+\r
+create index idx_URLinfo_StatusCode on URLinfo(StatusCode);\r
+create index idx_URLinfo_Message on URLinfo(Message);\r
+create index idx_URLinfo_CheckSum on URLinfo(CheckSum);\r
+create index idx_URLinfo_Priority on URLinfo(Priority);\r
+create index idx_URLinfo_ExeCondition on URLinfo(ExeCondition);\r
+create index idx_URLinfo_ContentType on URLinfo(ContentType);\r
+\r
+SQL\r
+ \r
+ vdb = SQLite3::Database.new(fn)\r
+ vdb.busy_timeout(2000)\r
+ vdb.transaction do\r
+ begin\r
+ vdb.execute_batch(sql)\r
+ rescue SQLite3::SQLException\r
+ raise\r
+ end\r
+ end\r
+ \r
+ end\r
+ \r
+ def dbinit(fn)\r
+ FileUtils.mkdir_p(File.dirname(fn))\r
+ URLtable.create(fn) unless File.exist?(fn)\r
+ @db = SQLite3::Database.new(fn)\r
+ @db.busy_timeout(10000)\r
+ end\r
+ private :dbinit\r
+ \r
+ def search_fn(dir)\r
+ p "toobig adta"\r
+ cur = 1\r
+ low = 0\r
+ high = 1\r
+ fn = ''\r
+ base = File.expand_path(dir)\r
+ begin\r
+ print "#{low} <#{cur}> #{high} : "\r
+ fn = sprintf('%s/%012x.blob', base, cur)\r
+ if File.exist?(dir+'/'+fn) then\r
+ puts "found"\r
+ low = cur\r
+ if cur==high then\r
+ high *= 2\r
+ cur = high\r
+ else\r
+ cur = (high+low+1)/2\r
+ end\r
+ else\r
+ puts "nofile"\r
+ high = cur\r
+ cur = (high+low)/2\r
+ break if(high = low+1)\r
+ end\r
+ end while true\r
+ puts "newfn = #{fn}"\r
+ puts "from = #{@dbFile}"\r
+ from = Pathname.new( File.dirname( File.expand_path(@dbFile) ) )\r
+ tofn = Pathname.new( fn )\r
+ tofn = tofn.relative_path_from( from )\r
+ puts "return = #{tofn.to_s}"\r
+ return [tofn.to_s, fn]\r
+ end\r
+ \r
+ def save(text)\r
+ (fn, full)=search_fn(@dataDir)\r
+ FileUtils.mkdir_p(File.dirname(full))\r
+ open(full, 'wb'){|f|\r
+ f.write text\r
+ }\r
+ puts "write blob #{text.size}bytes to #{fn} * #{full}"\r
+ return fn\r
+ end\r
+\r
+ def insert_query(row)\r
+ write = row.to_a\r
+ cnt = 0\r
+ begin\r
+ @db.execute(\r
+ 'insert into URLinfo (' +\r
+ 'URL, Status, StatusCode, Bytes, TryNow, ' +\r
+ 'Message, Referrer, SavePath, LinkCount, LinkCountCGI, ' +\r
+ 'TimeStamp, DownloadTime, CheckSum, Priority, ExeCondition, ' +\r
+ 'Body, ContentType' +\r
+ ') values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);', write\r
+ )\r
+ rescue SQLite3::TooBigException\r
+ cnt += 1\r
+ write[15]=''\r
+ row.savePath = save(row.body)\r
+ if cnt <2 then\r
+ retry\r
+ end\r
+ raise\r
+ end\r
+ end\r
+\r
+ SELECT_SQLBODY = 'select ' +\r
+ 'URL, Status, StatusCode, Bytes, TryNow, ' +\r
+ 'Message, Referrer, SavePath, LinkCount, LinkCountCGI, ' +\r
+ 'TimeStamp, DownloadTime, CheckSum, Priority, ExeCondition, ' +\r
+ 'Body, ContentType' +\r
+ ' from URLinfo'\r
+ UPDATE_SQLBODY = 'update URLinfo set ' +\r
+ 'Status=?, StatusCode=?, Bytes=?, TryNow=?, ' +\r
+ 'Message=?, Referrer=?, SavePath=?, LinkCount=?, LinkCountCGI=?, ' +\r
+ 'TimeStamp=?, DownloadTime=?, CheckSum=?, Priority=?, ExeCondition=?, ' +\r
+ 'Body=?, ContentType=?'\r
+ \r
+ def update_query(row)\r
+ a = row.to_a\r
+ key = a.shift\r
+ a << key\r
+ @db.execute(\r
+ UPDATE_SQLBODY + ' where URL=?;', a\r
+ )\r
+ end\r
+ \r
+ def select_query(cond='', data=[])\r
+ sql = SELECT_SQLBODY\r
+ if cond != '' then\r
+ sql += ' where ' + cond\r
+ end\r
+ sql += ';'\r
+ result = @db.query( sql, data )\r
+ return result\r
+ end\r
+ \r
+ def toRow_sub(x)\r
+ row = nil\r
+ if x then\r
+ row = ROW.new({\r
+ :url => x[0].to_s,\r
+ :status => x[1].to_s,\r
+ :code => x[2].to_s,\r
+ :bytes => x[3].to_i,\r
+ :try => x[4].to_i,\r
+ :message => x[5].to_s,\r
+ :referrer => x[6].to_s,\r
+ :path => x[7].to_s,\r
+ :link => x[8].to_i,\r
+ :cgilink => x[9].to_i,\r
+ :filetime => Time.at(x[10].to_i),\r
+ :downtime => Time.at(x[11].to_i),\r
+ :checksum => x[12].to_s,\r
+ :priority => x[13].to_i,\r
+ :execond => x[14].to_i,\r
+ :body => x[15].to_s,\r
+ :contentType => x[16].to_s,\r
+ })\r
+ end\r
+ return row\r
+ end\r
+ \r
+ def toRow(result)\r
+ x=result.next; result.close\r
+ return toRow_sub(x)\r
+ end\r
+ \r
+ def read_sub(cond, data=[])\r
+ result = select_query(cond, data)\r
+ return toRow(result)\r
+ end\r
+ \r
+ def read(url)\r
+ return read_sub('URL=?', [url])\r
+ end\r
+ \r
+ def notNil(row)\r
+ if row then\r
+ return row\r
+ end\r
+ return ROW.new\r
+ end\r
+ \r
+ def [](key)\r
+ return notNil( read(key) )\r
+ end\r
+ \r
+ def update(data,force=true)\r
+ begin\r
+ insert_query(data)\r
+ rescue SQLite3::ConstraintException\r
+ if force then\r
+ update_query(data)\r
+ end\r
+ end\r
+ end\r
+ \r
+ def exists(key)\r
+ if read(key) then\r
+ return true\r
+ end\r
+ return false\r
+ end\r
+ \r
+ def entry(cond='', data=[])\r
+ if(cond.to_s.size==0)then\r
+ return notNil(\r
+ read_sub('ExeCondition<? order by priority desc limit 1;', [SaveSuccess])\r
+ )\r
+ else\r
+ return notNil(\r
+ read_sub(cond,data)\r
+ )\r
+ end\r
+# sql = SELECT_SQLBODY\r
+# sql += ' order by priority desc limit 1;'\r
+# result = @db.query( sql, [] )\r
+# return notNil( toRow(result) )\r
+ end\r
+ \r
+ def rest\r
+ res = @db.query('select COUNT(*) from URLinfo where ExeCondition < ?',[SaveSuccess])\r
+ x=res.next\r
+ res.close\r
+ return x\r
+ end\r
+ \r
+ def transaction\r
+ @db.transaction do\r
+ yield\r
+ end\r
+ end\r
+ \r
+ def each(cond='', data=[])\r
+ result = select_query(cond, data)\r
+ while x=result.next\r
+ yield toRow_sub(x)\r
+ end\r
+ result.close\r
+ end\r
+ \r
+ \r
+ def my_query\r
+ yield @db\r
+ end\r
+ \r
+ \r
+end\r