OSDN Git Service

新規コミット
authormikari <mikari@wolf.nightfall.jp>
Tue, 8 Apr 2014 17:44:51 +0000 (02:44 +0900)
committermikari <mikari@wolf.nightfall.jp>
Tue, 8 Apr 2014 17:44:51 +0000 (02:44 +0900)
lib/arcget.rb [new file with mode: 0644]
lib/downtable.rb [new file with mode: 0644]
sample/getMOE1.rb [new file with mode: 0644]

diff --git a/lib/arcget.rb b/lib/arcget.rb
new file mode 100644 (file)
index 0000000..cccc1a6
--- /dev/null
@@ -0,0 +1,589 @@
+# -*- coding: Windows-31J -*-\r
+\r
+require 'fileutils'\r
+require 'uri'\r
+require 'net/http'\r
+require 'net/https'\r
+require 'digest/sha2'\r
+require 'time'\r
+require 'cgi'\r
+\r
+$LOAD_PATH.unshift(File.dirname(__FILE__))\r
+require 'downtable.rb'\r
+$LOAD_PATH.shift\r
+Net::HTTP.version_1_2\r
+\r
+\r
+\r
+class ArcGET\r
+       \r
+       # File-name extensions treated as media; add_nextpage uses this to decide\r
+       # whether a value="..." attribute should be followed as a link.\r
+       MEDIA_EXP = /\.(wav|wma|avi|asf|ogg|mp3|mp4|mpeg|mpg|mid|midi|smf|smaf|m4a|swf|flv|bmp|gif|tiff|tif|png|vob|ogm|mov|rm|divx)$/ni\r
+       \r
+       # The constants below re-export URLtable's execution-condition codes under ArcGET.\r
+       Waiting       = URLtable::Waiting\r
+       WaitRetry     = URLtable::WaitRetry\r
+       \r
+       SaveSuccess   = URLtable::SaveSuccess\r
+       NewLocation   = URLtable::NewLocation\r
+       NotFound      = URLtable::NotFound\r
+       SkipScheme    = URLtable::SkipScheme\r
+       \r
+       ConectTimeout = URLtable::ConectTimeout\r
+       ConnectError  = URLtable::ConnectError\r
+       Forbidden     = URLtable::Forbidden\r
+       AuthRequest   = URLtable::AuthRequest\r
+       Error4xx      = URLtable::Error4xx\r
+       Error5xx      = URLtable::Error5xx\r
+       EOFreached    = URLtable::EOFreached\r
+       CannotGetNetHTTP = URLtable::CannotGetNetHTTP   # Page that Net::HTTP cannot fetch (the site itself exists)\r
+       BadURI        = URLtable::BadURI                                                        # Try to correct the URI later\r
+       \r
+       \r
+       def initialize(param={:path => nil, :db => nil, :pendingURI => nil})\r
+                       @rootExp = []\r
+                       @rootMAXlink = 99999999\r
+                       @cgirootExp = []\r
+                       @cgiMAXlink = 10\r
+                       @excludeWayback = false\r
+                       @sleepTime = 20.0\r
+                       @commitCount = 16\r
+                       @savePath = if param[:path] then param[:path] else "./save"    end\r
+                       @saveDB   = if param[:db] then param[:db] else "./url.db3" end\r
+                       @pendingURI = param[:pendingURI]\r
+                       @pendingURI = nil if @pendingURI.to_s == ""\r
+                       @list = URLtable.new(param={:path => @savePath, :db => @saveDB})\r
+                       @outStream = STDOUT\r
+       end\r
+       \r
+       def sleep_setting(set)\r
+               @sleepTime=set\r
+       end\r
+       \r
+       def exclude_wayback(set)\r
+               @excludeWayback = set\r
+       end\r
+       \r
+       def commit_count(set)\r
+               @commitCount = set\r
+       end\r
+       \r
+       def save_path(param={:path => nil, :db => nil, :pendingURI => nil})\r
+               path = param[:path]\r
+               db   = param[:db]\r
+               pendingURI = param[:pendingURI]\r
+               \r
+               if path then\r
+                       @savePath = File.expand_path(path)\r
+                       @list.dataDir = @savePath\r
+               end\r
+               if db then\r
+                       @saveDB = File.expand_path(db)\r
+                       @list.saveDB = @saveDB\r
+               end\r
+               if pendingURI then\r
+                       @pendingURI = pendingURI\r
+                       @pendingURI = nil if @pendingURI.to_s.size == 0\r
+               end\r
+       end\r
+       \r
+       def set_echo(obj=nil)\r
+               tmp=@outStream\r
+               @outStream = obj\r
+               return tmp\r
+       end\r
+       \r
+       def add_root(root, max=nil)\r
+               if max then\r
+                       @rootMAXlink = max.to_i\r
+               end\r
+               unless root then\r
+                       @rootExp = []\r
+                       return\r
+               end\r
+               @rootExp << root\r
+               @rootExp.uniq!\r
+       end\r
+       \r
+       def add_cgi_root(root, max=nil)\r
+               if max then\r
+                       @cgiMAXlink = max.to_i\r
+               end\r
+               unless root then\r
+                       @cgirootExp = []\r
+                       return\r
+               end\r
+               @cgirootExp << root\r
+               @cgirootExp.uniq!\r
+       end\r
+       \r
+       def add_url(url)\r
+               return unless url\r
+               url = url.to_s\r
+               @list.transaction do\r
+                       unless @list.exists(url) then\r
+                               r = URLtable::ROW.new\r
+                               r.url = url\r
+                               r.priority = r.priority | 0x40000000\r
+                               @list.update r\r
+                       else\r
+                               @outStream.write("Alerdy Exists #{url}\n") if @outStream\r
+                       end\r
+               end\r
+       end\r
+       \r
+       \r
+       def save_param(dat,response)\r
+               uri = dat.uri\r
+               #p "@@",response.code.to_s\r
+               dat.statusCode = response.code.to_s\r
+               dat.bytes = response.body.size\r
+               dat.tryNow = dat.tryNow + 1\r
+               dat.timeStamp = Time.parse(response['Date']) rescue nil\r
+               dat.downloadTime = Time.now\r
+               digest = Digest::SHA256.new\r
+               digest << response.body\r
+               dat.checksum = digest.hexdigest\r
+               dat.contentType = response['Content-Type'].to_s\r
+               #p response\r
+       end\r
+       \r
+       def save_body(dat,response, cnt=0)\r
+               return if cnt > 9\r
+               \r
+               if cnt == 0 then\r
+                       uri = dat.uri\r
+                       dat.savePath = "/#{uri.host}#{uri.path}"\r
+                       if(uri.query.to_s.size>0)then\r
+                               dat.savePath = dat.savePath + "_"+URI.escape(uri.query,/[<>:\x27\x22\x2B\x2F\x3F\x5C]/n)\r
+                       end\r
+               end\r
+               \r
+               path="#{@savePath}#{dat.savePath}"\r
+               if cnt > 0 then\r
+                       path += ".#{cnt}"\r
+               end\r
+               \r
+               path = File.expand_path(path)\r
+               FileUtils.mkdir_p(File.dirname(path))\r
+               \r
+               if File.exist?(path) then\r
+                       save_body(dat,response, cnt+1)\r
+               end\r
+               begin\r
+                       open(path,'wb'){|f| f.write response.body }\r
+               rescue Errno::EISDIR\r
+                       dat.savePath =  dat.savePath + '/index.html'\r
+                       save_body(dat,response)\r
+               end\r
+       end\r
+       \r
+       def check_root(add)\r
+               #p add.url.to_s\r
+               #p add.referrer.to_s\r
+               @rootExp.each do |exp|\r
+               #       p exp\r
+                       return true if exp.match(add.url.to_s)\r
+                       return true if exp.match(add.referrer.to_s)\r
+               end\r
+               return false\r
+       end\r
+       \r
+       def check_cgiroot(add)\r
+               to   = false\r
+               from = false\r
+               @cgirootExp.each do |exp|\r
+                       to   = true if exp.match(add.url.to_s)\r
+                       from = true if exp.match(add.referrer.to_s)\r
+                       return true if to and from\r
+               end\r
+               return false\r
+       end\r
+       \r
+       # Turns one link found on a page into a queued row, or logs it as pending.\r
+       #\r
+       # path - raw link text (HTML-escaped, possibly relative); nil is ignored\r
+       # dat  - row of the page the link was found on (supplies base URI,\r
+       #        referrer and link counters)\r
+       #\r
+       # Links accepted by check_root within @rootMAXlink are handed to\r
+       # @list.update; all others are appended to the @pendingURI file when one\r
+       # is configured.\r
+       def add_nextpage_sub(path, dat)\r
+               return unless path\r
+               \r
+               #p path\r
+               path = CGI::unescapeHTML(path.to_s)\r
+               # Schemes that can never be fetched are dropped right away.\r
+               return if /^(?:javascript|mailto|data|file|tel):/ni.match(path)\r
+               #p "##"\r
+               uri = dat.uri\r
+               begin\r
+               #       p path\r
+                       # Percent-encode control and non-ASCII bytes before parsing.\r
+                       path = URI.parse( path.gsub(/[\x00-\x1F\x80-\xFF]/n){|x| '%'+x.unpack('H2')[0] } )\r
+               #       p path\r
+               rescue  URI::InvalidURIError, URI::InvalidComponentError\r
+               #       p "INV #{path}"\r
+                       return\r
+               end\r
+               # Resolve against the referring page and drop the #fragment.\r
+               newuri = (uri+path)\r
+               newuri.fragment=nil\r
+               \r
+               add = URLtable::ROW.new\r
+               add.url = newuri.to_s\r
+               add.referrer = dat.url\r
+               add.linkCount = dat.linkCount+1\r
+               # The CGI counter only advances for links that check_cgiroot accepts.\r
+               if check_cgiroot(add) then\r
+                       add.linkCountCGI = dat.linkCountCGI+1\r
+               end\r
+               ##add.message = path    #for debug\r
+               \r
+               #p "L #{newuri} #{add.linkCount}:#{add.linkCountCGI}"\r
+               return if add.linkCountCGI > @cgiMAXlink\r
+               \r
+               isroot = check_root(add)\r
+               #p isroot\r
+               if isroot and (add.linkCount <= @rootMAXlink) then\r
+               #       p "U"\r
+                       # URLs matching the first root (or first CGI root) pattern get the priority bit.\r
+                       if @rootExp[0].match(add.url) or (@cgirootExp[0] and @cgirootExp[0].match(add.url)) then\r
+                               add.priority = add.priority | 0x40000000\r
+                       end\r
+                       @list.update add,false\r
+               else\r
+                       if @pendingURI then\r
+                               open(@pendingURI,'a'){|f|\r
+                                       f.write "#{add.url}\n"\r
+                               }\r
+                       end\r
+               end\r
+               #p path\r
+       end\r
+       \r
+       # Scans a page body for links and passes each one to add_nextpage_sub.\r
+       #\r
+       # dat      - row of the page being scanned\r
+       # response - object answering body (check_link passes a row here)\r
+       #\r
+       # Links are taken from href=, src= and value= attributes and from\r
+       # url(...) expressions. value= attributes are only followed when they\r
+       # match MEDIA_EXP. The =begin/=end section below is an older, disabled\r
+       # version of the scanner.\r
+       def add_nextpage(dat,response)\r
+=begin\r
+               text = response.body\r
+               #p text\r
+               scriptmode = false\r
+               exp = /(<script|<\/script)\b|\b(?:href|src)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni\r
+               text.scan( exp ) do |t|\r
+                       curr=t.shift\r
+                       if curr=='<script'\r
+                               scriptmode = true\r
+                       end\r
+                       if curr=='</script'\r
+                               scriptmode = false\r
+                       end\r
+                       \r
+                       if !scriptmode then\r
+                               add_nextpage_sub(t[0], dat)\r
+                               add_nextpage_sub(t[1], dat)\r
+                               add_nextpage_sub(t[2], dat)\r
+                       end\r
+               end\r
+=end\r
+               text = response.body\r
+               scriptmode = false\r
+               # Captures: 1 = script tag, 2 = attribute name, 3 = double-quoted value,\r
+               # 4 = unquoted value, 5 = url(...) argument.\r
+               exp = /(<script|<\/script)\b|\b(href|src|value)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni\r
+               text.scan( exp ) do |t|\r
+                       # scriptmode is tracked but not consulted by the code below.\r
+                       curr=t.shift.to_s.downcase\r
+                       if curr=='<script'\r
+                               scriptmode = true\r
+                       end\r
+                       if curr=='</script'\r
+                               scriptmode = false\r
+                       end\r
+                       # After this second shift t holds only the three link captures.\r
+                       curr=t.shift.to_s.downcase\r
+                       mediacheck = curr=='value'\r
+                       add_nextpage_sub(t[0], dat) if t[0] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+                       add_nextpage_sub(t[1], dat) if t[1] and ( (not mediacheck) or MEDIA_EXP.match(t[1].to_s))\r
+                       add_nextpage_sub(t[2], dat) if t[2] and ( (not mediacheck) or MEDIA_EXP.match(t[2].to_s))\r
+               end\r
+       end\r
+       \r
+       def save_newlocation(dat,response)\r
+               add_nextpage_sub(response['location'], dat)\r
+       end\r
+       \r
+       def responseHead(response)\r
+               r=''\r
+               response.each do |name,value|\r
+                 r += "#{name} : #{value}\n"\r
+               end\r
+               return r\r
+       end\r
+       \r
+       def httpget(dat)\r
+               begin\r
+                       @outStream.write("Get: #{Time.now.to_s} [#{sprintf('%08X:%3d,%3d',dat.priority.to_i, dat.linkCount, dat.linkCountCGI)}] #{dat.uri.to_s}    <- Referer:#{dat.referrer.to_s} ... ") if @outStream\r
+               rescue URI::InvalidURIError\r
+                       @outStream.write("BAD: #{Time.now.to_s} #{dat.url}    <- Referer:#{dat.referrer.to_s}\n") if @outStream\r
+                       dat.execond = URLtable::BadURI\r
+                       dat.status = ''\r
+                       dat.tryNow = dat.tryNow + 1\r
+                       dat.downloadTime = Time.now\r
+                       @list.update dat\r
+                       \r
+                       open("badURI.txt","a"){|f| f.write "#{dat.inspect}\n" }\r
+                       return\r
+               end\r
+               \r
+               uri = dat.uri\r
+               response = nil\r
+               rescode = nil\r
+               begin\r
+                       case uri.normalize.scheme.tr('A-Z','a-z')\r
+                       when 'https'\r
+                               https = Net::HTTP.new(uri.host, uri.port)\r
+                               https.open_timeout = 15\r
+                               https.use_ssl = true\r
+                               https.verify_mode = OpenSSL::SSL::VERIFY_NONE\r
+                               https.verify_depth = 5\r
+                               https.start do\r
+                                       request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>dat.referrer.to_s } )\r
+                                       response = https.request request\r
+                               end\r
+                       when 'http'\r
+                               http = Net::HTTP.new(uri.host, uri.port)\r
+                               http.open_timeout = 15\r
+                               http.start do\r
+                                       request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>dat.referrer.to_s } )\r
+                                       response = http.request request\r
+                               end\r
+                       when 'ttp'\r
+                               uri = URI.parse( 'h'+uri.to_s )\r
+                               http = Net::HTTP.new(uri.host, uri.port)\r
+                               http.open_timeout = 15\r
+                               http.start do\r
+                                       request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>dat.referrer.to_s } )\r
+                                       response = http.request request\r
+                               end\r
+                       \r
+                       when 'mailto','data','ftp','file','tel','mms','rtsp','shinsei'\r
+                               response=false\r
+                               rescode = "Skip"\r
+                               dat.execond = URLtable::SkipScheme\r
+                               \r
+                       else\r
+                               p "UN KNOWN SCHEME * #{uri.scheme}"\r
+                               p uri,dat\r
+                               exit\r
+                       end\r
+                       \r
+               rescue Errno::ETIMEDOUT,Timeout::Error\r
+                       response = false\r
+                       rescode = "TimeOut"\r
+                       dat.execond = URLtable::ConectTimeout\r
+                       \r
+               rescue SocketError,\r
+                      Errno::EHOSTUNREACH,\r
+                      Errno::ECONNREFUSED,\r
+                      Errno::ECONNRESET\r
+                       response = false\r
+                       rescode = "NoHost"\r
+                       dat.execond = URLtable::ConnectError\r
+               \r
+               rescue Net::HTTPBadResponse\r
+                       response = false\r
+                       rescode  = "Can'tGet"\r
+                       dat.execond = URLtable::CannotGetNetHTTP\r
+               \r
+               rescue EOFError\r
+                       response = false\r
+                       rescode  = "EOFerr"\r
+                       dat.execond = URLtable::EOFreached\r
+               \r
+               rescue Errno::EPIPE\r
+                       response = false\r
+                       rescode  = "Retry"\r
+                       dat.execond = URLtable::WaitRetry\r
+                       dat.priority = rand(0x3fffffff)\r
+               \r
+               rescue Object\r
+                       timeoutexp = /\btimeout\.rb:.*:in \x60timeout\x27/n\r
+                       expline = $@[0]\r
+                       #p timeoutexp, expline\r
+                       \r
+                       if timeoutexp.match( expline ) then\r
+                               response = false\r
+                               rescode = "TimeOut"\r
+                               dat.execond = URLtable::ConectTimeout\r
+                       else\r
+                               p "----------------------------------"\r
+                               p $1.class\r
+                               p "----------------------------------"\r
+                               p $!\r
+                               p $@\r
+                               raise\r
+                       end\r
+               end\r
+               \r
+               \r
+               case response\r
+               when FalseClass\r
+                       dat.status = rescode\r
+                       dat.tryNow = dat.tryNow + 1\r
+                       dat.downloadTime = Time.now\r
+                       dat.body=''\r
+                       @list.update dat\r
+               \r
+               when Net::HTTPSuccess\r
+                       dat.execond = URLtable::SaveSuccess\r
+                       dat.status = "Done"\r
+                       dat.body = response.body.to_s\r
+                       save_param(dat,response)\r
+                       #save_body(dat,response)\r
+                       add_nextpage(dat,response)\r
+                       @list.update dat\r
+                       \r
+               when Net::HTTPMovedPermanently,\r
+                                Net::HTTPTemporaryRedirect,\r
+                                Net::HTTPFound,\r
+                                Net::HTTPSeeOther,\r
+                                Net::HTTPMultipleChoice\r
+                       location = response['location'].to_s\r
+                       dat.execond = URLtable::NewLocation\r
+                       dat.status = "Move"\r
+                       dat.body = response.body.to_s\r
+                       dat.message = "Location: "+location\r
+                       save_param(dat,response)\r
+                       #save_body(dat,response)\r
+                       save_newlocation(dat,response)\r
+                       @list.update dat\r
+                       \r
+               when Net::HTTPBadRequest,\r
+                    Net::HTTPClientError\r
+                       dat.execond = URLtable::Error4xx\r
+                       dat.status = "Err4xx"\r
+                       dat.body = response.body.to_s\r
+                       save_param(dat,response)\r
+                       #save_body(dat,response)\r
+                       add_nextpage(dat,response)\r
+                       @list.update dat\r
+               \r
+               when Net::HTTPUnauthorized \r
+                       dat.execond = URLtable::AuthRequest\r
+                       dat.status = "AuthReq"\r
+                       dat.body = response.body.to_s\r
+                       save_param(dat,response)\r
+                       dat.message = responseHead(response)\r
+                       @outStream.write("-*- #{response.code} Authorization Requet -*-\n") if @outStream\r
+                       @outStream.write("#{dat.message}\n") if @outStream\r
+                       #save_body(dat,response)\r
+                       add_nextpage(dat,response)\r
+                       @list.update dat\r
+               \r
+               when Net::HTTPForbidden\r
+                       dat.execond = URLtable::Forbidden\r
+                       dat.status = "Forbidden"\r
+                       dat.body = response.body.to_s\r
+                       save_param(dat,response)\r
+                       #save_body(dat,response)\r
+                       add_nextpage(dat,response)\r
+                       @list.update dat\r
+               \r
+               when Net::HTTPNotFound,\r
+                    Net::HTTPGone\r
+                       dat.execond = URLtable::NotFound\r
+                       dat.status = "NotFound"\r
+                       dat.body = response.body.to_s\r
+                       save_param(dat,response)\r
+                       #save_body(dat,response)\r
+                       add_nextpage(dat,response)\r
+                       @list.update dat\r
+               \r
+               when Net::HTTPServerError\r
+                       dat.execond = URLtable::Error5xx\r
+                       dat.status = "Err5xx"\r
+                       dat.body = response.body.to_s\r
+                       save_param(dat,response)\r
+                       #save_body(dat,response)\r
+                       add_nextpage(dat,response)\r
+                       @list.update dat\r
+               \r
+               else\r
+                       p response\r
+                       p response.body\r
+                       raise $!\r
+               end\r
+               \r
+               @outStream.write("#{dat.status}\n") if @outStream\r
+               \r
+       end\r
+       \r
+       def getpage(dat)\r
+                       httpget(dat)\r
+       end\r
+       \r
+       def start(cond='',data=[])\r
+               \r
+               begin\r
+                       @list.transaction do\r
+                               @commitCount.times{\r
+                                       curr = if cond.size>0 then\r
+                                               @list.entry( cond,data )\r
+                                       elsif @excludeWayback then\r
+                                               @list.entry(\r
+                                                       '(ExeCondition<?) and (NOT URL like ?;) order by priority desc limit 1;', [URLtable::SaveSuccess,'http://web.archive.org/%']\r
+                                               )\r
+                                       else\r
+                                               @list.entry\r
+                                       end\r
+                                       \r
+                                       if curr.url == '' then\r
+                                       # \91S\95\94\8f\84\89ñ\8dÏ\82Ý\r
+                                               return\r
+                                       end\r
+                                       getpage(curr)\r
+                                       sleep(@sleepTime)\r
+                               }\r
+                       end\r
+               end while true\r
+       end\r
+\r
+       def read(url)\r
+               @list[url]\r
+       end\r
+       \r
+       def check_link(row)\r
+               add_nextpage(row,row)\r
+       end\r
+       \r
+       def each\r
+               @list.each do |row|\r
+                       yield row\r
+               end\r
+       end\r
+       \r
+       def rest\r
+               return @list.rest\r
+       end\r
+       \r
+       def my_query\r
+               @list.my_query do |db|\r
+                       yield db\r
+               end\r
+       end\r
+       \r
+#      def changeStatus\r
+#      end\r
+=begin\r
+       def check_link_test(row)\r
+               text = row.body\r
+               #p text\r
+               scriptmode = false\r
+               exp = /(<script|<\/script)\b|\b(href|src|value)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni\r
+               text.scan( exp ) do |t|\r
+                       curr=t.shift.to_s.downcase\r
+                       if curr=='<script'\r
+                               scriptmode = true\r
+                       end\r
+                       if curr=='</script'\r
+                               scriptmode = false\r
+                       end\r
+                       \r
+                       curr=t.shift.to_s.downcase\r
+                       mediacheck = curr=='value'\r
+               #       if !scriptmode then\r
+                               puts "#{t[0]} is link" if t[0] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+                               puts "#{t[1]} is link" if t[1] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+                               puts "#{t[2]} is link" if t[2] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))\r
+                       #       add_nextpage_sub(t[0], dat)\r
+                       #       add_nextpage_sub(t[1], dat)\r
+                       #       add_nextpage_sub(t[2], dat)\r
+               #       end\r
+               end\r
+               \r
+       end\r
+=end\r
+       \r
+       \r
+end\r
+\r
diff --git a/lib/downtable.rb b/lib/downtable.rb
new file mode 100644 (file)
index 0000000..cd7d385
--- /dev/null
@@ -0,0 +1,381 @@
+# -*- coding: Windows-31J -*-\r
+\r
+require 'sqlite3'\r
+require 'fileutils'\r
+require 'uri'\r
+require 'pathname'\r
+\r
+#TABLE_PATH = "./url.db3"\r
+\r
+\r
+\r
+class URLtable\r
+       # Execution-condition codes stored in the ExeCondition column:\r
+       # 0 = waiting, 1 = waiting for retry, 800s = finished normally,\r
+       # 900s = finished abnormally.\r
+       Waiting       =   0\r
+       WaitRetry     =   1\r
+       \r
+       SaveSuccess   = 800\r
+       NewLocation   = 801\r
+       NotFound      = 802\r
+       SkipScheme    = 804\r
+       \r
+       ConectTimeout = 900\r
+       ConnectError  = 901\r
+       Forbidden     = 902\r
+       AuthRequest   = 903\r
+       Error4xx      = 904\r
+       Error5xx      = 905\r
+       EOFreached    = 906\r
+       CannotGetNetHTTP = 907  # Page that Net::HTTP cannot fetch (the site itself exists)\r
+       BadURI        = 908     # Try to correct the URI later\r
+       \r
+       \r
+       \r
+       def initialize(param={:path => nil, :db => nil})\r
+               @db      = nil\r
+               @dbFile  = nil\r
+               @dataDir = nil\r
+               \r
+               self.dataDir = if param[:path] then param[:path] else "./save"    end\r
+               self.saveDB  = if param[:db]   then param[:db]   else "./url.db3" end\r
+       end\r
+       \r
+       def dataDir=(path)\r
+               @dataDir = File.expand_path(path)\r
+               FileUtils.mkdir_p(@dataDir)\r
+       end\r
+       \r
+       def saveDB=(db)\r
+               db=db.to_s\r
+               raise ArgumentError if db.size == 0\r
+               if @dbFile != db then\r
+                       @db.close if @db\r
+                       @dbFile = db\r
+                       dbinit(@dbFile)\r
+               end\r
+       end\r
+       \r
+       \r
+       # One record of the URL table, held as plain attributes.\r
+       class ROW\r
+               # Builds a row from an optional hash. Missing string fields become "",\r
+               # missing integer fields 0, and a missing :priority a random value\r
+               # below 0x3fffffff. Note that several hash keys differ from the\r
+               # attribute names (:code, :try, :path, :link, :cgilink, :filetime,\r
+               # :downtime).\r
+               def initialize(dt={})\r
+                       @url           = dt[:url].to_s\r
+                       @status        = dt[:status].to_s\r
+                       @statusCode    = dt[:code].to_s\r
+                       @bytes         = dt[:bytes].to_i\r
+                       @tryNow        = dt[:try].to_i\r
+                       @message       = dt[:message].to_s\r
+                       @referrer      = dt[:referrer].to_s\r
+                       @savePath      = dt[:path].to_s\r
+                       @linkCount     = dt[:link].to_i\r
+                       @linkCountCGI  = dt[:cgilink].to_i\r
+                       @timeStamp     = dt[:filetime]\r
+                       @downloadTime  = dt[:downtime]\r
+                       @checksum      = dt[:checksum].to_s\r
+                       @priority      = if dt[:priority] then dt[:priority] else rand(0x3fffffff) end\r
+                       @execond       = dt[:execond].to_i\r
+                       @body          = dt[:body].to_s\r
+                       @contentType   = dt[:contentType].to_s\r
+               end\r
+               attr_accessor :url, :status, :statusCode, :bytes, :tryNow\r
+               attr_accessor :message, :referrer, :savePath, :linkCount, :linkCountCGI\r
+               attr_accessor :timeStamp, :downloadTime, :checksum, :priority, :execond\r
+               attr_accessor :body, :contentType\r
+               \r
+               # Parses the stored URL; raises URI::InvalidURIError when it is malformed.\r
+               def uri\r
+                       return URI.parse(@url)\r
+               end\r
+               \r
+               # Returns the fields as an array, coerced to string/integer, with the\r
+               # body wrapped in a SQLite3::Blob. The element order follows the column\r
+               # order of the URLinfo table created by URLtable.create.\r
+               def to_a\r
+                       return [\r
+                               @url.to_s,\r
+                               @status.to_s,\r
+                               @statusCode.to_s,\r
+                               @bytes.to_i,\r
+                               @tryNow.to_i,\r
+                               @message.to_s,\r
+                               @referrer.to_s,\r
+                               @savePath.to_s,\r
+                               @linkCount.to_i,\r
+                               @linkCountCGI.to_i,\r
+                               @timeStamp.to_i,\r
+                               @downloadTime.to_i,\r
+                               @checksum.to_s,\r
+                               if @priority then @priority.to_i else rand(0x3fffffff) end,\r
+                               @execond.to_i,\r
+                               SQLite3::Blob.new( @body.to_s ),\r
+                               @contentType.to_s,\r
+                       ]\r
+               end\r
+               \r
+               # Returns a deep copy made by a Marshal round trip.\r
+               def copy\r
+                       return Marshal.load(Marshal.dump self)\r
+               end\r
+       end\r
+\r
+\r
+       def URLtable.create(fn)\r
+               FileUtils.rm_f(fn)\r
+               sql = <<'SQL'\r
+\r
+-- URL\83e\81[\83u\83\8b\r
+create table URLinfo (\r
+  URL           text NOT NULL UNIQUE,\r
+  Status        text,\r
+  StatusCode    text,\r
+  Bytes         integer,\r
+  TryNow        integer,\r
+  Message       text,     -- Move\82Ì\8ds\82«\90æ\82È\82Ç\r
+  Referrer      text,     -- \83\8a\83t\83@\83\89\r
+  SavePath      text,     -- \95Û\91\83p\83X\81i\91\8a\91Î\81j\r
+  LinkCount     integer,  -- \83\8b\81[\83g\82©\82ç\82Ì\83\8a\83\93\83N\83J\83E\83\93\83g\r
+  LinkCountCGI  integer,  -- cgi\83\8b\81[\83g\82©\82ç\82Ì\83\8a\83\93\83N\83J\83E\83\93\83g\r
+  TimeStamp     integer,\r
+  DownloadTime  integer,\r
+  CheckSum      text,\r
+  Priority      integer,  -- \97D\90æ\8f\87\88Ê\r
+  ExeCondition  integer,  -- 0:\8f\88\97\9d\91Ò\82¿ 1:\83\8a\83g\83\89\83C\91Ò\82¿ 800-:\90³\8fí\8fI\97¹ 900:\88Ù\8fí\8fI\97¹\r
+  Body          blob,     --\r
+  ContentType   text,\r
+  \r
+  primary key(URL)\r
+);\r
+\r
+create index idx_URLinfo_StatusCode on URLinfo(StatusCode);\r
+create index idx_URLinfo_Message on URLinfo(Message);\r
+create index idx_URLinfo_CheckSum on URLinfo(CheckSum);\r
+create index idx_URLinfo_Priority on URLinfo(Priority);\r
+create index idx_URLinfo_ExeCondition on URLinfo(ExeCondition);\r
+create index idx_URLinfo_ContentType on URLinfo(ContentType);\r
+\r
+SQL\r
+       \r
+               vdb = SQLite3::Database.new(fn)\r
+               vdb.busy_timeout(2000)\r
+               vdb.transaction do\r
+                       begin\r
+                               vdb.execute_batch(sql)\r
+                       rescue SQLite3::SQLException\r
+                               raise\r
+                       end\r
+               end\r
+               \r
+       end\r
+       \r
+       def dbinit(fn)\r
+               FileUtils.mkdir_p(File.dirname(fn))\r
+               URLtable.create(fn) unless File.exist?(fn)\r
+               @db = SQLite3::Database.new(fn)\r
+               @db.busy_timeout(10000)\r
+       end\r
+       private :dbinit\r
+       \r
+       def search_fn(dir)\r
+               p "toobig adta"\r
+               cur  = 1\r
+               low  = 0\r
+               high = 1\r
+               fn = ''\r
+               base = File.expand_path(dir)\r
+               begin\r
+                       print "#{low} <#{cur}> #{high} : "\r
+                       fn = sprintf('%s/%012x.blob', base, cur)\r
+                       if File.exist?(dir+'/'+fn) then\r
+                               puts "found"\r
+                               low = cur\r
+                               if cur==high then\r
+                                       high *= 2\r
+                                       cur = high\r
+                               else\r
+                                       cur = (high+low+1)/2\r
+                               end\r
+                       else\r
+                               puts "nofile"\r
+                               high = cur\r
+                               cur = (high+low)/2\r
+                               break if(high = low+1)\r
+                       end\r
+               end while true\r
+               puts "newfn = #{fn}"\r
+               puts "from = #{@dbFile}"\r
+               from = Pathname.new( File.dirname( File.expand_path(@dbFile) ) )\r
+               tofn = Pathname.new( fn )\r
+               tofn = tofn.relative_path_from( from )\r
+               puts "return = #{tofn.to_s}"\r
+               return [tofn.to_s, fn]\r
+       end\r
+       \r
+       def save(text)\r
+               (fn, full)=search_fn(@dataDir)\r
+               FileUtils.mkdir_p(File.dirname(full))\r
+               open(full,  'wb'){|f|\r
+                       f.write text\r
+               }\r
+               puts "write blob #{text.size}bytes to #{fn} * #{full}"\r
+               return fn\r
+       end\r
+\r
+       def insert_query(row)\r
+               write = row.to_a\r
+               cnt = 0\r
+               begin\r
+                       @db.execute(\r
+                               'insert into URLinfo (' +\r
+                                       'URL, Status, StatusCode, Bytes, TryNow, ' +\r
+                                       'Message, Referrer, SavePath, LinkCount, LinkCountCGI, ' +\r
+                                       'TimeStamp, DownloadTime, CheckSum, Priority, ExeCondition, ' +\r
+                                       'Body, ContentType' +\r
+                               ') values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);', write\r
+                       )\r
+               rescue SQLite3::TooBigException\r
+                       cnt += 1\r
+                       write[15]=''\r
+                       row.savePath = save(row.body)\r
+                       if cnt <2 then\r
+                               retry\r
+                       end\r
+                       raise\r
+               end\r
+       end\r
+\r
+       SELECT_SQLBODY = 'select ' +\r
+                               'URL, Status, StatusCode, Bytes, TryNow, ' +\r
+                               'Message, Referrer, SavePath, LinkCount, LinkCountCGI, ' +\r
+                               'TimeStamp, DownloadTime, CheckSum, Priority, ExeCondition, ' +\r
+                               'Body, ContentType' +\r
+                       ' from URLinfo'\r
+       UPDATE_SQLBODY = 'update URLinfo set ' +\r
+                               'Status=?, StatusCode=?, Bytes=?, TryNow=?, ' +\r
+                               'Message=?, Referrer=?, SavePath=?, LinkCount=?, LinkCountCGI=?, ' +\r
+                               'TimeStamp=?, DownloadTime=?, CheckSum=?, Priority=?, ExeCondition=?, ' +\r
+                               'Body=?, ContentType=?'\r
+               \r
+       def update_query(row)\r
+               a = row.to_a\r
+               key = a.shift\r
+               a << key\r
+               @db.execute(\r
+                       UPDATE_SQLBODY + ' where URL=?;', a\r
+               )\r
+       end\r
+       \r
+       def select_query(cond='', data=[])\r
+               sql = SELECT_SQLBODY\r
+               if cond != '' then\r
+                       sql += ' where ' + cond\r
+               end\r
+               sql += ';'\r
+               result = @db.query( sql, data )\r
+               return result\r
+       end\r
+       \r
+       def toRow_sub(x)\r
+               row = nil\r
+               if x then\r
+                       row = ROW.new({\r
+                               :url                    => x[0].to_s,\r
+                               :status         => x[1].to_s,\r
+                               :code                   => x[2].to_s,\r
+                               :bytes          => x[3].to_i,\r
+                               :try                    => x[4].to_i,\r
+                               :message        => x[5].to_s,\r
+                               :referrer       => x[6].to_s,\r
+                               :path                   => x[7].to_s,\r
+                               :link                   => x[8].to_i,\r
+                               :cgilink        => x[9].to_i,\r
+                               :filetime       => Time.at(x[10].to_i),\r
+                               :downtime       => Time.at(x[11].to_i),\r
+                               :checksum       => x[12].to_s,\r
+                               :priority       => x[13].to_i,\r
+                               :execond        => x[14].to_i,\r
+                               :body                   => x[15].to_s,\r
+                               :contentType => x[16].to_s,\r
+                       })\r
+               end\r
+               return row\r
+       end\r
+       \r
+       def toRow(result)\r
+               x=result.next; result.close\r
+               return toRow_sub(x)\r
+       end\r
+       \r
+       def read_sub(cond, data=[])\r
+               result = select_query(cond, data)\r
+               return toRow(result)\r
+       end\r
+       \r
+       def read(url)\r
+               return read_sub('URL=?', [url])\r
+       end\r
+       \r
+       def notNil(row)\r
+               if row then\r
+                       return row\r
+               end\r
+               return ROW.new\r
+       end\r
+       \r
+       def [](key)\r
+               return notNil( read(key) )\r
+       end\r
+       \r
+       def update(data,force=true)\r
+               begin\r
+                       insert_query(data)\r
+               rescue SQLite3::ConstraintException\r
+                       if force then\r
+                               update_query(data)\r
+                       end\r
+               end\r
+       end\r
+       \r
+       def exists(key)\r
+               if read(key) then\r
+                       return true\r
+               end\r
+               return false\r
+       end\r
+       \r
+       def entry(cond='', data=[])\r
+               if(cond.to_s.size==0)then\r
+                       return notNil(\r
+                               read_sub('ExeCondition<? order by priority desc limit 1;', [SaveSuccess])\r
+                       )\r
+               else\r
+                       return notNil(\r
+                               read_sub(cond,data)\r
+                       )\r
+               end\r
+#              sql = SELECT_SQLBODY\r
+#              sql += ' order by priority desc limit 1;'\r
+#              result = @db.query( sql, [] )\r
+#              return notNil( toRow(result) )\r
+       end\r
+       \r
+       def rest\r
+               res = @db.query('select COUNT(*) from URLinfo where ExeCondition < ?',[SaveSuccess])\r
+               x=res.next\r
+               res.close\r
+               return x\r
+       end\r
+       \r
+       def transaction\r
+               @db.transaction do\r
+                       yield\r
+               end\r
+       end\r
+       \r
+       def each(cond='', data=[])\r
+               result = select_query(cond, data)\r
+               while x=result.next\r
+                       yield toRow_sub(x)\r
+               end\r
+               result.close\r
+       end\r
+       \r
+       \r
+       def my_query\r
+               yield @db\r
+       end\r
+       \r
+       \r
+end\r
diff --git a/sample/getMOE1.rb b/sample/getMOE1.rb
new file mode 100644 (file)
index 0000000..da5fc66
--- /dev/null
@@ -0,0 +1,15 @@
+#! /usr/local/bin/ruby -Ks\r
+\r
+require "../lib/arcget"\r
+\r
+pending = "./MOE1save/pending.txt"\r
+saveDir = "./MOE1save"\r
+saveDB = "./MOE1.db3"\r
+uri=URI.parse("http://asagi.la.coocan.jp/moe1/")\r
+\r
+wg=ArcGET.new({:path=>saveDir, :db=>saveDB, :pendingURI=>pending})\r
+rootexp = %r{^[^/]*//(?:[^/]+\.)*#{Regexp::escape(uri.host)}(?::\d*)?/}ni\r
+wg.add_root rootexp,3\r
+wg.add_url(uri)\r
+wg.start\r
+\r