Look what I found!! This is an archive of what my site/life was about 4-5 years ago. For the modern JobyBednar.com, try here.
JobyBednar.com
All programmers are playwrights and all computers are lousy actors.
- Unknown

Ruby Development
/root
  ./apple I
  ./articles
  ./code
  ./decode
  ./hobbies
  ./mac
  ./pics
  ./ruby
  ./www



 Use OpenOffice.org
USE:
path = An array of paths to search.

e.g. path = ['/','/index2.html','/index3.html']

You can also use a shorthand notation for numerically consecutive pages.

eg. path = ['/','/index[2..3].html']

filter = An array of filters used to skip logos, directories, etc. Each entry can be either an exact path string or a regular expression. The following prevents any image found in a "/grfx/" or "/banner/" folder or subfolder from being grabbed (case-insensitive):

eg. filter = [/\/grfx\/*/i,/\/banner\/*/i]

min_size_k = The minimum file size (in kilobytes) for an image to be grabbed
iSlurp.rb

# iSlurp.rb - download the JPEG images referenced by one or more pages of a site.
#
# For every page in +path+ the script logs the links found on the page,
# collects the <img src="...jpg|jpeg"> references, downloads every image
# that passes the filters and the minimum-size check, and writes a text
# log plus an HTML summary into the current directory.

require 'net/http'
require 'fileutils' # the library file name is lowercase; 'FileUtils' fails on case-sensitive filesystems

#-----------------------------------------------------
site = 'www.google.com'
path = ['/']

$filter = []
$min_size_k = 1
#-----------------------------------------------------

# Running counter used to name the saved image files (image_0000, image_0001, ...).
$imgid = '0000'

# Links that leave the site or are not HTTP pages.
EXTERNAL_LINK = /(http:\/\/|mailto:|ftp:\/\/)/i
# Links must start with a character from this set to be considered.
PLAIN_LINK = /\A[a-z0-9_\-.\/&=?]+/i
# Links to resources that are never HTML pages.
NON_PAGE_LINK = /\.(zip|js|ico|xml|css|wml|fla|swf|mov|mpg|mpeg|avi|rm|mp3)\z/i
# Shorthand for numerically consecutive pages, e.g. "/index[2..3].html".
PAGE_RANGE = /\[([0-9]+)\.\.([0-9]+)\]/

# Prints +text+ and flushes STDOUT so progress shows up immediately.
def msg(text)
  puts text
  STDOUT.flush
end

# Downloads a single image if it passes the filters and size check.
#
# site    - host name, e.g. 'www.example.com'
# path    - path of the page the image was found on
# imgpath - the image's src value (absolute or relative to +path+)
# imgsize - minimum size in bytes; smaller images are skipped
#
# The image is saved as "<site+path with '/' replaced by '_'>/image_<id>.<ext>"
# and a line is appended to $log and $html_page.
def getImage(site, path, imgpath, imgsize)
  imgfolder = (site + path).gsub('/', '_')

  if imgpath.start_with?('/')
    imgfile = imgpath
  else
    base = path
    # When the page path names a file (contains a dot), resolve the image
    # relative to the directory that contains that file.
    if base =~ /\.[a-z0-9]*/i
      segments = base.split('/')
      segments.pop
      base = segments.join('/')
    end
    imgfile = "#{base}/#{imgpath}".gsub('//', '/')
  end

  # A filter entry is either a Regexp matched against the path or an exact path string.
  filtered = $filter.any? do |entry|
    entry.is_a?(Regexp) ? !entry.match(imgfile).nil? : imgfile == entry
  end
  return if filtered

  Net::HTTP.start(site) do |http|
    # HEAD first so that small or non-JPEG resources are never downloaded.
    head = http.request_head(imgfile)
    if head.code == '200' &&
       head['content-type'] == 'image/jpeg' &&
       head['content-length'].to_i >= imgsize
      response = http.get(imgfile)
      ext = imgfile.split('.').pop.gsub('/', '')

      FileUtils.mkdir(imgfolder) unless File.exist?(imgfolder)
      imgstr = "#{imgfolder}/image_#{$imgid}.#{ext}"
      # write, not puts: puts would append a newline to the binary data.
      File.open(imgstr, 'wb') { |f| f.write(response.body) }

      isize = response['content-length'].to_i / 1024.0
      # Format only the number; the URL may itself contain '%' characters.
      mess = "Slurped: http://#{site}#{imgfile} (##{$imgid} - #{format('%2.2f', isize)}k)"
      $log << mess
      msg(mess)
      $html_page << "<img src='#{imgstr}'>" + '<br>' + mess
      $imgid = $imgid.succ
    end
  end
end

# Fetches the page at +path+ on +site+ and returns the unique src values
# of all double-quoted src attributes ending in .jpg/.jpeg, in document
# order. Every hit is logged to $log and printed.
def getImageList(site, path)
  msg("---Connecting: #{site}#{path}")
  body = Net::HTTP.start(site) { |http| http.get(path).body }

  imgs = body.to_s.scan(/src=\"([^\"]+\.(?:jpg|jpeg))\"/i).flatten.uniq
  imgs.each do |img|
    $log << "Found: #{img}"
    msg("Found: #{img}")
  end
  $log << '------------------------'
  msg('------------------------')
  imgs
end

# Fetches the page at +path+ on +site+ and returns the unique on-site
# page links found in double-quoted href attributes, each normalised to
# start with '/'. External links, mail/ftp links and links to non-page
# resources are dropped. Every kept link is appended to $log.
def getLinkList(site, path)
  body = Net::HTTP.start(site) { |http| http.get(path).body }

  links = body.to_s.scan(/href=\"([^\"]*)\"/i).flatten.uniq
  links = links.reject do |link|
    link =~ EXTERNAL_LINK || link !~ PLAIN_LINK || link =~ NON_PAGE_LINK
  end
  # Always return the link from the block; an if without else would turn
  # every already-absolute link into nil.
  links = links.map { |link| link.start_with?('/') ? link : "/#{link}" }.uniq

  links.each { |link| $log << "Path Found: #{link}" }
  $log << '------------------------'
  links
end

# Processes one page: logs its links, downloads its images, then writes
# "!log_<site+path>.txt" and "!summary_<site+path>.html" to the current
# directory ('/' in the name is replaced by '_').
def slurp(site, path)
  imgsize = $min_size_k * 1024
  $log = []
  $html_page = []
  $site_imgs = []

  # Links are currently only logged; they are meant to drive crawling later.
  getLinkList(site, path)
  $site_imgs = getImageList(site, path)
  $site_imgs.each do |img|
    getImage(site, path, img, imgsize) unless img.empty?
  end

  safe_path = "#{site}#{path}".gsub('/', '_')
  File.open("!log_#{safe_path}.txt", 'w') do |f|
    f.puts "-----#{Time.now}-----"
    f.puts "---URL: #{site}#{path}---"
    f.print $log.join("\n")
  end
  File.open("!summary_#{safe_path}.html", 'w') do |f|
    f.puts '<html><head></head><body>',
           '<div align="center" style="font-size:10px;font-family:verdana;">'
    f.puts "<h1><a href='http://#{site}#{path}' target='_blank'>#{site}#{path}</a></h1>"
    f.puts $html_page.join('<br><br>')
    f.puts '</div></body></html>'
  end
end

#-----------------------------------------------------------------------------------------------
# Expand "[a..b]" shorthand into one slurp call per page number.
path.each do |page|
  range = PAGE_RANGE.match(page)
  if range
    range[1].to_i.upto(range[2].to_i) do |i|
      slurp(site, page.gsub(PAGE_RANGE, i.to_s))
    end
  else
    slurp(site, page)
  end
end