webrobots-0.1.1/ 0000755 0000041 0000041 00000000000 12166110604 013544 5 ustar www-data www-data webrobots-0.1.1/.travis.yml 0000644 0000041 0000041 00000000312 12166110604 015651 0 ustar www-data www-data language: ruby rvm: - 1.8.7 - 1.9.2 - 1.9.3 - 2.0.0 - ree - jruby-18mode - jruby-19mode - rbx-18mode - rbx-19mode matrix: allow_failures: - rvm: rbx-18mode - rvm: rbx-19mode webrobots-0.1.1/test/ 0000755 0000041 0000041 00000000000 12166110604 014523 5 ustar www-data www-data webrobots-0.1.1/test/test_webrobots.rb 0000644 0000041 0000041 00000056503 12166110604 020126 0 ustar www-data www-data # -*- coding: utf-8 -*- require 'helper' class TestWebRobots < Test::Unit::TestCase context "robots.txt with no rules" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| case uri.to_s when 'http://site1.example.org/robots.txt' <<-'TXT' TXT when 'http://site2.example.org/robots.txt' <<-'TXT' TXT when 'http://site3.example.org/robots.txt' <<-'TXT' #comment TXT when 'http://site4.example.org/robots.txt' <<-'TXT' #comment TXT else raise "#{uri} is not supposed to be fetched" end }) end should "allow any robot" do assert @robots.allowed?('http://site1.example.org/index.html') assert @robots.allowed?('http://site1.example.org/private/secret.txt') assert @robots.allowed?('http://site2.example.org/index.html') assert @robots.allowed?('http://site2.example.org/private/secret.txt') assert @robots.allowed?('http://site3.example.org/index.html') assert @robots.allowed?('http://site3.example.org/private/secret.txt') assert @robots.allowed?('http://site4.example.org/index.html') assert @robots.allowed?('http://site4.example.org/private/secret.txt') end end context "robots.txt that cannot be fetched" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| case uri.to_s when 'http://site1.example.org/robots.txt' raise Net::HTTPFatalError.new( 'Internal Server Error', Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error')) when 'http://site2.example.org/robots.txt' raise Net::HTTPRetriableError.new( 'Found', Net::HTTPFound.new('1.1', '302', 'Found')) when 'http://site3.example.org/robots.txt' raise Errno::ECONNREFUSED when 'http://site4.example.org/robots.txt' raise SocketError, "getaddrinfo: nodename nor servname provided, or not known" when 'http://site5.example.org/robots.txt' nil else raise "#{uri} is not supposed to be fetched" end }) end should "disallow any robot" do assert @robots.disallowed?('http://site1.example.org/index.html') assert @robots.disallowed?('http://site1.example.org/private/secret.txt') assert @robots.disallowed?('http://site2.example.org/index.html') assert @robots.disallowed?('http://site2.example.org/private/secret.txt') assert @robots.disallowed?('http://site3.example.org/index.html') assert @robots.disallowed?('http://site3.example.org/private/secret.txt') assert @robots.disallowed?('http://site4.example.org/index.html') assert @robots.disallowed?('http://site4.example.org/private/secret.txt') assert @robots.disallowed?('http://site5.example.org/index.html') assert @robots.disallowed?('http://site5.example.org/private/secret.txt') end end context "robots.txt with some rules" do setup do http_get = lambda { |uri| case uri.to_s when 'http://www.example.org/robots.txt' <<-'TXT' # Punish evil bots User-Agent: evil Disallow: / Disallow-Not: / # parser teaser User-Agent: good # Be generous to good bots Disallow: /2heavy/ Allow: /2heavy/*.htm Disallow: /2heavy/*.htm$ User-Agent: * Disallow: /2heavy/ Disallow: /index.html # Allow 
takes precedence over Disallow if the pattern lengths are the same. Allow: /index.html TXT when 'http://www.example.com/robots.txt' <<-'TXT' # Default rule is evaluated last even if it is put first. User-Agent: * Disallow: /2heavy/ Disallow: /index.html # Allow takes precedence over Disallow if the pattern lengths are the same. Allow: /index.html # Punish evil bots User-Agent: evil Disallow: / User-Agent: good # Be generous to good bots Disallow: /2heavy/ Allow: /2heavy/*.htm Disallow: /2heavy/*.htm$ TXT when 'http://koster1.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /tmp TXT when 'http://koster2.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /tmp/ TXT when 'http://koster3.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a%3cd.html TXT when 'http://koster4.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a%3Cd.html TXT when 'http://koster5.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a%2fb.html TXT when 'http://koster6.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a/b.html TXT when 'http://koster7.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /%7ejoe/index.html TXT when 'http://koster8.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /~joe/index.html TXT else raise "#{uri} is not supposed to be fetched" end } @robots = WebRobots.new('RandomBot', :http_get => http_get) @robots_good = WebRobots.new('GoodBot', :http_get => http_get) @robots_evil = WebRobots.new('EvilBot', :http_get => http_get) end should "properly restrict access" do assert_nothing_raised { assert @robots_good.allowed?('http://www.example.org/index.html') } assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php') assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php') assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php')) assert @robots_good.allowed?('http://www.example.org/2heavy/index.html') assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html') assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm') assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm') assert !@robots_evil.allowed?('http://www.example.org/index.html') assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php') assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html') assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm') assert @robots.allowed?('http://www.example.org/index.html') assert !@robots.allowed?('http://www.example.org/2heavy/index.php') assert !@robots.allowed?('http://www.example.org/2heavy/index.html') assert !@robots.allowed?('http://www.example.org/2heavy/index.htm') assert @robots_good.allowed?('http://www.example.com/index.html') assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php') assert @robots_good.allowed?('http://www.example.com/2heavy/index.html') assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm') assert !@robots_evil.allowed?('http://www.example.com/index.html') assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php') assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html') assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm') assert @robots.allowed?('http://www.example.com/index.html') assert !@robots.allowed?('http://www.example.com/2heavy/index.php') assert !@robots.allowed?('http://www.example.com/2heavy/index.html') assert !@robots.allowed?('http://www.example.com/2heavy/index.htm') end should 
"follow what is said in Koster's draft" do assert @robots.disallowed?('http://koster1.example.net/tmp') assert @robots.disallowed?('http://koster1.example.net/tmp.html') assert @robots.disallowed?('http://koster1.example.net/tmp/a.html') assert !@robots.disallowed?('http://koster2.example.net/tmp') assert @robots.disallowed?('http://koster2.example.net/tmp/') assert @robots.disallowed?('http://koster2.example.net/tmp/a.html') assert @robots.disallowed?('http://koster3.example.net/a%3cd.html') assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html') assert @robots.disallowed?('http://koster4.example.net/a%3cd.html') assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html') assert @robots.disallowed?('http://koster5.example.net/a%2fb.html') assert !@robots.disallowed?('http://koster5.example.net/a/b.html') assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html') assert @robots.disallowed?('http://koster6.example.net/a/b.html') assert @robots.disallowed?('http://koster7.example.net/~joe/index.html') assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html') end end context "robots.txt with errors" do setup do @turn1 = @turn2 = 0 @http_get = lambda { |uri| case uri.to_s when 'http://www.example.org/robots.txt' if (@turn1 += 1) % 2 == 1 <<-'TXT' # some comment User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT else <<-'TXT' # some comment User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html # User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT end when 'http://www.example.com/robots.txt' if (@turn2 += 1) % 2 == 1 <<-'TXT' # some comment #User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT else <<-'TXT' # some comment User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT end else raise "#{uri} is not supposed to be fetched" end } end should "raise ParseError" do robots = WebRobots.new('TheBot', :http_get => @http_get) url = 'http://www.example.org/2heavy/index.php' assert_nil robots.error(url) assert !robots.allowed?(url) assert_nothing_raised { robots.error!(url) } robots.reset(url) assert robots.allowed?(url) error = robots.error(url) assert_instance_of WebRobots::ParseError, error assert_equal URI('http://www.example.org/'), error.site assert_raise(WebRobots::ParseError) { robots.error!(url) } robots.reset(url) assert_nil robots.error(url) assert !robots.allowed?(url) assert_nothing_raised { robots.error!(url) } url = 'http://www.example.com/2heavy/index.php' assert robots.allowed?(url) assert_instance_of WebRobots::ParseError, robots.error(url) assert_raise(WebRobots::ParseError) { robots.error!(url) } robots.reset(url) assert_nil robots.error(url) assert !robots.allowed?(url) assert_nothing_raised { robots.error!(url) } robots.reset(url) assert robots.allowed?(url) assert_instance_of WebRobots::ParseError, robots.error(url) assert_raise(WebRobots::ParseError) { robots.error!(url) } end end context "robots.txt with options" do setup do http_get = lambda { |uri| case 
uri.to_s when 'http://www.example.org/robots.txt' <<-'TXT' Sitemap: http://www.example.org/sitemap-host1.xml Sitemap: http://www.example.org/sitemap-host2.xml User-Agent: MyBot Disallow: /2heavy/ Allow: /2heavy/*.html Option1: Foo Option2: Hello Crawl-Delay: 1.5 User-Agent: HerBot Disallow: /2heavy/ Allow: /2heavy/*.html Option1: Baz Option2: Qux User-Agent: * Disallow: /2heavy/ Allow: /2heavy/*.html # These are wrong but should be allowed Allow: /2heavy/% Crawl-Delay: # Option1: Bar Option3: Hi TXT else raise "#{uri} is not supposed to be fetched" end } @robots_mybot = WebRobots.new('MyBot', :http_get => http_get) @robots_mybot_ignore = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => :ignore) @robots_mybot_custom = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => proc { |*args| @delay_args = args }) @robots_herbot = WebRobots.new('HerBot', :http_get => http_get) @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get) end should "read options" do options = @robots_mybot.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1') assert_equal 'Foo', options['option1'] assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2') assert_equal 'Hello', options['option2'] options = @robots_mybot_ignore.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Foo', @robots_mybot_ignore.option('http://www.example.org/', 'Option1') assert_equal 'Foo', options['option1'] assert_equal 'Hello', @robots_mybot_ignore.option('http://www.example.org/', 'Option2') assert_equal 'Hello', options['option2'] options = @robots_mybot_custom.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Foo', @robots_mybot_custom.option('http://www.example.org/', 'Option1') assert_equal 'Foo', options['option1'] assert_equal 'Hello', @robots_mybot_custom.option('http://www.example.org/', 'Option2') assert_equal 'Hello', options['option2'] options = @robots_herbot.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Baz', @robots_herbot.option('http://www.example.org/', 'Option1') assert_equal 'Baz', options['option1'] assert_equal 'Qux', @robots_herbot.option('http://www.example.org/', 'Option2') assert_equal 'Qux', options['option2'] options = @robots_hisbot.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1') assert_equal 'Bar', options['option1'] assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3') assert_equal 'Hi', options['option3'] assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_mybot.sitemaps('http://www.example.org/') assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_mybot_ignore.sitemaps('http://www.example.org/') assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_herbot.sitemaps('http://www.example.org/') assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_hisbot.sitemaps('http://www.example.org/') assert_equal 1.5, @robots_mybot.crawl_delay('http://www.example.org/') assert_equal 1.5, @robots_mybot_ignore.crawl_delay('http://www.example.org/') assert_equal 1.5, @robots_mybot_custom.crawl_delay('http://www.example.org/') assert_equal 0, 
@robots_herbot.crawl_delay('http://www.example.org/') assert_equal 0, @robots_hisbot.crawl_delay('http://www.example.org/') t1 = Time.now @robots_mybot.allowed?('http://www.example.org/') @robots_mybot.allowed?('http://www.example.org/article1.html') t2 = Time.now assert_in_delta 1.5, t2 - t1, 0.1 @robots_mybot.allowed?('http://www.example.org/article2.html') t3 = Time.now assert_in_delta 1.5, t3 - t2, 0.1 t1 = Time.now @robots_mybot_ignore.allowed?('http://www.example.org/') @robots_mybot_ignore.allowed?('http://www.example.org/article1.html') t2 = Time.now assert_in_delta 0, t2 - t1, 0.1 @robots_mybot_ignore.allowed?('http://www.example.org/article2.html') t3 = Time.now assert_in_delta 0, t3 - t2, 0.1 t1 = Time.now @robots_mybot_custom.allowed?('http://www.example.org/') @robots_mybot_custom.allowed?('http://www.example.org/article1.html') t2 = Time.now assert_in_delta 0, t2 - t1, 0.1 assert_instance_of Array, @delay_args assert_equal 2, @delay_args.size assert_equal 1.5, @delay_args[0] assert_instance_of Time, @delay_args[1] end end context "robots.txt with options" do setup do http_get = lambda { |uri| case uri.to_s when 'http://www.example.org/robots.txt' <<-'TXT' User-Agent: * Disallow: / TXT else raise "#{uri} is not supposed to be fetched" end } @robots = WebRobots.new('RandomBot', :http_get => http_get) end should "validate URI" do assert_raise(ArgumentError) { @robots.allowed?('www.example.org/') } assert_raise(ArgumentError) { @robots.allowed?('::/home/knu') } end end context "robots.txt in the real world" do setup do @testbot = WebRobots.new('TestBot') @msnbot = WebRobots.new('TestMSNBot') # matches msnbot end should "be parsed for major sites" do assert_nothing_raised { assert !@testbot.allowed?("http://www.google.com/search") assert !@testbot.allowed?("https://www.google.com/search") assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln") assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6") } assert_nothing_raised { assert @testbot.allowed?("http://www.yahoo.com/") assert !@testbot.allowed?("http://www.yahoo.com/?") assert !@testbot.allowed?("http://www.yahoo.com/p/foo") } assert_nothing_raised { assert !@testbot.allowed?("http://store.apple.com/vieworder") assert @msnbot.allowed?("http://store.apple.com/vieworder") } assert_nothing_raised { assert !@testbot.allowed?("http://github.com/login") } end end context "meta robots tag" do setup do @doc = Nokogiri::HTML(<<-HTML)
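<!-- Reconstructed fixture (assumption: the original meta tags were lost in extraction; these values are inferred from the assertions below, not taken from the original file). -->
<html>
  <head>
    <meta name="ROBOTS" content="NOFOLLOW">
    <meta name="Slurp" content="NOINDEX, NOFOLLOW">
    <meta name="GoogleBot" content="NOINDEX, NOARCHIVE">
  </head>
  <body>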
test HTML end should "be properly parsed when given in HTML string" do assert !@doc.noindex? assert @doc.nofollow? assert @doc.noindex?('slurp') assert @doc.nofollow?('slurp') assert @doc.noindex?('googlebot') assert !@doc.nofollow?('googlebot') assert @doc.meta_robots('googlebot').include?('noarchive') end end class Agent def initialize @robots = WebRobots.new 'agent', :http_get => method(:get) end def get uri @robots.allowed? uri if uri.request_uri == '/robots.txt' then '' else 'content' end end end context "embedded in a user-agent" do setup do @agent = Agent.new end should "fetch robots.txt" do body = @agent.get URI.parse 'http://example/robots.html' assert_equal 'content', body end end context "robots.txt with a space at the end of the last line" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| res = case uri.to_s when 'http://site1.example.com/robots.txt' <<-'TXT' User-agent: * Request-rate: 1/30 Disallow: /util/ Sitemap: http://site1.example.com/text/sitemap.xml TXT when 'http://site2.example.com/robots.txt' <<-'TXT' User-agent: * Request-rate: 1/30 Disallow: /util/ Sitemap: http://site2.example.com/text/sitemap.xml TXT else raise "#{uri} is not supposed to be fetched" end # This chomp is actually key to the test. Remove the final EOL. # The final line should be the one ending with the space. res.chomp }) end should "be properly parsed" do assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/")) assert_equal(["http://site2.example.com/text/sitemap.xml"], @robots.sitemaps("http://site2.example.com/")) end end context "robots.txt cache" do setup do @fetched = false @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| case uri.to_s when 'http://site1.example.org/robots.txt' @fetched = true <<-'TXT' User-Agent: * Disallow: /foo TXT when 'http://site2.example.org/robots.txt' @fetched = true nil end }) end should "persist unless cache is cleared" do assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert @fetched @fetched = false assert @robots.allowed?('http://site1.example.org/bar') assert !@fetched assert @robots.allowed?('http://site1.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert !@fetched @robots.flush_cache assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert @fetched @fetched = false assert @robots.allowed?('http://site1.example.org/bar') assert !@fetched assert @robots.allowed?('http://site1.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert !@fetched end should "persist for non-existent robots.txt unless cache is cleared" do assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert @fetched @fetched = false assert !@robots.allowed?('http://site2.example.org/bar') assert !@fetched assert !@robots.allowed?('http://site2.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert !@fetched @robots.flush_cache assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert @fetched @fetched = false assert !@robots.allowed?('http://site2.example.org/bar') assert !@fetched assert !@robots.allowed?('http://site2.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert !@fetched end end context "robots.txt with just user-agent & sitemap and no blank line between them" do setup do @robots = WebRobots.new('RandomBot', :http_get => 
lambda { |uri| res = case uri.to_s when 'http://site1.example.com/robots.txt' <<-'TXT' User-agent: * Sitemap: http://site1.example.com/text/sitemap.xml TXT else raise "#{uri} is not supposed to be fetched" end }) end should "be properly parsed" do assert @robots.allowed?("http://site1.example.com/foo") assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/")) end end end webrobots-0.1.1/test/helper.rb 0000644 0000041 0000041 00000000656 12166110604 016336 0 ustar www-data www-data require 'rubygems' require 'bundler' begin Bundler.setup(:default, :development) rescue Bundler::BundlerError => e $stderr.puts e.message $stderr.puts "Run `bundle install` to install missing gems" exit e.status_code end require 'test/unit' require 'shoulda' $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) $LOAD_PATH.unshift(File.dirname(__FILE__)) require 'webrobots' class Test::Unit::TestCase end webrobots-0.1.1/README.rdoc 0000644 0000041 0000041 00000002231 12166110604 015350 0 ustar www-data www-data = webrobots This is a library to help write robots.txt compliant web robots. == Usage require 'webrobots' require 'uri' require 'net/http' robots = WebRobots.new('MyBot/1.0') uri = URI('http://digg.com/news/24hr') if robots.disallowed?(uri) STDERR.puts "Access disallowed: #{uri}" exit 1 end body = Net::HTTP.get(uri) # ... == Requirements - Ruby 1.8.7 or 1.9.2+ == Contributing to webrobots * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it * Fork the project * Start a feature/bugfix branch * Commit and push until you are happy with your contribution * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally. * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it. == Copyright Copyright (c) 2010, 2011, 2012, 2013 Akinori MUSHA. See LICENSE.txt for further details. webrobots-0.1.1/Rakefile 0000644 0000041 0000041 00000001123 12166110604 015206 0 ustar www-data www-data # encoding: utf-8 require 'bundler/gem_tasks' gemspec = Bundler::GemHelper.gemspec require 'rake/testtask' Rake::TestTask.new(:test) do |test| test.libs << 'test' test.test_files = gemspec.test_files test.verbose = true end require 'rdoc/task' Rake::RDocTask.new do |rdoc| rdoc.rdoc_dir = 'rdoc' rdoc.title = "#{gemspec.name} #{gemspec.version}" rdoc.rdoc_files.include(gemspec.extra_rdoc_files) rdoc.rdoc_files.include('lib/**/*.rb') end task :default => :test task :test => 'lib/webrobots/robotstxt.rb' rule '.rb' => ['.ry'] do |t| sh 'racc', '-o', t.name, t.source end webrobots-0.1.1/webrobots.gemspec 0000644 0000041 0000041 00000002313 12166110604 017116 0 ustar www-data www-data # -*- encoding: utf-8 -*- $:.push File.expand_path("../lib", __FILE__) require "webrobots/version" Gem::Specification.new do |s| s.name = "webrobots" s.version = Webrobots::VERSION s.authors = ["Akinori MUSHA"] s.email = ["knu@idaemons.org"] s.homepage = %q{https://github.com/knu/webrobots} s.licenses = [%q{2-clause BSDL}] s.summary = %q{A Ruby library to help write robots.txt compliant web robots} s.description = <<-'EOS' This library helps write robots.txt compliant web robots in Ruby. 
EOS s.files = `git ls-files`.split("\n") s.test_files = s.files.grep(%r{/test_[^/]+\.rb$}) s.executables = s.files.grep(%r{^bin/[^.]}).map{ |f| File.basename(f) } s.require_paths = ["lib"] s.extra_rdoc_files = [ "LICENSE.txt", "README.rdoc" ] s.rdoc_options += [ '--exclude', '\.ry$' ] s.add_development_dependency("rake", [">= 0.9.2.2"]) s.add_development_dependency("racc", [">= 0"]) unless RUBY_PLATFORM == "java" s.add_development_dependency("shoulda", [">= 0"]) s.add_development_dependency("rdoc", ["> 2.4.2"]) s.add_development_dependency("bundler", [">= 1.2"]) s.add_development_dependency("nokogiri", [">= 1.4.4"]) end webrobots-0.1.1/LICENSE.txt 0000644 0000041 0000041 00000002423 12166110604 015370 0 ustar www-data www-data Copyright (c) 2010, 2011, 2012, 2013 Akinori MUSHA All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. webrobots-0.1.1/checksums.yaml.gz 0000444 0000041 0000041 00000000415 12166110604 017032 0 ustar www-data www-data
webrobots-0.1.1/.document 0000644 0000041 0000041 00000000052 12166110604 015360 0 ustar www-data www-data LICENSE.txt README.rdoc bin/* lib/**/*.rb webrobots-0.1.1/metadata.yml 0000644 0000041 0000041 00000006665 12166110604 016060 0 ustar www-data www-data --- !ruby/object:Gem::Specification name: webrobots version: !ruby/object:Gem::Version version: 0.1.1 platform: ruby authors: - Akinori MUSHA autorequire: bindir: bin cert_chain: [] date: 2013-03-15 00:00:00.000000000 Z dependencies: - !ruby/object:Gem::Dependency name: rake requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 0.9.2.2 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 0.9.2.2 - !ruby/object:Gem::Dependency name: racc requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' - !ruby/object:Gem::Dependency name: shoulda requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' - !ruby/object:Gem::Dependency name: rdoc requirement: !ruby/object:Gem::Requirement requirements: - - '>' - !ruby/object:Gem::Version version: 2.4.2 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>' - !ruby/object:Gem::Version version: 2.4.2 - !ruby/object:Gem::Dependency name: bundler requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '1.2' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '1.2' - !ruby/object:Gem::Dependency name: nokogiri requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 1.4.4 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 1.4.4 description: | This library helps write robots.txt compliant web robots in Ruby.
email: - knu@idaemons.org executables: [] extensions: [] extra_rdoc_files: - LICENSE.txt - README.rdoc files: - .document - .gitignore - .travis.yml - Gemfile - LICENSE.txt - README.rdoc - Rakefile - lib/webrobots.rb - lib/webrobots/nokogiri.rb - lib/webrobots/robotstxt.rb - lib/webrobots/robotstxt.ry - lib/webrobots/version.rb - test/helper.rb - test/test_webrobots.rb - webrobots.gemspec homepage: https://github.com/knu/webrobots licenses: - 2-clause BSDL metadata: {} post_install_message: rdoc_options: - --exclude - \.ry$ require_paths: - lib required_ruby_version: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' required_rubygems_version: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' requirements: [] rubyforge_project: rubygems_version: 2.0.3 signing_key: specification_version: 4 summary: A Ruby library to help write robots.txt compliant web robots test_files: - test/test_webrobots.rb webrobots-0.1.1/Gemfile 0000644 0000041 0000041 00000000135 12166110604 015036 0 ustar www-data www-data source "http://rubygems.org" # Specify your gem's dependencies in webrobots.gemspec gemspec webrobots-0.1.1/.gitignore 0000644 0000041 0000041 00000000135 12166110604 015533 0 ustar www-data www-data *.gem .bundle Gemfile.lock pkg/* coverage rdoc doc .yardoc /lib/webrobots/robotstxt.output webrobots-0.1.1/lib/ 0000755 0000041 0000041 00000000000 12166110604 014312 5 ustar www-data www-data webrobots-0.1.1/lib/webrobots.rb 0000644 0000041 0000041 00000013002 12166110604 016641 0 ustar www-data www-data require 'webrobots/version' require 'webrobots/robotstxt' require 'uri' require 'net/https' require 'thread' if defined?(Nokogiri) require 'webrobots/nokogiri' else autoload :Nokogiri, 'webrobots/nokogiri' end class WebRobots # Creates a WebRobots object for a robot named +user_agent+, with # optional +options+. # # * :http_get => a custom method, proc, or anything that responds to # .call(uri), to be used for fetching robots.txt. It must return # the response body if successful, return an empty string if the # resource is not found, and return nil or raise any error on # failure. Redirects should be handled within this proc. # # * :crawl_delay => determines how to react to Crawl-delay # directives. If +:sleep+ is given, WebRobots sleeps as demanded # when allowed?(url)/disallowed?(url) is called. This is the # default behavior. If +:ignore+ is given, WebRobots does # nothing. If a custom method, proc, or anything that responds to # .call(delay, last_checked_at), it is called. def initialize(user_agent, options = nil) @user_agent = user_agent options ||= {} @http_get = options[:http_get] || method(:http_get) crawl_delay_handler = case value = options[:crawl_delay] || :sleep when :ignore nil when :sleep method(:crawl_delay_handler) else if value.respond_to?(:call) value else raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}" end end @parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler) @parser_mutex = Mutex.new @robotstxt = create_cache() end # :nodoc: def create_cache Hash.new # Must respond to [], []=, delete and clear. end # Flushes robots.txt cache. def flush_cache @robotstxt.clear end # Returns the robot name initially given. attr_reader :user_agent # Tests if the robot is allowed to access a resource at +url+. If a # malformed URI string is given, URI::InvalidURIError is raised. If # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is # raised. 
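  #
  # Illustrative sketch (not from the original documentation; the bot
  # name and URLs are made-up examples):
  #
  #   robots = WebRobots.new('MyBot/1.0')
  #   robots.allowed?('http://www.example.com/')        # => true or false
  #   robots.disallowed?('http://www.example.com/foo')  # => !allowed?
  #   robots.sitemaps('http://www.example.com/')        # => array of Sitemap URLs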
def allowed?(url) site, request_uri = split_uri(url) return true if request_uri == '/robots.txt' robots_txt = get_robots_txt(site) robots_txt.allow?(request_uri) end # Equivalent to !allowed?(url). def disallowed?(url) !allowed?(url) end # Returns the number of seconds that the configured agent should wait # between successive requests to the site identified by +url+ according # to the site's robots.txt +Crawl-delay+ directive. def crawl_delay(url) robots_txt_for(url).crawl_delay() end # Returns extended option values for a resource at +url+ in a hash # with each field name lower-cased. See allowed?() for a list of # errors that may be raised. def options(url) robots_txt_for(url).options end # Equivalent to option(url)[token.downcase]. def option(url, token) options(url)[token.downcase] end # Returns an array of Sitemap URLs. See allowed?() for a list of # errors that may be raised. def sitemaps(url) robots_txt_for(url).sitemaps end # Returns an error object if there is an error in fetching or # parsing robots.txt of the site +url+. def error(url) robots_txt_for(url).error end # Raises the error if there was an error in fetching or parsing # robots.txt of the site +url+. def error!(url) robots_txt_for(url).error! end # Removes robots.txt cache for the site +url+. def reset(url) site, = split_uri(url) @robotstxt.delete(site) end private def split_uri(url) site = if url.is_a?(URI) url.dup else begin URI.parse(url) rescue => e raise ArgumentError, e.message end end site.scheme && site.host or raise ArgumentError, "non-absolute URI: #{url}" site.is_a?(URI::HTTP) or raise ArgumentError, "non-HTTP/HTTPS URI: #{url}" request_uri = site.request_uri if (host = site.host).match(/[[:upper:]]/) site.host = host.downcase end site.path = '/' return site, request_uri end def robots_txt_for(url) site, = split_uri(url) get_robots_txt(site) end def get_robots_txt(site) @robotstxt[site] ||= fetch_robots_txt(site) end def fetch_robots_txt(site) begin body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable' rescue => e return RobotsTxt.unfetchable(site, e, @user_agent) end @parser_mutex.synchronize { @parser.parse!(body, site) } end def http_get(uri) referer = nil 10.times { http = Net::HTTP.new(uri.host, uri.port) if http.use_ssl = uri.is_a?(URI::HTTPS) http.verify_mode = OpenSSL::SSL::VERIFY_PEER http.cert_store = OpenSSL::X509::Store.new.tap { |store| store.set_default_paths } end header = { 'User-Agent' => @user_agent } header['Referer'] = referer if referer # header is destroyed by this in ruby 1.9.2! 
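      # Issue the request: a 2xx response returns the body, a 404 is
      # treated as an empty robots.txt, a redirect is retried against the
      # Location header (at most 10 hops), and any other response raises
      # via Net::HTTPResponse#value.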
response = http.get(uri.request_uri, header) case response when Net::HTTPSuccess return response.body when Net::HTTPRedirection referer = uri.to_s uri = URI(response['location']) when Net::HTTPNotFound return '' else response.value end } raise 'too many HTTP redirects' end def crawl_delay_handler(delay, last_checked_at) if last_checked_at delay -= Time.now - last_checked_at sleep delay if delay > 0 end end end webrobots-0.1.1/lib/webrobots/ 0000755 0000041 0000041 00000000000 12166110604 016320 5 ustar www-data www-data webrobots-0.1.1/lib/webrobots/version.rb 0000644 0000041 0000041 00000000051 12166110604 020326 0 ustar www-data www-data module Webrobots VERSION = "0.1.1" end webrobots-0.1.1/lib/webrobots/robotstxt.ry 0000644 0000041 0000041 00000024607 12166110604 020755 0 ustar www-data www-data # -*- coding: utf-8 -*- class Parser rule robotstxt : opt_blanklines { @sitemaps = [] } body { body = val[2] result = RobotsTxt.new(@site, body, :target => @target, :sitemaps => @sitemaps, :crawl_delay_handler => @crawl_delay_handler) } body : | records opt_blanklines opt_blanklines : | blanklines blanklines : blankline | blanklines blankline blankline : EOL opt_space : | SPACE opt_commentlines : | commentlines commentlines : comment | commentlines comment comment : opt_space COMMENT EOL | 'sitemap' ':' opt_space VALUE eol_opt_comment { @sitemaps << val[3] } records : record { result = [] result << val[0] } | commentblock { result = [] } | records blanklines record { result << val[2] } | records blanklines rulelines { val[2].each_with_index { |line, i| warn "%s line %d: %s: orphan rule line" % [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE } } | records blanklines commentblock commentblock : commentlines record : opt_commentlines agentlines opt_rulelines { result = Record.new(val[1], val[2]) } agentlines : agentline { result = [val[0]] } | agentlines agentline { result << val[1] } | agentlines comment agentline : 'user-agent' ':' opt_space VALUE eol_opt_comment { result = AgentLine.new(val[0], val[3]) } opt_rulelines : | rulelines rulelines : ruleline { result = [result] @rulelinenos = [] } | rulelines ruleline { result << val[1] @rulelinenos << @lineno } | rulelines comment ruleline : allowline | disallowline | crawldelayline | extension allowline : 'allow' ':' opt_space VALUE eol_opt_comment { result = AllowLine.new(val[0], val[3]) } disallowline : 'disallow' ':' opt_space VALUE eol_opt_comment { result = DisallowLine.new(val[0], val[3]) } crawldelayline : 'crawl-delay' ':' opt_space VALUE eol_opt_comment { result = CrawlDelayLine.new(val[0], val[3]) } extension : TOKEN ':' opt_space VALUE eol_opt_comment { result = ExtentionLine.new(val[0], val[3]) } eol_opt_comment : EOL | comment ---- header require 'strscan' class WebRobots class Error < StandardError end class ParseError < Error # The site's root URI attr_reader :site def initialize(message, site) @message = message @site = site end def to_s @message end end class RobotsTxt ---- inner def initialize(target, crawl_delay_handler = nil) super() @target = target @crawl_delay_handler = crawl_delay_handler end def parse!(input, site) parse(input, site) rescue Error => e RobotsTxt.new(site, nil, :error => e, :target => @target, :crawl_delay_handler => @crawl_delay_handler) end KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap] RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i def parse(input, site) @q ||= [] @errors = [] @lineno = 0 @site = site string = input.respond_to?(:read) ? 
input.read : input s = StringScanner.new(string) value_expected = false until s.eos? @lineno += 1 if s.bol? if t = s.scan(/[ \t]*(?:\r?\n|\z)/) if value_expected @q << [:VALUE, ''] end @q << [:EOL, t] value_expected = false elsif t = s.scan(/[ \t]+/) @q << [:SPACE, t] elsif t = s.scan(/:/) @q << [t, t] value_expected = true elsif t = s.scan(/#.*/) if value_expected @q << [:VALUE, ''] end @q << [:COMMENT, t] else if value_expected if t = s.scan(/.*?(?=[ \t]*(?:#|$))/) @q << [:VALUE, t] else parse_error @lineno, "unexpected characters: %s" % s.check(/.*/) end value_expected = false elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/) case t when RE_KNOWN_TOKENS @q << [t.downcase, t] else @q << [:TOKEN, t] end else parse_error "unexpected characters: %s" % s.check(/.*/) end end end @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL @pos = -1 do_parse rescue Racc::ParseError => e raise ParseError.new(e.message, @site) ensure @q.clear end def next_token @q[@pos += 1] end def on_error(token_id, value, stack) parse_error "unexpected %s: %s" % [token_to_str(token_id), value] end def parse_error(message) message = "%s line %d: %s" % [@site.to_s, @lineno, message] if @lax @errors << message else raise Racc::ParseError, message end end ---- footer def initialize(site, records, options = nil) @timestamp = Time.now @site = site @options = options || {} @last_checked_at = nil @error = @options[:error] @target = @options[:target] @sitemaps = @options[:sitemaps] || [] @crawl_delay_handler = @options[:crawl_delay_handler] if records && !records.empty? @records, defaults = [], [] records.each { |record| if record.default? defaults << record elsif !@target || record.match?(@target) @records << record end } @records.concat(defaults) else @records = [] end end attr_reader :timestamp, :site, :sitemaps attr_accessor :error def error! raise @error if @error end def target(user_agent = nil) if user_agent raise ArgumentError, "this instance is targeted for #{@target}" if @target user_agent else raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target @target end end private :target def find_record(user_agent = nil) user_agent = target(user_agent) @records.find { |record| record.match?(user_agent) } end private :find_record def allow?(request_uri, user_agent = nil) record = find_record(user_agent) or return true allow = record.allow?(request_uri) if delay = record.delay and @crawl_delay_handler @crawl_delay_handler.call(delay, @last_checked_at) end @last_checked_at = Time.now return allow end def crawl_delay(user_agent = nil) record = find_record(user_agent) or return 0 record.delay or return 0 end def options(user_agent = nil) record = find_record(user_agent) or return {} record.options end DISALLOW_ALL = <<-TXT User-Agent: * Disallow: / TXT def self.unfetchable(site, reason, target = nil) Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt| robots_txt.error = reason } end class Record def initialize(agentlines, rulelines) @patterns = agentlines.map { |agentline| agentline.pattern } @acls = [] @delay = nil @options = {} rulelines.each { |ruleline| case ruleline when AccessControlLine @acls << ruleline when CrawlDelayLine @delay = ruleline.delay else @options[ruleline.token.downcase] = ruleline.value end } if rulelines @acls.replace @acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] } end attr_reader :delay, :options def match?(user_agent) @patterns.any? { |pattern| pattern.match(user_agent) } end def default? 
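        # A record is the catch-all default when one of its User-agent
        # patterns is the empty regexp that AgentLine#compile produces
        # for "*".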
@patterns.include?(//) end def allow?(request_uri) @acls.each { |acl| if acl.match?(request_uri) return acl.allow? end } return true end end class Line def initialize(token, value) @token = token @value = value compile end attr_reader :token, :value def compile self end end class AgentLine < Line def compile if @value == '*' @pattern = // else @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE) end self end attr_reader :pattern end class AccessControlLine < Line def compile @empty = @value.empty? re_src = '\A' s = StringScanner.new(@value) until s.eos? if t = s.scan(/[^%*$]+/) re_src << Regexp.quote(t) elsif t = s.scan(/%([0-9a-f]{2})/i) c = s[1].to_i(16) if c == 0x2f re_src << '%2[fF]' else re_src << Regexp.quote('%c' % c) end elsif t = s.scan(/\*/) re_src << '.*' elsif t = s.scan(/\$/) re_src << '\z' break else re_src << Regexp.quote(s.scan(/./)) end end @pattern = Regexp.new(re_src, Regexp::MULTILINE) self end def match?(request_uri) return false if @empty transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) } !!@pattern.match(transformed) end end class AllowLine < AccessControlLine def allow? true end end class DisallowLine < AccessControlLine def allow? false end end class CrawlDelayLine < Line def compile case @value when /\A((0|[1-9][0-9]*)\.[0-9]+)/ @delay = @value.to_f when /\A(0|[1-9][0-9]*)/ @delay = @value.to_i else @delay = nil end self end attr_reader :delay end class ExtentionLine < Line end end end webrobots-0.1.1/lib/webrobots/robotstxt.rb 0000644 0000041 0000041 00000042223 12166110604 020720 0 ustar www-data www-data # # DO NOT MODIFY!!!! # This file is automatically generated by Racc 1.4.9 # from Racc grammer file "". # require 'racc/parser.rb' require 'strscan' class WebRobots class Error < StandardError end class ParseError < Error # The site's root URI attr_reader :site def initialize(message, site) @message = message @site = site end def to_s @message end end class RobotsTxt class Parser < Racc::Parser module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 171) def initialize(target, crawl_delay_handler = nil) super() @target = target @crawl_delay_handler = crawl_delay_handler end def parse!(input, site) parse(input, site) rescue Error => e RobotsTxt.new(site, nil, :error => e, :target => @target, :crawl_delay_handler => @crawl_delay_handler) end KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap] RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i def parse(input, site) @q ||= [] @errors = [] @lineno = 0 @site = site string = input.respond_to?(:read) ? input.read : input s = StringScanner.new(string) value_expected = false until s.eos? @lineno += 1 if s.bol? if t = s.scan(/[ \t]*(?:\r?\n|\z)/) if value_expected @q << [:VALUE, ''] end @q << [:EOL, t] value_expected = false elsif t = s.scan(/[ \t]+/) @q << [:SPACE, t] elsif t = s.scan(/:/) @q << [t, t] value_expected = true elsif t = s.scan(/#.*/) if value_expected @q << [:VALUE, ''] end @q << [:COMMENT, t] else if value_expected if t = s.scan(/.*?(?=[ \t]*(?:#|$))/) @q << [:VALUE, t] else parse_error @lineno, "unexpected characters: %s" % s.check(/.*/) end value_expected = false elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/) case t when RE_KNOWN_TOKENS @q << [t.downcase, t] else @q << [:TOKEN, t] end else parse_error "unexpected characters: %s" % s.check(/.*/) end end end @q << [:EOL, ''] if !@q.empty? 
&& @q.last.first != :EOL @pos = -1 do_parse rescue Racc::ParseError => e raise ParseError.new(e.message, @site) ensure @q.clear end def next_token @q[@pos += 1] end def on_error(token_id, value, stack) parse_error "unexpected %s: %s" % [token_to_str(token_id), value] end def parse_error(message) message = "%s line %d: %s" % [@site.to_s, @lineno, message] if @lax @errors << message else raise Racc::ParseError, message end end ...end robotstxt.ry/module_eval... ##### State transition tables begin ### racc_action_table = [ 5, 12, -10, 16, 52, 40, -12, 36, 37, 38, 39, 12, -10, 16, 46, 27, 27, 36, 37, 38, 39, 12, -10, 16, 49, 50, 51, 36, 37, 38, 39, 12, -10, 16, 12, 53, 24, 36, 37, 38, 39, 12, -10, 16, 12, 12, -12, 12, -10, 16, 60, 12, -13, 16, 60, 12, 12, 16, 60, 12, 12, 16, 60, 12, 12, 16, 60, 12, 23, 16, 60, 12, 62, 16, 63, 64, 65, 66, 5, 9, 5, 6, 5 ] racc_action_check = [ 21, 21, 21, 21, 39, 23, 21, 21, 21, 21, 21, 25, 25, 25, 27, 19, 25, 25, 25, 25, 25, 45, 45, 45, 36, 37, 38, 45, 45, 45, 45, 29, 29, 29, 24, 41, 16, 29, 29, 29, 29, 7, 7, 7, 46, 49, 7, 13, 13, 13, 62, 62, 13, 62, 53, 53, 50, 53, 63, 63, 51, 63, 64, 64, 52, 64, 65, 65, 15, 65, 66, 66, 54, 66, 55, 56, 57, 58, 11, 6, 3, 1, 0 ] racc_action_pointer = [ 80, 81, nil, 78, nil, nil, 79, 38, nil, nil, nil, 76, nil, 44, nil, 64, 30, nil, nil, 7, nil, -2, nil, 3, 31, 8, nil, 8, nil, 28, nil, nil, nil, nil, nil, nil, 18, 19, 20, -2, nil, 28, nil, nil, nil, 18, 41, nil, nil, 42, 53, 57, 61, 52, 65, 67, 68, 69, 70, nil, nil, nil, 48, 56, 60, 64, 68, nil, nil, nil, nil, nil ] racc_action_default = [ -5, -44, -1, -6, -7, -9, -44, -3, -8, 72, -2, -5, -11, -23, -14, -44, -44, -18, -19, -44, -4, -6, -15, -44, -10, -29, -25, -44, -20, -21, -22, -31, -34, -35, -36, -37, -44, -44, -44, -44, -16, -44, -24, -26, -27, -30, -10, -32, -33, -10, -10, -10, -10, -10, -44, -44, -44, -44, -44, -17, -42, -43, -10, -10, -10, -10, -10, -28, -38, -39, -40, -41 ] racc_goto_table = [ 14, 41, 8, 47, 3, 2, 22, 17, 29, 11, 18, 26, 45, 10, 14, 21, 20, 43, 44, 47, 8, 28, 48, 54, 30, 25, 55, 56, 57, 58, 59, 42, 7, 1, nil, nil, nil, nil, 48, 67, 68, 69, 70, 71 ] racc_goto_check = [ 11, 8, 7, 19, 6, 2, 11, 13, 15, 5, 14, 18, 15, 3, 11, 6, 2, 18, 11, 19, 7, 13, 11, 8, 14, 16, 8, 8, 8, 8, 12, 17, 4, 1, nil, nil, nil, nil, 11, 12, 12, 12, 12, 12 ] racc_goto_pointer = [ nil, 33, 5, 6, 30, 2, 4, -1, -23, nil, nil, -7, -23, 0, 3, -13, 6, 6, -8, -26, nil, nil, nil, nil ] racc_goto_default = [ nil, nil, nil, nil, nil, nil, nil, 4, 15, 19, 13, 61, nil, nil, nil, nil, nil, nil, nil, 31, 32, 33, 34, 35 ] racc_reduce_table = [ 0, 0, :racc_error, 0, 17, :_reduce_1, 3, 14, :_reduce_2, 0, 16, :_reduce_none, 2, 16, :_reduce_none, 0, 15, :_reduce_none, 1, 15, :_reduce_none, 1, 19, :_reduce_none, 2, 19, :_reduce_none, 1, 20, :_reduce_none, 0, 21, :_reduce_none, 1, 21, :_reduce_none, 0, 22, :_reduce_none, 1, 22, :_reduce_none, 1, 23, :_reduce_none, 2, 23, :_reduce_none, 3, 24, :_reduce_none, 5, 24, :_reduce_17, 1, 18, :_reduce_18, 1, 18, :_reduce_19, 3, 18, :_reduce_20, 3, 18, :_reduce_21, 3, 18, :_reduce_none, 1, 27, :_reduce_none, 3, 26, :_reduce_24, 1, 29, :_reduce_25, 2, 29, :_reduce_26, 2, 29, :_reduce_none, 5, 31, :_reduce_28, 0, 30, :_reduce_none, 1, 30, :_reduce_none, 1, 28, :_reduce_31, 2, 28, :_reduce_32, 2, 28, :_reduce_none, 1, 32, :_reduce_none, 1, 32, :_reduce_none, 1, 32, :_reduce_none, 1, 32, :_reduce_none, 5, 33, :_reduce_38, 5, 34, :_reduce_39, 5, 35, :_reduce_40, 5, 36, :_reduce_41, 1, 25, :_reduce_none, 1, 25, :_reduce_none ] racc_reduce_n = 44 racc_shift_n = 72 
racc_token_table = { false => 0, :error => 1, :EOL => 2, :SPACE => 3, :COMMENT => 4, "sitemap" => 5, ":" => 6, :VALUE => 7, "user-agent" => 8, "allow" => 9, "disallow" => 10, "crawl-delay" => 11, :TOKEN => 12 } racc_nt_base = 13 racc_use_result_var = true Racc_arg = [ racc_action_table, racc_action_check, racc_action_default, racc_action_pointer, racc_goto_table, racc_goto_check, racc_goto_default, racc_goto_pointer, racc_nt_base, racc_reduce_table, racc_token_table, racc_shift_n, racc_reduce_n, racc_use_result_var ] Racc_token_to_s_table = [ "$end", "error", "EOL", "SPACE", "COMMENT", "\"sitemap\"", "\":\"", "VALUE", "\"user-agent\"", "\"allow\"", "\"disallow\"", "\"crawl-delay\"", "TOKEN", "$start", "robotstxt", "opt_blanklines", "body", "@1", "records", "blanklines", "blankline", "opt_space", "opt_commentlines", "commentlines", "comment", "eol_opt_comment", "record", "commentblock", "rulelines", "agentlines", "opt_rulelines", "agentline", "ruleline", "allowline", "disallowline", "crawldelayline", "extension" ] Racc_debug_parser = false ##### State transition tables end ##### # reduce 0 omitted module_eval(<<'.,.,', 'robotstxt.ry', 7) def _reduce_1(val, _values, result) @sitemaps = [] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 11) def _reduce_2(val, _values, result) body = val[2] result = RobotsTxt.new(@site, body, :target => @target, :sitemaps => @sitemaps, :crawl_delay_handler => @crawl_delay_handler) result end .,., # reduce 3 omitted # reduce 4 omitted # reduce 5 omitted # reduce 6 omitted # reduce 7 omitted # reduce 8 omitted # reduce 9 omitted # reduce 10 omitted # reduce 11 omitted # reduce 12 omitted # reduce 13 omitted # reduce 14 omitted # reduce 15 omitted # reduce 16 omitted module_eval(<<'.,.,', 'robotstxt.ry', 44) def _reduce_17(val, _values, result) @sitemaps << val[3] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 49) def _reduce_18(val, _values, result) result = [] result << val[0] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 54) def _reduce_19(val, _values, result) result = [] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 60) def _reduce_20(val, _values, result) result << val[2] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 66) def _reduce_21(val, _values, result) val[2].each_with_index { |line, i| warn "%s line %d: %s: orphan rule line" % [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE } result end .,., # reduce 22 omitted # reduce 23 omitted module_eval(<<'.,.,', 'robotstxt.ry', 81) def _reduce_24(val, _values, result) result = Record.new(val[1], val[2]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 86) def _reduce_25(val, _values, result) result = [val[0]] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 91) def _reduce_26(val, _values, result) result << val[1] result end .,., # reduce 27 omitted module_eval(<<'.,.,', 'robotstxt.ry', 98) def _reduce_28(val, _values, result) result = AgentLine.new(val[0], val[3]) result end .,., # reduce 29 omitted # reduce 30 omitted module_eval(<<'.,.,', 'robotstxt.ry', 106) def _reduce_31(val, _values, result) result = [result] @rulelinenos = [] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 112) def _reduce_32(val, _values, result) result << val[1] @rulelinenos << @lineno result end .,., # reduce 33 omitted # reduce 34 omitted # reduce 35 omitted # reduce 36 omitted # reduce 37 omitted module_eval(<<'.,.,', 'robotstxt.ry', 125) def _reduce_38(val, _values, result) result = AllowLine.new(val[0], val[3]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 130) 
def _reduce_39(val, _values, result) result = DisallowLine.new(val[0], val[3]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 135) def _reduce_40(val, _values, result) result = CrawlDelayLine.new(val[0], val[3]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 140) def _reduce_41(val, _values, result) result = ExtentionLine.new(val[0], val[3]) result end .,., # reduce 42 omitted # reduce 43 omitted def _reduce_none(val, _values, result) val[0] end end # class Parser def initialize(site, records, options = nil) @timestamp = Time.now @site = site @options = options || {} @last_checked_at = nil @error = @options[:error] @target = @options[:target] @sitemaps = @options[:sitemaps] || [] @crawl_delay_handler = @options[:crawl_delay_handler] if records && !records.empty? @records, defaults = [], [] records.each { |record| if record.default? defaults << record elsif !@target || record.match?(@target) @records << record end } @records.concat(defaults) else @records = [] end end attr_reader :timestamp, :site, :sitemaps attr_accessor :error def error! raise @error if @error end def target(user_agent = nil) if user_agent raise ArgumentError, "this instance is targeted for #{@target}" if @target user_agent else raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target @target end end private :target def find_record(user_agent = nil) user_agent = target(user_agent) @records.find { |record| record.match?(user_agent) } end private :find_record def allow?(request_uri, user_agent = nil) record = find_record(user_agent) or return true allow = record.allow?(request_uri) if delay = record.delay and @crawl_delay_handler @crawl_delay_handler.call(delay, @last_checked_at) end @last_checked_at = Time.now return allow end def crawl_delay(user_agent = nil) record = find_record(user_agent) or return 0 record.delay or return 0 end def options(user_agent = nil) record = find_record(user_agent) or return {} record.options end DISALLOW_ALL = <<-TXT User-Agent: * Disallow: / TXT def self.unfetchable(site, reason, target = nil) Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt| robots_txt.error = reason } end class Record def initialize(agentlines, rulelines) @patterns = agentlines.map { |agentline| agentline.pattern } @acls = [] @delay = nil @options = {} rulelines.each { |ruleline| case ruleline when AccessControlLine @acls << ruleline when CrawlDelayLine @delay = ruleline.delay else @options[ruleline.token.downcase] = ruleline.value end } if rulelines @acls.replace @acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] } end attr_reader :delay, :options def match?(user_agent) @patterns.any? { |pattern| pattern.match(user_agent) } end def default? @patterns.include?(//) end def allow?(request_uri) @acls.each { |acl| if acl.match?(request_uri) return acl.allow? end } return true end end class Line def initialize(token, value) @token = token @value = value compile end attr_reader :token, :value def compile self end end class AgentLine < Line def compile if @value == '*' @pattern = // else @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE) end self end attr_reader :pattern end class AccessControlLine < Line def compile @empty = @value.empty? re_src = '\A' s = StringScanner.new(@value) until s.eos? 
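        # Translate the robots.txt path pattern into a regexp: literal
        # runs are quoted, percent-escapes are normalized (keeping %2F
        # distinct from a literal slash), "*" becomes ".*", and "$"
        # anchors the match at the end of the path.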
if t = s.scan(/[^%*$]+/) re_src << Regexp.quote(t) elsif t = s.scan(/%([0-9a-f]{2})/i) c = s[1].to_i(16) if c == 0x2f re_src << '%2[fF]' else re_src << Regexp.quote('%c' % c) end elsif t = s.scan(/\*/) re_src << '.*' elsif t = s.scan(/\$/) re_src << '\z' break else re_src << Regexp.quote(s.scan(/./)) end end @pattern = Regexp.new(re_src, Regexp::MULTILINE) self end def match?(request_uri) return false if @empty transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) } !!@pattern.match(transformed) end end class AllowLine < AccessControlLine def allow? true end end class DisallowLine < AccessControlLine def allow? false end end class CrawlDelayLine < Line def compile case @value when /\A((0|[1-9][0-9]*)\.[0-9]+)/ @delay = @value.to_f when /\A(0|[1-9][0-9]*)/ @delay = @value.to_i else @delay = nil end self end attr_reader :delay end class ExtentionLine < Line end end end webrobots-0.1.1/lib/webrobots/nokogiri.rb 0000644 0000041 0000041 00000002061 12166110604 020465 0 ustar www-data www-data require 'nokogiri' class Nokogiri::HTML::Document # Returns an array of lower-cased tokens. If # no tag is found, returns an empty array. An optional # +custom_name+ specifies the name of a meta tag to look for ahead # of "ROBOTS". Names are compared in a case-insensitive manner. def meta_robots(custom_name = nil) (@meta_robots ||= {})[custom_name] = (custom_name && parse_meta_robots(custom_name)) || parse_meta_robots('robots') end # Equivalent to meta_robots(custom_name).include?('noindex'). def noindex?(custom_name = nil) meta_robots(custom_name).include?('noindex') end # Equivalent to meta_robots(custom_name).include?('nofollow'). def nofollow?(custom_name = nil) meta_robots(custom_name).include?('nofollow') end private def parse_meta_robots(custom_name) pattern = /\A#{Regexp.quote(custom_name)}\z/i meta = css('meta[@name]').find { |element| element['name'].match(pattern) } and content = meta['content'] or return [] content.downcase.split(/[,\s]+/) end end
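# Illustrative usage of the Nokogiri extension above (hypothetical input;
# not part of the gem's own files):
#
#   require 'webrobots/nokogiri'
#
#   doc = Nokogiri::HTML('<html><head><meta name="robots" content="noindex, nofollow"></head><body></body></html>')
#   doc.meta_robots  # => ["noindex", "nofollow"]
#   doc.noindex?     # => true
#   doc.nofollow?    # => true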