webrobots-0.1.1/0000755000004100000410000000000012166110604013544 5ustar www-datawww-datawebrobots-0.1.1/.travis.yml0000644000004100000410000000031212166110604015651 0ustar www-datawww-datalanguage: ruby rvm: - 1.8.7 - 1.9.2 - 1.9.3 - 2.0.0 - ree - jruby-18mode - jruby-19mode - rbx-18mode - rbx-19mode matrix: allow_failures: - rvm: rbx-18mode - rvm: rbx-19mode webrobots-0.1.1/test/0000755000004100000410000000000012166110604014523 5ustar www-datawww-datawebrobots-0.1.1/test/test_webrobots.rb0000644000004100000410000005650312166110604020126 0ustar www-datawww-data# -*- coding: utf-8 -*- require 'helper' class TestWebRobots < Test::Unit::TestCase context "robots.txt with no rules" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| case uri.to_s when 'http://site1.example.org/robots.txt' <<-'TXT' TXT when 'http://site2.example.org/robots.txt' <<-'TXT' TXT when 'http://site3.example.org/robots.txt' <<-'TXT' #comment TXT when 'http://site4.example.org/robots.txt' <<-'TXT' #comment TXT else raise "#{uri} is not supposed to be fetched" end }) end should "allow any robot" do assert @robots.allowed?('http://site1.example.org/index.html') assert @robots.allowed?('http://site1.example.org/private/secret.txt') assert @robots.allowed?('http://site2.example.org/index.html') assert @robots.allowed?('http://site2.example.org/private/secret.txt') assert @robots.allowed?('http://site3.example.org/index.html') assert @robots.allowed?('http://site3.example.org/private/secret.txt') assert @robots.allowed?('http://site4.example.org/index.html') assert @robots.allowed?('http://site4.example.org/private/secret.txt') end end context "robots.txt that cannot be fetched" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| case uri.to_s when 'http://site1.example.org/robots.txt' raise Net::HTTPFatalError.new( 'Internal Server Error', Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error')) when 'http://site2.example.org/robots.txt' raise Net::HTTPRetriableError.new( 'Found', Net::HTTPFound.new('1.1', '302', 'Found')) when 'http://site3.example.org/robots.txt' raise Errno::ECONNREFUSED when 'http://site4.example.org/robots.txt' raise SocketError, "getaddrinfo: nodename nor servname provided, or not known" when 'http://site5.example.org/robots.txt' nil else raise "#{uri} is not supposed to be fetched" end }) end should "disallow any robot" do assert @robots.disallowed?('http://site1.example.org/index.html') assert @robots.disallowed?('http://site1.example.org/private/secret.txt') assert @robots.disallowed?('http://site2.example.org/index.html') assert @robots.disallowed?('http://site2.example.org/private/secret.txt') assert @robots.disallowed?('http://site3.example.org/index.html') assert @robots.disallowed?('http://site3.example.org/private/secret.txt') assert @robots.disallowed?('http://site4.example.org/index.html') assert @robots.disallowed?('http://site4.example.org/private/secret.txt') assert @robots.disallowed?('http://site5.example.org/index.html') assert @robots.disallowed?('http://site5.example.org/private/secret.txt') end end context "robots.txt with some rules" do setup do http_get = lambda { |uri| case uri.to_s when 'http://www.example.org/robots.txt' <<-'TXT' # Punish evil bots User-Agent: evil Disallow: / Disallow-Not: / # parser teaser User-Agent: good # Be generous to good bots Disallow: /2heavy/ Allow: /2heavy/*.htm Disallow: /2heavy/*.htm$ User-Agent: * Disallow: /2heavy/ Disallow: /index.html # Allow takes precedence over Disallow if 
the pattern lengths are the same. Allow: /index.html TXT when 'http://www.example.com/robots.txt' <<-'TXT' # Default rule is evaluated last even if it is put first. User-Agent: * Disallow: /2heavy/ Disallow: /index.html # Allow takes precedence over Disallow if the pattern lengths are the same. Allow: /index.html # Punish evil bots User-Agent: evil Disallow: / User-Agent: good # Be generous to good bots Disallow: /2heavy/ Allow: /2heavy/*.htm Disallow: /2heavy/*.htm$ TXT when 'http://koster1.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /tmp TXT when 'http://koster2.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /tmp/ TXT when 'http://koster3.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a%3cd.html TXT when 'http://koster4.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a%3Cd.html TXT when 'http://koster5.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a%2fb.html TXT when 'http://koster6.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /a/b.html TXT when 'http://koster7.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /%7ejoe/index.html TXT when 'http://koster8.example.net/robots.txt' <<-'TXT' User-Agent: * Disallow: /~joe/index.html TXT else raise "#{uri} is not supposed to be fetched" end } @robots = WebRobots.new('RandomBot', :http_get => http_get) @robots_good = WebRobots.new('GoodBot', :http_get => http_get) @robots_evil = WebRobots.new('EvilBot', :http_get => http_get) end should "properly restrict access" do assert_nothing_raised { assert @robots_good.allowed?('http://www.example.org/index.html') } assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php') assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php') assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php')) assert @robots_good.allowed?('http://www.example.org/2heavy/index.html') assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html') assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm') assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm') assert !@robots_evil.allowed?('http://www.example.org/index.html') assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php') assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html') assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm') assert @robots.allowed?('http://www.example.org/index.html') assert !@robots.allowed?('http://www.example.org/2heavy/index.php') assert !@robots.allowed?('http://www.example.org/2heavy/index.html') assert !@robots.allowed?('http://www.example.org/2heavy/index.htm') assert @robots_good.allowed?('http://www.example.com/index.html') assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php') assert @robots_good.allowed?('http://www.example.com/2heavy/index.html') assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm') assert !@robots_evil.allowed?('http://www.example.com/index.html') assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php') assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html') assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm') assert @robots.allowed?('http://www.example.com/index.html') assert !@robots.allowed?('http://www.example.com/2heavy/index.php') assert !@robots.allowed?('http://www.example.com/2heavy/index.html') assert !@robots.allowed?('http://www.example.com/2heavy/index.htm') end should "follow what is said in Koster's 
draft" do assert @robots.disallowed?('http://koster1.example.net/tmp') assert @robots.disallowed?('http://koster1.example.net/tmp.html') assert @robots.disallowed?('http://koster1.example.net/tmp/a.html') assert !@robots.disallowed?('http://koster2.example.net/tmp') assert @robots.disallowed?('http://koster2.example.net/tmp/') assert @robots.disallowed?('http://koster2.example.net/tmp/a.html') assert @robots.disallowed?('http://koster3.example.net/a%3cd.html') assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html') assert @robots.disallowed?('http://koster4.example.net/a%3cd.html') assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html') assert @robots.disallowed?('http://koster5.example.net/a%2fb.html') assert !@robots.disallowed?('http://koster5.example.net/a/b.html') assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html') assert @robots.disallowed?('http://koster6.example.net/a/b.html') assert @robots.disallowed?('http://koster7.example.net/~joe/index.html') assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html') end end context "robots.txt with errors" do setup do @turn1 = @turn2 = 0 @http_get = lambda { |uri| case uri.to_s when 'http://www.example.org/robots.txt' if (@turn1 += 1) % 2 == 1 <<-'TXT' # some comment User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT else <<-'TXT' # some comment User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html # User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT end when 'http://www.example.com/robots.txt' if (@turn2 += 1) % 2 == 1 <<-'TXT' # some comment #User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT else <<-'TXT' # some comment User-Agent: thebot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html User-Agent: anotherbot # Disallow: / Disallow: /2heavy/ # Allow: /2heavy/notsoheavy Allow: /2heavy/*.html TXT end else raise "#{uri} is not supposed to be fetched" end } end should "raise ParseError" do robots = WebRobots.new('TheBot', :http_get => @http_get) url = 'http://www.example.org/2heavy/index.php' assert_nil robots.error(url) assert !robots.allowed?(url) assert_nothing_raised { robots.error!(url) } robots.reset(url) assert robots.allowed?(url) error = robots.error(url) assert_instance_of WebRobots::ParseError, error assert_equal URI('http://www.example.org/'), error.site assert_raise(WebRobots::ParseError) { robots.error!(url) } robots.reset(url) assert_nil robots.error(url) assert !robots.allowed?(url) assert_nothing_raised { robots.error!(url) } url = 'http://www.example.com/2heavy/index.php' assert robots.allowed?(url) assert_instance_of WebRobots::ParseError, robots.error(url) assert_raise(WebRobots::ParseError) { robots.error!(url) } robots.reset(url) assert_nil robots.error(url) assert !robots.allowed?(url) assert_nothing_raised { robots.error!(url) } robots.reset(url) assert robots.allowed?(url) assert_instance_of WebRobots::ParseError, robots.error(url) assert_raise(WebRobots::ParseError) { robots.error!(url) } end end context "robots.txt with options" do setup do http_get = lambda { |uri| case uri.to_s when 
'http://www.example.org/robots.txt' <<-'TXT' Sitemap: http://www.example.org/sitemap-host1.xml Sitemap: http://www.example.org/sitemap-host2.xml User-Agent: MyBot Disallow: /2heavy/ Allow: /2heavy/*.html Option1: Foo Option2: Hello Crawl-Delay: 1.5 User-Agent: HerBot Disallow: /2heavy/ Allow: /2heavy/*.html Option1: Baz Option2: Qux User-Agent: * Disallow: /2heavy/ Allow: /2heavy/*.html # These are wrong but should be allowed Allow: /2heavy/% Crawl-Delay: # Option1: Bar Option3: Hi TXT else raise "#{uri} is not supposed to be fetched" end } @robots_mybot = WebRobots.new('MyBot', :http_get => http_get) @robots_mybot_ignore = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => :ignore) @robots_mybot_custom = WebRobots.new('MyBot', :http_get => http_get, :crawl_delay => proc { |*args| @delay_args = args }) @robots_herbot = WebRobots.new('HerBot', :http_get => http_get) @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get) end should "read options" do options = @robots_mybot.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1') assert_equal 'Foo', options['option1'] assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2') assert_equal 'Hello', options['option2'] options = @robots_mybot_ignore.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Foo', @robots_mybot_ignore.option('http://www.example.org/', 'Option1') assert_equal 'Foo', options['option1'] assert_equal 'Hello', @robots_mybot_ignore.option('http://www.example.org/', 'Option2') assert_equal 'Hello', options['option2'] options = @robots_mybot_custom.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Foo', @robots_mybot_custom.option('http://www.example.org/', 'Option1') assert_equal 'Foo', options['option1'] assert_equal 'Hello', @robots_mybot_custom.option('http://www.example.org/', 'Option2') assert_equal 'Hello', options['option2'] options = @robots_herbot.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Baz', @robots_herbot.option('http://www.example.org/', 'Option1') assert_equal 'Baz', options['option1'] assert_equal 'Qux', @robots_herbot.option('http://www.example.org/', 'Option2') assert_equal 'Qux', options['option2'] options = @robots_hisbot.options('http://www.example.org/') assert_equal 2, options.size assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1') assert_equal 'Bar', options['option1'] assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3') assert_equal 'Hi', options['option3'] assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_mybot.sitemaps('http://www.example.org/') assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_mybot_ignore.sitemaps('http://www.example.org/') assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_herbot.sitemaps('http://www.example.org/') assert_equal %w[ http://www.example.org/sitemap-host1.xml http://www.example.org/sitemap-host2.xml ], @robots_hisbot.sitemaps('http://www.example.org/') assert_equal 1.5, @robots_mybot.crawl_delay('http://www.example.org/') assert_equal 1.5, @robots_mybot_ignore.crawl_delay('http://www.example.org/') assert_equal 1.5, @robots_mybot_custom.crawl_delay('http://www.example.org/') assert_equal 0, 
@robots_herbot.crawl_delay('http://www.example.org/') assert_equal 0, @robots_hisbot.crawl_delay('http://www.example.org/') t1 = Time.now @robots_mybot.allowed?('http://www.example.org/') @robots_mybot.allowed?('http://www.example.org/article1.html') t2 = Time.now assert_in_delta 1.5, t2 - t1, 0.1 @robots_mybot.allowed?('http://www.example.org/article2.html') t3 = Time.now assert_in_delta 1.5, t3 - t2, 0.1 t1 = Time.now @robots_mybot_ignore.allowed?('http://www.example.org/') @robots_mybot_ignore.allowed?('http://www.example.org/article1.html') t2 = Time.now assert_in_delta 0, t2 - t1, 0.1 @robots_mybot_ignore.allowed?('http://www.example.org/article2.html') t3 = Time.now assert_in_delta 0, t3 - t2, 0.1 t1 = Time.now @robots_mybot_custom.allowed?('http://www.example.org/') @robots_mybot_custom.allowed?('http://www.example.org/article1.html') t2 = Time.now assert_in_delta 0, t2 - t1, 0.1 assert_instance_of Array, @delay_args assert_equal 2, @delay_args.size assert_equal 1.5, @delay_args[0] assert_instance_of Time, @delay_args[1] end end context "robots.txt with options" do setup do http_get = lambda { |uri| case uri.to_s when 'http://www.example.org/robots.txt' <<-'TXT' User-Agent: * Disallow: / TXT else raise "#{uri} is not supposed to be fetched" end } @robots = WebRobots.new('RandomBot', :http_get => http_get) end should "validate URI" do assert_raise(ArgumentError) { @robots.allowed?('www.example.org/') } assert_raise(ArgumentError) { @robots.allowed?('::/home/knu') } end end context "robots.txt in the real world" do setup do @testbot = WebRobots.new('TestBot') @msnbot = WebRobots.new('TestMSNBot') # matches msnbot end should "be parsed for major sites" do assert_nothing_raised { assert !@testbot.allowed?("http://www.google.com/search") assert !@testbot.allowed?("https://www.google.com/search") assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln") assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6") } assert_nothing_raised { assert @testbot.allowed?("http://www.yahoo.com/") assert !@testbot.allowed?("http://www.yahoo.com/?") assert !@testbot.allowed?("http://www.yahoo.com/p/foo") } assert_nothing_raised { assert !@testbot.allowed?("http://store.apple.com/vieworder") assert @msnbot.allowed?("http://store.apple.com/vieworder") } assert_nothing_raised { assert !@testbot.allowed?("http://github.com/login") } end end context "meta robots tag" do setup do @doc = Nokogiri::HTML(<<-HTML) test HTML end should "be properly parsed when given in HTML string" do assert !@doc.noindex? assert @doc.nofollow? assert @doc.noindex?('slurp') assert @doc.nofollow?('slurp') assert @doc.noindex?('googlebot') assert !@doc.nofollow?('googlebot') assert @doc.meta_robots('googlebot').include?('noarchive') end end class Agent def initialize @robots = WebRobots.new 'agent', :http_get => method(:get) end def get uri @robots.allowed? 
uri if uri.request_uri == '/robots.txt' then '' else 'content' end end end context "embedded in a user-agent" do setup do @agent = Agent.new end should "fetch robots.txt" do body = @agent.get URI.parse 'http://example/robots.html' assert_equal 'content', body end end context "robots.txt with a space at the end of the last line" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| res = case uri.to_s when 'http://site1.example.com/robots.txt' <<-'TXT' User-agent: * Request-rate: 1/30 Disallow: /util/ Sitemap: http://site1.example.com/text/sitemap.xml TXT when 'http://site2.example.com/robots.txt' <<-'TXT' User-agent: * Request-rate: 1/30 Disallow: /util/ Sitemap: http://site2.example.com/text/sitemap.xml TXT else raise "#{uri} is not supposed to be fetched" end # This chomp is actually key to the test. Remove the final EOL. # The final line should be the one ending with the space. res.chomp }) end should "be properly parsed" do assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/")) assert_equal(["http://site2.example.com/text/sitemap.xml"], @robots.sitemaps("http://site2.example.com/")) end end context "robots.txt cache" do setup do @fetched = false @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| case uri.to_s when 'http://site1.example.org/robots.txt' @fetched = true <<-'TXT' User-Agent: * Disallow: /foo TXT when 'http://site2.example.org/robots.txt' @fetched = true nil end }) end should "persist unless cache is cleared" do assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert @fetched @fetched = false assert @robots.allowed?('http://site1.example.org/bar') assert !@fetched assert @robots.allowed?('http://site1.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert !@fetched @robots.flush_cache assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert @fetched @fetched = false assert @robots.allowed?('http://site1.example.org/bar') assert !@fetched assert @robots.allowed?('http://site1.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site1.example.org/foo') assert !@fetched end should "persist for non-existent robots.txt unless cache is cleared" do assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert @fetched @fetched = false assert !@robots.allowed?('http://site2.example.org/bar') assert !@fetched assert !@robots.allowed?('http://site2.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert !@fetched @robots.flush_cache assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert @fetched @fetched = false assert !@robots.allowed?('http://site2.example.org/bar') assert !@fetched assert !@robots.allowed?('http://site2.example.org/baz') assert !@fetched assert !@robots.allowed?('http://site2.example.org/foo') assert !@fetched end end context "robots.txt with just user-agent & sitemap and no blank line between them" do setup do @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri| res = case uri.to_s when 'http://site1.example.com/robots.txt' <<-'TXT' User-agent: * Sitemap: http://site1.example.com/text/sitemap.xml TXT else raise "#{uri} is not supposed to be fetched" end }) end should "be properly parsed" do assert @robots.allowed?("http://site1.example.com/foo") assert_equal(["http://site1.example.com/text/sitemap.xml"], @robots.sitemaps("http://site1.example.com/")) end end end 
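# A standalone sketch of the custom :crawl_delay handler hook exercised in the
# "robots.txt with options" context above. It is an illustration only, not part of
# the test suite; the bot name, URL, and output are made up, and the robots.txt
# would be fetched over the network since no :http_get is given.
#
#   robots = WebRobots.new('MyBot',
#     :crawl_delay => proc { |delay, last_checked_at|
#       # delay is the Crawl-delay value in seconds; last_checked_at is the Time
#       # of the previous allowed?/disallowed? call, or nil on the first call.
#       puts "robots.txt asks for a #{delay}s pause (last checked: #{last_checked_at.inspect})"
#     })
#   robots.allowed?('http://www.example.org/')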
webrobots-0.1.1/test/helper.rb0000644000004100000410000000065612166110604016336 0ustar www-datawww-datarequire 'rubygems' require 'bundler' begin Bundler.setup(:default, :development) rescue Bundler::BundlerError => e $stderr.puts e.message $stderr.puts "Run `bundle install` to install missing gems" exit e.status_code end require 'test/unit' require 'shoulda' $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) $LOAD_PATH.unshift(File.dirname(__FILE__)) require 'webrobots' class Test::Unit::TestCase end webrobots-0.1.1/README.rdoc0000644000004100000410000000223112166110604015350 0ustar www-datawww-data= webrobots This is a library to help write robots.txt compliant web robots. == Usage require 'webrobots' require 'uri' require 'net/http' robots = WebRobots.new('MyBot/1.0') uri = URI('http://digg.com/news/24hr') if robots.disallowed?(uri) STDERR.puts "Access disallowed: #{uri}" exit 1 end body = Net::HTTP.get(uri) # ... == Requirements - Ruby 1.8.7 or 1.9.2+ == Contributing to webrobots * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet * Check out the issue tracker to make sure someone hasn't already requested and/or contributed it * Fork the project * Start a feature/bugfix branch * Commit and push until you are happy with your contribution * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally. * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it. == Copyright Copyright (c) 2010, 2011, 2012, 2013 Akinori MUSHA. See LICENSE.txt for further details. webrobots-0.1.1/Rakefile0000644000004100000410000000112312166110604015206 0ustar www-datawww-data# encoding: utf-8 require 'bundler/gem_tasks' gemspec = Bundler::GemHelper.gemspec require 'rake/testtask' Rake::TestTask.new(:test) do |test| test.libs << 'test' test.test_files = gemspec.test_files test.verbose = true end require 'rdoc/task' Rake::RDocTask.new do |rdoc| rdoc.rdoc_dir = 'rdoc' rdoc.title = "#{gemspec.name} #{gemspec.version}" rdoc.rdoc_files.include(gemspec.extra_rdoc_files) rdoc.rdoc_files.include('lib/**/*.rb') end task :default => :test task :test => 'lib/webrobots/robotstxt.rb' rule '.rb' => ['.ry'] do |t| sh 'racc', '-o', t.name, t.source end webrobots-0.1.1/webrobots.gemspec0000644000004100000410000000231312166110604017116 0ustar www-datawww-data# -*- encoding: utf-8 -*- $:.push File.expand_path("../lib", __FILE__) require "webrobots/version" Gem::Specification.new do |s| s.name = "webrobots" s.version = Webrobots::VERSION s.authors = ["Akinori MUSHA"] s.email = ["knu@idaemons.org"] s.homepage = %q{https://github.com/knu/webrobots} s.licenses = [%q{2-clause BSDL}] s.summary = %q{A Ruby library to help write robots.txt compliant web robots} s.description = <<-'EOS' This library helps write robots.txt compliant web robots in Ruby. 
EOS s.files = `git ls-files`.split("\n") s.test_files = s.files.grep(%r{/test_[^/]+\.rb$}) s.executables = s.files.grep(%r{^bin/[^.]}).map{ |f| File.basename(f) } s.require_paths = ["lib"] s.extra_rdoc_files = [ "LICENSE.txt", "README.rdoc" ] s.rdoc_options += [ '--exclude', '\.ry$' ] s.add_development_dependency("rake", [">= 0.9.2.2"]) s.add_development_dependency("racc", [">= 0"]) unless RUBY_PLATFORM == "java" s.add_development_dependency("shoulda", [">= 0"]) s.add_development_dependency("rdoc", ["> 2.4.2"]) s.add_development_dependency("bundler", [">= 1.2"]) s.add_development_dependency("nokogiri", [">= 1.4.4"]) end webrobots-0.1.1/LICENSE.txt0000644000004100000410000000242312166110604015370 0ustar www-datawww-dataCopyright (c) 2010, 2011, 2012, 2013 Akinori MUSHA All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. webrobots-0.1.1/checksums.yaml.gz0000444000004100000410000000041512166110604017032 0ustar www-datawww-data
webrobots-0.1.1/.document0000644000004100000410000000005212166110604015360 0ustar www-datawww-dataLICENSE.txt README.rdoc bin/* lib/**/*.rb webrobots-0.1.1/metadata.yml0000644000004100000410000000666512166110604016060 0ustar www-datawww-data--- !ruby/object:Gem::Specification name: webrobots version: !ruby/object:Gem::Version version: 0.1.1 platform: ruby authors: - Akinori MUSHA autorequire: bindir: bin cert_chain: [] date: 2013-03-15 00:00:00.000000000 Z dependencies: - !ruby/object:Gem::Dependency name: rake requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 0.9.2.2 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 0.9.2.2 - !ruby/object:Gem::Dependency name: racc requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' - !ruby/object:Gem::Dependency name: shoulda requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' - !ruby/object:Gem::Dependency name: rdoc requirement: !ruby/object:Gem::Requirement requirements: - - '>' - !ruby/object:Gem::Version version: 2.4.2 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>' - !ruby/object:Gem::Version version: 2.4.2 - !ruby/object:Gem::Dependency name: bundler requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '1.2' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '1.2' - !ruby/object:Gem::Dependency name: nokogiri requirement: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 1.4.4 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: 1.4.4 description: | This library helps write robots.txt compliant web robots in Ruby. 
email: - knu@idaemons.org executables: [] extensions: [] extra_rdoc_files: - LICENSE.txt - README.rdoc files: - .document - .gitignore - .travis.yml - Gemfile - LICENSE.txt - README.rdoc - Rakefile - lib/webrobots.rb - lib/webrobots/nokogiri.rb - lib/webrobots/robotstxt.rb - lib/webrobots/robotstxt.ry - lib/webrobots/version.rb - test/helper.rb - test/test_webrobots.rb - webrobots.gemspec homepage: https://github.com/knu/webrobots licenses: - 2-clause BSDL metadata: {} post_install_message: rdoc_options: - --exclude - \.ry$ require_paths: - lib required_ruby_version: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' required_rubygems_version: !ruby/object:Gem::Requirement requirements: - - '>=' - !ruby/object:Gem::Version version: '0' requirements: [] rubyforge_project: rubygems_version: 2.0.3 signing_key: specification_version: 4 summary: A Ruby library to help write robots.txt compliant web robots test_files: - test/test_webrobots.rb webrobots-0.1.1/Gemfile0000644000004100000410000000013512166110604015036 0ustar www-datawww-datasource "http://rubygems.org" # Specify your gem's dependencies in webrobots.gemspec gemspec webrobots-0.1.1/.gitignore0000644000004100000410000000013512166110604015533 0ustar www-datawww-data*.gem .bundle Gemfile.lock pkg/* coverage rdoc doc .yardoc /lib/webrobots/robotstxt.output webrobots-0.1.1/lib/0000755000004100000410000000000012166110604014312 5ustar www-datawww-datawebrobots-0.1.1/lib/webrobots.rb0000644000004100000410000001300212166110604016641 0ustar www-datawww-datarequire 'webrobots/version' require 'webrobots/robotstxt' require 'uri' require 'net/https' require 'thread' if defined?(Nokogiri) require 'webrobots/nokogiri' else autoload :Nokogiri, 'webrobots/nokogiri' end class WebRobots # Creates a WebRobots object for a robot named +user_agent+, with # optional +options+. # # * :http_get => a custom method, proc, or anything that responds to # .call(uri), to be used for fetching robots.txt. It must return # the response body if successful, return an empty string if the # resource is not found, and return nil or raise any error on # failure. Redirects should be handled within this proc. # # * :crawl_delay => determines how to react to Crawl-delay # directives. If +:sleep+ is given, WebRobots sleeps as demanded # when allowed?(url)/disallowed?(url) is called. This is the # default behavior. If +:ignore+ is given, WebRobots does # nothing. If a custom method, proc, or anything that responds to # .call(delay, last_checked_at), it is called. def initialize(user_agent, options = nil) @user_agent = user_agent options ||= {} @http_get = options[:http_get] || method(:http_get) crawl_delay_handler = case value = options[:crawl_delay] || :sleep when :ignore nil when :sleep method(:crawl_delay_handler) else if value.respond_to?(:call) value else raise ArgumentError, "invalid Crawl-delay handler: #{value.inspect}" end end @parser = RobotsTxt::Parser.new(user_agent, crawl_delay_handler) @parser_mutex = Mutex.new @robotstxt = create_cache() end # :nodoc: def create_cache Hash.new # Must respond to [], []=, delete and clear. end # Flushes robots.txt cache. def flush_cache @robotstxt.clear end # Returns the robot name initially given. attr_reader :user_agent # Tests if the robot is allowed to access a resource at +url+. If a # malformed URI string is given, URI::InvalidURIError is raised. If # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is # raised. 
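  #
  # A minimal usage sketch (the bot name and URLs are illustrative only):
  #
  #   robots = WebRobots.new('MyBot/1.0')
  #   robots.allowed?('http://www.example.org/index.html')    # consults http://www.example.org/robots.txt
  #   robots.allowed?(URI('http://www.example.org/private/')) # URI objects are accepted as well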
def allowed?(url) site, request_uri = split_uri(url) return true if request_uri == '/robots.txt' robots_txt = get_robots_txt(site) robots_txt.allow?(request_uri) end # Equivalent to !allowed?(url). def disallowed?(url) !allowed?(url) end # Returns the number of seconds that the configured agent should wait # between successive requests to the site identified by +url+ according # to the site's robots.txt +Crawl-delay+ directive. def crawl_delay(url) robots_txt_for(url).crawl_delay() end # Returns extended option values for a resource at +url+ in a hash # with each field name lower-cased. See allowed?() for a list of # errors that may be raised. def options(url) robots_txt_for(url).options end # Equivalent to option(url)[token.downcase]. def option(url, token) options(url)[token.downcase] end # Returns an array of Sitemap URLs. See allowed?() for a list of # errors that may be raised. def sitemaps(url) robots_txt_for(url).sitemaps end # Returns an error object if there is an error in fetching or # parsing robots.txt of the site +url+. def error(url) robots_txt_for(url).error end # Raises the error if there was an error in fetching or parsing # robots.txt of the site +url+. def error!(url) robots_txt_for(url).error! end # Removes robots.txt cache for the site +url+. def reset(url) site, = split_uri(url) @robotstxt.delete(site) end private def split_uri(url) site = if url.is_a?(URI) url.dup else begin URI.parse(url) rescue => e raise ArgumentError, e.message end end site.scheme && site.host or raise ArgumentError, "non-absolute URI: #{url}" site.is_a?(URI::HTTP) or raise ArgumentError, "non-HTTP/HTTPS URI: #{url}" request_uri = site.request_uri if (host = site.host).match(/[[:upper:]]/) site.host = host.downcase end site.path = '/' return site, request_uri end def robots_txt_for(url) site, = split_uri(url) get_robots_txt(site) end def get_robots_txt(site) @robotstxt[site] ||= fetch_robots_txt(site) end def fetch_robots_txt(site) begin body = @http_get.call(site + 'robots.txt') or raise 'robots.txt unfetchable' rescue => e return RobotsTxt.unfetchable(site, e, @user_agent) end @parser_mutex.synchronize { @parser.parse!(body, site) } end def http_get(uri) referer = nil 10.times { http = Net::HTTP.new(uri.host, uri.port) if http.use_ssl = uri.is_a?(URI::HTTPS) http.verify_mode = OpenSSL::SSL::VERIFY_PEER http.cert_store = OpenSSL::X509::Store.new.tap { |store| store.set_default_paths } end header = { 'User-Agent' => @user_agent } header['Referer'] = referer if referer # header is destroyed by this in ruby 1.9.2! 
response = http.get(uri.request_uri, header) case response when Net::HTTPSuccess return response.body when Net::HTTPRedirection referer = uri.to_s uri = URI(response['location']) when Net::HTTPNotFound return '' else response.value end } raise 'too many HTTP redirects' end def crawl_delay_handler(delay, last_checked_at) if last_checked_at delay -= Time.now - last_checked_at sleep delay if delay > 0 end end end webrobots-0.1.1/lib/webrobots/0000755000004100000410000000000012166110604016320 5ustar www-datawww-datawebrobots-0.1.1/lib/webrobots/version.rb0000644000004100000410000000005112166110604020326 0ustar www-datawww-datamodule Webrobots VERSION = "0.1.1" end webrobots-0.1.1/lib/webrobots/robotstxt.ry0000644000004100000410000002460712166110604020755 0ustar www-datawww-data# -*- coding: utf-8 -*- class Parser rule robotstxt : opt_blanklines { @sitemaps = [] } body { body = val[2] result = RobotsTxt.new(@site, body, :target => @target, :sitemaps => @sitemaps, :crawl_delay_handler => @crawl_delay_handler) } body : | records opt_blanklines opt_blanklines : | blanklines blanklines : blankline | blanklines blankline blankline : EOL opt_space : | SPACE opt_commentlines : | commentlines commentlines : comment | commentlines comment comment : opt_space COMMENT EOL | 'sitemap' ':' opt_space VALUE eol_opt_comment { @sitemaps << val[3] } records : record { result = [] result << val[0] } | commentblock { result = [] } | records blanklines record { result << val[2] } | records blanklines rulelines { val[2].each_with_index { |line, i| warn "%s line %d: %s: orphan rule line" % [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE } } | records blanklines commentblock commentblock : commentlines record : opt_commentlines agentlines opt_rulelines { result = Record.new(val[1], val[2]) } agentlines : agentline { result = [val[0]] } | agentlines agentline { result << val[1] } | agentlines comment agentline : 'user-agent' ':' opt_space VALUE eol_opt_comment { result = AgentLine.new(val[0], val[3]) } opt_rulelines : | rulelines rulelines : ruleline { result = [result] @rulelinenos = [] } | rulelines ruleline { result << val[1] @rulelinenos << @lineno } | rulelines comment ruleline : allowline | disallowline | crawldelayline | extension allowline : 'allow' ':' opt_space VALUE eol_opt_comment { result = AllowLine.new(val[0], val[3]) } disallowline : 'disallow' ':' opt_space VALUE eol_opt_comment { result = DisallowLine.new(val[0], val[3]) } crawldelayline : 'crawl-delay' ':' opt_space VALUE eol_opt_comment { result = CrawlDelayLine.new(val[0], val[3]) } extension : TOKEN ':' opt_space VALUE eol_opt_comment { result = ExtentionLine.new(val[0], val[3]) } eol_opt_comment : EOL | comment ---- header require 'strscan' class WebRobots class Error < StandardError end class ParseError < Error # The site's root URI attr_reader :site def initialize(message, site) @message = message @site = site end def to_s @message end end class RobotsTxt ---- inner def initialize(target, crawl_delay_handler = nil) super() @target = target @crawl_delay_handler = crawl_delay_handler end def parse!(input, site) parse(input, site) rescue Error => e RobotsTxt.new(site, nil, :error => e, :target => @target, :crawl_delay_handler => @crawl_delay_handler) end KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap] RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i def parse(input, site) @q ||= [] @errors = [] @lineno = 0 @site = site string = input.respond_to?(:read) ? 
input.read : input s = StringScanner.new(string) value_expected = false until s.eos? @lineno += 1 if s.bol? if t = s.scan(/[ \t]*(?:\r?\n|\z)/) if value_expected @q << [:VALUE, ''] end @q << [:EOL, t] value_expected = false elsif t = s.scan(/[ \t]+/) @q << [:SPACE, t] elsif t = s.scan(/:/) @q << [t, t] value_expected = true elsif t = s.scan(/#.*/) if value_expected @q << [:VALUE, ''] end @q << [:COMMENT, t] else if value_expected if t = s.scan(/.*?(?=[ \t]*(?:#|$))/) @q << [:VALUE, t] else parse_error @lineno, "unexpected characters: %s" % s.check(/.*/) end value_expected = false elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/) case t when RE_KNOWN_TOKENS @q << [t.downcase, t] else @q << [:TOKEN, t] end else parse_error "unexpected characters: %s" % s.check(/.*/) end end end @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL @pos = -1 do_parse rescue Racc::ParseError => e raise ParseError.new(e.message, @site) ensure @q.clear end def next_token @q[@pos += 1] end def on_error(token_id, value, stack) parse_error "unexpected %s: %s" % [token_to_str(token_id), value] end def parse_error(message) message = "%s line %d: %s" % [@site.to_s, @lineno, message] if @lax @errors << message else raise Racc::ParseError, message end end ---- footer def initialize(site, records, options = nil) @timestamp = Time.now @site = site @options = options || {} @last_checked_at = nil @error = @options[:error] @target = @options[:target] @sitemaps = @options[:sitemaps] || [] @crawl_delay_handler = @options[:crawl_delay_handler] if records && !records.empty? @records, defaults = [], [] records.each { |record| if record.default? defaults << record elsif !@target || record.match?(@target) @records << record end } @records.concat(defaults) else @records = [] end end attr_reader :timestamp, :site, :sitemaps attr_accessor :error def error! raise @error if @error end def target(user_agent = nil) if user_agent raise ArgumentError, "this instance is targeted for #{@target}" if @target user_agent else raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target @target end end private :target def find_record(user_agent = nil) user_agent = target(user_agent) @records.find { |record| record.match?(user_agent) } end private :find_record def allow?(request_uri, user_agent = nil) record = find_record(user_agent) or return true allow = record.allow?(request_uri) if delay = record.delay and @crawl_delay_handler @crawl_delay_handler.call(delay, @last_checked_at) end @last_checked_at = Time.now return allow end def crawl_delay(user_agent = nil) record = find_record(user_agent) or return 0 record.delay or return 0 end def options(user_agent = nil) record = find_record(user_agent) or return {} record.options end DISALLOW_ALL = <<-TXT User-Agent: * Disallow: / TXT def self.unfetchable(site, reason, target = nil) Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt| robots_txt.error = reason } end class Record def initialize(agentlines, rulelines) @patterns = agentlines.map { |agentline| agentline.pattern } @acls = [] @delay = nil @options = {} rulelines.each { |ruleline| case ruleline when AccessControlLine @acls << ruleline when CrawlDelayLine @delay = ruleline.delay else @options[ruleline.token.downcase] = ruleline.value end } if rulelines @acls.replace @acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] } end attr_reader :delay, :options def match?(user_agent) @patterns.any? { |pattern| pattern.match(user_agent) } end def default? 
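      # AgentLine#compile maps "User-agent: *" to the empty regexp //, so a record
      # is the default (catch-all) record when any of its agent patterns is that
      # empty regexp.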
@patterns.include?(//) end def allow?(request_uri) @acls.each { |acl| if acl.match?(request_uri) return acl.allow? end } return true end end class Line def initialize(token, value) @token = token @value = value compile end attr_reader :token, :value def compile self end end class AgentLine < Line def compile if @value == '*' @pattern = // else @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE) end self end attr_reader :pattern end class AccessControlLine < Line def compile @empty = @value.empty? re_src = '\A' s = StringScanner.new(@value) until s.eos? if t = s.scan(/[^%*$]+/) re_src << Regexp.quote(t) elsif t = s.scan(/%([0-9a-f]{2})/i) c = s[1].to_i(16) if c == 0x2f re_src << '%2[fF]' else re_src << Regexp.quote('%c' % c) end elsif t = s.scan(/\*/) re_src << '.*' elsif t = s.scan(/\$/) re_src << '\z' break else re_src << Regexp.quote(s.scan(/./)) end end @pattern = Regexp.new(re_src, Regexp::MULTILINE) self end def match?(request_uri) return false if @empty transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) } !!@pattern.match(transformed) end end class AllowLine < AccessControlLine def allow? true end end class DisallowLine < AccessControlLine def allow? false end end class CrawlDelayLine < Line def compile case @value when /\A((0|[1-9][0-9]*)\.[0-9]+)/ @delay = @value.to_f when /\A(0|[1-9][0-9]*)/ @delay = @value.to_i else @delay = nil end self end attr_reader :delay end class ExtentionLine < Line end end end webrobots-0.1.1/lib/webrobots/robotstxt.rb0000644000004100000410000004222312166110604020720 0ustar www-datawww-data# # DO NOT MODIFY!!!! # This file is automatically generated by Racc 1.4.9 # from Racc grammer file "". # require 'racc/parser.rb' require 'strscan' class WebRobots class Error < StandardError end class ParseError < Error # The site's root URI attr_reader :site def initialize(message, site) @message = message @site = site end def to_s @message end end class RobotsTxt class Parser < Racc::Parser module_eval(<<'...end robotstxt.ry/module_eval...', 'robotstxt.ry', 171) def initialize(target, crawl_delay_handler = nil) super() @target = target @crawl_delay_handler = crawl_delay_handler end def parse!(input, site) parse(input, site) rescue Error => e RobotsTxt.new(site, nil, :error => e, :target => @target, :crawl_delay_handler => @crawl_delay_handler) end KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap] RE_KNOWN_TOKENS = /\A(#{KNOWN_TOKENS.map { |t| Regexp.quote(t) }.join('|')})\z/i def parse(input, site) @q ||= [] @errors = [] @lineno = 0 @site = site string = input.respond_to?(:read) ? input.read : input s = StringScanner.new(string) value_expected = false until s.eos? @lineno += 1 if s.bol? if t = s.scan(/[ \t]*(?:\r?\n|\z)/) if value_expected @q << [:VALUE, ''] end @q << [:EOL, t] value_expected = false elsif t = s.scan(/[ \t]+/) @q << [:SPACE, t] elsif t = s.scan(/:/) @q << [t, t] value_expected = true elsif t = s.scan(/#.*/) if value_expected @q << [:VALUE, ''] end @q << [:COMMENT, t] else if value_expected if t = s.scan(/.*?(?=[ \t]*(?:#|$))/) @q << [:VALUE, t] else parse_error @lineno, "unexpected characters: %s" % s.check(/.*/) end value_expected = false elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/) case t when RE_KNOWN_TOKENS @q << [t.downcase, t] else @q << [:TOKEN, t] end else parse_error "unexpected characters: %s" % s.check(/.*/) end end end @q << [:EOL, ''] if !@q.empty? 
&& @q.last.first != :EOL @pos = -1 do_parse rescue Racc::ParseError => e raise ParseError.new(e.message, @site) ensure @q.clear end def next_token @q[@pos += 1] end def on_error(token_id, value, stack) parse_error "unexpected %s: %s" % [token_to_str(token_id), value] end def parse_error(message) message = "%s line %d: %s" % [@site.to_s, @lineno, message] if @lax @errors << message else raise Racc::ParseError, message end end ...end robotstxt.ry/module_eval... ##### State transition tables begin ### racc_action_table = [ 5, 12, -10, 16, 52, 40, -12, 36, 37, 38, 39, 12, -10, 16, 46, 27, 27, 36, 37, 38, 39, 12, -10, 16, 49, 50, 51, 36, 37, 38, 39, 12, -10, 16, 12, 53, 24, 36, 37, 38, 39, 12, -10, 16, 12, 12, -12, 12, -10, 16, 60, 12, -13, 16, 60, 12, 12, 16, 60, 12, 12, 16, 60, 12, 12, 16, 60, 12, 23, 16, 60, 12, 62, 16, 63, 64, 65, 66, 5, 9, 5, 6, 5 ] racc_action_check = [ 21, 21, 21, 21, 39, 23, 21, 21, 21, 21, 21, 25, 25, 25, 27, 19, 25, 25, 25, 25, 25, 45, 45, 45, 36, 37, 38, 45, 45, 45, 45, 29, 29, 29, 24, 41, 16, 29, 29, 29, 29, 7, 7, 7, 46, 49, 7, 13, 13, 13, 62, 62, 13, 62, 53, 53, 50, 53, 63, 63, 51, 63, 64, 64, 52, 64, 65, 65, 15, 65, 66, 66, 54, 66, 55, 56, 57, 58, 11, 6, 3, 1, 0 ] racc_action_pointer = [ 80, 81, nil, 78, nil, nil, 79, 38, nil, nil, nil, 76, nil, 44, nil, 64, 30, nil, nil, 7, nil, -2, nil, 3, 31, 8, nil, 8, nil, 28, nil, nil, nil, nil, nil, nil, 18, 19, 20, -2, nil, 28, nil, nil, nil, 18, 41, nil, nil, 42, 53, 57, 61, 52, 65, 67, 68, 69, 70, nil, nil, nil, 48, 56, 60, 64, 68, nil, nil, nil, nil, nil ] racc_action_default = [ -5, -44, -1, -6, -7, -9, -44, -3, -8, 72, -2, -5, -11, -23, -14, -44, -44, -18, -19, -44, -4, -6, -15, -44, -10, -29, -25, -44, -20, -21, -22, -31, -34, -35, -36, -37, -44, -44, -44, -44, -16, -44, -24, -26, -27, -30, -10, -32, -33, -10, -10, -10, -10, -10, -44, -44, -44, -44, -44, -17, -42, -43, -10, -10, -10, -10, -10, -28, -38, -39, -40, -41 ] racc_goto_table = [ 14, 41, 8, 47, 3, 2, 22, 17, 29, 11, 18, 26, 45, 10, 14, 21, 20, 43, 44, 47, 8, 28, 48, 54, 30, 25, 55, 56, 57, 58, 59, 42, 7, 1, nil, nil, nil, nil, 48, 67, 68, 69, 70, 71 ] racc_goto_check = [ 11, 8, 7, 19, 6, 2, 11, 13, 15, 5, 14, 18, 15, 3, 11, 6, 2, 18, 11, 19, 7, 13, 11, 8, 14, 16, 8, 8, 8, 8, 12, 17, 4, 1, nil, nil, nil, nil, 11, 12, 12, 12, 12, 12 ] racc_goto_pointer = [ nil, 33, 5, 6, 30, 2, 4, -1, -23, nil, nil, -7, -23, 0, 3, -13, 6, 6, -8, -26, nil, nil, nil, nil ] racc_goto_default = [ nil, nil, nil, nil, nil, nil, nil, 4, 15, 19, 13, 61, nil, nil, nil, nil, nil, nil, nil, 31, 32, 33, 34, 35 ] racc_reduce_table = [ 0, 0, :racc_error, 0, 17, :_reduce_1, 3, 14, :_reduce_2, 0, 16, :_reduce_none, 2, 16, :_reduce_none, 0, 15, :_reduce_none, 1, 15, :_reduce_none, 1, 19, :_reduce_none, 2, 19, :_reduce_none, 1, 20, :_reduce_none, 0, 21, :_reduce_none, 1, 21, :_reduce_none, 0, 22, :_reduce_none, 1, 22, :_reduce_none, 1, 23, :_reduce_none, 2, 23, :_reduce_none, 3, 24, :_reduce_none, 5, 24, :_reduce_17, 1, 18, :_reduce_18, 1, 18, :_reduce_19, 3, 18, :_reduce_20, 3, 18, :_reduce_21, 3, 18, :_reduce_none, 1, 27, :_reduce_none, 3, 26, :_reduce_24, 1, 29, :_reduce_25, 2, 29, :_reduce_26, 2, 29, :_reduce_none, 5, 31, :_reduce_28, 0, 30, :_reduce_none, 1, 30, :_reduce_none, 1, 28, :_reduce_31, 2, 28, :_reduce_32, 2, 28, :_reduce_none, 1, 32, :_reduce_none, 1, 32, :_reduce_none, 1, 32, :_reduce_none, 1, 32, :_reduce_none, 5, 33, :_reduce_38, 5, 34, :_reduce_39, 5, 35, :_reduce_40, 5, 36, :_reduce_41, 1, 25, :_reduce_none, 1, 25, :_reduce_none ] racc_reduce_n = 44 racc_shift_n = 72 
racc_token_table = { false => 0, :error => 1, :EOL => 2, :SPACE => 3, :COMMENT => 4, "sitemap" => 5, ":" => 6, :VALUE => 7, "user-agent" => 8, "allow" => 9, "disallow" => 10, "crawl-delay" => 11, :TOKEN => 12 } racc_nt_base = 13 racc_use_result_var = true Racc_arg = [ racc_action_table, racc_action_check, racc_action_default, racc_action_pointer, racc_goto_table, racc_goto_check, racc_goto_default, racc_goto_pointer, racc_nt_base, racc_reduce_table, racc_token_table, racc_shift_n, racc_reduce_n, racc_use_result_var ] Racc_token_to_s_table = [ "$end", "error", "EOL", "SPACE", "COMMENT", "\"sitemap\"", "\":\"", "VALUE", "\"user-agent\"", "\"allow\"", "\"disallow\"", "\"crawl-delay\"", "TOKEN", "$start", "robotstxt", "opt_blanklines", "body", "@1", "records", "blanklines", "blankline", "opt_space", "opt_commentlines", "commentlines", "comment", "eol_opt_comment", "record", "commentblock", "rulelines", "agentlines", "opt_rulelines", "agentline", "ruleline", "allowline", "disallowline", "crawldelayline", "extension" ] Racc_debug_parser = false ##### State transition tables end ##### # reduce 0 omitted module_eval(<<'.,.,', 'robotstxt.ry', 7) def _reduce_1(val, _values, result) @sitemaps = [] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 11) def _reduce_2(val, _values, result) body = val[2] result = RobotsTxt.new(@site, body, :target => @target, :sitemaps => @sitemaps, :crawl_delay_handler => @crawl_delay_handler) result end .,., # reduce 3 omitted # reduce 4 omitted # reduce 5 omitted # reduce 6 omitted # reduce 7 omitted # reduce 8 omitted # reduce 9 omitted # reduce 10 omitted # reduce 11 omitted # reduce 12 omitted # reduce 13 omitted # reduce 14 omitted # reduce 15 omitted # reduce 16 omitted module_eval(<<'.,.,', 'robotstxt.ry', 44) def _reduce_17(val, _values, result) @sitemaps << val[3] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 49) def _reduce_18(val, _values, result) result = [] result << val[0] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 54) def _reduce_19(val, _values, result) result = [] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 60) def _reduce_20(val, _values, result) result << val[2] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 66) def _reduce_21(val, _values, result) val[2].each_with_index { |line, i| warn "%s line %d: %s: orphan rule line" % [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE } result end .,., # reduce 22 omitted # reduce 23 omitted module_eval(<<'.,.,', 'robotstxt.ry', 81) def _reduce_24(val, _values, result) result = Record.new(val[1], val[2]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 86) def _reduce_25(val, _values, result) result = [val[0]] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 91) def _reduce_26(val, _values, result) result << val[1] result end .,., # reduce 27 omitted module_eval(<<'.,.,', 'robotstxt.ry', 98) def _reduce_28(val, _values, result) result = AgentLine.new(val[0], val[3]) result end .,., # reduce 29 omitted # reduce 30 omitted module_eval(<<'.,.,', 'robotstxt.ry', 106) def _reduce_31(val, _values, result) result = [result] @rulelinenos = [] result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 112) def _reduce_32(val, _values, result) result << val[1] @rulelinenos << @lineno result end .,., # reduce 33 omitted # reduce 34 omitted # reduce 35 omitted # reduce 36 omitted # reduce 37 omitted module_eval(<<'.,.,', 'robotstxt.ry', 125) def _reduce_38(val, _values, result) result = AllowLine.new(val[0], val[3]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 130) 
def _reduce_39(val, _values, result) result = DisallowLine.new(val[0], val[3]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 135) def _reduce_40(val, _values, result) result = CrawlDelayLine.new(val[0], val[3]) result end .,., module_eval(<<'.,.,', 'robotstxt.ry', 140) def _reduce_41(val, _values, result) result = ExtentionLine.new(val[0], val[3]) result end .,., # reduce 42 omitted # reduce 43 omitted def _reduce_none(val, _values, result) val[0] end end # class Parser def initialize(site, records, options = nil) @timestamp = Time.now @site = site @options = options || {} @last_checked_at = nil @error = @options[:error] @target = @options[:target] @sitemaps = @options[:sitemaps] || [] @crawl_delay_handler = @options[:crawl_delay_handler] if records && !records.empty? @records, defaults = [], [] records.each { |record| if record.default? defaults << record elsif !@target || record.match?(@target) @records << record end } @records.concat(defaults) else @records = [] end end attr_reader :timestamp, :site, :sitemaps attr_accessor :error def error! raise @error if @error end def target(user_agent = nil) if user_agent raise ArgumentError, "this instance is targeted for #{@target}" if @target user_agent else raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target @target end end private :target def find_record(user_agent = nil) user_agent = target(user_agent) @records.find { |record| record.match?(user_agent) } end private :find_record def allow?(request_uri, user_agent = nil) record = find_record(user_agent) or return true allow = record.allow?(request_uri) if delay = record.delay and @crawl_delay_handler @crawl_delay_handler.call(delay, @last_checked_at) end @last_checked_at = Time.now return allow end def crawl_delay(user_agent = nil) record = find_record(user_agent) or return 0 record.delay or return 0 end def options(user_agent = nil) record = find_record(user_agent) or return {} record.options end DISALLOW_ALL = <<-TXT User-Agent: * Disallow: / TXT def self.unfetchable(site, reason, target = nil) Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt| robots_txt.error = reason } end class Record def initialize(agentlines, rulelines) @patterns = agentlines.map { |agentline| agentline.pattern } @acls = [] @delay = nil @options = {} rulelines.each { |ruleline| case ruleline when AccessControlLine @acls << ruleline when CrawlDelayLine @delay = ruleline.delay else @options[ruleline.token.downcase] = ruleline.value end } if rulelines @acls.replace @acls.sort_by { |x| [-x.value.length, x.is_a?(AllowLine) ? -1 : 0] } end attr_reader :delay, :options def match?(user_agent) @patterns.any? { |pattern| pattern.match(user_agent) } end def default? @patterns.include?(//) end def allow?(request_uri) @acls.each { |acl| if acl.match?(request_uri) return acl.allow? end } return true end end class Line def initialize(token, value) @token = token @value = value compile end attr_reader :token, :value def compile self end end class AgentLine < Line def compile if @value == '*' @pattern = // else @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE) end self end attr_reader :pattern end class AccessControlLine < Line def compile @empty = @value.empty? re_src = '\A' s = StringScanner.new(@value) until s.eos? 
if t = s.scan(/[^%*$]+/) re_src << Regexp.quote(t) elsif t = s.scan(/%([0-9a-f]{2})/i) c = s[1].to_i(16) if c == 0x2f re_src << '%2[fF]' else re_src << Regexp.quote('%c' % c) end elsif t = s.scan(/\*/) re_src << '.*' elsif t = s.scan(/\$/) re_src << '\z' break else re_src << Regexp.quote(s.scan(/./)) end end @pattern = Regexp.new(re_src, Regexp::MULTILINE) self end def match?(request_uri) return false if @empty transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) } !!@pattern.match(transformed) end end class AllowLine < AccessControlLine def allow? true end end class DisallowLine < AccessControlLine def allow? false end end class CrawlDelayLine < Line def compile case @value when /\A((0|[1-9][0-9]*)\.[0-9]+)/ @delay = @value.to_f when /\A(0|[1-9][0-9]*)/ @delay = @value.to_i else @delay = nil end self end attr_reader :delay end class ExtentionLine < Line end end end webrobots-0.1.1/lib/webrobots/nokogiri.rb0000644000004100000410000000206112166110604020465 0ustar www-datawww-datarequire 'nokogiri' class Nokogiri::HTML::Document # Returns an array of lower-cased tokens. If # no tag is found, returns an empty array. An optional # +custom_name+ specifies the name of a meta tag to look for ahead # of "ROBOTS". Names are compared in a case-insensitive manner. def meta_robots(custom_name = nil) (@meta_robots ||= {})[custom_name] = (custom_name && parse_meta_robots(custom_name)) || parse_meta_robots('robots') end # Equivalent to meta_robots(custom_name).include?('noindex'). def noindex?(custom_name = nil) meta_robots(custom_name).include?('noindex') end # Equivalent to meta_robots(custom_name).include?('nofollow'). def nofollow?(custom_name = nil) meta_robots(custom_name).include?('nofollow') end private def parse_meta_robots(custom_name) pattern = /\A#{Regexp.quote(custom_name)}\z/i meta = css('meta[@name]').find { |element| element['name'].match(pattern) } and content = meta['content'] or return [] content.downcase.split(/[,\s]+/) end end
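# A usage sketch for this Nokogiri extension (the HTML snippet is illustrative and
# assumes the nokogiri gem is installed):
#
#   require 'webrobots/nokogiri'
#
#   doc = Nokogiri::HTML('<html><head><meta name="robots" content="noindex, nofollow"></head><body></body></html>')
#   doc.meta_robots  #=> ["noindex", "nofollow"]
#   doc.noindex?     #=> true
#   doc.nofollow?    #=> true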