sanitize-2.1.0/0000755000175000017500000000000012277765761012372 5ustar jonasjonassanitize-2.1.0/checksums.yaml.gz0000444000175000017500000000041412277765761015657 0ustar jonasjonasnvRe;@EYl`,?/33w@F&DV3 8z_|^s;?}UF,mSYUeb0sez9 px,]eӻf̦cdB.Rta)ɳ*v"FwB|wII bV;U>|+hV_xJ3PBO ݽu&k D*E EyHN;L XbyТsanitize-2.1.0/LICENSE0000644000175000017500000000205712277765761013403 0ustar jonasjonasCopyright (c) 2014 Ryan Grove Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. sanitize-2.1.0/test/0000755000175000017500000000000012277765761013351 5ustar jonasjonassanitize-2.1.0/test/test_sanitize.rb0000644000175000017500000006060012277765761016565 0ustar jonasjonas# encoding: utf-8 #-- # Copyright (c) 2013 Ryan Grove # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the 'Software'), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. #++ require 'rubygems' gem 'minitest' require 'minitest/autorun' require 'sanitize' strings = { :basic => { :html => 'Lorem ipsum dolor sit
amet ', :default => 'Lorem ipsum dolor sit amet alert("hello world");', :restricted => 'Lorem ipsum dolor sit amet alert("hello world");', :basic => 'Lorem ipsum dolor sit
amet alert("hello world");', :relaxed => 'Lorem ipsum dolor sit
amet alert("hello world");' }, :malformed => { :html => 'Lorem dolor sit
amet ', :default => 'Lorem ipsum dolor sit amet script>alert("hello world");', :restricted => 'Lorem ipsum dolor sit amet script>alert("hello world");', :basic => 'Lorem ipsum dolor sit
amet script>alert("hello world");', :relaxed => 'Lorem ipsum dolor sit
amet script>alert("hello world");' }, :raw_comment => { :html => 'Hello', :default => 'Hello', :restricted => 'Hello', :basic => 'Hello', :relaxed => 'Hello', :document => ' Hello ', } } tricky = { 'protocol-based JS injection: simple, no spaces' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: simple, spaces before' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: simple, spaces after' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: simple, spaces before and after' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: preceding colon' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: UTF-8 encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: long UTF-8 encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: long UTF-8 encoding without semicolons' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: hex encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: long hex encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: hex encoding without semicolons' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: null char' => { :html => "", :default => '', :restricted => '', :basic => '', :relaxed => '' # everything following the null char gets stripped, and URL is considered relative }, 'protocol-based JS injection: invalid URL char' => { :html => '', :default => '', :restricted => '', :basic => '', :relaxed => '' }, 'protocol-based JS injection: spaces and entities' => { :html => '', :default => '', :restricted => '', :basic => '', :relaxed => '' } } describe 'Config::DEFAULT' do it 'should translate valid HTML entities' do Sanitize.clean("Don't tasé me & bro!").must_equal("Don't tasé me & bro!") end it 'should translate valid HTML entities while encoding unencoded ampersands' do Sanitize.clean("cookies² & ¼ créme").must_equal("cookies² & ¼ créme") end it 'should never output '' do Sanitize.clean("IE6 isn't a real browser").wont_match(/'/) end it 'should not choke on several instances of the same element in a row' do Sanitize.clean('').must_equal('') end it 'should surround the contents of :whitespace_elements with space characters when removing the element' do Sanitize.clean('foo
bar
baz').must_equal('foo bar baz') Sanitize.clean('foo
bar
baz').must_equal('foo bar baz') Sanitize.clean('foo
bar
baz').must_equal('foo bar baz') end strings.each do |name, data| it "should clean #{name} HTML" do Sanitize.clean(data[:html]).must_equal(data[:default]) end end tricky.each do |name, data| it "should not allow #{name}" do Sanitize.clean(data[:html]).must_equal(data[:default]) end end end describe 'Config::RESTRICTED' do before { @s = Sanitize.new(Sanitize::Config::RESTRICTED) } strings.each do |name, data| it "should clean #{name} HTML" do @s.clean(data[:html]).must_equal(data[:restricted]) end end tricky.each do |name, data| it "should not allow #{name}" do @s.clean(data[:html]).must_equal(data[:restricted]) end end end describe 'Config::BASIC' do before { @s = Sanitize.new(Sanitize::Config::BASIC) } it 'should not choke on valueless attributes' do @s.clean('foo foo bar').must_equal('foo foo bar') end it 'should downcase attribute names' do @s.clean('bar').must_equal('bar') end strings.each do |name, data| it "should clean #{name} HTML" do @s.clean(data[:html]).must_equal(data[:basic]) end end tricky.each do |name, data| it "should not allow #{name}" do @s.clean(data[:html]).must_equal(data[:basic]) end end end describe 'Config::RELAXED' do before { @s = Sanitize.new(Sanitize::Config::RELAXED) } it 'should encode special chars in attribute values' do input = 'foo' output = Nokogiri::HTML.fragment('foo').to_xhtml(:encoding => 'utf-8', :indent => 0, :save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML) @s.clean(input).must_equal(output) end strings.each do |name, data| it "should clean #{name} HTML" do @s.clean(data[:html]).must_equal(data[:relaxed]) end end tricky.each do |name, data| it "should not allow #{name}" do @s.clean(data[:html]).must_equal(data[:relaxed]) end end end describe 'Full Document parser (using clean_document)' do before { @s = Sanitize.new({:elements => %w[!DOCTYPE html]}) @default_doctype = "" } it 'should require HTML element is whitelisted to prevent parser errors' do assert_raises(RuntimeError, 'You must have the HTML element whitelisted') { Sanitize.clean_document!('', {:elements => [], :remove_contents => false}) } end it 'should NOT require HTML element to be whitelisted if remove_contents is true' do output = 'foo' Sanitize.clean_document!(output, {:remove_contents => true}).must_equal "\n\n" end it 'adds a doctype tag if not included' do @s.clean_document('').must_equal("#{@default_doctype}\n\n") end it 'should apply whitelist filtering to HTML element' do output = "\n\n\n" @s.clean_document(output).must_equal("\n\n") end strings.each do |name, data| it "should wrap #{name} with DOCTYPE and HTML tag" do output = data[:document] || data[:default] @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n#{output}\n") end end tricky.each do |name, data| it "should wrap #{name} with DOCTYPE and HTML tag" do @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n#{data[:default]}\n") end end end describe 'Custom configs' do it 'should allow attributes on all elements if whitelisted under :all' do input = '

bar

' Sanitize.clean(input).must_equal(' bar ') Sanitize.clean(input, {:elements => ['p'], :attributes => {:all => ['class']}}).must_equal(input) Sanitize.clean(input, {:elements => ['p'], :attributes => {'div' => ['class']}}).must_equal('

bar

') Sanitize.clean(input, {:elements => ['p'], :attributes => {'p' => ['title'], :all => ['class']}}).must_equal(input) end it 'should allow comments when :allow_comments == true' do input = 'foo baz' Sanitize.clean(input).must_equal('foo baz') Sanitize.clean(input, :allow_comments => true).must_equal(input) end it 'should allow relative URLs containing colons where the colon is not in the first path segment' do input = 'Random Page' Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) end it 'should allow relative URLs containing colons where the colon is part of an anchor' do input = 'Footnote 1' Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) end it 'should allow relative URLs containing colons where the colon is part of an anchor' do input = 'Footnote 1' Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) end it 'should output HTML when :output == :html' do input = 'foo
bar
baz' Sanitize.clean(input, :elements => ['br'], :output => :html).must_equal('foo
bar
baz') end it 'should remove the contents of filtered nodes when :remove_contents == true' do Sanitize.clean('foo bar
bazquux
', :remove_contents => true).must_equal('foo bar ') end it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do Sanitize.clean('foo bar
bazquux
', :remove_contents => ['script', 'span']).must_equal('foo bar baz ') end it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do Sanitize.clean('foo bar
bazquux
', :remove_contents => [:script, :span]).must_equal('foo bar baz ') end it 'should support encodings other than utf-8' do html = 'foo bar' Sanitize.clean(html).must_equal("foo\302\240bar") Sanitize.clean(html, :output_encoding => 'ASCII').must_equal("foo bar") end it 'should not allow arbitrary HTML5 data attributes by default' do config = { :elements => ['b'] } Sanitize.clean('', config) .must_equal('') config[:attributes] = {'b' => ['class']} Sanitize.clean('', config) .must_equal('') end it 'should allow arbitrary HTML5 data attributes when the :attributes config includes :data' do config = { :attributes => {'b' => [:data]}, :elements => ['b'] } Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') # Nokogiri quirk; not ideal, but harmless Sanitize.clean('', config) .must_equal('') # Another annoying Nokogiri quirk. end end describe 'Sanitize.clean' do it 'should not modify the input string' do input = 'foo' Sanitize.clean(input) input.must_equal('foo') end it 'should return a new string' do input = 'foo' Sanitize.clean(input).must_equal('foo') end end describe 'Sanitize.clean!' do it 'should modify the input string' do input = 'foo' Sanitize.clean!(input) input.must_equal('foo') end it 'should return the string if it was modified' do input = 'foo' Sanitize.clean!(input).must_equal('foo') end it 'should return nil if the string was not modified' do input = 'foo' Sanitize.clean!(input).must_equal(nil) end end describe 'Sanitize.clean_document' do before { @config = { :elements => ['html', 'p'] } } it 'should be idempotent' do input = '

foo

' first = Sanitize.clean_document(input, @config) second = Sanitize.clean_document(first, @config) second.must_equal first second.wont_be_nil end it 'should handle nil without raising' do Sanitize.clean_document(nil).must_equal nil end it 'should not modify the input string' do input = 'foo' Sanitize.clean_document(input, @config) input.must_equal('foo') end it 'should return a new string' do input = 'foo' Sanitize.clean_document(input, @config).must_equal("\nfoo\n") end end describe 'Sanitize.clean_document!' do before { @config = { :elements => ['html'] } } it 'should modify the input string' do input = 'foo' Sanitize.clean_document!(input, @config) input.must_equal("\nfoo\n") end it 'should return the string if it was modified' do input = 'foo' Sanitize.clean_document!(input, @config).must_equal("\nfoo\n") end it 'should return nil if the string was not modified' do input = "\n\n" Sanitize.clean_document!(input, @config).must_equal(nil) end end describe 'transformers' do # YouTube embed transformer. youtube = lambda do |env| node = env[:node] node_name = env[:node_name] # Don't continue if this node is already whitelisted or is not an element. return if env[:is_whitelisted] || !node.element? # Don't continue unless the node is an iframe. return unless node_name == 'iframe' # Verify that the video URL is actually a valid YouTube video URL. return unless node['src'] =~ /\Ahttps?:\/\/(?:www\.)?youtube(?:-nocookie)?\.com\// # We're now certain that this is a YouTube embed, but we still need to run # it through a special Sanitize step to ensure that no unwanted elements or # attributes that don't belong in a YouTube embed can sneak in. Sanitize.clean_node!(node, { :elements => %w[iframe], :attributes => { 'iframe' => %w[allowfullscreen frameborder height src width] } }) # Now that we're sure that this is a valid YouTube embed and that there are # no unwanted elements or attributes hidden inside it, we can tell Sanitize # to whitelist the current node. {:node_whitelist => [node]} end it 'should receive a complete env Hash as input' do Sanitize.clean!('foo', :foo => :bar, :transformers => lambda {|env| return unless env[:node].element? env[:config][:foo].must_equal(:bar) env[:is_whitelisted].must_equal(false) env[:node].must_be_kind_of(Nokogiri::XML::Node) env[:node_name].must_equal('span') env[:node_whitelist].must_be_kind_of(Set) env[:node_whitelist].must_be_empty }) end it 'should traverse all node types, including the fragment itself' do nodes = [] Sanitize.clean!('
foo
', :transformers => proc {|env| nodes << env[:node_name] }) nodes.must_equal(%w[ text div comment #cdata-section script #document-fragment ]) end it 'should traverse in depth-first mode by default' do nodes = [] Sanitize.clean!('
foo

bar

', :transformers => proc {|env| env[:traversal_mode].must_equal(:depth) nodes << env[:node_name] if env[:node].element? }) nodes.must_equal(['span', 'div', 'p']) end it 'should traverse in breadth-first mode when using :transformers_breadth' do nodes = [] Sanitize.clean!('
foo

bar

', :transformers_breadth => proc {|env| env[:traversal_mode].must_equal(:breadth) nodes << env[:node_name] if env[:node].element? }) nodes.must_equal(['div', 'span', 'p']) end it 'should whitelist nodes in the node whitelist' do Sanitize.clean!('
foo
bar', :transformers => [ proc {|env| {:node_whitelist => [env[:node]]} if env[:node_name] == 'div' }, proc {|env| env[:is_whitelisted].must_equal(false) unless env[:node_name] == 'div' env[:is_whitelisted].must_equal(true) if env[:node_name] == 'div' env[:node_whitelist].must_include(env[:node]) if env[:node_name] == 'div' } ]).must_equal('
foo
bar') end it 'should clear the node whitelist after each fragment' do called = false Sanitize.clean!('
foo
', :transformers => proc {|env| {:node_whitelist => [env[:node]]} }) Sanitize.clean!('
foo
', :transformers => proc {|env| called = true env[:is_whitelisted].must_equal(false) env[:node_whitelist].must_be_empty }) called.must_equal(true) end it 'should allow youtube video embeds via the youtube transformer' do input = '' output = Nokogiri::HTML::DocumentFragment.parse('').to_html(:encoding => 'utf-8', :indent => 0) Sanitize.clean!(input, :transformers => youtube).must_equal(output) end it 'should allow https youtube video embeds via the youtube transformer' do input = '' output = Nokogiri::HTML::DocumentFragment.parse('').to_html(:encoding => 'utf-8', :indent => 0) Sanitize.clean!(input, :transformers => youtube).must_equal(output) end it 'should allow privacy-enhanced youtube video embeds via the youtube transformer' do input = '' output = Nokogiri::HTML::DocumentFragment.parse('').to_html(:encoding => 'utf-8', :indent => 0) Sanitize.clean!(input, :transformers => youtube).must_equal(output) end it 'should not allow non-youtube video embeds via the youtube transformer' do input = '' output = '' Sanitize.clean!(input, :transformers => youtube).must_equal(output) end end describe 'bugs' do it 'should not have Nokogiri 1.4.2+ unterminated script/style element bug' do Sanitize.clean!('foo