diff --git a/lib/loofah.rb b/lib/loofah.rb index 099f183b..ce41c270 100644 --- a/lib/loofah.rb +++ b/lib/loofah.rb @@ -6,6 +6,7 @@ require 'loofah/elements' require 'loofah/html5/whitelist' +require 'loofah/html5/libxml2_workarounds' require 'loofah/html5/scrub' require 'loofah/scrubber' diff --git a/lib/loofah/html5/libxml2_workarounds.rb b/lib/loofah/html5/libxml2_workarounds.rb new file mode 100644 index 00000000..fc3fb244 --- /dev/null +++ b/lib/loofah/html5/libxml2_workarounds.rb @@ -0,0 +1,26 @@ +# coding: utf-8 +require 'set' + +module Loofah + # + # constants related to working around unhelpful libxml2 behavior + # + # ಠ_ಠ + # + module LibxmlWorkarounds + # + # these attributes and qualifying parent tags are determined by the code at: + # + # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714 + # + # see comments about CVE-2018-8048 within the tests for more information + # + BROKEN_ESCAPING_ATTRIBUTES = Set.new %w[ + href + action + src + name + ] + BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG = {"name" => "a"} + end +end diff --git a/lib/loofah/html5/scrub.rb b/lib/loofah/html5/scrub.rb index 4e2ca648..99622bfe 100644 --- a/lib/loofah/html5/scrub.rb +++ b/lib/loofah/html5/scrub.rb @@ -1,5 +1,3 @@ -#encoding: US-ASCII - require 'cgi' require 'crass' @@ -65,6 +63,8 @@ def scrub_attributes node node.attribute_nodes.each do |attr_node| node.remove_attribute(attr_node.name) if attr_node.value !~ /[^[:space:]]/ end + + force_correct_attribute_escaping! node end def scrub_css_attribute node @@ -100,6 +100,35 @@ def scrub_css style Crass::Parser.stringify sanitized_tree end + + private + + # + # libxml2 >= 2.9.2 fails to escape comments within some attributes. + # + # see comments about CVE-2018-8048 within the tests for more information + # + def force_correct_attribute_escaping! node + return unless Nokogiri::VersionInfo.instance.libxml2? + + node.attribute_nodes.each do |attr_node| + next unless LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES.include?(attr_node.name) + + tag_name = LibxmlWorkarounds::BROKEN_ESCAPING_ATTRIBUTES_QUALIFYING_TAG[attr_node.name] + next unless tag_name.nil? || tag_name == node.name + + # + # this block is just like CGI.escape in Ruby 2.4, but + # only encodes space and double-quote, to mimic + # pre-2.9.2 behavior + # + encoding = attr_node.value.encoding + attr_node.value = attr_node.value.gsub(/[ "]/) do |m| + '%' + m.unpack('H2' * m.bytesize).join('%').upcase + end.force_encoding(encoding) + end + end + end end end diff --git a/test/integration/test_ad_hoc.rb b/test/integration/test_ad_hoc.rb index 13539668..71958fda 100644 --- a/test/integration/test_ad_hoc.rb +++ b/test/integration/test_ad_hoc.rb @@ -188,6 +188,71 @@ def test_dont_remove_whitespace_between_tags html = "

Foo

\n

Bar

" assert_equal "Foo\nBar", Loofah.scrub_document(html, :prune).text end + + # + # tests for CVE-2018-8048 (see https://github.com/flavorjones/loofah/issues/144) + # + # libxml2 >= 2.9.2 fails to escape comments within some attributes. It + # wants to ensure these comments can be treated as "server-side includes", + # but as a result fails to ensure that serialization is well-formed, + # resulting in an opportunity for XSS injection of code into a final + # re-parsed document (presumably in a browser). + # + # we'll test this by parsing the HTML, serializing it, then + # re-parsing it to ensure there isn't any ambiguity in the output + # that might allow code injection into a browser consuming + # "sanitized" output. + # + [ + # + # these tags and attributes are determined by the code at: + # + # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714 + # + {tag: "a", attr: "href"}, + {tag: "div", attr: "href"}, + {tag: "a", attr: "action"}, + {tag: "div", attr: "action"}, + {tag: "a", attr: "src"}, + {tag: "div", attr: "src"}, + {tag: "a", attr: "name"}, + # + # note that div+name is _not_ affected by the libxml2 issue. + # but we test it anyway to ensure our logic isn't modifying + # attributes that don't need modifying. + # + {tag: "div", attr: "name", unescaped: true}, + ].each do |config| + + define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do + html = %{<#{config[:tag]} #{config[:attr]}='example.com'>test} + + reparsed = Loofah.fragment(Loofah.fragment(html).scrub!(:prune).to_html) + attributes = reparsed.at_css(config[:tag]).attribute_nodes + + assert_equal [config[:attr]], attributes.collect(&:name) + if Nokogiri::VersionInfo.new.libxml2? + if config[:unescaped] + # + # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK. + # assert that this attribute's serialization is unaffected. + # + assert_equal %{example.com}, attributes.first.value + else + # + # let's match the behavior in libxml < 2.9.2. + # test that this attribute's serialization is well-formed and sanitized. + # + assert_equal %{example.com}, attributes.first.value + end + else + # + # yay for consistency in javaland. move along, nothing to see here. + # + assert_equal %{example.com}, attributes.first.value + end + end + + end end end -