Skip to content

Commit

Permalink
feat: use Nokogiri::HTML5 (libgumbo) by default
Browse files Browse the repository at this point in the history
Note that there is an escape hatch which is to set the environment
variable LOOFAH_HTML4_MODE to return to the previous behavior.

CI tests both html4 and html5 modes
  • Loading branch information
flavorjones committed Jun 7, 2022
1 parent bc7b362 commit 8b75439
Show file tree
Hide file tree
Showing 10 changed files with 206 additions and 91 deletions.
21 changes: 19 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,29 @@ jobs:
bundler-cache: true
- run: bundle exec rake rubocop

test:
test_html4:
env:
LOOFAH_HTML4_MODE: "t"
needs: ["rubocop"]
strategy:
fail-fast: false
matrix:
ruby: ["2.5", "2.6", "2.7", "3.0", "3.1", "head", "truffleruby-head"]
ruby: ["2.6", "2.7", "3.0", "3.1", "head", "truffleruby-head"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: ruby/setup-ruby@v1
with:
ruby-version: ${{matrix.ruby}}
bundler-cache: true
- run: bundle exec rake test

test_html5:
needs: ["rubocop"]
strategy:
fail-fast: false
matrix:
ruby: ["2.6", "2.7", "3.0", "3.1", "head", "truffleruby-head"]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
Expand Down
56 changes: 38 additions & 18 deletions lib/loofah.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,6 @@
require "nokogiri"

require_relative "loofah/version"
require_relative "loofah/metahelpers"
require_relative "loofah/elements"

require_relative "loofah/html5/safelist"
require_relative "loofah/html5/libxml2_workarounds"
require_relative "loofah/html5/scrub"

require_relative "loofah/scrubber"
require_relative "loofah/scrubbers"

require_relative "loofah/instance_methods"
require_relative "loofah/xml/document"
require_relative "loofah/xml/document_fragment"
require_relative "loofah/html/document"
require_relative "loofah/html/document_fragment"

# == Strings and IO Objects as Input
#
Expand All @@ -31,13 +16,13 @@
module Loofah
class << self
# Shortcut for Loofah::HTML::Document.parse
# This method accepts the same parameters as Nokogiri::HTML::Document.parse
# This method accepts the same parameters as Nokogiri::HTML5::Document.parse
def document(*args, &block)
remove_comments_before_html_element Loofah::HTML::Document.parse(*args, &block)
remove_comments_before_html_element(Loofah::HTML::Document.parse(*args, &block))
end

# Shortcut for Loofah::HTML::DocumentFragment.parse
# This method accepts the same parameters as Nokogiri::HTML::DocumentFragment.parse
# This method accepts the same parameters as Nokogiri::HTML5::DocumentFragment.parse
def fragment(*args, &block)
Loofah::HTML::DocumentFragment.parse(*args, &block)
end
Expand Down Expand Up @@ -79,6 +64,25 @@ def remove_extraneous_whitespace(string)
string.gsub(/\n\s*\n\s*\n/, "\n\n")
end

def html5_mode?
defined?(::Nokogiri::HTML5) && (parser_module == ::Nokogiri::HTML5)
end

def parser_module(class_name=nil)
@parser_module ||= begin
if ENV["LOOFAH_HTML4_MODE"].nil? && defined?(::Nokogiri::HTML5)
::Nokogiri::HTML5
else
::Nokogiri::HTML4
end
end
if class_name
@parser_module.const_get(class_name)
else
@parser_module
end
end

private

# remove comments that exist outside of the HTML element.
Expand All @@ -98,3 +102,19 @@ def remove_comments_before_html_element(doc)
end
end
end

require_relative "loofah/metahelpers"
require_relative "loofah/elements"

require_relative "loofah/html5/safelist"
require_relative "loofah/html5/libxml2_workarounds"
require_relative "loofah/html5/scrub"

require_relative "loofah/scrubber"
require_relative "loofah/scrubbers"

require_relative "loofah/instance_methods"
require_relative "loofah/xml/document"
require_relative "loofah/xml/document_fragment"
require_relative "loofah/html/document"
require_relative "loofah/html/document_fragment"
4 changes: 2 additions & 2 deletions lib/loofah/html/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
module Loofah
module HTML # :nodoc:
#
# Subclass of Nokogiri::HTML::Document.
# Subclass of Nokogiri::HTML5::Document.
#
# See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
#
class Document < Nokogiri::HTML::Document
class Document < ::Loofah.parser_module(:Document)
include Loofah::ScrubBehavior::Node
include Loofah::DocumentDecorator
include Loofah::TextBehavior
Expand Down
6 changes: 3 additions & 3 deletions lib/loofah/html/document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
module Loofah
module HTML # :nodoc:
#
# Subclass of Nokogiri::HTML::DocumentFragment.
# Subclass of Nokogiri::HTML5::DocumentFragment.
#
# See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
#
class DocumentFragment < Nokogiri::HTML::DocumentFragment
class DocumentFragment < ::Loofah.parser_module(:DocumentFragment)
include Loofah::TextBehavior

class << self
#
# Overridden Nokogiri::HTML::DocumentFragment
# Overridden Nokogiri::HTML5::DocumentFragment
# constructor. Applications should use Loofah.fragment to
# parse a fragment.
#
Expand Down
3 changes: 3 additions & 0 deletions lib/loofah/html5/safelist.rb
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,9 @@ module SafeList
"img",
"input",
])
if ::Loofah.html5_mode?
VOID_ELEMENTS.add("wbr")
end

# additional tags we should consider safe since we have libxml2 fixing up our documents.
TAGS_SAFE_WITH_LIBXML2 = Set.new([
Expand Down
78 changes: 51 additions & 27 deletions test/assets/testdata_sanitizer_tests1.dat
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
{
"name": "IE_Comments_2",
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;",
"xhtml": "&lt;![if !IE 5]&gt;&lt;script&gt;alert('XSS');&lt;/script&gt;&lt;![endif]&gt;",
"commentary": "output is libxml 2.9.13 and earlier, xhtml is libxml 2.9.14 and later, see libxml 148be64"
"libxml_lte_2.9.13": "&lt;script&gt;alert('XSS');&lt;/script&gt;",
"libxml_gte_2.9.14": "&lt;![if !IE 5]&gt;&lt;script&gt;alert('XSS');&lt;/script&gt;&lt;![endif]&gt;",
"libgumbo": "&lt;!--[if !IE 5]--&gt;&lt;script&gt;alert('XSS');&lt;/script&gt;&lt;!--[endif]--&gt;"
},

{
Expand All @@ -30,7 +30,8 @@
{
"name": "bgsound",
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
"output": "&lt;bgsound&gt;&lt;/bgsound&gt;"
"libxml": "&lt;bgsound&gt;&lt;/bgsound&gt;",
"libgumbo": "&lt;bgsound&gt;"
},

{
Expand Down Expand Up @@ -84,15 +85,27 @@
{
"name": "double_open_angle_brackets",
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
"output": "<img src='http://ha.ckers.org/scriptlet.html'>",
"rexml": "Ill-formed XHTML!"
"libxml": "<img src='http://ha.ckers.org/scriptlet.html'>",
"libgumbo": "" /* it is indeed the empty result, see next test for a better test */
},

{
"name": "double_open_angle_brackets v2",
"input": "<div><img src=http://ha.ckers.org/scriptlet.html < </div>",
"output": "<div><img src='http://ha.ckers.org/scriptlet.html'></div>"
},

{
"name": "double_open_angle_brackets_2",
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
"output": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/script&gt;",
"libgumbo": "" /* it is indeed empty */
},

{
"name": "double_open_angle_brackets_2 v2",
"input": "<div><script src=http://ha.ckers.org/scriptlet.html < </div>",
"libxml": "<div>&lt;script src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/script&gt;</div>"
},

{
Expand Down Expand Up @@ -132,8 +145,7 @@
{
"name": "link_stylesheets_2",
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
"output": "&lt;link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"&gt;",
"rexml": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/&gt;"
"output": "&lt;link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"&gt;"
},

{
Expand All @@ -145,8 +157,8 @@
{
"name": "no_closing_script_tags",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"libgumbo": "&lt;script src='http://ha.ckers.org/xss.js?&lt;b'&gt;&lt;/script&gt;"
},

{
Expand All @@ -166,8 +178,8 @@
{
"name": "non_alpha_non_digit_3",
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
"output": "<img>",
"rexml": "Ill-formed XHTML!"
"libxml": "<img>",
"libgumbo": "<img src='http://ha.ckers.org/xss.js'>" /* see "should_allow_image_src_attribute" test */
},

{
Expand Down Expand Up @@ -365,8 +377,14 @@
{
"name": "should_sanitize_half_open_scripts",
"input": "<img src=\"javascript:alert('XSS')\"",
"output": "<img>",
"rexml": "Ill-formed XHTML!"
"libxml": "<img>",
"libgumbo": "" /* indeed it is empty */
},

{
"name": "should_sanitize_half_open_scripts 2",
"input": "<div><img src=\"javascript:alert('XSS')\" </div>",
"output": "<div><img></div>"
},

{
Expand All @@ -385,24 +403,30 @@
},

{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2a",
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
"output": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/iframe&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/iframe&gt;",
"libgumbo": "" /* it is indeed empty, see next test */
},

{
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2b",
"input": "<div><iframe src=http://ha.ckers.org/scriptlet.html\n< </div>",
"libxml": "<div>&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\"&gt;&lt;/iframe&gt;</div>"
},

{
"name": "should_sanitize_tag_broken_up_by_null",
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
"output": "&lt;scr&gt;&lt;/scr&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;scr&gt;&lt;/scr&gt;",
"libgumbo": "&lt;scr�ipt&gt;alert('XSS')&lt;/scr�ipt&gt;"
},

{
"name": "should_sanitize_unclosed_script",
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"rexml": "Ill-formed XHTML!"
"libxml": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;",
"libgumbo": "&lt;script src='http://ha.ckers.org/xss.js?&lt;b'&gt;&lt;/script&gt;"
},

{
Expand Down Expand Up @@ -446,8 +470,8 @@
{
"name": "quotes_in_attributes",
"input": "<img src='foo' title='\"foo\" bar' />",
"rexml": "<img src='foo' title='\"foo\" bar' />",
"output": "<img src='foo' title='\"foo\" bar'>"
"libxml": "<img src='foo' title='\"foo\" bar'>",
"libgumbo": "<img src='foo' title='&quot;foo&quot; bar'>"
},

{
Expand Down Expand Up @@ -485,8 +509,8 @@
{
"name": "allow_html5_image_tag",
"input": "<image src='foo' />",
"rexml": "&lt;image src=\"foo\"&gt;&lt;/image&gt;",
"output": "&lt;image src=\"foo\"/&gt;"
"libxml": "&lt;image src=\"foo\"&gt;&lt;/image&gt;",
"libgumbo": "<img src='foo'>"
},

{
Expand Down
5 changes: 5 additions & 0 deletions test/helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@
require File.expand_path(File.join(File.dirname(__FILE__), "..", "lib", "loofah", "helpers"))

puts "=> testing with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
puts "=> parser module is #{::Loofah::parser_module}"

class Loofah::TestCase < MiniTest::Spec
class << self
alias_method :context, :describe
end

def html5_mode?
::Loofah.html5_mode?
end
end

0 comments on commit 8b75439

Please sign in to comment.