Skip to content

Commit

Permalink
feat: add an option to preserve whitespace to FullSanitizer
Browse files Browse the repository at this point in the history
  • Loading branch information
Earlopain committed May 16, 2023
1 parent 50644ff commit 82fea0e
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 33 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -14,6 +14,9 @@

*Mike Dalessio*

* `FullSanitizer` now supports the optional argument `preserve_whitespace` to keep whitespace around block elements and line break elements.

*Earlopain*

## 1.5.0 / 2023-01-20

Expand Down
15 changes: 5 additions & 10 deletions README.md
Expand Up @@ -62,20 +62,15 @@ All sanitizers respond to `sanitize`, and are available in variants that use eit
full_sanitizer = Rails::HTML5::FullSanitizer.new
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# => Bold no more! See more here...
```

or, if you insist on parsing the content as HTML4:
# Whitespace is swallowed by default. If whitespace is significant you must pass an option to preserve it.
# This option is slower, but is clever about whitespace around block elements and line break elements.

```ruby
full_sanitizer = Rails::HTML4::FullSanitizer.new
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# => Bold no more! See more here...
full_sanitizer = Rails::HTML5::FullSanitizer.new
full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# => "\nParagraphs\n and \n newlines"
```

HTML5 version:



#### LinkSanitizer

```ruby
Expand Down
31 changes: 31 additions & 0 deletions lib/rails/html/sanitizer.rb
Expand Up @@ -66,6 +66,19 @@ def parse_fragment(html)
end if Rails::HTML::Sanitizer.html5_support?
end

module Sanitizer
  # Adds an opt-in +preserve_whitespace+ option on top of whatever +sanitize+
  # implementation sits further up the including class's ancestor chain.
  module PreserveWhitespace
    # Sanitizes +html+, optionally keeping whitespace.
    #
    # Returns +nil+ when +html+ is +nil+ (or +false+). When
    # <tt>options[:preserve_whitespace]</tt> is truthy the input is parsed with
    # +parse_fragment+ and converted with +to_text+; in every other case the
    # call is forwarded unchanged to +super+.
    #
    # +parse_fragment+ is not defined here and must be provided by the
    # including class. NOTE(review): +to_text+ is presumably Loofah's
    # whitespace-aware text conversion -- confirm against the parser concern.
    def sanitize(html, options = {})
      return unless html
      return super unless options[:preserve_whitespace]

      parse_fragment(html).to_text
    end
  end
end

module Scrubber
module Full
def scrub(fragment, options = {})
Expand Down Expand Up @@ -217,11 +230,20 @@ module HTML4
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# # => "Bold no more! See more here..."
#
# === Options
#
# If whitespace is significant you can pass preserve_whitespace: true.
# This option is slower, but is clever about whitespace around block elements and line break elements.
#
# full_sanitizer = Rails::HTML4::FullSanitizer.new
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# # => "\nParagraphs\n and \n newlines"
class FullSanitizer < Rails::HTML::Sanitizer
include HTML::Concern::ComposedSanitize
include HTML::Concern::Parser::HTML4
include HTML::Concern::Scrubber::Full
include HTML::Concern::Serializer::UTF8Encode
# Keep this include after the ones above: a module included later is searched
# earlier during method lookup, so PreserveWhitespace#sanitize is reached first
# and its `super` call falls through to the modules included before it
# (presumably ComposedSanitize#sanitize -- confirm).
include HTML::Concern::Sanitizer::PreserveWhitespace
end

# == Rails::HTML4::LinkSanitizer
Expand Down Expand Up @@ -307,11 +329,20 @@ module HTML5
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# # => "Bold no more! See more here..."
#
# === Options
#
# If whitespace is significant you can pass preserve_whitespace: true.
# This option is slower, but is clever about whitespace around block elements and line break elements.
#
# full_sanitizer = Rails::HTML5::FullSanitizer.new
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# # => "\nParagraphs\n and \n newlines"
class FullSanitizer < Rails::HTML::Sanitizer
include HTML::Concern::ComposedSanitize
include HTML::Concern::Parser::HTML5
include HTML::Concern::Scrubber::Full
include HTML::Concern::Serializer::UTF8Encode
# Keep this include after the ones above: a module included later is searched
# earlier during method lookup, so PreserveWhitespace#sanitize is reached first
# and its `super` call falls through to the modules included before it
# (presumably ComposedSanitize#sanitize -- confirm).
include HTML::Concern::Sanitizer::PreserveWhitespace
end

# == Rails::HTML5::LinkSanitizer
Expand Down
66 changes: 43 additions & 23 deletions test/sanitizer_test.rb
Expand Up @@ -80,50 +80,53 @@ module FullSanitizerTest

def test_strip_tags_with_quote
input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{&lt;" hi},
# other libxml2
%{ hi},
# preserve_whitespace: true
"&lt;&quot; hi",
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_invalid_html
assert_equal "&lt;&lt;", full_sanitize("<<<bad html")
assert_full_sanitized "&lt;&lt;", "<<<bad html"
end

def test_strip_nested_tags
expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos"
input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos"
assert_equal expected, full_sanitize(input)
assert_full_sanitized expected, input
end

def test_strip_tags_multiline
expected = %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}
input = %{<h1>This is <b>a <a href="" target="_blank">test</a></b>.</h1>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n}
acceptable_results = [
%{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
# preserve_whitespace: true
%{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
]

assert_equal expected, full_sanitize(input)
assert_full_sanitized acceptable_results, input
end

def test_remove_unclosed_tags
input = "This is <-- not\n a comment here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{This is &lt;-- not\n a comment here.},
# other libxml2
%{This is },
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_cdata
input = "This has a <![CDATA[<section>]]> here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 = 2.9.14
%{This has a &lt;![CDATA[]]&gt; here.},
Expand All @@ -133,51 +136,68 @@ def test_strip_cdata
%{This has a here.},
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_blank_string
assert_nil full_sanitize(nil)
assert_equal "", full_sanitize("")
assert_equal " ", full_sanitize(" ")
assert_nil full_sanitize(nil, preserve_whitespace: true)
assert_full_sanitized "", ""
assert_full_sanitized " ", " "
end

def test_strip_tags_with_plaintext
assert_equal "Don't touch me", full_sanitize("Don't touch me")
assert_full_sanitized "Don't touch me", "Don't touch me"
end

def test_strip_tags_with_tags
assert_equal "This is a test.", full_sanitize("<p>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</p>")
assert_full_sanitized "This is a test.", "<b>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</b>"
end

def test_escape_tags_with_many_open_quotes
assert_equal "&lt;&lt;", full_sanitize("<<<bad html>")
assert_full_sanitized "&lt;&lt;", "<<<bad html>"
end

def test_strip_tags_with_sentence
assert_equal "This is a test.", full_sanitize("This is a test.")
assert_full_sanitized "This is a test.", "This is a test."
end

def test_strip_tags_with_comment
assert_equal "This has a here.", full_sanitize("This has a <!-- comment --> here.")
assert_full_sanitized "This has a here.", "This has a <!-- comment --> here."
end

def test_strip_tags_with_frozen_string
assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
end

def test_full_sanitize_respect_html_escaping_of_the_given_string
assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
assert_equal "&amp;", full_sanitize("&")
assert_equal "&amp;", full_sanitize("&amp;")
assert_equal "&amp;amp;", full_sanitize("&amp;amp;")
assert_equal "omg &lt;script&gt;BOM&lt;/script&gt;", full_sanitize("omg &lt;script&gt;BOM&lt;/script&gt;")
assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
assert_full_sanitized "&amp;", "&"
assert_full_sanitized "&amp;", "&amp;"
assert_full_sanitized "&amp;amp;", "&amp;amp;"
assert_full_sanitized "omg &lt;script&gt;BOM&lt;/script&gt;", "omg &lt;script&gt;BOM&lt;/script&gt;"
end

# With preserve_whitespace enabled, the <p> and <br> elements are expected to
# show up as newlines in the stripped text instead of vanishing.
def test_full_sanitize_preserve_whitespace
  html = "<p>Paragraphs</p> and <br> newlines"
  expected = "\nParagraphs\n and \n newlines"

  assert_equal expected, full_sanitize(html, preserve_whitespace: true)
end

# Binary-encoded (ASCII-8BIT) input must come back as UTF-8 text when the
# preserve_whitespace option is used. The option has to be passed explicitly
# here; without it this test would only exercise the default path again.
def test_full_sanitize_preserve_whitespace_ascii_8bit_string
  input = "<a>hello</a>".encode("ASCII-8BIT")
  sanitized = full_sanitize(input, preserve_whitespace: true)

  assert_equal "hello", sanitized
  assert_equal Encoding::UTF_8, sanitized.encoding
end

protected
# Runs +input+ through a fresh FullSanitizer taken from the module under test
# (supplied by the including test class via +module_under_test+), forwarding
# +options+ untouched, and returns the sanitizer's result.
def full_sanitize(input, options = {})
  sanitizer = module_under_test::FullSanitizer.new
  sanitizer.sanitize(input, options)
end

# Asserts that +input+ sanitizes to one of +acceptable_results+ (a single
# value or an array of values) in both modes: first with default options,
# then with <tt>preserve_whitespace: true</tt>.
#
# Each assertion carries the options it ran with, so a failure report shows
# which of the two modes produced the unexpected output.
def assert_full_sanitized(acceptable_results, input)
  candidates = Array(acceptable_results)

  [{}, { preserve_whitespace: true }].each do |options|
    assert_includes(candidates, full_sanitize(input, options), "with options #{options.inspect}")
  end
end
end

class HTML4FullSanitizerTest < Minitest::Test
Expand Down

0 comments on commit 82fea0e

Please sign in to comment.