Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add an option to preserve whitespace to FullSanitizer #157

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Expand Up @@ -14,6 +14,9 @@

*Mike Dalessio*

* `FullSanitizer` now supports the optional argument `preserve_whitespace` to keep whitespace around block elements and line break elements.

*Earlopain*

## 1.5.0 / 2023-01-20

Expand Down
15 changes: 5 additions & 10 deletions README.md
Expand Up @@ -62,20 +62,15 @@ All sanitizers respond to `sanitize`, and are available in variants that use eit
full_sanitizer = Rails::HTML5::FullSanitizer.new
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# => Bold no more! See more here...
```

or, if you insist on parsing the content as HTML4:
# Whitespace is swallowed by default. If whitespace is significant you must pass an option to preserve it.
# This option is slower, but is clever about whitespace around block elements and line break elements.

```ruby
full_sanitizer = Rails::HTML4::FullSanitizer.new
full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# => Bold no more! See more here...
full_sanitizer = Rails::HTML5::FullSanitizer.new
full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# => \nParagraphs\n and \n newlines
```

HTML5 version:



#### LinkSanitizer

```ruby
Expand Down
31 changes: 31 additions & 0 deletions lib/rails/html/sanitizer.rb
Expand Up @@ -66,6 +66,19 @@ def parse_fragment(html)
end if Rails::HTML::Sanitizer.html5_support?
end

module Sanitizer
  # Mixin that adds an opt-in whitespace-preserving mode to +sanitize+.
  #
  # The including class must provide +parse_fragment+ and a +sanitize+
  # implementation further up the ancestor chain to fall back on.
  module PreserveWhitespace
    # Returns nil when +html+ is nil (or false).
    #
    # When <tt>options[:preserve_whitespace]</tt> is truthy, the parsed
    # fragment is converted with +to_text+; otherwise the call is handed to
    # +super+ with the same arguments.
    def sanitize(html, options = {})
      return unless html
      return super unless options[:preserve_whitespace]

      parse_fragment(html).to_text
    end
  end
end

module Scrubber
module Full
def scrub(fragment, options = {})
Expand Down Expand Up @@ -217,11 +230,20 @@ module HTML4
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# # => "Bold no more! See more here..."
#
# === Options
#
# If whitespace is significant you can pass preserve_whitespace: true.
# This option is slower, but is clever about whitespace around block elements and line break elements.
#
# full_sanitizer = Rails::HTML4::FullSanitizer.new
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# # => "\nParagraphs\n and \n newlines"
class FullSanitizer < Rails::HTML::Sanitizer
include HTML::Concern::ComposedSanitize
include HTML::Concern::Parser::HTML4
include HTML::Concern::Scrubber::Full
include HTML::Concern::Serializer::UTF8Encode
# Included last so its #sanitize is found first in method lookup; it calls
# super unless the preserve_whitespace option is set.
include HTML::Concern::Sanitizer::PreserveWhitespace
end

# == Rails::HTML4::LinkSanitizer
Expand Down Expand Up @@ -307,11 +329,20 @@ module HTML5
# full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
# # => "Bold no more! See more here..."
#
# === Options
#
# If whitespace is significant you can pass preserve_whitespace: true.
# This option is slower, but is clever about whitespace around block elements and line break elements.
#
# full_sanitizer = Rails::HTML5::FullSanitizer.new
# full_sanitizer.sanitize("<p>Paragraphs</p> and <br> newlines", preserve_whitespace: true)
# # => "\nParagraphs\n and \n newlines"
class FullSanitizer < Rails::HTML::Sanitizer
include HTML::Concern::ComposedSanitize
include HTML::Concern::Parser::HTML5
include HTML::Concern::Scrubber::Full
include HTML::Concern::Serializer::UTF8Encode
# Included last so its #sanitize is found first in method lookup; it calls
# super unless the preserve_whitespace option is set.
include HTML::Concern::Sanitizer::PreserveWhitespace
end

# == Rails::HTML5::LinkSanitizer
Expand Down
66 changes: 43 additions & 23 deletions test/sanitizer_test.rb
Expand Up @@ -80,50 +80,53 @@ module FullSanitizerTest

def test_strip_tags_with_quote
input = '<" <img src="trollface.gif" onload="alert(1)"> hi'
result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{&lt;" hi},
# other libxml2
%{ hi},
# preserve_whitespace: true
"&lt;&quot; hi",
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_invalid_html
assert_equal "&lt;&lt;", full_sanitize("<<<bad html")
assert_full_sanitized "&lt;&lt;", "<<<bad html"
end

def test_strip_nested_tags
expected = "Wei&lt;a onclick='alert(document.cookie);'/&gt;rdos"
input = "Wei<<a>a onclick='alert(document.cookie);'</a>/>rdos"
assert_equal expected, full_sanitize(input)
assert_full_sanitized expected, input
end

def test_strip_tags_multiline
expected = %{This is a test.\n\n\n\nIt no longer contains any HTML.\n}
input = %{<h1>This is <b>a <a href="" target="_blank">test</a></b>.</h1>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n}
acceptable_results = [
%{This is a test.\n\n\n\nIt no longer contains any HTML.\n},
# preserve_whitespace: true
%{\nThis is a test.\n\nIt no longer contains any HTML.\n\n}
]

assert_equal expected, full_sanitize(input)
assert_full_sanitized acceptable_results, input
end

def test_remove_unclosed_tags
input = "This is <-- not\n a comment here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{This is &lt;-- not\n a comment here.},
# other libxml2
%{This is },
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_cdata
input = "This has a <![CDATA[<section>]]> here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 = 2.9.14
%{This has a &lt;![CDATA[]]&gt; here.},
Expand All @@ -133,51 +136,68 @@ def test_strip_cdata
%{This has a here.},
]

assert_includes(acceptable_results, result)
assert_full_sanitized(acceptable_results, input)
end

def test_strip_blank_string
assert_nil full_sanitize(nil)
assert_equal "", full_sanitize("")
assert_equal " ", full_sanitize(" ")
assert_nil full_sanitize(nil, preserve_whitespace: true)
assert_full_sanitized "", ""
assert_full_sanitized " ", " "
end

def test_strip_tags_with_plaintext
assert_equal "Don't touch me", full_sanitize("Don't touch me")
assert_full_sanitized "Don't touch me", "Don't touch me"
end

def test_strip_tags_with_tags
assert_equal "This is a test.", full_sanitize("<p>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</p>")
assert_full_sanitized "This is a test.", "<b>This <u>is<u> a <a href='test.html'><strong>test</strong></a>.</b>"
end

def test_escape_tags_with_many_open_quotes
assert_equal "&lt;&lt;", full_sanitize("<<<bad html>")
assert_full_sanitized "&lt;&lt;", "<<<bad html>"
end

def test_strip_tags_with_sentence
assert_equal "This is a test.", full_sanitize("This is a test.")
assert_full_sanitized "This is a test.", "This is a test."
end

def test_strip_tags_with_comment
assert_equal "This has a here.", full_sanitize("This has a <!-- comment --> here.")
assert_full_sanitized "This has a here.", "This has a <!-- comment --> here."
end

def test_strip_tags_with_frozen_string
assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
assert_full_sanitized "Frozen string with no tags", "Frozen string with no tags"
end

def test_full_sanitize_respect_html_escaping_of_the_given_string
assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
assert_equal "&amp;", full_sanitize("&")
assert_equal "&amp;", full_sanitize("&amp;")
assert_equal "&amp;amp;", full_sanitize("&amp;amp;")
assert_equal "omg &lt;script&gt;BOM&lt;/script&gt;", full_sanitize("omg &lt;script&gt;BOM&lt;/script&gt;")
assert_full_sanitized 'test\r\nstring', 'test\r\nstring'
assert_full_sanitized "&amp;", "&"
assert_full_sanitized "&amp;", "&amp;"
assert_full_sanitized "&amp;amp;", "&amp;amp;"
assert_full_sanitized "omg &lt;script&gt;BOM&lt;/script&gt;", "omg &lt;script&gt;BOM&lt;/script&gt;"
end

# With preserve_whitespace: true, block elements and <br> show up as newlines
# in the sanitized text.
def test_full_sanitize_preserve_whitespace
  input = "<p>Paragraphs</p> and <br> newlines"
  expected = "\nParagraphs\n and \n newlines"

  assert_equal expected, full_sanitize(input, preserve_whitespace: true)
end

# Binary-encoded (ASCII-8BIT) input sanitized with preserve_whitespace: true
# must come back as the stripped text in a UTF-8 string.
def test_full_sanitize_preserve_whitespace_ascii_8bit_string
  input = "<a>hello</a>".encode("ASCII-8BIT")

  # The option has to be passed explicitly; without it this test only
  # exercises the default sanitize path despite its name.
  full_sanitize(input, preserve_whitespace: true).tap do |sanitized|
    assert_equal "hello", sanitized
    assert_equal Encoding::UTF_8, sanitized.encoding
  end
end

protected
# Sanitizes +input+ with a freshly built FullSanitizer taken from
# +module_under_test+, forwarding +options+ unchanged.
def full_sanitize(input, options = {})
  sanitizer = module_under_test::FullSanitizer.new
  sanitizer.sanitize(input, options)
end

# Asserts that sanitizing +input+ gives one of +acceptable_results+ (a single
# value or an array of values), first with default options and then with
# preserve_whitespace: true.
def assert_full_sanitized(acceptable_results, input)
  candidates = Array(acceptable_results)

  [{}, { preserve_whitespace: true }].each do |options|
    assert_includes(candidates, full_sanitize(input, options))
  end
end
end

class HTML4FullSanitizerTest < Minitest::Test
Expand Down