Skip to content

Commit

Permalink
Merge pull request #2534 from sparklemotion/2331-html5-subclassing
Browse files Browse the repository at this point in the history
html5 subclassing

---

**What problem is this PR intended to solve?**

See #2331 for context. I want to start getting things in place to make it possible to seamlessly switch to HTML5 parsing by default on supported platform. Part of this will require subclassing behavior to work properly (i.e., as Loofah expects it to, where a subclass of Nokogiri::HTML5::Document will return the appropriate subclass from `.parse`).

This PR introduces that subclassing behavior, and makes all the HTML4 tests explicitly use `HTML4` instead of `HTML`.

Note that `Gumbo.parse` now takes an additional argument, which is the class that should be used for the new document. `Gumbo.parse` is considered to be an internal-only API and so this shouldn't be a breaking change, but it might be worth mentioning in release notes just in case.

**Have you included adequate test coverage?**

Yes, additional coverage has been added to `test/html5/test_api.rb`


**Does this change affect the behavior of either the C or the Java implementations?**

HTML5 only exists in the CRuby implementation
  • Loading branch information
flavorjones committed May 9, 2022
2 parents 199434c + ebde7da commit c39183a
Show file tree
Hide file tree
Showing 23 changed files with 363 additions and 263 deletions.
10 changes: 6 additions & 4 deletions ext/nokogiri/gumbo.c
Expand Up @@ -23,7 +23,7 @@
//
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
// is then walked, a parallel libxml2 tree is constructed, and the final document is
// then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
// requirements as Ruby objects are only built when necessary.
//

Expand Down Expand Up @@ -297,6 +297,7 @@ typedef struct {
GumboOutput *output;
VALUE input;
VALUE url_or_frag;
VALUE klass;
xmlDocPtr doc;
} ParseArgs;

Expand All @@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
* @!visibility protected
*/
static VALUE
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
{
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
Expand All @@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
.output = output,
.input = input,
.url_or_frag = url,
.klass = klass,
.doc = NULL,
};

Expand All @@ -357,7 +359,7 @@ parse_continue(VALUE parse_args)
}
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
build_tree(doc, (xmlNodePtr)doc, output->document);
VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
add_errors(output, rdoc, args->input, args->url_or_frag);
return rdoc;
Expand Down Expand Up @@ -577,7 +579,7 @@ noko_init_gumbo()
parent = rb_intern_const("parent");

// Define Nokogumbo module with parse and fragment methods.
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
}

Expand Down
2 changes: 1 addition & 1 deletion lib/nokogiri/html5/document.rb
Expand Up @@ -63,7 +63,7 @@ def do_parse(string_or_io, url, encoding, options)
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
doc.encoding = "UTF-8"
doc
end
Expand Down
8 changes: 4 additions & 4 deletions test/html4/sax/test_parser.rb
Expand Up @@ -9,7 +9,7 @@ module SAX
class TestParser < Nokogiri::SAX::TestCase
def setup
super
@parser = HTML::SAX::Parser.new(Doc.new)
@parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new)
end

def test_parse_empty_document
Expand Down Expand Up @@ -163,9 +163,9 @@ def test_empty_processing_instruction
end

it "handles invalid types gracefully" do
assert_raises(TypeError) { Nokogiri::HTML::SAX::Parser.new.parse(0xcafecafe) }
assert_raises(TypeError) { Nokogiri::HTML::SAX::Parser.new.parse_memory(0xcafecafe) }
assert_raises(TypeError) { Nokogiri::HTML::SAX::Parser.new.parse_io(0xcafecafe) }
assert_raises(TypeError) { Nokogiri::HTML4::SAX::Parser.new.parse(0xcafecafe) }
assert_raises(TypeError) { Nokogiri::HTML4::SAX::Parser.new.parse_memory(0xcafecafe) }
assert_raises(TypeError) { Nokogiri::HTML4::SAX::Parser.new.parse_io(0xcafecafe) }
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion test/html4/sax/test_parser_text.rb
Expand Up @@ -10,7 +10,7 @@ class TestParserText < Nokogiri::SAX::TestCase
def setup
super
@doc = DocWithOrderedItems.new
@parser = HTML::SAX::Parser.new(@doc)
@parser = Nokogiri::HTML4::SAX::Parser.new(@doc)
end

def test_texts_order
Expand Down
2 changes: 1 addition & 1 deletion test/html4/sax/test_push_parser.rb
Expand Up @@ -9,7 +9,7 @@ module SAX
class TestPushParser < Nokogiri::SAX::TestCase
def setup
super
@parser = HTML::SAX::PushParser.new(Doc.new)
@parser = Nokogiri::HTML4::SAX::PushParser.new(Doc.new)
end

def test_end_document_called
Expand Down
2 changes: 1 addition & 1 deletion test/html4/test_attributes.rb
Expand Up @@ -55,7 +55,7 @@ class TestAttr < Nokogiri::TestCase

html = %{<#{config[:tag]} #{config[:attr]}='examp<!--" unsafeattr=unsafevalue()>-->le.com'>test</#{config[:tag]}>}

reparsed = HTML.fragment(HTML.fragment(html).to_html)
reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html)
attributes = reparsed.at_css(config[:tag]).attribute_nodes

assert_equal [config[:attr]], attributes.collect(&:name)
Expand Down
6 changes: 3 additions & 3 deletions test/html4/test_attributes_properly_escaped.rb
Expand Up @@ -9,7 +9,7 @@ def test_attribute_macros_are_escaped
skip_unless_libxml2_patch("0001-Remove-script-macro-support.patch") if Nokogiri.uses_libxml?

html = "<p><i for=\"&{<test>}\"></i></p>"
document = Nokogiri::HTML::Document.new
document = Nokogiri::HTML4::Document.new
nodes = document.parse(html)

assert_equal("<p><i for=\"&amp;{&lt;test&gt;}\"></i></p>", nodes[0].to_s)
Expand All @@ -19,7 +19,7 @@ def test_libxml_escapes_server_side_includes
skip_unless_libxml2_patch("0002-Update-entities-to-remove-handling-of-ssi.patch") if Nokogiri.uses_libxml?

original_html = %(<p><a href='<!--"><test>-->'></a></p>)
document = Nokogiri::HTML::Document.new
document = Nokogiri::HTML4::Document.new
html = document.parse(original_html).to_s

assert_match(/!--%22&gt;&lt;test&gt;/, html)
Expand All @@ -29,7 +29,7 @@ def test_libxml_escapes_server_side_includes_without_nested_quotes
skip_unless_libxml2_patch("0002-Update-entities-to-remove-handling-of-ssi.patch") if Nokogiri.uses_libxml?

original_html = %(<p><i for="<!--<test>-->"></i></p>)
document = Nokogiri::HTML::Document.new
document = Nokogiri::HTML4::Document.new
html = document.parse(original_html).to_s

assert_match(/&lt;!--&lt;test&gt;/, html)
Expand Down
28 changes: 14 additions & 14 deletions test/html4/test_builder.rb
Expand Up @@ -8,11 +8,11 @@ class TestBuilder < Nokogiri::TestCase
def test_top_level_function_builds
foo = nil
Nokogiri() { |xml| foo = xml }
assert_instance_of(Nokogiri::HTML::Builder, foo)
assert_instance_of(Nokogiri::HTML4::Builder, foo)
end

def test_builder_with_explicit_tags
html_doc = Nokogiri::HTML::Builder.new do
html_doc = Nokogiri::HTML4::Builder.new do
div.slide(class: "another_class") do
node = Nokogiri::XML::Node.new("id", doc)
node.content = "hello"
Expand All @@ -24,7 +24,7 @@ def test_builder_with_explicit_tags
end

def test_hash_as_attributes_for_attribute_method
html = Nokogiri::HTML::Builder.new do ||
html = Nokogiri::HTML4::Builder.new do ||
div.slide(class: "another_class") do
span("Slide 1")
end
Expand All @@ -33,7 +33,7 @@ def test_hash_as_attributes_for_attribute_method
end

def test_hash_as_attributes
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
div(id: "awesome") do
h1("america")
end
Expand All @@ -54,7 +54,7 @@ def test_href_with_attributes
end

def test_tag_nesting
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
body do
span.left("")
span.middle do
Expand All @@ -68,7 +68,7 @@ def test_tag_nesting
end

def test_has_ampersand
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
div.rad.thing! do
text("<awe&some>")
b("hello & world")
Expand All @@ -81,7 +81,7 @@ def test_has_ampersand
end

def test_multi_tags
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
div.rad.thing! do
text("<awesome>")
b("hello")
Expand All @@ -94,7 +94,7 @@ def test_multi_tags
end

def test_attributes_plus_block
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
div.rad.thing! do
text("<awesome>")
end
Expand All @@ -104,22 +104,22 @@ def test_attributes_plus_block
end

def test_builder_adds_attributes
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
div.rad.thing!("tender div")
end
assert_equal('<div class="rad" id="thing">tender div</div>',
builder.doc.root.to_html.chomp)
end

def test_bold_tag
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
b("bold tag")
end
assert_equal("<b>bold tag</b>", builder.doc.root.to_html.chomp)
end

def test_html_then_body_tag
builder = Nokogiri::HTML::Builder.new do
builder = Nokogiri::HTML4::Builder.new do
html do
body do
b("bold tag")
Expand All @@ -137,12 +137,12 @@ def foo
end
end

builder = Nokogiri::HTML::Builder.new { text(foo) }
builder = Nokogiri::HTML4::Builder.new { text(foo) }
assert_includes(builder.to_html, "foo!")
end

def test_builder_with_param
doc = Nokogiri::HTML::Builder.new do |html|
doc = Nokogiri::HTML4::Builder.new do |html|
html.body do
html.p("hello world")
end
Expand All @@ -154,7 +154,7 @@ def test_builder_with_param

def test_builder_with_id
text = "hello world"
doc = Nokogiri::HTML::Builder.new do |html|
doc = Nokogiri::HTML4::Builder.new do |html|
html.body do
html.id_(text)
end
Expand Down
10 changes: 5 additions & 5 deletions test/html4/test_comments.rb
Expand Up @@ -15,7 +15,7 @@ class TestComment < Nokogiri::TestCase
# <!--> or <!--->). The parser behaves as if the comment is
# closed correctly.
describe "abrupt closing of empty comment" do
let(:doc) { Nokogiri::HTML(html) }
let(:doc) { Nokogiri::HTML4(html) }
let(:subject) { doc.at_css("div#under-test") }
let(:other_div) { doc.at_css("div#also-here") }

Expand Down Expand Up @@ -101,7 +101,7 @@ class TestComment < Nokogiri::TestCase
# stream.
describe "eof in comment" do
let(:html) { "<html><body><div id=under-test><!--start of unterminated comment" }
let(:doc) { Nokogiri::HTML(html) }
let(:doc) { Nokogiri::HTML4(html) }
let(:subject) { doc.at_css("div#under-test") }

if Nokogiri.uses_libxml?
Expand Down Expand Up @@ -129,7 +129,7 @@ class TestComment < Nokogiri::TestCase
# code point sequence.
describe "incorrectly closed comment" do
let(:html) { "<html><body><div id=under-test><!--foo--!><div id=do-i-exist></div><!--bar--></div></body></html>" }
let(:doc) { Nokogiri::HTML(html) }
let(:doc) { Nokogiri::HTML4(html) }
let(:subject) { doc.at_css("div#under-test") }
let(:inner_div) { doc.at_css("div#do-i-exist") }

Expand Down Expand Up @@ -169,7 +169,7 @@ class TestComment < Nokogiri::TestCase
describe "incorrectly opened comment" do
let(:html) { "<html><body><div id=under-test><! comment <div id=do-i-exist>inner content</div>-->hello</div></body></html>" }

let(:doc) { Nokogiri::HTML(html) }
let(:doc) { Nokogiri::HTML4(html) }
let(:body) { doc.at_css("body") }
let(:subject) { doc.at_css("div#under-test") }

Expand Down Expand Up @@ -225,7 +225,7 @@ class TestComment < Nokogiri::TestCase
# everything that follows will be treated as markup.
describe "nested comment" do
let(:html) { "<html><body><div id=under-test><!-- outer <!-- inner --><div id=do-i-exist></div>--></div></body></html>" }
let(:doc) { Nokogiri::HTML(html) }
let(:doc) { Nokogiri::HTML4(html) }
let(:subject) { doc.at_css("div#under-test") }
let(:inner_div) { doc.at_css("div#do-i-exist") }

Expand Down

0 comments on commit c39183a

Please sign in to comment.