Skip to content

Commit

Permalink
fix: make sure HTML5::Document{,Fragment} subclass properly
Browse files Browse the repository at this point in the history
Loofah and other downstream libraries rely on this behavior. This is
long-term prep for a day when HTML5 may become the default on
supported platforms.
  • Loading branch information
flavorjones committed May 8, 2022
1 parent 4600a1c commit ebde7da
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 5 deletions.
10 changes: 6 additions & 4 deletions ext/nokogiri/gumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
//
// Processing starts by calling gumbo_parse_with_options. The resulting document tree
// is then walked, a parallel libxml2 tree is constructed, and the final document is
// then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU
// then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
// requirements as Ruby objects are only built when necessary.
//

Expand Down Expand Up @@ -297,6 +297,7 @@ typedef struct {
GumboOutput *output;
VALUE input;
VALUE url_or_frag;
VALUE klass;
xmlDocPtr doc;
} ParseArgs;

Expand All @@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
* @!visibility protected
*/
static VALUE
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
{
GumboOptions options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
Expand All @@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
.output = output,
.input = input,
.url_or_frag = url,
.klass = klass,
.doc = NULL,
};

Expand All @@ -357,7 +359,7 @@ parse_continue(VALUE parse_args)
}
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
build_tree(doc, (xmlNodePtr)doc, output->document);
VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
add_errors(output, rdoc, args->input, args->url_or_frag);
return rdoc;
Expand Down Expand Up @@ -577,7 +579,7 @@ noko_init_gumbo()
parent = rb_intern_const("parent");

// Define Nokogumbo module with parse and fragment methods.
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
}

Expand Down
2 changes: 1 addition & 1 deletion lib/nokogiri/html5/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def do_parse(string_or_io, url, encoding, options)
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
doc.encoding = "UTF-8"
doc
end
Expand Down
118 changes: 118 additions & 0 deletions test/html5/test_api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,122 @@ def test_html_eh
assert_predicate(doc, :html?)
refute_predicate(doc, :xml?)
end

describe Nokogiri::HTML5::Document do
  # Checks that a user-defined subclass of HTML5::Document is what comes back
  # from .new, #dup and .parse, and that #initialize runs exactly once.
  describe "subclassing" do
    # Anonymous subclass that records the arguments given to #initialize and
    # how many times #initialize has been invoked on the instance.
    let(:klass) do
      Class.new(Nokogiri::HTML5::Document) do
        attr_accessor :initialized_with, :initialized_count

        def initialize(*args)
          super # bare super forwards the original arguments unchanged
          @initialized_with = args
          @initialized_count = (@initialized_count || 0) + 1
        end
      end
    end

    describe ".new" do
      it "returns an instance of the expected class" do
        assert_instance_of(klass, klass.new)
      end

      it "calls #initialize exactly once" do
        assert_equal(1, klass.new.initialized_count)
      end

      it "passes arguments to #initialize" do
        init_args = ["http://www.w3.org/TR/REC-html40/loose.dtd", "-//W3C//DTD HTML 4.0 Transitional//EN"]
        instance = klass.new(*init_args)
        assert_equal(init_args, instance.initialized_with)
      end
    end

    it "#dup returns the expected class" do
      copy = klass.new.dup
      assert_instance_of(klass, copy)
    end

    describe ".parse" do
      # Reference document produced by the stock HTML5 parser entry point.
      let(:html) { Nokogiri::HTML5.parse(File.read(HTML_FILE)) }

      it "returns an instance of the expected class" do
        assert_instance_of(klass, klass.parse(File.read(HTML_FILE)))
      end

      it "calls #initialize exactly once" do
        parsed = klass.parse(File.read(HTML_FILE))
        assert_equal(1, parsed.initialized_count)
      end

      it "parses the doc" do
        parsed = klass.parse(File.read(HTML_FILE))
        # The subclass must serialize identically to the reference document.
        assert_equal(html.root.to_s, parsed.root.to_s)
      end
    end
  end
end

describe Nokogiri::HTML5::DocumentFragment do
  # Checks that a user-defined subclass of HTML5::DocumentFragment is what
  # comes back from .new, #dup and .parse, and that #initialize runs exactly
  # once.
  describe "subclassing" do
    # Anonymous subclass that records the arguments given to #initialize and
    # how many times #initialize has been invoked on the instance.
    let(:klass) do
      Class.new(Nokogiri::HTML5::DocumentFragment) do
        attr_accessor :initialized_with, :initialized_count

        def initialize(*args)
          super # bare super forwards the original arguments unchanged
          @initialized_with = args
          @initialized_count ||= 0
          @initialized_count += 1
        end
      end
    end

    # Document passed as the first argument when constructing fragments.
    let(:html) { Nokogiri::HTML5.parse(File.read(HTML_FILE), HTML_FILE) }

    describe ".new" do
      it "returns an instance of the right class" do
        fragment = klass.new(html, "<div>a</div>")
        assert_instance_of(klass, fragment)
      end

      it "calls #initialize exactly once" do
        fragment = klass.new(html, "<div>a</div>")
        assert_equal(1, fragment.initialized_count)
      end

      it "passes args to #initialize" do
        fragment = klass.new(html, "<div>a</div>")
        assert_equal([html, "<div>a</div>"], fragment.initialized_with)
      end
    end

    it "#dup returns the expected class" do
      fragment = klass.new(html, "<div>a</div>").dup
      assert_instance_of(klass, fragment)
    end

    describe ".parse" do
      it "returns an instance of the right class" do
        fragment = klass.parse("<div>a</div>")
        assert_instance_of(klass, fragment)
      end

      it "calls #initialize exactly once" do
        fragment = klass.parse("<div>a</div>")
        assert_equal(1, fragment.initialized_count)
      end

      # Compares the subclass's serialized output with what the base class
      # produces for the same markup.
      it "parses the fragment" do
        fragment = klass.parse("<div>a</div>")
        assert_equal(Nokogiri::HTML5::DocumentFragment.parse("<div>a</div>").to_s, fragment.to_s)
      end
    end
  end
end
end if Nokogiri.uses_gumbo?

0 comments on commit ebde7da

Please sign in to comment.