From 219526312dafa270bb7778769d9d345b17b9d5be Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 11 Apr 2021 00:55:03 -0400 Subject: [PATCH] feat!: XSLT docs are parsed with additional ParseOptions Closes #1940 --- CHANGELOG.md | 8 ++++++++ lib/nokogiri/xml/parse_options.rb | 2 ++ lib/nokogiri/xslt.rb | 5 +++-- test/test_xslt_transforms.rb | 33 +++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eedb439931..2408760a24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,14 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA --- +## next / unreleased + +### Changed + +* Introduce `Nokogiri::XML::ParseOptions::DEFAULT_XSLT` which adds the libxslt-preferred options of `NOENT | DTDLOAD | DTDATTR | NOCDATA` to `ParseOptions::DEFAULT_XML`. +* `Nokogiri.XSLT` parses the stylesheet using `ParseOptions::DEFAULT_XSLT`, which should make some edge-case XSL transformations match libxslt's default behavior. [[#1940](https://github.com/sparklemotion/nokogiri/issues/1940)] + + ## 1.11.3 / 2021-04-07 ### Fixed diff --git a/lib/nokogiri/xml/parse_options.rb b/lib/nokogiri/xml/parse_options.rb index a266d5ba07..36186ab7b9 100644 --- a/lib/nokogiri/xml/parse_options.rb +++ b/lib/nokogiri/xml/parse_options.rb @@ -71,6 +71,8 @@ class ParseOptions # the default options used for parsing XML documents DEFAULT_XML = RECOVER | NONET + # the default options used for parsing XSLT stylesheets + DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA # the default options used for parsing HTML documents DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET # the default options used for parsing XML schemas diff --git a/lib/nokogiri/xslt.rb b/lib/nokogiri/xslt.rb index 503c91419e..b6c9442008 100644 --- a/lib/nokogiri/xslt.rb +++ b/lib/nokogiri/xslt.rb @@ -27,10 +27,11 @@ def parse(string, modules = {}) XSLT.register(url, klass) end + doc = XML::Document.parse(string, nil, nil, XML::ParseOptions::DEFAULT_XSLT) if Nokogiri.jruby? - Stylesheet.parse_stylesheet_doc(XML.parse(string), string) + Stylesheet.parse_stylesheet_doc(doc, string) else - Stylesheet.parse_stylesheet_doc(XML.parse(string)) + Stylesheet.parse_stylesheet_doc(doc) end end diff --git a/test/test_xslt_transforms.rb b/test/test_xslt_transforms.rb index 3dae8935af..175c9920e3 100644 --- a/test/test_xslt_transforms.rb +++ b/test/test_xslt_transforms.rb @@ -367,5 +367,38 @@ def test_non_html_xslt_transform end assert_match(/decimal/, exception.message) end + + describe "DEFAULT_XSLT parse options" do + it "is the union of DEFAULT_XML and libxslt's XSLT_PARSE_OPTIONS" do + xslt_parse_options = Nokogiri::XML::ParseOptions.new.noent.dtdload.dtdattr.nocdata + expected = Nokogiri::XML::ParseOptions::DEFAULT_XML | xslt_parse_options.options + assert_equal(expected, Nokogiri::XML::ParseOptions::DEFAULT_XSLT) + end + + it "parses docs the same as xsltproc" do + skip_unless_libxml2("JRuby implementation disallows this edge case XSLT") + + # see https://github.com/sparklemotion/nokogiri/issues/1940 + xml = "" + xsl = <<~EOF + + + + + ]]> + + + EOF + + doc = Nokogiri::XML(xml) + stylesheet = Nokogiri::XSLT(xsl) + + # TODO: ideally I'd like to be able to access the parse options in the final object + # assert_equal(Nokogiri::XML::ParseOptions::DEFAULT_XSLT, stylesheet.document.parse_options) + + result = stylesheet.transform(doc) + assert_equal("<>", result.children.to_xml) + end + end end end