From 2f1513b197a4d8db88d4aeb8fe96ce95e9c5c7be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibaut=20Barr=C3=A8re?= Date: Tue, 8 Sep 2020 20:14:54 +0200 Subject: [PATCH 1/6] Add fixture for ISO-8859-1 issue reproduction --- test/files/iso-8859-1.xml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 test/files/iso-8859-1.xml diff --git a/test/files/iso-8859-1.xml b/test/files/iso-8859-1.xml new file mode 100644 index 0000000000..d714a0989b --- /dev/null +++ b/test/files/iso-8859-1.xml @@ -0,0 +1,5 @@ + + + Accepté + Something + From 1926bf60ed4a785eb890ef594b364e9a5eebe737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibaut=20Barr=C3=A8re?= Date: Tue, 8 Sep 2020 20:15:59 +0200 Subject: [PATCH 2/6] Add reproduction for JRuby regression on ISO-8859-1 content --- test/test_iso.rb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 test/test_iso.rb diff --git a/test/test_iso.rb b/test/test_iso.rb new file mode 100644 index 0000000000..0f3f64a6e0 --- /dev/null +++ b/test/test_iso.rb @@ -0,0 +1,15 @@ +require "helper" + +class TestISO < Nokogiri::TestCase + def test_iso_content_not_lacking_accents + data = IO.binread('test/files/iso-8859-1.xml') + document = Nokogiri::XML(data) + assert_equal "Accepté", document.at('DATA').text + end + + def test_iso_content_not_truncated + data = IO.binread('test/files/iso-8859-1.xml') + document = Nokogiri::XML(data) + assert_equal 2, document.search('DATA').count + end +end From d306fc43179d6c1276cfa1bfad1a615c1cb62eda Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 15 Sep 2020 09:11:07 -0400 Subject: [PATCH 3/6] Do not use a default encoding when parsing XML documents. This change fixes the tests in #2080, but introduces more errors. The errors are mostly unexpected null encoding when parsing an HTML document. 
--- ext/java/nokogiri/internals/XmlDomParserContext.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/java/nokogiri/internals/XmlDomParserContext.java b/ext/java/nokogiri/internals/XmlDomParserContext.java index 3a75d72f6f..d400faae0c 100644 --- a/ext/java/nokogiri/internals/XmlDomParserContext.java +++ b/ext/java/nokogiri/internals/XmlDomParserContext.java @@ -93,7 +93,7 @@ public XmlDomParserContext(Ruby runtime, IRubyObject options) { public XmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) { super(runtime); this.options = new ParserContext.Options(RubyFixnum.fix2long(options)); - java_encoding = NokogiriHelpers.getValidEncoding(encoding); + java_encoding = NokogiriHelpers.getValidEncodingOrNull(encoding); ruby_encoding = encoding; initErrorHandler(); initParser(runtime); From 8a602ec81282b2966dcf3c693ee79211b7aa0658 Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 15 Sep 2020 09:16:57 -0400 Subject: [PATCH 4/6] Use default encoding, instead of null, when parsing HTML docs. This change reduces the number of errors from 23 to 4. 
--- ext/java/nokogiri/internals/HtmlDomParserContext.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java index 52955b5d13..cafcee7c54 100644 --- a/ext/java/nokogiri/internals/HtmlDomParserContext.java +++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java @@ -66,11 +66,12 @@ public class HtmlDomParserContext extends XmlDomParserContext { public HtmlDomParserContext(Ruby runtime, IRubyObject options) { - super(runtime, options); + this(runtime, runtime.getNil(), options); } - + public HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) { super(runtime, encoding, options); + java_encoding = NokogiriHelpers.getValidEncoding(encoding); } @Override From 763b07125d643059a5477a34af6af5871b3e13c7 Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 15 Sep 2020 09:18:56 -0400 Subject: [PATCH 5/6] Use setStringInputSource instead of setInputSource. This is another place where we pass less information to the Java layer that was missed in ba16682aae2cbd42c196bd8afdfcfe8a5d82fbdb. 
--- ext/java/nokogiri/XmlNode.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index 45853a71ed..1a7e48459c 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -759,15 +759,13 @@ public IRubyObject in_context(ThreadContext context, IRubyObject str, IRubyObjec klass = getNokogiriClass(runtime, "Nokogiri::HTML::Document"); ctx = new HtmlDomParserContext(runtime, options); ((HtmlDomParserContext) ctx).enableDocumentFragment(); - istream = new ByteArrayInputStream((rubyStringToString(str)).getBytes()); + ctx.setStringInputSource(context, str, context.nil); } else { klass = getNokogiriClass(runtime, "Nokogiri::XML::Document"); ctx = new XmlDomParserContext(runtime, options); - String input = rubyStringToString(str); - istream = new ByteArrayInputStream(input.getBytes()); + ctx.setStringInputSource(context, str, context.nil); } - ctx.setInputSource(istream); // TODO: for some reason, document.getEncoding() can be null or nil (don't know why) // run `test_parse_with_unparented_html_text_context_node' few times to see this happen if (document instanceof HtmlDocument && !(document.getEncoding() == null || document.getEncoding().isNil())) { From 9cf1b353f5ed5f5072f7d54117515f566155b6e9 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 12 Oct 2020 12:23:18 -0400 Subject: [PATCH 6/6] update CHANGELOG for document encoding fix Related to #2080, #2083 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 003a08d643..c631eaf619 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,7 @@ This release ends support for: * [CRuby] Fixed installation on AIX with respect to `vasprintf`. [[#1908](https://github.com/sparklemotion/nokogiri/issues/1908)] * [JRuby] Standardize reading from IO like objects, including StringIO. 
[[#1888](https://github.com/sparklemotion/nokogiri/issues/1888), [#1897](https://github.com/sparklemotion/nokogiri/issues/1897)] * [Windows Visual C++] Fixed compiler warnings and errors. [[#2061](https://github.com/sparklemotion/nokogiri/issues/2061), [#2068](https://github.com/sparklemotion/nokogiri/issues/2068)] +* [JRuby] Fixed document encoding regression in v1.11.0 release candidates. [[#2080](https://github.com/sparklemotion/nokogiri/issues/2080), [#2083](https://github.com/sparklemotion/nokogiri/issues/2083)] (Thanks, [@thbar](https://github.com/thbar)!) ### Removed