From 4590844481f778c527555b77e6e5fe74fd379be2 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 28 Nov 2021 16:14:40 -0500 Subject: [PATCH] feat(CRuby): XML::Reader#encoding returns detected enc when not set Previously this always and only returned the encoding passed to the constructor. Now, if no encoding is passed to the constructor, then Reader#encoding will return the encoding detected by the parser. Note that this has not been implemented for JRuby. --- CHANGELOG.md | 1 + ext/java/nokogiri/XmlReader.java | 12 ++++++++++++ ext/nokogiri/xml_reader.c | 19 +++++++++++++++++++ lib/nokogiri/xml/reader.rb | 3 --- test/xml/test_reader_encoding.rb | 26 ++++++++++++++++++++++++++ 5 files changed, 58 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dcb26061b8..a2e0bfce24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ A related discussion about Trust exists at [#2357](https://github.com/sparklemot ### Improved +* [CRuby] XML::Reader#encoding will return the encoding detected by the parser when it's not passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)] * [CRuby] Handle abruptly-closed HTML comments as WHATWG recommends for browsers. (Thanks to HackerOne user [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting this!) * [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)] * [CRuby] If a cycle is introduced when reparenting a node (i.e., the node becomes its own ancestor), a `RuntimeError` is raised. libxml2 does no checking for this, which means cycles would otherwise result in infinite loops on subsequent operations. (Note: JRuby/Xerces already does this.) [[#1912](https://github.com/sparklemotion/nokogiri/issues/1912)] diff --git a/ext/java/nokogiri/XmlReader.java b/ext/java/nokogiri/XmlReader.java index 122831f6e1..f8b01d550a 100644 --- a/ext/java/nokogiri/XmlReader.java +++ b/ext/java/nokogiri/XmlReader.java @@ -184,6 +184,18 @@ public class XmlReader extends RubyObject return RubyBoolean.newBoolean(context.getRuntime(), !readerNode.hasChildren); } + @JRubyMethod + public IRubyObject + encoding(ThreadContext context) + { + IRubyObject constructor_encoding = getInstanceVariable("@encoding"); + if (!constructor_encoding.isNil()) { + return constructor_encoding; + } + // TODO: get the parser's detected encoding + return context.getRuntime().getNil(); + } + @JRubyMethod(meta = true, rest = true) public static IRubyObject from_io(ThreadContext context, IRubyObject cls, IRubyObject args[]) diff --git a/ext/nokogiri/xml_reader.c b/ext/nokogiri/xml_reader.c index 0ee2017552..4f87e18f14 100644 --- a/ext/nokogiri/xml_reader.c +++ b/ext/nokogiri/xml_reader.c @@ -659,6 +659,24 @@ empty_element_p(VALUE self) return Qfalse; } +static VALUE +rb_xml_reader_encoding(VALUE rb_reader) +{ + xmlTextReaderPtr c_reader; + const char *parser_encoding; + VALUE constructor_encoding; + + constructor_encoding = rb_iv_get(rb_reader, "@encoding"); + if (RTEST(constructor_encoding)) { + return constructor_encoding; + } + + Data_Get_Struct(rb_reader, xmlTextReader, c_reader); + parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader); + if (parser_encoding == NULL) { return Qnil; } + return NOKOGIRI_STR_NEW2(parser_encoding); +} + void noko_init_xml_reader() { @@ -683,6 +701,7 @@ noko_init_xml_reader() rb_define_method(cNokogiriXmlReader, "default?", default_eh, 0); rb_define_method(cNokogiriXmlReader, "depth", depth, 0); rb_define_method(cNokogiriXmlReader, "empty_element?", empty_element_p, 0); + rb_define_method(cNokogiriXmlReader, "encoding", rb_xml_reader_encoding, 0); rb_define_method(cNokogiriXmlReader, "inner_xml", inner_xml, 0); rb_define_method(cNokogiriXmlReader, "lang", lang, 0); rb_define_method(cNokogiriXmlReader, "local_name", local_name, 0); diff --git a/lib/nokogiri/xml/reader.rb b/lib/nokogiri/xml/reader.rb index 09c60f0d18..df13216afe 100644 --- a/lib/nokogiri/xml/reader.rb +++ b/lib/nokogiri/xml/reader.rb @@ -71,9 +71,6 @@ class Reader # A list of errors encountered while parsing attr_accessor :errors - # The encoding for the document - attr_reader :encoding - # The XML source attr_reader :source diff --git a/test/xml/test_reader_encoding.rb b/test/xml/test_reader_encoding.rb index 3307e858f3..e9a3e607d9 100644 --- a/test/xml/test_reader_encoding.rb +++ b/test/xml/test_reader_encoding.rb @@ -15,6 +15,32 @@ def setup ) end + def test_libxml2_detects_internal_encoding_correctly + skip_unless_libxml2("This feature wasn't implemented for JRuby") + + reader = Nokogiri::XML::Reader(<<~XML) + + + XML + + assert_nil(reader.encoding) + reader.each do + assert_equal("ISO-8859-1", reader.encoding) + end + end + + def test_libxml2_overrides_internal_encoding_when_specified + reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8") + + + XML + + assert_equal("UTF-8", reader.encoding) + reader.each do + assert_equal("UTF-8", reader.encoding) + end + end + def test_attribute_at @reader.each do |node| next unless (attribute = node.attribute_at(0))