Skip to content

Commit

Permalink
feat(CRuby): XML::Reader#encoding returns detected enc when not set
Browse files Browse the repository at this point in the history
Previously, Reader#encoding only ever returned the encoding passed to the
constructor. Now, if no encoding is passed to the constructor,
Reader#encoding returns the encoding detected by the parser.

Note that this has not been implemented for JRuby.
  • Loading branch information
flavorjones committed Nov 28, 2021
1 parent cc5a7ea commit f9ac835
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -35,6 +35,7 @@ A related discussion about Trust exists at [#2357](https://github.com/sparklemot

### Improved

* [CRuby] XML::Reader#encoding will return the encoding detected by the parser when it's not passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)]
* [CRuby] Handle abruptly-closed HTML comments as WHATWG recommends for browsers. (Thanks to HackerOne user [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting this!)
* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)]
* [CRuby] If a cycle is introduced when reparenting a node (i.e., the node becomes its own ancestor), a `RuntimeError` is raised. libxml2 does no checking for this, which means cycles would otherwise result in infinite loops on subsequent operations. (Note: JRuby/Xerces already does this.) [[#1912](https://github.com/sparklemotion/nokogiri/issues/1912)]
Expand Down
12 changes: 12 additions & 0 deletions ext/java/nokogiri/XmlReader.java
Expand Up @@ -184,6 +184,18 @@ public class XmlReader extends RubyObject
return RubyBoolean.newBoolean(context.getRuntime(), !readerNode.hasChildren);
}

/**
 * Ruby-visible {@code encoding} reader.
 *
 * Returns the value of the {@code @encoding} instance variable when it is
 * not nil; otherwise returns nil, because looking up the parser's detected
 * encoding is not implemented here yet.
 *
 * @param context the current thread context, used to obtain the runtime's nil
 * @return the {@code @encoding} instance variable, or nil when it is nil
 */
@JRubyMethod
public IRubyObject
encoding(ThreadContext context)
{
  IRubyObject explicitEncoding = getInstanceVariable("@encoding");
  if (explicitEncoding.isNil()) {
    // TODO: get the parser's detected encoding
    return context.getRuntime().getNil();
  }
  return explicitEncoding;
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject
from_io(ThreadContext context, IRubyObject cls, IRubyObject args[])
Expand Down
19 changes: 19 additions & 0 deletions ext/nokogiri/xml_reader.c
Expand Up @@ -659,6 +659,24 @@ empty_element_p(VALUE self)
return Qfalse;
}

/*
 * Implementation of the Ruby method Reader#encoding.
 *
 * If the reader's @encoding instance variable is truthy it is returned
 * unchanged. Otherwise the underlying libxml2 text reader is asked for its
 * encoding via xmlTextReaderConstEncoding(); that name is returned as a Ruby
 * string, or nil when libxml2 reports NULL.
 */
static VALUE
rb_xml_reader_encoding(VALUE rb_reader)
{
  xmlTextReaderPtr reader;
  const char *detected;
  VALUE explicit_encoding = rb_iv_get(rb_reader, "@encoding");

  /* A truthy @encoding takes precedence over whatever the parser reports. */
  if (RTEST(explicit_encoding)) {
    return explicit_encoding;
  }

  /* Unwrap the native reader and fall back to the parser's own answer. */
  Data_Get_Struct(rb_reader, xmlTextReader, reader);
  detected = (const char *)xmlTextReaderConstEncoding(reader);

  return detected ? NOKOGIRI_STR_NEW2(detected) : Qnil;
}

void
noko_init_xml_reader()
{
Expand All @@ -683,6 +701,7 @@ noko_init_xml_reader()
rb_define_method(cNokogiriXmlReader, "default?", default_eh, 0);
rb_define_method(cNokogiriXmlReader, "depth", depth, 0);
rb_define_method(cNokogiriXmlReader, "empty_element?", empty_element_p, 0);
rb_define_method(cNokogiriXmlReader, "encoding", rb_xml_reader_encoding, 0);
rb_define_method(cNokogiriXmlReader, "inner_xml", inner_xml, 0);
rb_define_method(cNokogiriXmlReader, "lang", lang, 0);
rb_define_method(cNokogiriXmlReader, "local_name", local_name, 0);
Expand Down
3 changes: 0 additions & 3 deletions lib/nokogiri/xml/reader.rb
Expand Up @@ -71,9 +71,6 @@ class Reader
# A list of errors encountered while parsing
attr_accessor :errors

# The encoding for the document
attr_reader :encoding

# The XML source
attr_reader :source

Expand Down
12 changes: 12 additions & 0 deletions test/xml/test_reader_encoding.rb
Expand Up @@ -13,6 +13,18 @@ def setup
XML_FILE,
"UTF-8"
)
@reader_internal_encoding = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0" encoding="ISO-8859-1"?>
<root attr="foo"><employee /></root>
XML
end

# With no encoding passed to the Reader constructor, #encoding is expected to
# be nil before iteration starts and to report the encoding declared in the
# XML prolog ("ISO-8859-1") on every step of the iteration.
def test_libxml2_detects_encoding_correctly
  skip_unless_libxml2("This feature wasn't implemented for JRuby")

  reader = @reader_internal_encoding

  # Nothing has been read yet, so no encoding is reported.
  assert_nil(reader.encoding)

  reader.each { assert_equal("ISO-8859-1", reader.encoding) }
end

def test_attribute_at
Expand Down

0 comments on commit f9ac835

Please sign in to comment.