Skip to content

Commit

Permalink
Merge pull request #2377 from sparklemotion/980-reader-encoding-detection
Browse files Browse the repository at this point in the history

feat(CRuby): XML::Reader#encoding returns detected enc when not set
  • Loading branch information
flavorjones committed Dec 10, 2021
2 parents 5cf53bf + 4590844 commit 2ab961e
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -35,6 +35,7 @@ A related discussion about Trust exists at [#2357](https://github.com/sparklemot

### Improved

* [CRuby] `XML::Reader#encoding` will return the encoding detected by the parser when no encoding is passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)]
* [CRuby] Handle abruptly-closed HTML comments as WHATWG recommends for browsers. (Thanks to HackerOne user [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting this!)
* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)]
* [CRuby] If a cycle is introduced when reparenting a node (i.e., the node becomes its own ancestor), a `RuntimeError` is raised. libxml2 does no checking for this, which means cycles would otherwise result in infinite loops on subsequent operations. (Note: JRuby/Xerces already does this.) [[#1912](https://github.com/sparklemotion/nokogiri/issues/1912)]
Expand Down
12 changes: 12 additions & 0 deletions ext/java/nokogiri/XmlReader.java
Expand Up @@ -184,6 +184,18 @@ public class XmlReader extends RubyObject
return RubyBoolean.newBoolean(context.getRuntime(), !readerNode.hasChildren);
}

/**
 * Ruby-visible <code>encoding</code> reader.
 *
 * Returns the value held in the <code>@encoding</code> instance variable when
 * it is not nil; otherwise returns nil, because the encoding detected by the
 * parser is not yet exposed by this implementation (see the TODO below).
 */
@JRubyMethod
public IRubyObject
encoding(ThreadContext context)
{
  IRubyObject explicitEncoding = getInstanceVariable("@encoding");
  if (explicitEncoding.isNil()) {
    // TODO: get the parser's detected encoding
    return context.getRuntime().getNil();
  }
  return explicitEncoding;
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject
from_io(ThreadContext context, IRubyObject cls, IRubyObject args[])
Expand Down
19 changes: 19 additions & 0 deletions ext/nokogiri/xml_reader.c
Expand Up @@ -659,6 +659,24 @@ empty_element_p(VALUE self)
return Qfalse;
}

/*
 * Reader#encoding
 *
 * Returns the value of the @encoding instance variable when it is truthy.
 * Otherwise asks libxml2 (xmlTextReaderConstEncoding) for the reader's
 * encoding and returns it as a Ruby string, or nil when libxml2 hands back
 * NULL.
 */
static VALUE
rb_xml_reader_encoding(VALUE rb_reader)
{
  VALUE rb_explicit_encoding = rb_iv_get(rb_reader, "@encoding");
  xmlTextReaderPtr c_reader;
  const char *c_detected_encoding;

  /* an explicitly stored encoding always takes precedence */
  if (RTEST(rb_explicit_encoding)) {
    return rb_explicit_encoding;
  }

  Data_Get_Struct(rb_reader, xmlTextReader, c_reader);
  c_detected_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);

  return c_detected_encoding ? NOKOGIRI_STR_NEW2(c_detected_encoding) : Qnil;
}

void
noko_init_xml_reader()
{
Expand All @@ -683,6 +701,7 @@ noko_init_xml_reader()
rb_define_method(cNokogiriXmlReader, "default?", default_eh, 0);
rb_define_method(cNokogiriXmlReader, "depth", depth, 0);
rb_define_method(cNokogiriXmlReader, "empty_element?", empty_element_p, 0);
rb_define_method(cNokogiriXmlReader, "encoding", rb_xml_reader_encoding, 0);
rb_define_method(cNokogiriXmlReader, "inner_xml", inner_xml, 0);
rb_define_method(cNokogiriXmlReader, "lang", lang, 0);
rb_define_method(cNokogiriXmlReader, "local_name", local_name, 0);
Expand Down
3 changes: 0 additions & 3 deletions lib/nokogiri/xml/reader.rb
Expand Up @@ -71,9 +71,6 @@ class Reader
# A list of errors encountered while parsing
attr_accessor :errors

# The encoding for the document
attr_reader :encoding

# The XML source
attr_reader :source

Expand Down
26 changes: 26 additions & 0 deletions test/xml/test_reader_encoding.rb
Expand Up @@ -15,6 +15,32 @@ def setup
)
end

# With no encoding given to the constructor, Reader#encoding is nil before
# iteration starts and reports the encoding declared in the XML prolog for
# every node yielded afterwards.
def test_libxml2_detects_internal_encoding_correctly
  skip_unless_libxml2("This feature wasn't implemented for JRuby")

  xml = <<~XML
    <?xml version="1.0" encoding="ISO-8859-1"?>
    <root attr="foo"><employee /></root>
  XML
  reader = Nokogiri::XML::Reader(xml)

  # checked before the reader has been iterated
  assert_nil(reader.encoding)
  reader.each { assert_equal("ISO-8859-1", reader.encoding) }
end

# An encoding passed to the constructor is reported by Reader#encoding both
# before and during iteration, even though the XML prolog declares a
# different one.
def test_libxml2_overrides_internal_encoding_when_specified
  xml = <<~XML
    <?xml version="1.0" encoding="ISO-8859-1"?>
    <root attr="foo"><employee /></root>
  XML
  reader = Nokogiri::XML::Reader(xml, nil, "UTF-8")

  assert_equal("UTF-8", reader.encoding)
  reader.each { assert_equal("UTF-8", reader.encoding) }
end

def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))
Expand Down

0 comments on commit 2ab961e

Please sign in to comment.