Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(CRuby): XML::Reader#encoding returns detected enc when not set #2377

Merged
merged 1 commit into from Dec 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -35,6 +35,7 @@ A related discussion about Trust exists at [#2357](https://github.com/sparklemot

### Improved

* [CRuby] `XML::Reader#encoding` now returns the encoding detected by the parser when no encoding is passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)]
* [CRuby] Handle abruptly-closed HTML comments as WHATWG recommends for browsers. (Thanks to HackerOne user [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting this!)
* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)]
* [CRuby] If a cycle is introduced when reparenting a node (i.e., the node becomes its own ancestor), a `RuntimeError` is raised. libxml2 does no checking for this, which means cycles would otherwise result in infinite loops on subsequent operations. (Note: JRuby/Xerces already does this.) [[#1912](https://github.com/sparklemotion/nokogiri/issues/1912)]
Expand Down
12 changes: 12 additions & 0 deletions ext/java/nokogiri/XmlReader.java
Expand Up @@ -184,6 +184,18 @@ public class XmlReader extends RubyObject
return RubyBoolean.newBoolean(context.getRuntime(), !readerNode.hasChildren);
}

/**
 * Ruby-visible {@code encoding} method for the reader.
 *
 * Reads the {@code @encoding} instance variable and returns it when it is
 * not nil. Otherwise returns nil: reporting the encoding detected by the
 * parser is not implemented here yet (see the TODO below).
 *
 * @param context the current thread context, used to obtain the runtime's nil
 * @return the value of {@code @encoding} if non-nil, else nil
 */
@JRubyMethod
public IRubyObject
encoding(ThreadContext context)
{
  IRubyObject explicitEncoding = getInstanceVariable("@encoding");
  if (explicitEncoding.isNil()) {
    // TODO: get the parser's detected encoding
    return context.getRuntime().getNil();
  }
  return explicitEncoding;
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject
from_io(ThreadContext context, IRubyObject cls, IRubyObject args[])
Expand Down
19 changes: 19 additions & 0 deletions ext/nokogiri/xml_reader.c
Expand Up @@ -659,6 +659,24 @@ empty_element_p(VALUE self)
return Qfalse;
}

/*
 * Implementation of Reader#encoding.
 *
 * If the reader's @encoding instance variable is truthy, that value is
 * returned as-is. Otherwise the underlying xmlTextReader is asked for its
 * encoding via xmlTextReaderConstEncoding(); a NULL answer becomes nil,
 * anything else is wrapped in a new Ruby string.
 */
static VALUE
rb_xml_reader_encoding(VALUE rb_reader)
{
  xmlTextReaderPtr c_reader;
  const char *detected;
  VALUE rb_explicit = rb_iv_get(rb_reader, "@encoding");

  /* A truthy @encoding always takes precedence over the parser's answer. */
  if (RTEST(rb_explicit)) {
    return rb_explicit;
  }

  Data_Get_Struct(rb_reader, xmlTextReader, c_reader);
  detected = (const char *)xmlTextReaderConstEncoding(c_reader);

  return (detected != NULL) ? NOKOGIRI_STR_NEW2(detected) : Qnil;
}

void
noko_init_xml_reader()
{
Expand All @@ -683,6 +701,7 @@ noko_init_xml_reader()
rb_define_method(cNokogiriXmlReader, "default?", default_eh, 0);
rb_define_method(cNokogiriXmlReader, "depth", depth, 0);
rb_define_method(cNokogiriXmlReader, "empty_element?", empty_element_p, 0);
rb_define_method(cNokogiriXmlReader, "encoding", rb_xml_reader_encoding, 0);
rb_define_method(cNokogiriXmlReader, "inner_xml", inner_xml, 0);
rb_define_method(cNokogiriXmlReader, "lang", lang, 0);
rb_define_method(cNokogiriXmlReader, "local_name", local_name, 0);
Expand Down
3 changes: 0 additions & 3 deletions lib/nokogiri/xml/reader.rb
Expand Up @@ -71,9 +71,6 @@ class Reader
# A list of errors encountered while parsing
attr_accessor :errors

# The encoding for the document
attr_reader :encoding

# The XML source
attr_reader :source

Expand Down
26 changes: 26 additions & 0 deletions test/xml/test_reader_encoding.rb
Expand Up @@ -15,6 +15,32 @@ def setup
)
end

# With no encoding passed to the Reader, #encoding is nil before any node is
# read and reports the encoding from the XML declaration while iterating.
def test_libxml2_detects_internal_encoding_correctly
  skip_unless_libxml2("This feature wasn't implemented for JRuby")

  xml = <<~XML
    <?xml version="1.0" encoding="ISO-8859-1"?>
    <root attr="foo"><employee /></root>
  XML
  reader = Nokogiri::XML::Reader(xml)

  assert_nil(reader.encoding)
  reader.each { assert_equal("ISO-8859-1", reader.encoding) }
end

# An encoding passed to the Reader is reported by #encoding both before and
# during iteration, even though the XML declaration names a different one.
def test_libxml2_overrides_internal_encoding_when_specified
  xml = <<~XML
    <?xml version="1.0" encoding="ISO-8859-1"?>
    <root attr="foo"><employee /></root>
  XML
  reader = Nokogiri::XML::Reader(xml, nil, "UTF-8")

  assert_equal("UTF-8", reader.encoding)
  reader.each { assert_equal("UTF-8", reader.encoding) }
end

def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))
Expand Down