Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correctly serialize UTF-16 documents that are longer than libxml2's internal string buffer #2434

Merged
merged 3 commits into from
Jan 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

## 1.14.0 / unreleased

### Notes

#### Faster, more reliable installation: Native Gem for ARM64 Linux

This version of Nokogiri ships full native gem support for the `aarch64-linux` platform, which should support AWS Graviton and other ARM Linux platforms. Please note that glibc >= 2.29 is required for aarch64-linux systems, see [Supported Platforms](https://nokogiri.org/#supported-platforms) for more information.
Expand All @@ -16,6 +18,11 @@ This version of Nokogiri ships full native gem support for the `aarch64-linux` p
This version of Nokogiri uses [`jar-dependencies`](https://github.com/mkristian/jar-dependencies) to manage most of the vendored Java dependencies. `nokogiri -v` now outputs maven metadata for all Java dependencies, and `Nokogiri::VERSION_INFO` also contains this metadata. [[#2432](https://github.com/sparklemotion/nokogiri/issues/2432)]


### Fixed

* [CRuby] UTF-16-encoded documents longer than ~4000 code points now serialize properly. Previously the serialized document was corrupted when it exceeded the length of libxml2's internal string buffer. [[#752](https://github.com/sparklemotion/nokogiri/issues/752)]


## 1.13.1 / 2022-01-13

### Fixed
Expand Down
60 changes: 32 additions & 28 deletions ext/nokogiri/nokogiri.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ void noko_init_html_sax_push_parser(void);
void noko_init_gumbo(void);
void noko_init_test_global_handlers(void);

static ID id_read, id_write;
static ID id_read, id_write, id_external_encoding;


#ifndef HAVE_VASPRINTF
Expand All @@ -76,76 +76,79 @@ vasprintf(char **strp, const char *fmt, va_list ap)


static VALUE
read_check(VALUE val)
noko_io_read_check(VALUE val)
{
VALUE *args = (VALUE *)val;
return rb_funcall(args[0], id_read, 1, args[1]);
}


static VALUE
read_failed(VALUE arg, VALUE exc)
noko_io_read_failed(VALUE arg, VALUE exc)
{
return Qundef;
}


int
noko_io_read(void *ctx, char *buffer, int len)
noko_io_read(void *io, char *c_buffer, int c_buffer_len)
{
VALUE string, args[2];
size_t str_len, safe_len;
VALUE rb_io = (VALUE)io;
VALUE rb_read_string, rb_args[2];
size_t n_bytes_read, safe_len;

args[0] = (VALUE)ctx;
args[1] = INT2NUM(len);
rb_args[0] = rb_io;
rb_args[1] = INT2NUM(c_buffer_len);

string = rb_rescue(read_check, (VALUE)args, read_failed, 0);
rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);

if (NIL_P(string)) { return 0; }
if (string == Qundef) { return -1; }
if (TYPE(string) != T_STRING) { return -1; }
if (NIL_P(rb_read_string)) { return 0; }
if (rb_read_string == Qundef) { return -1; }
if (TYPE(rb_read_string) != T_STRING) { return -1; }

str_len = (size_t)RSTRING_LEN(string);
safe_len = str_len > (size_t)len ? (size_t)len : str_len;
memcpy(buffer, StringValuePtr(string), safe_len);
n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);

return (int)safe_len;
}


static VALUE
write_check(VALUE val)
noko_io_write_check(VALUE rb_args)
{
VALUE *args = (VALUE *)val;
return rb_funcall(args[0], id_write, 1, args[1]);
VALUE rb_io = ((VALUE *)rb_args)[0];
VALUE rb_output = ((VALUE *)rb_args)[1];
return rb_funcall(rb_io, id_write, 1, rb_output);
}


static VALUE
write_failed(VALUE arg, VALUE exc)
noko_io_write_failed(VALUE arg, VALUE exc)
{
return Qundef;
}


int
noko_io_write(void *ctx, char *buffer, int len)
noko_io_write(void *io, char *c_buffer, int c_buffer_len)
{
VALUE args[2], size;

args[0] = (VALUE)ctx;
args[1] = rb_str_new(buffer, (long)len);
VALUE rb_args[2], rb_n_bytes_written;
VALUE rb_io = (VALUE)io;
rb_encoding *io_encoding = rb_to_encoding(rb_funcall(rb_io, id_external_encoding, 0));

size = rb_rescue(write_check, (VALUE)args, write_failed, 0);
rb_args[0] = rb_io;
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);

if (size == Qundef) { return -1; }
rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
if (rb_n_bytes_written == Qundef) { return -1; }

return NUM2INT(size);
return NUM2INT(rb_n_bytes_written);
}


int
noko_io_close(void *ctx)
noko_io_close(void *io)
{
return 0;
}
Expand Down Expand Up @@ -275,4 +278,5 @@ Init_nokogiri()

id_read = rb_intern("read");
id_write = rb_intern("write");
id_external_encoding = rb_intern("external_encoding");
}
9 changes: 4 additions & 5 deletions lib/nokogiri/xml/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1188,12 +1188,11 @@ def serialize(*args, &block)
}
end

encoding = options[:encoding] || document.encoding
options[:encoding] = encoding
options[:encoding] ||= document.encoding
encoding = Encoding.find(options[:encoding] || "UTF-8")

io = StringIO.new(String.new(encoding: encoding), "wb:#{encoding}:#{encoding}")

outstring = +""
outstring.force_encoding(Encoding.find(encoding || "utf-8"))
io = StringIO.new(outstring)
write_to(io, options, &block)
io.string
end
Expand Down
8 changes: 0 additions & 8 deletions test/xml/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -491,14 +491,6 @@ def test_document_parse_method
assert_equal(xml.root.to_s, xml.root.to_s)
end

def test_encoding=
xml.encoding = "UTF-8"
assert_match("UTF-8", xml.to_xml)

xml.encoding = "EUC-JP"
assert_match("EUC-JP", xml.to_xml)
end

def test_namespace_should_not_exist
assert_raises(NoMethodError) do
xml.namespace
Expand Down
71 changes: 53 additions & 18 deletions test/xml/test_document_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,63 @@
module Nokogiri
module XML
class TestDocumentEncoding < Nokogiri::TestCase
def setup
super
@xml = Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML)
end
describe "Nokogiri::XML::Document encoding" do
let(:shift_jis_document) { Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML) }
let(:ascii_document) { Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE) }

def test_url
assert_equal("UTF-8", @xml.url.encoding.name)
end
describe "#encoding" do
it "describes the document's encoding correctly" do
assert_equal("Shift_JIS", shift_jis_document.encoding)
end

def test_encoding
assert_equal("UTF-8", @xml.encoding.encoding.name)
end
it "applies the specified encoding even if on empty documents" do
encoding = "Shift_JIS"
assert_equal(encoding, Nokogiri::XML(nil, nil, encoding).encoding)
end
end

def test_dotted_version
skip_unless_libxml2
assert_equal("UTF-8", Nokogiri::LIBXML_COMPILED_VERSION.encoding.name)
assert_equal("UTF-8", Nokogiri::LIBXSLT_COMPILED_VERSION.encoding.name)
end
describe "#encoding=" do
it "determines the document's encoding when serialized" do
ascii_document.encoding = "UTF-8"
assert_match("encoding=\"UTF-8\"", ascii_document.to_xml)

ascii_document.encoding = "EUC-JP"
assert_match("encoding=\"EUC-JP\"", ascii_document.to_xml)
end
end

it "encodes the URL as UTF-8" do
assert_equal("UTF-8", shift_jis_document.url.encoding.name)
end

it "encodes the encoding name as UTF-8" do
assert_equal("UTF-8", shift_jis_document.encoding.encoding.name)
end

it "encodes the library versions as UTF-8" do
skip_unless_libxml2
assert_equal("UTF-8", Nokogiri::LIBXML_COMPILED_VERSION.encoding.name)
assert_equal("UTF-8", Nokogiri::LIBXSLT_COMPILED_VERSION.encoding.name)
end

it "serializes UTF-16 correctly across libxml2 buffer flushes" do
# https://github.com/sparklemotion/nokogiri/issues/752
skip_unless_libxml2

# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
size = 4000
input = String.new(<<~XML, encoding: "UTF-16")
<?xml version="1.0" encoding="UTF-16"?>
<root>
<bar>#{"A" * size}</bar>
</root>
XML
expected_length = (input.bytesize * 2) + 2 # double character width, add BOM bytes 0xFEFF

def test_empty_doc_encoding
encoding = "US-ASCII"
assert_equal(encoding, Nokogiri::XML(nil, nil, encoding).encoding)
output = Nokogiri::XML(input).to_xml
assert_equal(expected_length, output.bytesize)
end
end
end
end
Expand Down