Skip to content

Commit

Permalink
Merge pull request #2434 from sparklemotion/752-long-utf16-documents
Browse files Browse the repository at this point in the history
Correctly serialize UTF-16 documents that are longer than libxml2's internal string buffer
  • Loading branch information
flavorjones committed Jan 23, 2022
2 parents ffb8cc6 + 2e260f5 commit 53c8293
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 59 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

## 1.14.0 / unreleased

### Notes

#### Faster, more reliable installation: Native Gem for ARM64 Linux

This version of Nokogiri ships full native gem support for the `aarch64-linux` platform, which should support AWS Graviton and other ARM Linux platforms. Please note that glibc >= 2.29 is required for aarch64-linux systems, see [Supported Platforms](https://nokogiri.org/#supported-platforms) for more information.
Expand All @@ -16,6 +18,11 @@ This version of Nokogiri ships full native gem support for the `aarch64-linux` p
This version of Nokogiri uses [`jar-dependencies`](https://github.com/mkristian/jar-dependencies) to manage most of the vendored Java dependencies. `nokogiri -v` now outputs maven metadata for all Java dependencies, and `Nokogiri::VERSION_INFO` also contains this metadata. [[#2432](https://github.com/sparklemotion/nokogiri/issues/2432)]


### Fixed

* [CRuby] UTF-16-encoded documents longer than ~4000 code points now serialize properly. Previously the serialized document was corrupted when it exceeded the length of libxml2's internal string buffer. [[#752](https://github.com/sparklemotion/nokogiri/issues/752)]


## 1.13.1 / 2022-01-13

### Fixed
Expand Down
60 changes: 32 additions & 28 deletions ext/nokogiri/nokogiri.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ void noko_init_html_sax_push_parser(void);
void noko_init_gumbo(void);
void noko_init_test_global_handlers(void);

static ID id_read, id_write;
static ID id_read, id_write, id_external_encoding;


#ifndef HAVE_VASPRINTF
Expand All @@ -76,76 +76,79 @@ vasprintf(char **strp, const char *fmt, va_list ap)


static VALUE
read_check(VALUE val)
noko_io_read_check(VALUE val)
{
VALUE *args = (VALUE *)val;
return rb_funcall(args[0], id_read, 1, args[1]);
}


static VALUE
read_failed(VALUE arg, VALUE exc)
noko_io_read_failed(VALUE arg, VALUE exc)
{
return Qundef;
}


int
noko_io_read(void *ctx, char *buffer, int len)
noko_io_read(void *io, char *c_buffer, int c_buffer_len)
{
VALUE string, args[2];
size_t str_len, safe_len;
VALUE rb_io = (VALUE)io;
VALUE rb_read_string, rb_args[2];
size_t n_bytes_read, safe_len;

args[0] = (VALUE)ctx;
args[1] = INT2NUM(len);
rb_args[0] = rb_io;
rb_args[1] = INT2NUM(c_buffer_len);

string = rb_rescue(read_check, (VALUE)args, read_failed, 0);
rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);

if (NIL_P(string)) { return 0; }
if (string == Qundef) { return -1; }
if (TYPE(string) != T_STRING) { return -1; }
if (NIL_P(rb_read_string)) { return 0; }
if (rb_read_string == Qundef) { return -1; }
if (TYPE(rb_read_string) != T_STRING) { return -1; }

str_len = (size_t)RSTRING_LEN(string);
safe_len = str_len > (size_t)len ? (size_t)len : str_len;
memcpy(buffer, StringValuePtr(string), safe_len);
n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);

return (int)safe_len;
}


static VALUE
write_check(VALUE val)
noko_io_write_check(VALUE rb_args)
{
VALUE *args = (VALUE *)val;
return rb_funcall(args[0], id_write, 1, args[1]);
VALUE rb_io = ((VALUE *)rb_args)[0];
VALUE rb_output = ((VALUE *)rb_args)[1];
return rb_funcall(rb_io, id_write, 1, rb_output);
}


static VALUE
write_failed(VALUE arg, VALUE exc)
noko_io_write_failed(VALUE arg, VALUE exc)
{
return Qundef;
}


int
noko_io_write(void *ctx, char *buffer, int len)
noko_io_write(void *io, char *c_buffer, int c_buffer_len)
{
VALUE args[2], size;

args[0] = (VALUE)ctx;
args[1] = rb_str_new(buffer, (long)len);
VALUE rb_args[2], rb_n_bytes_written;
VALUE rb_io = (VALUE)io;
rb_encoding *io_encoding = rb_to_encoding(rb_funcall(rb_io, id_external_encoding, 0));

size = rb_rescue(write_check, (VALUE)args, write_failed, 0);
rb_args[0] = rb_io;
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);

if (size == Qundef) { return -1; }
rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
if (rb_n_bytes_written == Qundef) { return -1; }

return NUM2INT(size);
return NUM2INT(rb_n_bytes_written);
}


int
noko_io_close(void *ctx)
noko_io_close(void *io)
{
return 0;
}
Expand Down Expand Up @@ -275,4 +278,5 @@ Init_nokogiri()

id_read = rb_intern("read");
id_write = rb_intern("write");
id_external_encoding = rb_intern("external_encoding");
}
9 changes: 4 additions & 5 deletions lib/nokogiri/xml/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1188,12 +1188,11 @@ def serialize(*args, &block)
}
end

encoding = options[:encoding] || document.encoding
options[:encoding] = encoding
options[:encoding] ||= document.encoding
encoding = Encoding.find(options[:encoding] || "UTF-8")

io = StringIO.new(String.new(encoding: encoding), "wb:#{encoding}:#{encoding}")

outstring = +""
outstring.force_encoding(Encoding.find(encoding || "utf-8"))
io = StringIO.new(outstring)
write_to(io, options, &block)
io.string
end
Expand Down
8 changes: 0 additions & 8 deletions test/xml/test_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -491,14 +491,6 @@ def test_document_parse_method
assert_equal(xml.root.to_s, xml.root.to_s)
end

def test_encoding=
xml.encoding = "UTF-8"
assert_match("UTF-8", xml.to_xml)

xml.encoding = "EUC-JP"
assert_match("EUC-JP", xml.to_xml)
end

def test_namespace_should_not_exist
assert_raises(NoMethodError) do
xml.namespace
Expand Down
71 changes: 53 additions & 18 deletions test/xml/test_document_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,63 @@
module Nokogiri
module XML
class TestDocumentEncoding < Nokogiri::TestCase
def setup
super
@xml = Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML)
end
describe "Nokogiri::XML::Document encoding" do
let(:shift_jis_document) { Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML) }
let(:ascii_document) { Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE) }

def test_url
assert_equal("UTF-8", @xml.url.encoding.name)
end
describe "#encoding" do
it "describes the document's encoding correctly" do
assert_equal("Shift_JIS", shift_jis_document.encoding)
end

def test_encoding
assert_equal("UTF-8", @xml.encoding.encoding.name)
end
it "applies the specified encoding even if on empty documents" do
encoding = "Shift_JIS"
assert_equal(encoding, Nokogiri::XML(nil, nil, encoding).encoding)
end
end

def test_dotted_version
skip_unless_libxml2
assert_equal("UTF-8", Nokogiri::LIBXML_COMPILED_VERSION.encoding.name)
assert_equal("UTF-8", Nokogiri::LIBXSLT_COMPILED_VERSION.encoding.name)
end
describe "#encoding=" do
it "determines the document's encoding when serialized" do
ascii_document.encoding = "UTF-8"
assert_match("encoding=\"UTF-8\"", ascii_document.to_xml)

ascii_document.encoding = "EUC-JP"
assert_match("encoding=\"EUC-JP\"", ascii_document.to_xml)
end
end

it "encodes the URL as UTF-8" do
assert_equal("UTF-8", shift_jis_document.url.encoding.name)
end

it "encodes the encoding name as UTF-8" do
assert_equal("UTF-8", shift_jis_document.encoding.encoding.name)
end

it "encodes the library versions as UTF-8" do
skip_unless_libxml2
assert_equal("UTF-8", Nokogiri::LIBXML_COMPILED_VERSION.encoding.name)
assert_equal("UTF-8", Nokogiri::LIBXSLT_COMPILED_VERSION.encoding.name)
end

it "serializes UTF-16 correctly across libxml2 buffer flushes" do
# https://github.com/sparklemotion/nokogiri/issues/752
skip_unless_libxml2

# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
size = 4000
input = String.new(<<~XML, encoding: "UTF-16")
<?xml version="1.0" encoding="UTF-16"?>
<root>
<bar>#{"A" * size}</bar>
</root>
XML
expected_length = (input.bytesize * 2) + 2 # double character width, add BOM bytes 0xFEFF

def test_empty_doc_encoding
encoding = "US-ASCII"
assert_equal(encoding, Nokogiri::XML(nil, nil, encoding).encoding)
output = Nokogiri::XML(input).to_xml
assert_equal(expected_length, output.bytesize)
end
end
end
end
Expand Down

0 comments on commit 53c8293

Please sign in to comment.