Skip to content

Commit

Permalink
Merge pull request #2829 from sparklemotion/flavorjones-2800-xslt-mod…
Browse files Browse the repository at this point in the history
…ifying-doc

better defensive behavior when libxml2 or libxslt will make unsafe modifications to a document
  • Loading branch information
flavorjones committed Mar 10, 2023
2 parents 770a48e + cfbb42f commit 2932489
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 67 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
Expand Up @@ -15,10 +15,12 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

### Changed

* [CRuby] `Schema.from_document` now makes a defensive copy of the document if it has blank text nodes with Ruby objects instantiated for them. This prevents unsafe behavior in libxml2 from causing a segfault. There is a small performance cost, but we think this has the virtue of being "what the user meant" since modifying the original is surprising behavior for most users. Previously this was addressed in v1.10.9 by raising an exception.


### Fixed

* [JRuby] Serializing an HTML4 document with `#write_to` and specifying no save options will properly emit an HTML document anyway, like libxml2 does. Previously JRuby emitted XML in this situation.
* [JRuby] Serializing with `#write_to` will fall back to the document encoding when no encoding is specified, like libxml2 does. Previously JRuby emitted UTF-8 in this situation.
* [CRuby] `XSLT.transform` now makes a defensive copy of the document if it has blank text nodes with Ruby objects instantiated for them _and_ the template uses `xsl:strip-spaces`. This prevents unsafe behavior in libxslt from causing a segfault. There is a small performance cost, but we think this has the virtue of being "what the user meant" since modifying the original is surprising behavior for most users. Previously this would allow unsafe memory access and potentially segfault. [[#2800](https://github.com/sparklemotion/nokogiri/issues/2800)]


### Improved
Expand Down
2 changes: 2 additions & 0 deletions ext/nokogiri/nokogiri.h
Expand Up @@ -51,6 +51,7 @@
#include <libxslt/xsltconfig.h>
#include <libxslt/xsltutils.h>
#include <libxslt/transform.h>
#include <libxslt/imports.h>
#include <libxslt/xsltInternals.h>

#include <libexslt/exslt.h>
Expand Down Expand Up @@ -168,6 +169,7 @@ typedef struct _nokogiriXsltStylesheetTuple {

void noko_xml_document_pin_node(xmlNodePtr);
void noko_xml_document_pin_namespace(xmlNsPtr, xmlDocPtr);
int noko_xml_document_has_wrapped_blank_nodes_p(xmlDocPtr c_document);

int noko_io_read(void *ctx, char *buffer, int len);
int noko_io_write(void *ctx, char *buffer, int len);
Expand Down
27 changes: 27 additions & 0 deletions ext/nokogiri/xml_document.c
Expand Up @@ -685,6 +685,33 @@ noko_xml_document_unwrap(VALUE rb_document)
return c_document;
}

/* Schema creation will remove and deallocate "blank" nodes.
* If those blank nodes have been exposed to Ruby, they could get freed
* out from under the VALUE pointer. This function checks to see if any of
* those nodes have been exposed to Ruby, and if so we should raise an exception.
*/
int
noko_xml_document_has_wrapped_blank_nodes_p(xmlDocPtr c_document)
{
VALUE cache = DOC_NODE_CACHE(c_document);

if (NIL_P(cache)) {
return 0;
}

for (long jnode = 0; jnode < RARRAY_LEN(cache); jnode++) {
xmlNodePtr node;
VALUE element = rb_ary_entry(cache, jnode);

Noko_Node_Get_Struct(element, xmlNode, node);
if (xmlIsBlankNode(node)) {
return 1;
}
}

return 0;
}

void
noko_xml_document_pin_node(xmlNodePtr node)
{
Expand Down
45 changes: 13 additions & 32 deletions ext/nokogiri/xml_schema.c
Expand Up @@ -191,32 +191,6 @@ read_memory(int argc, VALUE *argv, VALUE klass)
return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options);
}

/* Schema creation will remove and deallocate "blank" nodes.
* If those blank nodes have been exposed to Ruby, they could get freed
* out from under the VALUE pointer. This function checks to see if any of
* those nodes have been exposed to Ruby, and if so we should raise an exception.
*/
static int
has_blank_nodes_p(VALUE cache)
{
long i;

if (NIL_P(cache)) {
return 0;
}

for (i = 0; i < RARRAY_LEN(cache); i++) {
xmlNodePtr node;
VALUE element = rb_ary_entry(cache, i);
Noko_Node_Get_Struct(element, xmlNode, node);
if (xmlIsBlankNode(node)) {
return 1;
}
}

return 0;
}

/*
* call-seq:
* from_document(document) → Nokogiri::XML::Schema
Expand All @@ -233,8 +207,10 @@ from_document(int argc, VALUE *argv, VALUE klass)
{
VALUE rb_document;
VALUE rb_parse_options;
VALUE rb_schema;
xmlDocPtr c_document;
xmlSchemaParserCtxtPtr c_parser_context;
int defensive_copy_p = 0;

rb_scan_args(argc, argv, "11", &rb_document, &rb_parse_options);

Expand All @@ -248,16 +224,21 @@ from_document(int argc, VALUE *argv, VALUE klass)
c_document = deprecated_node_type_arg->doc;
}

if (has_blank_nodes_p(DOC_NODE_CACHE(c_document))) {
rb_raise(
rb_eArgError,
"Creating a schema from a document that has blank nodes exposed to Ruby is dangerous"
);
if (noko_xml_document_has_wrapped_blank_nodes_p(c_document)) {
// see https://github.com/sparklemotion/nokogiri/pull/2001
c_document = xmlCopyDoc(c_document, 1);
defensive_copy_p = 1;
}

c_parser_context = xmlSchemaNewDocParserCtxt(c_document);
rb_schema = xml_schema_parse_schema(klass, c_parser_context, rb_parse_options);

return xml_schema_parse_schema(klass, c_parser_context, rb_parse_options);
if (defensive_copy_p) {
xmlFreeDoc(c_document);
c_document = NULL;
}

return rb_schema;
}

void
Expand Down
56 changes: 35 additions & 21 deletions ext/nokogiri/xslt_stylesheet.c
Expand Up @@ -244,58 +244,72 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
static VALUE
rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self)
{
VALUE xmldoc, paramobj, errstr, exception ;
xmlDocPtr xml ;
xmlDocPtr result ;
VALUE rb_document, rb_param, rb_error_str;
xmlDocPtr c_document ;
xmlDocPtr c_result_document ;
nokogiriXsltStylesheetTuple *wrapper;
const char **params ;
long param_len, j ;
int parse_error_occurred ;
int defensive_copy_p = 0;

rb_scan_args(argc, argv, "11", &xmldoc, &paramobj);
if (NIL_P(paramobj)) { paramobj = rb_ary_new2(0L) ; }
if (!rb_obj_is_kind_of(xmldoc, cNokogiriXmlDocument)) {
rb_scan_args(argc, argv, "11", &rb_document, &rb_param);
if (NIL_P(rb_param)) { rb_param = rb_ary_new2(0L) ; }
if (!rb_obj_is_kind_of(rb_document, cNokogiriXmlDocument)) {
rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::Document");
}

/* handle hashes as arguments. */
if (T_HASH == TYPE(paramobj)) {
paramobj = rb_funcall(paramobj, rb_intern("to_a"), 0);
paramobj = rb_funcall(paramobj, rb_intern("flatten"), 0);
if (T_HASH == TYPE(rb_param)) {
rb_param = rb_funcall(rb_param, rb_intern("to_a"), 0);
rb_param = rb_funcall(rb_param, rb_intern("flatten"), 0);
}

Check_Type(paramobj, T_ARRAY);
Check_Type(rb_param, T_ARRAY);

xml = noko_xml_document_unwrap(xmldoc);
c_document = noko_xml_document_unwrap(rb_document);
TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper);

param_len = RARRAY_LEN(paramobj);
param_len = RARRAY_LEN(rb_param);
params = ruby_xcalloc((size_t)param_len + 1, sizeof(char *));
for (j = 0 ; j < param_len ; j++) {
VALUE entry = rb_ary_entry(paramobj, j);
VALUE entry = rb_ary_entry(rb_param, j);
const char *ptr = StringValueCStr(entry);
params[j] = ptr;
}
params[param_len] = 0 ;

errstr = rb_str_new(0, 0);
xsltSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler);
xmlSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler);
xsltTransformContextPtr c_transform_context = xsltNewTransformContext(wrapper->ss, c_document);
if (xsltNeedElemSpaceHandling(c_transform_context) &&
noko_xml_document_has_wrapped_blank_nodes_p(c_document)) {
// see https://github.com/sparklemotion/nokogiri/issues/2800
c_document = xmlCopyDoc(c_document, 1);
defensive_copy_p = 1;
}
xsltFreeTransformContext(c_transform_context);

rb_error_str = rb_str_new(0, 0);
xsltSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler);
xmlSetGenericErrorFunc((void *)rb_error_str, xslt_generic_error_handler);

c_result_document = xsltApplyStylesheet(wrapper->ss, c_document, params);

result = xsltApplyStylesheet(wrapper->ss, xml, params);
ruby_xfree(params);
if (defensive_copy_p) {
xmlFreeDoc(c_document);
c_document = NULL;
}

xsltSetGenericErrorFunc(NULL, NULL);
xmlSetGenericErrorFunc(NULL, NULL);

parse_error_occurred = (Qfalse == rb_funcall(errstr, rb_intern("empty?"), 0));
parse_error_occurred = (Qfalse == rb_funcall(rb_error_str, rb_intern("empty?"), 0));

if (parse_error_occurred) {
exception = rb_exc_new3(rb_eRuntimeError, errstr);
rb_exc_raise(exception);
rb_exc_raise(rb_exc_new3(rb_eRuntimeError, rb_error_str));
}

return noko_xml_document_wrap((VALUE)0, result) ;
return noko_xml_document_wrap((VALUE)0, c_result_document) ;
}

static void
Expand Down
54 changes: 54 additions & 0 deletions test/test_xslt_transforms.rb
Expand Up @@ -366,6 +366,60 @@ def test_non_html_xslt_transform
end
end

describe "https://github.com/sparklemotion/nokogiri/issues/2800" do
let(:doc) do
Nokogiri::XML::Document.parse(<<~XML)
<catalog>
<entry><title>abc</title></entry>
<entry><title> </title></entry>
<entry><title>xyz</title></entry>
</catalog>
XML
end

let(:stylesheet) do
Nokogiri::XSLT.parse(<<~XSL)
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:strip-space elements="title" />
<xsl:template match="/">
<xsl:for-each select="catalog/entry">[<xsl:value-of select="title" />]</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
XSL
end

it "should modify the original doc if no wrapped blank text nodes would be removed" do
# this is the default libxslt behavior
skip_unless_libxml2("libxslt bug is not present in JRuby")

result = stylesheet.transform(doc)

assert_includes(result.to_s, "[abc][][xyz]", "xsl:strip-space should work")
doc.at_css("entry:nth-child(2)").tap do |entry|
assert_equal(1, entry.children.length)
assert_equal("", entry.children.first.content, "original doc should be modified")
end
end

it "should not modify the original doc if wrapped blank text nodes would be removed" do
skip_unless_libxml2("libxslt bug is not present in JRuby")

# wrap the blank text node
assert(child = doc.css("title").children.find(&:blank?))

result = stylesheet.transform(doc)

assert(child.to_s) # raise a valgrind error if the fix isn't working

assert_includes(result.to_s, "[abc][][xyz]", "xsl:strip-space should work")
doc.at_css("entry:nth-child(2)").tap do |entry|
assert_equal(1, entry.children.length)
assert_equal(" ", entry.children.first.content, "original doc is unmodified")
end
end
end

describe "DEFAULT_XSLT parse options" do
it "is the union of DEFAULT_XML and libxslt's XSLT_PARSE_OPTIONS" do
xslt_parse_options = Nokogiri::XML::ParseOptions.new.noent.dtdload.dtdattr.nocdata
Expand Down
25 changes: 13 additions & 12 deletions test/xml/test_schema.rb
Expand Up @@ -10,15 +10,18 @@ def setup
assert(@xsd = Nokogiri::XML::Schema(File.read(PO_SCHEMA_FILE)))
end

def test_issue_1985_segv_on_schema_parse
def test_issue_1985_schema_parse_modifying_underlying_document
skip_unless_libxml2("Pure Java version doesn't have this bug")

# This is a test for a workaround for a bug in LibXML2. The upstream
# bug is here: https://gitlab.gnome.org/GNOME/libxml2/issues/148
# Schema creation can result in dangling pointers. If no nodes have
# been exposed, then it should be fine to create a schema. If nodes
# have been exposed to Ruby, then we need to make sure they won't be
# freed out from under us.
# This is a test for a workaround for a bug in LibXML2:
#
# https://gitlab.gnome.org/GNOME/libxml2/issues/148
#
# Schema creation can modify the original document -- removal of blank text nodes -- which
# results in dangling pointers.
#
# If no nodes have been exposed, then it should be fine to create a schema. If nodes have
# been exposed to Ruby, then we need to make sure they won't be freed out from under us.
doc = <<~EOF
<?xml version="1.0" encoding="UTF-8" ?>
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
Expand All @@ -32,12 +35,10 @@ def test_issue_1985_segv_on_schema_parse

# This is not OK, nodes have been exposed to Ruby
xsd_doc = Nokogiri::XML(doc)
xsd_doc.root.children.find(&:blank?) # Finds a node
child = xsd_doc.root.children.find(&:blank?) # Find a blank node that would be freed without the fix

ex = assert_raises(ArgumentError) do
Nokogiri::XML::Schema.from_document(xsd_doc)
end
assert_match(/blank nodes/, ex.message)
Nokogiri::XML::Schema.from_document(xsd_doc)
assert(child.to_s) # This will raise a valgrind error if the node was freed
end

def test_schema_read_memory
Expand Down

0 comments on commit 2932489

Please sign in to comment.