Merge pull request #2186 from sparklemotion/1952-node-set-segfault

Allow NodeSet to contain Nodes from multiple Documents (fix for #1952) --- **What problem is this PR intended to solve?** fix: allow NodeSets to contain Nodes from multiple Documents Specifically, this means we've added a mark callback for the NodeSet which marks each of the contained objects. Previously we skipped explicitly marking the contained objects due to the assumption that all the nodes would be from the same document as the NodeSet itself. Fixes #1952. **Have you included adequate test coverage?** Yes, additional tests created. **Does this change affect the behavior of either the C or the Java implementations?** This change allows the C implementation to do what the Java implementation has always allowed.
sparklemotion · Feb 5, 2021 · d5c053a · d5c053a
2 parents 8d738a2 + 26872c9
commit d5c053a
Show file tree

Hide file tree

Showing 8 changed files with 921 additions and 786 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
 
 ### Fixed
 
+* [CRuby] `NodeSet` may now safely contain `Node` objects from multiple documents. Previously the GC lifecycle of the parent `Document` objects could lead to contained nodes being GCed while still in scope. [[#1952](https://github.com/sparklemotion/nokogiri/issues/1952)]
 * [CRuby] `{XML,HTML}::Document.parse` now invokes `#initialize` exactly once. Previously `#initialize` was invoked twice on each object.
 * [JRuby] `{XML,HTML}::Document.parse` now invokes `#initialize` exactly once. Previously `#initialize` was not called, which was a problem for subclassing such as done by `Loofah`.
 

diff --git a/ext/nokogiri/xml_namespace.c b/ext/nokogiri/xml_namespace.c
@@ -1,5 +1,27 @@
 #include <xml_namespace.h>
 
+/*
+ *  The lifecycle of a Namespace node is more complicated than other Nodes, for two reasons:
+ *
+ *  1. the underlying C structure has a different layout than all the other node structs, with the
+ *     `_private` member where we store a pointer to Ruby object data not being in first position.
+ *  2. xmlNs structures returned in an xmlNodeset from an XPath query are copies of the document's
+ *     namespaces, and so do not share the same memory lifecycle as everything else in a document.
+ *
+ *  As a result of 1, you may see special handling of XML_NAMESPACE_DECL node types throughout the
+ *  Nokogiri C code, though I intend to wrap up that logic in ruby_object_{get,set} functions
+ *  shortly.
+ *
+ *  As a result of 2, you will see we have special handling in this file and in xml_node_set.c to
+ *  carefully manage the memory lifecycle of xmlNs structs to match the Ruby object's GC
+ *  lifecycle. In xml_node_set.c we have local versions of xmlXPathNodeSetDel() and
+ *  xmlXPathFreeNodeSet() that avoid freeing xmlNs structs in the node set. In this file, we decide
+ *  whether or not to call dealloc_namespace() depending on whether the xmlNs struct appears to be
+ *  in an xmlNodeSet (and thus the result of an XPath query) or not.
+ *
+ *  Yes, this is madness.
+ */
+
 VALUE cNokogiriXmlNamespace ;
 
 static void dealloc_namespace(xmlNsPtr ns)

diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c
@@ -1341,9 +1341,9 @@ static VALUE line(VALUE self)
 static VALUE set_line(VALUE self, VALUE num)
 {
   xmlNodePtr node;
-  Data_Get_Struct(self, xmlNode, node);
-
   int value = NUM2INT(num);
+
+  Data_Get_Struct(self, xmlNode, node);
   if (value < 65535) {
     node->line = value;
   }

diff --git a/ext/nokogiri/xml_node_set.c b/ext/nokogiri/xml_node_set.c
@@ -3,7 +3,6 @@
 #include <libxml/xpathInternals.h>
 
 static ID decorate ;
-static void xpath_node_set_del(xmlNodeSetPtr cur, xmlNodePtr val);
 
 
 static void Check_Node_Set_Node_Type(VALUE node)
@@ -15,22 +14,75 @@ static void Check_Node_Set_Node_Type(VALUE node)
 }
 
 
+static
+VALUE
+ruby_object_get(xmlNodePtr c_node)
+{
+  /* see xmlElementType in libxml2 tree.h */
+  switch (c_node->type) {
+    case XML_NAMESPACE_DECL:
+      /* _private is later in the namespace struct */
+      return (VALUE)(((xmlNsPtr)c_node)->_private);
+
+    case XML_DOCUMENT_NODE:
+    case XML_HTML_DOCUMENT_NODE:
+      /* in documents we use _private to store a tuple */
+      if (DOC_RUBY_OBJECT_TEST(((xmlDocPtr)c_node))) {
+        return DOC_RUBY_OBJECT((xmlDocPtr)c_node);
+      }
+      return (VALUE)NULL;
+
+    default:
+      return (VALUE)(c_node->_private);
+  }
+}
+
+
+static void mark(xmlNodeSetPtr node_set)
+{
+  VALUE rb_node;
+  int jnode;
+
+  for (jnode = 0; jnode < node_set->nodeNr; jnode++) {
+    rb_node = ruby_object_get(node_set->nodeTab[jnode]);
+    if (rb_node) {
+      rb_gc_mark(rb_node);
+    }
+  }
+}
+
+static void xpath_node_set_del(xmlNodeSetPtr cur, xmlNodePtr val)
+{
+  /*
+   * For reasons outlined in xml_namespace.c, here we reproduce xmlXPathNodeSetDel() except for the
+   * offending call to xmlXPathNodeSetFreeNs().
+   */
+  int i;
+
+  if (cur == NULL) return;
+  if (val == NULL) return;
+
+  /*
+   * find node in nodeTab
+   */
+  for (i = 0;i < cur->nodeNr;i++)
+    if (cur->nodeTab[i] == val) break;
+
+  if (i >= cur->nodeNr) {	/* not found */
+    return;
+  }
+  cur->nodeNr--;
+  for (;i < cur->nodeNr;i++)
+    cur->nodeTab[i] = cur->nodeTab[i + 1];
+  cur->nodeTab[cur->nodeNr] = NULL;
+}
+
+
 static void deallocate(xmlNodeSetPtr node_set)
 {
   /*
-   *
-   *  since xpath queries return copies of the xmlNs structs,
-   *  xmlXPathFreeNodeSet() frees those xmlNs structs that are in the
-   *  NodeSet.
-   *
-   *  this is bad if someone is still trying to use the Namespace object wrapped
-   *  around the xmlNs, so we need to avoid that.
-   *
-   *  here we reproduce xmlXPathFreeNodeSet() without the xmlNs logic.
-   *
-   *  this doesn't cause a leak because Namespace objects that are in an XPath
-   *  query NodeSet are given their own lifecycle in
-   *  Nokogiri_wrap_xml_namespace().
+   * For reasons outlined in xml_namespace.c, here we reproduce xmlXPathFreeNodeSet() except for the
+   * offending call to xmlXPathNodeSetFreeNs().
    */
   NOKOGIRI_DEBUG_START(node_set) ;
   if (node_set->nodeTab != NULL)
@@ -40,6 +92,7 @@ static void deallocate(xmlNodeSetPtr node_set)
   NOKOGIRI_DEBUG_END(node_set) ;
 }
 
+
 static VALUE allocate(VALUE klass)
 {
   return Nokogiri_wrap_xml_node_set(xmlXPathNodeSetCreate(NULL), Qnil);
@@ -176,21 +229,22 @@ static VALUE include_eh(VALUE self, VALUE rb_node)
  * Returns a new set built by merging the set and the elements of the given
  * set.
  */
-static VALUE set_union(VALUE self, VALUE rb_other)
+static VALUE rb_xml_node_set_union(VALUE rb_node_set, VALUE rb_other)
 {
-  xmlNodeSetPtr node_set, other;
-  xmlNodeSetPtr new;
+  xmlNodeSetPtr c_node_set, c_other;
+  xmlNodeSetPtr c_new_node_set;
 
-  if(!rb_obj_is_kind_of(rb_other, cNokogiriXmlNodeSet))
+  if (!rb_obj_is_kind_of(rb_other, cNokogiriXmlNodeSet)) {
     rb_raise(rb_eArgError, "node_set must be a Nokogiri::XML::NodeSet");
+  }
 
-  Data_Get_Struct(self, xmlNodeSet, node_set);
-  Data_Get_Struct(rb_other, xmlNodeSet, other);
+  Data_Get_Struct(rb_node_set, xmlNodeSet, c_node_set);
+  Data_Get_Struct(rb_other, xmlNodeSet, c_other);
 
-  new = xmlXPathNodeSetMerge(NULL, node_set);
-  new = xmlXPathNodeSetMerge(new, other);
+  c_new_node_set = xmlXPathNodeSetMerge(NULL, c_node_set);
+  c_new_node_set = xmlXPathNodeSetMerge(c_new_node_set, c_other);
 
-  return Nokogiri_wrap_xml_node_set(new, rb_iv_get(self, "@document"));
+  return Nokogiri_wrap_xml_node_set(c_new_node_set, rb_iv_get(rb_node_set, "@document"));
 }
 
 /*
@@ -364,57 +418,28 @@ static VALUE unlink_nodeset(VALUE self)
 }
 
 
-static void reify_node_set_namespaces(VALUE self)
+VALUE Nokogiri_wrap_xml_node_set(xmlNodeSetPtr c_node_set, VALUE document)
 {
-  /*
-   *  as mentioned in deallocate() above, xmlNs structs returned in an XPath
-   *  NodeSet are duplicates, and we don't clean them up at deallocate() time.
-   *
-   *  as a result, we need to make sure the Ruby manages this memory. we do this
-   *  by forcing the creation of a Ruby object wrapped around the xmlNs.
-   *
-   *  we also have to make sure that the NodeSet has a reference to the
-   *  Namespace object, otherwise GC will kick in and the Namespace won't be
-   *  marked.
-   *
-   *  we *could* do this safely with *all* the nodes in the NodeSet, but we only
-   *  *need* to do it for xmlNs structs, and so you get the code we have here.
-   */
-  int j ;
-  xmlNodeSetPtr node_set ;
-  VALUE namespace_cache ;
+  int j;
+  VALUE rb_node_set ;
 
-  Data_Get_Struct(self, xmlNodeSet, node_set);
-
-  namespace_cache = rb_iv_get(self, "@namespace_cache");
-
-  for (j = 0 ; j < node_set->nodeNr ; j++) {
-    if (NOKOGIRI_NAMESPACE_EH(node_set->nodeTab[j])) {
-      rb_ary_push(namespace_cache, Nokogiri_wrap_xml_node_set_node(node_set->nodeTab[j], self));
-    }
+  if (c_node_set == NULL) {
+    c_node_set = xmlXPathNodeSetCreate(NULL);
   }
-}
 
-
-VALUE Nokogiri_wrap_xml_node_set(xmlNodeSetPtr node_set, VALUE document)
-{
-  VALUE new_set ;
-
-  if (node_set == NULL) {
-    node_set = xmlXPathNodeSetCreate(NULL);
-  }
-
-  new_set = Data_Wrap_Struct(cNokogiriXmlNodeSet, 0, deallocate, node_set);
+  rb_node_set = Data_Wrap_Struct(cNokogiriXmlNodeSet, mark, deallocate, c_node_set);
 
   if (!NIL_P(document)) {
-    rb_iv_set(new_set, "@document", document);
-    rb_funcall(document, decorate, 1, new_set);
+    rb_iv_set(rb_node_set, "@document", document);
+    rb_funcall(document, decorate, 1, rb_node_set);
   }
 
-  rb_iv_set(new_set, "@namespace_cache", rb_ary_new());
-  reify_node_set_namespaces(new_set);
+  /* make sure we create ruby objects for all the results, so they'll be marked during the GC mark phase */
+  for (j = 0 ; j < c_node_set->nodeNr ; j++) {
+    Nokogiri_wrap_xml_node_set_node(c_node_set->nodeTab[j], rb_node_set);
+  }
 
-  return new_set ;
+  return rb_node_set ;
 }
 
 VALUE Nokogiri_wrap_xml_node_set_node(xmlNodePtr node, VALUE node_set)
@@ -430,36 +455,6 @@ VALUE Nokogiri_wrap_xml_node_set_node(xmlNodePtr node, VALUE node_set)
 }
 
 
-static void xpath_node_set_del(xmlNodeSetPtr cur, xmlNodePtr val)
-{
-  /*
-   * as mentioned a few times above, we do not want to free xmlNs structs
-   * outside of the Namespace lifecycle.
-   *
-   * xmlXPathNodeSetDel() frees xmlNs structs, and so here we reproduce that
-   * function with the xmlNs logic.
-   */
-  int i;
-
-  if (cur == NULL) return;
-  if (val == NULL) return;
-
-  /*
-   * find node in nodeTab
-   */
-  for (i = 0;i < cur->nodeNr;i++)
-    if (cur->nodeTab[i] == val) break;
-
-  if (i >= cur->nodeNr) {	/* not found */
-    return;
-  }
-  cur->nodeNr--;
-  for (;i < cur->nodeNr;i++)
-    cur->nodeTab[i] = cur->nodeTab[i + 1];
-  cur->nodeTab[cur->nodeNr] = NULL;
-}
-
-
 VALUE cNokogiriXmlNodeSet ;
 void init_xml_node_set(void)
 {
@@ -473,7 +468,7 @@ void init_xml_node_set(void)
   rb_define_method(klass, "[]", slice, -1);
   rb_define_method(klass, "slice", slice, -1);
   rb_define_method(klass, "push", push, 1);
-  rb_define_method(klass, "|", set_union, 1);
+  rb_define_method(klass, "|", rb_xml_node_set_union, 1);
   rb_define_method(klass, "-", minus, 1);
   rb_define_method(klass, "unlink", unlink_nodeset, 0);
   rb_define_method(klass, "to_a", to_array, 0);

diff --git a/rakelib/test.rake b/rakelib/test.rake
@@ -57,6 +57,13 @@ class ValgrindTestTask < Rake::TestTask
   end
 end
 
+class GdbTestTask < ValgrindTestTask
+  def ruby(*args, **options, &block)
+    command = "gdb --args #{RUBY} #{args.join(' ')}"
+    sh(command, **options, &block)
+  end
+end
+
 def nokogiri_test_task_configuration(t)
   t.libs << "test"
   t.test_files = FileList["test/**/test_*.rb"]
@@ -71,4 +78,8 @@ namespace "test" do
   ValgrindTestTask.new("valgrind") do |t|
     nokogiri_test_task_configuration(t)
   end
+
+  GdbTestTask.new("gdb") do |t|
+    nokogiri_test_task_configuration(t)
+  end
 end
diff --git a/scripts/files-modified-by-open-prs b/scripts/files-modified-by-open-prs
@@ -35,15 +35,16 @@ prs = get("gh pr list").split("\n")
 paths_to_prs = Hash.new { |h,k| h[k] = Set.new }
 
 prs.each do |pr|
-  pr_number = pr.split(/\s+/).first
+  pr_number, *pr_desc = pr.split(/\s+/)
+  pr_desc = pr_desc.join(" ")
 
   diff = get("gh pr diff #{pr_number}")
   diff_data = GitDiff::from_string(diff)
 
   relevant_files = diff_data.files.map(&:b_path)
 
   if relevant_files.length > 0
-    puts "PR #{pr_number}:"
+    puts "PR #{pr_number}: #{pr_desc}"
     relevant_files.sort.each do |file|
       puts "- #{file}"
     end