From cb1557d08cca2a214d1c4e84ed4dc8bad72a71ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Barri=C3=A9?= Date: Fri, 3 Mar 2023 11:50:21 +0100 Subject: [PATCH 1/4] Use TypedData for Nokogiri::XML::Document Co-authored-by: Jean Boussier --- ext/nokogiri/html4_document.c | 2 +- ext/nokogiri/nokogiri.h | 2 ++ ext/nokogiri/xml_comment.c | 2 +- ext/nokogiri/xml_document.c | 37 +++++++++++++++-------- ext/nokogiri/xml_document_fragment.c | 2 +- ext/nokogiri/xml_entity_reference.c | 2 +- ext/nokogiri/xml_node.c | 2 +- ext/nokogiri/xml_processing_instruction.c | 2 +- ext/nokogiri/xml_relax_ng.c | 4 +-- ext/nokogiri/xslt_stylesheet.c | 11 +++++-- 10 files changed, 42 insertions(+), 24 deletions(-) diff --git a/ext/nokogiri/html4_document.c b/ext/nokogiri/html4_document.c index 939321e3bd..7b816d5ff0 100644 --- a/ext/nokogiri/html4_document.c +++ b/ext/nokogiri/html4_document.c @@ -145,7 +145,7 @@ static VALUE rb_html_document_type(VALUE self) { htmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); return INT2NUM(doc->type); } diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index 4c6bea855e..bc7f515f99 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -166,6 +166,8 @@ typedef struct _nokogiriXsltStylesheetTuple { VALUE func_instances; } nokogiriXsltStylesheetTuple; +extern const rb_data_type_t noko_xml_document_data_type; + void noko_xml_document_pin_node(xmlNodePtr); void noko_xml_document_pin_namespace(xmlNsPtr, xmlDocPtr); diff --git a/ext/nokogiri/xml_comment.c b/ext/nokogiri/xml_comment.c index 81b57ca472..35db13d098 100644 --- a/ext/nokogiri/xml_comment.c +++ b/ext/nokogiri/xml_comment.c @@ -30,7 +30,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_raise(rb_eArgError, "first argument must be a XML::Document or XML::Node"); } - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewDocComment( xml_doc, diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index a55daa1b55..e987446504 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -51,8 +51,9 @@ remove_private(xmlNodePtr node) } static void -mark(xmlDocPtr doc) +mark(void *data) { + xmlDocPtr doc = (xmlDocPtr)data; nokogiriTuplePtr tuple = (nokogiriTuplePtr)doc->_private; if (tuple) { rb_gc_mark(tuple->doc); @@ -61,8 +62,9 @@ mark(xmlDocPtr doc) } static void -dealloc(xmlDocPtr doc) +dealloc(void *data) { + xmlDocPtr doc = (xmlDocPtr)data; st_table *node_hash; node_hash = DOC_UNLINKED_NODE_HASH(doc); @@ -84,6 +86,15 @@ dealloc(xmlDocPtr doc) xmlFreeDoc(doc); } +const rb_data_type_t noko_xml_document_data_type = { + .wrap_struct_name = "Nokogiri::XML::Document", + .function = { + .dmark = mark, + .dfree = dealloc, + }, + .flags = RUBY_TYPED_FREE_IMMEDIATELY +}; + static void recursively_remove_namespaces_from_node(xmlNodePtr node) { @@ -127,7 +138,7 @@ static VALUE url(VALUE self) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (doc->URL) { return NOKOGIRI_STR_NEW2(doc->URL); } @@ -146,7 +157,7 @@ rb_xml_document_root_set(VALUE self, VALUE rb_new_root) xmlDocPtr c_document; xmlNodePtr c_new_root = NULL, c_current_root; - Data_Get_Struct(self, xmlDoc, c_document); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_document); c_current_root = xmlDocGetRootElement(c_document); if (c_current_root) { @@ -190,7 +201,7 @@ rb_xml_document_root(VALUE self) xmlDocPtr c_document; xmlNodePtr c_root; - Data_Get_Struct(self, xmlDoc, c_document); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_document); c_root = xmlDocGetRootElement(c_document); if (!c_root) { @@ -210,7 +221,7 @@ static VALUE set_encoding(VALUE self, VALUE encoding) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (doc->encoding) { xmlFree(DISCARD_CONST_QUAL_XMLCHAR(doc->encoding)); @@ -231,7 +242,7 @@ static VALUE encoding(VALUE self) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (!doc->encoding) { return Qnil; } return NOKOGIRI_STR_NEW2(doc->encoding); @@ -247,7 +258,7 @@ static VALUE version(VALUE self) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (!doc->version) { return Qnil; } return NOKOGIRI_STR_NEW2(doc->version); @@ -369,7 +380,7 @@ duplicate_document(int argc, VALUE *argv, VALUE self) level = INT2NUM((long)1); } - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); dup = xmlCopyDoc(doc, (int)NUM2INT(level)); @@ -443,7 +454,7 @@ static VALUE remove_namespaces_bang(VALUE self) { xmlDocPtr doc ; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); recursively_remove_namespaces_from_node((xmlNodePtr)doc); return self; @@ -471,7 +482,7 @@ create_entity(int argc, VALUE *argv, VALUE self) xmlEntityPtr ptr; xmlDocPtr doc ; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id, &content); @@ -559,7 +570,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self) } } - Data_Get_Struct(self, xmlDoc, c_doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_doc); rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO")); rb_io = rb_class_new_instance(0, 0, rb_cStringIO); @@ -607,7 +618,7 @@ noko_xml_document_wrap_with_init_args(VALUE klass, xmlDocPtr c_document, int arg klass = cNokogiriXmlDocument; } - rb_document = Data_Wrap_Struct(klass, mark, dealloc, c_document); + rb_document = TypedData_Wrap_Struct(klass, &noko_xml_document_data_type, c_document); tuple = (nokogiriTuplePtr)ruby_xmalloc(sizeof(nokogiriTuple)); tuple->doc = rb_document; diff --git a/ext/nokogiri/xml_document_fragment.c b/ext/nokogiri/xml_document_fragment.c index 1a82eed1ec..bd534c3df1 100644 --- a/ext/nokogiri/xml_document_fragment.c +++ b/ext/nokogiri/xml_document_fragment.c @@ -19,7 +19,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_scan_args(argc, argv, "1*", &document, &rest); - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewDocFragment(xml_doc->doc); diff --git a/ext/nokogiri/xml_entity_reference.c b/ext/nokogiri/xml_entity_reference.c index d8f12438c5..272ea9ffec 100644 --- a/ext/nokogiri/xml_entity_reference.c +++ b/ext/nokogiri/xml_entity_reference.c @@ -20,7 +20,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_scan_args(argc, argv, "2*", &document, &name, &rest); - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewReference( xml_doc, diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index 7ddc639cbe..204df446fe 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -984,7 +984,7 @@ duplicate_node(int argc, VALUE *argv, VALUE self) if (n_args < 2) { new_parent_doc = node->doc; } else { - Data_Get_Struct(r_new_parent_doc, xmlDoc, new_parent_doc); + TypedData_Get_Struct(r_new_parent_doc, xmlDoc, &noko_xml_document_data_type, new_parent_doc); } dup = xmlDocCopyNode(node, new_parent_doc, level); diff --git a/ext/nokogiri/xml_processing_instruction.c b/ext/nokogiri/xml_processing_instruction.c index f4a6dc2eb2..76358cc0cc 100644 --- a/ext/nokogiri/xml_processing_instruction.c +++ b/ext/nokogiri/xml_processing_instruction.c @@ -22,7 +22,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_scan_args(argc, argv, "3*", &document, &name, &content, &rest); - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewDocPI( xml_doc, diff --git a/ext/nokogiri/xml_relax_ng.c b/ext/nokogiri/xml_relax_ng.c index 943b0a7cd5..4533abd883 100644 --- a/ext/nokogiri/xml_relax_ng.c +++ b/ext/nokogiri/xml_relax_ng.c @@ -23,7 +23,7 @@ validate_document(VALUE self, VALUE document) xmlRelaxNGValidCtxtPtr valid_ctxt; Data_Get_Struct(self, xmlRelaxNG, schema); - Data_Get_Struct(document, xmlDoc, doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, doc); errors = rb_ary_new(); @@ -127,7 +127,7 @@ from_document(int argc, VALUE *argv, VALUE klass) scanned_args = rb_scan_args(argc, argv, "11", &document, &parse_options); - Data_Get_Struct(document, xmlDoc, doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, doc); doc = doc->doc; /* In case someone passes us a node. ugh. */ if (scanned_args == 1) { diff --git a/ext/nokogiri/xslt_stylesheet.c b/ext/nokogiri/xslt_stylesheet.c index 700c754ee3..720adda863 100644 --- a/ext/nokogiri/xslt_stylesheet.c +++ b/ext/nokogiri/xslt_stylesheet.c @@ -79,7 +79,7 @@ parse_stylesheet_doc(VALUE klass, VALUE xmldocobj) xmlDocPtr xml, xml_cpy; VALUE errstr, exception; xsltStylesheetPtr ss ; - Data_Get_Struct(xmldocobj, xmlDoc, xml); + TypedData_Get_Struct(xmldocobj, xmlDoc, &noko_xml_document_data_type, xml); errstr = rb_str_new(0, 0); xsltSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler); @@ -114,7 +114,12 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj) int doc_len ; VALUE rval ; - Data_Get_Struct(xmlobj, xmlDoc, xml); + TypedData_Get_Struct( + xmlobj, + xmlDoc, + &noko_xml_document_data_type, + xml + ); TypedData_Get_Struct( self, nokogiriXsltStylesheetTuple, @@ -265,7 +270,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self) Check_Type(paramobj, T_ARRAY); - Data_Get_Struct(xmldoc, xmlDoc, xml); + TypedData_Get_Struct(xmldoc, xmlDoc, &noko_xml_document_data_type, xml); TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper); param_len = RARRAY_LEN(paramobj); From 9208336a3649c2fec3a8532b9dc5719cf9f2a15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Barri=C3=A9?= Date: Fri, 3 Mar 2023 12:32:13 +0100 Subject: [PATCH 2/4] Define a memsize for noko_xml_document_data_type Co-authored-by: Jean Boussier --- ext/nokogiri/xml_document.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index e987446504..4645f74c31 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -86,11 +86,34 @@ dealloc(void *data) xmlFreeDoc(doc); } +static size_t +memsize_node(const xmlNodePtr node) +{ + xmlNodePtr child; + size_t memsize = 0; + for (child = node->children; child; child = child->next) { + /* This should count the properties too. */ + memsize += sizeof(xmlNode) + memsize_node(child); + } + return memsize; +} + +static size_t +memsize(const void *data) +{ + xmlDocPtr doc = (const xmlDocPtr)data; + size_t memsize = sizeof(xmlDoc); + /* This may not account for all memory use */ + memsize += memsize_node((xmlNodePtr)doc); + return memsize; +} + const rb_data_type_t noko_xml_document_data_type = { .wrap_struct_name = "Nokogiri::XML::Document", .function = { .dmark = mark, .dfree = dealloc, + .dsize = memsize, }, .flags = RUBY_TYPED_FREE_IMMEDIATELY }; From 080438028f45679d958cf672ba9cad35e951af01 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 3 Mar 2023 16:25:17 -0500 Subject: [PATCH 3/4] ext: don't use FREE_IMMEDIATELY with XML::Document This is a tricky one to explain, but in summary, when dealloc_node_i2 parents unparented nodes, libxml2 may merge two adjacent text nodes, which causes additional memory allocations during GC. If the FREE_IMMEDIATELY flag is set, this will generate warnings. But generating those warnings during GC will lead to a segfault for reasons I haven't dug into yet. Anyway, let's leave this flag off for now. --- ext/nokogiri/xml_document.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index 4645f74c31..4b26cd4b6e 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -109,13 +109,12 @@ memsize(const void *data) } const rb_data_type_t noko_xml_document_data_type = { - .wrap_struct_name = "Nokogiri::XML::Document", - .function = { - .dmark = mark, - .dfree = dealloc, - .dsize = memsize, - }, - .flags = RUBY_TYPED_FREE_IMMEDIATELY + .wrap_struct_name = "Nokogiri::XML::Document", + .function = { + .dmark = mark, + .dfree = dealloc, + .dsize = memsize, + } }; static void From 6b234617189b0295649e084f50a2fe257b8dd624 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 3 Mar 2023 17:02:37 -0500 Subject: [PATCH 4/4] ext: improve fidelity of XML::Document memsize and include test coverage for it --- ext/nokogiri/xml_document.c | 10 ++++++++- test/test_memory_leak.rb | 41 +++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index 4b26cd4b6e..8dff604bec 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -89,10 +89,18 @@ dealloc(void *data) static size_t memsize_node(const xmlNodePtr node) { + /* note we don't count namespace definitions, just going for a good-enough number here */ xmlNodePtr child; size_t memsize = 0; + + memsize += xmlStrlen(node->name); + for (child = (xmlNodePtr)node->properties; child; child = child->next) { + memsize += sizeof(xmlAttr) + memsize_node(child); + } + if (node->type == XML_TEXT_NODE) { + memsize += xmlStrlen(node->content); + } for (child = node->children; child; child = child->next) { - /* This should count the properties too. */ memsize += sizeof(xmlNode) + memsize_node(child); } return memsize; diff --git a/test/test_memory_leak.rb b/test/test_memory_leak.rb index 808040d25a..3b8dc32d1e 100644 --- a/test/test_memory_leak.rb +++ b/test/test_memory_leak.rb @@ -272,6 +272,47 @@ def test_leaking_dtd_nodes_after_internal_subset_removal end end # if NOKOGIRI_GC + def test_object_space_memsize_of + require "objspace" + skip("memsize_of not defined") unless ObjectSpace.respond_to?(:memsize_of) + + base_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + + XML + + more_children_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + asdf + asdf + + XML + assert(more_children_size > base_size, "adding children should increase memsize") + + attributes_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + + XML + assert(attributes_size > base_size, "adding attributes should increase memsize") + + string_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdf + + XML + assert(string_size > base_size, "longer strings should increase memsize") + + bigger_name_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + + XML + assert(bigger_name_size > base_size, "longer tags should increase memsize") + end + module MemInfo # from https://stackoverflow.com/questions/7220896/get-current-ruby-process-memory-usage # this is only going to work on linux