diff --git a/ext/nokogiri/html4_document.c b/ext/nokogiri/html4_document.c index 939321e3bd..7b816d5ff0 100644 --- a/ext/nokogiri/html4_document.c +++ b/ext/nokogiri/html4_document.c @@ -145,7 +145,7 @@ static VALUE rb_html_document_type(VALUE self) { htmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); return INT2NUM(doc->type); } diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index 4c6bea855e..bc7f515f99 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -166,6 +166,8 @@ typedef struct _nokogiriXsltStylesheetTuple { VALUE func_instances; } nokogiriXsltStylesheetTuple; +extern const rb_data_type_t noko_xml_document_data_type; + void noko_xml_document_pin_node(xmlNodePtr); void noko_xml_document_pin_namespace(xmlNsPtr, xmlDocPtr); diff --git a/ext/nokogiri/xml_comment.c b/ext/nokogiri/xml_comment.c index 81b57ca472..35db13d098 100644 --- a/ext/nokogiri/xml_comment.c +++ b/ext/nokogiri/xml_comment.c @@ -30,7 +30,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_raise(rb_eArgError, "first argument must be a XML::Document or XML::Node"); } - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewDocComment( xml_doc, diff --git a/ext/nokogiri/xml_document.c b/ext/nokogiri/xml_document.c index a55daa1b55..8dff604bec 100644 --- a/ext/nokogiri/xml_document.c +++ b/ext/nokogiri/xml_document.c @@ -51,8 +51,9 @@ remove_private(xmlNodePtr node) } static void -mark(xmlDocPtr doc) +mark(void *data) { + xmlDocPtr doc = (xmlDocPtr)data; nokogiriTuplePtr tuple = (nokogiriTuplePtr)doc->_private; if (tuple) { rb_gc_mark(tuple->doc); @@ -61,8 +62,9 @@ mark(xmlDocPtr doc) } static void -dealloc(xmlDocPtr doc) +dealloc(void *data) { + xmlDocPtr doc = (xmlDocPtr)data; st_table *node_hash; node_hash = DOC_UNLINKED_NODE_HASH(doc); @@ -84,6 +86,45 @@ dealloc(xmlDocPtr doc) xmlFreeDoc(doc); } +static size_t +memsize_node(const xmlNodePtr node) +{ + /* note we don't count namespace definitions, just going for a good-enough number here */ + xmlNodePtr child; + size_t memsize = 0; + + memsize += xmlStrlen(node->name); + for (child = (xmlNodePtr)node->properties; child; child = child->next) { + memsize += sizeof(xmlAttr) + memsize_node(child); + } + if (node->type == XML_TEXT_NODE) { + memsize += xmlStrlen(node->content); + } + for (child = node->children; child; child = child->next) { + memsize += sizeof(xmlNode) + memsize_node(child); + } + return memsize; +} + +static size_t +memsize(const void *data) +{ + xmlDocPtr doc = (const xmlDocPtr)data; + size_t memsize = sizeof(xmlDoc); + /* This may not account for all memory use */ + memsize += memsize_node((xmlNodePtr)doc); + return memsize; +} + +const rb_data_type_t noko_xml_document_data_type = { + .wrap_struct_name = "Nokogiri::XML::Document", + .function = { + .dmark = mark, + .dfree = dealloc, + .dsize = memsize, + } +}; + static void recursively_remove_namespaces_from_node(xmlNodePtr node) { @@ -127,7 +168,7 @@ static VALUE url(VALUE self) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (doc->URL) { return NOKOGIRI_STR_NEW2(doc->URL); } @@ -146,7 +187,7 @@ rb_xml_document_root_set(VALUE self, VALUE rb_new_root) xmlDocPtr c_document; xmlNodePtr c_new_root = NULL, c_current_root; - Data_Get_Struct(self, xmlDoc, c_document); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_document); c_current_root = xmlDocGetRootElement(c_document); if (c_current_root) { @@ -190,7 +231,7 @@ rb_xml_document_root(VALUE self) xmlDocPtr c_document; xmlNodePtr c_root; - Data_Get_Struct(self, xmlDoc, c_document); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_document); c_root = xmlDocGetRootElement(c_document); if (!c_root) { @@ -210,7 +251,7 @@ static VALUE set_encoding(VALUE self, VALUE encoding) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (doc->encoding) { xmlFree(DISCARD_CONST_QUAL_XMLCHAR(doc->encoding)); @@ -231,7 +272,7 @@ static VALUE encoding(VALUE self) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (!doc->encoding) { return Qnil; } return NOKOGIRI_STR_NEW2(doc->encoding); @@ -247,7 +288,7 @@ static VALUE version(VALUE self) { xmlDocPtr doc; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); if (!doc->version) { return Qnil; } return NOKOGIRI_STR_NEW2(doc->version); @@ -369,7 +410,7 @@ duplicate_document(int argc, VALUE *argv, VALUE self) level = INT2NUM((long)1); } - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); dup = xmlCopyDoc(doc, (int)NUM2INT(level)); @@ -443,7 +484,7 @@ static VALUE remove_namespaces_bang(VALUE self) { xmlDocPtr doc ; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); recursively_remove_namespaces_from_node((xmlNodePtr)doc); return self; @@ -471,7 +512,7 @@ create_entity(int argc, VALUE *argv, VALUE self) xmlEntityPtr ptr; xmlDocPtr doc ; - Data_Get_Struct(self, xmlDoc, doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, doc); rb_scan_args(argc, argv, "14", &name, &type, &external_id, &system_id, &content); @@ -559,7 +600,7 @@ rb_xml_document_canonicalize(int argc, VALUE *argv, VALUE self) } } - Data_Get_Struct(self, xmlDoc, c_doc); + TypedData_Get_Struct(self, xmlDoc, &noko_xml_document_data_type, c_doc); rb_cStringIO = rb_const_get_at(rb_cObject, rb_intern("StringIO")); rb_io = rb_class_new_instance(0, 0, rb_cStringIO); @@ -607,7 +648,7 @@ noko_xml_document_wrap_with_init_args(VALUE klass, xmlDocPtr c_document, int arg klass = cNokogiriXmlDocument; } - rb_document = Data_Wrap_Struct(klass, mark, dealloc, c_document); + rb_document = TypedData_Wrap_Struct(klass, &noko_xml_document_data_type, c_document); tuple = (nokogiriTuplePtr)ruby_xmalloc(sizeof(nokogiriTuple)); tuple->doc = rb_document; diff --git a/ext/nokogiri/xml_document_fragment.c b/ext/nokogiri/xml_document_fragment.c index 1a82eed1ec..bd534c3df1 100644 --- a/ext/nokogiri/xml_document_fragment.c +++ b/ext/nokogiri/xml_document_fragment.c @@ -19,7 +19,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_scan_args(argc, argv, "1*", &document, &rest); - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewDocFragment(xml_doc->doc); diff --git a/ext/nokogiri/xml_entity_reference.c b/ext/nokogiri/xml_entity_reference.c index d8f12438c5..272ea9ffec 100644 --- a/ext/nokogiri/xml_entity_reference.c +++ b/ext/nokogiri/xml_entity_reference.c @@ -20,7 +20,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_scan_args(argc, argv, "2*", &document, &name, &rest); - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewReference( xml_doc, diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index 7ddc639cbe..204df446fe 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -984,7 +984,7 @@ duplicate_node(int argc, VALUE *argv, VALUE self) if (n_args < 2) { new_parent_doc = node->doc; } else { - Data_Get_Struct(r_new_parent_doc, xmlDoc, new_parent_doc); + TypedData_Get_Struct(r_new_parent_doc, xmlDoc, &noko_xml_document_data_type, new_parent_doc); } dup = xmlDocCopyNode(node, new_parent_doc, level); diff --git a/ext/nokogiri/xml_processing_instruction.c b/ext/nokogiri/xml_processing_instruction.c index f4a6dc2eb2..76358cc0cc 100644 --- a/ext/nokogiri/xml_processing_instruction.c +++ b/ext/nokogiri/xml_processing_instruction.c @@ -22,7 +22,7 @@ new (int argc, VALUE *argv, VALUE klass) rb_scan_args(argc, argv, "3*", &document, &name, &content, &rest); - Data_Get_Struct(document, xmlDoc, xml_doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, xml_doc); node = xmlNewDocPI( xml_doc, diff --git a/ext/nokogiri/xml_relax_ng.c b/ext/nokogiri/xml_relax_ng.c index 943b0a7cd5..4533abd883 100644 --- a/ext/nokogiri/xml_relax_ng.c +++ b/ext/nokogiri/xml_relax_ng.c @@ -23,7 +23,7 @@ validate_document(VALUE self, VALUE document) xmlRelaxNGValidCtxtPtr valid_ctxt; Data_Get_Struct(self, xmlRelaxNG, schema); - Data_Get_Struct(document, xmlDoc, doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, doc); errors = rb_ary_new(); @@ -127,7 +127,7 @@ from_document(int argc, VALUE *argv, VALUE klass) scanned_args = rb_scan_args(argc, argv, "11", &document, &parse_options); - Data_Get_Struct(document, xmlDoc, doc); + TypedData_Get_Struct(document, xmlDoc, &noko_xml_document_data_type, doc); doc = doc->doc; /* In case someone passes us a node. ugh. */ if (scanned_args == 1) { diff --git a/ext/nokogiri/xslt_stylesheet.c b/ext/nokogiri/xslt_stylesheet.c index 700c754ee3..720adda863 100644 --- a/ext/nokogiri/xslt_stylesheet.c +++ b/ext/nokogiri/xslt_stylesheet.c @@ -79,7 +79,7 @@ parse_stylesheet_doc(VALUE klass, VALUE xmldocobj) xmlDocPtr xml, xml_cpy; VALUE errstr, exception; xsltStylesheetPtr ss ; - Data_Get_Struct(xmldocobj, xmlDoc, xml); + TypedData_Get_Struct(xmldocobj, xmlDoc, &noko_xml_document_data_type, xml); errstr = rb_str_new(0, 0); xsltSetGenericErrorFunc((void *)errstr, xslt_generic_error_handler); @@ -114,7 +114,12 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj) int doc_len ; VALUE rval ; - Data_Get_Struct(xmlobj, xmlDoc, xml); + TypedData_Get_Struct( + xmlobj, + xmlDoc, + &noko_xml_document_data_type, + xml + ); TypedData_Get_Struct( self, nokogiriXsltStylesheetTuple, @@ -265,7 +270,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self) Check_Type(paramobj, T_ARRAY); - Data_Get_Struct(xmldoc, xmlDoc, xml); + TypedData_Get_Struct(xmldoc, xmlDoc, &noko_xml_document_data_type, xml); TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper); param_len = RARRAY_LEN(paramobj); diff --git a/test/test_memory_leak.rb b/test/test_memory_leak.rb index 808040d25a..3b8dc32d1e 100644 --- a/test/test_memory_leak.rb +++ b/test/test_memory_leak.rb @@ -272,6 +272,47 @@ def test_leaking_dtd_nodes_after_internal_subset_removal end end # if NOKOGIRI_GC + def test_object_space_memsize_of + require "objspace" + skip("memsize_of not defined") unless ObjectSpace.respond_to?(:memsize_of) + + base_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + + XML + + more_children_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + asdf + asdf + + XML + assert(more_children_size > base_size, "adding children should increase memsize") + + attributes_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + + XML + assert(attributes_size > base_size, "adding attributes should increase memsize") + + string_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdfasdf + + XML + assert(string_size > base_size, "longer strings should increase memsize") + + bigger_name_size = ObjectSpace.memsize_of(Nokogiri::XML(<<~XML)) + + asdf + + XML + assert(bigger_name_size > base_size, "longer tags should increase memsize") + end + module MemInfo # from https://stackoverflow.com/questions/7220896/get-current-ruby-process-memory-usage # this is only going to work on linux