From 1284421f84a37ee2a075bde73836909dff1e9289 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Thu, 5 Aug 2021 22:08:26 -0400 Subject: [PATCH] ext: avoid C90 features Related to #2302 --- ext/nokogiri/gumbo.c | 135 ++++++++++++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 48 deletions(-) diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c index f10625e9d9..3a67f6dc32 100644 --- a/ext/nokogiri/gumbo.c +++ b/ext/nokogiri/gumbo.c @@ -89,15 +89,19 @@ get_parent(xmlNodePtr node) static GumboOutput * perform_parse(const GumboOptions *options, VALUE input) { + GumboOutput *output; + const char *status_string; + assert(RTEST(input)); Check_Type(input, T_STRING); - GumboOutput *output = gumbo_parse_with_options( - options, - RSTRING_PTR(input), - RSTRING_LEN(input) - ); - const char *status_string = gumbo_status_to_string(output->status); + output = gumbo_parse_with_options( + options, + RSTRING_PTR(input), + RSTRING_LEN(input) + ); + + status_string = gumbo_status_to_string(output->status); switch (output->status) { case GUMBO_STATUS_OK: break; @@ -151,9 +155,13 @@ build_tree( size_t child_index = 0; while (true) { + const GumboVector *children; + const GumboNode *gumbo_child; + xmlNodePtr xml_child; + assert(gumbo_node != NULL); - const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ? - &gumbo_node->v.document.children : &gumbo_node->v.element.children; + children = gumbo_node->type == GUMBO_NODE_DOCUMENT ? + &gumbo_node->v.document.children : &gumbo_node->v.element.children; if (child_index >= children->length) { // Move up the tree and to the next child. if (xml_node == xml_output_node) { @@ -172,9 +180,8 @@ build_tree( } continue; } - const GumboNode *gumbo_child = children->data[child_index++]; - xmlNodePtr xml_child; + gumbo_child = children->data[child_index++]; switch (gumbo_child->type) { case GUMBO_NODE_DOCUMENT: abort(); // Bug in Gumbo. @@ -202,12 +209,15 @@ build_tree( case GUMBO_NODE_TEMPLATE: // XXX: Should create a template element and a new DocumentFragment case GUMBO_NODE_ELEMENT: { + xmlNsPtr ns = NULL; + const GumboVector *attrs; + size_t i; + xml_child = xmlNewDocNode(doc, NULL, BAD_CAST gumbo_child->v.element.name, NULL); set_line(xml_child, gumbo_child->v.element.start_pos.line); if (xml_root == NULL) { xml_root = xml_child; } - xmlNsPtr ns = NULL; switch (gumbo_child->v.element.tag_namespace) { case GUMBO_NAMESPACE_HTML: break; @@ -224,8 +234,8 @@ build_tree( xmlAddChild(xml_node, xml_child); // Add the attributes. - const GumboVector *attrs = &gumbo_child->v.element.attributes; - for (size_t i = 0; i < attrs->length; i++) { + attrs = &gumbo_child->v.element.attributes; + for (i = 0; i < attrs->length; i++) { const GumboAttribute *attr = attrs->data[i]; switch (attr->attr_namespace) { @@ -264,19 +274,29 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) // Add parse errors to rdoc. if (output->errors.length) { + size_t i; const GumboVector *errors = &output->errors; VALUE rerrors = rb_ary_new2(errors->length); - for (size_t i = 0; i < errors->length; i++) { - GumboError *err = errors->data[i]; - GumboSourcePosition position = gumbo_error_position(err); + for (i = 0; i < errors->length; i++) { + GumboError *err; + GumboSourcePosition position; char *msg; - size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg); - VALUE err_str = rb_utf8_str_new(msg, size); + size_t size; + VALUE err_str; + VALUE syntax_error; + const char *error_code; + VALUE str1; + + err = errors->data[i]; + position = gumbo_error_position(err); + size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg); + err_str = rb_utf8_str_new(msg, size); free(msg); - VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError); - const char *error_code = gumbo_error_code(err); - VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil; + syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError); + error_code = gumbo_error_code(err); + str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil; + rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR @@ -323,18 +343,20 @@ static VALUE parse_continue(VALUE parse_args); static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) { + GumboOutput *output; GumboOptions options = kGumboDefaultOptions; + ParseArgs args; + options.max_attributes = NUM2INT(max_attributes); options.max_errors = NUM2INT(max_errors); options.max_tree_depth = NUM2INT(max_depth); - GumboOutput *output = perform_parse(&options, input); - ParseArgs args = { - .output = output, - .input = input, - .url_or_frag = url, - .doc = NULL, - }; + output = perform_parse(&options, input); + + args.output = output; + args.input = input; + args.url_or_frag = url; + args.doc = NULL; return rb_ensure(parse_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args)); } @@ -345,6 +367,8 @@ parse_continue(VALUE parse_args) ParseArgs *args = (ParseArgs *)parse_args; GumboOutput *output = args->output; xmlDocPtr doc; + VALUE rdoc; + if (output->document->v.document.has_doctype) { const char *name = output->document->v.document.name; const char *public = output->document->v.document.public_identifier; @@ -357,7 +381,7 @@ parse_continue(VALUE parse_args) } args->doc = doc; // Make sure doc gets cleaned up if an error is thrown. build_tree(doc, (xmlNodePtr)doc, output->document); - VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc); + rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc); args->doc = NULL; // The Ruby runtime now owns doc so don't delete it. add_errors(output, rdoc, args->input, args->url_or_frag); return rdoc; @@ -366,10 +390,15 @@ parse_continue(VALUE parse_args) static int lookup_namespace(VALUE node, bool require_known_ns) { + VALUE ns; ID namespace, href; + const char *href_ptr; + size_t href_len; + CONST_ID(namespace, "namespace"); CONST_ID(href, "href"); - VALUE ns = rb_funcall(node, namespace, 0); + + ns = rb_funcall(node, namespace, 0); if (NIL_P(ns)) { return GUMBO_NAMESPACE_HTML; @@ -378,8 +407,8 @@ lookup_namespace(VALUE node, bool require_known_ns) assert(RTEST(ns)); Check_Type(ns, T_STRING); - const char *href_ptr = RSTRING_PTR(ns); - size_t href_len = RSTRING_LEN(ns); + href_ptr = RSTRING_PTR(ns); + href_len = RSTRING_LEN(ns); #define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len)) if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) { return GUMBO_NAMESPACE_HTML; @@ -427,15 +456,23 @@ fragment( GumboQuirksModeEnum quirks_mode; bool form = false; const char *encoding = NULL; + VALUE doc, dtd; + int depth; + GumboOptions options = kGumboDefaultOptions; + GumboOutput *output; + ParseArgs args; if (NIL_P(ctx)) { ctx_tag = "body"; ctx_ns = GUMBO_NAMESPACE_HTML; } else if (TYPE(ctx) == T_STRING) { + size_t len; + const char *colon; + ctx_tag = StringValueCStr(ctx); ctx_ns = GUMBO_NAMESPACE_HTML; - size_t len = RSTRING_LEN(ctx); - const char *colon = memchr(ctx_tag, ':', len); + len = RSTRING_LEN(ctx); + colon = memchr(ctx_tag, ':', len); if (colon) { switch (colon - ctx_tag) { case 3: @@ -470,6 +507,7 @@ fragment( // Check if it's a form. form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0; } else { + VALUE node, element_name; ID element_ = rb_intern_const("element?"); // Context fragment name. @@ -482,13 +520,13 @@ fragment( ctx_ns = lookup_namespace(ctx, true); // Check for a form ancestor, including self. - for (VALUE node = ctx; + for (node = ctx; !NIL_P(node); node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) { if (!RTEST(rb_funcall(node, element_, 0))) { continue; } - VALUE element_name = rb_funcall(node, name, 0); + element_name = rb_funcall(node, name, 0); if (RSTRING_LEN(element_name) == 4 && !st_strcasecmp(RSTRING_PTR(element_name), "form") && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) { @@ -510,8 +548,8 @@ fragment( } // Quirks mode. - VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0); - VALUE dtd = rb_funcall(doc, internal_subset, 0); + doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0); + dtd = rb_funcall(doc, internal_subset, 0); if (NIL_P(dtd)) { quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS; } else { @@ -526,8 +564,8 @@ fragment( } // Perform a fragment parse. - int depth = NUM2INT(max_depth); - GumboOptions options = kGumboDefaultOptions; + depth = NUM2INT(max_depth); + options = kGumboDefaultOptions; options.max_attributes = NUM2INT(max_attributes); options.max_errors = NUM2INT(max_errors); // Add one to account for the HTML element. @@ -538,13 +576,13 @@ fragment( options.quirks_mode = quirks_mode; options.fragment_context_has_form_ancestor = form; - GumboOutput *output = perform_parse(&options, tags); - ParseArgs args = { - .output = output, - .input = tags, - .url_or_frag = doc_fragment, - .doc = (xmlDocPtr)extract_xml_node(doc), - }; + output = perform_parse(&options, tags); + + args.output = output; + args.input = tags; + args.url_or_frag = doc_fragment; + args.doc = (xmlDocPtr)extract_xml_node(doc); + rb_ensure(fragment_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args)); return Qnil; } @@ -556,9 +594,10 @@ fragment_continue(VALUE parse_args) GumboOutput *output = args->output; VALUE doc_fragment = args->url_or_frag; xmlDocPtr xml_doc = args->doc; + xmlNodePtr xml_frag; args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it. - xmlNodePtr xml_frag = extract_xml_node(doc_fragment); + xml_frag = extract_xml_node(doc_fragment); build_tree(xml_doc, xml_frag, output->root); add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9)); return Qnil;