diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 0000000000..36e8cfb78c --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,50 @@ +# this is a work in progress! +name: windows +on: + push: + branches: + - main + pull_request: + types: [opened, synchronize] + branches: + - '*' + +jobs: + windows: + name: "windows, sys: ${{ matrix.sys }}, ${{ matrix.ruby }}" + + env: + MAKEFLAGS: -j2 + + runs-on: windows-latest + + strategy: + fail-fast: false + matrix: + sys: [ enable, disable ] + ruby: [ "2.5", "2.6", "2.7", "3.0", "mingw" ] + + steps: + - name: configure git crlf on windows + run: | + git config --system core.autocrlf false + git config --system core.eol lf + - name: checkout + uses: actions/checkout@v2 + - name: load Ruby and bundle install + uses: MSP-Greg/setup-ruby-pkgs@v1 + with: + ruby-version: ${{ matrix.ruby }} + mingw: libxml2 libxslt + bundler-cache: true + - uses: actions/cache@v2 + if: matrix.sys == 'disable' + with: + path: ports/archives + key: ${{ runner.os }}-${{ matrix.ruby }}-tarballs-${{ hashFiles('**/dependencies.yml') }} + restore-keys: ${{ runner.os }}-${{ matrix.ruby }}-tarballs- + - name: bundle exec rake compile + run: | + bundle exec rake compile -- --${{ matrix.sys }}-system-libraries + - name: bundle exec rake test + run: bundle exec rake test diff --git a/.gitignore b/.gitignore index 79baf6a716..5a6b71bc58 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ /gems/ /lib/nokogiri/**/nokogiri.bundle /lib/nokogiri/**/nokogiri.so +/lib/nokogumbo/**/nokogumbo.bundle +/lib/nokogumbo/**/nokogumbo.so /lib/nokogiri/nokogiri.jar /pkg/ /ports/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a710b1c6f..db1d50c3bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ## next / unreleased +### Dependencies + +* [MRI] Upgrade mini_portile2 dependency from `~> 2.5.0` to `~> 2.5.1`. 
+ + ### Changed * Introduce `Nokogiri::XML::ParseOptions::DEFAULT_XSLT` which adds the libxslt-preferred options of `NOENT | DTDLOAD | DTDATTR | NOCDATA` to `ParseOptions::DEFAULT_XML`. diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java index 139659bad2..1942cb4891 100644 --- a/ext/java/nokogiri/XmlDocument.java +++ b/ext/java/nokogiri/XmlDocument.java @@ -443,7 +443,7 @@ private static class DocumentBuilderFactoryHolder return new_root; } if (!(new_root instanceof XmlNode)) { - throw context.runtime.newArgumentError("expected Nokogiri::XML::Node but received " + new_root.getType()); + throw context.runtime.newArgumentError("expected Nokogiri::XML::Node but received " + new_root.getType()); } XmlNode newRoot = asXmlNode(context, new_root); diff --git a/ext/nokogiri/extconf.rb b/ext/nokogiri/extconf.rb index fa4d189c77..38c93e2ff6 100644 --- a/ext/nokogiri/extconf.rb +++ b/ext/nokogiri/extconf.rb @@ -14,7 +14,7 @@ # The gem version constraint in the Rakefile is not respected at install time. # Keep this version in sync with the one in the Rakefile ! 
-REQUIRED_MINI_PORTILE_VERSION = "~> 2.5.0" +REQUIRED_MINI_PORTILE_VERSION = "~> 2.5.1" REQUIRED_PKG_CONFIG_VERSION = "~> 1.1" # Keep track of what versions of what libraries we build against @@ -402,7 +402,7 @@ def process_recipe(name, version, static_p, cross_p) require 'mini_portile2' message("Using mini_portile version #{MiniPortile::VERSION}\n") - if name != "libxml2" && name != "libxslt" + unless ["libxml2", "libxslt"].include?(name) OTHER_LIBRARY_VERSIONS[name] = version end @@ -486,7 +486,7 @@ def process_recipe(name, version, static_p, cross_p) end end - message(<<~EOM) + message(<<~EOM) if name != "libgumbo" The Nokogiri maintainers intend to provide timely security updates, but if this is a concern for you and want to use your OS/distro system library @@ -498,7 +498,7 @@ def process_recipe(name, version, static_p, cross_p) EOM message(<<~EOM) if name == 'libxml2' - Note, however, that nokogiri cannot guarantee compatiblity with every + Note, however, that nokogiri cannot guarantee compatibility with every version of libxml2 that may be provided by OS/package vendors. EOM @@ -868,6 +868,56 @@ def compile ensure_func("exsltFuncRegister", "libexslt/exslt.h") end +libgumbo_recipe = process_recipe("libgumbo", "1.0.0-nokogiri", static_p, cross_build_p) do |recipe| + recipe.configure_options = [] + + class << recipe + def downloaded? + true + end + + def extract + target = File.join(tmp_path, "gumbo-parser") + output "Copying gumbo-parser files into #{target}..." + FileUtils.mkdir_p target + FileUtils.cp Dir.glob(File.join(PACKAGE_ROOT_DIR, "gumbo-parser/src/*")), target + end + + def configured? 
+ true + end + + def install + lib_dir = File.join(port_path, "lib") + inc_dir = File.join(port_path, "include") + FileUtils.mkdir_p([lib_dir, inc_dir]) + FileUtils.cp File.join(work_path, "libgumbo.a"), lib_dir + FileUtils.cp Dir.glob(File.join(work_path, "*.h")), inc_dir + end + + def compile + cflags = concat_flags(ENV["CFLAGS"], "-fPIC", "-g") + + env = {"CC" => gcc_cmd, "CFLAGS" => cflags} + if config_cross_build? + if host =~ /darwin/ + env["AR"] = "#{host}-libtool" + env["ARFLAGS"] = "-o" + else + env["AR"] = "#{host}-ar" + end + env["RANLIB"] = "#{host}-ranlib" + end + + execute("compile", make_cmd, {env: env}) + end + end +end +append_cppflags("-I#{File.join(libgumbo_recipe.path, "include")}") +$libs = $libs + " " + File.join(libgumbo_recipe.path, "lib", "libgumbo.a") +$LIBPATH = $LIBPATH | [File.join(libgumbo_recipe.path, "lib")] +ensure_func("gumbo_parse_with_options", "gumbo.h") + have_func('xmlHasFeature') || abort("xmlHasFeature() is missing.") # introduced in libxml 2.6.21 have_func('xmlFirstElementChild') # introduced in libxml 2.7.3 have_func('xmlRelaxNGSetParserStructuredErrors') # introduced in libxml 2.6.24 diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c new file mode 100644 index 0000000000..7a3b42a878 --- /dev/null +++ b/ext/nokogiri/gumbo.c @@ -0,0 +1,606 @@ +// +// Copyright 2013-2021 Sam Ruby, Stephen Checkoway +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// +// gumbo.c defines the following: +// +// module Nokogiri::Gumbo +// def self.parse(input, url, max_attributes, max_errors, max_depth) # returns Nokogiri::HTML5::Document +// end +// +// Processing starts by calling gumbo_parse_with_options. The resulting document tree +// is then walked, a parallel libxml2 tree is constructed, and the final document is +// then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU +// requirements as Ruby objects are only built when necessary. +// + +#include + +#include "gumbo.h" + +VALUE cNokogiriHtml5Document; + +// Interned symbols +static ID internal_subset; +static ID parent; + +/* Backwards compatibility to Ruby 2.1.0 */ +#if RUBY_API_VERSION_CODE < 20200 +#define ONIG_ESCAPE_UCHAR_COLLISION 1 +#include + +static VALUE +rb_utf8_str_new(const char *str, long length) +{ + return rb_enc_str_new(str, length, rb_utf8_encoding()); +} + +static VALUE +rb_utf8_str_new_cstr(const char *str) +{ + return rb_enc_str_new_cstr(str, rb_utf8_encoding()); +} + +static VALUE +rb_utf8_str_new_static(const char *str, long length) +{ + return rb_enc_str_new(str, length, rb_utf8_encoding()); +} +#endif + +#include +#include +#include + +// URI = system id +// external id = public id +static xmlDocPtr +new_html_doc(const char *dtd_name, const char *system, const char *public) +{ + // These two libxml2 functions take the public and system ids in + // opposite orders. 
+ htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL); + assert(doc); + if (dtd_name) { + xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system); + } + return doc; +} + +static xmlNodePtr +get_parent(xmlNodePtr node) +{ + return node->parent; +} + +static GumboOutput * +perform_parse(const GumboOptions *options, VALUE input) +{ + assert(RTEST(input)); + Check_Type(input, T_STRING); + GumboOutput *output = gumbo_parse_with_options( + options, + RSTRING_PTR(input), + RSTRING_LEN(input) + ); + + const char *status_string = gumbo_status_to_string(output->status); + switch (output->status) { + case GUMBO_STATUS_OK: + break; + case GUMBO_STATUS_TOO_MANY_ATTRIBUTES: + case GUMBO_STATUS_TREE_TOO_DEEP: + gumbo_destroy_output(output); + rb_raise(rb_eArgError, "%s", status_string); + case GUMBO_STATUS_OUT_OF_MEMORY: + gumbo_destroy_output(output); + rb_raise(rb_eNoMemError, "%s", status_string); + } + return output; +} + +static xmlNsPtr +lookup_or_add_ns( + xmlDocPtr doc, + xmlNodePtr root, + const char *href, + const char *prefix +) +{ + xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix); + if (ns) { + return ns; + } + return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix); +} + +static void +set_line(xmlNodePtr node, size_t line) +{ + // libxml2 uses 65535 to mean look elsewhere for the line number on some + // nodes. + if (line < 65535) { + node->line = (unsigned short)line; + } +} + +// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted +// at gumbo_node. +static void +build_tree( + xmlDocPtr doc, + xmlNodePtr xml_output_node, + const GumboNode *gumbo_node +) +{ + xmlNodePtr xml_root = NULL; + xmlNodePtr xml_node = xml_output_node; + size_t child_index = 0; + + while (true) { + assert(gumbo_node != NULL); + const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ? 
+ &gumbo_node->v.document.children : &gumbo_node->v.element.children; + if (child_index >= children->length) { + // Move up the tree and to the next child. + if (xml_node == xml_output_node) { + // We've built as much of the tree as we can. + return; + } + child_index = gumbo_node->index_within_parent + 1; + gumbo_node = gumbo_node->parent; + xml_node = get_parent(xml_node); + // Children of fragments don't share the same root, so reset it and + // it'll be set below. In the non-fragment case, this will only happen + // after the html element has been finished at which point there are no + // further elements. + if (xml_node == xml_output_node) { + xml_root = NULL; + } + continue; + } + const GumboNode *gumbo_child = children->data[child_index++]; + xmlNodePtr xml_child; + + switch (gumbo_child->type) { + case GUMBO_NODE_DOCUMENT: + abort(); // Bug in Gumbo. + + case GUMBO_NODE_TEXT: + case GUMBO_NODE_WHITESPACE: + xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text); + set_line(xml_child, gumbo_child->v.text.start_pos.line); + xmlAddChild(xml_node, xml_child); + break; + + case GUMBO_NODE_CDATA: + xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text, + (int) strlen(gumbo_child->v.text.text)); + set_line(xml_child, gumbo_child->v.text.start_pos.line); + xmlAddChild(xml_node, xml_child); + break; + + case GUMBO_NODE_COMMENT: + xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text); + set_line(xml_child, gumbo_child->v.text.start_pos.line); + xmlAddChild(xml_node, xml_child); + break; + + case GUMBO_NODE_TEMPLATE: + // XXX: Should create a template element and a new DocumentFragment + case GUMBO_NODE_ELEMENT: { + xml_child = xmlNewDocNode(doc, NULL, BAD_CAST gumbo_child->v.element.name, NULL); + set_line(xml_child, gumbo_child->v.element.start_pos.line); + if (xml_root == NULL) { + xml_root = xml_child; + } + xmlNsPtr ns = NULL; + switch (gumbo_child->v.element.tag_namespace) { + case GUMBO_NAMESPACE_HTML: + break; + case 
GUMBO_NAMESPACE_SVG: + ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg"); + break; + case GUMBO_NAMESPACE_MATHML: + ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math"); + break; + } + if (ns != NULL) { + xmlSetNs(xml_child, ns); + } + xmlAddChild(xml_node, xml_child); + + // Add the attributes. + const GumboVector *attrs = &gumbo_child->v.element.attributes; + for (size_t i = 0; i < attrs->length; i++) { + const GumboAttribute *attr = attrs->data[i]; + + switch (attr->attr_namespace) { + case GUMBO_ATTR_NAMESPACE_XLINK: + ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink"); + break; + + case GUMBO_ATTR_NAMESPACE_XML: + ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml"); + break; + + case GUMBO_ATTR_NAMESPACE_XMLNS: + ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns"); + break; + + default: + ns = NULL; + } + xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value); + } + + // Add children for this element. + child_index = 0; + gumbo_node = gumbo_child; + xml_node = xml_child; + } + } + } +} + +static void +add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) +{ + const char *input_str = RSTRING_PTR(input); + size_t input_len = RSTRING_LEN(input); + + // Add parse errors to rdoc. + if (output->errors.length) { + const GumboVector *errors = &output->errors; + VALUE rerrors = rb_ary_new2(errors->length); + + for (size_t i = 0; i < errors->length; i++) { + GumboError *err = errors->data[i]; + GumboSourcePosition position = gumbo_error_position(err); + char *msg; + size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg); + VALUE err_str = rb_utf8_str_new(msg, size); + free(msg); + VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError); + const char *error_code = gumbo_error_code(err); + VALUE str1 = error_code ? 
rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil; + rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER + rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR + rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR + rb_iv_set(syntax_error, "@file", url); + rb_iv_set(syntax_error, "@line", INT2NUM(position.line)); + rb_iv_set(syntax_error, "@str1", str1); + rb_iv_set(syntax_error, "@str2", Qnil); + rb_iv_set(syntax_error, "@str3", Qnil); + rb_iv_set(syntax_error, "@int1", INT2NUM(0)); + rb_iv_set(syntax_error, "@column", INT2NUM(position.column)); + rb_ary_push(rerrors, syntax_error); + } + rb_iv_set(rdoc, "@errors", rerrors); + } +} + +typedef struct { + GumboOutput *output; + VALUE input; + VALUE url_or_frag; + xmlDocPtr doc; +} ParseArgs; + +static void +parse_args_mark(void *parse_args) +{ + ParseArgs *args = parse_args; + rb_gc_mark_maybe(args->input); + rb_gc_mark_maybe(args->url_or_frag); +} + +// Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the +// wrapper. +static VALUE +wrap_parse_args(ParseArgs *args) +{ + return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args); +} + +// Returns the underlying ParseArgs wrapped by wrap_parse_args. +static ParseArgs * +unwrap_parse_args(VALUE obj) +{ + ParseArgs *args; + Data_Get_Struct(obj, ParseArgs, args); + return args; +} + +static VALUE +parse_cleanup(VALUE parse_args) +{ + ParseArgs *args = unwrap_parse_args(parse_args); + gumbo_destroy_output(args->output); + // Make sure garbage collection doesn't mark the objects as being live based + // on references from the ParseArgs. This may be unnecessary. 
+ args->input = Qnil; + args->url_or_frag = Qnil; + if (args->doc != NULL) { + xmlFreeDoc(args->doc); + } + return Qnil; +} + +static VALUE parse_continue(VALUE parse_args); + +// Parse a string using gumbo_parse into a Nokogiri document +static VALUE +parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) +{ + GumboOptions options = kGumboDefaultOptions; + options.max_attributes = NUM2INT(max_attributes); + options.max_errors = NUM2INT(max_errors); + options.max_tree_depth = NUM2INT(max_depth); + + GumboOutput *output = perform_parse(&options, input); + ParseArgs args = { + .output = output, + .input = input, + .url_or_frag = url, + .doc = NULL, + }; + VALUE parse_args = wrap_parse_args(&args); + + return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args); +} + +static VALUE +parse_continue(VALUE parse_args) +{ + ParseArgs *args = unwrap_parse_args(parse_args); + GumboOutput *output = args->output; + xmlDocPtr doc; + if (output->document->v.document.has_doctype) { + const char *name = output->document->v.document.name; + const char *public = output->document->v.document.public_identifier; + const char *system = output->document->v.document.system_identifier; + public = public[0] ? public : NULL; + system = system[0] ? system : NULL; + doc = new_html_doc(name, system, public); + } else { + doc = new_html_doc(NULL, NULL, NULL); + } + args->doc = doc; // Make sure doc gets cleaned up if an error is thrown. + build_tree(doc, (xmlNodePtr)doc, output->document); + VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc); + args->doc = NULL; // The Ruby runtime now owns doc so don't delete it. 
+ add_errors(output, rdoc, args->input, args->url_or_frag); + return rdoc; +} + +static int +lookup_namespace(VALUE node, bool require_known_ns) +{ + ID namespace, href; + CONST_ID(namespace, "namespace"); + CONST_ID(href, "href"); + VALUE ns = rb_funcall(node, namespace, 0); + + if (NIL_P(ns)) { + return GUMBO_NAMESPACE_HTML; + } + ns = rb_funcall(ns, href, 0); + assert(RTEST(ns)); + Check_Type(ns, T_STRING); + + const char *href_ptr = RSTRING_PTR(ns); + size_t href_len = RSTRING_LEN(ns); +#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len)) + if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) { + return GUMBO_NAMESPACE_HTML; + } + if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML")) { + return GUMBO_NAMESPACE_MATHML; + } + if (NAMESPACE_P("http://www.w3.org/2000/svg")) { + return GUMBO_NAMESPACE_SVG; + } +#undef NAMESPACE_P + if (require_known_ns) { + rb_raise(rb_eArgError, "Unexpected namespace URI \"%.*s\"", (int)href_len, href_ptr); /* "%.*s" caps output at href_len bytes; "%*s" only set a minimum width */ + } + return -1; +} + +static xmlNodePtr +extract_xml_node(VALUE node) +{ + xmlNodePtr xml_node; + Data_Get_Struct(node, xmlNode, xml_node); + return xml_node; +} + +static VALUE fragment_continue(VALUE parse_args); + +static VALUE +fragment( + VALUE self, + VALUE doc_fragment, + VALUE tags, + VALUE ctx, + VALUE max_attributes, + VALUE max_errors, + VALUE max_depth +) +{ + ID name = rb_intern_const("name"); + const char *ctx_tag; + GumboNamespaceEnum ctx_ns; + GumboQuirksModeEnum quirks_mode; + bool form = false; + const char *encoding = NULL; + + if (NIL_P(ctx)) { + ctx_tag = "body"; + ctx_ns = GUMBO_NAMESPACE_HTML; + } else if (TYPE(ctx) == T_STRING) { + ctx_tag = StringValueCStr(ctx); + ctx_ns = GUMBO_NAMESPACE_HTML; + size_t len = RSTRING_LEN(ctx); + const char *colon = memchr(ctx_tag, ':', len); + if (colon) { + switch (colon - ctx_tag) { + case 3: + if (st_strncasecmp(ctx_tag, "svg", 3) != 0) { + goto error; + } + ctx_ns = GUMBO_NAMESPACE_SVG; + break; + case 4: + if 
(st_strncasecmp(ctx_tag, "html", 4) == 0) { + ctx_ns = GUMBO_NAMESPACE_HTML; + } else if (st_strncasecmp(ctx_tag, "math", 4) == 0) { + ctx_ns = GUMBO_NAMESPACE_MATHML; + } else { + goto error; + } + break; + default: +error: + rb_raise(rb_eArgError, "Invalid context namespace '%.*s'", (int)(colon - ctx_tag), ctx_tag); /* "%.*s": report only the prefix before ':' */ + } + ctx_tag = colon + 1; + } else { + // For convenience, put 'svg' and 'math' in their namespaces. + if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) { + ctx_ns = GUMBO_NAMESPACE_SVG; + } else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) { + ctx_ns = GUMBO_NAMESPACE_MATHML; + } + } + + // Check if it's a form. + form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0; + } else { + ID element_ = rb_intern_const("element?"); + + // Context fragment name. + VALUE tag_name = rb_funcall(ctx, name, 0); + assert(RTEST(tag_name)); + Check_Type(tag_name, T_STRING); + ctx_tag = StringValueCStr(tag_name); + + // Context fragment namespace. + ctx_ns = lookup_namespace(ctx, true); + + // Check for a form ancestor, including self. + for (VALUE node = ctx; + !NIL_P(node); + node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) { + if (!RTEST(rb_funcall(node, element_, 0))) { + continue; + } + VALUE element_name = rb_funcall(node, name, 0); + if (RSTRING_LEN(element_name) == 4 + && !st_strcasecmp(RSTRING_PTR(element_name), "form") + && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) { + form = true; + break; + } + } + + // Encoding. + if (RSTRING_LEN(tag_name) == 14 + && !st_strcasecmp(ctx_tag, "annotation-xml")) { + VALUE enc = rb_funcall(ctx, rb_intern_const("[]"), 1 /* argc */, + rb_utf8_str_new_static("encoding", 8)); + if (RTEST(enc)) { + Check_Type(enc, T_STRING); + encoding = StringValueCStr(enc); + } + } + } + + // Quirks mode. 
+ VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0); + VALUE dtd = rb_funcall(doc, internal_subset, 0); + if (NIL_P(dtd)) { + quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS; + } else { + VALUE dtd_name = rb_funcall(dtd, name, 0); + VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0); + VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0); + quirks_mode = gumbo_compute_quirks_mode( + NIL_P(dtd_name) ? NULL : StringValueCStr(dtd_name), + NIL_P(pubid) ? NULL : StringValueCStr(pubid), + NIL_P(sysid) ? NULL : StringValueCStr(sysid) + ); + } + + // Perform a fragment parse. + int depth = NUM2INT(max_depth); + GumboOptions options = kGumboDefaultOptions; + options.max_attributes = NUM2INT(max_attributes); + options.max_errors = NUM2INT(max_errors); + // Add one to account for the HTML element. + options.max_tree_depth = depth < 0 ? -1 : (depth + 1); + options.fragment_context = ctx_tag; + options.fragment_namespace = ctx_ns; + options.fragment_encoding = encoding; + options.quirks_mode = quirks_mode; + options.fragment_context_has_form_ancestor = form; + + GumboOutput *output = perform_parse(&options, tags); + ParseArgs args = { + .output = output, + .input = tags, + .url_or_frag = doc_fragment, + .doc = (xmlDocPtr)extract_xml_node(doc), + }; + VALUE parse_args = wrap_parse_args(&args); + rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args); + return Qnil; +} + +static VALUE +fragment_continue(VALUE parse_args) +{ + ParseArgs *args = unwrap_parse_args(parse_args); + GumboOutput *output = args->output; + VALUE doc_fragment = args->url_or_frag; + xmlDocPtr xml_doc = args->doc; + + args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it. 
+ xmlNodePtr xml_frag = extract_xml_node(doc_fragment); + build_tree(xml_doc, xml_frag, output->root); + add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9)); + return Qnil; +} + +// Initialize the Nokogumbo class and fetch constants we will use later. +void +noko_init_gumbo() +{ + // Class constants. + cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtmlDocument); + rb_gc_register_mark_object(cNokogiriHtml5Document); + + // Interned symbols. + internal_subset = rb_intern_const("internal_subset"); + parent = rb_intern_const("parent"); + + // Define Nokogumbo module with parse and fragment methods. + rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5); + rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6); +} + +// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: diff --git a/ext/nokogiri/nokogiri.c b/ext/nokogiri/nokogiri.c index fb255f03e3..4db653981c 100644 --- a/ext/nokogiri/nokogiri.c +++ b/ext/nokogiri/nokogiri.c @@ -1,8 +1,10 @@ #include VALUE mNokogiri ; +VALUE mNokogiriGumbo ; VALUE mNokogiriHtml ; VALUE mNokogiriHtmlSax ; +VALUE mNokogiriHtml5 ; VALUE mNokogiriXml ; VALUE mNokogiriXmlSax ; VALUE mNokogiriXmlXpath ; @@ -44,6 +46,7 @@ void noko_init_html_element_description(); void noko_init_html_entity_lookup(); void noko_init_html_sax_parser_context(); void noko_init_html_sax_push_parser(); +void noko_init_gumbo(); void noko_init_test_global_handlers(); static ID id_read, id_write; @@ -152,12 +155,14 @@ void Init_nokogiri() { mNokogiri = rb_define_module("Nokogiri"); - mNokogiriXml = rb_define_module_under(mNokogiri, "XML"); + mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo"); mNokogiriHtml = rb_define_module_under(mNokogiri, "HTML"); - mNokogiriXslt = rb_define_module_under(mNokogiri, "XSLT"); - mNokogiriXmlXpath = rb_define_module_under(mNokogiriXml, "XPath"); - mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX"); mNokogiriHtmlSax = 
rb_define_module_under(mNokogiriHtml, "SAX"); + mNokogiriHtml5 = rb_define_module_under(mNokogiri, "HTML5"); + mNokogiriXml = rb_define_module_under(mNokogiri, "XML"); + mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX"); + mNokogiriXmlXpath = rb_define_module_under(mNokogiriXml, "XPath"); + mNokogiriXslt = rb_define_module_under(mNokogiri, "XSLT"); rb_const_set(mNokogiri, rb_intern("LIBXML_COMPILED_VERSION"), NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION)); rb_const_set(mNokogiri, rb_intern("LIBXML_LOADED_VERSION"), NOKOGIRI_STR_NEW2(xmlParserVersion)); @@ -238,6 +243,7 @@ Init_nokogiri() noko_init_xml_document_fragment(); noko_init_xml_document(); noko_init_html_document(); + noko_init_gumbo(); noko_init_test_global_handlers(); diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index 8b3cc9e845..bb93a1b053 100644 --- a/ext/nokogiri/nokogiri.h +++ b/ext/nokogiri/nokogiri.h @@ -69,6 +69,7 @@ xmlNodePtr xmlLastElementChild(xmlNodePtr parent); #include #include #include +#include #define NOKOGIRI_STR_NEW2(str) NOKOGIRI_STR_NEW(str, strlen((const char *)(str))) #define NOKOGIRI_STR_NEW(str, len) rb_external_str_new_with_enc((const char *)(str), (long)(len), rb_utf8_encoding()) @@ -92,10 +93,13 @@ xmlNodePtr xmlLastElementChild(xmlNodePtr parent); NOKOPUBVAR VALUE mNokogiri ; +NOKOPUBVAR VALUE mNokogiriGumbo ; NOKOPUBVAR VALUE mNokogiriHtml ; NOKOPUBVAR VALUE mNokogiriHtmlSax ; +NOKOPUBVAR VALUE mNokogiriHtml5 ; NOKOPUBVAR VALUE mNokogiriXml ; NOKOPUBVAR VALUE mNokogiriXmlSax ; +NOKOPUBVAR VALUE mNokogiriXmlXpath ; NOKOPUBVAR VALUE mNokogiriXslt ; NOKOPUBVAR VALUE cNokogiriSyntaxError; @@ -129,6 +133,7 @@ NOKOPUBVAR VALUE cNokogiriXmlXpathSyntaxError; NOKOPUBVAR VALUE cNokogiriXsltStylesheet ; NOKOPUBVAR VALUE cNokogiriHtmlDocument ; +NOKOPUBVAR VALUE cNokogiriHtml5Document ; NOKOPUBVAR VALUE cNokogiriHtmlSaxPushParser ; NOKOPUBVAR VALUE cNokogiriHtmlElementDescription ; NOKOPUBVAR VALUE cNokogiriHtmlSaxParserContext; @@ -177,7 +182,8 @@ VALUE 
noko_xml_node_set_wrap(xmlNodeSetPtr node_set, VALUE document) ; VALUE noko_xml_document_wrap_with_init_args(VALUE klass, xmlDocPtr doc, int argc, VALUE *argv); VALUE noko_xml_document_wrap(VALUE klass, xmlDocPtr doc); -NOKOPUBFUN VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc); /* deprecated. use noko_xml_document_wrap() instead. */ +NOKOPUBFUN VALUE Nokogiri_wrap_xml_document(VALUE klass, + xmlDocPtr doc); /* deprecated. use noko_xml_document_wrap() instead. */ #define DOC_RUBY_OBJECT_TEST(x) ((nokogiriTuplePtr)(x->_private)) #define DOC_RUBY_OBJECT(x) (((nokogiriTuplePtr)(x->_private))->doc) diff --git a/gumbo-parser/.gitignore b/gumbo-parser/.gitignore new file mode 100644 index 0000000000..13b0a3e962 --- /dev/null +++ b/gumbo-parser/.gitignore @@ -0,0 +1,2 @@ +build +src/*.o diff --git a/nokogumbo-import/gumbo-parser/CHANGES.md b/gumbo-parser/CHANGES.md similarity index 100% rename from nokogumbo-import/gumbo-parser/CHANGES.md rename to gumbo-parser/CHANGES.md diff --git a/nokogumbo-import/gumbo-parser/Makefile b/gumbo-parser/Makefile similarity index 100% rename from nokogumbo-import/gumbo-parser/Makefile rename to gumbo-parser/Makefile diff --git a/nokogumbo-import/gumbo-parser/THANKS b/gumbo-parser/THANKS similarity index 100% rename from nokogumbo-import/gumbo-parser/THANKS rename to gumbo-parser/THANKS diff --git a/gumbo-parser/src/Makefile b/gumbo-parser/src/Makefile new file mode 100644 index 0000000000..3a50bc96fa --- /dev/null +++ b/gumbo-parser/src/Makefile @@ -0,0 +1,17 @@ +# this Makefile is used by ext/nokogiri/extconf.rb +# to enable a mini_portile2 recipe to build the gumbo parser +.PHONY: clean + +override CFLAGS += -std=c99 -Wall + +# allow the ENV var to override this +RANLIB ?= ranlib + +gumbo_objs := $(patsubst %.c,%.o,$(wildcard *.c)) + +libgumbo.a: $(gumbo_objs) + $(AR) $(ARFLAGS) $@ $^ + - ($(RANLIB) $@ || true) >/dev/null 2>&1 + +clean: + rm -f $(gumbo_objs) libgumbo.a diff --git 
a/nokogumbo-import/gumbo-parser/src/README.md b/gumbo-parser/src/README.md similarity index 100% rename from nokogumbo-import/gumbo-parser/src/README.md rename to gumbo-parser/src/README.md diff --git a/nokogumbo-import/gumbo-parser/src/ascii.c b/gumbo-parser/src/ascii.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/ascii.c rename to gumbo-parser/src/ascii.c diff --git a/nokogumbo-import/gumbo-parser/src/ascii.h b/gumbo-parser/src/ascii.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/ascii.h rename to gumbo-parser/src/ascii.h diff --git a/nokogumbo-import/gumbo-parser/src/attribute.c b/gumbo-parser/src/attribute.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/attribute.c rename to gumbo-parser/src/attribute.c diff --git a/nokogumbo-import/gumbo-parser/src/attribute.h b/gumbo-parser/src/attribute.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/attribute.h rename to gumbo-parser/src/attribute.h diff --git a/nokogumbo-import/gumbo-parser/src/char_ref.c b/gumbo-parser/src/char_ref.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/char_ref.c rename to gumbo-parser/src/char_ref.c diff --git a/nokogumbo-import/gumbo-parser/src/char_ref.h b/gumbo-parser/src/char_ref.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/char_ref.h rename to gumbo-parser/src/char_ref.h diff --git a/nokogumbo-import/gumbo-parser/src/char_ref.rl b/gumbo-parser/src/char_ref.rl similarity index 100% rename from nokogumbo-import/gumbo-parser/src/char_ref.rl rename to gumbo-parser/src/char_ref.rl diff --git a/nokogumbo-import/gumbo-parser/src/error.c b/gumbo-parser/src/error.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/error.c rename to gumbo-parser/src/error.c diff --git a/nokogumbo-import/gumbo-parser/src/error.h b/gumbo-parser/src/error.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/error.h rename to gumbo-parser/src/error.h 
diff --git a/nokogumbo-import/gumbo-parser/src/foreign_attrs.c b/gumbo-parser/src/foreign_attrs.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/foreign_attrs.c rename to gumbo-parser/src/foreign_attrs.c diff --git a/nokogumbo-import/gumbo-parser/src/foreign_attrs.gperf b/gumbo-parser/src/foreign_attrs.gperf similarity index 100% rename from nokogumbo-import/gumbo-parser/src/foreign_attrs.gperf rename to gumbo-parser/src/foreign_attrs.gperf diff --git a/nokogumbo-import/gumbo-parser/src/gumbo.h b/gumbo-parser/src/gumbo.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/gumbo.h rename to gumbo-parser/src/gumbo.h diff --git a/nokogumbo-import/gumbo-parser/src/insertion_mode.h b/gumbo-parser/src/insertion_mode.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/insertion_mode.h rename to gumbo-parser/src/insertion_mode.h diff --git a/nokogumbo-import/gumbo-parser/src/macros.h b/gumbo-parser/src/macros.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/macros.h rename to gumbo-parser/src/macros.h diff --git a/nokogumbo-import/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/parser.c rename to gumbo-parser/src/parser.c diff --git a/nokogumbo-import/gumbo-parser/src/parser.h b/gumbo-parser/src/parser.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/parser.h rename to gumbo-parser/src/parser.h diff --git a/nokogumbo-import/gumbo-parser/src/replacement.h b/gumbo-parser/src/replacement.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/replacement.h rename to gumbo-parser/src/replacement.h diff --git a/nokogumbo-import/gumbo-parser/src/string_buffer.c b/gumbo-parser/src/string_buffer.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/string_buffer.c rename to gumbo-parser/src/string_buffer.c diff --git a/nokogumbo-import/gumbo-parser/src/string_buffer.h 
b/gumbo-parser/src/string_buffer.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/string_buffer.h rename to gumbo-parser/src/string_buffer.h diff --git a/nokogumbo-import/gumbo-parser/src/string_piece.c b/gumbo-parser/src/string_piece.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/string_piece.c rename to gumbo-parser/src/string_piece.c diff --git a/nokogumbo-import/gumbo-parser/src/svg_attrs.c b/gumbo-parser/src/svg_attrs.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/svg_attrs.c rename to gumbo-parser/src/svg_attrs.c diff --git a/nokogumbo-import/gumbo-parser/src/svg_attrs.gperf b/gumbo-parser/src/svg_attrs.gperf similarity index 100% rename from nokogumbo-import/gumbo-parser/src/svg_attrs.gperf rename to gumbo-parser/src/svg_attrs.gperf diff --git a/nokogumbo-import/gumbo-parser/src/svg_tags.c b/gumbo-parser/src/svg_tags.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/svg_tags.c rename to gumbo-parser/src/svg_tags.c diff --git a/nokogumbo-import/gumbo-parser/src/svg_tags.gperf b/gumbo-parser/src/svg_tags.gperf similarity index 100% rename from nokogumbo-import/gumbo-parser/src/svg_tags.gperf rename to gumbo-parser/src/svg_tags.gperf diff --git a/nokogumbo-import/gumbo-parser/src/tag.c b/gumbo-parser/src/tag.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/tag.c rename to gumbo-parser/src/tag.c diff --git a/nokogumbo-import/gumbo-parser/src/tag_lookup.c b/gumbo-parser/src/tag_lookup.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/tag_lookup.c rename to gumbo-parser/src/tag_lookup.c diff --git a/nokogumbo-import/gumbo-parser/src/tag_lookup.gperf b/gumbo-parser/src/tag_lookup.gperf similarity index 100% rename from nokogumbo-import/gumbo-parser/src/tag_lookup.gperf rename to gumbo-parser/src/tag_lookup.gperf diff --git a/nokogumbo-import/gumbo-parser/src/tag_lookup.h b/gumbo-parser/src/tag_lookup.h similarity index 100% rename from 
nokogumbo-import/gumbo-parser/src/tag_lookup.h rename to gumbo-parser/src/tag_lookup.h diff --git a/nokogumbo-import/gumbo-parser/src/token_buffer.c b/gumbo-parser/src/token_buffer.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/token_buffer.c rename to gumbo-parser/src/token_buffer.c diff --git a/nokogumbo-import/gumbo-parser/src/token_buffer.h b/gumbo-parser/src/token_buffer.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/token_buffer.h rename to gumbo-parser/src/token_buffer.h diff --git a/nokogumbo-import/gumbo-parser/src/token_type.h b/gumbo-parser/src/token_type.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/token_type.h rename to gumbo-parser/src/token_type.h diff --git a/nokogumbo-import/gumbo-parser/src/tokenizer.c b/gumbo-parser/src/tokenizer.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/tokenizer.c rename to gumbo-parser/src/tokenizer.c diff --git a/nokogumbo-import/gumbo-parser/src/tokenizer.h b/gumbo-parser/src/tokenizer.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/tokenizer.h rename to gumbo-parser/src/tokenizer.h diff --git a/nokogumbo-import/gumbo-parser/src/tokenizer_states.h b/gumbo-parser/src/tokenizer_states.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/tokenizer_states.h rename to gumbo-parser/src/tokenizer_states.h diff --git a/nokogumbo-import/gumbo-parser/src/utf8.c b/gumbo-parser/src/utf8.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/utf8.c rename to gumbo-parser/src/utf8.c diff --git a/nokogumbo-import/gumbo-parser/src/utf8.h b/gumbo-parser/src/utf8.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/utf8.h rename to gumbo-parser/src/utf8.h diff --git a/nokogumbo-import/gumbo-parser/src/util.c b/gumbo-parser/src/util.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/util.c rename to gumbo-parser/src/util.c diff --git 
a/nokogumbo-import/gumbo-parser/src/util.h b/gumbo-parser/src/util.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/util.h rename to gumbo-parser/src/util.h diff --git a/nokogumbo-import/gumbo-parser/src/vector.c b/gumbo-parser/src/vector.c similarity index 100% rename from nokogumbo-import/gumbo-parser/src/vector.c rename to gumbo-parser/src/vector.c diff --git a/nokogumbo-import/gumbo-parser/src/vector.h b/gumbo-parser/src/vector.h similarity index 100% rename from nokogumbo-import/gumbo-parser/src/vector.h rename to gumbo-parser/src/vector.h diff --git a/nokogumbo-import/gumbo-parser/test/attribute.cc b/gumbo-parser/test/attribute.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/attribute.cc rename to gumbo-parser/test/attribute.cc diff --git a/nokogumbo-import/gumbo-parser/test/parser.cc b/gumbo-parser/test/parser.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/parser.cc rename to gumbo-parser/test/parser.cc diff --git a/nokogumbo-import/gumbo-parser/test/string_buffer.cc b/gumbo-parser/test/string_buffer.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/string_buffer.cc rename to gumbo-parser/test/string_buffer.cc diff --git a/nokogumbo-import/gumbo-parser/test/string_piece.cc b/gumbo-parser/test/string_piece.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/string_piece.cc rename to gumbo-parser/test/string_piece.cc diff --git a/nokogumbo-import/gumbo-parser/test/test_utils.cc b/gumbo-parser/test/test_utils.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/test_utils.cc rename to gumbo-parser/test/test_utils.cc diff --git a/nokogumbo-import/gumbo-parser/test/test_utils.h b/gumbo-parser/test/test_utils.h similarity index 100% rename from nokogumbo-import/gumbo-parser/test/test_utils.h rename to gumbo-parser/test/test_utils.h diff --git a/nokogumbo-import/gumbo-parser/test/token_buffer.cc b/gumbo-parser/test/token_buffer.cc 
similarity index 100% rename from nokogumbo-import/gumbo-parser/test/token_buffer.cc rename to gumbo-parser/test/token_buffer.cc diff --git a/nokogumbo-import/gumbo-parser/test/tokenizer.cc b/gumbo-parser/test/tokenizer.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/tokenizer.cc rename to gumbo-parser/test/tokenizer.cc diff --git a/nokogumbo-import/gumbo-parser/test/utf8.cc b/gumbo-parser/test/utf8.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/utf8.cc rename to gumbo-parser/test/utf8.cc diff --git a/nokogumbo-import/gumbo-parser/test/vector.cc b/gumbo-parser/test/vector.cc similarity index 100% rename from nokogumbo-import/gumbo-parser/test/vector.cc rename to gumbo-parser/test/vector.cc diff --git a/lib/nokogiri.rb b/lib/nokogiri.rb index bf5663dad4..5921c75995 100644 --- a/lib/nokogiri.rb +++ b/lib/nokogiri.rb @@ -19,6 +19,8 @@ require 'nokogiri/css' require 'nokogiri/html/builder' +require 'nokogiri/html5' if Nokogiri.uses_gumbo? + # Nokogiri parses and searches XML/HTML very quickly, and also has # correctly implemented CSS3 selector support as well as XPath 1.0 # support. diff --git a/lib/nokogiri/gumbo.rb b/lib/nokogiri/gumbo.rb new file mode 100644 index 0000000000..d6f7403ec3 --- /dev/null +++ b/lib/nokogiri/gumbo.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true +module Nokogiri + module Gumbo + # The default maximum number of attributes per element. + DEFAULT_MAX_ATTRIBUTES = 400 + + # The default maximum number of errors for parsing a document or a fragment. + DEFAULT_MAX_ERRORS = 0 + + # The default maximum depth of the DOM tree produced by parsing a document + # or fragment. 
+ DEFAULT_MAX_TREE_DEPTH = 400 + end +end diff --git a/nokogumbo-import/lib/nokogumbo/html5.rb b/lib/nokogiri/html5.rb similarity index 98% rename from nokogumbo-import/lib/nokogumbo/html5.rb rename to lib/nokogiri/html5.rb index 4cdd474068..306a29175e 100644 --- a/nokogumbo-import/lib/nokogumbo/html5.rb +++ b/lib/nokogiri/html5.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true # # Copyright 2013-2021 Sam Ruby, Stephen Checkoway # @@ -14,9 +15,9 @@ # limitations under the License. # -require 'nokogumbo/html5/document' -require 'nokogumbo/html5/document_fragment' -require 'nokogumbo/html5/node' +require_relative 'html5/document' +require_relative 'html5/document_fragment' +require_relative 'html5/node' module Nokogiri # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse @@ -266,3 +267,5 @@ def self.prepend_newline?(node) end end end + +require_relative 'gumbo' diff --git a/nokogumbo-import/lib/nokogumbo/html5/document.rb b/lib/nokogiri/html5/document.rb similarity index 87% rename from nokogumbo-import/lib/nokogumbo/html5/document.rb rename to lib/nokogiri/html5/document.rb index cc3ee25388..4fa49f2192 100644 --- a/nokogumbo-import/lib/nokogumbo/html5/document.rb +++ b/lib/nokogiri/html5/document.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true # # Copyright 2013-2021 Sam Ruby, Stephen Checkoway # @@ -57,10 +58,10 @@ def to_xml(options = {}, &block) private def self.do_parse(string_or_io, url, encoding, options) string = HTML5.read_and_encode(string_or_io, encoding) - max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES - max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS - max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH - doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth) + max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES + max_errors = options[:max_errors] || 
options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS + max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH + doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth) doc.encoding = 'UTF-8' doc end diff --git a/nokogumbo-import/lib/nokogumbo/html5/document_fragment.rb b/lib/nokogiri/html5/document_fragment.rb similarity index 83% rename from nokogumbo-import/lib/nokogumbo/html5/document_fragment.rb rename to lib/nokogiri/html5/document_fragment.rb index 45afc36791..574b16e3f7 100644 --- a/nokogumbo-import/lib/nokogumbo/html5/document_fragment.rb +++ b/lib/nokogiri/html5/document_fragment.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true # # Copyright 2013-2021 Sam Ruby, Stephen Checkoway # @@ -14,7 +15,7 @@ # limitations under the License. # -require 'nokogiri' +require 'nokogiri/html/document_fragment' module Nokogiri module HTML5 @@ -28,11 +29,11 @@ def initialize(doc, tags = nil, ctx = nil, options = {}) self.errors = [] return self unless tags - max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES - max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS - max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH + max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES + max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS + max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH tags = Nokogiri::HTML5.read_and_encode(tags, nil) - Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth) + Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth) end def serialize(options = {}, &block) diff --git a/nokogumbo-import/lib/nokogumbo/html5/node.rb b/lib/nokogiri/html5/node.rb similarity index 97% rename from nokogumbo-import/lib/nokogumbo/html5/node.rb rename to lib/nokogiri/html5/node.rb index 34af272651..d1f76d68e7 100644 --- 
a/nokogumbo-import/lib/nokogumbo/html5/node.rb +++ b/lib/nokogiri/html5/node.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: true # # Copyright 2013-2021 Sam Ruby, Stephen Checkoway # @@ -14,7 +15,7 @@ # limitations under the License. # -require 'nokogiri' +require 'nokogiri/xml/node' module Nokogiri module HTML5 @@ -40,7 +41,7 @@ def add_child_node_and_reparent_attrs(node) def inner_html(options = {}) return super(options) unless document.is_a?(HTML5::Document) - result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : "" + result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? String.new("\n") : String.new result << children.map { |child| child.to_html(options) }.join result end diff --git a/lib/nokogiri/version/info.rb b/lib/nokogiri/version/info.rb index 1bfea42908..8011525667 100644 --- a/lib/nokogiri/version/info.rb +++ b/lib/nokogiri/version/info.rb @@ -190,6 +190,10 @@ def self.uses_libxml?(requirement = nil) # :nodoc: Gem::Requirement.new(requirement).satisfied_by?(VersionInfo.instance.loaded_libxml_version) end + def self.uses_gumbo? + uses_libxml? # TODO: replace with Gumbo functionality + end + def self.jruby? # :nodoc: VersionInfo.instance.jruby? 
end diff --git a/nokogiri.gemspec b/nokogiri.gemspec index 7a414ab1be..7e00470e20 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -181,6 +181,51 @@ Gem::Specification.new do |spec| "ext/nokogiri/xml_text.c", "ext/nokogiri/xml_xpath_context.c", "ext/nokogiri/xslt_stylesheet.c", + "gumbo-parser/CHANGES.md", + "gumbo-parser/Makefile", + "gumbo-parser/THANKS", + "gumbo-parser/src/Makefile", + "gumbo-parser/src/README.md", + "gumbo-parser/src/ascii.c", + "gumbo-parser/src/ascii.h", + "gumbo-parser/src/attribute.c", + "gumbo-parser/src/attribute.h", + "gumbo-parser/src/char_ref.c", + "gumbo-parser/src/char_ref.h", + "gumbo-parser/src/char_ref.rl", + "gumbo-parser/src/error.c", + "gumbo-parser/src/error.h", + "gumbo-parser/src/foreign_attrs.c", + "gumbo-parser/src/foreign_attrs.gperf", + "gumbo-parser/src/gumbo.h", + "gumbo-parser/src/insertion_mode.h", + "gumbo-parser/src/macros.h", + "gumbo-parser/src/parser.c", + "gumbo-parser/src/parser.h", + "gumbo-parser/src/replacement.h", + "gumbo-parser/src/string_buffer.c", + "gumbo-parser/src/string_buffer.h", + "gumbo-parser/src/string_piece.c", + "gumbo-parser/src/svg_attrs.c", + "gumbo-parser/src/svg_attrs.gperf", + "gumbo-parser/src/svg_tags.c", + "gumbo-parser/src/svg_tags.gperf", + "gumbo-parser/src/tag.c", + "gumbo-parser/src/tag_lookup.c", + "gumbo-parser/src/tag_lookup.gperf", + "gumbo-parser/src/tag_lookup.h", + "gumbo-parser/src/token_buffer.c", + "gumbo-parser/src/token_buffer.h", + "gumbo-parser/src/token_type.h", + "gumbo-parser/src/tokenizer.c", + "gumbo-parser/src/tokenizer.h", + "gumbo-parser/src/tokenizer_states.h", + "gumbo-parser/src/utf8.c", + "gumbo-parser/src/utf8.h", + "gumbo-parser/src/util.c", + "gumbo-parser/src/util.h", + "gumbo-parser/src/vector.c", + "gumbo-parser/src/vector.h", "lib/isorelax.jar", "lib/jing.jar", "lib/nekodtd.jar", @@ -197,6 +242,7 @@ Gem::Specification.new do |spec| "lib/nokogiri/css/xpath_visitor.rb", "lib/nokogiri/decorators/slop.rb", "lib/nokogiri/extension.rb", + 
"lib/nokogiri/gumbo.rb", "lib/nokogiri/html.rb", "lib/nokogiri/html/builder.rb", "lib/nokogiri/html/document.rb", @@ -207,6 +253,10 @@ Gem::Specification.new do |spec| "lib/nokogiri/html/sax/parser.rb", "lib/nokogiri/html/sax/parser_context.rb", "lib/nokogiri/html/sax/push_parser.rb", + "lib/nokogiri/html5.rb", + "lib/nokogiri/html5/document.rb", + "lib/nokogiri/html5/document_fragment.rb", + "lib/nokogiri/html5/node.rb", "lib/nokogiri/jruby/dependencies.rb", "lib/nokogiri/syntax_error.rb", "lib/nokogiri/version.rb", @@ -266,7 +316,7 @@ Gem::Specification.new do |spec| spec.rdoc_options = ["--main", "README.md"] spec.add_runtime_dependency("racc", "~> 1.4") - spec.add_runtime_dependency("mini_portile2", "~> 2.5.0") unless java_p # keep version in sync with extconf.rb + spec.add_runtime_dependency("mini_portile2", "~> 2.5.1") unless java_p # keep version in sync with extconf.rb spec.add_development_dependency("bundler", "~> 2.2") spec.add_development_dependency("concourse", "~> 0.41") diff --git a/nokogumbo-import/Rakefile b/nokogumbo-import/Rakefile index 94200bedbe..c04ca8de0c 100644 --- a/nokogumbo-import/Rakefile +++ b/nokogumbo-import/Rakefile @@ -12,10 +12,10 @@ task default: :test task test: :compile task gem: :test -ext = Rake::ExtensionTask.new 'nokogumbo' do |e| - e.lib_dir = 'lib/nokogumbo' - e.source_pattern = '{,../../gumbo-parser/src/}*.[hc]' -end +# ext = Rake::ExtensionTask.new 'nokogumbo' do |e| +# e.lib_dir = 'lib/nokogumbo' +# e.source_pattern = '{,../../gumbo-parser/src/}*.[hc]' +# end Rake::TestTask.new(:test) do |t| t.libs << 'test' diff --git a/nokogumbo-import/ext/nokogumbo/extconf.rb b/nokogumbo-import/ext/nokogumbo/extconf.rb deleted file mode 100644 index d0a81d0dfd..0000000000 --- a/nokogumbo-import/ext/nokogumbo/extconf.rb +++ /dev/null @@ -1,160 +0,0 @@ -# -# Copyright 2013-2021 Sam Ruby, Stephen Checkoway, Mike Dalessio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in 
compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -require 'rubygems' -require 'fileutils' -require 'mkmf' -require 'nokogiri' - -$CFLAGS += " -std=c99" -$LDFLAGS.gsub!('-Wl,--no-undefined', '') -$DLDFLAGS.gsub!('-Wl,--no-undefined', '') -$warnflags = CONFIG['warnflags'] = '-Wall' - -NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}") - -def download_headers - begin - require 'yaml' - - dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml')) - version = dependencies['libxml2']['version'] - host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"] - path = File.join('ports', host, 'libxml2', version, 'include/libxml2') - return path if File.directory?(path) - - # Make sure we're using the same version Nokogiri uses - dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime } - return nil if dep_index.nil? - requirement = NG_SPEC.dependencies[dep_index].requirement.to_s - - gem 'mini_portile2', requirement - require 'mini_portile2' - p = MiniPortile::new('libxml2', version).tap do |r| - r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"] - r.files = [{ - url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz", - sha256: dependencies['libxml2']['sha256'] - }] - r.configure_options += [ - "--without-python", - "--without-readline", - "--with-c14n", - "--with-debug", - "--with-threads" - ] - end - p.download unless p.downloaded? - p.extract - p.configure unless p.configured? 
- system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS') - path - rescue - puts 'failed to download/install headers' - nil - end -end - -required = arg_config('--with-libxml2') -prohibited = arg_config('--without-libxml2') -if required and prohibited - abort "cannot use both --with-libxml2 and --without-libxml2" -end - -have_libxml2 = false -have_ng = false - -def windows? - ::RUBY_PLATFORM =~ /mingw|mswin/ -end - -def modern_nokogiri? - nokogiri_version = Gem::Version.new(Nokogiri::VERSION) - requirement = windows? ? ">= 1.11.2" : ">= 1.11.0.rc4" - Gem::Requirement.new(requirement).satisfied_by?(nokogiri_version) -end - -if !prohibited - if modern_nokogiri? - append_cflags(Nokogiri::VERSION_INFO["nokogiri"]["cppflags"]) - append_ldflags(Nokogiri::VERSION_INFO["nokogiri"]["ldflags"]) # may be nil for nokogiri pre-1.11.2 - have_libxml2 = if Nokogiri::VERSION_INFO["nokogiri"]["ldflags"].empty? - have_header('libxml/tree.h') - else - have_func("xmlNewDoc", "libxml/tree.h") - end - end - - if !have_libxml2 - if Nokogiri::VERSION_INFO.include?('libxml') and - Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged' - # Nokogiri has libxml2 built in. Find the headers. - libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'], - 'include/libxml2') - if find_header('libxml/tree.h', libxml2_path) - have_libxml2 = true - else - # Unfortunately, some versions of Nokogiri delete these files. - # https://github.com/sparklemotion/nokogiri/pull/1788 - # Try to download them - libxml2_path = download_headers - unless libxml2_path.nil? - have_libxml2 = find_header('libxml/tree.h', libxml2_path) - end - end - else - # Nokogiri is compiled with system headers. 
- # Hack to work around broken mkmf on macOS - # (https://bugs.ruby-lang.org/issues/14992 fixed now) - if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH' - RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH' - end - - pkg_config('libxml-2.0') - have_libxml2 = have_library('xml2', 'xmlNewDoc') - end - end - - if required and !have_libxml2 - abort "libxml2 required but could not be located" - end - - - if have_libxml2 - have_ng = have_header('nokogiri.h') || find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri')) - end -end - -if have_libxml2 and have_ng - $CFLAGS += " -DNGLIB=1" -end - -# Symlink gumbo-parser source files. -ext_dir = File.dirname(__FILE__) - -Dir.chdir(ext_dir) do - $srcs = Dir['*.c', '../../gumbo-parser/src/*.c'] - $hdrs = Dir['*.h', '../../gumbo-parser/src/*.h'] -end -$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src' -$VPATH << '$(srcdir)/../../gumbo-parser/src' - -create_makefile('nokogumbo/nokogumbo') do |conf| - conf.map! do |chunk| - chunk.gsub(/^HDRS = .*$/, "HDRS = #{$hdrs.map { |h| File.join('$(srcdir)', h)}.join(' ')}") - end -end -# vim: set sw=2 sts=2 ts=8 et: diff --git a/nokogumbo-import/ext/nokogumbo/nokogumbo.c b/nokogumbo-import/ext/nokogumbo/nokogumbo.c deleted file mode 100644 index 1ee7d70178..0000000000 --- a/nokogumbo-import/ext/nokogumbo/nokogumbo.c +++ /dev/null @@ -1,809 +0,0 @@ -// -// Copyright 2013-2021 Sam Ruby, Stephen Checkoway -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// - -// -// nokogumbo.c defines the following: -// -// class Nokogumbo -// def parse(utf8_string) # returns Nokogiri::HTML5::Document -// end -// -// Processing starts by calling gumbo_parse_with_options. The resulting -// document tree is then walked: -// -// * if Nokogiri and libxml2 headers are available at compile time, -// (if NGLIB) then a parallel libxml2 tree is constructed, and the -// final document is then wrapped using Nokogiri_wrap_xml_document. -// This approach reduces memory and CPU requirements as Ruby objects -// are only built when necessary. -// -// * if the necessary headers are not available at compile time, Nokogiri -// methods are called instead, producing the equivalent functionality. -// - -#include -#include -#include - -#include "gumbo.h" - -// class constants -static VALUE Document; - -// Interned symbols -static ID internal_subset; -static ID parent; - -/* Backwards compatibility to Ruby 2.1.0 */ -#if RUBY_API_VERSION_CODE < 20200 -#define ONIG_ESCAPE_UCHAR_COLLISION 1 -#include - -static VALUE rb_utf8_str_new(const char *str, long length) { - return rb_enc_str_new(str, length, rb_utf8_encoding()); -} - -static VALUE rb_utf8_str_new_cstr(const char *str) { - return rb_enc_str_new_cstr(str, rb_utf8_encoding()); -} - -static VALUE rb_utf8_str_new_static(const char *str, long length) { - return rb_enc_str_new(str, length, rb_utf8_encoding()); -} -#endif - -#if NGLIB -#include -#include -#include - -#define NIL NULL -#else -#define NIL Qnil - -// These are defined by nokogiri.h -static VALUE cNokogiriXmlSyntaxError; -static VALUE cNokogiriXmlElement; -static VALUE cNokogiriXmlText; -static VALUE cNokogiriXmlCData; -static VALUE cNokogiriXmlComment; - -// Interned symbols. -static ID new; -static ID node_name_; - -// Map libxml2 types to Ruby VALUE. 
-typedef VALUE xmlNodePtr; -typedef VALUE xmlDocPtr; -typedef VALUE xmlNsPtr; -typedef VALUE xmlDtdPtr; -typedef char xmlChar; -#define BAD_CAST - -// Redefine libxml2 API as Ruby function calls. -static xmlNodePtr xmlNewDocNode(xmlDocPtr doc, xmlNsPtr ns, const xmlChar *name, const xmlChar *content) { - assert(ns == NIL && content == NULL); - return rb_funcall(cNokogiriXmlElement, new, 2, rb_utf8_str_new_cstr(name), doc); -} - -static xmlNodePtr xmlNewDocText(xmlDocPtr doc, const xmlChar *content) { - VALUE str = rb_utf8_str_new_cstr(content); - return rb_funcall(cNokogiriXmlText, new, 2, str, doc); -} - -static xmlNodePtr xmlNewCDataBlock(xmlDocPtr doc, const xmlChar *content, int len) { - VALUE str = rb_utf8_str_new(content, len); - // CDATA.new takes arguments in the opposite order from Text.new. - return rb_funcall(cNokogiriXmlCData, new, 2, doc, str); -} - -static xmlNodePtr xmlNewDocComment(xmlDocPtr doc, const xmlChar *content) { - VALUE str = rb_utf8_str_new_cstr(content); - return rb_funcall(cNokogiriXmlComment, new, 2, doc, str); -} - -static xmlNodePtr xmlAddChild(xmlNodePtr parent, xmlNodePtr cur) { - ID add_child; - CONST_ID(add_child, "add_child"); - return rb_funcall(parent, add_child, 1, cur); -} - -static void xmlSetNs(xmlNodePtr node, xmlNsPtr ns) { - ID namespace_; - CONST_ID(namespace_, "namespace="); - rb_funcall(node, namespace_, 1, ns); -} - -static void xmlFreeDoc(xmlDocPtr doc) { } - -static VALUE Nokogiri_wrap_xml_document(VALUE klass, xmlDocPtr doc) { - return doc; -} - -static VALUE find_dummy_key(VALUE collection) { - VALUE r_dummy = Qnil; - char dummy[5] = "a"; - size_t len = 1; - ID key_; - CONST_ID(key_, "key?"); - while (len < sizeof dummy) { - r_dummy = rb_utf8_str_new(dummy, len); - if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse) - return r_dummy; - for (size_t i = 0; ; ++i) { - if (dummy[i] == 0) { - dummy[i] = 'a'; - ++len; - break; - } - if (dummy[i] == 'z') - dummy[i] = 'a'; - else { - ++dummy[i]; - break; - } - } - } 
- // This collection has 475254 elements?? Give up. - rb_raise(rb_eArgError, "Failed to find a dummy key."); -} - -// This should return an xmlAttrPtr, but we don't need it and it's easier to -// not get the result. -static void xmlNewNsProp ( - xmlNodePtr node, - xmlNsPtr ns, - const xmlChar *name, - const xmlChar *value -) { - ID set_attribute; - CONST_ID(set_attribute, "set_attribute"); - - VALUE rvalue = rb_utf8_str_new_cstr(value); - - if (RTEST(ns)) { - // This is an easy case, we have a namespace so it's enough to do - // node["#{ns.prefix}:#{name}"] = value - ID prefix; - CONST_ID(prefix, "prefix"); - VALUE ns_prefix = rb_funcall(ns, prefix, 0); - VALUE qname = rb_sprintf("%" PRIsVALUE ":%s", ns_prefix, name); - rb_funcall(node, set_attribute, 2, qname, rvalue); - return; - } - - size_t len = strlen(name); - VALUE rname = rb_utf8_str_new(name, len); - if (memchr(name, ':', len) == NULL) { - // This is the easiest case. There's no colon so we can do - // node[name] = value. - rb_funcall(node, set_attribute, 2, rname, rvalue); - return; - } - - // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value) - // which behaves roughly as - // if name is a QName prefix:local - // if node->doc has a namespace ns corresponding to prefix - // return xmlSetNsProp(node, ns, local, value) - // return xmlSetNsProp(node, NULL, name, value) - // - // If the prefix is "xml", then the namespace lookup will create it. - // - // By contrast, xmlNewNsProp does not do this parsing and creates an attribute - // with the name and value exactly as given. This is the behavior that we - // want. - // - // Thus, for attribute names like "xml:lang", #set_attribute will create an - // attribute with namespace "xml" and name "lang". This is incorrect for - // html elements (but correct for foreign elements). - // - // Work around this by inserting a dummy attribute and then changing the - // name, if needed. - - // Find a dummy attribute string that doesn't already exist. 
- VALUE dummy = find_dummy_key(node); - // Add the dummy attribute. - rb_funcall(node, set_attribute, 2, dummy, rvalue); - - // Remove the old attribute, if it exists. - ID remove_attribute; - CONST_ID(remove_attribute, "remove_attribute"); - rb_funcall(node, remove_attribute, 1, rname); - - // Rename the dummy - ID attribute; - CONST_ID(attribute, "attribute"); - VALUE attr = rb_funcall(node, attribute, 1, dummy); - rb_funcall(attr, node_name_, 1, rname); -} -#endif - -// URI = system id -// external id = public id -static xmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public) -{ -#if NGLIB - // These two libxml2 functions take the public and system ids in - // opposite orders. - htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL); - assert(doc); - if (dtd_name) - xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system); - return doc; -#else - // remove internal subset from newly created documents - VALUE doc; - // If system and public are both NULL, Document#new is going to set default - // values for them so we're going to have to remove the internal subset - // which seems to leak memory in Nokogiri, so leak as little as possible. - if (system == NULL && public == NULL) { - ID remove; - CONST_ID(remove, "remove"); - doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_utf8_str_new_static("", 0)); - rb_funcall(rb_funcall(doc, internal_subset, 0), remove, 0); - if (dtd_name) { - // We need to create an internal subset now. - ID create_internal_subset; - CONST_ID(create_internal_subset, "create_internal_subset"); - rb_funcall(doc, create_internal_subset, 3, rb_utf8_str_new_cstr(dtd_name), Qnil, Qnil); - } - } else { - assert(dtd_name); - // Rather than removing and creating the internal subset as we did above, - // just create and then rename one. - VALUE r_system = system ? rb_utf8_str_new_cstr(system) : Qnil; - VALUE r_public = public ? 
rb_utf8_str_new_cstr(public) : Qnil; - doc = rb_funcall(Document, new, 2, r_system, r_public); - rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_utf8_str_new_cstr(dtd_name)); - } - return doc; -#endif -} - -static xmlNodePtr get_parent(xmlNodePtr node) { -#if NGLIB - return node->parent; -#else - if (!rb_respond_to(node, parent)) - return Qnil; - return rb_funcall(node, parent, 0); -#endif -} - -static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) { - assert(RTEST(input)); - Check_Type(input, T_STRING); - GumboOutput *output = gumbo_parse_with_options ( - options, - RSTRING_PTR(input), - RSTRING_LEN(input) - ); - - const char *status_string = gumbo_status_to_string(output->status); - switch (output->status) { - case GUMBO_STATUS_OK: - break; - case GUMBO_STATUS_TOO_MANY_ATTRIBUTES: - case GUMBO_STATUS_TREE_TOO_DEEP: - gumbo_destroy_output(output); - rb_raise(rb_eArgError, "%s", status_string); - case GUMBO_STATUS_OUT_OF_MEMORY: - gumbo_destroy_output(output); - rb_raise(rb_eNoMemError, "%s", status_string); - } - return output; -} - -static xmlNsPtr lookup_or_add_ns ( - xmlDocPtr doc, - xmlNodePtr root, - const char *href, - const char *prefix -) { -#if NGLIB - xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix); - if (ns) - return ns; - return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix); -#else - ID add_namespace_definition; - CONST_ID(add_namespace_definition, "add_namespace_definition"); - VALUE rprefix = rb_utf8_str_new_cstr(prefix); - VALUE rhref = rb_utf8_str_new_cstr(href); - return rb_funcall(root, add_namespace_definition, 2, rprefix, rhref); -#endif -} - -static void set_line(xmlNodePtr node, size_t line) { -#if NGLIB - // libxml2 uses 65535 to mean look elsewhere for the line number on some - // nodes. - if (line < 65535) - node->line = (unsigned short)line; -#else - // XXX: If Nokogiri gets a `#line=` method, we'll use that. 
-#endif -} - -// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted -// at gumbo_node. -static void build_tree ( - xmlDocPtr doc, - xmlNodePtr xml_output_node, - const GumboNode *gumbo_node -) { - xmlNodePtr xml_root = NIL; - xmlNodePtr xml_node = xml_output_node; - size_t child_index = 0; - - while (true) { - assert(gumbo_node != NULL); - const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT? - &gumbo_node->v.document.children : &gumbo_node->v.element.children; - if (child_index >= children->length) { - // Move up the tree and to the next child. - if (xml_node == xml_output_node) { - // We've built as much of the tree as we can. - return; - } - child_index = gumbo_node->index_within_parent + 1; - gumbo_node = gumbo_node->parent; - xml_node = get_parent(xml_node); - // Children of fragments don't share the same root, so reset it and - // it'll be set below. In the non-fragment case, this will only happen - // after the html element has been finished at which point there are no - // further elements. - if (xml_node == xml_output_node) - xml_root = NIL; - continue; - } - const GumboNode *gumbo_child = children->data[child_index++]; - xmlNodePtr xml_child; - - switch (gumbo_child->type) { - case GUMBO_NODE_DOCUMENT: - abort(); // Bug in Gumbo. 
- - case GUMBO_NODE_TEXT: - case GUMBO_NODE_WHITESPACE: - xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text); - set_line(xml_child, gumbo_child->v.text.start_pos.line); - xmlAddChild(xml_node, xml_child); - break; - - case GUMBO_NODE_CDATA: - xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text, - (int) strlen(gumbo_child->v.text.text)); - set_line(xml_child, gumbo_child->v.text.start_pos.line); - xmlAddChild(xml_node, xml_child); - break; - - case GUMBO_NODE_COMMENT: - xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text); - set_line(xml_child, gumbo_child->v.text.start_pos.line); - xmlAddChild(xml_node, xml_child); - break; - - case GUMBO_NODE_TEMPLATE: - // XXX: Should create a template element and a new DocumentFragment - case GUMBO_NODE_ELEMENT: - { - xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL); - set_line(xml_child, gumbo_child->v.element.start_pos.line); - if (xml_root == NIL) - xml_root = xml_child; - xmlNsPtr ns = NIL; - switch (gumbo_child->v.element.tag_namespace) { - case GUMBO_NAMESPACE_HTML: - break; - case GUMBO_NAMESPACE_SVG: - ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg"); - break; - case GUMBO_NAMESPACE_MATHML: - ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math"); - break; - } - if (ns != NIL) - xmlSetNs(xml_child, ns); - xmlAddChild(xml_node, xml_child); - - // Add the attributes. 
- const GumboVector* attrs = &gumbo_child->v.element.attributes; - for (size_t i=0; i < attrs->length; i++) { - const GumboAttribute *attr = attrs->data[i]; - - switch (attr->attr_namespace) { - case GUMBO_ATTR_NAMESPACE_XLINK: - ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink"); - break; - - case GUMBO_ATTR_NAMESPACE_XML: - ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml"); - break; - - case GUMBO_ATTR_NAMESPACE_XMLNS: - ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns"); - break; - - default: - ns = NIL; - } - xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value); - } - - // Add children for this element. - child_index = 0; - gumbo_node = gumbo_child; - xml_node = xml_child; - } - } - } -} - -static void add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url) { - const char *input_str = RSTRING_PTR(input); - size_t input_len = RSTRING_LEN(input); - - // Add parse errors to rdoc. - if (output->errors.length) { - const GumboVector *errors = &output->errors; - VALUE rerrors = rb_ary_new2(errors->length); - - for (size_t i=0; i < errors->length; i++) { - GumboError *err = errors->data[i]; - GumboSourcePosition position = gumbo_error_position(err); - char *msg; - size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg); - VALUE err_str = rb_utf8_str_new(msg, size); - free(msg); - VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError); - const char *error_code = gumbo_error_code(err); - VALUE str1 = error_code? 
rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil; - rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER - rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR - rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR - rb_iv_set(syntax_error, "@file", url); - rb_iv_set(syntax_error, "@line", INT2NUM(position.line)); - rb_iv_set(syntax_error, "@str1", str1); - rb_iv_set(syntax_error, "@str2", Qnil); - rb_iv_set(syntax_error, "@str3", Qnil); - rb_iv_set(syntax_error, "@int1", INT2NUM(0)); - rb_iv_set(syntax_error, "@column", INT2NUM(position.column)); - rb_ary_push(rerrors, syntax_error); - } - rb_iv_set(rdoc, "@errors", rerrors); - } -} - -typedef struct { - GumboOutput *output; - VALUE input; - VALUE url_or_frag; - xmlDocPtr doc; -} ParseArgs; - -static void parse_args_mark(void *parse_args) { - ParseArgs *args = parse_args; - rb_gc_mark_maybe(args->input); - rb_gc_mark_maybe(args->url_or_frag); -} - -// Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the -// wrapper. -static VALUE wrap_parse_args(ParseArgs *args) { - return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args); -} - -// Returnsd the underlying ParseArgs wrapped by wrap_parse_args. -static ParseArgs *unwrap_parse_args(VALUE obj) { - ParseArgs *args; - Data_Get_Struct(obj, ParseArgs, args); - return args; -} - -static VALUE parse_cleanup(VALUE parse_args) { - ParseArgs *args = unwrap_parse_args(parse_args); - gumbo_destroy_output(args->output); - // Make sure garbage collection doesn't mark the objects as being live based - // on references from the ParseArgs. This may be unnecessary. 
- args->input = Qnil; - args->url_or_frag = Qnil; - if (args->doc != NIL) - xmlFreeDoc(args->doc); - return Qnil; -} - -static VALUE parse_continue(VALUE parse_args); - -// Parse a string using gumbo_parse into a Nokogiri document -static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) { - GumboOptions options = kGumboDefaultOptions; - options.max_attributes = NUM2INT(max_attributes); - options.max_errors = NUM2INT(max_errors); - options.max_tree_depth = NUM2INT(max_depth); - - GumboOutput *output = perform_parse(&options, input); - ParseArgs args = { - .output = output, - .input = input, - .url_or_frag = url, - .doc = NIL, - }; - VALUE parse_args = wrap_parse_args(&args); - - return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args); -} - -static VALUE parse_continue(VALUE parse_args) { - ParseArgs *args = unwrap_parse_args(parse_args); - GumboOutput *output = args->output; - xmlDocPtr doc; - if (output->document->v.document.has_doctype) { - const char *name = output->document->v.document.name; - const char *public = output->document->v.document.public_identifier; - const char *system = output->document->v.document.system_identifier; - public = public[0] ? public : NULL; - system = system[0] ? system : NULL; - doc = new_html_doc(name, system, public); - } else { - doc = new_html_doc(NULL, NULL, NULL); - } - args->doc = doc; // Make sure doc gets cleaned up if an error is thrown. - build_tree(doc, (xmlNodePtr)doc, output->document); - VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc); - args->doc = NIL; // The Ruby runtime now owns doc so don't delete it. 
- add_errors(output, rdoc, args->input, args->url_or_frag); - return rdoc; -} - -static int lookup_namespace(VALUE node, bool require_known_ns) { - ID namespace, href; - CONST_ID(namespace, "namespace"); - CONST_ID(href, "href"); - VALUE ns = rb_funcall(node, namespace, 0); - - if (NIL_P(ns)) - return GUMBO_NAMESPACE_HTML; - ns = rb_funcall(ns, href, 0); - assert(RTEST(ns)); - Check_Type(ns, T_STRING); - - const char *href_ptr = RSTRING_PTR(ns); - size_t href_len = RSTRING_LEN(ns); -#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len)) - if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) - return GUMBO_NAMESPACE_HTML; - if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML")) - return GUMBO_NAMESPACE_MATHML; - if (NAMESPACE_P("http://www.w3.org/2000/svg")) - return GUMBO_NAMESPACE_SVG; -#undef NAMESPACE_P - if (require_known_ns) - rb_raise(rb_eArgError, "Unexpected namespace URI \"%*s\"", (int)href_len, href_ptr); - return -1; -} - -static xmlNodePtr extract_xml_node(VALUE node) { -#if NGLIB - xmlNodePtr xml_node; - Data_Get_Struct(node, xmlNode, xml_node); - return xml_node; -#else - return node; -#endif -} - -static VALUE fragment_continue(VALUE parse_args); - -static VALUE fragment ( - VALUE self, - VALUE doc_fragment, - VALUE tags, - VALUE ctx, - VALUE max_attributes, - VALUE max_errors, - VALUE max_depth -) { - ID name = rb_intern_const("name"); - const char *ctx_tag; - GumboNamespaceEnum ctx_ns; - GumboQuirksModeEnum quirks_mode; - bool form = false; - const char *encoding = NULL; - - if (NIL_P(ctx)) { - ctx_tag = "body"; - ctx_ns = GUMBO_NAMESPACE_HTML; - } else if (TYPE(ctx) == T_STRING) { - ctx_tag = StringValueCStr(ctx); - ctx_ns = GUMBO_NAMESPACE_HTML; - size_t len = RSTRING_LEN(ctx); - const char *colon = memchr(ctx_tag, ':', len); - if (colon) { - switch (colon - ctx_tag) { - case 3: - if (st_strncasecmp(ctx_tag, "svg", 3) != 0) - goto error; - ctx_ns = GUMBO_NAMESPACE_SVG; - break; - case 4: - if 
(st_strncasecmp(ctx_tag, "html", 4) == 0) - ctx_ns = GUMBO_NAMESPACE_HTML; - else if (st_strncasecmp(ctx_tag, "math", 4) == 0) - ctx_ns = GUMBO_NAMESPACE_MATHML; - else - goto error; - break; - default: - error: - rb_raise(rb_eArgError, "Invalid context namespace '%*s'", (int)(colon - ctx_tag), ctx_tag); - } - ctx_tag = colon+1; - } else { - // For convenience, put 'svg' and 'math' in their namespaces. - if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) - ctx_ns = GUMBO_NAMESPACE_SVG; - else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) - ctx_ns = GUMBO_NAMESPACE_MATHML; - } - - // Check if it's a form. - form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0; - } else { - ID element_ = rb_intern_const("element?"); - - // Context fragment name. - VALUE tag_name = rb_funcall(ctx, name, 0); - assert(RTEST(tag_name)); - Check_Type(tag_name, T_STRING); - ctx_tag = StringValueCStr(tag_name); - - // Context fragment namespace. - ctx_ns = lookup_namespace(ctx, true); - - // Check for a form ancestor, including self. - for (VALUE node = ctx; - !NIL_P(node); - node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) { - if (!RTEST(rb_funcall(node, element_, 0))) - continue; - VALUE element_name = rb_funcall(node, name, 0); - if (RSTRING_LEN(element_name) == 4 - && !st_strcasecmp(RSTRING_PTR(element_name), "form") - && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) { - form = true; - break; - } - } - - // Encoding. - if (RSTRING_LEN(tag_name) == 14 - && !st_strcasecmp(ctx_tag, "annotation-xml")) { - VALUE enc = rb_funcall(ctx, rb_intern_const("[]"), - rb_utf8_str_new_static("encoding", 8)); - if (RTEST(enc)) { - Check_Type(enc, T_STRING); - encoding = StringValueCStr(enc); - } - } - } - - // Quirks mode. 
- VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0); - VALUE dtd = rb_funcall(doc, internal_subset, 0); - if (NIL_P(dtd)) { - quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS; - } else { - VALUE dtd_name = rb_funcall(dtd, name, 0); - VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0); - VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0); - quirks_mode = gumbo_compute_quirks_mode ( - NIL_P(dtd_name)? NULL:StringValueCStr(dtd_name), - NIL_P(pubid)? NULL:StringValueCStr(pubid), - NIL_P(sysid)? NULL:StringValueCStr(sysid) - ); - } - - // Perform a fragment parse. - int depth = NUM2INT(max_depth); - GumboOptions options = kGumboDefaultOptions; - options.max_attributes = NUM2INT(max_attributes); - options.max_errors = NUM2INT(max_errors); - // Add one to account for the HTML element. - options.max_tree_depth = depth < 0 ? -1 : (depth + 1); - options.fragment_context = ctx_tag; - options.fragment_namespace = ctx_ns; - options.fragment_encoding = encoding; - options.quirks_mode = quirks_mode; - options.fragment_context_has_form_ancestor = form; - - GumboOutput *output = perform_parse(&options, tags); - ParseArgs args = { - .output = output, - .input = tags, - .url_or_frag = doc_fragment, - .doc = (xmlDocPtr)extract_xml_node(doc), - }; - VALUE parse_args = wrap_parse_args(&args); - rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args); - return Qnil; -} - -static VALUE fragment_continue(VALUE parse_args) { - ParseArgs *args = unwrap_parse_args(parse_args); - GumboOutput *output = args->output; - VALUE doc_fragment = args->url_or_frag; - xmlDocPtr xml_doc = args->doc; - - args->doc = NIL; // The Ruby runtime owns doc so make sure we don't delete it. 
- xmlNodePtr xml_frag = extract_xml_node(doc_fragment); - build_tree(xml_doc, xml_frag, output->root); - add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9)); - return Qnil; -} - -// Initialize the Nokogumbo class and fetch constants we will use later. -void Init_nokogumbo() { - rb_funcall(rb_mKernel, rb_intern_const("gem"), 1, rb_utf8_str_new_static("nokogiri", 8)); - rb_require("nokogiri"); - - VALUE line_supported = Qtrue; - -#if !NGLIB - // Class constants. - VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri")); - VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML")); - cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError")); - rb_gc_register_mark_object(cNokogiriXmlSyntaxError); - cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element")); - rb_gc_register_mark_object(cNokogiriXmlElement); - cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text")); - rb_gc_register_mark_object(cNokogiriXmlText); - cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA")); - rb_gc_register_mark_object(cNokogiriXmlCData); - cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment")); - rb_gc_register_mark_object(cNokogiriXmlComment); - - // Interned symbols. - new = rb_intern_const("new"); - node_name_ = rb_intern_const("node_name="); - - // #line is not supported (returns 0) - line_supported = Qfalse; -#endif - - // Class constants. - VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5")); - Document = rb_const_get(HTML5, rb_intern_const("Document")); - rb_gc_register_mark_object(Document); - - // Interned symbols. - internal_subset = rb_intern_const("internal_subset"); - parent = rb_intern_const("parent"); - - // Define Nokogumbo module with parse and fragment methods. 
- VALUE Gumbo = rb_define_module("Nokogumbo"); - rb_define_singleton_method(Gumbo, "parse", parse, 5); - rb_define_singleton_method(Gumbo, "fragment", fragment, 6); - - // Add private constant for testing. - rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported); - rb_funcall(Gumbo, rb_intern_const("private_constant"), 1, - rb_utf8_str_new_cstr("LINE_SUPPORTED")); -} - -// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: diff --git a/nokogumbo-import/lib/nokogumbo.rb b/nokogumbo-import/lib/nokogumbo.rb deleted file mode 100644 index ca797294e0..0000000000 --- a/nokogumbo-import/lib/nokogumbo.rb +++ /dev/null @@ -1,43 +0,0 @@ -# -# Copyright 2013-2021 Sam Ruby, Stephen Checkoway -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -require 'nokogiri' - -if ((defined?(Nokogiri::HTML5) && Nokogiri::HTML5.respond_to?(:parse)) && - (defined?(Nokogiri::Gumbo) && Nokogiri::Gumbo.respond_to?(:parse)) && - !(ENV.key?("NOKOGUMBO_IGNORE_NOKOGIRI_HTML5") && ENV["NOKOGUMBO_IGNORE_NOKOGIRI_HTML5"] != "false")) - - warn "NOTE: nokogumbo: Using Nokogiri::HTML5 provided by Nokogiri. See https://github.com/sparklemotion/nokogiri/issues/2205 for more information." - - ::Nokogumbo = ::Nokogiri::Gumbo -else - require 'nokogumbo/html5' - require 'nokogumbo/nokogumbo' - - module Nokogumbo - # The default maximum number of attributes per element. - DEFAULT_MAX_ATTRIBUTES = 400 - - # The default maximum number of errors for parsing a document or a fragment. 
- DEFAULT_MAX_ERRORS = 0 - - # The default maximum depth of the DOM tree produced by parsing a document - # or fragment. - DEFAULT_MAX_TREE_DEPTH = 400 - end -end - -require 'nokogumbo/version' diff --git a/nokogumbo-import/lib/nokogumbo/version.rb b/nokogumbo-import/lib/nokogumbo/version.rb deleted file mode 100644 index 84da549a1d..0000000000 --- a/nokogumbo-import/lib/nokogumbo/version.rb +++ /dev/null @@ -1,19 +0,0 @@ -# -# Copyright 2013-2021 Sam Ruby, Stephen Checkoway -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -module Nokogumbo - VERSION = "2.0.5" -end diff --git a/nokogumbo-import/nokogumbo.gemspec b/nokogumbo-import/nokogumbo.gemspec deleted file mode 100644 index 0529fa2932..0000000000 --- a/nokogumbo-import/nokogumbo.gemspec +++ /dev/null @@ -1,32 +0,0 @@ -require_relative 'lib/nokogumbo/version' - -Gem::Specification.new do |s| - s.name = 'nokogumbo' - s.version = Nokogumbo::VERSION - - s.authors = ['Sam Ruby', 'Stephen Checkoway'] - s.email = ['rubys@intertwingly.net', 's@pahtak.org'] - - s.license = 'Apache-2.0' - s.homepage = 'https://github.com/rubys/nokogumbo/#readme' - s.summary = 'Nokogiri interface to the Gumbo HTML5 parser' - s.description = 'Nokogumbo allows a Ruby program to invoke the Gumbo ' \ - 'HTML5 parser and access the result as a Nokogiri parsed document.' 
- - s.metadata = { - 'bug_tracker_uri' => 'https://github.com/rubys/nokogumbo/issues', - 'changelog_uri' => 'https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md', - 'homepage_uri' => s.homepage, - 'source_code_uri' => 'https://github.com/rubys/nokogumbo' - } - - s.extensions = %w[ ext/nokogumbo/extconf.rb ] - - s.files = %w[ LICENSE.txt README.md ] + - Dir['lib/**/*.rb'] + - Dir['ext/nokogumbo/*.{rb,c}'] + - Dir['gumbo-parser/src/*.[hc]'] - - s.required_ruby_version = ">= 2.1" - s.add_runtime_dependency 'nokogiri', '~> 1.8', '>= 1.8.4' -end diff --git a/nokogumbo-import/test/test_encoding.rb b/nokogumbo-import/test/test_encoding.rb deleted file mode 100644 index 7a94f1564d..0000000000 --- a/nokogumbo-import/test/test_encoding.rb +++ /dev/null @@ -1,208 +0,0 @@ -require 'nokogumbo' -require 'minitest/autorun' - -class TestNokogumbo < Minitest::Test - if ''.respond_to? 'encoding' - def test_macroman_encoding - mac="\xCA".force_encoding('macroman') - doc = Nokogiri::HTML5(mac) - assert_equal " ", doc.at("span").to_xml - end - - def test_iso8859_encoding - iso8859="Se\xF1or".force_encoding(Encoding::ASCII_8BIT) - doc = Nokogiri::HTML5(iso8859) - assert_equal 'Señor', doc.at('span').to_xml - end - - def test_charset_encoding - utf8="Se\xC3\xB1or". - force_encoding(Encoding::ASCII_8BIT) - doc = Nokogiri::HTML5(utf8) - assert_equal 'Señor', doc.at('span').to_xml - end - - def test_bogus_encoding - bogus="Se\xF1or". 
- force_encoding(Encoding::ASCII_8BIT) - doc = Nokogiri::HTML5(bogus) - assert_equal 'Señor', doc.at('span').to_xml - end - - def test_utf8_bom - utf8 = "\uFEFF".encode('UTF-8') - doc = Nokogiri::HTML5(utf8, max_errors: 10) - assert_equal [], doc.errors - end - - def test_utf16le_bom - utf16le = "\uFEFF".encode('UTF-16LE') - doc = Nokogiri::HTML5(utf16le, max_errors: 10) - assert_equal [], doc.errors - end - - def test_utf16be_bom - utf16be = "\uFEFF".encode('UTF-16BE') - doc = Nokogiri::HTML5(utf16be, max_errors: 10) - assert_equal [], doc.errors - end - - def test_utf8_bom_ascii - utf8 = "\uFEFF".encode('UTF-8') - utf8.force_encoding(Encoding::ASCII_8BIT) - doc = Nokogiri::HTML5(utf8, max_errors: 10) - doc.errors.each { |err| puts(err) } - assert_equal [], doc.errors - end - - def test_utf16le_bom_ascii - utf16le = "\uFEFF".encode('UTF-16LE') - utf16le.force_encoding(Encoding::ASCII_8BIT) - doc = Nokogiri::HTML5(utf16le, max_errors: 10) - assert_equal [], doc.errors - doc.errors.each { |err| puts(err) } - end - - def test_utf16be_bom_ascii - utf16be = "\uFEFF".encode('UTF-16BE') - utf16be.force_encoding(Encoding::ASCII_8BIT) - doc = Nokogiri::HTML5(utf16be, max_errors: 10) - assert_equal [], doc.errors - doc.errors.each { |err| puts(err) } - end - - def test_tag_after_utf8_bom - utf8 = "\uFEFF".encode('UTF-8') - doc = Nokogiri::HTML5.fragment(utf8, max_errors: 10) - assert_equal [], doc.errors - end - end - - # https://github.com/rubys/nokogumbo/issues/68 - def test_charset_sniff_to_html - html = <<-EOF.gsub(/^ /, '') - - - - - - - Hello! - - - EOF - doc = Nokogiri::HTML5(html, max_errors: 10) - assert_equal 0, doc.errors.length - refute_equal '', doc.to_html - end - - # https://encoding.spec.whatwg.org/#names-and-labels - # I chose these by looking at the Wikipedia page for each encoding, picked - # one of the languages it was supposed to encode, and then Googled for a - # proverb in the language. Apologies if these are ill-chosen or nonsensical. 
- # I'm happy to change them. I'm just pasting them in here so I'm pretty sure - # the right-to-left languages are backward. Corrections welcome. - ENCODINGS = [ - ['UTF-8', "Let's concatentate all of these for UTF-8"], # English - ['IBM866', 'А дело бывало -- и коза волка съедала'], # Russian - ['ISO-8859-2', 'Co můžeš udělat dnes, neodkládej na zítřek.'], # Czech - ['ISO-8859-3', 'Yukarda mavi gök, asağıda yağız yer yaratıldıkta'], # Turkish - ['ISO-8859-4', 'Ceļš uz elli ir bruģēts ar labiem nodomiem.'], # Latvian - ['ISO-8859-5', 'Каде има сила, нема правдина.'], # Macedonian - ['ISO-8859-6', 'أباد الله خضراءهم ابذل لصديقك دمك ومالك'], # Arabic - ['ISO-8859-7', 'Η καλύτερη άμυνα είναι η επίθεση.'], # Greek - ['ISO-8859-8', 'אין הנחתום מעיד על עיסתו'], # Hebrew - ['ISO-8859-8-I', 'אל תסתכל בקנקן, אלא במה שבתוכו'], # Hebrew - ['ISO-8859-10', 'Alla känner apan, men apan känner ingen.'], # Swedish - ['ISO-8859-13', 'Lašas po lašo ir akmenį pratašo.'], # Lithuanian - ['ISO-8859-14', "ha bhòrd bòrd gun aran ach 's bòrd aran leis fhèin."], # Scottish Gaelic - ['ISO-8859-15', 'This is essentially ISO 8859-1 but with € Š š Ž ž Œ œ Ÿ'], # English - ['ISO-8859-16', 'Kiedy wszedłeś między wrony, musisz krakać jak i one.'], # Polish - ['KOI8-R', 'А дело бывало -- и коза волка съедала'], # Russian - ['KOI8-U', 'Яблуко від яблуньки не далеко. 
Ґ, Є, І, Ї'], # Ukrainian - ['macroman', 'Some good old Mac Roman œ∑´®†¥¨ˆøπå߃©'], # English - ['windows-874', 'กระต่ายหมายจันทร์'], # Thai - ['windows-1250', 'Addig nyújtózkodj, amíg a takaród ér.'], # Hungarian - ['windows-1251', 'Бързата работа - срам за майстора.'], # Bulgarian - ['windows-1252', 'Basically ISO 8859-1 with ‘differences’™ •'], # English - ['windows-1253', 'Και οι τοίχοι έχουν αυτιά.'], # Greek - ['windows-1254', 'Baban nasılsa oğlu da öyledir.'], # Turkish - ['windows-1255', 'אל תקנה חתול בשק; ₪'], # Hebrew - ['windows-1256', 'أبطأ من سلحفاة'], # Arabic - ['windows-1257', 'Hommikune töö kuld, õhtune muld.'], # Estonian - ['windows-1258', 'Ăn theo thuở, ở theo thời.'], # Vietnamese - ['macCyrillic', 'А дело бывало -- и коза волка съедала'], # Russian - ['GBK', '不闻不若闻之,闻之不若见之,见之不若知之,知之不若行之;学至于行之而止矣'], # Simplified Chinese - ['gb18030', '不聞不若聞之,聞之不若見之,見之不若知之,知之不若行之;學至於行之而止矣'], # Traditional Chinese - ['Big5', '有其父必有其子'], # Traditional Chinese - ['EUC-JP', '猿も木から落ちる'], # Japanese - ['ISO-2022-JP', '井の中の蛙大海を知らず'], # Japanese - ['Shift_JIS', '鳥なき里の蝙蝠'], # Japanese - ['EUC-KR', '아는 길도 물어가라'], # Korean - ['replacement', '콩 심은데 콩나고, 팥 심은데 팥난다'], # Korean - ['UTF-16BE', 'Everything had better be representable!'], # English - ['UTF-16LE', 'Same as with UTF-16BE'], # English - ['US-ASCII', 'Surprisingly not one of the required encodings'] # English - ].freeze - - def encodings_html - @encodings_html ||= - "" + - ENCODINGS.map { |enc| %(#{enc[1]}) }.join + - '' - end - - def encodings_doc - @encodings_doc ||= Nokogiri::HTML5(encodings_html) - end - - def round_trip_through(str, enc) - begin - encoding = Encoding.find(enc) - rescue ArgumentError - skip "#{enc} not supported" - end - begin - encoded = str.encode(encoding) - rescue Encoding::ConverterNotFoundError - skip "Converting UTF-8 to #{enc} not supported" - end - begin - decoded = encoded.encode('UTF-8') - rescue Encoding::ConverterNotFoundError - skip "Converting #{enc} to UTF-8 not supported" - end - 
assert_equal str, decoded, "'#{str}' did not round trip through #{enc[0]}" - encoded - end - - ENCODINGS.each do |enc| - define_method("test_parse_encoded_#{enc[0]}".to_sym) do - html = "#{enc[1]}" - encoded_html = round_trip_through(html, enc[0]) - doc = Nokogiri::HTML5(encoded_html, encoding: enc[0]) - span = doc.at('/html/body/span') - refute_nil span - assert_equal enc[1], span.content - end - - define_method("test_inner_html_encoded_#{enc[0]}".to_sym) do - encoded = round_trip_through(enc[1], enc[0]) - span = encodings_doc.at(%(/html/body/span[@id="#{enc[0]}"])) - refute_nil span - assert_equal encoded, span.inner_html(encoding: enc[0]) - end - - define_method("test_roundtrip_through_#{enc[0]}".to_sym) do - # https://bugs.ruby-lang.org/issues/15033 - # Ruby has a bug with the `:fallback` parameter passed to `#encode` when - # multiple conversions have to happen. I'm not sure it's worth working - # around. It impacts this test though. - skip 'https://bugs.ruby-lang.org/issues/15033' if enc[0] == 'ISO-2022-JP' - round_trip_through(enc[1], enc[0]) - encoded = encodings_doc.serialize(encoding: enc[0]) - doc = Nokogiri::HTML5(encoded, encoding: enc[0]) - assert_equal encodings_html, doc.serialize - end - end -end diff --git a/nokogumbo-import/test/test_tree-construction.rb b/nokogumbo-import/test/test_tree-construction.rb deleted file mode 100644 index 5060d2137c..0000000000 --- a/nokogumbo-import/test/test_tree-construction.rb +++ /dev/null @@ -1,277 +0,0 @@ -# encoding: utf-8 -require 'nokogumbo' -require 'minitest/autorun' - -def parse_test(test_data) - test = { script: :both } - index = /(?:^#errors\n|\n#errors\n)/ =~ test_data - abort "Expected #errors in\n#{test_data}" if index.nil? 
- skip_amount = $~[0].length - # Omit the final new line - test[:data] = test_data[0...index] - - # Process the rest line by line - lines = test_data[index+skip_amount..-1].split("\n") - index = lines.find_index do |line| - line == '#document-fragment' || - line == '#document' || - line == '#script-off' || - line == '#script-on' || - line == '#new-errors' - end - abort 'Expected #document' if index.nil? - test[:errors] = lines[0...index] - test[:new_errors] = [] - if lines[index] == '#new-errors' - index += 1 - while !%w[#document-fragment #document #script-off #script-on].include?(lines[index]) - test[:new_errors] << lines[index] - index += 1 - end - end - - if lines[index] == '#document-fragment' - test[:context] = lines[index+1].chomp.split(' ', 2) - index += 2 - end - abort "failed to find fragment: #{index}: #{lines[index]}" if test_data.include?("#document-fragment") && test[:context].nil? - - if lines[index] =~ /#script-(on|off)/ - test[:script] = $~[1].to_sym - index += 1 - end - - abort "Expected #document, got #{lines[index]}" unless lines[index] == '#document' - index += 1 - - document = { - type: test[:context] ? :fragment : :document, - children: [] - } - open_nodes = [document] - while index < lines.length - abort "Expected '| ' but got #{lines[index]}" unless /^\| ( *)([^ ].*$)/ =~ lines[index] - depth = $~[1].length - if depth.odd? - abort "Invalid nesting depth" - else - depth = depth / 2 - end - abort "Too deep" if depth >= open_nodes.length - - node = {} - node_text = $~[2] - if node_text[0] == '"' - if node_text == '"' || node_text[-1] != '"' - loop do - index += 1 - node_text << "\n" + lines[index] - break if node_text[-1] == '"' - end - end - node[:type] = :text - node[:contents] = node_text[1..-2] - elsif /^]*)(?: "([^"]*)" "(.*)")?>$/ =~ node_text - node[:type] = :doctype - node[:name] = $~[1] - node[:public_id] = $~[2].nil? || $~[2].empty? ? nil : $~[2] - node[:system_id] = $~[3].nil? || $~[3].empty? ? 
nil : $~[3] - elsif /^$/ =~ node_text - node[:type] = :comment - node[:contents] = $~[1] - elsif /^<(svg |math )?(.+)>$/ =~ node_text - node[:type] = :element - node[:ns] = $~[1].nil? ? nil : $~[1].rstrip - node[:tag] = $~[2] - node[:attributes] = [] - node[:children] = [] - elsif /^([^ ]+ )?([^=]+)="(.*)"$/ =~ node_text - node[:type] = :attribute - node[:ns] = $~[1].nil? ? nil : $~[1].rstrip - node[:name] = $~[2] - node[:value] = $~[3] - elsif node_text == 'content' - node[:type] = :template - else - abort "Unexpected node_text: #{node_text}" - end - - if node[:type] == :attribute - abort "depth #{depth} != #{open_nodes.length}" unless depth == open_nodes.length - 1 - abort "type :#{open_nodes[-1][:type]} != :element" unless open_nodes[-1][:type] == :element - abort "element has children" unless open_nodes[-1][:children].empty? - open_nodes[-1][:attributes] << node - elsif node[:type] == :template - abort "depth #{depth} != #{open_nodes.length}" unless depth == open_nodes.length - 1 - abort "type :#{open_nodes[-1][:type]} != :element" unless open_nodes[-1][:type] == :element - abort "tag :#{open_nodes[-1][:tag]} != template" unless open_nodes[-1][:tag] == 'template' - abort "template has children before the 'content'" unless open_nodes[-1][:children].empty? - # Hack. We want the children of this template node to be reparented as - # children of the template element. - # XXX: Template contents are _not_ supposed to be children of the - # template, but we currently mishandle this. - open_nodes << open_nodes[-1] - else - open_nodes[depth][:children] << node - open_nodes[depth+1..-1] = [] - if node[:type] == :element - open_nodes << node - end - end - index += 1 - end - test[:document] = document - test -end - -class TestTreeConstructionBase < Minitest::Test - def assert_equal_or_nil(exp, act) - if exp.nil? 
- assert_nil act - else - assert_equal exp, act - end - end - - def compare_nodes(node, ng_node) - case ng_node.type - when Nokogiri::XML::Node::ELEMENT_NODE - assert_equal node[:type], :element - if node[:ns] - refute_nil ng_node.namespace - assert_equal node[:ns], ng_node.namespace.prefix - end - assert_equal node[:tag], ng_node.name - attributes = ng_node.attributes - assert_equal node[:attributes].length, attributes.length - node[:attributes].each do |attr| - if attr[:ns] - value = ng_node["#{attr[:ns]}:#{attr[:name]}"] - else - value = attributes[attr[:name]].value - end - assert_equal attr[:value], value - end - assert_equal node[:children].length, ng_node.children.length, - "Element <#{node[:tag]}> has wrong number of children: #{ng_node.children.map { |c| c.name }}" - when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE - # We preserve the CDATA in the tree, but the tests represent it as text. - assert_equal node[:type], :text - assert_equal node[:contents], ng_node.content - when Nokogiri::XML::Node::COMMENT_NODE - assert_equal node[:type], :comment - assert_equal node[:contents], ng_node.content - when Nokogiri::XML::Node::HTML_DOCUMENT_NODE - assert_equal node[:type], :document - assert_equal node[:children].length, ng_node.children.length - when Nokogiri::XML::Node::DOCUMENT_FRAG_NODE - assert_equal node[:type], :fragment - assert_equal node[:children].length, ng_node.children.length - when Nokogiri::XML::Node::DTD_NODE - assert_equal node[:type], :doctype - assert_equal node[:name], ng_node.name - assert_equal_or_nil node[:public_id], ng_node.external_id - assert_equal_or_nil node[:system_id], ng_node.system_id - else - flunk "Unknown node type #{ng_node.type} (expected #{node[:type]})" - end - end - - def run_test - if @test[:context] - ctx = @test[:context].join(':') - doc = Nokogiri::HTML5::Document.new - doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, max_errors: @test[:errors].length + 10) - else - doc = 
Nokogiri::HTML5.parse(@test[:data], max_errors: @test[:errors].length + 10) - end - # Walk the tree. - exp_nodes = [@test[:document]] - act_nodes = [doc] - children = [0] - compare_nodes(exp_nodes[0], doc) - while children.any? - child_index = children[-1] - exp = exp_nodes[-1] - act = act_nodes[-1] - if child_index == exp[:children].length - exp_nodes.pop - act_nodes.pop - children.pop - next - end - exp_child = exp[:children][child_index] - act_child = act.children[child_index] - compare_nodes(exp_child, act_child) - children[-1] = child_index + 1 - if exp_child.has_key?(:children) - exp_nodes << exp_child - act_nodes << act_child - children << 0 - end - end - - # Test the errors. - assert_equal @test[:errors].length, doc.errors.length - - # The new, standardized tokenizer errors live in @test[:new_errors]. Let's - # match each one to exactly one error in doc.errors. Unfortunately, the - # tests specify the column the error is detected, _not_ the column of the - # start of the problematic HTML (e.g., the start of a character reference - # or \d+):(?\d+)(?:-\d+:\d+)?\) (?.*)$/ - @test[:new_errors].each do |err| - assert_match(error_regex, err) - m = err.match(error_regex) - line = m[:line].to_i - column = m[:column].to_i - code = m[:code] - idx = errors.index do |e| - e[:line] == line && - e[:code] == code && - e[:column] <= column - end - # This error should be the first error in the list. 
- #refute_nil(idx, "Expected to find error #{code} at #{line}:#{column}") - assert_equal(0, idx, "Expected to find error #{code} at #{line}:#{column}") - errors.delete_at(idx) - end - end -end - -tc_path = File.expand_path('../html5lib-tests/tree-construction', __FILE__) -Dir[File.join(tc_path, '*.dat')].each do |path| - test_name = "TestTreeConstruction" + File.basename(path, '.dat') - .split(/[_-]/) - .map { |s| s.capitalize } - .join('') - tests = [] - File.open(path, "r", encoding: 'UTF-8') do |f| - f.each("\n\n#data\n") do |test_data| - if test_data.start_with?("#data\n") - test_data = test_data[6..-1] - end - if test_data.end_with?("\n\n#data\n") - test_data = test_data[0..-9] - end - tests << parse_test(test_data) - end - end - - klass = Class.new(TestTreeConstructionBase) do - tests.each_with_index do |test, index| - next if test[:script] == :on; - define_method "test_#{index}".to_sym do - @test = test - @index = index - run_test - end - end - end - Object.const_set test_name, klass -end - -# vim: set sw=2 sts=2 ts=8 et: diff --git a/rakelib/check-manifest.rake b/rakelib/check-manifest.rake index d5eda72380..bc3a42792e 100644 --- a/rakelib/check-manifest.rake +++ b/rakelib/check-manifest.rake @@ -16,6 +16,7 @@ task :check_manifest do coverage doc gems + nokogumbo-import patches pkg ports @@ -33,8 +34,6 @@ task :check_manifest do .editorconfig .gitignore .yardopts - appveyor.yml - nokogiri.gemspec CHANGELOG.md CODE_OF_CONDUCT.md CONTRIBUTING.md @@ -44,8 +43,11 @@ task :check_manifest do SECURITY.md STANDARD_RESPONSES.md Vagrantfile - [0-9]+-.* - [a-z.]+.(log|out) + [a-z]*.{log,out} + appveyor.yml + gumbo-parser/test/* + lib/nokogiri/**/nokogiri.{jar,so} + nokogiri.gemspec ] intended_directories = Dir.children(".") @@ -54,10 +56,11 @@ task :check_manifest do intended_files = Dir.children(".") .select { |filename| File.file?(filename) } - .reject { |filename| ignore_files.any? { |ig| File.fnmatch?(ig, filename) } } + .reject { |filename| ignore_files.any? 
{ |ig| File.fnmatch?(ig, filename, File::FNM_EXTGLOB) } } intended_files += Dir.glob(intended_directories.map { |d| File.join(d, "/**/*") }) .select { |filename| File.file?(filename) } + .reject { |filename| ignore_files.any? { |ig| File.fnmatch?(ig, filename, File::FNM_EXTGLOB) } } .sort spec_files = raw_gemspec.files.sort diff --git a/rakelib/extensions.rake b/rakelib/extensions.rake index bc5067e3f6..766861949a 100644 --- a/rakelib/extensions.rake +++ b/rakelib/extensions.rake @@ -307,7 +307,9 @@ if java? jruby_home = RbConfig::CONFIG['prefix'] jars = ["#{jruby_home}/lib/jruby.jar"] + FileList['lib/*.jar'] + # Keep the extension C files because they have docstrings (and Java files don't) ext.gem_spec.files.reject! { |path| File.fnmatch?("ext/nokogiri/*.h", path) } + ext.gem_spec.files.reject! { |path| File.fnmatch?("gumbo-parser/**/*", path) } ext.ext_dir = 'ext/java' ext.lib_dir = 'lib/nokogiri' @@ -326,7 +328,7 @@ else dependencies = YAML.load_file("dependencies.yml") task gem_build_path do - NOKOGIRI_SPEC.files.reject! { |f| f =~ %r{\.(java|jar)$} } + NOKOGIRI_SPEC.files.reject! { |path| File.fnmatch?("**/*.{java,jar}", path, File::FNM_EXTGLOB) } ["libxml2", "libxslt"].each do |lib| version = dependencies[lib]["version"] @@ -345,7 +347,7 @@ else end Rake::ExtensionTask.new("nokogiri", NOKOGIRI_SPEC) do |ext| - ext.gem_spec.files.reject! { |f| f =~ %r{\.(java|jar)$} } + ext.gem_spec.files.reject! { |path| File.fnmatch?("**/*.{java,jar}", path, File::FNM_EXTGLOB) } ext.lib_dir = File.join(*['lib', 'nokogiri', ENV['FAT_DIR']].compact) ext.config_options << ENV['EXTOPTS'] @@ -354,6 +356,7 @@ else ext.cross_config_options << "--enable-cross-build" ext.cross_compiling do |spec| spec.files.reject! { |path| File.fnmatch?('ports/*', path) } + spec.files.reject! { |path| File.fnmatch?("gumbo-parser/**/*", path) } spec.dependencies.reject! 
{ |dep| dep.name=='mini_portile2' } # when pre-compiling a native gem, package all the C headers sitting in ext/nokogiri/include diff --git a/scripts/test-gem-file-contents b/scripts/test-gem-file-contents index 08bee5ff73..c567bf40d5 100755 --- a/scripts/test-gem-file-contents +++ b/scripts/test-gem-file-contents @@ -78,7 +78,7 @@ describe File.basename(gemfile) do assert_operator(actual, :>, 60, "expected gemfile to contain more than #{actual} files") end - it "gemspec is a Gem::Specfication" do + it "gemspec is a Gem::Specification" do assert_equal(Gem::Specification, gemspec.class) end end @@ -93,6 +93,19 @@ describe File.basename(gemfile) do end describe "ruby platform" do + it "depends on mini_portile2" do + assert(gemspec.dependencies.find { |d| d.name == "mini_portile2" }) + end + + it "contains ext/nokogiri C and header files" do + assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.c}).length, :>, 20) + assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.h}).length, :>, 0) + end + + it "includes C files in extra_rdoc_files" do + assert_operator(gemspec.extra_rdoc_files.grep(%r{ext/nokogiri/.*\.c$}).length, :>, 10) + end + it "contains the port files" do actual_ports = gemfile_contents.grep(%r{^ports/}) assert_equal(1, actual_ports.grep(/libxml2-\d+\.\d+\.\d+\.tar\.gz/).length, @@ -106,31 +119,37 @@ describe File.basename(gemfile) do assert_operator(gemfile_contents.grep(%r{^patches/}).length, :>, 0) end - it "contains ext/nokogiri C and header files" do - assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.c}).length, :>, 20) - assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.h}).length, :>, 0) - end - it "does not contain packaged libraries' header files" do # these files are present after installation if the packaged libraries are used assert_empty(gemfile_contents.grep(%r{^ext/nokogiri/include/})) end + it "contains the gumbo parser source code" do + assert_includes(gemfile_contents, "gumbo-parser/src/Makefile") + 
assert_operator(gemfile_contents.grep(%r{^gumbo-parser/src/.*\.c}).length, :>, 10) + assert_operator(gemfile_contents.grep(%r{^gumbo-parser/src/.*\.h}).length, :>, 10) + end + it "does not contain java files" do assert_empty(gemfile_contents.grep(%r{^ext/java/})) assert_empty(gemfile_contents.grep(/.*\.jar$/)) end + end if gemspec.platform == Gem::Platform::RUBY - it "depends on mini_portile2" do - assert(gemspec.dependencies.find { |d| d.name == "mini_portile2" }) + describe "native platform" do + it "does not depend on mini_portile2" do + refute(gemspec.dependencies.find { |d| d.name == "mini_portile2" }) + end + + it "contains ext/nokogiri C and header files" do + assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.c}).length, :>, 20) + assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.h}).length, :>, 20) end it "includes C files in extra_rdoc_files" do assert_operator(gemspec.extra_rdoc_files.grep(%r{ext/nokogiri/.*\.c$}).length, :>, 10) end - end if gemspec.platform == Gem::Platform::RUBY - describe "native platform" do it "does not contain the port files" do assert_empty(gemfile_contents.grep(%r{^ports/})) end @@ -139,30 +158,21 @@ describe File.basename(gemfile) do assert_empty(gemfile_contents.grep(%r{^patches/})) end - it "contains ext/nokogiri C and header files" do - assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.c}).length, :>, 20) - assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.h}).length, :>, 20) - end - it "contains packaged libraries' header files" do assert_includes(gemfile_contents, "ext/nokogiri/include/libxml2/libxml/tree.h") assert_includes(gemfile_contents, "ext/nokogiri/include/libxslt/xslt.h") assert_includes(gemfile_contents, "ext/nokogiri/include/libexslt/exslt.h") end + it "does not contain the gumbo parser source code" do + assert_empty(gemfile_contents.grep(%r{^gumbo-parser/src/})) + end + it "does not contain java files" do assert_empty(gemfile_contents.grep(%r{^ext/java/})) 
assert_empty(gemfile_contents.grep(/.*\.jar$/)) end - it "does not depend on mini_portile2" do - refute(gemspec.dependencies.find { |d| d.name == "mini_portile2" }) - end - - it "includes C files in extra_rdoc_files" do - assert_operator(gemspec.extra_rdoc_files.grep(%r{ext/nokogiri/.*\.c$}).length, :>, 10) - end - it "contains expected shared library files " do native_ruby_versions.each do |version| actual = gemfile_contents.find do |p| @@ -184,15 +194,12 @@ describe File.basename(gemfile) do end if gemspec.platform.is_a?(Gem::Platform) && gemspec.platform.cpu describe "java platform" do - it "does not contain the port files" do - assert_empty(gemfile_contents.grep(%r{^ports/})) - end - - it "does not contain the patch files" do - assert_empty(gemfile_contents.grep(%r{^patches/})) + it "does not depend on mini_portile2" do + refute(gemspec.dependencies.find { |d| d.name == "mini_portile2" }) end it "contains ext/nokogiri C files" do + # Note: we keep the C files because they have docstrings and Java files don't assert_operator(gemfile_contents.grep(%r{^ext/nokogiri/.*\.c}).length, :>, 20) end @@ -200,10 +207,30 @@ describe File.basename(gemfile) do assert_empty(gemfile_contents.grep(%r{^ext/nokogiri/.*\.h})) end + it "includes C files in extra_rdoc_files" do + assert_operator(gemspec.extra_rdoc_files.grep(%r{ext/nokogiri/.*\.c$}).length, :>, 10) + end + + it "does not contain the port files" do + assert_empty(gemfile_contents.grep(%r{^ports/})) + end + + it "does not contain the patch files" do + assert_empty(gemfile_contents.grep(%r{^patches/})) + end + it "does not contain packaged libraries' header files" do assert_empty(gemfile_contents.grep(%r{^ext/nokogiri/include/})) end + it "does not contain the gumbo parser source code" do + assert_empty(gemfile_contents.grep(%r{^gumbo-parser/src/})) + end + + it "contains java source files" do + assert_operator(gemfile_contents.grep(%r{^ext/java/.*\.java}).length, :>, 20) + end + it "contains the java jar files" do 
actual_jars = gemfile_contents.grep(/.*\.jar$/) expected_jars = [ @@ -221,9 +248,5 @@ describe File.basename(gemfile) do assert_equal(1, actual_jars.grep(%r{/#{jar}\.jar$}).length, "expected to contain #{jar}.jar") end end - - it "does not depend on mini_portile2" do - refute(gemspec.dependencies.find { |d| d.name == "mini_portile2" }) - end end if gemspec.platform == Gem::Platform.new("java") end diff --git a/nokogumbo-import/test/test_api.rb b/test/html5/test_api.rb similarity index 55% rename from nokogumbo-import/test/test_api.rb rename to test/html5/test_api.rb index 045807055f..c1c03652c4 100644 --- a/nokogumbo-import/test/test_api.rb +++ b/test/html5/test_api.rb @@ -1,39 +1,40 @@ -require 'nokogumbo' -require 'minitest/autorun' +# coding: utf-8 +# frozen_string_literal: true +require "helper" -class TestAPI < Minitest::Test +class TestHtml5API < Nokogiri::TestCase def test_parse_convenience_methods - html = '

hi'.freeze + html = "

hi" base = Nokogiri::HTML5::Document.parse(html) html5_parse = Nokogiri::HTML5.parse(html) html5 = Nokogiri::HTML5(html) str = base.to_html - assert_equal str, html5_parse.to_html - assert_equal str, html5.to_html + assert_equal(str, html5_parse.to_html) + assert_equal(str, html5.to_html) end def test_fragment_convenience_methods - frag = '

hi

'.freeze + frag = "

hi

" base = Nokogiri::HTML5::DocumentFragment.parse(frag) html5_fragment = Nokogiri::HTML5.fragment(frag) - assert_equal base.to_html, html5_fragment.to_html + assert_equal(base.to_html, html5_fragment.to_html) end def test_url - html = '

hi' - url = 'http://example.com' + html = "

hi" + url = "http://example.com" doc = Nokogiri::HTML5::Document.parse(html, url, max_errors: 1) - assert_equal url, doc.errors[0].file + assert_equal(url, doc.errors[0].file) doc = Nokogiri::HTML5.parse(html, url, max_errors: 1) - assert_equal url, doc.errors[0].file + assert_equal(url, doc.errors[0].file) doc = Nokogiri::HTML5(html, url, max_errors: 1) - assert_equal url, doc.errors[0].file + assert_equal(url, doc.errors[0].file) end def test_parse_encoding - utf8 = '

おはようございます' + utf8 = "

おはようございます" shift_jis = utf8.encode(Encoding::SHIFT_JIS) raw = shift_jis.dup raw.force_encoding(Encoding::ASCII_8BIT) @@ -48,7 +49,7 @@ def test_parse_encoding end def test_fragment_encoding - utf8 = '

おはようございます

' + utf8 = "

おはようございます

" shift_jis = utf8.encode(Encoding::SHIFT_JIS) raw = shift_jis.dup raw.force_encoding(Encoding::ASCII_8BIT) @@ -62,33 +63,32 @@ def test_fragment_encoding end def test_fragment_serialization_encoding - frag = Nokogiri::HTML5.fragment('아는 길도 물어가라') - html = frag.serialize(encoding: 'US-ASCII') - assert_equal '아는 길도 물어가라', html + frag = Nokogiri::HTML5.fragment("아는 길도 물어가라") + html = frag.serialize(encoding: "US-ASCII") + assert_equal("아는 길도 물어가라", html) frag = Nokogiri::HTML5.fragment(html) - assert_equal '아는 길도 물어가라', frag.serialize + assert_equal("아는 길도 물어가라", frag.serialize) end def test_serialization_encoding - html = 'ฉันไม่พูดภาษาไทย' + html = "ฉันไม่พูดภาษาไทย" doc = Nokogiri::HTML5(html) - span = doc.at('/html/body/span') - serialized = span.inner_html(encoding: 'US-ASCII') + span = doc.at("/html/body/span") + serialized = span.inner_html(encoding: "US-ASCII") assert_match(/^(?:&#(?:\d+|x\h+);)*$/, serialized) - assert_equal('ฉันไม่พูดภาษาไทย'.each_char.map(&:ord), - serialized.scan(/&#(\d+|x\h+);/).map do |s| + assert_equal("ฉันไม่พูดภาษาไทย".each_char.map(&:ord), + serialized.scan(/&#(\d+|x\h+);/).map do |s| s = s.first - if s.start_with? 
'x' + if s.start_with?("x") s[1..-1].to_i(16) else s.to_i end - end - ) + end) - doc2 = Nokogiri::HTML5(doc.serialize(encoding: 'Big5')) - html2 = doc2.serialize(encoding: 'UTF-8') - assert_match 'ฉันไม่พูดภาษาไทย', html2 + doc2 = Nokogiri::HTML5(doc.serialize(encoding: "Big5")) + html2 = doc2.serialize(encoding: "UTF-8") + assert_match("ฉันไม่พูดภาษาไทย", html2) end %w[pre listing textarea].each do |tag| @@ -106,73 +106,73 @@ def test_serialization_encoding end def test_document_io - html = StringIO.new('test', 'r') + html = StringIO.new("test", "r") doc = Nokogiri::HTML5::Document.read_io(html) - refute_nil doc.at_xpath('/html/body/span') + refute_nil(doc.at_xpath("/html/body/span")) end def test_document_memory - html = 'test' + html = "test" doc = Nokogiri::HTML5::Document.read_memory(html) - refute_nil doc - refute_nil doc.at_xpath('/html/body/span') + refute_nil(doc) + refute_nil(doc.at_xpath("/html/body/span")) end def test_document_io_failure - html = 'test' + html = "test" assert_raises(ArgumentError) { Nokogiri::HTML5::Document.read_io(html) } end def test_document_memory_failure - html = StringIO.new('test', 'r') + html = StringIO.new("test", "r") assert_raises(ArgumentError) { Nokogiri::HTML5::Document.read_memory(html) } end def test_document_parse_failure - html = ['Neither a string, nor I/O'] + html = ["Neither a string, nor I/O"] assert_raises(ArgumentError) { Nokogiri::HTML5::Document.parse(html) } end def test_ownership # Test that we don't change the passed in string, even if we need to # re-encode it. 
- html = ''.freeze - refute_nil Nokogiri::HTML5.parse(html) + html = "" + refute_nil(Nokogiri::HTML5.parse(html)) iso8859_1 = html.encode(Encoding::ISO_8859_1).freeze - refute_nil Nokogiri::HTML5.parse(iso8859_1) + refute_nil(Nokogiri::HTML5.parse(iso8859_1)) ascii_8bit = html.encode(Encoding::ASCII_8BIT).freeze - refute_nil Nokogiri::HTML5.parse(ascii_8bit) + refute_nil(Nokogiri::HTML5.parse(ascii_8bit)) end def test_fragment_from_node - doc = Nokogiri.HTML5('
') - span = doc.at_xpath('/html/body/form/span') - refute_nil span - frag = span.fragment('
Nested forms should be ignored
') - assert frag.is_a?(Nokogiri::HTML5::DocumentFragment) - assert_equal 1, frag.children.length - nested_form = frag.at_xpath('form') - assert_nil nested_form - assert frag.children[0].text? + doc = Nokogiri.HTML5("
") + span = doc.at_xpath("/html/body/form/span") + refute_nil(span) + frag = span.fragment("
Nested forms should be ignored
") + assert(frag.is_a?(Nokogiri::HTML5::DocumentFragment)) + assert_equal(1, frag.children.length) + nested_form = frag.at_xpath("form") + assert_nil(nested_form) + assert(frag.children[0].text?) end def test_fragment_from_node_no_form - doc = Nokogiri.HTML5('') - span = doc.at_xpath('/html/body/span') - refute_nil span - frag = span.fragment('
Form should not be ignored
') - assert frag.is_a?(Nokogiri::HTML5::DocumentFragment) - assert_equal 1, frag.children.length - form = frag.at_xpath('form') - refute_nil form + doc = Nokogiri.HTML5("") + span = doc.at_xpath("/html/body/span") + refute_nil(span) + frag = span.fragment("
Form should not be ignored
") + assert(frag.is_a?(Nokogiri::HTML5::DocumentFragment)) + assert_equal(1, frag.children.length) + form = frag.at_xpath("form") + refute_nil(form) end def test_empty_fragment - doc = Nokogiri.HTML5('') + doc = Nokogiri.HTML5("") frag = doc.fragment - assert frag.is_a?(Nokogiri::HTML5::DocumentFragment) - assert frag.children.empty? + assert(frag.is_a?(Nokogiri::HTML5::DocumentFragment)) + assert(frag.children.empty?) end -end +end if Nokogiri.uses_gumbo? diff --git a/test/html5/test_encoding.rb b/test/html5/test_encoding.rb new file mode 100644 index 0000000000..bbe24771ba --- /dev/null +++ b/test/html5/test_encoding.rb @@ -0,0 +1,209 @@ +# coding: utf-8 +# frozen_string_literal: true +require "helper" + +class TestHtml5Encoding < Nokogiri::TestCase + if "".respond_to?("encoding") + def test_macroman_encoding + mac = String.new("\xCA").force_encoding("macroman") + doc = Nokogiri::HTML5(mac) + assert_equal(" ", doc.at("span").to_xml) + end + + def test_iso8859_encoding + iso8859 = String.new("Se\xF1or").force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(iso8859) + assert_equal("Señor", doc.at("span").to_xml) + end + + def test_charset_encoding + utf8 = String.new("Se\xC3\xB1or") + .force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(utf8) + assert_equal("Señor", doc.at("span").to_xml) + end + + def test_bogus_encoding + bogus = String.new("Se\xF1or") + .force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(bogus) + assert_equal("Señor", doc.at("span").to_xml) + end + + def test_utf8_bom + utf8 = "\uFEFF".encode("UTF-8") + doc = Nokogiri::HTML5(utf8, max_errors: 10) + assert_equal([], doc.errors) + end + + def test_utf16le_bom + utf16le = "\uFEFF".encode("UTF-16LE") + doc = Nokogiri::HTML5(utf16le, max_errors: 10) + assert_equal([], doc.errors) + end + + def test_utf16be_bom + utf16be = "\uFEFF".encode("UTF-16BE") + doc = Nokogiri::HTML5(utf16be, max_errors: 10) + assert_equal([], doc.errors) + end + + def test_utf8_bom_ascii + utf8 = 
"\uFEFF".encode("UTF-8") + utf8.force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(utf8, max_errors: 10) + doc.errors.each { |err| puts(err) } + assert_equal([], doc.errors) + end + + def test_utf16le_bom_ascii + utf16le = "\uFEFF".encode("UTF-16LE") + utf16le.force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(utf16le, max_errors: 10) + assert_equal([], doc.errors) + doc.errors.each { |err| puts(err) } + end + + def test_utf16be_bom_ascii + utf16be = "\uFEFF".encode("UTF-16BE") + utf16be.force_encoding(Encoding::ASCII_8BIT) + doc = Nokogiri::HTML5(utf16be, max_errors: 10) + assert_equal([], doc.errors) + doc.errors.each { |err| puts(err) } + end + + def test_tag_after_utf8_bom + utf8 = "\uFEFF".encode("UTF-8") + doc = Nokogiri::HTML5.fragment(utf8, max_errors: 10) + assert_equal([], doc.errors) + end + end + + # https://github.com/rubys/nokogumbo/issues/68 + def test_charset_sniff_to_html + html = <<-EOF.gsub(/^ /, "") + + + + + + + Hello! + + + EOF + doc = Nokogiri::HTML5(html, max_errors: 10) + assert_equal(0, doc.errors.length) + refute_equal("", doc.to_html) + end + + # https://encoding.spec.whatwg.org/#names-and-labels + # I chose these by looking at the Wikipedia page for each encoding, picked + # one of the languages it was supposed to encode, and then Googled for a + # proverb in the language. Apologies if these are ill-chosen or nonsensical. + # I'm happy to change them. I'm just pasting them in here so I'm pretty sure + # the right-to-left languages are backward. Corrections welcome. 
+ ENCODINGS = [ + ["UTF-8", "Let's concatentate all of these for UTF-8"], # English + ["IBM866", "А дело бывало -- и коза волка съедала"], # Russian + ["ISO-8859-2", "Co můžeš udělat dnes, neodkládej na zítřek."], # Czech + ["ISO-8859-3", "Yukarda mavi gök, asağıda yağız yer yaratıldıkta"], # Turkish + ["ISO-8859-4", "Ceļš uz elli ir bruģēts ar labiem nodomiem."], # Latvian + ["ISO-8859-5", "Каде има сила, нема правдина."], # Macedonian + ["ISO-8859-6", "أباد الله خضراءهم ابذل لصديقك دمك ومالك"], # Arabic + ["ISO-8859-7", "Η καλύτερη άμυνα είναι η επίθεση."], # Greek + ["ISO-8859-8", "אין הנחתום מעיד על עיסתו"], # Hebrew + ["ISO-8859-8-I", "אל תסתכל בקנקן, אלא במה שבתוכו"], # Hebrew + ["ISO-8859-10", "Alla känner apan, men apan känner ingen."], # Swedish + ["ISO-8859-13", "Lašas po lašo ir akmenį pratašo."], # Lithuanian + ["ISO-8859-14", "ha bhòrd bòrd gun aran ach 's bòrd aran leis fhèin."], # Scottish Gaelic + ["ISO-8859-15", "This is essentially ISO 8859-1 but with € Š š Ž ž Œ œ Ÿ"], # English + ["ISO-8859-16", "Kiedy wszedłeś między wrony, musisz krakać jak i one."], # Polish + ["KOI8-R", "А дело бывало -- и коза волка съедала"], # Russian + ["KOI8-U", "Яблуко від яблуньки не далеко. 
Ґ, Є, І, Ї"], # Ukrainian + ["macroman", "Some good old Mac Roman œ∑´®†¥¨ˆøπå߃©"], # English + ["windows-874", "กระต่ายหมายจันทร์"], # Thai + ["windows-1250", "Addig nyújtózkodj, amíg a takaród ér."], # Hungarian + ["windows-1251", "Бързата работа - срам за майстора."], # Bulgarian + ["windows-1252", "Basically ISO 8859-1 with ‘differences’™ •"], # English + ["windows-1253", "Και οι τοίχοι έχουν αυτιά."], # Greek + ["windows-1254", "Baban nasılsa oğlu da öyledir."], # Turkish + ["windows-1255", "אל תקנה חתול בשק; ₪"], # Hebrew + ["windows-1256", "أبطأ من سلحفاة"], # Arabic + ["windows-1257", "Hommikune töö kuld, õhtune muld."], # Estonian + ["windows-1258", "Ăn theo thuở, ở theo thời."], # Vietnamese + ["macCyrillic", "А дело бывало -- и коза волка съедала"], # Russian + ["GBK", "不闻不若闻之,闻之不若见之,见之不若知之,知之不若行之;学至于行之而止矣"], # Simplified Chinese + ["gb18030", "不聞不若聞之,聞之不若見之,見之不若知之,知之不若行之;學至於行之而止矣"], # Traditional Chinese + ["Big5", "有其父必有其子"], # Traditional Chinese + ["EUC-JP", "猿も木から落ちる"], # Japanese + ["ISO-2022-JP", "井の中の蛙大海を知らず"], # Japanese + ["Shift_JIS", "鳥なき里の蝙蝠"], # Japanese + ["EUC-KR", "아는 길도 물어가라"], # Korean + ["replacement", "콩 심은데 콩나고, 팥 심은데 팥난다"], # Korean + ["UTF-16BE", "Everything had better be representable!"], # English + ["UTF-16LE", "Same as with UTF-16BE"], # English + ["US-ASCII", "Surprisingly not one of the required encodings"], # English + ].freeze + + def encodings_html + @encodings_html ||= + "" + + ENCODINGS.map { |enc| %(#{enc[1]}) }.join + + "" + end + + def encodings_doc + @encodings_doc ||= Nokogiri::HTML5(encodings_html) + end + + def round_trip_through(str, enc) + begin + encoding = Encoding.find(enc) + rescue ArgumentError + skip("#{enc} not supported") + end + begin + encoded = str.encode(encoding) + rescue Encoding::ConverterNotFoundError + skip("Converting UTF-8 to #{enc} not supported") + end + begin + decoded = encoded.encode("UTF-8") + rescue Encoding::ConverterNotFoundError + skip("Converting #{enc} to UTF-8 not supported") + 
end + assert_equal(str, decoded, "'#{str}' did not round trip through #{enc[0]}") + encoded + end + + ENCODINGS.each do |enc| + define_method("test_parse_encoded_#{enc[0]}".to_sym) do + html = "#{enc[1]}" + encoded_html = round_trip_through(html, enc[0]) + doc = Nokogiri::HTML5(encoded_html, encoding: enc[0]) + span = doc.at("/html/body/span") + refute_nil span + assert_equal enc[1], span.content + end + + define_method("test_inner_html_encoded_#{enc[0]}".to_sym) do + encoded = round_trip_through(enc[1], enc[0]) + span = encodings_doc.at(%(/html/body/span[@id="#{enc[0]}"])) + refute_nil span + assert_equal encoded, span.inner_html(encoding: enc[0]) + end + + define_method("test_roundtrip_through_#{enc[0]}".to_sym) do + # https://bugs.ruby-lang.org/issues/15033 + # Ruby has a bug with the `:fallback` parameter passed to `#encode` when + # multiple conversions have to happen. I'm not sure it's worth working + # around. It impacts this test though. + skip "https://bugs.ruby-lang.org/issues/15033" if enc[0] == "ISO-2022-JP" + round_trip_through(enc[1], enc[0]) + encoded = encodings_doc.serialize(encoding: enc[0]) + doc = Nokogiri::HTML5(encoded, encoding: enc[0]) + assert_equal encodings_html, doc.serialize + end + end +end if Nokogiri.uses_gumbo? 
diff --git a/nokogumbo-import/test/test_monkey_patch.rb b/test/html5/test_monkey_patch.rb similarity index 75% rename from nokogumbo-import/test/test_monkey_patch.rb rename to test/html5/test_monkey_patch.rb index 5fdd1e4e71..42ff0ac6ab 100644 --- a/nokogumbo-import/test/test_monkey_patch.rb +++ b/test/html5/test_monkey_patch.rb @@ -1,8 +1,7 @@ # encoding: utf-8 -require 'nokogumbo' -require 'minitest/autorun' +require 'helper' -class TestNokogumbo < Minitest::Test +class TestHtml5SerializationMonkeyPatch < Nokogiri::TestCase def test_to_xml xml = Nokogiri.HTML5('').to_xml assert_match(/\A<\?xml version/, xml) @@ -13,4 +12,4 @@ def test_html4_fragment frag = Nokogiri::HTML.fragment('') assert frag.is_a?(Nokogiri::HTML::DocumentFragment) end -end +end if Nokogiri.uses_gumbo? diff --git a/nokogumbo-import/test/test_nokogumbo.rb b/test/html5/test_nokogumbo.rb similarity index 61% rename from nokogumbo-import/test/test_nokogumbo.rb rename to test/html5/test_nokogumbo.rb index 307f3b1846..a49b9e9848 100644 --- a/nokogumbo-import/test/test_nokogumbo.rb +++ b/test/html5/test_nokogumbo.rb @@ -1,58 +1,51 @@ # encoding: utf-8 -require 'nokogumbo' +# frozen_string_literal: true +require "helper" -# Make sure that Ruby objects constructed in C are treated as GC roots. 
-# See: https://github.com/rubys/nokogumbo/pull/150 -if GC.respond_to?(:verify_compaction_references) - GC.verify_compaction_references(toward: :empty, double_heap: true) -end - -require 'minitest/autorun' - -class TestNokogumbo < Minitest::Test +class TestHtml5Nokogumbo < Nokogiri::TestCase def test_element_text doc = Nokogiri::HTML5(buffer) - assert_equal "content", doc.at('span').text + assert_equal("content", doc.at("span").text) end def test_element_cdata_textarea doc = Nokogiri::HTML5(buffer) - assert_equal "foobar", doc.at('textarea').text.strip + assert_equal("foobar", doc.at("textarea").text.strip) end def test_element_cdata_script doc = Nokogiri::HTML5.fragment(buffer) - assert_equal true, doc.document.html? - assert_equal "", doc.at('script').to_s + assert_equal(true, doc.document.html?) + assert_equal("", doc.at("script").to_s) end def test_attr_value doc = Nokogiri::HTML5(buffer) - assert_equal "utf-8", doc.at('meta')['charset'] + assert_equal("utf-8", doc.at("meta")["charset"]) end def test_comment doc = Nokogiri::HTML5(buffer) - assert_equal " test comment ", doc.xpath('//comment()').text + assert_equal(" test comment ", doc.xpath("//comment()").text) end def test_unknown_element doc = Nokogiri::HTML5(buffer) - assert_equal "main", doc.at('main').name + assert_equal("main", doc.at("main").name) end def test_IO - require 'stringio' + require "stringio" doc = Nokogiri::HTML5(StringIO.new(buffer)) - assert_equal 'textarea', doc.at('form').element_children.first.name + assert_equal("textarea", doc.at("form").element_children.first.name) end def test_nil doc = Nokogiri::HTML5(nil) - assert_equal 1, doc.search('body').count + assert_equal(1, doc.search("body").count) - fragment = Nokogiri::HTML5::fragment(nil) - assert_equal 0, fragment.errors.length + fragment = Nokogiri::HTML5.fragment(nil) + assert_equal(0, fragment.errors.length) end def test_html5_doctype @@ -62,56 +55,56 @@ def test_html5_doctype def test_fragment_no_errors doc = 
Nokogiri::HTML5.fragment("no missing DOCTYPE errors", max_errors: 10) - assert_equal 0, doc.errors.length + assert_equal(0, doc.errors.length) end # This should be deleted when `:max_parse_errors` is removed. def test_fragment_max_parse_errors doc = Nokogiri::HTML5.fragment("testing deprecated :max_parse_errors", max_parse_errors: 10) - assert_equal 0, doc.errors.length + assert_equal(0, doc.errors.length) end def test_fragment_head - doc = Nokogiri::HTML5.fragment(buffer[/(.*?)<\/head>/m, 1]) - assert_equal "hello world", doc.xpath('title').text - assert_equal "utf-8", doc.xpath('meta').first['charset'] + doc = Nokogiri::HTML5.fragment(buffer[%r{(.*?)}m, 1]) + assert_equal("hello world", doc.xpath("title").text) + assert_equal("utf-8", doc.xpath("meta").first["charset"]) end def test_fragment_body - doc = Nokogiri::HTML5.fragment(buffer[/(.*?)<\/body>/m, 1]) - assert_equal 'content', doc.xpath('main/span').to_xml - assert_equal " test comment ", doc.xpath('comment()').text + doc = Nokogiri::HTML5.fragment(buffer[%r{(.*?)}m, 1]) + assert_equal("content", doc.xpath("main/span").to_xml) + assert_equal(" test comment ", doc.xpath("comment()").text) end def test_xlink_attribute - source = <<-EOF.gsub(/^ {6}/, '') + source = <<-EOF.gsub(/^ {6}/, "") EOF doc = Nokogiri::HTML5.parse(source) - a = doc.at_xpath('/html/body/svg:svg/svg:a') - refute_nil a - refute_nil a['xlink:href'] - refute_nil a['xmlns:xlink'] + a = doc.at_xpath("/html/body/svg:svg/svg:a") + refute_nil(a) + refute_nil(a["xlink:href"]) + refute_nil(a["xmlns:xlink"]) end def test_xlink_attribute_fragment - source = <<-EOF.gsub(/^ {6}/, '') + source = <<-EOF.gsub(/^ {6}/, "") EOF doc = Nokogiri::HTML5.fragment(source) - a = doc.at_xpath('svg:svg/svg:a') - refute_nil a - refute_nil a['xlink:href'] - refute_nil a['xmlns:xlink'] + a = doc.at_xpath("svg:svg/svg:a") + refute_nil(a) + refute_nil(a["xlink:href"]) + refute_nil(a["xmlns:xlink"]) end def test_template - source = <<-EOF.gsub(/^ {6}/, '') + source = 
<<-EOF.gsub(/^ {6}/, "") EOF doc = Nokogiri::HTML5.fragment(source) - template = doc.at('template') - assert_equal "productrow", template['id'] - assert_equal "record", template.at('td')['class'] + template = doc.at("template") + assert_equal("productrow", template["id"]) + assert_equal("record", template.at("td")["class"]) end def test_root_comments doc = Nokogiri::HTML5("") - assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name) + assert_equal(["html", "comment", "html", "comment"], doc.children.map(&:name)) end def test_max_attributes @@ -139,15 +132,16 @@ def test_max_attributes # -1 disables limit doc = Nokogiri::HTML5(html, max_attributes: -1) - assert_equal({ 'id' => 'i', 'class' => 'c', 'title' => 't' }, attributes(doc.at_css('div'))) - assert_equal({ 'src' => 's', 'alt' => 'a' }, attributes(doc.at_css('img'))) + assert_equal({ "id" => "i", "class" => "c", "title" => "t" }, attributes(doc.at_css("div"))) + assert_equal({ "src" => "s", "alt" => "a" }, attributes(doc.at_css("img"))) end def test_max_attributes_boolean html = '' doc = Nokogiri::HTML5(html, max_attributes: 4) - assert_equal({ 'checked' => '', 'type' => 'checkbox', 'disabled' => '', 'name' => 'cheese' }, attributes(doc.at_css('input'))) + assert_equal({ "checked" => "", "type" => "checkbox", "disabled" => "", "name" => "cheese" }, + attributes(doc.at_css("input"))) assert_raises(ArgumentError) { Nokogiri::HTML5(html, max_attributes: 3) } assert_raises(ArgumentError) { Nokogiri::HTML5(html, max_attributes: 2) } @@ -156,13 +150,17 @@ def test_max_attributes_boolean end def test_default_max_attributes - a = 'a' + a = String.new("a") attrs = 50_000.times.map { x = a.dup; a.succ!; x } #
contains 50,000 attributes, but default limit is 400. Parsing this would take ages if # we were not enforcing any limit on attributes. All attributes are duplicated to make sure # this doesn’t alter performance or end result. - html = "
hello
" + html = <<~EOF +
+ hello +
+ EOF assert_raises(ArgumentError) { Nokogiri::HTML5(html) } end @@ -176,81 +174,85 @@ def test_fragment_max_attributes # -1 disables limit doc = Nokogiri::HTML5.fragment(html, max_attributes: -1) - assert_equal({ 'id' => 'i', 'class' => 'c', 'title' => 't' }, attributes(doc.at_css('div'))) - assert_equal({ 'src' => 's', 'alt' => 'a' }, attributes(doc.at_css('img'))) + assert_equal({ "id" => "i", "class" => "c", "title" => "t" }, attributes(doc.at_css("div"))) + assert_equal({ "src" => "s", "alt" => "a" }, attributes(doc.at_css("img"))) end def test_fragment_default_max_attributes - a = 'a' + a = String.new("a") attrs = 50_000.times.map { x = a.dup; a.succ!; x } #
contains 50,000 attributes, but default limit is 400. Parsing this would take ages if # we were not enforcing any limit on attributes. All attributes are duplicated to make sure # this doesn’t alter performance or end result. - html = "
hello
" + html = <<~EOF +
+ hello +
+ EOF assert_raises(ArgumentError) { Nokogiri::HTML5.fragment(html) } end def test_parse_errors doc = Nokogiri::HTML5("", max_errors: 10) - assert_equal doc.errors.length, 2 + assert_equal(doc.errors.length, 2) doc = Nokogiri::HTML5("", max_errors: 10) - assert_empty doc.errors + assert_empty(doc.errors) end def test_max_errors # This document contains 2 parse errors, but we force limit to 1. doc = Nokogiri::HTML5("", max_errors: 1) - assert_equal 1, doc.errors.length + assert_equal(1, doc.errors.length) doc = Nokogiri::HTML5("", max_errors: 1) - assert_empty doc.errors + assert_empty(doc.errors) end def test_default_max_errors # This document contains 200 parse errors, but default limit is 0. doc = Nokogiri::HTML5("" + "

" * 200) - assert_equal 0, doc.errors.length + assert_equal(0, doc.errors.length) end def test_parse_fragment_errors doc = Nokogiri::HTML5.fragment("<\r\n", max_errors: 10) - refute_empty doc.errors + refute_empty(doc.errors) end def test_fragment_max_errors # This fragment contains 2 parse errors, but we force limit to 1. doc = Nokogiri::HTML5.fragment("", max_errors: 1) - assert_equal 1, doc.errors.length + assert_equal(1, doc.errors.length) doc = Nokogiri::HTML5.fragment("", max_errors: 10) - assert_equal 2, doc.errors.length + assert_equal(2, doc.errors.length) end def test_fragment_default_max_errors # This fragment contains 200 parse errors, but default limit is 0. doc = Nokogiri::HTML5.fragment("

" * 200) - assert_equal 0, Nokogumbo::DEFAULT_MAX_ERRORS - assert_equal 0, doc.errors.length + assert_equal(0, Nokogiri::Gumbo::DEFAULT_MAX_ERRORS) + assert_equal(0, doc.errors.length) end def test_default_max_depth_parse - assert_raises ArgumentError do - depth = Nokogumbo::DEFAULT_MAX_TREE_DEPTH + 1 - Nokogiri::HTML5('' + '
' * (depth - 2)) + assert_raises(ArgumentError) do + depth = Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH + 1 + Nokogiri::HTML5("" + "
" * (depth - 2)) end end def test_default_max_depth_fragment - assert_raises ArgumentError do - depth = Nokogumbo::DEFAULT_MAX_TREE_DEPTH + 1 - Nokogiri::HTML5.fragment('
' * depth) + assert_raises(ArgumentError) do + depth = Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH + 1 + Nokogiri::HTML5.fragment("
" * depth) end end def test_max_depth_parse depth = 10 - html = '' + '
' * (depth - 2) - assert_raises ArgumentError do + html = "" + "
" * (depth - 2) + assert_raises(ArgumentError) do Nokogiri::HTML5(html, max_tree_depth: depth - 1) end @@ -258,14 +260,14 @@ def test_max_depth_parse Nokogiri::HTML5(html, max_tree_depth: depth) pass rescue ArgumentError - flunk "Expected document parse to succeed" + flunk("Expected document parse to succeed") end end def test_max_depth_fragment depth = 10 - html = '
' * depth - assert_raises ArgumentError do + html = "
" * depth + assert_raises(ArgumentError) do Nokogiri::HTML5.fragment(html, max_tree_depth: depth - 1) end @@ -273,11 +275,10 @@ def test_max_depth_fragment Nokogiri::HTML5.fragment(html, max_tree_depth: depth) pass rescue ArgumentError - flunk "Expected fragment parse to succeed" + flunk("Expected fragment parse to succeed") end end - def test_document_encoding html = <<-TEXT @@ -290,47 +291,42 @@ def test_document_encoding TEXT doc = Nokogiri::HTML5.parse(html) - assert_equal "UTF-8", doc.encoding - assert_equal "Кирилические символы", doc.at('body').text.gsub(/\n\s+/,'') + assert_equal("UTF-8", doc.encoding) + assert_equal("Кирилические символы", doc.at("body").text.gsub(/\n\s+/, "")) end def test_line_text - skip unless Nokogumbo.const_get(:LINE_SUPPORTED) doc = Nokogiri.HTML5("\ntext node") - assert_equal 2, doc.at_xpath('/html/body/text()').line + assert_equal(2, doc.at_xpath("/html/body/text()").line) end def test_line_comment - skip unless Nokogumbo.const_get(:LINE_SUPPORTED) doc = Nokogiri.HTML5("\n\n") - assert_equal 3, doc.at_xpath('/comment()').line + assert_equal(3, doc.at_xpath("/comment()").line) end def test_line_element - skip unless Nokogumbo.const_get(:LINE_SUPPORTED) doc = Nokogiri.HTML5("\n

") - assert_equal 2, doc.at_xpath('/html/body/p').line + assert_equal(2, doc.at_xpath("/html/body/p").line) end def test_line_template - skip unless Nokogumbo.const_get(:LINE_SUPPORTED) doc = Nokogiri.HTML5("\n\n") - assert_equal 3, doc.at_xpath('/html/head/template').line + assert_equal(3, doc.at_xpath("/html/head/template").line) end def test_line_cdata - skip unless Nokogumbo.const_get(:LINE_SUPPORTED) html = "\n\n" doc = Nokogiri.HTML5(html) - node = doc.at_xpath('/html/body/svg:svg/svg:script/text()') - assert node.cdata? - assert_equal 3, node.line + node = doc.at_xpath("/html/body/svg:svg/svg:script/text()") + assert(node.cdata?) + assert_equal(3, node.line) end -private + private def buffer - <<-EOF.gsub(/^ /, '') + <<-EOF.gsub(/^ /, "") @@ -354,4 +350,4 @@ def buffer def attributes(element) element.attributes.map { |name, attribute| [name, attribute.value] }.to_h end -end +end if Nokogiri.uses_gumbo? diff --git a/nokogumbo-import/test/test_null.rb b/test/html5/test_null.rb similarity index 81% rename from nokogumbo-import/test/test_null.rb rename to test/html5/test_null.rb index 8d23c5b0b9..d0a326c291 100644 --- a/nokogumbo-import/test/test_null.rb +++ b/test/html5/test_null.rb @@ -1,22 +1,22 @@ # encoding: utf-8 -require 'nokogumbo' -require 'minitest/autorun' +# frozen_string_literal: true +require "helper" -class TestNull < Minitest::Test +class TestHtml5Null < Nokogiri::TestCase def fragment(s) Nokogiri::HTML5.fragment(s, max_errors: 10) end def test_null_char_ref - frag = fragment('�') - assert_equal 1, frag.errors.length + frag = fragment("�") + assert_equal(1, frag.errors.length) end def test_data_state frag = fragment("\u0000") # 12.2.5.1 Data state: unexpected-null-character parse error # 12.2.6.4.7 The "in body" insertion mode: Parse error - assert_equal 2, frag.errors.length + assert_equal(2, frag.errors.length) end def test_data_rcdata_state @@ -24,7 +24,7 @@ def test_data_rcdata_state # state frag = fragment("") # 12.2.5.2 RCDATA state: 
unexpected-null-character parse error - assert_equal 1, frag.errors.length + assert_equal(1, frag.errors.length) end def test_data_scriptdata_state @@ -34,20 +34,20 @@ def test_data_scriptdata_state # data state frag = fragment("") # 12.2.5.4 Script data state: unexpected-null-character parse error - assert_equal 1, frag.errors.length + assert_equal(1, frag.errors.length) end def test_data_plaintext_state frag = fragment("

\u0000</plaintext>") # 12.2.5.5 PLAINTEXT state: unexpected-null-character parse error # EOF parse error because there's no way to switch out of plaintext! - assert_equal 2, frag.errors.length + assert_equal(2, frag.errors.length) end def test_data_tag_name_state frag = fragment("<x\u0000></x\ufffd>") # 12.2.5.8 Tag name state: unexpected-null-character parse error - assert_equal 1, frag.errors.length + assert_equal(1, frag.errors.length) end # XXX: There are 6 script states to test. @@ -55,7 +55,7 @@ def test_data_tag_name_state def test_attribute_name_state frag = fragment("<p \u0000>") # 12.2.5.33 Attribute name state - assert_equal 1, frag.errors.length + assert_equal(1, frag.errors.length) end def test_attribute_value_states @@ -63,7 +63,7 @@ def test_attribute_value_states # 12.2.5.36 Attribute value (double-quoted) state # 12.2.5.37 Attribute value (single-quoted) state # 12.2.5.38 Attribute value (unquoted) state - assert_equal 3, frag.errors.length + assert_equal(3, frag.errors.length) end def test_bogus_comment_state @@ -71,19 +71,19 @@ def test_bogus_comment_state # 12.2.5.42 Markup declaration open state: incorrectly-opened-comment # parse error # 12.2.5.41 Bogus comment state: unexpected-null-character parse error - assert_equal 2, frag.errors.length + assert_equal(2, frag.errors.length) end def test_comment_state frag = fragment("<!-- \u0000 -->") # 12.2.5.45 Comment state: unexpected-null-character parse error - assert_equal 1, frag.errors.length + assert_equal(1, frag.errors.length) end def test_doctype_name_states # There are two missing here for double quoted PUBLIC and SYSTEM values. 
doc = Nokogiri::HTML5.parse("<!DOCTYPE \u0000\u0000 PUBLIC '\u0000' '\u0000' \u0000>", - max_errors: 10) + max_errors: 10) # 12.2.5.54 Before DOCTYPE name state: unexpected-null-character parse # error # 12.2.5.55 DOCTYPE name state: unexpected-null-character parse error @@ -95,18 +95,18 @@ def test_doctype_name_states # unexpected-character-after-doctype-system-identifier parse error # 12.2.5.68 Bogus DOCTYPE state: unexpected-null-character parse error # 12.2.6.4.1 The "initial" insertion mode: parse error - assert_equal 7, doc.errors.length + assert_equal(7, doc.errors.length) end def test_cdata_section_state frag = fragment("<script>//<![CDATA[\n\u0000\n//]]></script>") # 12.2.6.5 The rules for parsing tokens in foreign content: parse error - assert_equal 1, frag.errors.length + assert_equal(1, frag.errors.length) end def test_error_api_with_null frag = fragment("<p \u0000>") - assert frag.errors.any? - assert_includes frag.errors[0].to_s, "<p \u0000>" + assert(frag.errors.any?) + assert_includes(frag.errors[0].to_s, "<p \u0000>") end -end +end if Nokogiri.uses_gumbo? 
diff --git a/nokogumbo-import/test/test_serialize.rb b/test/html5/test_serialize.rb similarity index 61% rename from nokogumbo-import/test/test_serialize.rb rename to test/html5/test_serialize.rb index f05d6c996d..32cd759061 100644 --- a/nokogumbo-import/test/test_serialize.rb +++ b/test/html5/test_serialize.rb @@ -1,29 +1,29 @@ # encoding: utf-8 -require 'nokogumbo' -require 'minitest/autorun' +# frozen_string_literal: true +require "helper" -class TestAPI < Minitest::Test +class TestHtml5Serialize < Nokogiri::TestCase # https://github.com/web-platform-tests/wpt/blob/master/html/syntax/serializing-html-fragments/initial-linefeed-pre.html def initial_linefeed_pre @initial_linefeed_pre ||= begin - html = <<-EOF.gsub(/^ /, '').freeze + html = <<~EOF.gsub(/^ /, "").freeze <!DOCTYPE html> <div id="outer"> <div id="inner"> <pre id="pre1"> x</pre> <pre id="pre2"> - + x</pre> <textarea id="textarea1"> x</textarea> <textarea id="textarea2"> - + x</textarea> <listing id="listing1"> x</listing> <listing id="listing2"> - + x</listing> </div> </div> @@ -36,22 +36,22 @@ def initial_linefeed_pre def test_initial_linefeed_pre_outer expected = %{\n<div id="inner">\n<pre id="pre1">x</pre>\n<pre id="pre2">\nx</pre>\n<textarea id="textarea1">x</textarea>\n<textarea id="textarea2">\nx</textarea>\n<listing id="listing1">x</listing>\n<listing id="listing2">\nx</listing>\n</div>\n} outer = initial_linefeed_pre.xpath('//div[@id="outer"]')[0] - refute_nil outer - assert_equal expected, outer.inner_html + refute_nil(outer) + assert_equal(expected, outer.inner_html) end def test_initial_linefeed_pre_inner expected = %{\n<pre id="pre1">x</pre>\n<pre id="pre2">\nx</pre>\n<textarea id="textarea1">x</textarea>\n<textarea id="textarea2">\nx</textarea>\n<listing id="listing1">x</listing>\n<listing id="listing2">\nx</listing>\n} inner = initial_linefeed_pre.at('//div[@id="inner"]') - refute_nil inner - assert_equal expected, inner.inner_html + refute_nil(inner) + assert_equal(expected, 
inner.inner_html) end %w[pre textarea listing].each do |tag| define_method("test_initial_linefeed_#{tag}1".to_sym) do elem = initial_linefeed_pre.at("//*[@id=\"#{tag}1\"]") refute_nil elem - assert_equal 'x', elem.inner_html + assert_equal "x", elem.inner_html end define_method("test_initial_linefeed_#{tag}2".to_sym) do @@ -197,7 +197,7 @@ def test_initial_linefeed_pre_inner # https://github.com/web-platform-tests/wpt/blob/master/html/syntax/serializing-html-fragments/serializing.html def serializing_test_data @serializing_test_data ||= begin - html = <<-EOF.gsub(/ /, '') + html = <<~EOF.gsub(/ /, "") <!DOCTYPE html> <div id="test" style="display:none"> <span></span> @@ -230,7 +230,7 @@ def serializing_test_data <span b=c></span> </div> EOF - Nokogiri::HTML5(html).xpath('/html/body/div/*') + Nokogiri::HTML5(html).xpath("/html/body/div/*") end @serializing_test_data end @@ -263,136 +263,136 @@ def serializing_test_data ["<noscript><&></noscript>", "<span><noscript><&></noscript></span>"], ["<!--data-->", "<span><!--data--></span>"], ["<a><b><c></c></b><d>e</d><f><g>h</g></f></a>", "<span><a><b><c></c></b><d>e</d><f><g>h</g></f></a></span>"], - ["", "<span b=\"c\"></span>"] + ["", "<span b=\"c\"></span>"], ].freeze DOM_TESTS = [ - ['Attribute in the XML namespace', - lambda do - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - svg = Nokogiri::XML::Element.new('svg', doc) - span.add_child(svg) - svg.add_namespace('xml', 'http://www.w3.org/XML/1998/namespace') - svg['xml:foo'] = 'test' - span - end, - '<svg xml:foo="test"></svg>', - '<span><svg xml:foo="test"></svg></span>'], + ["Attribute in the XML namespace", + lambda do + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + svg = Nokogiri::XML::Element.new("svg", doc) + span.add_child(svg) + svg.add_namespace("xml", "http://www.w3.org/XML/1998/namespace") + svg["xml:foo"] = "test" + span + end, + '<svg xml:foo="test"></svg>', + '<span><svg 
xml:foo="test"></svg></span>'], ["Attribute in the XML namespace with the prefix not set to xml:", - lambda do - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - svg = Nokogiri::XML::Element.new('svg', doc) - span.add_child(svg) - svg['abc:foo'] = 'test' - ns = svg.add_namespace('xml', 'http://www.w3.org/XML/1998/namespace') - svg.attribute('abc:foo').namespace = ns - span - end, - '<svg xml:foo="test"></svg>', - '<span><svg xml:foo="test"></svg></span>'], + lambda do + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + svg = Nokogiri::XML::Element.new("svg", doc) + span.add_child(svg) + svg["abc:foo"] = "test" + ns = svg.add_namespace("xml", "http://www.w3.org/XML/1998/namespace") + svg.attribute("abc:foo").namespace = ns + span + end, + '<svg xml:foo="test"></svg>', + '<span><svg xml:foo="test"></svg></span>'], ["Non-'xmlns' attribute in the xmlns namespace", - lambda do - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - svg = Nokogiri::XML::Element.new('svg', doc) - span.add_child(svg) - svg.add_namespace('xmlns', 'http://www.w3.org/2000/xmlns/') - svg['xmlns:foo'] = 'test' - span - end, - '<svg xmlns:foo="test"></svg>', - '<span><svg xmlns:foo="test"></svg></span>'], + lambda do + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + svg = Nokogiri::XML::Element.new("svg", doc) + span.add_child(svg) + svg.add_namespace("xmlns", "http://www.w3.org/2000/xmlns/") + svg["xmlns:foo"] = "test" + span + end, + '<svg xmlns:foo="test"></svg>', + '<span><svg xmlns:foo="test"></svg></span>'], ["'xmlns' attribute in the xmlns namespace", - lambda do - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - svg = Nokogiri::XML::Element.new('svg', doc) - span.add_child(svg) - svg.add_namespace('xmlns', 'http://www.w3.org/2000/xmlns/') - svg['xmlns'] = 'test' - span - end, - '<svg xmlns="test"></svg>', 
- '<span><svg xmlns="test"></svg></span>'], + lambda do + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + svg = Nokogiri::XML::Element.new("svg", doc) + span.add_child(svg) + svg.add_namespace("xmlns", "http://www.w3.org/2000/xmlns/") + svg["xmlns"] = "test" + span + end, + '<svg xmlns="test"></svg>', + '<span><svg xmlns="test"></svg></span>'], ["Attribute in non-standard namespace", - lambda do - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - svg = Nokogiri::XML::Element.new('svg', doc) - span.add_child(svg) - svg.add_namespace('abc', 'fake_ns') - svg['abc:def'] = 'test' - span - end, - '<svg abc:def="test"></svg>', - '<span><svg abc:def="test"></svg></span>'], + lambda do + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + svg = Nokogiri::XML::Element.new("svg", doc) + span.add_child(svg) + svg.add_namespace("abc", "fake_ns") + svg["abc:def"] = "test" + span + end, + '<svg abc:def="test"></svg>', + '<span><svg abc:def="test"></svg></span>'], ["<span> starting with U+000A", - lambda do - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - text = Nokogiri::XML::Text.new("\x0A", doc) - span.add_child(text) - span - end, - "\x0A", - "<span>\x0A</span>"], - #TODO: Processing instructions + lambda do + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + text = Nokogiri::XML::Text.new("\x0A", doc) + span.add_child(text) + span + end, + "\x0A", + "<span>\x0A</span>"], + # TODO: Processing instructions ] TEXT_ELEMENTS = %w[pre textarea listing] TEXT_TESTS = [ ["<%text> context starting with U+000A", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - elem = Nokogiri::XML::Element.new(tag, doc) - text = Nokogiri::XML::Text.new("\x0A", doc) - elem.add_child(text) - elem - end, + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + elem = Nokogiri::XML::Element.new(tag, doc) + text = 
Nokogiri::XML::Text.new("\x0A", doc) + elem.add_child(text) + elem + end, "\x0A", "<%text>\x0A</%text>"], ["<%text> context not starting with U+000A", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - elem = Nokogiri::XML::Element.new(tag, doc) - text = Nokogiri::XML::Text.new("a\x0A", doc) - elem.add_child(text) - elem - end, + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + elem = Nokogiri::XML::Element.new(tag, doc) + text = Nokogiri::XML::Text.new("a\x0A", doc) + elem.add_child(text) + elem + end, "a\x0A", "<%text>a\x0A</%text>"], ["<%text> non-context starting with U+000A", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - elem = Nokogiri::XML::Element.new(tag, doc) - span = Nokogiri::XML::Element.new('span', doc) - text = Nokogiri::XML::Text.new("\x0A", doc) - elem.add_child(text) - span.add_child(elem) - span - end, + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + elem = Nokogiri::XML::Element.new(tag, doc) + span = Nokogiri::XML::Element.new("span", doc) + text = Nokogiri::XML::Text.new("\x0A", doc) + elem.add_child(text) + span.add_child(elem) + span + end, "<%text>\x0A</%text>", "<span><%text>\x0A</%text></span>"], ["<%text> non-context not starting with U+000A", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - elem = Nokogiri::XML::Element.new(tag, doc) - span = Nokogiri::XML::Element.new('span', doc) - text = Nokogiri::XML::Text.new("a\x0A", doc) - elem.add_child(text) - span.add_child(elem) - span - end, + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + elem = Nokogiri::XML::Element.new(tag, doc) + span = Nokogiri::XML::Element.new("span", doc) + text = Nokogiri::XML::Text.new("a\x0A", doc) + elem.add_child(text) + span.add_child(elem) + span + end, "<%text>a\x0A</%text>", "<span><%text>a\x0A</%text></span>"], ] @@ -404,56 +404,52 @@ def serializing_test_data ] VOID_TESTS = [ ["Void context node", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - Nokogiri::XML::Element.new(tag, doc) - end, - "", 
- "<%void>"], + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + Nokogiri::XML::Element.new(tag, doc) + end, + "", + "<%void>"], ["void as first child with following siblings", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - span.add_child(Nokogiri::XML::Element.new(tag, doc)) - span.add_child(Nokogiri::XML::Element.new('a', doc)) - .add_child(Nokogiri::XML::Text.new('test', doc)) - span.add_child(Nokogiri::XML::Element.new('b', doc)) - span - end, - "<%void><a>test</a><b></b>", - "<span><%void><a>test</a><b></b></span>" - ], + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + span.add_child(Nokogiri::XML::Element.new(tag, doc)) + span.add_child(Nokogiri::XML::Element.new("a", doc)) + .add_child(Nokogiri::XML::Text.new("test", doc)) + span.add_child(Nokogiri::XML::Element.new("b", doc)) + span + end, + "<%void><a>test</a><b></b>", + "<span><%void><a>test</a><b></b></span>"], ["void as second child with following siblings", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - span = Nokogiri::XML::Element.new('span', doc) - span.add_child(Nokogiri::XML::Element.new('a', doc)) - .add_child(Nokogiri::XML::Text.new('test', doc)) - span.add_child(Nokogiri::XML::Element.new(tag, doc)) - span.add_child(Nokogiri::XML::Element.new('b', doc)) - span - end, - "<a>test</a><%void><b></b>", - "<span><a>test</a><%void><b></b></span>" - ], + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + span.add_child(Nokogiri::XML::Element.new("a", doc)) + .add_child(Nokogiri::XML::Text.new("test", doc)) + span.add_child(Nokogiri::XML::Element.new(tag, doc)) + span.add_child(Nokogiri::XML::Element.new("b", doc)) + span + end, + "<a>test</a><%void><b></b>", + "<span><a>test</a><%void><b></b></span>"], ["void as last child with preceding siblings", - lambda do |tag| - doc = Nokogiri::HTML5::Document.new - span = 
Nokogiri::XML::Element.new('span', doc) - span.add_child(Nokogiri::XML::Element.new('a', doc)) - .add_child(Nokogiri::XML::Text.new('test', doc)) - span.add_child(Nokogiri::XML::Element.new('b', doc)) - span.add_child(Nokogiri::XML::Element.new(tag, doc)) - span - end, - "<a>test</a><b></b><%void>", - "<span><a>test</a><b></b><%void></span>" - ], + lambda do |tag| + doc = Nokogiri::HTML5::Document.new + span = Nokogiri::XML::Element.new("span", doc) + span.add_child(Nokogiri::XML::Element.new("a", doc)) + .add_child(Nokogiri::XML::Text.new("test", doc)) + span.add_child(Nokogiri::XML::Element.new("b", doc)) + span.add_child(Nokogiri::XML::Element.new(tag, doc)) + span + end, + "<a>test</a><b></b><%void>", + "<span><a>test</a><b></b><%void></span>"], ] - # Generate tests def self.cross_map(a1, a2) rv = [] @@ -489,22 +485,22 @@ def self.cross_map(a1, a2) end cross_map(TEXT_TESTS, TEXT_ELEMENTS) do |test_data, tag| - define_method("test_serializing_text_innerHTML_#{test_data[0].gsub('%text', tag)}".to_sym) do - assert_equal test_data[2].gsub('%text', tag), test_data[1].call(tag).inner_html + define_method("test_serializing_text_innerHTML_#{test_data[0].gsub("%text", tag)}".to_sym) do + assert_equal test_data[2].gsub("%text", tag), test_data[1].call(tag).inner_html end - define_method("test_serialization_text_outerHTML_#{test_data[0].gsub('%text', tag)}".to_sym) do - assert_equal test_data[3].gsub('%text', tag), test_data[1].call(tag).serialize + define_method("test_serialization_text_outerHTML_#{test_data[0].gsub("%text", tag)}".to_sym) do + assert_equal test_data[3].gsub("%text", tag), test_data[1].call(tag).serialize end end cross_map(VOID_TESTS, VOID_ELEMENTS) do |test_data, tag| define_method("test_serializing_void_innerHTML_#{test_data[0]}_#{tag}".to_sym) do - assert_equal test_data[2].gsub('%void', tag), test_data[1].call(tag).inner_html + assert_equal test_data[2].gsub("%void", tag), test_data[1].call(tag).inner_html end 
define_method("test_serialization_void_outerHTML_#{test_data[0]}_#{tag}".to_sym) do - assert_equal test_data[3].gsub('%void', tag), test_data[1].call(tag).serialize + assert_equal test_data[3].gsub("%void", tag), test_data[1].call(tag).serialize end end -end +end if Nokogiri.uses_gumbo? diff --git a/test/html5/test_tree-construction.rb b/test/html5/test_tree-construction.rb new file mode 100644 index 0000000000..191142c2db --- /dev/null +++ b/test/html5/test_tree-construction.rb @@ -0,0 +1,276 @@ +# encoding: utf-8 +# frozen_string_literal: true +require "helper" + +if Nokogiri.uses_gumbo? + def parse_test(test_data) + test = { script: :both } + index = /(?:^#errors\n|\n#errors\n)/ =~ test_data + abort("Expected #errors in\n#{test_data}") if index.nil? + skip_amount = $~[0].length + # Omit the final new line + test[:data] = test_data[0...index] + + # Process the rest line by line + lines = test_data[index + skip_amount..-1].split("\n") + index = lines.find_index do |line| + line == "#document-fragment" || + line == "#document" || + line == "#script-off" || + line == "#script-on" || + line == "#new-errors" + end + abort("Expected #document") if index.nil? + test[:errors] = lines[0...index] + test[:new_errors] = [] + if lines[index] == "#new-errors" + index += 1 + until %w[#document-fragment #document #script-off #script-on].include?(lines[index]) + test[:new_errors] << lines[index] + index += 1 + end + end + + if lines[index] == "#document-fragment" + test[:context] = lines[index + 1].chomp.split(" ", 2) + index += 2 + end + abort("failed to find fragment: #{index}: #{lines[index]}") if test_data.include?("#document-fragment") && test[:context].nil? + + if lines[index] =~ /#script-(on|off)/ + test[:script] = $~[1].to_sym + index += 1 + end + + abort("Expected #document, got #{lines[index]}") unless lines[index] == "#document" + index += 1 + + document = { + type: test[:context] ? 
:fragment : :document, + children: [], + } + open_nodes = [document] + while index < lines.length + abort("Expected '| ' but got #{lines[index]}") unless /^\| ( *)([^ ].*$)/ =~ lines[index] + depth = $~[1].length + if depth.odd? + abort("Invalid nesting depth") + else + depth /= 2 + end + abort("Too deep") if depth >= open_nodes.length + + node = {} + node_text = $~[2] + if node_text[0] == '"' + if node_text == '"' || node_text[-1] != '"' + loop do + index += 1 + node_text << "\n" + lines[index] + break if node_text[-1] == '"' + end + end + node[:type] = :text + node[:contents] = node_text[1..-2] + elsif /^<!DOCTYPE ([^ >]*)(?: "([^"]*)" "(.*)")?>$/ =~ node_text + node[:type] = :doctype + node[:name] = $~[1] + node[:public_id] = $~[2].nil? || $~[2].empty? ? nil : $~[2] + node[:system_id] = $~[3].nil? || $~[3].empty? ? nil : $~[3] + elsif /^<!-- (.*) -->$/ =~ node_text + node[:type] = :comment + node[:contents] = $~[1] + elsif /^<(svg |math )?(.+)>$/ =~ node_text + node[:type] = :element + node[:ns] = $~[1].nil? ? nil : $~[1].rstrip + node[:tag] = $~[2] + node[:attributes] = [] + node[:children] = [] + elsif /^([^ ]+ )?([^=]+)="(.*)"$/ =~ node_text + node[:type] = :attribute + node[:ns] = $~[1].nil? ? nil : $~[1].rstrip + node[:name] = $~[2] + node[:value] = $~[3] + elsif node_text == "content" + node[:type] = :template + else + abort("Unexpected node_text: #{node_text}") + end + + if node[:type] == :attribute + abort("depth #{depth} != #{open_nodes.length}") unless depth == open_nodes.length - 1 + abort("type :#{open_nodes[-1][:type]} != :element") unless open_nodes[-1][:type] == :element + abort("element has children") unless open_nodes[-1][:children].empty? 
+ open_nodes[-1][:attributes] << node + elsif node[:type] == :template + abort("depth #{depth} != #{open_nodes.length}") unless depth == open_nodes.length - 1 + abort("type :#{open_nodes[-1][:type]} != :element") unless open_nodes[-1][:type] == :element + abort("tag :#{open_nodes[-1][:tag]} != template") unless open_nodes[-1][:tag] == "template" + abort("template has children before the 'content'") unless open_nodes[-1][:children].empty? + # Hack. We want the children of this template node to be reparented as + # children of the template element. + # XXX: Template contents are _not_ supposed to be children of the + # template, but we currently mishandle this. + open_nodes << open_nodes[-1] + else + open_nodes[depth][:children] << node + open_nodes[depth + 1..-1] = [] + if node[:type] == :element + open_nodes << node + end + end + index += 1 + end + test[:document] = document + test + end + + class TestHtml5TreeConstructionBase < Nokogiri::TestCase + def assert_equal_or_nil(exp, act) + if exp.nil? + assert_nil(act) + else + assert_equal(exp, act) + end + end + + def compare_nodes(node, ng_node) + case ng_node.type + when Nokogiri::XML::Node::ELEMENT_NODE + assert_equal(node[:type], :element) + if node[:ns] + refute_nil(ng_node.namespace) + assert_equal(node[:ns], ng_node.namespace.prefix) + end + assert_equal(node[:tag], ng_node.name) + attributes = ng_node.attributes + assert_equal(node[:attributes].length, attributes.length) + node[:attributes].each do |attr| + value = if attr[:ns] + ng_node["#{attr[:ns]}:#{attr[:name]}"] + else + attributes[attr[:name]].value + end + assert_equal(attr[:value], value) + end + assert_equal(node[:children].length, ng_node.children.length, + "Element <#{node[:tag]}> has wrong number of children: #{ng_node.children.map { |c| c.name }}") + when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE + # We preserve the CDATA in the tree, but the tests represent it as text. 
+ assert_equal(node[:type], :text) + assert_equal(node[:contents], ng_node.content) + when Nokogiri::XML::Node::COMMENT_NODE + assert_equal(node[:type], :comment) + assert_equal(node[:contents], ng_node.content) + when Nokogiri::XML::Node::HTML_DOCUMENT_NODE + assert_equal(node[:type], :document) + assert_equal(node[:children].length, ng_node.children.length) + when Nokogiri::XML::Node::DOCUMENT_FRAG_NODE + assert_equal(node[:type], :fragment) + assert_equal(node[:children].length, ng_node.children.length) + when Nokogiri::XML::Node::DTD_NODE + assert_equal(node[:type], :doctype) + assert_equal(node[:name], ng_node.name) + assert_equal_or_nil(node[:public_id], ng_node.external_id) + assert_equal_or_nil(node[:system_id], ng_node.system_id) + else + flunk("Unknown node type #{ng_node.type} (expected #{node[:type]})") + end + end + + def run_test + if @test[:context] + ctx = @test[:context].join(":") + doc = Nokogiri::HTML5::Document.new + doc = Nokogiri::HTML5::DocumentFragment.new(doc, @test[:data], ctx, max_errors: @test[:errors].length + 10) + else + doc = Nokogiri::HTML5.parse(@test[:data], max_errors: @test[:errors].length + 10) + end + # Walk the tree. + exp_nodes = [@test[:document]] + act_nodes = [doc] + children = [0] + compare_nodes(exp_nodes[0], doc) + while children.any? + child_index = children[-1] + exp = exp_nodes[-1] + act = act_nodes[-1] + if child_index == exp[:children].length + exp_nodes.pop + act_nodes.pop + children.pop + next + end + exp_child = exp[:children][child_index] + act_child = act.children[child_index] + compare_nodes(exp_child, act_child) + children[-1] = child_index + 1 + next unless exp_child.has_key?(:children) + exp_nodes << exp_child + act_nodes << act_child + children << 0 + end + + # Test the errors. + assert_equal(@test[:errors].length, doc.errors.length) + + # The new, standardized tokenizer errors live in @test[:new_errors]. Let's + # match each one to exactly one error in doc.errors. 
Unfortunately, the + # tests specify the column the error is detected, _not_ the column of the + # start of the problematic HTML (e.g., the start of a character reference + # or <![CDATA[) the way gumbo does. So check that Gumbo's column is no + # later than the error's column. + errors = doc.errors.map { |err| { line: err.line, column: err.column, code: err.str1 } } + errors.reject! { |err| err[:code] == "generic-parser" } + error_regex = /^\((?<line>\d+):(?<column>\d+)(?:-\d+:\d+)?\) (?<code>.*)$/ + @test[:new_errors].each do |err| + assert_match(error_regex, err) + m = err.match(error_regex) + line = m[:line].to_i + column = m[:column].to_i + code = m[:code] + idx = errors.index do |e| + e[:line] == line && + e[:code] == code && + e[:column] <= column + end + # This error should be the first error in the list. + # refute_nil(idx, "Expected to find error #{code} at #{line}:#{column}") + assert_equal(0, idx, "Expected to find error #{code} at #{line}:#{column}") + errors.delete_at(idx) + end + end + end + + tc_path = File.expand_path("../../html5lib-tests/tree-construction", __FILE__) + Dir[File.join(tc_path, "*.dat")].each do |path| + test_name = "TestHtml5TreeConstruction" + File.basename(path, ".dat") + .split(/[_-]/) + .map { |s| s.capitalize } + .join("") + tests = [] + File.open(path, "r", encoding: "UTF-8") do |f| + f.each("\n\n#data\n") do |test_data| + if test_data.start_with?("#data\n") + test_data = test_data[6..-1] + end + if test_data.end_with?("\n\n#data\n") + test_data = test_data[0..-9] + end + tests << parse_test(test_data) + end + end + + klass = Class.new(TestHtml5TreeConstructionBase) do + tests.each_with_index do |test, index| + next if test[:script] == :on + define_method "test_#{index}".to_sym do + @test = test + @index = index + run_test + end + end + end + Object.const_set(test_name, klass) + end +end diff --git a/test/test_nokogumbo_contract.rb b/test/test_nokogumbo_contract.rb new file mode 100644 index 0000000000..83df5837ea --- 
/dev/null +++ b/test/test_nokogumbo_contract.rb @@ -0,0 +1,26 @@ +require "helper" + +describe "Nokogumbo contract expectations" do + # per https://github.com/rubys/nokogumbo/pull/171 + it "includes the HTML5 public interface" do + skip("Gumbo is not supported on this platform") unless Nokogiri.uses_gumbo? + + assert_includes(::Nokogiri.singleton_methods, :HTML5) + + assert_equal(defined?(::Nokogiri::HTML5), "constant") + assert_includes(::Nokogiri::HTML5.singleton_methods, :parse) + assert_includes(::Nokogiri::HTML5.singleton_methods, :fragment) + + assert_equal(defined?(::Nokogiri::HTML5::Node), "constant") + assert_equal(defined?(::Nokogiri::HTML5::Document), "constant") + assert_equal(defined?(::Nokogiri::HTML5::DocumentFragment), "constant") + end + + it "includes a replacement for the Nokogumbo private interface" do + skip("Gumbo is not supported on this platform") unless Nokogiri.uses_gumbo? + + assert_equal(defined?(::Nokogiri::Gumbo), "constant") + assert_includes(::Nokogiri::Gumbo.singleton_methods, :parse) + assert_includes(::Nokogiri::Gumbo.singleton_methods, :fragment) + end +end