Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

c99 fixes #2303

Merged
merged 6 commits into from Aug 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 9 additions & 2 deletions CHANGELOG.md
Expand Up @@ -4,18 +4,25 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## 1.12.3 / unreleased

### Fixed

* [CRuby] Fix compilation of libgumbo on older systems with versions of GCC that give errors on C99-isms. Affected systems include RHEL6, RHEL7, and SLES12. [[#2302](https://github.com/sparklemotion/nokogiri/issues/2302)]


## 1.12.2 / 2021-08-04

### Fixed

* Ensure that C extension files in non-native gem installations are loaded using `require` and rely on `$LOAD_PATH` instead of using `require_relative`. This issue only exists when deleting shared libraries that exist outside the extensions directory, something users occasionally do to conserve disk space. [[#2300](https://github.com/sparklemotion/nokogiri/issues/2300)]
* [CRuby] Ensure that C extension files in non-native gem installations are loaded using `require` and rely on `$LOAD_PATH` instead of using `require_relative`. This issue only exists when deleting shared libraries that exist outside the extensions directory, something users occasionally do to conserve disk space. [[#2300](https://github.com/sparklemotion/nokogiri/issues/2300)]


## 1.12.1 / 2021-08-03

### Fixed

* Fix compilation of libgumbo on BSD systems by avoiding GNU-isms. [[#2298](https://github.com/sparklemotion/nokogiri/issues/2298)]
* [CRuby] Fix compilation of libgumbo on BSD systems by avoiding GNU-isms. [[#2298](https://github.com/sparklemotion/nokogiri/issues/2298)]


## 1.12.0 / 2021-08-02
Expand Down
157 changes: 98 additions & 59 deletions ext/nokogiri/gumbo.c
Expand Up @@ -75,7 +75,7 @@ new_html_doc(const char *dtd_name, const char *system, const char *public)
htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
assert(doc);
if (dtd_name) {
xmlCreateIntSubset(doc, BAD_CAST dtd_name, BAD_CAST public, BAD_CAST system);
xmlCreateIntSubset(doc, (const xmlChar *)dtd_name, (const xmlChar *)public, (const xmlChar *)system);
}
return doc;
}
Expand All @@ -89,15 +89,19 @@ get_parent(xmlNodePtr node)
static GumboOutput *
perform_parse(const GumboOptions *options, VALUE input)
{
GumboOutput *output;
const char *status_string;

assert(RTEST(input));
Check_Type(input, T_STRING);
GumboOutput *output = gumbo_parse_with_options(
options,
RSTRING_PTR(input),
RSTRING_LEN(input)
);

const char *status_string = gumbo_status_to_string(output->status);
output = gumbo_parse_with_options(
options,
RSTRING_PTR(input),
RSTRING_LEN(input)
);

status_string = gumbo_status_to_string(output->status);
switch (output->status) {
case GUMBO_STATUS_OK:
break;
Expand All @@ -120,11 +124,11 @@ lookup_or_add_ns(
const char *prefix
)
{
xmlNsPtr ns = xmlSearchNs(doc, root, BAD_CAST prefix);
xmlNsPtr ns = xmlSearchNs(doc, root, (const xmlChar *)prefix);
if (ns) {
return ns;
}
return xmlNewNs(root, BAD_CAST href, BAD_CAST prefix);
return xmlNewNs(root, (const xmlChar *)href, (const xmlChar *)prefix);
}

static void
Expand All @@ -151,9 +155,13 @@ build_tree(
size_t child_index = 0;

while (true) {
const GumboVector *children;
const GumboNode *gumbo_child;
xmlNodePtr xml_child;

assert(gumbo_node != NULL);
const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
&gumbo_node->v.document.children : &gumbo_node->v.element.children;
if (child_index >= children->length) {
// Move up the tree and to the next child.
if (xml_node == xml_output_node) {
Expand All @@ -172,42 +180,44 @@ build_tree(
}
continue;
}
const GumboNode *gumbo_child = children->data[child_index++];
xmlNodePtr xml_child;

gumbo_child = children->data[child_index++];
switch (gumbo_child->type) {
case GUMBO_NODE_DOCUMENT:
abort(); // Bug in Gumbo.

case GUMBO_NODE_TEXT:
case GUMBO_NODE_WHITESPACE:
xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
xml_child = xmlNewDocText(doc, (const xmlChar *)gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_CDATA:
xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
xml_child = xmlNewCDataBlock(doc, (const xmlChar *)gumbo_child->v.text.text,
(int) strlen(gumbo_child->v.text.text));
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_COMMENT:
xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
xml_child = xmlNewDocComment(doc, (const xmlChar *)gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_TEMPLATE:
// XXX: Should create a template element and a new DocumentFragment
case GUMBO_NODE_ELEMENT: {
xml_child = xmlNewDocNode(doc, NULL, BAD_CAST gumbo_child->v.element.name, NULL);
xmlNsPtr ns = NULL;
const GumboVector *attrs;
size_t i;

xml_child = xmlNewDocNode(doc, NULL, (const xmlChar *)gumbo_child->v.element.name, NULL);
set_line(xml_child, gumbo_child->v.element.start_pos.line);
if (xml_root == NULL) {
xml_root = xml_child;
}
xmlNsPtr ns = NULL;
switch (gumbo_child->v.element.tag_namespace) {
case GUMBO_NAMESPACE_HTML:
break;
Expand All @@ -224,8 +234,8 @@ build_tree(
xmlAddChild(xml_node, xml_child);

// Add the attributes.
const GumboVector *attrs = &gumbo_child->v.element.attributes;
for (size_t i = 0; i < attrs->length; i++) {
attrs = &gumbo_child->v.element.attributes;
for (i = 0; i < attrs->length; i++) {
const GumboAttribute *attr = attrs->data[i];

switch (attr->attr_namespace) {
Expand All @@ -244,7 +254,7 @@ build_tree(
default:
ns = NULL;
}
xmlNewNsProp(xml_child, ns, BAD_CAST attr->name, BAD_CAST attr->value);
xmlNewNsProp(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value);
}

// Add children for this element.
Expand All @@ -264,19 +274,29 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)

// Add parse errors to rdoc.
if (output->errors.length) {
size_t i;
const GumboVector *errors = &output->errors;
VALUE rerrors = rb_ary_new2(errors->length);

for (size_t i = 0; i < errors->length; i++) {
GumboError *err = errors->data[i];
GumboSourcePosition position = gumbo_error_position(err);
for (i = 0; i < errors->length; i++) {
GumboError *err;
GumboSourcePosition position;
char *msg;
size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
VALUE err_str = rb_utf8_str_new(msg, size);
size_t size;
VALUE err_str;
VALUE syntax_error;
const char *error_code;
VALUE str1;

err = errors->data[i];
position = gumbo_error_position(err);
size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
err_str = rb_utf8_str_new(msg, size);
free(msg);
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
const char *error_code = gumbo_error_code(err);
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
error_code = gumbo_error_code(err);
str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;

rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
Expand All @@ -303,7 +323,7 @@ typedef struct {
static VALUE
parse_cleanup(VALUE parse_args)
{
ParseArgs *args = (ParseArgs*)parse_args;
ParseArgs *args = (ParseArgs *)parse_args;
gumbo_destroy_output(args->output);
// Make sure garbage collection doesn't mark the objects as being live based
// on references from the ParseArgs. This may be unnecessary.
Expand All @@ -323,28 +343,32 @@ static VALUE parse_continue(VALUE parse_args);
static VALUE
parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
{
GumboOutput *output;
GumboOptions options = kGumboDefaultOptions;
ParseArgs args;

options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
options.max_tree_depth = NUM2INT(max_depth);

GumboOutput *output = perform_parse(&options, input);
ParseArgs args = {
.output = output,
.input = input,
.url_or_frag = url,
.doc = NULL,
};
output = perform_parse(&options, input);

args.output = output;
args.input = input;
args.url_or_frag = url;
args.doc = NULL;

return rb_ensure(parse_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
}

static VALUE
parse_continue(VALUE parse_args)
{
ParseArgs *args = (ParseArgs*)parse_args;
ParseArgs *args = (ParseArgs *)parse_args;
GumboOutput *output = args->output;
xmlDocPtr doc;
VALUE rdoc;

if (output->document->v.document.has_doctype) {
const char *name = output->document->v.document.name;
const char *public = output->document->v.document.public_identifier;
Expand All @@ -357,7 +381,7 @@ parse_continue(VALUE parse_args)
}
args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
build_tree(doc, (xmlNodePtr)doc, output->document);
VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
add_errors(output, rdoc, args->input, args->url_or_frag);
return rdoc;
Expand All @@ -366,10 +390,15 @@ parse_continue(VALUE parse_args)
static int
lookup_namespace(VALUE node, bool require_known_ns)
{
VALUE ns;
ID namespace, href;
const char *href_ptr;
size_t href_len;

CONST_ID(namespace, "namespace");
CONST_ID(href, "href");
VALUE ns = rb_funcall(node, namespace, 0);

ns = rb_funcall(node, namespace, 0);

if (NIL_P(ns)) {
return GUMBO_NAMESPACE_HTML;
Expand All @@ -378,8 +407,8 @@ lookup_namespace(VALUE node, bool require_known_ns)
assert(RTEST(ns));
Check_Type(ns, T_STRING);

const char *href_ptr = RSTRING_PTR(ns);
size_t href_len = RSTRING_LEN(ns);
href_ptr = RSTRING_PTR(ns);
href_len = RSTRING_LEN(ns);
#define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
return GUMBO_NAMESPACE_HTML;
Expand Down Expand Up @@ -427,15 +456,23 @@ fragment(
GumboQuirksModeEnum quirks_mode;
bool form = false;
const char *encoding = NULL;
VALUE doc, dtd;
int depth;
GumboOptions options = kGumboDefaultOptions;
GumboOutput *output;
ParseArgs args;

if (NIL_P(ctx)) {
ctx_tag = "body";
ctx_ns = GUMBO_NAMESPACE_HTML;
} else if (TYPE(ctx) == T_STRING) {
size_t len;
const char *colon;

ctx_tag = StringValueCStr(ctx);
ctx_ns = GUMBO_NAMESPACE_HTML;
size_t len = RSTRING_LEN(ctx);
const char *colon = memchr(ctx_tag, ':', len);
len = RSTRING_LEN(ctx);
colon = memchr(ctx_tag, ':', len);
if (colon) {
switch (colon - ctx_tag) {
case 3:
Expand Down Expand Up @@ -470,6 +507,7 @@ fragment(
// Check if it's a form.
form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
} else {
VALUE node, element_name;
ID element_ = rb_intern_const("element?");

// Context fragment name.
Expand All @@ -482,13 +520,13 @@ fragment(
ctx_ns = lookup_namespace(ctx, true);

// Check for a form ancestor, including self.
for (VALUE node = ctx;
for (node = ctx;
!NIL_P(node);
node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
if (!RTEST(rb_funcall(node, element_, 0))) {
continue;
}
VALUE element_name = rb_funcall(node, name, 0);
element_name = rb_funcall(node, name, 0);
if (RSTRING_LEN(element_name) == 4
&& !st_strcasecmp(RSTRING_PTR(element_name), "form")
&& lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
Expand All @@ -510,8 +548,8 @@ fragment(
}

// Quirks mode.
VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
VALUE dtd = rb_funcall(doc, internal_subset, 0);
doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
dtd = rb_funcall(doc, internal_subset, 0);
if (NIL_P(dtd)) {
quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
} else {
Expand All @@ -526,8 +564,8 @@ fragment(
}

// Perform a fragment parse.
int depth = NUM2INT(max_depth);
GumboOptions options = kGumboDefaultOptions;
depth = NUM2INT(max_depth);
options = kGumboDefaultOptions;
options.max_attributes = NUM2INT(max_attributes);
options.max_errors = NUM2INT(max_errors);
// Add one to account for the HTML element.
Expand All @@ -538,27 +576,28 @@ fragment(
options.quirks_mode = quirks_mode;
options.fragment_context_has_form_ancestor = form;

GumboOutput *output = perform_parse(&options, tags);
ParseArgs args = {
.output = output,
.input = tags,
.url_or_frag = doc_fragment,
.doc = (xmlDocPtr)extract_xml_node(doc),
};
output = perform_parse(&options, tags);

args.output = output;
args.input = tags;
args.url_or_frag = doc_fragment;
args.doc = (xmlDocPtr)extract_xml_node(doc);

rb_ensure(fragment_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
return Qnil;
}

static VALUE
fragment_continue(VALUE parse_args)
{
ParseArgs *args = (ParseArgs*)parse_args;
ParseArgs *args = (ParseArgs *)parse_args;
GumboOutput *output = args->output;
VALUE doc_fragment = args->url_or_frag;
xmlDocPtr xml_doc = args->doc;
xmlNodePtr xml_frag;

args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
xml_frag = extract_xml_node(doc_fragment);
build_tree(xml_doc, xml_frag, output->root);
add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
return Qnil;
Expand Down