Skip to content

Commit

Permalink
fix: skip relinking namespaces in HTML4/5 documents
Browse files Browse the repository at this point in the history
Also fix both implementations of relink_namespaces to not skip
attributes when the node itself doesn't have a namespace.

Finally, the private method
XML::Node#add_child_node_and_reparent_attrs is no longer needed after
fixing relink_namespaces, so this commit essentially reverts 9fd03d8.
  • Loading branch information
flavorjones committed Aug 30, 2022
1 parent 90b3dd0 commit 8dde633
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 80 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Expand Up @@ -50,6 +50,11 @@ This version of Nokogiri uses [`jar-dependencies`](https://github.com/mkristian/
* [JRuby] `NodeSet#[]` now raises a TypeError if passed an invalid parameter type. [[#2211](https://github.com/sparklemotion/nokogiri/issues/2211)]


### Fixed

* [CRuby] When reparenting HTML nodes, do not attempt to relink namespaces. Previously, an HTML attribute with a colon might be interpreted as a prefixed/namespaced atttribute (for example, "xml:lang"). [[#1790](https://github.com/sparklemotion/nokogiri/issues/1790)]


### Improved

* Serialization of HTML5 documents and fragments has been re-implemented and is ~10x faster than previous versions. [[#2596](https://github.com/sparklemotion/nokogiri/issues/2596), [#2569](https://github.com/sparklemotion/nokogiri/issues/2569)]
Expand Down
42 changes: 22 additions & 20 deletions ext/java/nokogiri/XmlNode.java
Expand Up @@ -440,24 +440,24 @@ public class XmlNode extends RubyObject
set_namespace(context, ((XmlNode)parent(context)).namespace(context));
}
return;
}

String currentPrefix = e.getParentNode().lookupPrefix(nsURI);
String currentURI = e.getParentNode().lookupNamespaceURI(prefix);
boolean isDefault = e.getParentNode().isDefaultNamespace(nsURI);

// add xmlns attribute if this is a new root node or if the node's
// namespace isn't a default namespace in the new document
if (e.getParentNode().getNodeType() == Node.DOCUMENT_NODE) {
// this is the root node, so we must set the namespaces attributes anyway
e.setAttribute(prefix == null ? "xmlns" : "xmlns:" + prefix, nsURI);
} else if (prefix == null) {
// this is a default namespace but isn't the default where this node is being added
if (!isDefault) { e.setAttribute("xmlns", nsURI); }
} else if (!prefix.equals(currentPrefix) || nsURI.equals(currentURI)) {
// this is a prefixed namespace
// but doesn't have the same prefix or the prefix is set to a different URI
e.setAttribute("xmlns:" + prefix, nsURI);
} else {
String currentPrefix = e.getParentNode().lookupPrefix(nsURI);
String currentURI = e.getParentNode().lookupNamespaceURI(prefix);
boolean isDefault = e.getParentNode().isDefaultNamespace(nsURI);

// add xmlns attribute if this is a new root node or if the node's
// namespace isn't a default namespace in the new document
if (e.getParentNode().getNodeType() == Node.DOCUMENT_NODE) {
// this is the root node, so we must set the namespaces attributes anyway
e.setAttribute(prefix == null ? "xmlns" : "xmlns:" + prefix, nsURI);
} else if (prefix == null) {
// this is a default namespace but isn't the default where this node is being added
if (!isDefault) { e.setAttribute("xmlns", nsURI); }
} else if (!prefix.equals(currentPrefix) || nsURI.equals(currentURI)) {
// this is a prefixed namespace
// but doesn't have the same prefix or the prefix is set to a different URI
e.setAttribute("xmlns:" + prefix, nsURI);
}
}

if (e.hasAttributes()) {
Expand Down Expand Up @@ -492,8 +492,10 @@ public class XmlNode extends RubyObject
}
}

if (this.node.hasChildNodes()) {
relink_namespace(context, getChildren());
if (nsURI != null && !nsURI.isEmpty()) {
if (this.node.hasChildNodes()) {
relink_namespace(context, getChildren());
}
}
}

Expand Down
29 changes: 16 additions & 13 deletions ext/nokogiri/xml_node.c
Expand Up @@ -153,25 +153,25 @@ relink_namespace(xmlNodePtr reparented)
}
}

/* Only walk all children if there actually is a namespace we need to */
/* reparent. */
if (NULL == reparented->ns) { return; }

/* When a node gets reparented, walk it's children to make sure that */
/* their namespaces are reparented as well. */
child = reparented->children;
while (NULL != child) {
relink_namespace(child);
child = child->next;
}

if (reparented->type == XML_ELEMENT_NODE) {
attr = reparented->properties;
while (NULL != attr) {
relink_namespace((xmlNodePtr)attr);
attr = attr->next;
}
}

/* Only walk all children if there actually is a namespace we need to */
/* reparent. */
if (reparented->ns) {
/* When a node gets reparented, walk it's children to make sure that */
/* their namespaces are reparented as well. */
child = reparented->children;
while (NULL != child) {
relink_namespace(child);
child = child->next;
}
}
}


Expand Down Expand Up @@ -408,7 +408,10 @@ reparent_node_with(VALUE pivot_obj, VALUE reparentee_obj, pivot_reparentee_func
/* if we've created a cycle, raise an exception */
raise_if_ancestor_of_self(reparented);

relink_namespace(reparented);
/* HTML doesn't have namespaces */
if (!rb_obj_is_kind_of(DOC_RUBY_OBJECT(reparented->doc), cNokogiriHtml4Document)) {
relink_namespace(reparented);
}

return reparented_obj ;
}
Expand Down
23 changes: 1 addition & 22 deletions lib/nokogiri/html5/node.rb
Expand Up @@ -67,29 +67,8 @@ def fragment(tags)

DocumentFragment.new(document, tags, self)
end

private

# HTML elements can have attributes that contain colons.
# Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
# and tries to create an attribute in a namespace. This is especially
# annoying with attribute names like xml:lang since libxml2 will
# actually create the xml namespace if it doesn't exist already.
def add_child_node_and_reparent_attrs(node)
return super(node) unless document.is_a?(HTML5::Document)

# I'm not sure what this method is supposed to do. Reparenting
# namespaces is handled by libxml2, including child namespaces which
# this method wouldn't handle.
# https://github.com/sparklemotion/nokogiri/issues/1790
add_child_node(node)
# node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
# attr.remove
# ns = attr.namespace
# a["#{ns.prefix}:#{attr.name}"] = attr.value
# end
end
end

# Monkey patch
XML::Node.prepend(HTML5::Node)
end
Expand Down
16 changes: 4 additions & 12 deletions lib/nokogiri/xml/node.rb
Expand Up @@ -145,9 +145,9 @@ def decorate!
def add_child(node_or_tags)
node_or_tags = coerce(node_or_tags)
if node_or_tags.is_a?(XML::NodeSet)
node_or_tags.each { |n| add_child_node_and_reparent_attrs(n) }
node_or_tags.each { |n| add_child_node(n) }
else
add_child_node_and_reparent_attrs(node_or_tags)
add_child_node(node_or_tags)
end
node_or_tags
end
Expand Down Expand Up @@ -263,9 +263,9 @@ def children=(node_or_tags)
node_or_tags = coerce(node_or_tags)
children.unlink
if node_or_tags.is_a?(XML::NodeSet)
node_or_tags.each { |n| add_child_node_and_reparent_attrs(n) }
node_or_tags.each { |n| add_child_node(n) }
else
add_child_node_and_reparent_attrs(node_or_tags)
add_child_node(node_or_tags)
end
end

Expand Down Expand Up @@ -1392,14 +1392,6 @@ def inspect_attributes
end

IMPLIED_XPATH_CONTEXTS = [".//"].freeze

def add_child_node_and_reparent_attrs(node)
add_child_node(node)
node.attribute_nodes.find_all { |a| a.name.include?(":") }.each do |attr_node|
attr_node.remove
node[attr_node.name] = attr_node.value
end
end
end
end
end
Expand Down
26 changes: 13 additions & 13 deletions test/namespaces/test_namespaces_in_created_doc.rb
Expand Up @@ -100,19 +100,19 @@ def test_created_namespaced_attribute_on_unparented_node_is_renamespaced_in_xml_
assert(child.attribute_nodes.first.namespace)
end

# def test_created_namespaced_attribute_on_unparented_node_is_not_renamespaced_in_html4_doc
# doc1 = Nokogiri::HTML4("<html><body></body></html>")
# doc2 = Nokogiri::HTML4("<html><body></body></html>")

# child = doc1.create_element("div")
# child["tempname"] = "en"
# attr = child.attribute("tempname")
# attr.name = "xml:lang"
# assert_nil(child.attribute_nodes.first.namespace)

# doc2.at_css("body").add_child(child)
# assert_nil(child.attribute_nodes.first.namespace)
# end
def test_created_namespaced_attribute_on_unparented_node_is_not_renamespaced_in_html4_doc
doc1 = Nokogiri::HTML4("<html><body></body></html>")
doc2 = Nokogiri::HTML4("<html><body></body></html>")

child = doc1.create_element("div")
child["tempname"] = "en"
attr = child.attribute("tempname")
attr.name = "xml:lang"
assert_nil(child.attribute_nodes.first.namespace)

doc2.at_css("body").add_child(child)
assert_nil(child.attribute_nodes.first.namespace)
end

def test_created_namespaced_attribute_on_unparented_node_is_not_renamespaced_in_html5_doc
skip("HTML5 not supported on this platform yet") unless defined?(Nokogiri::HTML5)
Expand Down

0 comments on commit 8dde633

Please sign in to comment.