Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Include line numbers from the parser. #55

Merged
merged 1 commit into from Sep 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Expand Up @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Added option `:max_errors` to control the maximum number of errors reported
by `#errors`
- Added option `:max_tree_depth` to control the maximum parse tree depth.
- Line number support via `Nokogiri::XML::Node#line` as long as Nokogumbo has
been compiled with libxml2 support.

### Changed
- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
Expand Down
30 changes: 30 additions & 0 deletions README.md
Expand Up @@ -202,6 +202,36 @@ rules defined in the HTML5 specification for doing so.
* Instead of returning `unknown` as the element name for unknown tags, the
original tag name is returned verbatim.

# Flavors of Nokogumbo
Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up
parsing. If the libxml2 headers are not available, then Nokogumbo resorts to
using Nokogiri's Ruby API to construct the DOM tree.

Nokogiri can be configured to either use the system library version of libxml2
or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri
will use a bundled version.

To prevent differences between versions of libxml2, Nokogumbo will only use
libxml2 if the build process can find the exact same version used by Nokogiri.
This leads to three possibilities:

1. Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will
(by default) use the same version of libxml2.
2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2
headers are available, then Nokogumbo will (by default) use the system
version and headers.
3. Nokogiri is compiled with the system libxml2 but its headers aren't
available at build time for Nokogumbo. In this case, Nokogumbo will use the
slower Ruby API.

Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec
rake` or to `gem install`. Using libxml2 can be prohibited by instead passing
`-- --without-libxml2`.

Functionally, the only difference between using libxml2 or not is in the
behavior of `Nokogiri::XML::Node#line`. If libxml2 is used, then `#line` will
return the line number of the corresponding node. Otherwise, it will return 0.

# Installation

git clone https://github.com/rubys/nokogumbo.git
Expand Down
30 changes: 24 additions & 6 deletions ext/nokogumbo/nokogumbo.c
Expand Up @@ -290,6 +290,17 @@ static xmlNsPtr lookup_or_add_ns (
#endif
}

// Record the source line number `line` on `node`.
//
// The value is only stored in the NGLIB build, where `node->line` is written
// directly; in the other build this function currently does nothing.
static void set_line(xmlNodePtr node, size_t line) {
#if NGLIB
  // libxml2 uses 65535 to mean "look elsewhere for the line number" on some
  // nodes, so values at or above that marker are left unset.
  if (line >= 65535)
    return;
  node->line = (unsigned short)line;
#else
  // XXX: If Nokogiri gets a `#line=` method, we'll use that.
#endif
}

// Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
// at gumbo_node.
static void build_tree (
Expand Down Expand Up @@ -323,31 +334,38 @@ static void build_tree (
continue;
}
const GumboNode *gumbo_child = children->data[child_index++];
xmlNodePtr xml_child;

switch (gumbo_child->type) {
case GUMBO_NODE_DOCUMENT:
abort(); // Bug in Gumbo.

case GUMBO_NODE_TEXT:
case GUMBO_NODE_WHITESPACE:
xmlAddChild(xml_node, xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text));
xml_child = xmlNewDocText(doc, BAD_CAST gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_CDATA:
xmlAddChild(xml_node,
xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
(int) strlen(gumbo_child->v.text.text)));
xml_child = xmlNewCDataBlock(doc, BAD_CAST gumbo_child->v.text.text,
(int) strlen(gumbo_child->v.text.text));
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_COMMENT:
xmlAddChild(xml_node, xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text));
xml_child = xmlNewDocComment(doc, BAD_CAST gumbo_child->v.text.text);
set_line(xml_child, gumbo_child->v.text.start_pos.line);
xmlAddChild(xml_node, xml_child);
break;

case GUMBO_NODE_TEMPLATE:
// XXX: Should create a template element and a new DocumentFragment
case GUMBO_NODE_ELEMENT:
{
xmlNodePtr xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
// Element (and template) nodes use the `element` member of the node's `v`
// union -- the tag name above is read from it as well -- so the start
// position must come from `v.element`, not `v.text`.
set_line(xml_child, gumbo_child->v.element.start_pos.line);
if (xml_root == NIL)
xml_root = xml_child;
xmlNsPtr ns = NIL;
Expand Down
5 changes: 5 additions & 0 deletions test/test_nokogumbo.rb
Expand Up @@ -220,6 +220,11 @@ def test_document_encoding
assert_equal "Кирилические символы", doc.at('body').text.gsub(/\n\s+/,'')
end

# Each element should report its source line, or 0 when line numbers are
# unavailable (Nokogumbo built without libxml2).
def test_line_numbers
  doc = Nokogiri::HTML5(buffer)
  { 'h1' => 8, 'span' => 10 }.each do |selector, expected_line|
    assert_includes [0, expected_line], doc.at(selector).line
  end
end

private

Expand Down