Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
Include line numbers from the parser.
Browse files Browse the repository at this point in the history
Neither libxml2 nor Nokogiri contains an API for setting the line number
of a node. When the libxml2 headers are available, the line number can
be set directly in the node structure.
  • Loading branch information
stevecheckoway committed Apr 11, 2017
1 parent 2f3dbfa commit a76c00d
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 5 deletions.
73 changes: 68 additions & 5 deletions ext/nokogumboc/nokogumbo.c
Expand Up @@ -165,22 +165,85 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
}

// Convert a single Gumbo node into the corresponding libxml2 node owned by
// `document` and, when built with NGLIB, record the node's Gumbo start line
// on the result.
//
// Returns NIL for GUMBO_NODE_DOCUMENT and whenever no output node was
// produced; otherwise returns the newly created node.
//
// NOTE(review): this listing looks like a rendered diff with its +/- markers
// stripped. Each `return ...;` that is immediately followed by an
// `output_node = ...;` assignment of the same expression is presumably the
// removed pre-commit line -- confirm against the repository before relying
// on the control flow as shown.
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
xmlNodePtr output_node = NIL;
// Source line reported by Gumbo for this node; stays 0 when the node type
// does not provide one.
size_t line = 0;

switch (node->type) {
case GUMBO_NODE_DOCUMENT:
return NIL;
output_node = NIL;
break;
case GUMBO_NODE_ELEMENT:
case GUMBO_NODE_TEMPLATE:
return walk_element(document, &node->v.element);
output_node = walk_element(document, &node->v.element);
line = node->v.element.start_pos.line;
break;
case GUMBO_NODE_TEXT:
case GUMBO_NODE_WHITESPACE:
return xmlNewDocText(document, CONST_CAST node->v.text.text);
output_node = xmlNewDocText(document, CONST_CAST node->v.text.text);
line = node->v.text.start_pos.line;
break;
case GUMBO_NODE_CDATA:
// CDATA is built from the original_text span with an explicit length,
// unlike the text and comment cases, which pass v.text.text.
return xmlNewCDataBlock(document,
output_node = xmlNewCDataBlock(document,
CONST_CAST node->v.text.original_text.data,
(int) node->v.text.original_text.length);
line = node->v.text.start_pos.line;
break;
case GUMBO_NODE_COMMENT:
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
output_node = xmlNewDocComment(document, CONST_CAST node->v.text.text);
line = node->v.text.start_pos.line;
break;
}

// Nothing was created (document node, or node construction yielded no
// node): there is no node to annotate with a line number.
if (!output_node)
return NIL;

#if NGLIB
// Set the line number.
//
// Gumbo uses an unsigned int for a line number. libxml2 uses a long but
// internally stores the line number as an unsigned short (which it assumes
// is 16-bit). To handle larger documents, complex logic is used, depending
// on the type of the node, as follows.
//
// For element, text, comment, and PI nodes, if the line number is less than
// 65535, it is stored directly in the xmlNode's line field. Other nodes
// don't store the line number directly, but instead look at the previous
// node or the parent node.
//
// If the line number is at least 65535, then line is set to 65535 and
// - text nodes store the real line number in the psvi field;
// - element nodes don't store the real line number, forcing a lookup in
// the element's children or next or previous nodes; and
// - comment and PI nodes look at the next or previous nodes.
//
// Note that nodes on line 65535 don't always report their line number
// correctly.
//
// The lookup algorithm is tortured and could trivially get into infinite
// recursion. To prevent that, libxml2 will only examine 5 nodes before
// giving up.
//
// This function only creates element, text, CDATA, and comment
// nodes. libxml2 will ignore CDATA nodes in this lookup, but there should
// be no harm in setting the line number, assuming it's small enough.
// Otherwise, set line to 65535 and, if the node is a text node, set psvi to
// the line number.

if (line < 65535)
output_node->line = (unsigned short)line;
else {
// 65535 is the marker value; only text nodes additionally carry the real
// line number, stashed as an integer in the psvi pointer field.
output_node->line = 65535;
if (output_node->type == XML_TEXT_NODE)
output_node->psvi = (void *)line;
}
#else
// It'd be great to handle this without the libxml2 headers, but I do not
// know a good way to do this since there is no ruby API to set the line
// number.
// `line` is otherwise unused in this configuration; the void cast
// presumably exists to avoid an unused-variable warning.
(void)line;
#endif

return output_node;
}

// Parse a string using gumbo_parse into a Nokogiri document
Expand Down
9 changes: 9 additions & 0 deletions test-nokogumbo.rb
Expand Up @@ -137,6 +137,15 @@ def test_parse_fragment_errors
refute_empty doc.errors
end

def test_line_numbers
  # Expected line of each element inside `buffer`. -1 is accepted as well --
  # presumably the value reported when line numbers could not be recorded
  # (TODO confirm against the extension build without libxml2 headers).
  expected_lines = { 'h1' => 8, 'span' => 10 }

  plain_doc = Nokogiri::HTML5(buffer)
  expected_lines.each do |selector, line_no|
    assert_includes [-1, line_no], plain_doc.at(selector).line
  end

  # Prefix the same markup with 65535 newlines so that every expected line
  # number is shifted by that amount and exceeds 65535.
  newline_count = 65535
  shifted_doc = Nokogiri::HTML5("<!DOCTYPE html>" + "\n" * newline_count + buffer)
  expected_lines.each do |selector, line_no|
    assert_includes [-1, newline_count + line_no], shifted_doc.at(selector).line
  end
end

private

def buffer
Expand Down

0 comments on commit a76c00d

Please sign in to comment.