From 6bac76214d0f5c9276b3eb1cb17857b53537a553 Mon Sep 17 00:00:00 2001 From: Stephen Checkoway Date: Tue, 11 Apr 2017 15:34:53 -0500 Subject: [PATCH] Include line numbers from the parser. Neither libxml2 nor Nokogiri contain an API for setting the line numbers for a node. When the libxml2 headers are available, the line numbers can be set directly in the node structure. --- ext/nokogumboc/nokogumbo.c | 73 +++++++++++++++++++++++++++++++++++--- test-nokogumbo.rb | 9 +++++ 2 files changed, 77 insertions(+), 5 deletions(-) diff --git a/ext/nokogumboc/nokogumbo.c b/ext/nokogumboc/nokogumbo.c index a43c34a8..f5f861f7 100644 --- a/ext/nokogumboc/nokogumbo.c +++ b/ext/nokogumboc/nokogumbo.c @@ -165,22 +165,85 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) { } static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) { + xmlNodePtr output_node = NIL; + size_t line = 0; + switch (node->type) { case GUMBO_NODE_DOCUMENT: - return NIL; + output_node = NIL; + break; case GUMBO_NODE_ELEMENT: case GUMBO_NODE_TEMPLATE: - return walk_element(document, &node->v.element); + output_node = walk_element(document, &node->v.element); + line = node->v.element.start_pos.line; + break; case GUMBO_NODE_TEXT: case GUMBO_NODE_WHITESPACE: - return xmlNewDocText(document, CONST_CAST node->v.text.text); + output_node = xmlNewDocText(document, CONST_CAST node->v.text.text); + line = node->v.text.start_pos.line; + break; case GUMBO_NODE_CDATA: - return xmlNewCDataBlock(document, + output_node = xmlNewCDataBlock(document, CONST_CAST node->v.text.original_text.data, (int) node->v.text.original_text.length); + line = node->v.text.start_pos.line; + break; case GUMBO_NODE_COMMENT: - return xmlNewDocComment(document, CONST_CAST node->v.text.text); + output_node = xmlNewDocComment(document, CONST_CAST node->v.text.text); + line = node->v.text.start_pos.line; + break; + } + + if (!output_node) + return NIL; + +#if NGLIB + // Set the line number. 
+ // + // Gumbo uses an unsigned int for a line number. libxml2 uses a long but + // internally stores the line number as an unsigned short (which it assumes is + // 16-bit). To handle larger documents, complex logic is used, depending on + // the type of the node, as follows. + // + // For element, text, comment, and PI nodes, if the line number is less than + // 65535, it is stored directly in the xmlNode's line field. Other nodes + // don't store the line number directly, but instead look at the previous + // node or the parent node. + // + // If the line number is at least 65535, then line is set to 65535 and + // - text nodes store the real line number in the psvi field; + // - element nodes don't store the real line number, forcing a lookup in + // the element's children or next or previous nodes; and + // - comment and PI nodes look at the next or previous nodes. + // + // Note that nodes on line 65535 don't always report their line number + // correctly. + // + // The lookup algorithm is tortured and could trivially get into infinite + // recursion. To prevent that, libxml2 will only examine 5 nodes before + // giving up. + // + // This function only creates element, text, CDATA, and comment + // nodes. libxml2 will ignore CDATA nodes in this lookup, but there should + // be no harm in setting the line number, assuming it's small enough. + // Otherwise, set line to 65535 and, if the node is a text node, set psvi to + // the line number. + + if (line < 65535) + output_node->line = (unsigned short)line; + else { + output_node->line = 65535; + if (output_node->type == XML_TEXT_NODE) + output_node->psvi = (void *)line; } +#else + // It'd be great to handle this without the libxml2 headers, but I do not + // know a good way to do this since there is no Ruby API to set the line + // number. 
+ (void)line; +#endif + + return output_node; } // Parse a string using gumbo_parse into a Nokogiri document diff --git a/test-nokogumbo.rb b/test-nokogumbo.rb index a8f6eb85..31591261 100644 --- a/test-nokogumbo.rb +++ b/test-nokogumbo.rb @@ -137,6 +137,15 @@ def test_parse_fragment_errors refute_empty doc.errors end + def test_line_numbers + doc = Nokogiri::HTML5(buffer) + assert_includes [0, 8], doc.at('h1').line + assert_includes [0, 10], doc.at('span').line + doc = Nokogiri::HTML5("" + "\n"*65535 + buffer) + assert_includes [0, 65535+8], doc.at('h1').line + assert_includes [0, 65535+10], doc.at('span').line + end + private def buffer