Skip to content

Commit

Permalink
Merge pull request #2309 from sparklemotion/1764-long-line-numbers
Browse files Browse the repository at this point in the history
feat(cruby): support line numbers larger than a short

---

**What problem is this PR intended to solve?**

As noted in #1493, #1617, #1505, #1003, and #533, libxml2 has not historically supported line numbers greater than a `short int`. Starting in libxml v2.9.0, setting the parse option `BIG_LINES` would allow tracking line numbers in longer documents.

Specifically this PR makes the following changes:

- set `BIG_LINES` parse option by default which will allow `Node#line` to return large integers
- allow `Node#line=` to set large line numbers on text nodes

Fixes #1764 

**Have you included adequate test coverage?**

Yes!

**Does this change affect the behavior of either the C or the Java implementations?**

JRuby's Xerces-based implementation did not suffer from this particular shortcoming, although its line number functionality is questionable in other ways (see #2177 / b32c875).
  • Loading branch information
flavorjones committed Aug 14, 2021
2 parents 3ae1f38 + eb4a988 commit 8cf77aa
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 17 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Expand Up @@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## next / unreleased

### Improved

* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)]


## 1.12.3 / 2021-08-10

### Fixed
Expand Down
34 changes: 21 additions & 13 deletions ext/nokogiri/xml_node.c
Expand Up @@ -1375,12 +1375,12 @@ native_write_to(
* Returns the line for this Node
*/
static VALUE
line(VALUE self)
rb_xml_node_line(VALUE rb_node)
{
xmlNodePtr node;
Data_Get_Struct(self, xmlNode, node);
xmlNodePtr c_node;
Data_Get_Struct(rb_node, xmlNode, c_node);

return INT2NUM(xmlGetLineNo(node));
return INT2NUM(xmlGetLineNo(c_node));
}

/*
Expand All @@ -1390,17 +1390,25 @@ line(VALUE self)
* Sets the line for this Node. num must be less than 65535.
*/
static VALUE
set_line(VALUE self, VALUE num)
rb_xml_node_line_set(VALUE rb_node, VALUE rb_line_number)
{
xmlNodePtr node;
int value = NUM2INT(num);
xmlNodePtr c_node;
int line_number = NUM2INT(rb_line_number);

Data_Get_Struct(self, xmlNode, node);
if (value < 65535) {
node->line = value;
Data_Get_Struct(rb_node, xmlNode, c_node);

// libxml2 optionally uses xmlNode.psvi to store longer line numbers, but only for text nodes.
// search for "psvi" in SAX2.c and tree.c to learn more.
if (line_number < 65535) {
c_node->line = (short) line_number;
} else {
c_node->line = 65535;
if (c_node->type == XML_TEXT_NODE) {
c_node->psvi = (void *)(ptrdiff_t) line_number;
}
}

return num;
return rb_line_number;
}

/*
Expand Down Expand Up @@ -1805,8 +1813,8 @@ noko_init_xml_node()
rb_define_method(cNokogiriXmlNode, "create_internal_subset", create_internal_subset, 3);
rb_define_method(cNokogiriXmlNode, "create_external_subset", create_external_subset, 3);
rb_define_method(cNokogiriXmlNode, "pointer_id", pointer_id, 0);
rb_define_method(cNokogiriXmlNode, "line", line, 0);
rb_define_method(cNokogiriXmlNode, "line=", set_line, 1);
rb_define_method(cNokogiriXmlNode, "line", rb_xml_node_line, 0);
rb_define_method(cNokogiriXmlNode, "line=", rb_xml_node_line_set, 1);
rb_define_method(cNokogiriXmlNode, "content", get_native_content, 0);
rb_define_method(cNokogiriXmlNode, "native_content=", set_native_content, 1);
rb_define_method(cNokogiriXmlNode, "lang", get_lang, 0);
Expand Down
10 changes: 6 additions & 4 deletions lib/nokogiri/xml/parse_options.rb
Expand Up @@ -68,15 +68,17 @@ class ParseOptions
NOBASEFIX = 1 << 18
# relax any hardcoded limit from the parser
HUGE = 1 << 19
# line numbers stored as long int (instead of a short int)
BIG_LINES = 1 << 22

# the default options used for parsing XML documents
DEFAULT_XML = RECOVER | NONET
DEFAULT_XML = RECOVER | NONET | BIG_LINES
# the default options used for parsing XSLT stylesheets
DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA
DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA | BIG_LINES
# the default options used for parsing HTML documents
DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET
DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET | BIG_LINES
# the default options used for parsing XML schemas
DEFAULT_SCHEMA = NONET
DEFAULT_SCHEMA = NONET | BIG_LINES

attr_accessor :options
def initialize options = STRICT
Expand Down
35 changes: 35 additions & 0 deletions test/xml/test_node.rb
Expand Up @@ -1258,6 +1258,29 @@ def test_wrap
assert_equal(2, set[0].line)
assert_equal(5, set[1].line)
end

it "supports a line number greater than a short int" do
max_short_int = (1 << 16) - 1
xml = StringIO.new.tap do |io|
io << "<root>"
max_short_int.times do |j|
io << "<a>#{j}</a>\n"
io << "<b>#{j}</b>\n"
end
io << "<x/>\n"
io << "</root>"
end.string

if Nokogiri.uses_libxml?
doc = Nokogiri::XML(xml) { |c| c.nobig_lines }
node = doc.at_css("x")
assert_operator(node.line, :==, max_short_int)
end

doc = Nokogiri::XML(xml)
node = doc.at_css("x")
assert_operator(node.line, :>, max_short_int)
end
end

describe "#line=" do
Expand All @@ -1268,6 +1291,18 @@ def test_wrap
node.line = 54321
assert_equal(54321, node.line)
end

it "supports a line number greater than a short int for text nodes" do
skip_unless_libxml2("Xerces does not have line numbers for nodes")

max_short_int = (1 << 16) - 1
line_number = max_short_int + 100

document = Nokogiri::XML::Document.parse("<root><a>text node</a></root>")
node = document.at_css("a").children.first
node.line = line_number
assert_equal(line_number, node.line)
end
end
end
end
Expand Down

0 comments on commit 8cf77aa

Please sign in to comment.