diff --git a/CHANGELOG.md b/CHANGELOG.md index 0493b07170..fdd157d862 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA --- +## next / unreleased + +### Improved + +* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)] + + ## 1.12.3 / 2021-08-10 ### Fixed diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index c28fb751a9..545a39b106 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -1375,12 +1375,12 @@ native_write_to( * Returns the line for this Node */ static VALUE -line(VALUE self) +rb_xml_node_line(VALUE rb_node) { - xmlNodePtr node; - Data_Get_Struct(self, xmlNode, node); + xmlNodePtr c_node; + Data_Get_Struct(rb_node, xmlNode, c_node); - return INT2NUM(xmlGetLineNo(node)); + return INT2NUM(xmlGetLineNo(c_node)); } /* @@ -1390,17 +1390,25 @@ line(VALUE self) * Sets the line for this Node. num must be less than 65535. */ static VALUE -set_line(VALUE self, VALUE num) +rb_xml_node_line_set(VALUE rb_node, VALUE rb_line_number) { - xmlNodePtr node; - int value = NUM2INT(num); + xmlNodePtr c_node; + int line_number = NUM2INT(rb_line_number); - Data_Get_Struct(self, xmlNode, node); - if (value < 65535) { - node->line = value; + Data_Get_Struct(rb_node, xmlNode, c_node); + + // libxml2 optionally uses xmlNode.psvi to store longer line numbers, but only for text nodes. + // search for "psvi" in SAX2.c and tree.c to learn more. + if (line_number < 65535) { + c_node->line = (short) line_number; + } else { + c_node->line = 65535; + if (c_node->type == XML_TEXT_NODE) { + c_node->psvi = (void *)(ptrdiff_t) line_number; + } } - return num; + return rb_line_number; } /* @@ -1805,8 +1813,8 @@ noko_init_xml_node() rb_define_method(cNokogiriXmlNode, "create_internal_subset", create_internal_subset, 3); rb_define_method(cNokogiriXmlNode, "create_external_subset", create_external_subset, 3); rb_define_method(cNokogiriXmlNode, "pointer_id", pointer_id, 0); - rb_define_method(cNokogiriXmlNode, "line", line, 0); - rb_define_method(cNokogiriXmlNode, "line=", set_line, 1); + rb_define_method(cNokogiriXmlNode, "line", rb_xml_node_line, 0); + rb_define_method(cNokogiriXmlNode, "line=", rb_xml_node_line_set, 1); rb_define_method(cNokogiriXmlNode, "content", get_native_content, 0); rb_define_method(cNokogiriXmlNode, "native_content=", set_native_content, 1); rb_define_method(cNokogiriXmlNode, "lang", get_lang, 0); diff --git a/lib/nokogiri/xml/parse_options.rb b/lib/nokogiri/xml/parse_options.rb index 36186ab7b9..12019198ab 100644 --- a/lib/nokogiri/xml/parse_options.rb +++ b/lib/nokogiri/xml/parse_options.rb @@ -68,15 +68,17 @@ class ParseOptions NOBASEFIX = 1 << 18 # relax any hardcoded limit from the parser HUGE = 1 << 19 + # line numbers stored as long int (instead of a short int) + BIG_LINES = 1 << 22 # the default options used for parsing XML documents - DEFAULT_XML = RECOVER | NONET + DEFAULT_XML = RECOVER | NONET | BIG_LINES # the default options used for parsing XSLT stylesheets - DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA + DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA | BIG_LINES # the default options used for parsing HTML documents - DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET + DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET | BIG_LINES # the default options used for parsing XML schemas - DEFAULT_SCHEMA = NONET + DEFAULT_SCHEMA = NONET | BIG_LINES attr_accessor :options def initialize options = STRICT diff --git a/test/xml/test_node.rb b/test/xml/test_node.rb index f125d41d54..48cec9b297 100644 --- a/test/xml/test_node.rb +++ b/test/xml/test_node.rb @@ -1258,6 +1258,29 @@ def test_wrap assert_equal(2, set[0].line) assert_equal(5, set[1].line) end + + it "supports a line number greater than a short int" do + max_short_int = (1 << 16) - 1 + xml = StringIO.new.tap do |io| + io << "" + max_short_int.times do |j| + io << "#{j}\n" + io << "#{j}\n" + end + io << "\n" + io << "" + end.string + + if Nokogiri.uses_libxml? + doc = Nokogiri::XML(xml) { |c| c.nobig_lines } + node = doc.at_css("x") + assert_operator(node.line, :==, max_short_int) + end + + doc = Nokogiri::XML(xml) + node = doc.at_css("x") + assert_operator(node.line, :>, max_short_int) + end end describe "#line=" do @@ -1268,6 +1291,18 @@ def test_wrap node.line = 54321 assert_equal(54321, node.line) end + + it "supports a line number greater than a short int for text nodes" do + skip_unless_libxml2("Xerces does not have line numbers for nodes") + + max_short_int = (1 << 16) - 1 + line_number = max_short_int + 100 + + document = Nokogiri::XML::Document.parse("text node") + node = document.at_css("a").children.first + node.line = line_number + assert_equal(line_number, node.line) + end end end end