Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(cruby): support line numbers larger than a short #2309

Merged
merged 1 commit into from Aug 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Expand Up @@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## next / unreleased

### Improved

* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)]


## 1.12.3 / 2021-08-10

### Fixed
Expand Down
34 changes: 21 additions & 13 deletions ext/nokogiri/xml_node.c
Expand Up @@ -1375,12 +1375,12 @@ native_write_to(
* Returns the line for this Node
*/
static VALUE
line(VALUE self)
rb_xml_node_line(VALUE rb_node)
{
xmlNodePtr node;
Data_Get_Struct(self, xmlNode, node);
xmlNodePtr c_node;
Data_Get_Struct(rb_node, xmlNode, c_node);

return INT2NUM(xmlGetLineNo(node));
return INT2NUM(xmlGetLineNo(c_node));
}

/*
Expand All @@ -1390,17 +1390,25 @@ line(VALUE self)
* Sets the line for this Node. num must be less than 65535.
*/
static VALUE
set_line(VALUE self, VALUE num)
rb_xml_node_line_set(VALUE rb_node, VALUE rb_line_number)
{
xmlNodePtr node;
int value = NUM2INT(num);
xmlNodePtr c_node;
int line_number = NUM2INT(rb_line_number);

Data_Get_Struct(self, xmlNode, node);
if (value < 65535) {
node->line = value;
Data_Get_Struct(rb_node, xmlNode, c_node);

// libxml2 optionally uses xmlNode.psvi to store longer line numbers, but only for text nodes.
// search for "psvi" in SAX2.c and tree.c to learn more.
if (line_number < 65535) {
c_node->line = (short) line_number;
} else {
c_node->line = 65535;
if (c_node->type == XML_TEXT_NODE) {
c_node->psvi = (void *)(ptrdiff_t) line_number;
}
}

return num;
return rb_line_number;
}

/*
Expand Down Expand Up @@ -1805,8 +1813,8 @@ noko_init_xml_node()
rb_define_method(cNokogiriXmlNode, "create_internal_subset", create_internal_subset, 3);
rb_define_method(cNokogiriXmlNode, "create_external_subset", create_external_subset, 3);
rb_define_method(cNokogiriXmlNode, "pointer_id", pointer_id, 0);
rb_define_method(cNokogiriXmlNode, "line", line, 0);
rb_define_method(cNokogiriXmlNode, "line=", set_line, 1);
rb_define_method(cNokogiriXmlNode, "line", rb_xml_node_line, 0);
rb_define_method(cNokogiriXmlNode, "line=", rb_xml_node_line_set, 1);
rb_define_method(cNokogiriXmlNode, "content", get_native_content, 0);
rb_define_method(cNokogiriXmlNode, "native_content=", set_native_content, 1);
rb_define_method(cNokogiriXmlNode, "lang", get_lang, 0);
Expand Down
10 changes: 6 additions & 4 deletions lib/nokogiri/xml/parse_options.rb
Expand Up @@ -68,15 +68,17 @@ class ParseOptions
NOBASEFIX = 1 << 18
# relax any hardcoded limit from the parser
HUGE = 1 << 19
# line numbers stored as long int (instead of a short int)
BIG_LINES = 1 << 22

# the default options used for parsing XML documents
DEFAULT_XML = RECOVER | NONET
DEFAULT_XML = RECOVER | NONET | BIG_LINES
# the default options used for parsing XSLT stylesheets
DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA
DEFAULT_XSLT = RECOVER | NONET | NOENT | DTDLOAD | DTDATTR | NOCDATA | BIG_LINES
# the default options used for parsing HTML documents
DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET
DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET | BIG_LINES
# the default options used for parsing XML schemas
DEFAULT_SCHEMA = NONET
DEFAULT_SCHEMA = NONET | BIG_LINES

attr_accessor :options
def initialize options = STRICT
Expand Down
35 changes: 35 additions & 0 deletions test/xml/test_node.rb
Expand Up @@ -1258,6 +1258,29 @@ def test_wrap
assert_equal(2, set[0].line)
assert_equal(5, set[1].line)
end

it "supports a line number greater than a short int" do
max_short_int = (1 << 16) - 1
xml = StringIO.new.tap do |io|
io << "<root>"
max_short_int.times do |j|
io << "<a>#{j}</a>\n"
io << "<b>#{j}</b>\n"
end
io << "<x/>\n"
io << "</root>"
end.string

if Nokogiri.uses_libxml?
doc = Nokogiri::XML(xml) { |c| c.nobig_lines }
node = doc.at_css("x")
assert_operator(node.line, :==, max_short_int)
end

doc = Nokogiri::XML(xml)
node = doc.at_css("x")
assert_operator(node.line, :>, max_short_int)
end
end

describe "#line=" do
Expand All @@ -1268,6 +1291,18 @@ def test_wrap
node.line = 54321
assert_equal(54321, node.line)
end

it "supports a line number greater than a short int for text nodes" do
skip_unless_libxml2("Xerces does not have line numbers for nodes")

max_short_int = (1 << 16) - 1
line_number = max_short_int + 100

document = Nokogiri::XML::Document.parse("<root><a>text node</a></root>")
node = document.at_css("a").children.first
node.line = line_number
assert_equal(line_number, node.line)
end
end
end
end
Expand Down