Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvement: adds optional source position tracking #1790

Merged
merged 2 commits into from Jun 13, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES
@@ -1,6 +1,9 @@
jsoup changelog

*** Release 1.15.2 [PENDING]
* Improvement: added the ability to track the position (line, column, index) in the original input source from where
a given node was parsed. Accessible via Node.sourceRange() and Element.endSourceRange().

* Improvement: added Element.firstElementChild(), Element.lastElementChild(), Node.firstChild(), Node.lastChild(),
as convenient accessors to those child nodes and elements.

Expand Down
12 changes: 12 additions & 0 deletions src/main/java/org/jsoup/helper/Validate.java
Expand Up @@ -28,6 +28,18 @@ public static void notNull(@Nullable Object obj, String msg) {
throw new IllegalArgumentException(msg);
}

/**
 Verifies the input object is not null, and returns that object. Effectively this casts a nullable object to a
 non-null object. (Works around the lack of Objects.requireNonNull in the Android version.)
 * @param obj nullable object to cast to not-null
 * @return the object, when it is not null
 * @throws NullPointerException if the object is null
 */
public static Object ensureNotNull(@Nullable Object obj) {
    if (obj != null)
        return obj;
    throw new NullPointerException();
}

/**
* Validates that the value is true
* @param val object to test
Expand Down
57 changes: 47 additions & 10 deletions src/main/java/org/jsoup/nodes/Attributes.java
Expand Up @@ -49,7 +49,7 @@ public class Attributes implements Iterable<Attribute>, Cloneable {
// the number of instance fields is kept as low as possible giving an object size of 24 bytes
private int size = 0; // number of slots used (not total capacity, which is keys.length)
String[] keys = new String[InitialCapacity];
String[] vals = new String[InitialCapacity];
Object[] vals = new Object[InitialCapacity]; // Genericish: all non-internal attribute values must be Strings and are cast on access.

// check there's room for more
private void checkCapacity(int minNewSize) {
Expand Down Expand Up @@ -84,8 +84,9 @@ private int indexOfKeyIgnoreCase(String key) {
}

// we track boolean attributes as null in values - they're just keys. so returns empty for consumers
static String checkNotNull(@Nullable String val) {
return val == null ? EmptyString : val;
// casts to String, so only for non-internal attributes
static String checkNotNull(@Nullable Object val) {
return val == null ? EmptyString : (String) val;
}

/**
Expand All @@ -109,16 +110,33 @@ public String getIgnoreCase(String key) {
return i == NotFound ? EmptyString : checkNotNull(vals[i]);
}

/**
 Get an arbitrary user data object by key. If the supplied key is not already an internal key, it is converted to
 one before the lookup. The match is case sensitive, consistent with how {@link #putUserData(String, Object)}
 locates the slot it writes to.
 * @param key case sensitive key to the object.
 * @return the object associated to this key, or {@code null} if not found.
 */
@Nullable
Object getUserData(String key) {
    Validate.notNull(key);
    if (!isInternalKey(key)) key = internalKey(key);
    // exact-case lookup: putUserData stores via indexOfKey, so an ignore-case search could hand back the value of a
    // different entry whose key differs only by case
    int i = indexOfKey(key);
    return i == NotFound ? null : vals[i];
}

/**
 * Appends a new attribute in the next free slot. No check is made for an existing entry with the same key, so this
 * will produce duplicates if the key already exists.
 * @param key the attribute key
 * @param value the attribute value; may be {@code null}
 * @return these attributes
 * @see Attributes#put(String, String)
 */
public Attributes add(String key, @Nullable String value) {
    this.addObject(key, value); // always takes a new slot; never overwrites
    return this;
}

private void addObject(String key, @Nullable Object value) {
checkCapacity(size + 1);
keys[size] = key;
vals[size] = value;
size++;
return this;
}

/**
Expand All @@ -137,6 +155,25 @@ public Attributes put(String key, @Nullable String value) {
return this;
}

/**
 Store an arbitrary user-data object under a key. The key is converted to an internal key if it is not one already,
 so the entry is treated as an internal attribute and will not be emitted in HTML. An existing entry with the same
 (case sensitive) key has its value replaced; otherwise a new entry is appended.
 * @param key case sensitive key
 * @param value object value; must not be null
 * @return these attributes
 * @see #getUserData(String)
 */
Attributes putUserData(String key, Object value) {
    Validate.notNull(key);
    final String internal = isInternalKey(key) ? key : internalKey(key);
    Validate.notNull(value);

    final int slot = indexOfKey(internal);
    if (slot == NotFound)
        addObject(internal, value);
    else
        vals[slot] = value;
    return this;
}

void putIgnoreCase(String key, @Nullable String value) {
int i = indexOfKeyIgnoreCase(key);
if (i != NotFound) {
Expand Down Expand Up @@ -299,7 +336,7 @@ public boolean hasNext() {

@Override
public Attribute next() {
final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
final Attribute attr = new Attribute(keys[i], (String) vals[i], Attributes.this);
i++;
return attr;
}
Expand All @@ -313,14 +350,14 @@ public void remove() {

/**
Get the attributes as a List, for iteration.
@return an view of the attributes as an unmodifiable List.
@return a view of the attributes as an unmodifiable List.
*/
public List<Attribute> asList() {
ArrayList<Attribute> list = new ArrayList<>(size);
for (int i = 0; i < size; i++) {
if (isInternalKey(keys[i]))
continue; // skip internal keys
Attribute attr = new Attribute(keys[i], vals[i], Attributes.this);
Attribute attr = new Attribute(keys[i], (String) vals[i], Attributes.this);
list.add(attr);
}
return Collections.unmodifiableList(list);
Expand Down Expand Up @@ -356,7 +393,7 @@ final void html(final Appendable accum, final Document.OutputSettings out) throw
continue;
final String key = Attribute.getValidKey(keys[i], out.syntax());
if (key != null)
Attribute.htmlNoValidate(key, vals[i], accum.append(' '), out);
Attribute.htmlNoValidate(key, (String) vals[i], accum.append(' '), out);
}
}

Expand All @@ -383,8 +420,8 @@ public boolean equals(@Nullable Object o) {
int thatI = that.indexOfKey(key);
if (thatI == NotFound)
return false;
String val = vals[i];
String thatVal = that.vals[thatI];
Object val = vals[i];
Object thatVal = that.vals[thatI];
if (val == null) {
if (thatVal != null)
return false;
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/org/jsoup/nodes/Element.java
Expand Up @@ -1653,6 +1653,19 @@ public Element val(String value) {
return this;
}

/**
 Get the source range (start and end positions) of this Element's end (closing) tag. For positions to be available,
 position tracking must have been enabled before the content was parsed.
 @return the range of the closing tag for this element, if it was explicitly closed in the source. {@code Untracked}
 otherwise.
 @see org.jsoup.parser.Parser#setTrackPosition(boolean)
 @see Node#sourceRange()
 @since 1.15.2
 */
public Range endSourceRange() {
    final boolean startRange = false; // false selects the end-tag range
    return Range.of(this, startRange);
}

boolean shouldIndent(final Document.OutputSettings out) {
return out.prettyPrint() && isFormatAsBlock(out) && !isInlineable(out);
}
Expand Down
12 changes: 12 additions & 0 deletions src/main/java/org/jsoup/nodes/Node.java
Expand Up @@ -713,6 +713,18 @@ public <T extends Appendable> T html(T appendable) {
return appendable;
}

/**
 Get the source range (start and end positions) in the original input source that this node was parsed from. For
 positions to be available, position tracking must have been enabled before the content was parsed. For an Element,
 this will be the positions of the start tag.
 @return the range for the start of the node.
 @see org.jsoup.parser.Parser#setTrackPosition(boolean)
 @see Element#endSourceRange()
 @since 1.15.2
 */
public Range sourceRange() {
    final boolean startRange = true; // true selects the starting range
    return Range.of(this, startRange);
}

/**
* Gets this node's outer HTML.
* @return outer HTML.
Expand Down
187 changes: 187 additions & 0 deletions src/main/java/org/jsoup/nodes/Range.java
@@ -0,0 +1,187 @@
package org.jsoup.nodes;

import org.jsoup.helper.Validate;

/**
 A Range holds the pair of character positions in the original input source where a Node starts or ends. To have
 these positions recorded, tracking must be enabled in the Parser with
 {@link org.jsoup.parser.Parser#setTrackPosition(boolean)}.
 @see Node#sourceRange()
 @since 1.15.2
 */
public class Range {
    // internal attribute keys that the start / end ranges are stored under on a node
    private static final String RangeKey = Attributes.internalKey("jsoup.sourceRange");
    private static final String EndRangeKey = Attributes.internalKey("jsoup.endSourceRange");
    // shared sentinels returned when no range was recorded; isTracked() compares against these by identity
    private static final Position UntrackedPos = new Position(-1, -1, -1);
    private static final Range Untracked = new Range(UntrackedPos, UntrackedPos);

    private final Position start;
    private final Position end;

    /**
     Creates a new Range with start and end Positions. Called by TreeBuilder when position tracking is on.
     * @param start the start position
     * @param end the end position
     */
    public Range(Position start, Position end) {
        this.start = start;
        this.end = end;
    }

    /**
     Get the start position of this node.
     * @return the start position
     */
    public Position start() {
        return this.start;
    }

    /**
     Get the end position of this node.
     * @return the end position
     */
    public Position end() {
        return this.end;
    }

    /**
     Test if this source range was tracked during parsing.
     * @return true if this was tracked during parsing, false otherwise (and all fields will be {@code -1}).
     */
    public boolean isTracked() {
        return Untracked != this;
    }

    /**
     Retrieves the source range for a given Node.
     * @param node the node to retrieve the position for
     * @param start if this is the starting range. {@code false} for Element end tags.
     * @return the Range, or the Untracked (-1) position if the node has no range stored under the selected key.
     */
    static Range of(Node node, boolean start) {
        final String key = start ? RangeKey : EndRangeKey;
        if (node.hasAttr(key))
            return (Range) Validate.ensureNotNull(node.attributes().getUserData(key));
        return Untracked;
    }

    /**
     Internal jsoup method, called by the TreeBuilder. Tracks a Range for a Node, by storing this Range as user data
     in the node's attributes.
     * @param node the node to associate this position to
     * @param start if this is the starting range. {@code false} for Element end tags.
     */
    public void track(Node node, boolean start) {
        final String key = start ? RangeKey : EndRangeKey;
        node.attributes().putUserData(key, this);
    }

    @Override
    public boolean equals(Object o) {
        if (o == this) return true;
        if (o == null || o.getClass() != getClass()) return false;

        final Range that = (Range) o;
        return start.equals(that.start) && end.equals(that.end);
    }

    @Override
    public int hashCode() {
        return 31 * start.hashCode() + end.hashCode();
    }

    /**
     Gets a String presentation of this Range, in the format {@code line,column:pos-line,column:pos}.
     * @return a String
     */
    @Override
    public String toString() {
        return new StringBuilder().append(start).append("-").append(end).toString();
    }

    /**
     A Position object records a single character position (index, line, and column) in the original input source
     where a Node starts or ends. To have these positions recorded, tracking must be enabled in the Parser with
     {@link org.jsoup.parser.Parser#setTrackPosition(boolean)}.
     @see Node#sourceRange()
     */
    public static class Position {
        private final int pos;
        private final int lineNumber;
        private final int columnNumber;

        /**
         Create a new Position object. Called by the TreeBuilder if source position tracking is on.
         * @param pos position index
         * @param lineNumber line number
         * @param columnNumber column number
         */
        public Position(int pos, int lineNumber, int columnNumber) {
            this.pos = pos;
            this.lineNumber = lineNumber;
            this.columnNumber = columnNumber;
        }

        /**
         Gets the position index (0-based) of the original input source that this Position was read at. This tracks
         the total number of characters read into the source at this position, regardless of the number of preceding
         lines.
         * @return the position, or {@code -1} if untracked.
         */
        public int pos() {
            return this.pos;
        }

        /**
         Gets the line number (1-based) of the original input source that this Position was read at.
         * @return the line number, or {@code -1} if untracked.
         */
        public int lineNumber() {
            return this.lineNumber;
        }

        /**
         Gets the column number (1-based) of the original input source that this Position was read at. The column
         number resets to 1 on every new line.
         * @return the column number, or {@code -1} if untracked.
         */
        public int columnNumber() {
            return this.columnNumber;
        }

        /**
         Test if this position was tracked during parsing.
         * @return true if this was tracked during parsing, false otherwise (and all fields will be {@code -1}).
         */
        public boolean isTracked() {
            return UntrackedPos != this;
        }

        /**
         Gets a String presentation of this Position, in the format {@code line,column:pos}.
         * @return a String
         */
        @Override
        public String toString() {
            return new StringBuilder()
                .append(lineNumber).append(",")
                .append(columnNumber).append(":")
                .append(pos)
                .toString();
        }

        @Override
        public boolean equals(Object o) {
            if (o == this) return true;
            if (o == null || o.getClass() != getClass()) return false;
            final Position that = (Position) o;
            return pos == that.pos
                && lineNumber == that.lineNumber
                && columnNumber == that.columnNumber;
        }

        @Override
        public int hashCode() {
            return 31 * (31 * pos + lineNumber) + columnNumber;
        }

    }
}