diff --git a/CHANGES b/CHANGES index 5a1b807d63..6014b78b8f 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,10 @@ jsoup changelog *** Release 1.15.2 [PENDING] + * Improvement: added the ability to track the position (line, column, index) in the original input source from where + a given node was parsed. Accessible via Node.sourceRange() and Element.endSourceRange(). + + * Improvement: added Element.firstElementChild(), Element.lastElementChild(), Node.firstChild(), Node.lastChild(), as convenient accessors to those child nodes and elements. diff --git a/src/main/java/org/jsoup/helper/Validate.java b/src/main/java/org/jsoup/helper/Validate.java index e934faa944..97e7b67cfe 100644 --- a/src/main/java/org/jsoup/helper/Validate.java +++ b/src/main/java/org/jsoup/helper/Validate.java @@ -28,6 +28,18 @@ public static void notNull(@Nullable Object obj, String msg) { throw new IllegalArgumentException(msg); } + /** + Verifies the input object is not null, and returns that object. Effectively this casts a nullable object to a non- + null object. (Works around lack of Objects.requireNonNull in Android version.) + * @param obj nullable object to cast to not-null + * @return the object, or throws an NPE. 
+ */ + public static Object ensureNotNull(@Nullable Object obj) { + if (obj == null) + throw new NullPointerException(); + else return obj; + } + /** * Validates that the value is true * @param val object to test diff --git a/src/main/java/org/jsoup/nodes/Attributes.java b/src/main/java/org/jsoup/nodes/Attributes.java index 2e989fd385..76b6590e35 100644 --- a/src/main/java/org/jsoup/nodes/Attributes.java +++ b/src/main/java/org/jsoup/nodes/Attributes.java @@ -49,7 +49,7 @@ public class Attributes implements Iterable, Cloneable { // the number of instance fields is kept as low as possible giving an object size of 24 bytes private int size = 0; // number of slots used (not total capacity, which is keys.length) String[] keys = new String[InitialCapacity]; - String[] vals = new String[InitialCapacity]; + Object[] vals = new Object[InitialCapacity]; // Genericish: all non-internal attribute values must be Strings and are cast on access. // check there's room for more private void checkCapacity(int minNewSize) { @@ -84,8 +84,9 @@ private int indexOfKeyIgnoreCase(String key) { } // we track boolean attributes as null in values - they're just keys. so returns empty for consumers - static String checkNotNull(@Nullable String val) { - return val == null ? EmptyString : val; + // casts to String, so only for non-internal attributes + static String checkNotNull(@Nullable Object val) { + return val == null ? EmptyString : (String) val; } /** @@ -109,16 +110,33 @@ public String getIgnoreCase(String key) { return i == NotFound ? EmptyString : checkNotNull(vals[i]); } + /** + Get an arbitrary user data object by key. + * @param key case sensitive key to the object. + * @return the object associated to this key, or {@code null} if not found. + */ + @Nullable + Object getUserData(String key) { + Validate.notNull(key); + if (!isInternalKey(key)) key = internalKey(key); + int i = indexOfKeyIgnoreCase(key); + return i == NotFound ? null : vals[i]; + } + /** * Adds a new attribute. 
Will produce duplicates if the key already exists. * @see Attributes#put(String, String) */ public Attributes add(String key, @Nullable String value) { + addObject(key, value); + return this; + } + + private void addObject(String key, @Nullable Object value) { checkCapacity(size + 1); keys[size] = key; vals[size] = value; size++; - return this; } /** @@ -137,6 +155,25 @@ public Attributes put(String key, @Nullable String value) { return this; } + /** + Put an arbitrary user-data object by key. Will be treated as an internal attribute, so will not be emitted in HTML. + * @param key case sensitive key + * @param value object value + * @return these attributes + * @see #getUserData(String) + */ + Attributes putUserData(String key, Object value) { + Validate.notNull(key); + if (!isInternalKey(key)) key = internalKey(key); + Validate.notNull(value); + int i = indexOfKey(key); + if (i != NotFound) + vals[i] = value; + else + addObject(key, value); + return this; + } + void putIgnoreCase(String key, @Nullable String value) { int i = indexOfKeyIgnoreCase(key); if (i != NotFound) { @@ -299,7 +336,7 @@ public boolean hasNext() { @Override public Attribute next() { - final Attribute attr = new Attribute(keys[i], vals[i], Attributes.this); + final Attribute attr = new Attribute(keys[i], (String) vals[i], Attributes.this); i++; return attr; } @@ -313,14 +350,14 @@ public void remove() { /** Get the attributes as a List, for iteration. - @return an view of the attributes as an unmodifiable List. + @return a view of the attributes as an unmodifiable List. 
*/ public List asList() { ArrayList list = new ArrayList<>(size); for (int i = 0; i < size; i++) { if (isInternalKey(keys[i])) continue; // skip internal keys - Attribute attr = new Attribute(keys[i], vals[i], Attributes.this); + Attribute attr = new Attribute(keys[i], (String) vals[i], Attributes.this); list.add(attr); } return Collections.unmodifiableList(list); @@ -356,7 +393,7 @@ final void html(final Appendable accum, final Document.OutputSettings out) throw continue; final String key = Attribute.getValidKey(keys[i], out.syntax()); if (key != null) - Attribute.htmlNoValidate(key, vals[i], accum.append(' '), out); + Attribute.htmlNoValidate(key, (String) vals[i], accum.append(' '), out); } } @@ -383,8 +420,8 @@ public boolean equals(@Nullable Object o) { int thatI = that.indexOfKey(key); if (thatI == NotFound) return false; - String val = vals[i]; - String thatVal = that.vals[thatI]; + Object val = vals[i]; + Object thatVal = that.vals[thatI]; if (val == null) { if (thatVal != null) return false; diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index a934a2f8d7..3b2d54aec4 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -1653,6 +1653,19 @@ public Element val(String value) { return this; } + /** + Get the source range (start and end positions) of the end (closing) tag for this Element. Position tracking must be + enabled prior to parsing the content. + @return the range of the closing tag for this element, if it was explicitly closed in the source. {@code Untracked} + otherwise. 
+ @see org.jsoup.parser.Parser#setTrackPosition(boolean) + @see Node#sourceRange() + @since 1.15.2 + */ + public Range endSourceRange() { + return Range.of(this, false); + } + boolean shouldIndent(final Document.OutputSettings out) { return out.prettyPrint() && isFormatAsBlock(out) && !isInlineable(out); } diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java index 9c9e201dc7..fdbc65eefb 100644 --- a/src/main/java/org/jsoup/nodes/Node.java +++ b/src/main/java/org/jsoup/nodes/Node.java @@ -713,6 +713,18 @@ public T html(T appendable) { return appendable; } + /** + Get the source range (start and end positions) in the original input source that this node was parsed from. Position + tracking must be enabled prior to parsing the content. For an Element, this will be the positions of the start tag. + @return the range for the start of the node. + @see org.jsoup.parser.Parser#setTrackPosition(boolean) + @see Element#endSourceRange() + @since 1.15.2 + */ + public Range sourceRange() { + return Range.of(this, true); + } + /** * Gets this node's outer HTML. * @return outer HTML. diff --git a/src/main/java/org/jsoup/nodes/Range.java b/src/main/java/org/jsoup/nodes/Range.java new file mode 100644 index 0000000000..d110d4c8d2 --- /dev/null +++ b/src/main/java/org/jsoup/nodes/Range.java @@ -0,0 +1,187 @@ +package org.jsoup.nodes; + +import org.jsoup.helper.Validate; + +/** + A Range object tracks the character positions in the original input source where a Node starts or ends. If you want to + track these positions, tracking must be enabled in the Parser with + {@link org.jsoup.parser.Parser#setTrackPosition(boolean)}. 
+ @see Node#sourceRange() + @since 1.15.2 + */ +public class Range { + private final Position start, end; + + private static final String RangeKey = Attributes.internalKey("jsoup.sourceRange"); + private static final String EndRangeKey = Attributes.internalKey("jsoup.endSourceRange"); + private static final Position UntrackedPos = new Position(-1, -1, -1); + private static final Range Untracked = new Range(UntrackedPos, UntrackedPos); + + /** + Creates a new Range with start and end Positions. Called by TreeBuilder when position tracking is on. + * @param start the start position + * @param end the end position + */ + public Range(Position start, Position end) { + this.start = start; + this.end = end; + } + + /** + Get the start position of this node. + * @return the start position + */ + public Position start() { + return start; + } + + /** + Get the end position of this node. + * @return the end position + */ + public Position end() { + return end; + } + + /** + Test if this source range was tracked during parsing. + * @return true if this was tracked during parsing, false otherwise (and all fields will be {@code -1}). + */ + public boolean isTracked() { + return this != Untracked; + } + + /** + Retrieves the source range for a given Node. + * @param node the node to retrieve the position for + * @param start if this is the starting range. {@code false} for Element end tags. + * @return the Range, or the Untracked (-1) position if tracking is disabled. + */ + static Range of(Node node, boolean start) { + final String key = start ? RangeKey : EndRangeKey; + if (!node.hasAttr(key)) + return Untracked; + else + return (Range) Validate.ensureNotNull(node.attributes().getUserData(key)); + } + + /** + Internal jsoup method, called by the TreeBuilder. Tracks a Range for a Node. + * @param node the node to associate this position to + * @param start if this is the starting range. {@code false} for Element end tags. 
+ */ + public void track(Node node, boolean start) { + node.attributes().putUserData(start ? RangeKey : EndRangeKey, this); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Range range = (Range) o; + + if (!start.equals(range.start)) return false; + return end.equals(range.end); + } + + @Override + public int hashCode() { + int result = start.hashCode(); + result = 31 * result + end.hashCode(); + return result; + } + + /** + Gets a String presentation of this Range, in the format {@code line,column:pos-line,column:pos}. + * @return a String + */ + @Override + public String toString() { + return start + "-" + end; + } + + /** + A Position object tracks the character position in the original input source where a Node starts or ends. If you want to + track these positions, tracking must be enabled in the Parser with + {@link org.jsoup.parser.Parser#setTrackPosition(boolean)}. + @see Node#sourceRange() + */ + public static class Position { + private final int pos, lineNumber, columnNumber; + + /** + Create a new Position object. Called by the TreeBuilder if source position tracking is on. + * @param pos position index + * @param lineNumber line number + * @param columnNumber column number + */ + public Position(int pos, int lineNumber, int columnNumber) { + this.pos = pos; + this.lineNumber = lineNumber; + this.columnNumber = columnNumber; + } + + /** + Gets the position index (0-based) of the original input source that this Position was read at. This tracks the + total number of characters read into the source at this position, regardless of the number of preceding lines. + * @return the position, or {@code -1} if untracked. + */ + public int pos() { + return pos; + } + + /** + Gets the line number (1-based) of the original input source that this Position was read at. + * @return the line number, or {@code -1} if untracked. 
+ */ + public int lineNumber() { + return lineNumber; + } + + /** + Gets the cursor number (1-based) of the original input source that this Position was read at. The cursor number + resets to 1 on every new line. + * @return the cursor number, or {@code -1} if untracked. + */ + public int columnNumber() { + return columnNumber; + } + + /** + Test if this position was tracked during parsing. + * @return true if this was tracked during parsing, false otherwise (and all fields will be {@code -1}). + */ + public boolean isTracked() { + return this != UntrackedPos; + } + + /** + Gets a String presentation of this Position, in the format {@code line,column:pos}. + * @return a String + */ + @Override + public String toString() { + return lineNumber + "," + columnNumber + ":" + pos; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Position position = (Position) o; + if (pos != position.pos) return false; + if (lineNumber != position.lineNumber) return false; + return columnNumber == position.columnNumber; + } + + @Override + public int hashCode() { + int result = pos; + result = 31 * result + lineNumber; + result = 31 * result + columnNumber; + return result; + } + + } +} diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 605b19978e..df902b1684 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -109,7 +109,7 @@ private void bufferUp() { } /** - * Gets the current cursor position in the content. + * Gets the position currently read to in the content. Starts at 0. * @return current position */ public int pos() { @@ -149,14 +149,18 @@ Get the current line number (that the reader has consumed to). 
Starts at line #1 @see #trackNewlines(boolean) */ public int lineNumber() { + return lineNumber(pos()); + } + + int lineNumber(int pos) { + // note that this impl needs to be called before the next buffer up or lineNumberOffset will be wrong. if that + // causes issues, can remove the reset of newlinePositions during buffer, at the cost of a larger tracking array if (!isTrackNewlines()) return 1; - int i = lineNumIndex(); + int i = lineNumIndex(pos); if (i == -1) return lineNumberOffset; // first line - if (i < 0) - return Math.abs(i) + lineNumberOffset - 1; return i + lineNumberOffset + 1; } @@ -166,16 +170,18 @@ Get the current column number (that the reader has consumed to). Starts at colum @since 1.14.3 @see #trackNewlines(boolean) */ - int columnNumber() { + public int columnNumber() { + return columnNumber(pos()); + } + + int columnNumber(int pos) { if (!isTrackNewlines()) - return pos() + 1; + return pos + 1; - int i = lineNumIndex(); + int i = lineNumIndex(pos); if (i == -1) - return pos() + 1; - if (i < 0) - i = Math.abs(i) - 2; - return pos() - newlinePositions.get(i) + 1; + return pos + 1; + return pos - newlinePositions.get(i) + 1; } /** @@ -189,9 +195,11 @@ String cursorPos() { return lineNumber() + ":" + columnNumber(); } - private int lineNumIndex() { + private int lineNumIndex(int pos) { if (!isTrackNewlines()) return 0; - return Collections.binarySearch(newlinePositions, pos()); + int i = Collections.binarySearch(newlinePositions, pos); + if (i < -1) i = Math.abs(i) - 2; + return i; } /** @@ -201,13 +209,16 @@ private void scanBufferForNewlines() { if (!isTrackNewlines()) return; - lineNumberOffset += newlinePositions.size(); - int lastPos = newlinePositions.size() > 0 ? 
newlinePositions.get(newlinePositions.size() -1) : -1; - newlinePositions.clear(); - if (lastPos != -1) { - newlinePositions.add(lastPos); // roll the last pos to first, for cursor num after buffer - lineNumberOffset--; // as this takes a position + if (newlinePositions.size() > 0) { + // work out the line number that we have read up to (as we have likely scanned past this point) + int index = lineNumIndex(readerPos); + if (index == -1) index = 0; // first line + int linePos = newlinePositions.get(index); + lineNumberOffset += index; // the num lines we've read up to + newlinePositions.clear(); + newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer } + for (int i = bufPos; i < bufLength; i++) { if (charBuf[i] == '\n') newlinePositions.add(1 + readerPos + i); diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index 7ddda8e7d9..58760d1679 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -246,7 +246,7 @@ Element insert(final Token.StartTag startTag) { } Element el = new Element(tagFor(startTag.name(), settings), null, settings.normalizeAttributes(startTag.attributes)); - insert(el); + insert(el, startTag); return el; } @@ -257,14 +257,19 @@ Element insertStartTag(String startTagName) { } void insert(Element el) { - insertNode(el); + insertNode(el, null); + stack.add(el); + } + + private void insert(Element el, @Nullable Token token) { + insertNode(el, token); stack.add(el); } Element insertEmpty(Token.StartTag startTag) { Tag tag = tagFor(startTag.name(), settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); - insertNode(el); + insertNode(el, startTag); if (startTag.isSelfClosing()) { if (tag.isKnownTag()) { if (!tag.isEmpty()) @@ -285,7 +290,7 @@ FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTe } else 
setFormElement(el); - insertNode(el); + insertNode(el, startTag); if (onStack) stack.add(el); return el; @@ -293,7 +298,7 @@ FormElement insertForm(Token.StartTag startTag, boolean onStack, boolean checkTe void insert(Token.Comment commentToken) { Comment comment = new Comment(commentToken.getData()); - insertNode(comment); + insertNode(comment, commentToken); } void insert(Token.Character characterToken) { @@ -309,9 +314,10 @@ else if (isContentForTagData(tagName)) else node = new TextNode(data); el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. + onNodeInserted(node, characterToken); } - private void insertNode(Node node) { + private void insertNode(Node node, @Nullable Token token) { // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc if (stack.isEmpty()) doc.appendChild(node); @@ -325,6 +331,7 @@ else if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), if (formElement != null) formElement.addElement((Element) node); } + onNodeInserted(node, token); } Element pop() { @@ -390,8 +397,11 @@ Element popStackToClose(String elName) { for (int pos = stack.size() -1; pos >= 0; pos--) { Element el = stack.get(pos); stack.remove(pos); - if (el.normalName().equals(elName)) + if (el.normalName().equals(elName)) { + if (currentToken instanceof Token.EndTag) + onNodeClosed(el, currentToken); return el; + } } return null; } diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index af1fefe453..57e42d4dfc 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -31,6 +31,7 @@ boolean process(Token t, HtmlTreeBuilder tb) { tb.settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier()); doctype.setPubSysKey(d.getPubSysKey()); tb.getDocument().appendChild(doctype); + 
tb.onNodeInserted(doctype, t); if (d.isForceQuirks()) tb.getDocument().quirksMode(Document.QuirksMode.quirks); tb.transition(BeforeHtml); diff --git a/src/main/java/org/jsoup/parser/ParseError.java b/src/main/java/org/jsoup/parser/ParseError.java index feccea6800..7571aa4902 100644 --- a/src/main/java/org/jsoup/parser/ParseError.java +++ b/src/main/java/org/jsoup/parser/ParseError.java @@ -49,7 +49,7 @@ public int getPosition() { } /** - Get the formatted line:column cursor position where the error occured. + Get the formatted line:column cursor position where the error occurred. @return line:number cursor position */ public String getCursorPos() { diff --git a/src/main/java/org/jsoup/parser/ParseSettings.java b/src/main/java/org/jsoup/parser/ParseSettings.java index b83ac07764..56ff672d05 100644 --- a/src/main/java/org/jsoup/parser/ParseSettings.java +++ b/src/main/java/org/jsoup/parser/ParseSettings.java @@ -1,11 +1,11 @@ package org.jsoup.parser; import org.jsoup.nodes.Attributes; - +import javax.annotation.Nullable; import static org.jsoup.internal.Normalizer.lowerCase; /** - * Controls parser settings, to optionally preserve tag and/or attribute name case. + * Controls parser case settings, to optionally preserve tag and/or attribute name case. 
*/ public class ParseSettings { /** @@ -73,7 +73,7 @@ public String normalizeAttribute(String name) { return name; } - Attributes normalizeAttributes(Attributes attributes) { + @Nullable Attributes normalizeAttributes(@Nullable Attributes attributes) { if (attributes != null && !preserveAttributeCase) { attributes.normalize(); } diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index efc5f3c449..93bb1c031e 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -16,6 +16,7 @@ public class Parser { private TreeBuilder treeBuilder; private ParseErrorList errors; private ParseSettings settings; + private boolean trackPosition = false; /** * Create a new Parser, using the specified TreeBuilder @@ -39,6 +40,7 @@ private Parser(Parser copy) { treeBuilder = copy.treeBuilder.newInstance(); // because extended errors = new ParseErrorList(copy.errors); // only copies size, not contents settings = new ParseSettings(copy.settings); + trackPosition = copy.trackPosition; } public Document parseInput(String html, String baseUri) { @@ -63,7 +65,7 @@ public TreeBuilder getTreeBuilder() { /** * Update the TreeBuilder used when parsing content. - * @param treeBuilder current TreeBuilder + * @param treeBuilder new TreeBuilder * @return this, for chaining */ public Parser setTreeBuilder(TreeBuilder treeBuilder) { @@ -99,11 +101,40 @@ public ParseErrorList getErrors() { return errors; } + /** + Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input + source they were created from. By default, tracking is not enabled. + * @return current track position setting + */ + public boolean isTrackPosition() { + return trackPosition; + } + + /** + Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original + input source they were created from. 
+ @param trackPosition position tracking setting; {@code true} to enable + @return this Parser, for chaining + */ + public Parser setTrackPosition(boolean trackPosition) { + this.trackPosition = trackPosition; + return this; + } + + /** + Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes. + * @param settings the new settings + * @return this Parser + */ public Parser settings(ParseSettings settings) { this.settings = settings; return this; } + /** + Gets the current ParseSettings for this Parser + * @return current ParseSettings + */ public ParseSettings settings() { return settings; } diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java index 7f4296584e..819b8aef39 100644 --- a/src/main/java/org/jsoup/parser/Token.java +++ b/src/main/java/org/jsoup/parser/Token.java @@ -5,13 +5,13 @@ import javax.annotation.Nullable; -import static org.jsoup.internal.Normalizer.lowerCase; - /** * Parse tokens for the Tokeniser. */ abstract class Token { TokenType type; + static final int Unset = -1; + private int startPos, endPos = Unset; // position in CharacterReader this token was read from private Token() { } @@ -24,7 +24,27 @@ String tokenType() { * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every * piece of data, which immediately get GCed. 
*/ - abstract Token reset(); + Token reset() { + startPos = Unset; + endPos = Unset; + return this; + } + + int startPos() { + return startPos; + } + + void startPos(int pos) { + startPos = pos; + } + + int endPos() { + return endPos; + } + + void endPos(int pos) { + endPos = pos; + } static void reset(StringBuilder sb) { if (sb != null) { @@ -45,6 +65,7 @@ static final class Doctype extends Token { @Override Token reset() { + super.reset(); reset(name); pubSysKey = null; reset(publicIdentifier); @@ -97,6 +118,7 @@ static abstract class Tag extends Token { @Override Tag reset() { + super.reset(); tagName = null; normalName = null; reset(attrName); @@ -315,6 +337,7 @@ final static class Comment extends Token { @Override Token reset() { + super.reset(); reset(data); dataS = null; bogus = false; @@ -369,6 +392,7 @@ static class Character extends Token { @Override Token reset() { + super.reset(); data = null; return this; } @@ -408,6 +432,7 @@ final static class EOF extends Token { @Override Token reset() { + super.reset(); return this; } diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java index 1ebf0871d9..0eb40875ee 100644 --- a/src/main/java/org/jsoup/parser/Tokeniser.java +++ b/src/main/java/org/jsoup/parser/Tokeniser.java @@ -49,6 +49,9 @@ final class Tokeniser { private String lastStartTag; // the last start tag emitted, to test appropriate end tag @Nullable private String lastStartCloseSeq; // " char ref -> data + charStartPos = reader.pos(); + } + + this.state = newState; } - void advanceTransition(TokeniserState state) { + void advanceTransition(TokeniserState newState) { + transition(newState); reader.advance(); - this.state = state; } final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index f895c9ed28..902cbdf0ae 100644 --- 
a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -5,6 +5,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; +import org.jsoup.nodes.Range; import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; @@ -32,6 +33,8 @@ abstract class TreeBuilder { private Token.EndTag end = new Token.EndTag(); abstract ParseSettings defaultSettings(); + private boolean trackSourceRange; // optionally tracks the source range of nodes + @ParametersAreNonnullByDefault protected void initialiseParse(Reader input, String baseUri, Parser parser) { Validate.notNull(input, "String input must not be null"); @@ -43,7 +46,8 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { this.parser = parser; settings = parser.settings(); reader = new CharacterReader(input); - reader.trackNewlines(parser.isTrackErrors()); // when tracking errors, enable newline tracking for better error reports + trackSourceRange = parser.isTrackPosition(); + reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility currentToken = null; tokeniser = new Tokeniser(reader, parser.getErrors()); stack = new ArrayList<>(32); @@ -91,6 +95,7 @@ protected void runParser() { protected abstract boolean process(Token token); protected boolean processStartTag(String name) { + // these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position final Token.StartTag start = this.start; if (currentToken == start) { // don't recycle an in-use token return process(new Token.StartTag().name(name)); @@ -173,4 +178,37 @@ protected Tag tagFor(String tagName, ParseSettings settings) { } return tag; } + + /** + Called by implementing TreeBuilders when a node has been inserted. This implementation includes optionally tracking + the source range of the node. 
+ * @param node the node that was just inserted + * @param token the (optional) token that created this node + */ + protected void onNodeInserted(Node node, @Nullable Token token) { + trackNodePosition(node, token, true); + } + + /** + Called by implementing TreeBuilders when a node is explicitly closed. This implementation includes optionally + tracking the closing source range of the node. + * @param node the node being closed + * @param token the end-tag token that closed this node + */ + protected void onNodeClosed(Node node, Token token) { + trackNodePosition(node, token, false); + } + + private void trackNodePosition(Node node, @Nullable Token token, boolean start) { + if (trackSourceRange && token != null) { + int startPos = token.startPos(); + if (startPos == Token.Unset) return; // untracked, virtual token + + Range.Position startRange = new Range.Position(startPos, reader.lineNumber(startPos), reader.columnNumber(startPos)); + int endPos = token.endPos(); + Range.Position endRange = new Range.Position(endPos, reader.lineNumber(endPos), reader.columnNumber(endPos)); + Range range = new Range(startRange, endRange); + range.track(node, start); + } + } } diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index a3dc7917b5..9660723935 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -80,6 +80,12 @@ protected boolean process(Token token) { protected void insertNode(Node node) { currentElement().appendChild(node); + onNodeInserted(node, null); + } + + protected void insertNode(Node node, Token token) { + currentElement().appendChild(node); + onNodeInserted(node, token); } Element insert(Token.StartTag startTag) { @@ -89,7 +95,7 @@ Element insert(Token.StartTag startTag) { startTag.attributes.deduplicate(settings); Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); - insertNode(el); + 
insertNode(el, startTag); if (startTag.isSelfClosing()) { if (!tag.isKnownTag()) // unknown tag, remember this is self closing for output. see above. tag.setSelfClosing(); @@ -109,18 +115,18 @@ void insert(Token.Comment commentToken) { if (decl != null) insert = decl; } - insertNode(insert); + insertNode(insert, commentToken); } void insert(Token.Character token) { final String data = token.getData(); - insertNode(token.isCData() ? new CDataNode(data) : new TextNode(data)); + insertNode(token.isCData() ? new CDataNode(data) : new TextNode(data), token); } void insert(Token.Doctype d) { DocumentType doctypeNode = new DocumentType(settings.normalizeTag(d.getName()), d.getPublicIdentifier(), d.getSystemIdentifier()); doctypeNode.setPubSysKey(d.getPubSysKey()); - insertNode(doctypeNode); + insertNode(doctypeNode, d); } /** @@ -150,8 +156,10 @@ protected void popStackToClose(Token.EndTag endTag) { for (int pos = stack.size() -1; pos >= 0; pos--) { Element next = stack.get(pos); stack.remove(pos); - if (next == firstFound) + if (next == firstFound) { + onNodeClosed(next, endTag); break; + } } } private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain diff --git a/src/main/java/org/jsoup/parser/package-info.java b/src/main/java/org/jsoup/parser/package-info.java index 168fdf4086..f1b3c88741 100644 --- a/src/main/java/org/jsoup/parser/package-info.java +++ b/src/main/java/org/jsoup/parser/package-info.java @@ -1,4 +1,7 @@ /** Contains the HTML parser, tag specifications, and HTML tokeniser. 
*/ +@NonnullByDefault package org.jsoup.parser; + +import org.jsoup.internal.NonnullByDefault; diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java index 429e925821..e18a0f1a0e 100644 --- a/src/test/java/org/jsoup/integration/ConnectTest.java +++ b/src/test/java/org/jsoup/integration/ConnectTest.java @@ -652,7 +652,7 @@ public void maxBodySize() throws IOException { Connection.Response largeRes = Jsoup.connect(url).maxBodySize(300 * 1024).execute(); // does not crop Connection.Response unlimitedRes = Jsoup.connect(url).maxBodySize(0).execute(); - int actualDocText = 269541; + int actualDocText = 269535; assertEquals(actualDocText, defaultRes.parse().text().length()); assertEquals(49165, smallRes.parse().text().length()); assertEquals(196577, mediumRes.parse().text().length()); diff --git a/src/test/java/org/jsoup/nodes/PositionTest.java b/src/test/java/org/jsoup/nodes/PositionTest.java new file mode 100644 index 0000000000..4fc3b32174 --- /dev/null +++ b/src/test/java/org/jsoup/nodes/PositionTest.java @@ -0,0 +1,180 @@ +package org.jsoup.nodes; + +import org.jsoup.Jsoup; +import org.jsoup.integration.TestServer; +import org.jsoup.integration.servlets.EchoServlet; +import org.jsoup.integration.servlets.FileServlet; +import org.jsoup.parser.Parser; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.*; + +/** + Functional tests for the Position tracking behavior (across nodes, treebuilder, etc.) 
+ */
+class PositionTest {
+    static Parser TrackingParser = Parser.htmlParser().setTrackPosition(true); // shared HTML parser with position tracking enabled
+
+    @Test void parserTrackDefaults() {
+        Parser htmlParser = Parser.htmlParser();
+        assertFalse(htmlParser.isTrackPosition());
+        htmlParser.setTrackPosition(true);
+        assertTrue(htmlParser.isTrackPosition());
+
+        Parser xmlParser = Parser.xmlParser(); // exercise the XML parser's default, not a second HTML parser
+        assertFalse(xmlParser.isTrackPosition());
+        xmlParser.setTrackPosition(true);
+        assertTrue(xmlParser.isTrackPosition());
+    }
+
+    @Test void tracksPosition() {
+        String html = "<p id=1\n class=foo>\n<span>Hello\n &reg;\n there &copy.</span> now.\n <!-- comment --> ";
+        Document doc = Jsoup.parse(html, TrackingParser);
+
+        Element body = doc.selectFirst("body");
+        Element p = doc.selectFirst("p");
+        Element span = doc.selectFirst("span");
+        TextNode text = (TextNode) span.firstChild();
+        TextNode now = (TextNode) span.nextSibling();
+        Comment comment = (Comment) now.nextSibling();
+
+        assertFalse(body.sourceRange().isTracked()); // body is not present in the input, so it has no source range
+
+        Range pRange = p.sourceRange();
+        assertEquals("1,1:0-2,12:19", pRange.toString());
+
+        // no explicit P closer
+        Range pEndRange = p.endSourceRange();
+        assertFalse(pEndRange.isTracked());
+
+        Range.Position pStart = pRange.start();
+        assertTrue(pStart.isTracked());
+        assertEquals(0, pStart.pos());
+        assertEquals(1, pStart.columnNumber());
+        assertEquals(1, pStart.lineNumber());
+        assertEquals("1,1:0", pStart.toString());
+
+        Range.Position pEnd = pRange.end();
+        assertTrue(pEnd.isTracked()); // check the end position itself, not pStart a second time
+        assertEquals(19, pEnd.pos());
+        assertEquals(12, pEnd.columnNumber());
+        assertEquals(2, pEnd.lineNumber());
+        assertEquals("2,12:19", pEnd.toString());
+
+        assertEquals("3,1:20", span.sourceRange().start().toString());
+        assertEquals("3,7:26", span.sourceRange().end().toString());
+
+        // span end tag
+        Range spanEnd = span.endSourceRange();
+        assertTrue(spanEnd.isTracked());
+        assertEquals("5,14:52-5,21:59", spanEnd.toString());
+
+        String wholeText = text.getWholeText();
+        assertEquals("Hello\n ®\n there ©.", wholeText);
+        String textOrig = "Hello\n &reg;\n there &copy."; // the text as written in the input, entities still encoded
+        Range textRange = text.sourceRange();
+        assertEquals(textOrig.length(), textRange.end().pos() - textRange.start().pos()); // range spans the raw input, not the decoded text
+        assertEquals("3,7:26", textRange.start().toString());
+        assertEquals("5,14:52", textRange.end().toString());
+
+        assertEquals("6,2:66", comment.sourceRange().start().toString());
+        assertEquals("6,18:82", comment.sourceRange().end().toString());
+    }
+
+    @Test void tracksMarkup() {
+        String html = "<!doctype\nhtml>\n<title>jsoup &copy;\n2022</title><body>\n<![CDATA[\n<jsoup>\n]]>";
+        Document doc = Jsoup.parse(html, TrackingParser);
+
+        DocumentType doctype = doc.documentType();
+        assertNotNull(doctype);
+        assertEquals("html", doctype.name());
+        assertEquals("1,1:0-2,6:15", doctype.sourceRange().toString());
+
+        Element title = doc.selectFirst("title");
+        TextNode titleText = (TextNode) title.firstChild();
+        assertEquals("jsoup ©\n2022", title.text());
+        assertEquals(titleText.getWholeText(), title.text());
+        assertEquals("3,1:16-3,8:23", title.sourceRange().toString());
+        assertEquals("3,8:23-4,5:40", titleText.sourceRange().toString());
+
+        CDataNode cdata = (CDataNode) doc.body().childNode(1);
+        assertEquals("\n<jsoup>\n", cdata.text());
+        assertEquals("5,1:55-7,4:76", cdata.sourceRange().toString());
+    }
+
+    @Test void tracksDataNodes() {
+        String html = "<head>\n<script>foo;\nbar()\n5 <= 4;</script>";
+        Document doc = Jsoup.parse(html, TrackingParser);
+
+        Element script = doc.selectFirst("script");
+        assertNotNull(script);
+        assertEquals("2,1:7-2,9:15", script.sourceRange().toString());
+        DataNode data = (DataNode) script.firstChild();
+        assertEquals("2,9:15-4,8:33", data.sourceRange().toString());
+    }
+
+    @Test void tracksXml() {
+        String xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!doctype html>\n<rss url=foo>\nXML\n</rss>\n<!-- comment -->";
+        Document doc = Jsoup.parse(xml, Parser.xmlParser().setTrackPosition(true));
+
+        XmlDeclaration decl = (XmlDeclaration) doc.childNode(0);
+        assertEquals("1,1:0-1,39:38", decl.sourceRange().toString());
+
+        DocumentType doctype = (DocumentType) doc.childNode(2);
+        assertEquals("2,1:39-2,16:54", doctype.sourceRange().toString());
+
+        Element rss = doc.firstElementChild();
+        assertNotNull(rss);
+        assertEquals("3,1:55-3,14:68", rss.sourceRange().toString());
+        assertEquals("5,1:73-5,7:79", rss.endSourceRange().toString());
+
+        TextNode text = (TextNode) rss.firstChild();
+        assertNotNull(text);
+        assertEquals("3,14:68-5,1:73", text.sourceRange().toString());
+
+        Comment comment = (Comment) rss.nextSibling().nextSibling();
+        assertEquals("6,1:80-6,17:96", comment.sourceRange().toString());
+    }
+
+    @BeforeAll
+    static void setUp() {
+        TestServer.start();
+    }
+
+    @AfterAll
+    static void tearDown() {
+        TestServer.stop();
+    }
+
+    @Test void tracksFromFetch() throws IOException {
+        String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K
+        Document doc = Jsoup.connect(url).parser(TrackingParser).get();
+
+        Element firstP = doc.selectFirst("p");
+        assertNotNull(firstP);
+        assertEquals("4,1:53-4,4:56", firstP.sourceRange().toString());
+
+        Element p = doc.selectFirst("#xy");
+        assertNotNull(p);
+        assertEquals("1000,1:279646-1000,10:279655", p.sourceRange().toString());
+        assertEquals("1000,567:280212-1000,571:280216", p.endSourceRange().toString());
+
+        TextNode text = (TextNode) p.firstChild();
+        assertEquals("1000,10:279655-1000,357:280002", text.sourceRange().toString());
+    }
+
+    @Test void tracksFromXmlFetch() throws IOException {
+        String url = FileServlet.urlTo("/htmltests/test-rss.xml");
+        Document doc = Jsoup.connect(url).parser(Parser.xmlParser().setTrackPosition(true)).get();
+
+        Element item = doc.selectFirst("item + item");
+        assertNotNull(item);
+        assertEquals("13,5:496-13,11:502", item.sourceRange().toString());
+        assertEquals("17,5:779-17,12:786", item.endSourceRange().toString());
+    }
+
+}
\ No newline at end of file
diff --git a/src/test/resources/htmltests/large.html b/src/test/resources/htmltests/large.html
index bdf04dd320..6e418ddb08 100644
--- a/src/test/resources/htmltests/large.html
+++ b/src/test/resources/htmltests/large.html
@@ -997,7 +997,7 @@

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris ipsum. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh. Quisque volutpat condimentum velit. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam nec ante. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Curabitur sodales ligula in libero. Nulla facilisi. Vestibulum lacinia arcu eget nulla. Ut fringilla. Suspendisse potenti. Nunc feugiat mi a tellus consequat imperdiet. Vestibulum sapien. Nam nec ante. Proin quam.

-

Etiam ultrices. Suspendisse in justo eu magna luctus suscipit. Sed lectus. Integer euismod lacus luctus magna. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Morbi lacinia molestie dui. Praesent blandit dolor. Ut fringilla. Sed non quam. Ut fringilla. In vel mi sit amet augue congue elementum. Morbi in ipsum sit amet pede facilisis laoreet. Donec lacus nunc, viverra nec, blandit vel, egestas et, augue.

+

Ultrices. Suspendisse in justo eu magna luctus suscipit. Sed lectus. Integer euismod lacus luctus magna. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Morbi lacinia molestie dui. Praesent blandit dolor. Ut fringilla. Sed non quam. Ut fringilla. In vel mi sit amet augue congue elementum. Morbi in ipsum sit amet pede facilisis laoreet. Donec lacus nunc, viverra nec, blandit vel, egestas et, augue.

VESTIBULUM tincidunt malesuada tellus. Ut ultrices ultrices enim. Proin quam. Curabitur sit amet mauris. Morbi in dui quis est pulvinar ullamcorper. Nulla facilisi. Integer lacinia sollicitudin massa. Cras metus. Sed aliquet risus a tortor. Sed non quam. Integer id quam. Curabitur sit amet mauris. Morbi mi. In vel mi sit amet augue congue elementum. Quisque nisl felis, venenatis tristique, dignissim in, ultrices sit amet, augue. Proin sodales libero eget ante.