Skip to content

Commit

Permalink
Use a string builder when attribute names need to be accumulated
Browse files Browse the repository at this point in the history
Fixes #1605
  • Loading branch information
jhy committed Aug 4, 2021
1 parent fb15b71 commit b4f4f2c
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 44 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ jsoup changelog
* Bugfix [Fuzz]: Speed improvement when the stack was thousands of items deep, and non-matching close tags sent.
<https://github.com/jhy/jsoup/issues/1596>

* Bugfix [Fuzz]: Speed improvement when an attribute name is 600K of quote characters or otherwise needs accumulation
vs being able to read in one hit.
<https://github.com/jhy/jsoup/issues/1605>

*** Release 1.14.1 [2021-Jul-10]
* Change: updated the minimum supported Java version from Java 7 to Java 8.

Expand Down
117 changes: 73 additions & 44 deletions src/main/java/org/jsoup/parser/Token.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attributes;

import javax.annotation.Nullable;

import static org.jsoup.internal.Normalizer.lowerCase;

/**
Expand Down Expand Up @@ -73,25 +75,32 @@ public boolean isForceQuirks() {
}

static abstract class Tag extends Token {
protected String tagName;
protected String normalName; // lc version of tag name, for case insensitive tree build
private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated
private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs
private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder
private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value
private boolean hasPendingAttributeValue = false;
@Nullable protected String tagName;
@Nullable protected String normalName; // lc version of tag name, for case insensitive tree build

private final StringBuilder attrName = new StringBuilder(); // try to get attr names and vals in one shot, vs Builder
@Nullable private String attrNameS;
private boolean hasAttrName = false;

private final StringBuilder attrValue = new StringBuilder();
@Nullable private String attrValueS;
private boolean hasAttrValue = false;
private boolean hasEmptyAttrValue = false; // distinguish boolean attribute from empty string value

boolean selfClosing = false;
Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).
@Nullable Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).

@Override
Tag reset() {
tagName = null;
normalName = null;
pendingAttributeName = null;
reset(pendingAttributeValue);
pendingAttributeValueS = null;
hasEmptyAttributeValue = false;
hasPendingAttributeValue = false;
reset(attrName);
attrNameS = null;
hasAttrName = false;
reset(attrValue);
attrValueS = null;
hasEmptyAttrValue = false;
hasAttrValue = false;
selfClosing = false;
attributes = null;
return this;
Expand All @@ -106,26 +115,30 @@ final void newAttribute() {
if (attributes == null)
attributes = new Attributes();

if (pendingAttributeName != null && attributes.size() < MaxAttributes) {
if (hasAttrName && attributes.size() < MaxAttributes) {
// the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here
pendingAttributeName = pendingAttributeName.trim();
if (pendingAttributeName.length() > 0) {
String name = attrName.length() > 0 ? attrName.toString() : attrNameS;
name = name.trim();
if (name.length() > 0) {
String value;
if (hasPendingAttributeValue)
value = pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS;
else if (hasEmptyAttributeValue)
if (hasAttrValue)
value = attrValue.length() > 0 ? attrValue.toString() : attrValueS;
else if (hasEmptyAttrValue)
value = "";
else
value = null;
// note that we add, not put. So that the first is kept, and rest are deduped, once in a context where case sensitivity is known (the appropriate tree builder).
attributes.add(pendingAttributeName, value);
attributes.add(name, value);
}
}
pendingAttributeName = null;
hasEmptyAttributeValue = false;
hasPendingAttributeValue = false;
reset(pendingAttributeValue);
pendingAttributeValueS = null;
reset(attrName);
attrNameS = null;
hasAttrName = false;

reset(attrValue);
attrValueS = null;
hasAttrValue = false;
hasEmptyAttrValue = false;
}

final boolean hasAttributes() {
Expand All @@ -138,7 +151,7 @@ final boolean hasAttribute(String key) {

final void finaliseTag() {
// finalises for emit
if (pendingAttributeName != null) {
if (hasAttrName) {
newAttribute();
}
}
Expand Down Expand Up @@ -183,49 +196,65 @@ final void appendTagName(char append) {
final void appendAttributeName(String append) {
// might have null chars because we eat in one pass - need to replace with null replacement character
append = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar);
pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append);

ensureAttrName();
if (attrName.length() == 0) {
attrNameS = append;
} else {
attrName.append(append);
}
}

final void appendAttributeName(char append) {
appendAttributeName(String.valueOf(append));
ensureAttrName();
attrName.append(append);
}

final void appendAttributeValue(String append) {
ensureAttributeValue();
if (pendingAttributeValue.length() == 0) {
pendingAttributeValueS = append;
ensureAttrValue();
if (attrValue.length() == 0) {
attrValueS = append;
} else {
pendingAttributeValue.append(append);
attrValue.append(append);
}
}

final void appendAttributeValue(char append) {
ensureAttributeValue();
pendingAttributeValue.append(append);
ensureAttrValue();
attrValue.append(append);
}

final void appendAttributeValue(char[] append) {
ensureAttributeValue();
pendingAttributeValue.append(append);
ensureAttrValue();
attrValue.append(append);
}

final void appendAttributeValue(int[] appendCodepoints) {
ensureAttributeValue();
ensureAttrValue();
for (int codepoint : appendCodepoints) {
pendingAttributeValue.appendCodePoint(codepoint);
attrValue.appendCodePoint(codepoint);
}
}

final void setEmptyAttributeValue() {
hasEmptyAttributeValue = true;
hasEmptyAttrValue = true;
}

private void ensureAttrName() {
hasAttrName = true;
// if on second hit, we'll need to move to the builder
if (attrNameS != null) {
attrName.append(attrNameS);
attrNameS = null;
}
}

private void ensureAttributeValue() {
hasPendingAttributeValue = true;
private void ensureAttrValue() {
hasAttrValue = true;
// if on second hit, we'll need to move to the builder
if (pendingAttributeValueS != null) {
pendingAttributeValue.append(pendingAttributeValueS);
pendingAttributeValueS = null;
if (attrValueS != null) {
attrValue.append(attrValueS);
attrValueS = null;
}
}

Expand Down
12 changes: 12 additions & 0 deletions src/test/java/org/jsoup/integration/FuzzFixesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,16 @@ public void parseTimeout1596() throws IOException {
Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
assertNotNull(docXml);
}

@Test
public void parseTimeout1605() throws IOException {
// timesink with 600K of accumulating attribute name
File in = ParseTest.getFile("/fuzztests/1605.html.gz");

Document doc = Jsoup.parse(in, "UTF-8");
assertNotNull(doc);

Document docXml = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://example.com", Parser.xmlParser());
assertNotNull(docXml);
}
}
Binary file added src/test/resources/fuzztests/1605.html.gz
Binary file not shown.

0 comments on commit b4f4f2c

Please sign in to comment.