Skip to content

Commit

Permalink
Perf increase when attribute name is packed with thousands of nullchars
Browse files Browse the repository at this point in the history
Fixes #1580 for attribute names
  • Loading branch information
jhy committed Jul 15, 2021
1 parent fce241b commit 009dbb1
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 9 deletions.
2 changes: 1 addition & 1 deletion CHANGES
Expand Up @@ -14,7 +14,7 @@ jsoup changelog
* Bugfix: updated the HtmlTreeParser resetInsertionMode to the current spec for supported elements
<https://github.com/jhy/jsoup/issues/1491>

* Bugfix [Fuzz]: fixed a slow parse when a tag has thousands of null characters in it.
* Bugfix [Fuzz]: fixed a slow parse when a tag or an attribute name has thousands of null characters in it.
<https://github.com/jhy/jsoup/issues/1580>

* Bugfix [Fuzz]: the adoption agency algorithm can have an incorrect bookmark position
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jsoup/parser/Token.java
Expand Up @@ -181,6 +181,8 @@ final void appendTagName(char append) {
}

/**
 * Appends a chunk of text to the attribute name that is currently pending.
 * Any null characters in the chunk are substituted with the replacement character before it is stored.
 *
 * @param append the text to add to the pending attribute name
 */
final void appendAttributeName(String append) {
    // the chunk is consumed in one pass and so may still carry null chars - swap them out here
    final String cleaned = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar);
    if (pendingAttributeName == null) {
        // first chunk: it becomes the pending name as-is
        pendingAttributeName = cleaned;
    } else {
        pendingAttributeName = pendingAttributeName.concat(cleaned);
    }
}

Expand Down
9 changes: 2 additions & 7 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Expand Up @@ -144,7 +144,6 @@ void read(Tokeniser t, CharacterReader r) {
// from < or </ in data, will have start or end tag pending
void read(Tokeniser t, CharacterReader r) {
// previous TagOpen state did NOT consume, will have a letter char in current
//String tagName = r.consumeToAnySorted(tagCharsSorted).toLowerCase();
String tagName = r.consumeTagName();
t.tagPending.appendTagName(tagName);

Expand Down Expand Up @@ -608,7 +607,7 @@ void read(Tokeniser t, CharacterReader r) {
AttributeName {
// from before attribute name
void read(Tokeniser t, CharacterReader r) {
String name = r.consumeToAnySorted(attributeNameCharsSorted);
String name = r.consumeToAnySorted(attributeNameCharsSorted); // spec deviate - consume and emit nulls in one hit vs stepping
t.tagPending.appendAttributeName(name);

char c = r.consume();
Expand All @@ -630,10 +629,6 @@ void read(Tokeniser t, CharacterReader r) {
t.emitTagPending();
t.transition(Data);
break;
case nullChar:
t.error(this);
t.tagPending.appendAttributeName(replacementChar);
break;
case eof:
t.eofError(this);
t.transition(Data);
Expand Down Expand Up @@ -1631,7 +1626,7 @@ void read(Tokeniser t, CharacterReader r) {

static final char nullChar = '\u0000';
// char searches. must be sorted, used in inSorted. MUST update TokeniserStateTest if more arrays are added.
static final char[] attributeNameCharsSorted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'};
static final char[] attributeNameCharsSorted = new char[]{'\t', '\n', '\f', '\r', ' ', '"', '\'', '/', '<', '=', '>'};
static final char[] attributeValueUnquoted = new char[]{nullChar, '\t', '\n', '\f', '\r', ' ', '"', '&', '\'', '<', '=', '>', '`'};

private static final char replacementChar = Tokeniser.replacementChar;
Expand Down
12 changes: 11 additions & 1 deletion src/test/java/org/jsoup/integration/FuzzFixesTest.java
Expand Up @@ -104,12 +104,22 @@ public void scope1579() {
/**
 * Parses the fuzz fixture for issue 1577 twice - once from the file, and once from a raw stream with the
 * XML parser - and checks that each parse returns a document.
 *
 * @throws IOException if the fixture cannot be read
 */
@Test
public void overflow1577() throws IOException {
    // https://github.com/jhy/jsoup/issues/1577
    // no repro - fixed elsewhere?
    File in = ParseTest.getFile("/fuzztests/1577.html.gz");
    Document doc = Jsoup.parse(in, "UTF-8");
    assertNotNull(doc);

    // try-with-resources so the file handle is released even if the parse throws
    try (FileInputStream stream = new FileInputStream(in)) {
        Document docXml = Jsoup.parse(stream, "UTF-8", "https://example.com", Parser.xmlParser());
        assertNotNull(docXml);
    }
}

/**
 * Parses the "1580-attrname" fuzz fixture twice - once from the file, and once from a raw stream with the
 * XML parser - and checks that each parse returns a document.
 *
 * @throws IOException if the fixture cannot be read
 */
@Test
public void parseTimeout36150() throws IOException {
    File in = ParseTest.getFile("/fuzztests/1580-attrname.html.gz");
    // pretty much 1MB of null chars in text head
    Document doc = Jsoup.parse(in, "UTF-8");
    assertNotNull(doc);

    // try-with-resources so the file handle is released even if the parse throws
    try (FileInputStream stream = new FileInputStream(in)) {
        Document docXml = Jsoup.parse(stream, "UTF-8", "https://example.com", Parser.xmlParser());
        assertNotNull(docXml);
    }
}
}
Binary file not shown.

0 comments on commit 009dbb1

Please sign in to comment.