Skip to content

Commit

Permalink
Make sure tags and related start with an ascii alpha, per spec
Browse files Browse the repository at this point in the history
Fixes #1006
  • Loading branch information
jhy committed Aug 12, 2021
1 parent 0e1ca51 commit e6b11b0
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 14 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Expand Up @@ -30,6 +30,9 @@ jsoup changelog
* Bugfix: fixed an IOOB when parsing a formatting fragment into a standalone p element.
<https://github.com/jhy/jsoup/issues/1602>

* Bugfix: tag names must start with an ascii-alpha character.
<https://github.com/jhy/jsoup/issues/1006>

* Bugfix [Fuzz]: fixed a slow parse when a tag or an attribute name has thousands of null characters in it.
<https://github.com/jhy/jsoup/issues/1580>

Expand Down
11 changes: 11 additions & 0 deletions src/main/java/org/jsoup/parser/CharacterReader.java
Expand Up @@ -514,6 +514,17 @@ boolean matchesLetter() {
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
}

/**
 Tests whether the character at the current position is an ASCII alpha, i.e. within A-Z or a-z, as defined by
 https://infra.spec.whatwg.org/#ascii-alpha
 @return true if there is a current character and it is an ASCII alpha; false otherwise, including when the
 reader is empty
 */
boolean matchesAsciiAlpha() {
    if (!isEmpty()) {
        final char current = charBuf[bufPos];
        final boolean isUpper = 'A' <= current && current <= 'Z';
        final boolean isLower = 'a' <= current && current <= 'z';
        return isUpper || isLower;
    }
    return false; // nothing left to match against
}

boolean matchesDigit() {
if (isEmpty())
return false;
Expand Down
22 changes: 11 additions & 11 deletions src/main/java/org/jsoup/parser/TokeniserState.java
Expand Up @@ -109,7 +109,7 @@ void read(Tokeniser t, CharacterReader r) {
t.advanceTransition(BogusComment);
break;
default:
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
t.createTagPending(true);
t.transition(TagName);
} else {
Expand All @@ -127,7 +127,7 @@ void read(Tokeniser t, CharacterReader r) {
t.eofError(this);
t.emit("</");
t.transition(Data);
} else if (r.matchesLetter()) {
} else if (r.matchesAsciiAlpha()) {
t.createTagPending(false);
t.transition(TagName);
} else if (r.matches('>')) {
Expand All @@ -136,7 +136,7 @@ void read(Tokeniser t, CharacterReader r) {
} else {
t.error(this);
t.createBogusCommentPending();
t.advanceTransition(BogusComment);
t.transition(BogusComment); // reconsume char
}
}
},
Expand Down Expand Up @@ -185,7 +185,7 @@ void read(Tokeniser t, CharacterReader r) {
if (r.matches('/')) {
t.createTempBuffer();
t.advanceTransition(RCDATAEndTagOpen);
} else if (r.matchesLetter() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
} else if (r.matchesAsciiAlpha() && t.appropriateEndTagName() != null && !r.containsIgnoreCase("</" + t.appropriateEndTagName())) {
// diverge from spec: got a start tag, but there's no appropriate end tag (</title>), so rather than
// consuming to EOF; break out here
t.tagPending = t.createTagPending(false).name(t.appropriateEndTagName());
Expand All @@ -199,7 +199,7 @@ void read(Tokeniser t, CharacterReader r) {
},
RCDATAEndTagOpen {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
t.createTagPending(false);
t.tagPending.appendTagName(r.current());
t.dataBuffer.append(r.current());
Expand All @@ -212,7 +212,7 @@ void read(Tokeniser t, CharacterReader r) {
},
RCDATAEndTagName {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
String name = r.consumeLetterSequence();
t.tagPending.appendTagName(name);
t.dataBuffer.append(name);
Expand Down Expand Up @@ -419,7 +419,7 @@ void read(Tokeniser t, CharacterReader r) {
},
ScriptDataEscapedLessthanSign {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
t.createTempBuffer();
t.dataBuffer.append(r.current());
t.emit("<");
Expand All @@ -436,7 +436,7 @@ void read(Tokeniser t, CharacterReader r) {
},
ScriptDataEscapedEndTagOpen {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
t.createTagPending(false);
t.tagPending.appendTagName(r.current());
t.dataBuffer.append(r.current());
Expand Down Expand Up @@ -925,7 +925,7 @@ void read(Tokeniser t, CharacterReader r) {
} else if (r.matchConsumeIgnoreCase("DOCTYPE")) {
t.transition(Doctype);
} else if (r.matchConsume("[CDATA[")) {
// todo: should actually check current namepspace, and only non-html allows cdata. until namespace
// todo: should actually check current namespace, and only non-html allows cdata. until namespace
// is implemented properly, keep handling as cdata
//} else if (!t.currentNodeInHtmlNS() && r.matchConsume("[CDATA[")) {
t.createTempBuffer();
Expand Down Expand Up @@ -1128,7 +1128,7 @@ void read(Tokeniser t, CharacterReader r) {
},
BeforeDoctypeName {
void read(Tokeniser t, CharacterReader r) {
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
t.createDoctypePending();
t.transition(DoctypeName);
return;
Expand Down Expand Up @@ -1708,7 +1708,7 @@ private static void readCharRef(Tokeniser t, TokeniserState advance) {
}

private static void readEndTag(Tokeniser t, CharacterReader r, TokeniserState a, TokeniserState b) {
if (r.matchesLetter()) {
if (r.matchesAsciiAlpha()) {
t.createTagPending(false);
t.transition(a);
} else {
Expand Down
27 changes: 24 additions & 3 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Expand Up @@ -1042,11 +1042,11 @@ public void testInvalidTableContents() throws IOException {
}

@Test public void testSupportsNonAsciiTags() {
String body = "<進捗推移グラフ>Yes</進捗推移グラフ><русский-тэг>Correct</<русский-тэг>";
String body = "<a進捗推移グラフ>Yes</a進捗推移グラフ><bрусский-тэг>Correct</<bрусский-тэг>";
Document doc = Jsoup.parse(body);
Elements els = doc.select("進捗推移グラフ");
Elements els = doc.select("a進捗推移グラフ");
assertEquals("Yes", els.text());
els = doc.select("русский-тэг");
els = doc.select("bрусский-тэг");
assertEquals("Correct", els.text());
}

Expand Down Expand Up @@ -1457,4 +1457,25 @@ private boolean didAddElements(String input) {
assertNotNull(doc);
assertEquals("<a> <b> </b></a><b><div><a> </a><a>test</a> </div> </b>", TextUtil.stripNewlines(doc.body().html()));
}

@Test public void tagsMustStartWithAscii() {
    // https://github.com/jhy/jsoup/issues/1006
    // Names whose first character is an ascii alpha: expected to parse as an element with that exact name.
    final String[] accepted = {"a一", "a会员挂单金额5", "table(╯°□°)╯"};
    // The same names without the leading ascii alpha: expected not to produce an element.
    final String[] rejected = {"一", "会员挂单金额5", "(╯°□°)╯"};

    for (int i = 0; i < accepted.length; i++) {
        final String name = accepted[i];
        final String html = "<" + name + ">Text</" + name + ">";
        final Elements found = Jsoup.parse(html).getElementsByTag(name);
        assertEquals(1, found.size());
        assertEquals(name, found.get(0).tagName());
        assertEquals("Text", found.get(0).text());
    }

    for (int i = 0; i < rejected.length; i++) {
        final String name = rejected[i];
        final String html = "<" + name + ">Text</" + name + ">";
        final Document parsed = Jsoup.parse(html);
        final Elements found = parsed.getElementsByTag(name);
        assertEquals(0, found.size());
        // the would-be start tag is expected as escaped text, and the would-be end tag as a comment
        final String expectedBody = "&lt;" + name + "&gt;Text<!--/" + name + "-->";
        assertEquals(expectedBody, parsed.body().html());
    }
}
}

0 comments on commit e6b11b0

Please sign in to comment.