From 26d3c14ec3f0bf81c12e490efd7923e659696cca Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Wed, 4 Aug 2021 17:00:38 +1000 Subject: [PATCH] Fixed an NPE hit on Current Element if there was nothing left on the stack And guarded other calls to Current Element when used in the same manner Fixes #1603 --- CHANGES | 5 +- .../jsoup/parser/HtmlTreeBuilderState.java | 49 ++++++++++--------- .../java/org/jsoup/parser/TreeBuilder.java | 13 ++++- .../java/org/jsoup/parser/HtmlParserTest.java | 7 +++ 4 files changed, 48 insertions(+), 26 deletions(-) diff --git a/CHANGES b/CHANGES index 479e93130d..18ac10cfb6 100644 --- a/CHANGES +++ b/CHANGES @@ -14,9 +14,12 @@ jsoup changelog * Bugfix: in the XML parser, XML processing instructions without attributes would be serialized as if they did. - * Bugfix: updated the HtmlTreeParser resetInsertionMode to the current spec for supported elements + * Bugfix: updated the HtmlTreeParser resetInsertionMode to the current spec for supported elements. + * Bugfix: fixed an NPE when parsing fragment HTML into a table element. + + * Bugfix [Fuzz]: fixed a slow parse when a tag or an attribute name has thousands of null characters in it. diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index fcef2bcc3d..f5b65ba770 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -580,7 +580,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { // static final String[] InBodyStartOptions = new String[]{"optgroup", "option"}; case "optgroup": case "option": - if (tb.currentElement().normalName().equals("option")) + if (tb.currentElementIs("option")) tb.processEndTag("option"); tb.reconstructFormattingElements(); tb.insert(startTag); @@ -590,7 +590,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) { case "rt": if (tb.inScope("ruby")) { tb.generateImpliedEndTags(); - if (!tb.currentElement().normalName().equals("ruby")) { + if (!tb.currentElementIs("ruby")) { tb.error(this); tb.popStackToBefore("ruby"); // i.e. close up to but not include name } @@ -648,7 +648,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return false; } else { tb.generateImpliedEndTags(name); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(name); } @@ -675,7 +675,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return false; } else { tb.generateImpliedEndTags(); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); // remove currentForm from stack. will shift anything under up. tb.removeFromStack(currentForm); @@ -688,7 +688,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return tb.process(endTag); } else { tb.generateImpliedEndTags(name); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(name); } @@ -700,7 +700,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return false; } else { tb.generateImpliedEndTags(name); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(name); } @@ -716,7 +716,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return false; } else { tb.generateImpliedEndTags(name); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(Constants.Headings); } @@ -736,7 +736,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return false; } else { tb.generateImpliedEndTags(); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(name); } @@ -747,7 +747,7 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) { return false; } tb.generateImpliedEndTags(); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); @@ -1000,7 +1000,8 @@ boolean process(Token t, HtmlTreeBuilder tb) { } return true; // todo: as above todo } else if (t.isEOF()) { - if (tb.currentElement().normalName().equals("html")) + Element el = tb.currentElement(); + if (tb.currentElementIs("html")) tb.error(this); return true; // stops parsing } @@ -1059,7 +1060,7 @@ boolean process(Token t, HtmlTreeBuilder tb) { return false; } else { tb.generateImpliedEndTags(); - if (!tb.currentElement().normalName().equals("caption")) + if (!tb.currentElementIs("caption")) tb.error(this); tb.popStackToClose("caption"); tb.clearFormattingElementsToLastMarker(); @@ -1110,7 +1111,7 @@ boolean process(Token t, HtmlTreeBuilder tb) { case EndTag: Token.EndTag endTag = t.asEndTag(); if (endTag.normalName.equals("colgroup")) { - if (tb.currentElement().normalName().equals("html")) { // frag case + if (tb.currentElementIs("html")) { // frag case tb.error(this); return false; } else { @@ -1121,7 +1122,7 @@ boolean process(Token t, HtmlTreeBuilder tb) { return anythingElse(t, tb); break; case EOF: - if (tb.currentElement().normalName().equals("html")) + if (tb.currentElementIs("html")) return true; // stop parsing; frag case else return anythingElse(t, tb); @@ -1277,7 +1278,7 @@ boolean process(Token t, HtmlTreeBuilder tb) { return false; } tb.generateImpliedEndTags(); - if (!tb.currentElement().normalName().equals(name)) + if (!tb.currentElementIs(name)) tb.error(this); tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); @@ -1344,13 +1345,13 @@ boolean process(Token t, HtmlTreeBuilder tb) { if (name.equals("html")) return tb.process(start, InBody); else if (name.equals("option")) { - if (tb.currentElement().normalName().equals("option")) + if (tb.currentElementIs("option")) tb.processEndTag("option"); tb.insert(start); } else if (name.equals("optgroup")) { - if (tb.currentElement().normalName().equals("option")) + if (tb.currentElementIs("option")) tb.processEndTag("option"); // pop option and flow to pop optgroup - if (tb.currentElement().normalName().equals("optgroup")) + if (tb.currentElementIs("optgroup")) tb.processEndTag("optgroup"); tb.insert(start); } else if (name.equals("select")) { @@ -1373,15 +1374,15 @@ else if (name.equals("option")) { name = end.normalName(); switch (name) { case "optgroup": - if (tb.currentElement().normalName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).normalName().equals("optgroup")) + if (tb.currentElementIs("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).normalName().equals("optgroup")) tb.processEndTag("option"); - if (tb.currentElement().normalName().equals("optgroup")) + if (tb.currentElementIs("optgroup")) tb.pop(); else tb.error(this); break; case "option": - if (tb.currentElement().normalName().equals("option")) + if (tb.currentElementIs("option")) tb.pop(); else tb.error(this); @@ -1400,7 +1401,7 @@ else if (name.equals("option")) { } break; case EOF: - if (!tb.currentElement().normalName().equals("html")) + if (!tb.currentElementIs("html")) tb.error(this); break; default: @@ -1487,17 +1488,17 @@ boolean process(Token t, HtmlTreeBuilder tb) { return false; } } else if (t.isEndTag() && t.asEndTag().normalName().equals("frameset")) { - if (tb.currentElement().normalName().equals("html")) { // frag + if (tb.currentElementIs("html")) { // frag tb.error(this); return false; } else { tb.pop(); - if (!tb.isFragmentParsing() && !tb.currentElement().normalName().equals("frameset")) { + if (!tb.isFragmentParsing() && !tb.currentElementIs("frameset")) { tb.transition(AfterFrameset); } } } else if (t.isEOF()) { - if (!tb.currentElement().normalName().equals("html")) { + if (!tb.currentElementIs("html")) { tb.error(this); return true; } diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 1f37f3345d..6aa5b7999c 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -6,6 +6,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; +import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; import java.io.Reader; import java.util.ArrayList; @@ -109,11 +110,21 @@ protected boolean processEndTag(String name) { } - protected Element currentElement() { + @Nullable protected Element currentElement() { int size = stack.size(); return size > 0 ? stack.get(size-1) : null; } + /** + Checks if the Current Element's normal name equals the supplied name. + @param normalName name to check + @return true if there is a current element on the stack, and its name equals the supplied + */ + protected boolean currentElementIs(String normalName) { + Element current = currentElement(); + return current != null && current.normalName().equals(normalName); + } + /** * If the parser is tracking errors, add an error at the current position. * @param msg error message diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 7f2ff97fa6..ae9e014bfa 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1424,4 +1424,11 @@ private boolean didAddElements(String input) { int xmlElementCount = xml.getAllElements().size(); return htmlElementCount > xmlElementCount; } + + @Test public void canSetHtmlOnCreatedTableElements() { + // https://github.com/jhy/jsoup/issues/1603 + Element element = new Element("tr"); + element.html("One"); + assertEquals("\n \n One\n \n", element.outerHtml()); + } }