From dfb065adb7efb7108931720d101e9d237f085ea8 Mon Sep 17 00:00:00 2001 From: Benoit Tellier Date: Fri, 1 Jul 2022 08:56:27 +0700 Subject: [PATCH 1/9] Reuse buffers --- pom.xml | 13 ++++ .../java/org/jsoup/helper/BufferRecycler.java | 31 ++++++++ .../jsoup/helper/StringBuilderRecycler.java | 23 ++++++ .../org/jsoup/parser/CharacterReader.java | 35 ++++++++- src/main/java/org/jsoup/parser/Tokeniser.java | 27 ++++++- .../java/org/jsoup/parser/TreeBuilder.java | 1 + .../org/jsoup/benchmarks/JMHBenchmark.java | 74 +++++++++++++++++++ 7 files changed, 199 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/jsoup/helper/BufferRecycler.java create mode 100644 src/main/java/org/jsoup/helper/StringBuilderRecycler.java create mode 100644 src/test/java/org/jsoup/benchmarks/JMHBenchmark.java diff --git a/pom.xml b/pom.xml index 19fa0239b5..7e7cddd104 100644 --- a/pom.xml +++ b/pom.xml @@ -324,6 +324,19 @@ test + + org.openjdk.jmh + jmh-core + 1.35 + test + + + org.openjdk.jmh + jmh-generator-annprocess + 1.35 + test + + com.google.code.gson diff --git a/src/main/java/org/jsoup/helper/BufferRecycler.java b/src/main/java/org/jsoup/helper/BufferRecycler.java new file mode 100644 index 0000000000..0409630ea0 --- /dev/null +++ b/src/main/java/org/jsoup/helper/BufferRecycler.java @@ -0,0 +1,31 @@ +package org.jsoup.helper; + +import java.util.ArrayList; + +public class BufferRecycler { + protected final ArrayList charBuffers = new ArrayList<>(); + + public char[] allocCharBuffer(int minSize) { + if (minSize < 1024) { + return calloc(minSize); + } + char[] buffer = null; + if (!charBuffers.isEmpty()) { + buffer = charBuffers.remove(charBuffers.size() -1); + } + if (buffer == null || buffer.length < minSize) { + buffer = calloc(minSize); + } + return buffer; + } + + public void releaseCharBuffer(char[] buffer) { + if (buffer != null && buffer.length >= 1024) { + charBuffers.add(buffer); + } + } + + protected char[] calloc(int size) { + return new char[size]; + } +} diff --git a/src/main/java/org/jsoup/helper/StringBuilderRecycler.java b/src/main/java/org/jsoup/helper/StringBuilderRecycler.java new file mode 100644 index 0000000000..35f6adbb0c --- /dev/null +++ b/src/main/java/org/jsoup/helper/StringBuilderRecycler.java @@ -0,0 +1,23 @@ +package org.jsoup.helper; + +import java.util.ArrayList; + +public class StringBuilderRecycler { + protected final ArrayList stringBuilders = new ArrayList<>(); + + public StringBuilder get(int minSize) { + if (!stringBuilders.isEmpty()) { + StringBuilder stringBuilder = stringBuilders.remove(stringBuilders.size() - 1); + // Too small string builders are thrown away + if (stringBuilder.capacity() >= minSize) { + stringBuilder.setLength(0); + return stringBuilder; + } + } + return new StringBuilder(minSize); + } + + public void releaseByteBuffer(StringBuilder stringBuilder) { + stringBuilders.add(stringBuilder); + } +} diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index df902b1684..c79d977480 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -1,12 +1,14 @@ package org.jsoup.parser; import org.jsoup.UncheckedIOException; +import org.jsoup.helper.BufferRecycler; import org.jsoup.helper.Validate; import javax.annotation.Nullable; import java.io.IOException; import java.io.Reader; import java.io.StringReader; +import java.lang.ref.SoftReference; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -16,6 +18,33 @@ CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. */ public final class CharacterReader { + protected static final ThreadLocal> _recyclerRef = new ThreadLocal<>(); + protected static final ThreadLocal> stringCacheRef = new ThreadLocal<>(); + + public static BufferRecycler getBufferRecycler() { + SoftReference ref = _recyclerRef.get(); + BufferRecycler br = (ref == null) ? null : ref.get(); + + if (br == null) { + br = new BufferRecycler(); + ref = new SoftReference<>(br); + _recyclerRef.set(ref); + } + return br; + } + + public static String[] getStringCache() { + SoftReference ref = stringCacheRef.get(); + String[] stringCache = (ref == null) ? null : ref.get(); + + if (stringCache == null) { + stringCache = new String[stringCacheSize]; + ref = new SoftReference<>(stringCache); + stringCacheRef.set(ref); + } + return stringCache; + } + static final char EOF = (char) -1; private static final int maxStringCacheLen = 12; static final int maxBufferLen = 1024 * 32; // visible for testing @@ -30,7 +59,7 @@ public final class CharacterReader { private int readerPos; private int bufMark = -1; private static final int stringCacheSize = 512; - private String[] stringCache = new String[stringCacheSize]; // holds reused strings in this doc, to lessen garbage + private String[] stringCache = getStringCache(); // holds reused strings in this doc, to lessen garbage @Nullable private ArrayList newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] @@ -39,7 +68,7 @@ public CharacterReader(Reader input, int sz) { Validate.notNull(input); Validate.isTrue(input.markSupported()); reader = input; - charBuf = new char[Math.min(sz, maxBufferLen)]; + charBuf = getBufferRecycler().allocCharBuffer(Math.min(sz, maxBufferLen)); bufferUp(); } @@ -59,8 +88,8 @@ public void close() { } catch (IOException ignored) { } finally { reader = null; - charBuf = null; stringCache = null; + getBufferRecycler().releaseCharBuffer(charBuf); } } diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java index be00b7f63c..3d0771b73d 100644 --- a/src/main/java/org/jsoup/parser/Tokeniser.java +++ b/src/main/java/org/jsoup/parser/Tokeniser.java @@ -1,16 +1,33 @@ package org.jsoup.parser; +import org.jsoup.helper.StringBuilderRecycler; import org.jsoup.helper.Validate; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Entities; import javax.annotation.Nullable; + +import java.lang.ref.SoftReference; import java.util.Arrays; /** * Readers the input stream into tokens. */ final class Tokeniser { + protected static final ThreadLocal> stringCacheRef = new ThreadLocal<>(); + + public static StringBuilderRecycler getStringBuilderCache() { + SoftReference ref = stringCacheRef.get(); + StringBuilderRecycler br = (ref == null) ? null : ref.get(); + + if (br == null) { + br = new StringBuilderRecycler(); + ref = new SoftReference<>(br); + stringCacheRef.set(ref); + } + return br; + } + static final char replacementChar = '\uFFFD'; // replaces null character private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'}; @@ -37,8 +54,8 @@ final class Tokeniser { @Nullable private Token emitPending = null; // the token we are about to emit on next read private boolean isEmitPending = false; @Nullable private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one - private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read - StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for + private StringBuilder charsBuilder = getStringBuilderCache().get(1024); // buffers characters to output as one token, if more than one emit per read + StringBuilder dataBuffer = getStringBuilderCache().get(1024); // buffers data looking for Token.StartTag startPending = new Token.StartTag(); Token.EndTag endPending = new Token.EndTag(); @@ -360,4 +377,10 @@ String unescapeEntities(boolean inAttribute) { } return StringUtil.releaseBuilder(builder); } + + void release() { + StringBuilderRecycler stringBuilderCache = getStringBuilderCache(); + stringBuilderCache.releaseByteBuffer(charsBuilder); + stringBuilderCache.releaseByteBuffer(dataBuffer); + } } diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 77083b2ea0..c67cec52df 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -63,6 +63,7 @@ Document parse(Reader input, String baseUri, Parser parser) { // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments reader.close(); reader = null; + tokeniser.release(); tokeniser = null; stack = null; seenTags = null; diff --git a/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java b/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java new file mode 100644 index 0000000000..10cdf7b323 --- /dev/null +++ b/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java @@ -0,0 +1,74 @@ +package org.jsoup.benchmarks; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.profile.GCProfiler; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; +import org.openjdk.jmh.runner.options.TimeValue; + +public class JMHBenchmark { + private static final String CONTENT_SMALL = "

small

"; + private static final String CONTENT_MEDIUM = " \n" + + " \n" + + " Large HTML \n" + + " \n" + + " \n" + + "

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue semper porta. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur sodales ligula in libero.

\n" + + "

Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Aenean quam. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh.

\n" + + "

Quisque volutpat condimentum velit. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nam nec ante. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Nulla facilisi. Ut fringilla. Suspendisse potenti. Nunc feugiat mi a tellus consequat imperdiet. Vestibulum sapien. Proin quam. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Etiam ultrices. Suspendisse in justo eu magna luctus suscipit. Sed lectus. Integer euismod lacus luctus magna.

\n" + + "

Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Morbi lacinia molestie dui. Praesent blandit dolor. Sed non quam. In vel mi sit amet augue congue elementum. Morbi in ipsum sit amet pede facilisis laoreet. Suspendisse in justo eu magna luctus suscipit. Donec lacus nunc, viverra nec, blandit vel, egestas et, augue. Vestibulum tincidunt malesuada tellus. Ut ultrices ultrices enim. Curabitur sit amet mauris. Morbi in dui quis est pulvinar ullamcorper. Nulla facilisi.

\n" + + "

Integer lacinia sollicitudin massa. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Cras metus. Sed aliquet risus a tortor. Integer id quam. Morbi mi. Morbi in dui quis est pulvinar ullamcorper. Quisque nisl felis, venenatis tristique, dignissim in, ultrices sit amet, augue. Proin sodales libero eget ante. Nulla quam. Aenean laoreet. Vestibulum nisi lectus, commodo ac, facilisis ac, ultricies eu, pede. Ut orci risus, accumsan porttitor, cursus quis, aliquet eget, justo. Sed pretium blandit orci. Quisque nisl felis, venenatis tristique, dignissim in, ultrices sit amet, augue. Ut eu diam at pede suscipit sodales. Aenean lectus elit, fermentum non, convallis id, sagittis at, neque.

\n" + + "

Nullam mauris orci, aliquet et, iaculis et, viverra vitae, ligula. Nulla ut felis in purus aliquam imperdiet. Maecenas aliquet mollis lectus. Vivamus consectetuer risus et tortor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum.

\n" + + "

Proin sodales libero eget ante. Praesent mauris. Fusce nec tellus sed augue semper porta. Nullam mauris orci, aliquet et, iaculis et, viverra vitae, ligula. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Curabitur sodales ligula in libero. Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis.

\n" + + "

Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh. Quisque volutpat condimentum velit. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Sed convallis tristique sem. Nam nec ante. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Nulla facilisi. Ut fringilla. Suspendisse potenti. Nunc feugiat mi a tellus consequat imperdiet.

\n" + + "

Vestibulum sapien. Proin quam. Fusce ac turpis quis ligula lacinia aliquet. Etiam ultrices. Suspendisse in justo eu magna luctus suscipit. Sed lectus. Integer euismod lacus luctus magna. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Morbi lacinia molestie dui. Suspendisse potenti. Praesent blandit dolor. Sed non quam. In vel mi sit amet augue congue elementum. Morbi in ipsum sit amet pede facilisis laoreet. Donec lacus nunc, viverra nec, blandit vel, egestas et, augue.

\n" + + "

Vestibulum tincidunt malesuada tellus. Ut ultrices ultrices enim. Curabitur sit amet mauris. Morbi in dui quis est pulvinar ullamcorper. Nulla facilisi. Quisque cursus, metus vitae pharetra auctor, sem massa mattis sem, at interdum magna augue eget diam. Integer lacinia sollicitudin massa. Cras metus. Sed aliquet risus a tortor. Integer id quam. Morbi mi. Quisque nisl felis, venenatis tristique, dignissim in, ultrices sit amet, augue. Proin sodales libero eget ante. Nulla quam.

\n" + + "

Aenean laoreet. Vestibulum nisi lectus, commodo ac, facilisis ac, ultricies eu, pede. Ut orci risus, accumsan porttitor, cursus quis, aliquet eget, justo. Sed pretium blandit orci. Ut eu diam at pede suscipit sodales. Aenean lectus elit, fermentum non, convallis id, sagittis at, neque. Nullam mauris orci, aliquet et, iaculis et, viverra vitae, ligula. Nulla ut felis in purus aliquam imperdiet. Maecenas aliquet mollis lectus. Vivamus consectetuer risus et tortor. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio.

\n" + + " \n" + + ""; + private static final byte[] CONTENT_MEDIUM_AS_BYTES = CONTENT_MEDIUM.getBytes(StandardCharsets.UTF_8); + + @Test + public void launchBenchmark() throws Exception { + Options opt = new OptionsBuilder() + .include(this.getClass().getName() + ".*") + .mode(Mode.AverageTime) + .addProfiler(GCProfiler.class) + .timeUnit(TimeUnit.MICROSECONDS) + .warmupTime(TimeValue.seconds(5)) + .warmupIterations(3) + .measurementTime(TimeValue.seconds(5)) + .measurementIterations(5) + .threads(1) + .forks(1) + .shouldFailOnError(true) + .shouldDoGC(true) + .build(); + + new Runner(opt).run(); + } + + @Benchmark + public void benchmarkSmall(Blackhole bh) throws Exception{ + bh.consume(Jsoup.parse(CONTENT_SMALL)); + } + + @Benchmark + public void benchmarkMediumInputStream(Blackhole bh) throws Exception{ + bh.consume(Jsoup.parse(new ByteArrayInputStream(CONTENT_MEDIUM_AS_BYTES), StandardCharsets.UTF_8.name(), "")); + } + + @Benchmark + public void benchmarkMedium(Blackhole bh) throws Exception{ + bh.consume(Jsoup.parse(CONTENT_MEDIUM)); + } +} From a950665834c5af483362796abfac629c91978fb2 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sat, 7 Jan 2023 13:25:11 +1100 Subject: [PATCH 2/9] General implementation of a threadlocal pool --- .../java/org/jsoup/helper/BufferRecycler.java | 31 -------- .../java/org/jsoup/internal/BufferPool.java | 79 +++++++++++++++++++ .../java/org/jsoup/internal/StringUtil.java | 44 +++++------ .../org/jsoup/parser/CharacterReader.java | 68 ++++++++-------- .../org/jsoup/benchmarks/JMHBenchmark.java | 2 + .../servlets/InterruptedServlet.java | 2 +- .../org/jsoup/parser/CharacterReaderTest.java | 14 ++-- 7 files changed, 144 insertions(+), 96 deletions(-) delete mode 100644 src/main/java/org/jsoup/helper/BufferRecycler.java create mode 100644 src/main/java/org/jsoup/internal/BufferPool.java diff --git a/src/main/java/org/jsoup/helper/BufferRecycler.java b/src/main/java/org/jsoup/helper/BufferRecycler.java deleted file mode 100644 index 0409630ea0..0000000000 --- a/src/main/java/org/jsoup/helper/BufferRecycler.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.jsoup.helper; - -import java.util.ArrayList; - -public class BufferRecycler { - protected final ArrayList charBuffers = new ArrayList<>(); - - public char[] allocCharBuffer(int minSize) { - if (minSize < 1024) { - return calloc(minSize); - } - char[] buffer = null; - if (!charBuffers.isEmpty()) { - buffer = charBuffers.remove(charBuffers.size() -1); - } - if (buffer == null || buffer.length < minSize) { - buffer = calloc(minSize); - } - return buffer; - } - - public void releaseCharBuffer(char[] buffer) { - if (buffer != null && buffer.length >= 1024) { - charBuffers.add(buffer); - } - } - - protected char[] calloc(int size) { - return new char[size]; - } -} diff --git a/src/main/java/org/jsoup/internal/BufferPool.java b/src/main/java/org/jsoup/internal/BufferPool.java new file mode 100644 index 0000000000..f84e5ad341 --- /dev/null +++ b/src/main/java/org/jsoup/internal/BufferPool.java @@ -0,0 +1,79 @@ +package org.jsoup.internal; + +import org.jsoup.helper.Validate; + +import javax.annotation.Nullable; +import java.lang.ref.SoftReference; +import java.util.Stack; + +/** + jsoup internal use only - maintains a pool of various threadlocal, soft-reference buffers used by the parser and for + serialization. */ +public final class BufferPool { + // char[] used in CharacterReader (input we are reading) + // String[] used in CharacterReader (token cache) + // String Builders (accumulators) + + public interface Lifecycle { + /** + Called when an object is borrowed but none are in the pool. Should return a new object of the desired min + capacity. + */ + T create(); + + /** + Called when an object is returned. Should resize the object (via create) if it has grown too large, and empty + its contents if not for reuse. + */ + T reset(T obj); + } + + final Lifecycle lifecycle; + final ThreadLocal>> pool; + final int maxIdle; + + /** + Creats a new BufferPool. Prefer to reuse an existing pool - centralize if required. + @param maxIdle the maximum number of objects to hold that are not actively in use. Per thread - used so that + e.g. multiple StringBuilders can be in use at once. + @param lifecycle the implemention of the pooled objects lifecycle -- create and reset + */ + public BufferPool(int maxIdle, Lifecycle lifecycle) { + this.lifecycle = lifecycle; + this.maxIdle = maxIdle; + pool = ThreadLocal.withInitial(() -> new SoftReference<>(new Stack<>())); + } + + /** + Grab an object from the pool, or create one if required. + */ + public T borrow() { + Stack stack = getStack(); + return stack.empty() ? + lifecycle.create() : + stack.pop(); + } + + /** + Place an object back in the pool. Will reset() the object, and trims the pool to maxIdle. + */ + public void release(T obj) { + Validate.notNull(obj); + obj = lifecycle.reset(obj); // clear the contents, and prevent from growing too large (per impl) + Stack stack = getStack(); + stack.push(obj); + + // trim stack to maxIdle objects + while (stack.size() > maxIdle) stack.pop(); + } + + private Stack getStack() { + SoftReference> ref = pool.get(); + @Nullable Stack stack = ref.get(); + if (stack == null) { // got GCed, reset it + stack = new Stack<>(); + pool.set(new SoftReference<>(stack)); + } + return stack; + } +} diff --git a/src/main/java/org/jsoup/internal/StringUtil.java b/src/main/java/org/jsoup/internal/StringUtil.java index 73a589b17c..6e179a11ac 100644 --- a/src/main/java/org/jsoup/internal/StringUtil.java +++ b/src/main/java/org/jsoup/internal/StringUtil.java @@ -8,7 +8,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.Iterator; -import java.util.Stack; import java.util.regex.Pattern; /** @@ -256,7 +255,7 @@ public static boolean in(final String needle, final String... haystack) { final int len = haystack.length; for (int i = 0; i < len; i++) { if (haystack[i].equals(needle)) - return true; + return true; } return false; } @@ -335,13 +334,6 @@ private static String stripControlChars(final String input) { return controlChars.matcher(input).replaceAll(""); } - private static final ThreadLocal> threadLocalBuilders = new ThreadLocal>() { - @Override - protected Stack initialValue() { - return new Stack<>(); - } - }; - /** * Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is * prevented from growing too large. @@ -350,10 +342,7 @@ protected Stack initialValue() { * @return an empty StringBuilder */ public static StringBuilder borrowBuilder() { - Stack builders = threadLocalBuilders.get(); - return builders.empty() ? - new StringBuilder(MaxCachedBuilderSize) : - builders.pop(); + return StringBuilders.borrow(); } /** @@ -365,21 +354,24 @@ public static StringBuilder borrowBuilder() { public static String releaseBuilder(StringBuilder sb) { Validate.notNull(sb); String string = sb.toString(); - - if (sb.length() > MaxCachedBuilderSize) - sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big - else - sb.delete(0, sb.length()); // make sure it's emptied on release - - Stack builders = threadLocalBuilders.get(); - builders.push(sb); - - while (builders.size() > MaxIdleBuilders) { - builders.pop(); - } + StringBuilders.release(sb); return string; } - private static final int MaxCachedBuilderSize = 8 * 1024; + private static final int MinCapacity = 1024; + private static final int MaxCapacity = 8 * MinCapacity; private static final int MaxIdleBuilders = 8; + + private static final BufferPool StringBuilders = + new BufferPool<>(MaxIdleBuilders, new BufferPool.Lifecycle() { + @Override public StringBuilder create() { + return new StringBuilder(MinCapacity); + } + + @Override public StringBuilder reset(StringBuilder sb) { + if (sb.length() > MaxCapacity) + return create(); + return sb.delete(0, sb.length()); + } + }); } diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index c79d977480..6b83cc6355 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -3,6 +3,7 @@ import org.jsoup.UncheckedIOException; import org.jsoup.helper.BufferRecycler; import org.jsoup.helper.Validate; +import org.jsoup.internal.BufferPool; import javax.annotation.Nullable; import java.io.IOException; @@ -18,36 +19,9 @@ CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. */ public final class CharacterReader { - protected static final ThreadLocal> _recyclerRef = new ThreadLocal<>(); - protected static final ThreadLocal> stringCacheRef = new ThreadLocal<>(); - - public static BufferRecycler getBufferRecycler() { - SoftReference ref = _recyclerRef.get(); - BufferRecycler br = (ref == null) ? null : ref.get(); - - if (br == null) { - br = new BufferRecycler(); - ref = new SoftReference<>(br); - _recyclerRef.set(ref); - } - return br; - } - - public static String[] getStringCache() { - SoftReference ref = stringCacheRef.get(); - String[] stringCache = (ref == null) ? null : ref.get(); - - if (stringCache == null) { - stringCache = new String[stringCacheSize]; - ref = new SoftReference<>(stringCache); - stringCacheRef.set(ref); - } - return stringCache; - } - static final char EOF = (char) -1; private static final int maxStringCacheLen = 12; - static final int maxBufferLen = 1024 * 32; // visible for testing + static final int maxBufferLen = 1024 * 8; // visible for testing static final int readAheadLimit = (int) (maxBufferLen * 0.75); // visible for testing private static final int minReadAheadLen = 1024; // the minimum mark length supported. No HTML entities can be larger than this. @@ -59,16 +33,18 @@ public static String[] getStringCache() { private int readerPos; private int bufMark = -1; private static final int stringCacheSize = 512; - private String[] stringCache = getStringCache(); // holds reused strings in this doc, to lessen garbage + private String[] stringCache; // holds reused strings in this doc, to lessen garbage @Nullable private ArrayList newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] public CharacterReader(Reader input, int sz) { + // todo - sz is defunct now that we have a fixed sized re-used buffer; remove Validate.notNull(input); Validate.isTrue(input.markSupported()); reader = input; - charBuf = getBufferRecycler().allocCharBuffer(Math.min(sz, maxBufferLen)); + charBuf = CharArrayPool.borrow(); + stringCache = StringCachePool.borrow(); bufferUp(); } @@ -88,8 +64,10 @@ public void close() { } catch (IOException ignored) { } finally { reader = null; + CharArrayPool.release(charBuf); + charBuf = null; + StringCachePool.release(stringCache); stringCache = null; - getBufferRecycler().releaseCharBuffer(charBuf); } } @@ -785,4 +763,32 @@ static boolean rangeEquals(final char[] charBuf, final int start, int count, fin boolean rangeEquals(final int start, final int count, final String cached) { return rangeEquals(charBuf, start, count, cached); } + + // only useful for CharacterReader - does not reset so can be used across reads + private static final BufferPool StringCachePool = + new BufferPool<>(2, new BufferPool.Lifecycle() { + + @Override public String[] create() { + return new String[stringCacheSize]; + } + + @Override public String[] reset(String[] arr) { + // no-op as we don't grow, and want to reuse the contents between parses + return arr; + } + }); + + public static final BufferPool CharArrayPool = + new BufferPool<>(2, new BufferPool.Lifecycle() { + @Override public char[] create() { + return new char[maxBufferLen]; + } + + @Override public char[] reset(char[] arr) { + if (arr.length > maxBufferLen) + return create(); + // does not clear contents, as the consumer (CharacterReader) reads into it and maintains length/pos etc + return arr; + } + }); } diff --git a/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java b/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java index 10cdf7b323..79b1fb305a 100644 --- a/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java +++ b/src/test/java/org/jsoup/benchmarks/JMHBenchmark.java @@ -5,6 +5,7 @@ import java.util.concurrent.TimeUnit; import org.jsoup.Jsoup; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.Mode; @@ -15,6 +16,7 @@ import org.openjdk.jmh.runner.options.OptionsBuilder; import org.openjdk.jmh.runner.options.TimeValue; +@Disabled public class JMHBenchmark { private static final String CONTENT_SMALL = "

small

"; private static final String CONTENT_MEDIUM = " \n" + diff --git a/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java b/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java index 22180e13d5..8891882f04 100644 --- a/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java +++ b/src/test/java/org/jsoup/integration/servlets/InterruptedServlet.java @@ -22,7 +22,7 @@ protected void doGet(HttpServletRequest req, HttpServletResponse res) throws IOE StringBuilder sb = new StringBuilder(); sb.append("Something"); - while (sb.length() <= CharacterReaderTest.maxBufferLen) { + while (sb.length() <= CharacterReaderTest.maxBufferLen * 4) { sb.append("A suitable amount of data. \n"); } sb.append("

Finale.

"); diff --git a/src/test/java/org/jsoup/parser/CharacterReaderTest.java b/src/test/java/org/jsoup/parser/CharacterReaderTest.java index d9f2280177..5b79285483 100644 --- a/src/test/java/org/jsoup/parser/CharacterReaderTest.java +++ b/src/test/java/org/jsoup/parser/CharacterReaderTest.java @@ -410,10 +410,10 @@ public void notEmptyAtBufferSplitPoint() { // get over the buffer while (!noTrack.matches("[foo]")) noTrack.consumeTo("[foo]"); - assertEquals(32778, noTrack.pos()); + assertEquals(8194, noTrack.pos()); // sensitive to maxBufferLen assertEquals(1, noTrack.lineNumber()); assertEquals(noTrack.pos()+1, noTrack.columnNumber()); - assertEquals("1:32779", noTrack.cursorPos()); + assertEquals("1:8195", noTrack.cursorPos()); // and the line numbers: "\n\n\n" assertEquals(0, track.pos()); @@ -441,12 +441,12 @@ public void notEmptyAtBufferSplitPoint() { // get over the buffer while (!track.matches("[foo]")) track.consumeTo("[foo]"); - assertEquals(32778, track.pos()); + assertEquals(8194, track.pos()); assertEquals(4, track.lineNumber()); - assertEquals(32761, track.columnNumber()); - assertEquals("4:32761", track.cursorPos()); + assertEquals(8177, track.columnNumber()); + assertEquals("4:8177", track.cursorPos()); track.consumeTo('\n'); - assertEquals("4:32766", track.cursorPos()); + assertEquals("4:8182", track.cursorPos()); track.consumeTo("[bar]"); assertEquals(5, track.lineNumber()); @@ -466,7 +466,7 @@ public void notEmptyAtBufferSplitPoint() { assertEquals("1:1", reader.cursorPos()); while (!reader.isEmpty()) reader.consume(); - assertEquals(131096, reader.pos()); + assertEquals(32816, reader.pos()); // sensitive to maxBufferLen assertEquals(reader.pos() + 1, reader.columnNumber()); assertEquals(1, reader.lineNumber()); } From f108026ba7f2937e3579f8d1931a8a88002ba5f3 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Sat, 7 Jan 2023 13:55:03 +1100 Subject: [PATCH 3/9] Don't use withInitial, not in default Android level --- src/main/java/org/jsoup/internal/BufferPool.java | 7 ++++++- src/main/java/org/jsoup/parser/CharacterReader.java | 2 -- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/jsoup/internal/BufferPool.java b/src/main/java/org/jsoup/internal/BufferPool.java index f84e5ad341..89106ac570 100644 --- a/src/main/java/org/jsoup/internal/BufferPool.java +++ b/src/main/java/org/jsoup/internal/BufferPool.java @@ -41,7 +41,12 @@ Called when an object is returned. Should resize the object (via create) if it h public BufferPool(int maxIdle, Lifecycle lifecycle) { this.lifecycle = lifecycle; this.maxIdle = maxIdle; - pool = ThreadLocal.withInitial(() -> new SoftReference<>(new Stack<>())); + pool = new ThreadLocal>>() { + @Override protected SoftReference> initialValue() { + return new SoftReference<>(new Stack<>()); + } + }; + // todo - not yet using .withInitialValue as it's not in Android default - give our new instructions to enable desugaring some time to land. simplify getStack } /** diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 6b83cc6355..7cbce59dda 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -1,7 +1,6 @@ package org.jsoup.parser; import org.jsoup.UncheckedIOException; -import org.jsoup.helper.BufferRecycler; import org.jsoup.helper.Validate; import org.jsoup.internal.BufferPool; @@ -9,7 +8,6 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import java.lang.ref.SoftReference; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; From b699998af1278f1b9c4bea3a97a8882621fd0fd7 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 9 Jan 2023 16:02:09 +1100 Subject: [PATCH 4/9] Moving char buffer size back to 32K Perf tests show this performs better when parsing larger docs; small strings still get the speed boost as the buffer is recycled. --- .../java/org/jsoup/parser/CharacterReader.java | 2 +- .../java/org/jsoup/parser/CharacterReaderTest.java | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 7cbce59dda..22e9582f53 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -19,7 +19,7 @@ public final class CharacterReader { static final char EOF = (char) -1; private static final int maxStringCacheLen = 12; - static final int maxBufferLen = 1024 * 8; // visible for testing + static final int maxBufferLen = 1024 * 32; // visible for testing static final int readAheadLimit = (int) (maxBufferLen * 0.75); // visible for testing private static final int minReadAheadLen = 1024; // the minimum mark length supported. No HTML entities can be larger than this. diff --git a/src/test/java/org/jsoup/parser/CharacterReaderTest.java b/src/test/java/org/jsoup/parser/CharacterReaderTest.java index 5b79285483..3c5d4a8ec3 100644 --- a/src/test/java/org/jsoup/parser/CharacterReaderTest.java +++ b/src/test/java/org/jsoup/parser/CharacterReaderTest.java @@ -410,10 +410,10 @@ public void notEmptyAtBufferSplitPoint() { // get over the buffer while (!noTrack.matches("[foo]")) noTrack.consumeTo("[foo]"); - assertEquals(8194, noTrack.pos()); // sensitive to maxBufferLen + assertEquals(32778, noTrack.pos()); // sensitive to maxBufferLen assertEquals(1, noTrack.lineNumber()); assertEquals(noTrack.pos()+1, noTrack.columnNumber()); - assertEquals("1:8195", noTrack.cursorPos()); + assertEquals("1:32779", noTrack.cursorPos()); // and the line numbers: "\n\n\n" assertEquals(0, track.pos()); @@ -441,12 +441,12 @@ public void notEmptyAtBufferSplitPoint() { // get over the buffer while (!track.matches("[foo]")) track.consumeTo("[foo]"); - assertEquals(8194, track.pos()); + assertEquals(32778, track.pos()); assertEquals(4, track.lineNumber()); - assertEquals(8177, track.columnNumber()); - assertEquals("4:8177", track.cursorPos()); + assertEquals(32761, track.columnNumber()); + assertEquals("4:32761", track.cursorPos()); track.consumeTo('\n'); - assertEquals("4:8182", track.cursorPos()); + assertEquals("4:32766", track.cursorPos()); track.consumeTo("[bar]"); assertEquals(5, track.lineNumber()); @@ -466,7 +466,7 @@ public void notEmptyAtBufferSplitPoint() { assertEquals("1:1", reader.cursorPos()); while (!reader.isEmpty()) reader.consume(); - assertEquals(32816, reader.pos()); // sensitive to maxBufferLen + assertEquals(131096, reader.pos()); // sensitive to maxBufferLen assertEquals(reader.pos() + 1, reader.columnNumber()); assertEquals(1, reader.lineNumber()); } From 30e73c8a3ef0448d461bb1648e04b3ef69d14605 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Thu, 12 Jan 2023 17:04:49 +1100 Subject: [PATCH 5/9] Removed need of BufferedReader In CharacterReader. This removes the redundant buffer allocation, and simplifies the read. For small file reads (same as small string test), updated version is ~ 20% faster than original. --- src/main/java/org/jsoup/helper/DataUtil.java | 4 +- .../org/jsoup/parser/CharacterReader.java | 71 ++++++++++--------- .../org/jsoup/parser/CharacterReaderTest.java | 2 +- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index 3f34450b7f..31d9d93cc2 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -15,7 +15,6 @@ import javax.annotation.Nullable; import javax.annotation.WillClose; -import java.io.BufferedReader; import java.io.CharArrayReader; import java.io.File; import java.io.FileInputStream; @@ -90,7 +89,6 @@ public static Document load(File file, @Nullable String charsetName, String base zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes } finally { stream.close(); - } stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); } @@ -208,7 +206,7 @@ else if (first instanceof Comment) { if (doc == null) { if (charsetName == null) charsetName = defaultCharsetName; - BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), bufferSize); // Android level does not allow us try-with-resources + InputStreamReader reader = new InputStreamReader(input, Charset.forName(charsetName)); // Android level does not allow us try-with-resources try { if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here long skipped = reader.skip(1); diff --git a/src/main/java/org/jsoup/parser/CharacterReader.java b/src/main/java/org/jsoup/parser/CharacterReader.java index 22e9582f53..b6a0692b62 100644 --- a/src/main/java/org/jsoup/parser/CharacterReader.java +++ b/src/main/java/org/jsoup/parser/CharacterReader.java @@ -36,22 +36,30 @@ public final class CharacterReader { @Nullable private ArrayList newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] - public CharacterReader(Reader input, int sz) { - // todo - sz is defunct now that we have a fixed sized re-used buffer; remove + /** + Create a new CharacterReader that reads from the input Reader. + @param input the Reader to read from. Need not be a BufferedReader, as CharacterReader maintains a buffer + internally. + */ + public CharacterReader(Reader input) { Validate.notNull(input); - Validate.isTrue(input.markSupported()); reader = input; charBuf = CharArrayPool.borrow(); stringCache = StringCachePool.borrow(); bufferUp(); } - public CharacterReader(Reader input) { - this(input, maxBufferLen); + /** + @deprecated The initial buffer size parameter is no longer supported. This method will be removed in the next + release. + */ + @Deprecated + public CharacterReader(Reader input, int sz) { + this(input); } public CharacterReader(String input) { - this(new StringReader(input), input.length()); + this(new StringReader(input)); } public void close() { @@ -70,40 +78,36 @@ public void close() { } private boolean readFully; // if the underlying stream has been completely read, no value in further buffering + private void bufferUp() { if (readFully || bufPos < bufSplitPoint) return; - final int pos; - final int offset; - if (bufMark != -1) { - pos = bufMark; - offset = bufPos - bufMark; - } else { - pos = bufPos; - offset = 0; + // discard the consumed characters from the buffer and pull the remainder forward: + if (bufPos > 0) { + int offset = bufMark == -1 ? bufPos : bufMark; // if set, mark is before pos + int remainder = bufLength - offset; + System.arraycopy(charBuf, offset, charBuf, 0, remainder); + readerPos += offset; + bufLength = remainder; + if (bufMark != -1) { + bufPos = bufMark; + bufMark = 0; + } else { + bufPos = 0; + } } + // blocking read until we get the minimum buffered up, then read until would block try { - final long skipped = reader.skip(pos); - reader.mark(maxBufferLen); - int read = 0; - while (read <= minReadAheadLen) { - int thisRead = reader.read(charBuf, read, charBuf.length - read); - if (thisRead == -1) + while (bufLength <= minReadAheadLen || (bufLength < charBuf.length && reader.ready())) { + int thisRead = reader.read(charBuf, bufLength, charBuf.length - bufLength); + if (thisRead == -1) { readFully = true; - if (thisRead <= 0) break; - read += thisRead; - } - reader.reset(); - if (read > 0) { - Validate.isTrue(skipped == pos); // Previously asserted that there is room in buf to skip, so this will be a WTF - bufLength = read; - readerPos += pos; - bufPos = offset; - if (bufMark != -1) - bufMark = 0; + } + bufLength += thisRead; + if (bufMark != -1) bufMark = 0; bufSplitPoint = Math.min(bufLength, readAheadLimit); } } catch (IOException e) { @@ -783,9 +787,8 @@ boolean rangeEquals(final int start, final int count, final String cached) { } @Override public char[] reset(char[] arr) { - if (arr.length > maxBufferLen) - return create(); - // does not clear contents, as the consumer (CharacterReader) reads into it and maintains length/pos etc + // no size check as it's fixed size + Arrays.fill(arr, '\0'); // zero it to make sure we never inadvertently read stale input return arr; } }); diff --git a/src/test/java/org/jsoup/parser/CharacterReaderTest.java b/src/test/java/org/jsoup/parser/CharacterReaderTest.java index 3c5d4a8ec3..bf29b30d6c 100644 --- a/src/test/java/org/jsoup/parser/CharacterReaderTest.java +++ b/src/test/java/org/jsoup/parser/CharacterReaderTest.java @@ -332,7 +332,7 @@ public void consumeToNonexistentEndWhenAtAnd() { @Test public void notEmptyAtBufferSplitPoint() { - CharacterReader r = new CharacterReader(new StringReader("How about now"), 3); + CharacterReader r = new CharacterReader(new StringReader("How about now")); assertEquals("How", r.consumeTo(' ')); assertFalse(r.isEmpty(), "Should not be empty"); From 51f7d89e9cc6b71d0e94b884f259c64f792b07af Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 30 Oct 2023 12:20:38 +1100 Subject: [PATCH 6/9] consumeSubQuery should not drop leading combinators Refactored so that it eats until a combinator is seen after non-combinator content, and returns it all. And corrected unit tests that were incorrectly relying on that behavior. Note that a leading combinator will combine against the root element, which is either the Document, or the context element. Fixes #1707 --- CHANGES | 4 ++++ src/main/java/org/jsoup/select/QueryParser.java | 9 ++++++--- src/test/java/org/jsoup/select/QueryParserTest.java | 6 +++--- src/test/java/org/jsoup/select/SelectorTest.java | 13 +++++++++++++ 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/CHANGES b/CHANGES index 1e84076053..e3661405c4 100644 --- a/CHANGES +++ b/CHANGES @@ -16,6 +16,10 @@ Release 1.17.1 [PENDING] elements to be returned when used on elements other than the root document. + * Bugfix: in a sub-query such as `p:has(> span, > i)`, combinators following the `,` Or combinator would be + incorrectly skipped, such that the sub-query was parsed as `i` instead of `> i`. + + Release 1.16.2 [20-Oct-2023] * Improvement: optimized the performance of complex CSS selectors, by adding a cost-based query planner. Evaluators are sorted by their relative execution cost, and executed in order of lower to higher cost. This speeds the diff --git a/src/main/java/org/jsoup/select/QueryParser.java b/src/main/java/org/jsoup/select/QueryParser.java index 09f53bdd00..30872eb53b 100644 --- a/src/main/java/org/jsoup/select/QueryParser.java +++ b/src/main/java/org/jsoup/select/QueryParser.java @@ -145,18 +145,21 @@ private void combinator(char combinator) { private String consumeSubQuery() { StringBuilder sq = StringUtil.borrowBuilder(); + boolean seenNonCombinator = false; // eat until we hit a combinator after eating something else while (!tq.isEmpty()) { if (tq.matches("(")) sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); else if (tq.matches("[")) sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); else if (tq.matchesAny(Combinators)) - if (sq.length() > 0) + if (seenNonCombinator) break; else - tq.consume(); - else + sq.append(tq.consume()); + else { + seenNonCombinator = true; sq.append(tq.consume()); + } } return StringUtil.releaseBuilder(sq); } diff --git a/src/test/java/org/jsoup/select/QueryParserTest.java b/src/test/java/org/jsoup/select/QueryParserTest.java index ae2f344886..51b7c925d2 100644 --- a/src/test/java/org/jsoup/select/QueryParserTest.java +++ b/src/test/java/org/jsoup/select/QueryParserTest.java @@ -18,10 +18,10 @@ public class QueryParserTest { "
  • l2
  • " + "

    yes

    " + ""); - assertEquals("l1 l2 yes", doc.body().select(">p>strong,>*>li>strong").text()); + assertEquals("l1 yes", doc.body().select(">p>strong,>li>strong").text()); // selecting immediate from body + assertEquals("l2 yes", doc.select("body>p>strong,body>*>li>strong").text()); + assertEquals("l2 yes", doc.select("body>*>li>strong,body>p>strong").text()); assertEquals("l2 yes", doc.select("body>p>strong,body>*>li>strong").text()); - assertEquals("yes", doc.select(">body>*>li>strong,>body>p>strong").text()); - assertEquals("l2", doc.select(">body>p>strong,>body>*>li>strong").text()); } @Test public void testImmediateParentRun() { diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index e941f9032b..686214aa6b 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -1206,4 +1206,17 @@ public void wildcardNamespaceMatchesNoNamespace() { Elements innerLisFromParent = li2.select("ul li"); assertEquals(innerLis, innerLisFromParent); } + + @Test public void rootImmediateParentSubquery() { + // a combinator at the start of the query is applied to the Root selector. i.e. "> p" matches a P immediately parented + // by the Root (which is for a top level query, or the context element in :has) + // in the sub query, the combinator was dropped incorrectly + String html = "

    A

    B

    C

    \n"; + Document doc = Jsoup.parse(html); + + Elements els = doc.select("p:has(> span, > i)"); // should match a p with an immediate span or i + assertEquals(2, els.size()); + assertEquals("0", els.get(0).id()); + assertEquals("2", els.get(1).id()); + } } From 1c57f30a71d7af85950d34dad733f13f0fe7c3c5 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 30 Oct 2023 14:43:53 +1100 Subject: [PATCH 7/9] Tidied up some tests with an assertion helper --- .../java/org/jsoup/select/SelectorTest.java | 120 +++++++----------- 1 file changed, 48 insertions(+), 72 deletions(-) diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index 686214aa6b..1299acfdcb 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -19,16 +19,31 @@ * @author Jonathan Hedley, jonathan@hedley.net */ public class SelectorTest { + + /** Test that the selected elements match exactly the specified IDs. */ + static void assertSelectedIds(Elements els, String... ids) { + assertNotNull(els); + assertEquals(ids.length, els.size()); + for (int i = 0; i < ids.length; i++) { + assertEquals(ids[i], els.get(i).id()); + } + } + + static void assertSelectedOwnText(Elements els, String... ownTexts) { + assertNotNull(els); + assertEquals(ownTexts.length, els.size()); + for (int i = 0; i < ownTexts.length; i++) { + assertEquals(ownTexts[i], els.get(i).ownText()); + } + } + @Test public void testByTag() { - // should be case insensitive + // should be case-insensitive Elements els = Jsoup.parse("

    Hello

    ").select("DIV"); - assertEquals(3, els.size()); - assertEquals("1", els.get(0).id()); - assertEquals("2", els.get(1).id()); - assertEquals("3", els.get(2).id()); + assertSelectedIds(els, "1", "2", "3"); Elements none = Jsoup.parse("

    Hello

    ").select("span"); - assertEquals(0, none.size()); + assertTrue(none.isEmpty()); } @Test public void byEscapedTag() { @@ -44,12 +59,10 @@ public class SelectorTest { @Test public void testById() { Elements els = Jsoup.parse("

    Hello

    Foo two!

    ").select("#foo"); - assertEquals(2, els.size()); - assertEquals("Hello", els.get(0).text()); - assertEquals("Foo two!", els.get(1).text()); + assertSelectedOwnText(els, "Hello", "Foo two!"); Elements none = Jsoup.parse("
    ").select("#foo"); - assertEquals(0, none.size()); + assertTrue(none.isEmpty()); } @Test public void byEscapedId() { @@ -67,22 +80,18 @@ public class SelectorTest { @Test public void testByClass() { Elements els = Jsoup.parse("

    ").select("P.One"); - assertEquals(2, els.size()); - assertEquals("0", els.get(0).id()); - assertEquals("1", els.get(1).id()); + assertSelectedIds(els, "0", "1"); Elements none = Jsoup.parse("

    ").select(".foo"); - assertEquals(0, none.size()); + assertTrue(none.isEmpty()); - Elements els2 = Jsoup.parse("
    ").select(".one-two"); - assertEquals(1, els2.size()); + Elements els2 = Jsoup.parse("
    ").select(".one-two"); + assertSelectedIds(els2, "1"); } @Test public void byEscapedClass() { - Element els = Jsoup.parse("

    One

    "); - - Element one = els.expectFirst("p.one\\.two\\#three"); - assertEquals("One", one.text()); + Document doc = Jsoup.parse("

    One

    "); + assertSelectedOwnText(doc.select("p.one\\.two\\#three"), "One"); } @Test public void testByClassCaseInsensitive() { @@ -91,8 +100,7 @@ public class SelectorTest { Elements elsFromAttr = Jsoup.parse(html).select("p[class=foo]"); assertEquals(elsFromAttr.size(), elsFromClass.size()); - assertEquals(3, elsFromClass.size()); - assertEquals("Two", elsFromClass.get(1).text()); + assertSelectedOwnText(elsFromClass, "One", "Two", "Three"); } @@ -143,43 +151,31 @@ public void testByAttribute(Locale locale) { @Test public void testNamespacedTag() { Document doc = Jsoup.parse("
    Hello
    There"); Elements byTag = doc.select("abc|def"); - assertEquals(2, byTag.size()); - assertEquals("1", byTag.first().id()); - assertEquals("2", byTag.last().id()); + assertSelectedIds(byTag, "1", "2"); Elements byAttr = doc.select(".bold"); - assertEquals(1, byAttr.size()); - assertEquals("2", byAttr.last().id()); + assertSelectedIds(byAttr, "2"); Elements byTagAttr = doc.select("abc|def.bold"); - assertEquals(1, byTagAttr.size()); - assertEquals("2", byTagAttr.last().id()); + assertSelectedIds(byTagAttr, "2"); Elements byContains = doc.select("abc|def:contains(e)"); - assertEquals(2, byContains.size()); - assertEquals("1", byContains.first().id()); - assertEquals("2", byContains.last().id()); + assertSelectedIds(byContains, "1", "2"); } @Test public void testWildcardNamespacedTag() { Document doc = Jsoup.parse("
    Hello
    There"); Elements byTag = doc.select("*|def"); - assertEquals(2, byTag.size()); - assertEquals("1", byTag.first().id()); - assertEquals("2", byTag.last().id()); + assertSelectedIds(byTag, "1", "2"); Elements byAttr = doc.select(".bold"); - assertEquals(1, byAttr.size()); - assertEquals("2", byAttr.last().id()); + assertSelectedIds(byAttr, "2"); Elements byTagAttr = doc.select("*|def.bold"); - assertEquals(1, byTagAttr.size()); - assertEquals("2", byTagAttr.last().id()); + assertSelectedIds(byTagAttr, "2"); Elements byContains = doc.select("*|def:contains(e)"); - assertEquals(2, byContains.size()); - assertEquals("1", byContains.first().id()); - assertEquals("2", byContains.last().id()); + assertSelectedIds(byContains, "1", "2"); } @Test public void testWildcardNamespacedXmlTag() { @@ -189,22 +185,16 @@ public void testByAttribute(Locale locale) { ); Elements byTag = doc.select("*|Def"); - assertEquals(2, byTag.size()); - assertEquals("1", byTag.first().id()); - assertEquals("2", byTag.last().id()); + assertSelectedIds(byTag, "1", "2"); Elements byAttr = doc.select(".bold"); - assertEquals(1, byAttr.size()); - assertEquals("2", byAttr.last().id()); + assertSelectedIds(byAttr, "2"); Elements byTagAttr = doc.select("*|Def.bold"); - assertEquals(1, byTagAttr.size()); - assertEquals("2", byTagAttr.last().id()); + assertSelectedIds(byTagAttr, "2"); Elements byContains = doc.select("*|Def:contains(e)"); - assertEquals(2, byContains.size()); - assertEquals("1", byContains.first().id()); - assertEquals("2", byContains.last().id()); + assertSelectedIds(byContains, "1", "2"); } @Test public void testWildCardNamespacedCaseVariations() { @@ -242,18 +232,13 @@ public void testByAttributeStarting(Locale locale) { @Test public void testByAttributeRegex() { Document doc = Jsoup.parse("

    "); Elements imgs = doc.select("img[src~=(?i)\\.(png|jpe?g)]"); - assertEquals(3, imgs.size()); - assertEquals("1", imgs.get(0).id()); - assertEquals("2", imgs.get(1).id()); - assertEquals("3", imgs.get(2).id()); + assertSelectedIds(imgs, "1", "2", "3"); } @Test public void testByAttributeRegexCharacterClass() { Document doc = Jsoup.parse("

    "); Elements imgs = doc.select("img[src~=[o]]"); - assertEquals(2, imgs.size()); - assertEquals("1", imgs.get(0).id()); - assertEquals("4", imgs.get(1).id()); + assertSelectedIds(imgs, "1", "4"); } @Test public void testByAttributeRegexCombined() { @@ -1172,14 +1157,8 @@ public void wildcardNamespaceMatchesNoNamespace() { Elements empty = doc.select("li:empty"); Elements notEmpty = doc.select("li:not(:empty)"); - assertEquals(3, empty.size()); - assertEquals(2, notEmpty.size()); - - assertEquals("1", empty.get(0).id()); - assertEquals("2", empty.get(1).id()); - assertEquals("3", empty.get(2).id()); - assertEquals("4", notEmpty.get(0).id()); - assertEquals("5", notEmpty.get(1).id()); + assertSelectedIds(empty, "1", "2", "3"); + assertSelectedIds(notEmpty, "4", "5"); } @Test public void parentFromSpecifiedDescender() { @@ -1199,8 +1178,7 @@ public void wildcardNamespaceMatchesNoNamespace() { // And now for the bug - li2 select was not restricted to the li2 context Elements innerLis = li2.select("ul > li"); - assertEquals(2, innerLis.size()); - assertEquals("Baz", innerLis.first().ownText()); + assertSelectedOwnText(innerLis, "Baz", "Qux"); // Confirm that parent selector (" ") works same as immediate parent (">"); Elements innerLisFromParent = li2.select("ul li"); @@ -1215,8 +1193,6 @@ public void wildcardNamespaceMatchesNoNamespace() { Document doc = Jsoup.parse(html); Elements els = doc.select("p:has(> span, > i)"); // should match a p with an immediate span or i - assertEquals(2, els.size()); - assertEquals("0", els.get(0).id()); - assertEquals("2", els.get(1).id()); + assertSelectedIds(els, "0", "2"); } } From ef8106e04041436c4e8cf319de626b21fb8cdd07 Mon Sep 17 00:00:00 2001 From: Jonathan Hedley Date: Mon, 30 Oct 2023 15:49:23 +1100 Subject: [PATCH 8/9] Added the `:is` pseudo selector --- CHANGES | 3 ++ .../java/org/jsoup/select/QueryParser.java | 9 +++++ src/main/java/org/jsoup/select/Selector.java | 3 +- .../org/jsoup/select/StructuralEvaluator.java | 21 ++++++++++++ .../java/org/jsoup/select/SelectorTest.java | 33 ++++++++++++++++--- 5 files changed, 64 insertions(+), 5 deletions(-) diff --git a/CHANGES b/CHANGES index e3661405c4..b20a2e4392 100644 --- a/CHANGES +++ b/CHANGES @@ -8,6 +8,9 @@ Release 1.17.1 [PENDING] * Improvement: when changing the OutputSettings syntax to XML, the xhtml EscapeMode is automatically set by default. + * Improvement: added the `:is(selector list)` pseudo-selector, which finds elements that match any of the selectors in + the selector list. Useful for making large ORed selectors more readable. + * Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes (