Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement buffer recycling for CharacterReader #1800

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
7 changes: 7 additions & 0 deletions CHANGES
Expand Up @@ -8,6 +8,9 @@ Release 1.17.1 [PENDING]

* Improvement: when changing the OutputSettings syntax to XML, the xhtml EscapeMode is automatically set by default.

* Improvement: added the `:is(selector list)` pseudo-selector, which finds elements that match any of the selectors in
the selector list. Useful for making large ORed selectors more readable.

* Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes (<script> and <style>) should
be emitted as CDATA nodes, so that they can be parsed correctly by an XML parser.
<https://github.com/jhy/jsoup/pull/1720>
Expand All @@ -16,6 +19,10 @@ Release 1.17.1 [PENDING]
elements to be returned when used on elements other than the root document.
<https://github.com/jhy/jsoup/issues/2018>

* Bugfix: in a sub-query such as `p:has(> span, > i)`, combinators following the `,` Or combinator would be
incorrectly skipped, such that the sub-query was parsed as `i` instead of `> i`.
<https://github.com/jhy/jsoup/issues/1707>

Release 1.16.2 [20-Oct-2023]
* Improvement: optimized the performance of complex CSS selectors, by adding a cost-based query planner. Evaluators
are sorted by their relative execution cost, and executed in order of lower to higher cost. This speeds the
Expand Down
13 changes: 13 additions & 0 deletions pom.xml
Expand Up @@ -333,6 +333,19 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<version>1.35</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>1.35</version>
<scope>test</scope>
</dependency>

<dependency>
<!-- gson, to fetch entities from w3.org -->
<groupId>com.google.code.gson</groupId>
Expand Down
4 changes: 1 addition & 3 deletions src/main/java/org/jsoup/helper/DataUtil.java
Expand Up @@ -14,7 +14,6 @@
import javax.annotation.Nullable;
import javax.annotation.WillClose;

import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.FileInputStream;
Expand Down Expand Up @@ -90,7 +89,6 @@ public static Document load(File file, @Nullable String charsetName, String base
zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
} finally {
stream.close();

}
stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
}
Expand Down Expand Up @@ -208,7 +206,7 @@ else if (first instanceof Comment) {
if (doc == null) {
if (charsetName == null)
charsetName = defaultCharsetName;
BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), bufferSize); // Android level does not allow us try-with-resources
InputStreamReader reader = new InputStreamReader(input, Charset.forName(charsetName)); // Android level does not allow us try-with-resources
try {
if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
long skipped = reader.skip(1);
Expand Down
23 changes: 23 additions & 0 deletions src/main/java/org/jsoup/helper/StringBuilderRecycler.java
@@ -0,0 +1,23 @@
package org.jsoup.helper;

import java.util.ArrayList;

/**
 * A small pool of reusable {@link StringBuilder}s. Builders handed back via
 * {@link #releaseByteBuffer(StringBuilder)} are kept and returned again by {@link #get(int)} when their capacity is
 * sufficient.
 * <p>This class performs no synchronization of its own.</p>
 */
public class StringBuilderRecycler {
    /** Maximum number of idle builders retained, so that the pool cannot grow without bound. */
    private static final int MaxIdleBuilders = 8;

    protected final ArrayList<StringBuilder> stringBuilders = new ArrayList<>();

    /**
     * Gets an empty StringBuilder with a capacity of at least {@code minSize}.
     * <p>The most recently released builder is taken from the pool; if it is too small it is dropped and a new
     * builder is allocated instead.</p>
     *
     * @param minSize the minimum capacity required
     * @return an empty StringBuilder whose capacity is at least {@code minSize}
     */
    public StringBuilder get(int minSize) {
        if (!stringBuilders.isEmpty()) {
            StringBuilder candidate = stringBuilders.remove(stringBuilders.size() - 1);
            // Too small string builders are thrown away
            if (candidate.capacity() >= minSize) {
                candidate.setLength(0); // clear out whatever the previous user left behind
                return candidate;
            }
        }
        return new StringBuilder(minSize);
    }

    /**
     * Hands a StringBuilder back to the pool so that a later {@link #get(int)} may reuse it. Once the pool already
     * holds the maximum number of idle builders, the supplied builder is simply not retained.
     *
     * @param stringBuilder the builder to recycle; must not be null
     * @throws IllegalArgumentException if {@code stringBuilder} is null
     */
    public void releaseByteBuffer(StringBuilder stringBuilder) {
        // fail here, rather than with an NPE in some later, unrelated get() call
        if (stringBuilder == null)
            throw new IllegalArgumentException("StringBuilder must not be null");

        if (stringBuilders.size() < MaxIdleBuilders)
            stringBuilders.add(stringBuilder);
    }
}
79 changes: 79 additions & 0 deletions src/main/java/org/jsoup/internal/BufferPool.java
@@ -0,0 +1,79 @@
package org.jsoup.internal;

import org.jsoup.helper.Validate;

import javax.annotation.Nullable;
import java.lang.ref.SoftReference;
import java.util.Stack;

/**
 jsoup internal use only - maintains a pool of various threadlocal, soft-reference buffers used by the parser and for
 serialization. */
public final class BufferPool<T> {
    // char[] used in CharacterReader (input we are reading)
    // String[] used in CharacterReader (token cache)
    // String Builders (accumulators)

    public interface Lifecycle<T> {
        /**
         Called when an object is borrowed but none are in the pool. Should return a new object of the desired min
         capacity.
         */
        T create();

        /**
         Called when an object is returned. Should replace the object (via create) if it has grown too large;
         otherwise should empty its contents so that it is ready for reuse. Must not return null.
         */
        T reset(T obj);
    }

    final Lifecycle<T> lifecycle;
    final ThreadLocal<SoftReference<Stack<T>>> pool; // each thread has its own softly-referenced stack of idle objects
    final int maxIdle;

    /**
     Creates a new BufferPool. Prefer to reuse an existing pool - centralize if required.
     @param maxIdle the maximum number of objects to hold that are not actively in use. Per thread - used so that
     e.g. multiple StringBuilders can be in use at once.
     @param lifecycle the implementation of the pooled objects lifecycle -- create and reset. Must not be null.
     */
    public BufferPool(int maxIdle, Lifecycle<T> lifecycle) {
        Validate.notNull(lifecycle); // fail at construction, not on the first borrow / release
        this.lifecycle = lifecycle;
        this.maxIdle = maxIdle;
        pool = ThreadLocal.withInitial(() -> new SoftReference<>(new Stack<>()));
    }

    /**
     Grab an object from the pool, or create one if required.
     @return a pooled object if one is idle on this thread, otherwise the result of {@link Lifecycle#create()}
     */
    public T borrow() {
        Stack<T> stack = getStack();
        return stack.empty() ?
            lifecycle.create() :
            stack.pop();
    }

    /**
     Place an object back in the pool. Will reset() the object, and trims the pool to maxIdle.
     @param obj the object to return to the pool. Must not be null.
     */
    public void release(T obj) {
        Validate.notNull(obj);
        obj = lifecycle.reset(obj); // clear the contents, and prevent from growing too large (per impl)
        Validate.notNull(obj); // a reset() that returned null would otherwise be handed out by a later borrow()
        Stack<T> stack = getStack();
        stack.push(obj);

        // trim stack to maxIdle objects
        while (stack.size() > maxIdle) stack.pop();
    }

    /**
     Gets the current thread's stack of idle objects, installing a fresh one if the soft reference was cleared.
     */
    private Stack<T> getStack() {
        SoftReference<Stack<T>> ref = pool.get();
        @Nullable Stack<T> stack = ref.get();
        if (stack == null) { // got GCed, reset it
            stack = new Stack<>();
            pool.set(new SoftReference<>(stack));
        }
        return stack;
    }
}
39 changes: 18 additions & 21 deletions src/main/java/org/jsoup/internal/StringUtil.java
Expand Up @@ -8,7 +8,6 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.Stack;
import java.util.regex.Pattern;

/**
Expand Down Expand Up @@ -256,7 +255,7 @@ public static boolean in(final String needle, final String... haystack) {
final int len = haystack.length;
for (int i = 0; i < len; i++) {
if (haystack[i].equals(needle))
return true;
return true;
}
return false;
}
Expand Down Expand Up @@ -335,8 +334,6 @@ private static String stripControlChars(final String input) {
return controlChars.matcher(input).replaceAll("");
}

private static final ThreadLocal<Stack<StringBuilder>> threadLocalBuilders = ThreadLocal.withInitial(Stack::new);

/**
* Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is
* prevented from growing too large.
Expand All @@ -345,10 +342,7 @@ private static String stripControlChars(final String input) {
* @return an empty StringBuilder
*/
public static StringBuilder borrowBuilder() {
Stack<StringBuilder> builders = threadLocalBuilders.get();
return builders.empty() ?
new StringBuilder(MaxCachedBuilderSize) :
builders.pop();
return StringBuilders.borrow();
}

/**
Expand All @@ -360,21 +354,24 @@ public static StringBuilder borrowBuilder() {
public static String releaseBuilder(StringBuilder sb) {
Validate.notNull(sb);
String string = sb.toString();

if (sb.length() > MaxCachedBuilderSize)
sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big
else
sb.delete(0, sb.length()); // make sure it's emptied on release

Stack<StringBuilder> builders = threadLocalBuilders.get();
builders.push(sb);

while (builders.size() > MaxIdleBuilders) {
builders.pop();
}
StringBuilders.release(sb);
return string;
}

private static final int MaxCachedBuilderSize = 8 * 1024;
private static final int MinCapacity = 1024;
private static final int MaxCapacity = 8 * MinCapacity;
private static final int MaxIdleBuilders = 8;

/**
 The pool of StringBuilders that borrowBuilder() and releaseBuilder() draw from. New builders start at MinCapacity;
 a returned builder holding more than MaxCapacity chars is swapped for a new one instead of being kept.
 */
private static final BufferPool<StringBuilder> StringBuilders =
    new BufferPool<>(MaxIdleBuilders, new BufferPool.Lifecycle<StringBuilder>() {
        @Override
        public StringBuilder create() {
            return new StringBuilder(MinCapacity);
        }

        @Override
        public StringBuilder reset(StringBuilder sb) {
            final boolean oversized = sb.length() > MaxCapacity;
            if (!oversized) {
                sb.setLength(0); // emptied, ready for the next borrower
                return sb;
            }
            return create(); // grew too big to keep; hand the pool a fresh one
        }
    });
}