Skip to content

Commit

Permalink
AdaptivePoolingAllocator EventLoop Magazine's affinity (#14017)
Browse files Browse the repository at this point in the history
Motivation:

AdaptivePoolingAllocator uses some shared Magazines hoping thread's id
distribution to improve contended behaviour, but event loops threads can
just uses their own Magazine(s) and save 2 atomic operations (write
lock/unlock) in the hot path, further improving performance.

Modification:

Implements dedicated event loop Magazines

Result:

Better allocation performance for event loop threads
  • Loading branch information
franz1981 committed May 6, 2024
1 parent 2e413d6 commit 3438798
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 10 deletions.
24 changes: 22 additions & 2 deletions buffer/src/main/java/io/netty/buffer/AdaptiveByteBufAllocator.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@
*/
package io.netty.buffer;

import io.netty.buffer.AdaptivePoolingAllocator.MagazineCaching;
import io.netty.util.internal.PlatformDependent;
import io.netty.util.internal.SystemPropertyUtil;
import io.netty.util.internal.UnstableApi;
import io.netty.util.internal.logging.InternalLogger;
import io.netty.util.internal.logging.InternalLoggerFactory;

/**
* An auto-tuning pooling {@link ByteBufAllocator}, that follows an anti-generational hypothesis.
Expand All @@ -29,6 +33,16 @@
@UnstableApi
public final class AdaptiveByteBufAllocator extends AbstractByteBufAllocator
implements ByteBufAllocatorMetricProvider, ByteBufAllocatorMetric {
private static final InternalLogger logger = InternalLoggerFactory.getInstance(AdaptiveByteBufAllocator.class);
private static final boolean DEFAULT_USE_CACHED_MAGAZINES_FOR_NON_EVENT_LOOP_THREADS;

static {
DEFAULT_USE_CACHED_MAGAZINES_FOR_NON_EVENT_LOOP_THREADS = SystemPropertyUtil.getBoolean(
"io.netty.allocator.useCachedMagazinesForNonEventLoopThreads", false);
logger.debug("-Dio.netty.allocator.useCachedMagazinesForNonEventLoopThreads: {}",
DEFAULT_USE_CACHED_MAGAZINES_FOR_NON_EVENT_LOOP_THREADS);
}

private final AdaptivePoolingAllocator direct;
private final AdaptivePoolingAllocator heap;

Expand All @@ -37,9 +51,15 @@ public AdaptiveByteBufAllocator() {
}

public AdaptiveByteBufAllocator(boolean preferDirect) {
this(preferDirect, DEFAULT_USE_CACHED_MAGAZINES_FOR_NON_EVENT_LOOP_THREADS);
}

public AdaptiveByteBufAllocator(boolean preferDirect, boolean useCacheForNonEventLoopThreads) {
super(preferDirect);
direct = new AdaptivePoolingAllocator(new DirectChunkAllocator(this));
heap = new AdaptivePoolingAllocator(new HeapChunkAllocator(this));
MagazineCaching magazineCaching = useCacheForNonEventLoopThreads?
MagazineCaching.FastThreadLocalThreads : MagazineCaching.EventLoopThreads;
direct = new AdaptivePoolingAllocator(new DirectChunkAllocator(this), magazineCaching);
heap = new AdaptivePoolingAllocator(new HeapChunkAllocator(this), magazineCaching);
}

@Override
Expand Down
84 changes: 76 additions & 8 deletions buffer/src/main/java/io/netty/buffer/AdaptivePoolingAllocator.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
import io.netty.util.ByteProcessor;
import io.netty.util.NettyRuntime;
import io.netty.util.ReferenceCounted;
import io.netty.util.concurrent.FastThreadLocal;
import io.netty.util.concurrent.FastThreadLocalThread;
import io.netty.util.internal.ObjectPool;
import io.netty.util.internal.ObjectUtil;
import io.netty.util.internal.PlatformDependent;
import io.netty.util.internal.SuppressJava6Requirement;
import io.netty.util.internal.SystemPropertyUtil;
import io.netty.util.internal.ThreadExecutorMap;
import io.netty.util.internal.UnstableApi;

import java.io.IOException;
Expand All @@ -37,7 +39,9 @@
import java.nio.channels.ScatteringByteChannel;
import java.util.Arrays;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import java.util.concurrent.locks.StampedLock;
Expand Down Expand Up @@ -73,6 +77,15 @@
@SuppressJava6Requirement(reason = "Guarded by version check")
@UnstableApi
final class AdaptivePoolingAllocator {

enum MagazineCaching {
EventLoopThreads,
FastThreadLocalThreads,
None
}

private static final int EXPANSION_ATTEMPTS = 3;
private static final int INITIAL_MAGAZINES = 4;
private static final int RETIRE_CAPACITY = 4 * 1024;
private static final int MIN_CHUNK_SIZE = 128 * 1024;
private static final int MAX_STRIPES = NettyRuntime.availableProcessors() * 2;
Expand All @@ -97,20 +110,55 @@ final class AdaptivePoolingAllocator {
private static final int CENTRAL_QUEUE_CAPACITY = SystemPropertyUtil.getInt(
"io.netty.allocator.centralQueueCapacity", NettyRuntime.availableProcessors());

private static final Object NO_MAGAZINE = Boolean.TRUE;

private final ChunkAllocator chunkAllocator;
private final Queue<ChunkByteBuf> centralQueue;
private final StampedLock magazineExpandLock;
private volatile Magazine[] magazines;
private final FastThreadLocal<Object> threadLocalMagazine;
private final Set<Magazine> liveCachedMagazines;

AdaptivePoolingAllocator(ChunkAllocator chunkAllocator) {
AdaptivePoolingAllocator(ChunkAllocator chunkAllocator, MagazineCaching magazineCaching) {
ObjectUtil.checkNotNull(chunkAllocator, "chunkAllocator");
ObjectUtil.checkNotNull(magazineCaching, "magazineCaching");
this.chunkAllocator = chunkAllocator;
if (javaVersion() < 8) {
// The implementation uses StampedLock, which was introduced in Java 8.
throw new IllegalStateException("This allocator require Java 8 or newer.");
}
centralQueue = ObjectUtil.checkNotNull(createSharedChunkQueue(), "centralQueue");
magazineExpandLock = new StampedLock();
Magazine[] mags = new Magazine[4];
if (magazineCaching != MagazineCaching.None) {
assert magazineCaching == MagazineCaching.EventLoopThreads ||
magazineCaching == MagazineCaching.FastThreadLocalThreads;
final boolean cachedMagazinesNonEventLoopThreads =
magazineCaching == MagazineCaching.FastThreadLocalThreads;
final Set<Magazine> liveMagazines = new CopyOnWriteArraySet<Magazine>();
threadLocalMagazine = new FastThreadLocal<Object>() {
@Override
protected Object initialValue() {
if (cachedMagazinesNonEventLoopThreads || ThreadExecutorMap.currentExecutor() != null) {
Magazine mag = new Magazine(AdaptivePoolingAllocator.this, false);
liveMagazines.add(mag);
return mag;
}
return NO_MAGAZINE;
}

@Override
protected void onRemoval(final Object value) throws Exception {
if (value != NO_MAGAZINE) {
liveMagazines.remove(value);
}
}
};
liveCachedMagazines = liveMagazines;
} else {
threadLocalMagazine = null;
liveCachedMagazines = null;
}
Magazine[] mags = new Magazine[INITIAL_MAGAZINES];
for (int i = 0; i < mags.length; i++) {
mags[i] = new Magazine(this);
}
Expand Down Expand Up @@ -158,8 +206,15 @@ ByteBuf allocate(int size, int maxCapacity) {
}

private AdaptiveByteBuf allocate(int size, int maxCapacity, Thread currentThread, AdaptiveByteBuf buf) {
long threadId = currentThread.getId();
int sizeBucket = AllocationStatistics.sizeBucket(size); // Compute outside of Magazine lock for better ILP.
FastThreadLocal<Object> threadLocalMagazine = this.threadLocalMagazine;
if (threadLocalMagazine != null && currentThread instanceof FastThreadLocalThread) {
Object mag = threadLocalMagazine.get();
if (mag != NO_MAGAZINE) {
return ((Magazine) mag).allocate(size, sizeBucket, maxCapacity, buf);
}
}
long threadId = currentThread.getId();
Magazine[] mags;
int expansions = 0;
do {
Expand All @@ -178,7 +233,7 @@ private AdaptiveByteBuf allocate(int size, int maxCapacity, Thread currentThread
}
}
expansions++;
} while (expansions <= 3 && tryExpandMagazines(mags.length));
} while (expansions <= EXPANSION_ATTEMPTS && tryExpandMagazines(mags.length));
return null;
}

Expand All @@ -204,6 +259,11 @@ long usedMemory() {
for (Magazine magazine : magazines) {
sum += magazine.usedMemory.get();
}
if (liveCachedMagazines != null) {
for (Magazine magazine : liveCachedMagazines) {
sum += magazine.usedMemory.get();
}
}
return sum;
}

Expand Down Expand Up @@ -247,6 +307,7 @@ private static class AllocationStatistics extends StampedLock {
private static final int HISTO_MAX_BUCKET_MASK = HISTO_BUCKET_COUNT - 1;

protected final AdaptivePoolingAllocator parent;
private final boolean shareable;
private final short[][] histos = {
new short[HISTO_BUCKET_COUNT], new short[HISTO_BUCKET_COUNT],
new short[HISTO_BUCKET_COUNT], new short[HISTO_BUCKET_COUNT],
Expand All @@ -260,8 +321,9 @@ private static class AllocationStatistics extends StampedLock {
private volatile int sharedPrefChunkSize = MIN_CHUNK_SIZE;
protected volatile int localPrefChunkSize = MIN_CHUNK_SIZE;

private AllocationStatistics(AdaptivePoolingAllocator parent) {
private AllocationStatistics(AdaptivePoolingAllocator parent, boolean shareable) {
this.parent = parent;
this.shareable = shareable;
}

protected void recordAllocationSize(int bucket) {
Expand Down Expand Up @@ -300,8 +362,10 @@ private void rotateHistograms() {
int percentileSize = 1 << sizeBucket + HISTO_MIN_BUCKET_SHIFT;
int prefChunkSize = Math.max(percentileSize * BUFS_PER_CHUNK, MIN_CHUNK_SIZE);
localPrefChunkSize = prefChunkSize;
for (Magazine mag : parent.magazines) {
prefChunkSize = Math.max(prefChunkSize, mag.localPrefChunkSize);
if (shareable) {
for (Magazine mag : parent.magazines) {
prefChunkSize = Math.max(prefChunkSize, mag.localPrefChunkSize);
}
}
if (sharedPrefChunkSize != prefChunkSize) {
// Preferred chunk size changed. Increase check frequency.
Expand Down Expand Up @@ -344,7 +408,11 @@ private static final class Magazine extends AllocationStatistics {
private final AtomicLong usedMemory;

Magazine(AdaptivePoolingAllocator parent) {
super(parent);
this(parent, true);
}

Magazine(AdaptivePoolingAllocator parent, boolean shareable) {
super(parent, shareable);
usedMemory = new AtomicLong();
}

Expand Down

0 comments on commit 3438798

Please sign in to comment.