netty5/buffer/src/main/java/io/netty/buffer/PooledByteBufAllocator.java

/*
* Copyright 2012 The Netty Project
*
* The Netty Project licenses this file to you under the Apache License,
* version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package io.netty.buffer;
import io.netty.util.NettyRuntime;
import io.netty.util.concurrent.FastThreadLocal;
import io.netty.util.concurrent.FastThreadLocalThread;
import io.netty.util.internal.PlatformDependent;
import io.netty.util.internal.StringUtil;
import io.netty.util.internal.SystemPropertyUtil;
import io.netty.util.internal.logging.InternalLogger;
import io.netty.util.internal.logging.InternalLoggerFactory;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
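/**
 * Usage sketch (illustrative): buffers are typically obtained from the shared {@link #DEFAULT}
 * instance and must be released so their memory returns to the pool, e.g.
 * <pre>{@code
 * ByteBuf buf = PooledByteBufAllocator.DEFAULT.directBuffer(256); // pooled direct buffer
 * try {
 *     buf.writeLong(42L);
 * } finally {
 *     buf.release(); // hand the memory back to the owning arena / thread-local cache
 * }
 * }</pre>
 */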
public class PooledByteBufAllocator extends AbstractByteBufAllocator implements ByteBufAllocatorMetricProvider {
private static final InternalLogger logger = InternalLoggerFactory.getInstance(PooledByteBufAllocator.class);
private static final int DEFAULT_NUM_HEAP_ARENA;
private static final int DEFAULT_NUM_DIRECT_ARENA;
private static final int DEFAULT_PAGE_SIZE;
private static final int DEFAULT_MAX_ORDER; // 8192 << 11 = 16 MiB per chunk
private static final int DEFAULT_TINY_CACHE_SIZE;
private static final int DEFAULT_SMALL_CACHE_SIZE;
private static final int DEFAULT_NORMAL_CACHE_SIZE;
private static final int DEFAULT_MAX_CACHED_BUFFER_CAPACITY;
private static final int DEFAULT_CACHE_TRIM_INTERVAL;
private static final boolean DEFAULT_USE_CACHE_FOR_ALL_THREADS;
private static final int DEFAULT_DIRECT_MEMORY_CACHE_ALIGNMENT;
private static final int MIN_PAGE_SIZE = 4096;
private static final int MAX_CHUNK_SIZE = (int) (((long) Integer.MAX_VALUE + 1) / 2);
static {
int defaultPageSize = SystemPropertyUtil.getInt("io.netty.allocator.pageSize", 8192);
Throwable pageSizeFallbackCause = null;
try {
validateAndCalculatePageShifts(defaultPageSize);
} catch (Throwable t) {
pageSizeFallbackCause = t;
defaultPageSize = 8192;
}
DEFAULT_PAGE_SIZE = defaultPageSize;
int defaultMaxOrder = SystemPropertyUtil.getInt("io.netty.allocator.maxOrder", 11);
Throwable maxOrderFallbackCause = null;
try {
validateAndCalculateChunkSize(DEFAULT_PAGE_SIZE, defaultMaxOrder);
} catch (Throwable t) {
maxOrderFallbackCause = t;
defaultMaxOrder = 11;
}
DEFAULT_MAX_ORDER = defaultMaxOrder;
// Determine reasonable default for nHeapArena and nDirectArena.
// Assuming each arena has 3 chunks, the pool should not consume more than 50% of max memory.
final Runtime runtime = Runtime.getRuntime();
/*
* We use 2 * available processors by default to reduce contention as we use 2 * available processors for the
* number of EventLoops in NIO and EPOLL as well. If we choose a smaller number we will run into hot spots as
* allocation and de-allocation need to be synchronized on the PoolArena.
*
* See https://github.com/netty/netty/issues/3888.
*/
final int defaultMinNumArena = NettyRuntime.availableProcessors() * 2;
final int defaultChunkSize = DEFAULT_PAGE_SIZE << DEFAULT_MAX_ORDER;
DEFAULT_NUM_HEAP_ARENA = Math.max(0,
SystemPropertyUtil.getInt(
"io.netty.allocator.numHeapArenas",
(int) Math.min(
defaultMinNumArena,
runtime.maxMemory() / defaultChunkSize / 2 / 3)));
DEFAULT_NUM_DIRECT_ARENA = Math.max(0,
SystemPropertyUtil.getInt(
"io.netty.allocator.numDirectArenas",
(int) Math.min(
defaultMinNumArena,
PlatformDependent.maxDirectMemory() / defaultChunkSize / 2 / 3)));
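// Worked example with illustrative numbers (not taken from this file): with the default 16 MiB
// chunk (8192 << 11) and 4 GiB of max memory, maxMemory / chunkSize / 2 / 3 = 4096 / 16 / 2 / 3 ~= 42,
// so on hosts with fewer than ~21 cores the 2 * cores term is the effective cap. The "/ 2 / 3"
// keeps the pool at <= 50% of max memory while assuming up to 3 chunks per arena, matching the
// comment above.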
// cache sizes
DEFAULT_TINY_CACHE_SIZE = SystemPropertyUtil.getInt("io.netty.allocator.tinyCacheSize", 512);
DEFAULT_SMALL_CACHE_SIZE = SystemPropertyUtil.getInt("io.netty.allocator.smallCacheSize", 256);
DEFAULT_NORMAL_CACHE_SIZE = SystemPropertyUtil.getInt("io.netty.allocator.normalCacheSize", 64);
// 32 KiB is the default maximum capacity of a cached buffer, similar to what is explained in
// 'Scalable memory allocation using jemalloc'.
DEFAULT_MAX_CACHED_BUFFER_CAPACITY = SystemPropertyUtil.getInt(
"io.netty.allocator.maxCachedBufferCapacity", 32 * 1024);
// the number of allocations after which cached entries that are not frequently used will be freed up
DEFAULT_CACHE_TRIM_INTERVAL = SystemPropertyUtil.getInt(
"io.netty.allocator.cacheTrimInterval", 8192);
DEFAULT_USE_CACHE_FOR_ALL_THREADS = SystemPropertyUtil.getBoolean(
"io.netty.allocator.useCacheForAllThreads", true);
DEFAULT_DIRECT_MEMORY_CACHE_ALIGNMENT = SystemPropertyUtil.getInt(
"io.netty.allocator.directMemoryCacheAlignment", 0);
if (logger.isDebugEnabled()) {
logger.debug("-Dio.netty.allocator.numHeapArenas: {}", DEFAULT_NUM_HEAP_ARENA);
logger.debug("-Dio.netty.allocator.numDirectArenas: {}", DEFAULT_NUM_DIRECT_ARENA);
if (pageSizeFallbackCause == null) {
logger.debug("-Dio.netty.allocator.pageSize: {}", DEFAULT_PAGE_SIZE);
} else {
logger.debug("-Dio.netty.allocator.pageSize: {}", DEFAULT_PAGE_SIZE, pageSizeFallbackCause);
}
if (maxOrderFallbackCause == null) {
logger.debug("-Dio.netty.allocator.maxOrder: {}", DEFAULT_MAX_ORDER);
} else {
logger.debug("-Dio.netty.allocator.maxOrder: {}", DEFAULT_MAX_ORDER, maxOrderFallbackCause);
}
logger.debug("-Dio.netty.allocator.chunkSize: {}", DEFAULT_PAGE_SIZE << DEFAULT_MAX_ORDER);
logger.debug("-Dio.netty.allocator.tinyCacheSize: {}", DEFAULT_TINY_CACHE_SIZE);
logger.debug("-Dio.netty.allocator.smallCacheSize: {}", DEFAULT_SMALL_CACHE_SIZE);
logger.debug("-Dio.netty.allocator.normalCacheSize: {}", DEFAULT_NORMAL_CACHE_SIZE);
logger.debug("-Dio.netty.allocator.maxCachedBufferCapacity: {}", DEFAULT_MAX_CACHED_BUFFER_CAPACITY);
logger.debug("-Dio.netty.allocator.cacheTrimInterval: {}", DEFAULT_CACHE_TRIM_INTERVAL);
logger.debug("-Dio.netty.allocator.useCacheForAllThreads: {}", DEFAULT_USE_CACHE_FOR_ALL_THREADS);
}
}
public static final PooledByteBufAllocator DEFAULT =
new PooledByteBufAllocator(PlatformDependent.directBufferPreferred());
private final PoolArena<byte[]>[] heapArenas;
private final PoolArena<ByteBuffer>[] directArenas;
private final int tinyCacheSize;
private final int smallCacheSize;
private final int normalCacheSize;
private final List<PoolArenaMetric> heapArenaMetrics;
private final List<PoolArenaMetric> directArenaMetrics;
private final PoolThreadLocalCache threadCache;
private final int chunkSize;
private final PooledByteBufAllocatorMetric metric;
public PooledByteBufAllocator() {
this(false);
}
@SuppressWarnings("deprecation")
public PooledByteBufAllocator(boolean preferDirect) {
this(preferDirect, DEFAULT_NUM_HEAP_ARENA, DEFAULT_NUM_DIRECT_ARENA, DEFAULT_PAGE_SIZE, DEFAULT_MAX_ORDER);
}
@SuppressWarnings("deprecation")
public PooledByteBufAllocator(int nHeapArena, int nDirectArena, int pageSize, int maxOrder) {
this(false, nHeapArena, nDirectArena, pageSize, maxOrder);
}
/**
* @deprecated use
* {@link PooledByteBufAllocator#PooledByteBufAllocator(boolean, int, int, int, int, int, int, int, boolean)}
*/
@Deprecated
public PooledByteBufAllocator(boolean preferDirect, int nHeapArena, int nDirectArena, int pageSize, int maxOrder) {
this(preferDirect, nHeapArena, nDirectArena, pageSize, maxOrder,
DEFAULT_TINY_CACHE_SIZE, DEFAULT_SMALL_CACHE_SIZE, DEFAULT_NORMAL_CACHE_SIZE);
}
/**
* @deprecated use
* {@link PooledByteBufAllocator#PooledByteBufAllocator(boolean, int, int, int, int, int, int, int, boolean)}
*/
@Deprecated
public PooledByteBufAllocator(boolean preferDirect, int nHeapArena, int nDirectArena, int pageSize, int maxOrder,
int tinyCacheSize, int smallCacheSize, int normalCacheSize) {
this(preferDirect, nHeapArena, nDirectArena, pageSize, maxOrder, tinyCacheSize, smallCacheSize,
normalCacheSize, DEFAULT_USE_CACHE_FOR_ALL_THREADS, DEFAULT_DIRECT_MEMORY_CACHE_ALIGNMENT);
}
public PooledByteBufAllocator(boolean preferDirect, int nHeapArena,
int nDirectArena, int pageSize, int maxOrder, int tinyCacheSize,
int smallCacheSize, int normalCacheSize,
boolean useCacheForAllThreads) {
this(preferDirect, nHeapArena, nDirectArena, pageSize, maxOrder,
tinyCacheSize, smallCacheSize, normalCacheSize,
useCacheForAllThreads, DEFAULT_DIRECT_MEMORY_CACHE_ALIGNMENT);
}
public PooledByteBufAllocator(boolean preferDirect, int nHeapArena, int nDirectArena, int pageSize, int maxOrder,
int tinyCacheSize, int smallCacheSize, int normalCacheSize,
boolean useCacheForAllThreads, int directMemoryCacheAlignment) {
super(preferDirect);
threadCache = new PoolThreadLocalCache(useCacheForAllThreads);
this.tinyCacheSize = tinyCacheSize;
this.smallCacheSize = smallCacheSize;
this.normalCacheSize = normalCacheSize;
chunkSize = validateAndCalculateChunkSize(pageSize, maxOrder);
if (nHeapArena < 0) {
throw new IllegalArgumentException("nHeapArena: " + nHeapArena + " (expected: >= 0)");
}
if (nDirectArena < 0) {
throw new IllegalArgumentException("nDirectArea: " + nDirectArena + " (expected: >= 0)");
}
if (directMemoryCacheAlignment < 0) {
throw new IllegalArgumentException("directMemoryCacheAlignment: "
+ directMemoryCacheAlignment + " (expected: >= 0)");
}
if (directMemoryCacheAlignment > 0 && !isDirectMemoryCacheAlignmentSupported()) {
throw new IllegalArgumentException("directMemoryCacheAlignment is not supported");
}
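// x & -x isolates the lowest set bit of x, so the expression below equals x only when x has at
// most one bit set, i.e. x is 0 or a power of two.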
if ((directMemoryCacheAlignment & -directMemoryCacheAlignment) != directMemoryCacheAlignment) {
throw new IllegalArgumentException("directMemoryCacheAlignment: "
+ directMemoryCacheAlignment + " (expected: power of two)");
}
int pageShifts = validateAndCalculatePageShifts(pageSize);
if (nHeapArena > 0) {
heapArenas = newArenaArray(nHeapArena);
List<PoolArenaMetric> metrics = new ArrayList<PoolArenaMetric>(heapArenas.length);
for (int i = 0; i < heapArenas.length; i ++) {
PoolArena.HeapArena arena = new PoolArena.HeapArena(this,
pageSize, maxOrder, pageShifts, chunkSize,
directMemoryCacheAlignment);
heapArenas[i] = arena;
metrics.add(arena);
}
heapArenaMetrics = Collections.unmodifiableList(metrics);
} else {
heapArenas = null;
heapArenaMetrics = Collections.emptyList();
}
if (nDirectArena > 0) {
directArenas = newArenaArray(nDirectArena);
List<PoolArenaMetric> metrics = new ArrayList<PoolArenaMetric>(directArenas.length);
for (int i = 0; i < directArenas.length; i ++) {
PoolArena.DirectArena arena = new PoolArena.DirectArena(
this, pageSize, maxOrder, pageShifts, chunkSize, directMemoryCacheAlignment);
directArenas[i] = arena;
metrics.add(arena);
}
directArenaMetrics = Collections.unmodifiableList(metrics);
} else {
directArenas = null;
directArenaMetrics = Collections.emptyList();
}
metric = new PooledByteBufAllocatorMetric(this);
}
@SuppressWarnings("unchecked")
private static <T> PoolArena<T>[] newArenaArray(int size) {
return new PoolArena[size];
}
private static int validateAndCalculatePageShifts(int pageSize) {
if (pageSize < MIN_PAGE_SIZE) {
throw new IllegalArgumentException("pageSize: " + pageSize + " (expected: " + MIN_PAGE_SIZE + ")");
}
if ((pageSize & pageSize - 1) != 0) {
throw new IllegalArgumentException("pageSize: " + pageSize + " (expected: power of 2)");
}
// Logarithm base 2. At this point we know that pageSize is a power of two.
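// e.g. pageSize = 8192 = 1 << 13 -> numberOfLeadingZeros(8192) = 18, so pageShifts = 31 - 18 = 13.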
return Integer.SIZE - 1 - Integer.numberOfLeadingZeros(pageSize);
}
private static int validateAndCalculateChunkSize(int pageSize, int maxOrder) {
if (maxOrder > 14) {
throw new IllegalArgumentException("maxOrder: " + maxOrder + " (expected: 0-14)");
}
// Ensure the resulting chunkSize does not overflow.
int chunkSize = pageSize;
for (int i = maxOrder; i > 0; i --) {
if (chunkSize > MAX_CHUNK_SIZE / 2) {
throw new IllegalArgumentException(String.format(
"pageSize (%d) << maxOrder (%d) must not exceed %d", pageSize, maxOrder, MAX_CHUNK_SIZE));
}
chunkSize <<= 1;
}
return chunkSize;
}
@Override
protected ByteBuf newHeapBuffer(int initialCapacity, int maxCapacity) {
PoolThreadCache cache = threadCache.get();
PoolArena<byte[]> heapArena = cache.heapArena;
final ByteBuf buf;
if (heapArena != null) {
buf = heapArena.allocate(cache, initialCapacity, maxCapacity);
} else {
buf = PlatformDependent.hasUnsafe() ?
new UnpooledUnsafeHeapByteBuf(this, initialCapacity, maxCapacity) :
new UnpooledHeapByteBuf(this, initialCapacity, maxCapacity);
}
return toLeakAwareBuffer(buf);
}
@Override
protected ByteBuf newDirectBuffer(int initialCapacity, int maxCapacity) {
PoolThreadCache cache = threadCache.get();
PoolArena<ByteBuffer> directArena = cache.directArena;
final ByteBuf buf;
if (directArena != null) {
buf = directArena.allocate(cache, initialCapacity, maxCapacity);
} else {
buf = PlatformDependent.hasUnsafe() ?
UnsafeByteBufUtil.newUnsafeDirectByteBuf(this, initialCapacity, maxCapacity) :
new UnpooledDirectByteBuf(this, initialCapacity, maxCapacity);
}
return toLeakAwareBuffer(buf);
}
/**
* Default number of heap arenas - System Property: io.netty.allocator.numHeapArenas - default 2 * cores
*/
public static int defaultNumHeapArena() {
return DEFAULT_NUM_HEAP_ARENA;
}
/**
* Default number of direct arenas - System Property: io.netty.allocator.numDirectArenas - default 2 * cores
*/
public static int defaultNumDirectArena() {
return DEFAULT_NUM_DIRECT_ARENA;
}
/**
* Default buffer page size - System Property: io.netty.allocator.pageSize - default 8192
*/
public static int defaultPageSize() {
return DEFAULT_PAGE_SIZE;
}
/**
* Default maximum order - System Property: io.netty.allocator.maxOrder - default 11
*/
public static int defaultMaxOrder() {
return DEFAULT_MAX_ORDER;
}
/**
* Default thread caching behavior - System Property: io.netty.allocator.useCacheForAllThreads - default true
*/
public static boolean defaultUseCacheForAllThreads() {
return DEFAULT_USE_CACHE_FOR_ALL_THREADS;
}
/**
* Default prefer direct - System Property: io.netty.noPreferDirect - default false
*/
public static boolean defaultPreferDirect() {
return PlatformDependent.directBufferPreferred();
}
/**
* Default tiny cache size - System Property: io.netty.allocator.tinyCacheSize - default 512
*/
public static int defaultTinyCacheSize() {
return DEFAULT_TINY_CACHE_SIZE;
}
/**
* Default small cache size - System Property: io.netty.allocator.smallCacheSize - default 256
*/
public static int defaultSmallCacheSize() {
return DEFAULT_SMALL_CACHE_SIZE;
}
/**
* Default normal cache size - System Property: io.netty.allocator.normalCacheSize - default 64
*/
public static int defaultNormalCacheSize() {
return DEFAULT_NORMAL_CACHE_SIZE;
}
/**
* Return {@code true} if direct memory cache alignment is supported, {@code false} otherwise.
*/
public static boolean isDirectMemoryCacheAlignmentSupported() {
return PlatformDependent.hasUnsafe();
}
@Override
public boolean isDirectBufferPooled() {
return directArenas != null;
}
/**
* Returns {@code true} if the calling {@link Thread} has a {@link ThreadLocal} cache for the allocated
* buffers.
*/
@Deprecated
public boolean hasThreadLocalCache() {
return threadCache.isSet();
}
/**
* Free all cached buffers for the calling {@link Thread}.
*/
@Deprecated
public void freeThreadLocalCache() {
threadCache.remove();
}
final class PoolThreadLocalCache extends FastThreadLocal<PoolThreadCache> {
private final boolean useCacheForAllThreads;
PoolThreadLocalCache(boolean useCacheForAllThreads) {
this.useCacheForAllThreads = useCacheForAllThreads;
}
@Override
protected synchronized PoolThreadCache initialValue() {
final PoolArena<byte[]> heapArena = leastUsedArena(heapArenas);
final PoolArena<ByteBuffer> directArena = leastUsedArena(directArenas);
Thread current = Thread.currentThread();
if (useCacheForAllThreads || current instanceof FastThreadLocalThread) {
return new PoolThreadCache(
heapArena, directArena, tinyCacheSize, smallCacheSize, normalCacheSize,
DEFAULT_MAX_CACHED_BUFFER_CAPACITY, DEFAULT_CACHE_TRIM_INTERVAL);
}
// No caching so just use 0 as sizes.
return new PoolThreadCache(heapArena, directArena, 0, 0, 0, 0, 0);
}
@Override
protected void onRemoval(PoolThreadCache threadCache) {
threadCache.free();
}
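// Pick the arena that currently backs the fewest thread caches so that new threads are spread as
// evenly as possible across the available arenas.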
private <T> PoolArena<T> leastUsedArena(PoolArena<T>[] arenas) {
if (arenas == null || arenas.length == 0) {
return null;
}
PoolArena<T> minArena = arenas[0];
for (int i = 1; i < arenas.length; i++) {
PoolArena<T> arena = arenas[i];
if (arena.numThreadCaches.get() < minArena.numThreadCaches.get()) {
minArena = arena;
}
}
return minArena;
}
}
@Override
public PooledByteBufAllocatorMetric metric() {
return metric;
}
/**
* Return the number of heap arenas.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#numHeapArenas()}.
*/
@Deprecated
public int numHeapArenas() {
return heapArenaMetrics.size();
}
/**
* Return the number of direct arenas.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#numDirectArenas()}.
*/
@Deprecated
public int numDirectArenas() {
return directArenaMetrics.size();
}
/**
* Return a {@link List} of all heap {@link PoolArenaMetric}s that are provided by this pool.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#heapArenas()}.
*/
@Deprecated
public List<PoolArenaMetric> heapArenas() {
return heapArenaMetrics;
}
/**
* Return a {@link List} of all direct {@link PoolArenaMetric}s that are provided by this pool.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#directArenas()}.
*/
@Deprecated
public List<PoolArenaMetric> directArenas() {
return directArenaMetrics;
}
/**
* Return the number of thread local caches used by this {@link PooledByteBufAllocator}.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#numThreadLocalCaches()}.
*/
@Deprecated
public int numThreadLocalCaches() {
PoolArena<?>[] arenas = heapArenas != null ? heapArenas : directArenas;
if (arenas == null) {
return 0;
}
int total = 0;
for (PoolArena<?> arena : arenas) {
total += arena.numThreadCaches.get();
}
return total;
}
/**
* Return the size of the tiny cache.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#tinyCacheSize()}.
*/
@Deprecated
public int tinyCacheSize() {
return tinyCacheSize;
}
/**
* Return the size of the small cache.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#smallCacheSize()}.
*/
@Deprecated
public int smallCacheSize() {
return smallCacheSize;
}
/**
* Return the size of the normal cache.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#normalCacheSize()}.
*/
@Deprecated
public int normalCacheSize() {
return normalCacheSize;
}
/**
* Return the chunk size for an arena.
*
* @deprecated use {@link PooledByteBufAllocatorMetric#chunkSize()}.
*/
@Deprecated
public final int chunkSize() {
return chunkSize;
}
final long usedHeapMemory() {
return usedMemory(heapArenas);
}
final long usedDirectMemory() {
return usedMemory(directArenas);
}
private static long usedMemory(PoolArena<?>... arenas) {
if (arenas == null) {
return -1;
}
long used = 0;
for (PoolArena<?> arena : arenas) {
used += arena.numActiveBytes();
if (used < 0) {
return Long.MAX_VALUE;
}
}
return used;
}
final PoolThreadCache threadCache() {
PoolThreadCache cache = threadCache.get();
assert cache != null;
return cache;
}
/**
* Returns the status of the allocator (which contains all metrics) as a string. Be aware this may be expensive
* and so should not be called too frequently.
*/
public String dumpStats() {
int heapArenasLen = heapArenas == null ? 0 : heapArenas.length;
StringBuilder buf = new StringBuilder(512)
.append(heapArenasLen)
.append(" heap arena(s):")
.append(StringUtil.NEWLINE);
if (heapArenasLen > 0) {
for (PoolArena<byte[]> a: heapArenas) {
buf.append(a);
}
}
int directArenasLen = directArenas == null ? 0 : directArenas.length;
buf.append(directArenasLen)
.append(" direct arena(s):")
.append(StringUtil.NEWLINE);
if (directArenasLen > 0) {
for (PoolArena<ByteBuffer> a: directArenas) {
buf.append(a);
}
}
return buf.toString();
}
}