netty5/buffer/src/main/java/io/netty/buffer/PoolArena.java

/*
 * Copyright 2012 The Netty Project
 *
 * The Netty Project licenses this file to you under the Apache License,
 * version 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at:
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package io.netty.buffer;

import io.netty.util.internal.LongCounter;
import io.netty.util.internal.PlatformDependent;
import io.netty.util.internal.StringUtil;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import static java.lang.Math.max;

abstract class PoolArena<T> implements PoolArenaMetric {
    static final boolean HAS_UNSAFE = PlatformDependent.hasUnsafe();

    enum SizeClass {
        Tiny,
        Small,
        Normal
    }

    static final int numTinySubpagePools = 512 >>> 4;

    final PooledByteBufAllocator parent;

    private final int maxOrder;
    final int pageSize;
    final int pageShifts;
    final int chunkSize;
    final int subpageOverflowMask;
    final int numSmallSubpagePools;
    final int directMemoryCacheAlignment;
    final int directMemoryCacheAlignmentMask;
    private final PoolSubpage<T>[] tinySubpagePools;
    private final PoolSubpage<T>[] smallSubpagePools;

    private final PoolChunkList<T> q050;
    private final PoolChunkList<T> q025;
    private final PoolChunkList<T> q000;
    private final PoolChunkList<T> qInit;
    private final PoolChunkList<T> q075;
    private final PoolChunkList<T> q100;

    private final List<PoolChunkListMetric> chunkListMetrics;

    // Metrics for allocations and deallocations
    private long allocationsNormal;
    // We need to use the LongCounter here as this is not guarded via synchronized block.
    private final LongCounter allocationsTiny = PlatformDependent.newLongCounter();
    private final LongCounter allocationsSmall = PlatformDependent.newLongCounter();
    private final LongCounter allocationsHuge = PlatformDependent.newLongCounter();
    private final LongCounter activeBytesHuge = PlatformDependent.newLongCounter();

    private long deallocationsTiny;
    private long deallocationsSmall;
    private long deallocationsNormal;

    // We need to use the LongCounter here as this is not guarded via synchronized block.
    private final LongCounter deallocationsHuge = PlatformDependent.newLongCounter();

    // Number of thread caches backed by this arena.
    final AtomicInteger numThreadCaches = new AtomicInteger();

    // TODO: Test if adding padding helps under contention
    //private long pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7;

    protected PoolArena(PooledByteBufAllocator parent, int pageSize,
          int maxOrder, int pageShifts, int chunkSize, int cacheAlignment) {
        this.parent = parent;
        this.pageSize = pageSize;
        this.maxOrder = maxOrder;
        this.pageShifts = pageShifts;
        this.chunkSize = chunkSize;
        this.directMemoryCacheAlignment = cacheAlignment;
        this.directMemoryCacheAlignmentMask = cacheAlignment - 1;
        subpageOverflowMask = ~(pageSize - 1);
        tinySubpagePools = newSubpagePoolArray(numTinySubpagePools);
        for (int i = 0; i < tinySubpagePools.length; i ++) {
            tinySubpagePools[i] = newSubpagePoolHead(pageSize);
        }

        numSmallSubpagePools = pageShifts - 9;
        smallSubpagePools = newSubpagePoolArray(numSmallSubpagePools);
        for (int i = 0; i < smallSubpagePools.length; i ++) {
            smallSubpagePools[i] = newSubpagePoolHead(pageSize);
        }

        q100 = new PoolChunkList<T>(null, 100, Integer.MAX_VALUE, chunkSize);
        q075 = new PoolChunkList<T>(q100, 75, 100, chunkSize);
        q050 = new PoolChunkList<T>(q075, 50, 100, chunkSize);
        q025 = new PoolChunkList<T>(q050, 25, 75, chunkSize);
        q000 = new PoolChunkList<T>(q025, 1, 50, chunkSize);
        qInit = new PoolChunkList<T>(q000, Integer.MIN_VALUE, 25, chunkSize);

        q100.prevList(q075);
        q075.prevList(q050);
        q050.prevList(q025);
        q025.prevList(q000);
        q000.prevList(null);
        qInit.prevList(qInit);

        List<PoolChunkListMetric> metrics = new ArrayList<PoolChunkListMetric>(6);
        metrics.add(qInit);
        metrics.add(q000);
        metrics.add(q025);
        metrics.add(q050);
        metrics.add(q075);
        metrics.add(q100);
        chunkListMetrics = Collections.unmodifiableList(metrics);
    }

    private PoolSubpage<T> newSubpagePoolHead(int pageSize) {
        PoolSubpage<T> head = new PoolSubpage<T>(pageSize);
        head.prev = head;
        head.next = head;
        return head;
    }

    @SuppressWarnings("unchecked")
    private PoolSubpage<T>[] newSubpagePoolArray(int size) {
        return new PoolSubpage[size];
    }

    abstract boolean isDirect();

    PooledByteBuf<T> allocate(PoolThreadCache cache, int reqCapacity, int maxCapacity) {
        PooledByteBuf<T> buf = newByteBuf(maxCapacity);
        allocate(cache, buf, reqCapacity);
        return buf;
    }

    static int tinyIdx(int normCapacity) {
        return normCapacity >>> 4;
    }

    static int smallIdx(int normCapacity) {
        int tableIdx = 0;
        int i = normCapacity >>> 10;
        while (i != 0) {
            i >>>= 1;
            tableIdx ++;
        }
        return tableIdx;
    }

    // capacity < pageSize
    boolean isTinyOrSmall(int normCapacity) {
        return (normCapacity & subpageOverflowMask) == 0;
    }

    // normCapacity < 512
    static boolean isTiny(int normCapacity) {
        return (normCapacity & 0xFFFFFE00) == 0;
    }

    private void allocate(PoolThreadCache cache, PooledByteBuf<T> buf, final int reqCapacity) {
        final int normCapacity = normalizeCapacity(reqCapacity);
        if (isTinyOrSmall(normCapacity)) { // capacity < pageSize
            int tableIdx;
            PoolSubpage<T>[] table;
            boolean tiny = isTiny(normCapacity);
            if (tiny) { // < 512
                if (cache.allocateTiny(this, buf, reqCapacity, normCapacity)) {
                    // was able to allocate out of the cache so move on
                    return;
                }
                tableIdx = tinyIdx(normCapacity);
                table = tinySubpagePools;
            } else {
                if (cache.allocateSmall(this, buf, reqCapacity, normCapacity)) {
                    // was able to allocate out of the cache so move on
                    return;
                }
                tableIdx = smallIdx(normCapacity);
                table = smallSubpagePools;
            }

            final PoolSubpage<T> head = table[tableIdx];

            /**
             * Synchronize on the head. This is needed as {@link PoolChunk#allocateSubpage(int)} and
             * {@link PoolChunk#free(long)} may modify the doubly linked list as well.
             */
            synchronized (head) {
                final PoolSubpage<T> s = head.next;
                if (s != head) {
                    assert s.doNotDestroy && s.elemSize == normCapacity;
                    long handle = s.allocate();
                    assert handle >= 0;
                    s.chunk.initBufWithSubpage(buf, handle, reqCapacity);
                    incTinySmallAllocation(tiny);
                    return;
                }
            }
            synchronized (this) {
                allocateNormal(buf, reqCapacity, normCapacity);
            }

            incTinySmallAllocation(tiny);
            return;
        }
        if (normCapacity <= chunkSize) {
            if (cache.allocateNormal(this, buf, reqCapacity, normCapacity)) {
                // was able to allocate out of the cache so move on
                return;
            }
            synchronized (this) {
                allocateNormal(buf, reqCapacity, normCapacity);
                ++allocationsNormal;
            }
        } else {
            // Huge allocations are never served via the cache so just call allocateHuge
            allocateHuge(buf, reqCapacity);
        }
    }

    // Method must be called insided synchronized(this) { ... } block
    private void allocateNormal(PooledByteBuf<T> buf, int reqCapacity, int normCapacity) {
        if (q050.allocate(buf, reqCapacity, normCapacity) || q025.allocate(buf, reqCapacity, normCapacity) ||
            q000.allocate(buf, reqCapacity, normCapacity) || qInit.allocate(buf, reqCapacity, normCapacity) ||
            q075.allocate(buf, reqCapacity, normCapacity)) {
            return;
        }

        // Add a new chunk.
        PoolChunk<T> c = newChunk(pageSize, maxOrder, pageShifts, chunkSize);
        long handle = c.allocate(normCapacity);
        assert handle > 0;
        c.initBuf(buf, handle, reqCapacity);
        qInit.add(c);
    }

    private void incTinySmallAllocation(boolean tiny) {
        if (tiny) {
            allocationsTiny.increment();
        } else {
            allocationsSmall.increment();
        }
    }

    private void allocateHuge(PooledByteBuf<T> buf, int reqCapacity) {
        PoolChunk<T> chunk = newUnpooledChunk(reqCapacity);
        activeBytesHuge.add(chunk.chunkSize());
        buf.initUnpooled(chunk, reqCapacity);
        allocationsHuge.increment();
    }

    void free(PoolChunk<T> chunk, long handle, int normCapacity, PoolThreadCache cache) {
        if (chunk.unpooled) {
            int size = chunk.chunkSize();
            destroyChunk(chunk);
            activeBytesHuge.add(-size);
            deallocationsHuge.increment();
        } else {
            SizeClass sizeClass = sizeClass(normCapacity);
            if (cache != null && cache.add(this, chunk, handle, normCapacity, sizeClass)) {
                // cached so not free it.
                return;
            }

            freeChunk(chunk, handle, sizeClass);
        }
    }

    private SizeClass sizeClass(int normCapacity) {
        if (!isTinyOrSmall(normCapacity)) {
            return SizeClass.Normal;
        }
        return isTiny(normCapacity) ? SizeClass.Tiny : SizeClass.Small;
    }

    void freeChunk(PoolChunk<T> chunk, long handle, SizeClass sizeClass) {
        final boolean destroyChunk;
        synchronized (this) {
            switch (sizeClass) {
            case Normal:
                ++deallocationsNormal;
                break;
            case Small:
                ++deallocationsSmall;
                break;
            case Tiny:
                ++deallocationsTiny;
                break;
            default:
                throw new Error();
            }
            destroyChunk = !chunk.parent.free(chunk, handle);
        }
        if (destroyChunk) {
            // destroyChunk not need to be called while holding the synchronized lock.
            destroyChunk(chunk);
        }
    }

    PoolSubpage<T> findSubpagePoolHead(int elemSize) {
        int tableIdx;
        PoolSubpage<T>[] table;
        if (isTiny(elemSize)) { // < 512
            tableIdx = elemSize >>> 4;
            table = tinySubpagePools;
        } else {
            tableIdx = 0;
            elemSize >>>= 10;
            while (elemSize != 0) {
                elemSize >>>= 1;
                tableIdx ++;
            }
            table = smallSubpagePools;
        }

        return table[tableIdx];
    }

    int normalizeCapacity(int reqCapacity) {
        if (reqCapacity < 0) {
            throw new IllegalArgumentException("capacity: " + reqCapacity + " (expected: 0+)");
        }

        if (reqCapacity >= chunkSize) {
            return directMemoryCacheAlignment == 0 ? reqCapacity : alignCapacity(reqCapacity);
        }

        if (!isTiny(reqCapacity)) { // >= 512
            // Doubled

            int normalizedCapacity = reqCapacity;
            normalizedCapacity --;
            normalizedCapacity |= normalizedCapacity >>>  1;
            normalizedCapacity |= normalizedCapacity >>>  2;
            normalizedCapacity |= normalizedCapacity >>>  4;
            normalizedCapacity |= normalizedCapacity >>>  8;
            normalizedCapacity |= normalizedCapacity >>> 16;
            normalizedCapacity ++;

            if (normalizedCapacity < 0) {
                normalizedCapacity >>>= 1;
            }
            assert directMemoryCacheAlignment == 0 || (normalizedCapacity & directMemoryCacheAlignmentMask) == 0;

            return normalizedCapacity;
        }

        if (directMemoryCacheAlignment > 0) {
            return alignCapacity(reqCapacity);
        }

        // Quantum-spaced
        if ((reqCapacity & 15) == 0) {
            return reqCapacity;
        }

        return (reqCapacity & ~15) + 16;
    }

    int alignCapacity(int reqCapacity) {
        int delta = reqCapacity & directMemoryCacheAlignmentMask;
        return delta == 0 ? reqCapacity : reqCapacity + directMemoryCacheAlignment - delta;
    }

    void reallocate(PooledByteBuf<T> buf, int newCapacity, boolean freeOldMemory) {
        if (newCapacity < 0 || newCapacity > buf.maxCapacity()) {
            throw new IllegalArgumentException("newCapacity: " + newCapacity);
        }

        int oldCapacity = buf.length;
        if (oldCapacity == newCapacity) {
            return;
        }

        PoolChunk<T> oldChunk = buf.chunk;
        long oldHandle = buf.handle;
        T oldMemory = buf.memory;
        int oldOffset = buf.offset;
        int oldMaxLength = buf.maxLength;
        int readerIndex = buf.readerIndex();
        int writerIndex = buf.writerIndex();

        allocate(parent.threadCache(), buf, newCapacity);
        if (newCapacity > oldCapacity) {
            memoryCopy(
                    oldMemory, oldOffset,
                    buf.memory, buf.offset, oldCapacity);
        } else if (newCapacity < oldCapacity) {
            if (readerIndex < newCapacity) {
                if (writerIndex > newCapacity) {
                    writerIndex = newCapacity;
                }
                memoryCopy(
                        oldMemory, oldOffset + readerIndex,
                        buf.memory, buf.offset + readerIndex, writerIndex - readerIndex);
            } else {
                readerIndex = writerIndex = newCapacity;
            }
        }

        buf.setIndex(readerIndex, writerIndex);

        if (freeOldMemory) {
            free(oldChunk, oldHandle, oldMaxLength, buf.cache);
        }
    }

    @Override
    public int numThreadCaches() {
        return numThreadCaches.get();
    }

    @Override
    public int numTinySubpages() {
        return tinySubpagePools.length;
    }

    @Override
    public int numSmallSubpages() {
        return smallSubpagePools.length;
    }

    @Override
    public int numChunkLists() {
        return chunkListMetrics.size();
    }

    @Override
    public List<PoolSubpageMetric> tinySubpages() {
        return subPageMetricList(tinySubpagePools);
    }

    @Override
    public List<PoolSubpageMetric> smallSubpages() {
        return subPageMetricList(smallSubpagePools);
    }

    @Override
    public List<PoolChunkListMetric> chunkLists() {
        return chunkListMetrics;
    }

    private static List<PoolSubpageMetric> subPageMetricList(PoolSubpage<?>[] pages) {
        List<PoolSubpageMetric> metrics = new ArrayList<PoolSubpageMetric>();
        for (PoolSubpage<?> head : pages) {
            if (head.next == head) {
                continue;
            }
            PoolSubpage<?> s = head.next;
            for (;;) {
                metrics.add(s);
                s = s.next;
                if (s == head) {
                    break;
                }
            }
        }
        return metrics;
    }

    @Override
    public long numAllocations() {
        final long allocsNormal;
        synchronized (this) {
            allocsNormal = allocationsNormal;
        }
        return allocationsTiny.value() + allocationsSmall.value() + allocsNormal + allocationsHuge.value();
    }

    @Override
    public long numTinyAllocations() {
        return allocationsTiny.value();
    }

    @Override
    public long numSmallAllocations() {
        return allocationsSmall.value();
    }

    @Override
    public synchronized long numNormalAllocations() {
        return allocationsNormal;
    }

    @Override
    public long numDeallocations() {
        final long deallocs;
        synchronized (this) {
            deallocs = deallocationsTiny + deallocationsSmall + deallocationsNormal;
        }
        return deallocs + deallocationsHuge.value();
    }

    @Override
    public synchronized long numTinyDeallocations() {
        return deallocationsTiny;
    }

    @Override
    public synchronized long numSmallDeallocations() {
        return deallocationsSmall;
    }

    @Override
    public synchronized long numNormalDeallocations() {
        return deallocationsNormal;
    }

    @Override
    public long numHugeAllocations() {
        return allocationsHuge.value();
    }

    @Override
    public long numHugeDeallocations() {
        return deallocationsHuge.value();
    }

    @Override
    public  long numActiveAllocations() {
        long val = allocationsTiny.value() + allocationsSmall.value() + allocationsHuge.value()
                - deallocationsHuge.value();
        synchronized (this) {
            val += allocationsNormal - (deallocationsTiny + deallocationsSmall + deallocationsNormal);
        }
        return max(val, 0);
    }

    @Override
    public long numActiveTinyAllocations() {
        return max(numTinyAllocations() - numTinyDeallocations(), 0);
    }

    @Override
    public long numActiveSmallAllocations() {
        return max(numSmallAllocations() - numSmallDeallocations(), 0);
    }

    @Override
    public long numActiveNormalAllocations() {
        final long val;
        synchronized (this) {
            val = allocationsNormal - deallocationsNormal;
        }
        return max(val, 0);
    }

    @Override
    public long numActiveHugeAllocations() {
        return max(numHugeAllocations() - numHugeDeallocations(), 0);
    }

    @Override
    public long numActiveBytes() {
        long val = activeBytesHuge.value();
        synchronized (this) {
            for (int i = 0; i < chunkListMetrics.size(); i++) {
                for (PoolChunkMetric m: chunkListMetrics.get(i)) {
                    val += m.chunkSize();
                }
            }
        }
        return max(0, val);
    }

    protected abstract PoolChunk<T> newChunk(int pageSize, int maxOrder, int pageShifts, int chunkSize);
    protected abstract PoolChunk<T> newUnpooledChunk(int capacity);
    protected abstract PooledByteBuf<T> newByteBuf(int maxCapacity);
    protected abstract void memoryCopy(T src, int srcOffset, T dst, int dstOffset, int length);
    protected abstract void destroyChunk(PoolChunk<T> chunk);

    @Override
    public synchronized String toString() {
        StringBuilder buf = new StringBuilder()
            .append("Chunk(s) at 0~25%:")
            .append(StringUtil.NEWLINE)
            .append(qInit)
            .append(StringUtil.NEWLINE)
            .append("Chunk(s) at 0~50%:")
            .append(StringUtil.NEWLINE)
            .append(q000)
            .append(StringUtil.NEWLINE)
            .append("Chunk(s) at 25~75%:")
            .append(StringUtil.NEWLINE)
            .append(q025)
            .append(StringUtil.NEWLINE)
            .append("Chunk(s) at 50~100%:")
            .append(StringUtil.NEWLINE)
            .append(q050)
            .append(StringUtil.NEWLINE)
            .append("Chunk(s) at 75~100%:")
            .append(StringUtil.NEWLINE)
            .append(q075)
            .append(StringUtil.NEWLINE)
            .append("Chunk(s) at 100%:")
            .append(StringUtil.NEWLINE)
            .append(q100)
            .append(StringUtil.NEWLINE)
            .append("tiny subpages:");
        appendPoolSubPages(buf, tinySubpagePools);
        buf.append(StringUtil.NEWLINE)
           .append("small subpages:");
        appendPoolSubPages(buf, smallSubpagePools);
        buf.append(StringUtil.NEWLINE);

        return buf.toString();
    }

    private static void appendPoolSubPages(StringBuilder buf, PoolSubpage<?>[] subpages) {
        for (int i = 0; i < subpages.length; i ++) {
            PoolSubpage<?> head = subpages[i];
            if (head.next == head) {
                continue;
            }

            buf.append(StringUtil.NEWLINE)
                    .append(i)
                    .append(": ");
            PoolSubpage<?> s = head.next;
            for (;;) {
                buf.append(s);
                s = s.next;
                if (s == head) {
                    break;
                }
            }
        }
    }

    @Override
    protected final void finalize() throws Throwable {
        try {
            super.finalize();
        } finally {
            destroyPoolSubPages(smallSubpagePools);
            destroyPoolSubPages(tinySubpagePools);
            destroyPoolChunkLists(qInit, q000, q025, q050, q075, q100);
        }
    }

    private static void destroyPoolSubPages(PoolSubpage<?>[] pages) {
        for (PoolSubpage<?> page : pages) {
            page.destroy();
        }
    }

    private void destroyPoolChunkLists(PoolChunkList<T>... chunkLists) {
        for (PoolChunkList<T> chunkList: chunkLists) {
            chunkList.destroy(this);
        }
    }

    static final class HeapArena extends PoolArena<byte[]> {

        HeapArena(PooledByteBufAllocator parent, int pageSize, int maxOrder,
                int pageShifts, int chunkSize, int directMemoryCacheAlignment) {
            super(parent, pageSize, maxOrder, pageShifts, chunkSize,
                    directMemoryCacheAlignment);
        }

        @Override
        boolean isDirect() {
            return false;
        }

        @Override
        protected PoolChunk<byte[]> newChunk(int pageSize, int maxOrder, int pageShifts, int chunkSize) {
            return new PoolChunk<byte[]>(this, new byte[chunkSize], pageSize, maxOrder, pageShifts, chunkSize, 0);
        }

        @Override
        protected PoolChunk<byte[]> newUnpooledChunk(int capacity) {
            return new PoolChunk<byte[]>(this, new byte[capacity], capacity, 0);
        }

        @Override
        protected void destroyChunk(PoolChunk<byte[]> chunk) {
            // Rely on GC.
        }

        @Override
        protected PooledByteBuf<byte[]> newByteBuf(int maxCapacity) {
            return HAS_UNSAFE ? PooledUnsafeHeapByteBuf.newUnsafeInstance(maxCapacity)
                    : PooledHeapByteBuf.newInstance(maxCapacity);
        }

        @Override
        protected void memoryCopy(byte[] src, int srcOffset, byte[] dst, int dstOffset, int length) {
            if (length == 0) {
                return;
            }

            System.arraycopy(src, srcOffset, dst, dstOffset, length);
        }
    }

    static final class DirectArena extends PoolArena<ByteBuffer> {

        DirectArena(PooledByteBufAllocator parent, int pageSize, int maxOrder,
                int pageShifts, int chunkSize, int directMemoryCacheAlignment) {
            super(parent, pageSize, maxOrder, pageShifts, chunkSize,
                    directMemoryCacheAlignment);
        }

        @Override
        boolean isDirect() {
            return true;
        }

        private int offsetCacheLine(ByteBuffer memory) {
            // We can only calculate the offset if Unsafe is present as otherwise directBufferAddress(...) will
            // throw an NPE.
            return HAS_UNSAFE ?
                    (int) (PlatformDependent.directBufferAddress(memory) & directMemoryCacheAlignmentMask) : 0;
        }

        @Override
        protected PoolChunk<ByteBuffer> newChunk(int pageSize, int maxOrder,
                int pageShifts, int chunkSize) {
            if (directMemoryCacheAlignment == 0) {
                return new PoolChunk<ByteBuffer>(this,
                        allocateDirect(chunkSize), pageSize, maxOrder,
                        pageShifts, chunkSize, 0);
            }
            final ByteBuffer memory = allocateDirect(chunkSize
                    + directMemoryCacheAlignment);
            return new PoolChunk<ByteBuffer>(this, memory, pageSize,
                    maxOrder, pageShifts, chunkSize,
                    offsetCacheLine(memory));
        }

        @Override
        protected PoolChunk<ByteBuffer> newUnpooledChunk(int capacity) {
            if (directMemoryCacheAlignment == 0) {
                return new PoolChunk<ByteBuffer>(this,
                        allocateDirect(capacity), capacity, 0);
            }
            final ByteBuffer memory = allocateDirect(capacity
                    + directMemoryCacheAlignment);
            return new PoolChunk<ByteBuffer>(this, memory, capacity,
                    offsetCacheLine(memory));
        }

        private static ByteBuffer allocateDirect(int capacity) {
            return PlatformDependent.useDirectBufferNoCleaner() ?
                    PlatformDependent.allocateDirectNoCleaner(capacity) : ByteBuffer.allocateDirect(capacity);
        }

        @Override
        protected void destroyChunk(PoolChunk<ByteBuffer> chunk) {
            if (PlatformDependent.useDirectBufferNoCleaner()) {
                PlatformDependent.freeDirectNoCleaner(chunk.memory);
            } else {
                PlatformDependent.freeDirectBuffer(chunk.memory);
            }
        }

        @Override
        protected PooledByteBuf<ByteBuffer> newByteBuf(int maxCapacity) {
            if (HAS_UNSAFE) {
                return PooledUnsafeDirectByteBuf.newInstance(maxCapacity);
            } else {
                return PooledDirectByteBuf.newInstance(maxCapacity);
            }
        }

        @Override
        protected void memoryCopy(ByteBuffer src, int srcOffset, ByteBuffer dst, int dstOffset, int length) {
            if (length == 0) {
                return;
            }

            if (HAS_UNSAFE) {
                PlatformDependent.copyMemory(
                        PlatformDependent.directBufferAddress(src) + srcOffset,
                        PlatformDependent.directBufferAddress(dst) + dstOffset, length);
            } else {
                // We must duplicate the NIO buffers because they may be accessed by other Netty buffers.
                src = src.duplicate();
                dst = dst.duplicate();
                src.position(srcOffset).limit(srcOffset + length);
                dst.position(dstOffset);
                dst.put(src);
            }
        }
    }
}
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								/*
 								 * Copyright 2012 The Netty Project
 								 *
 								 * The Netty Project licenses this file to you under the Apache License,
 								 * version 2.0 (the "License"); you may not use this file except in compliance
 								 * with the License. You may obtain a copy of the License at:
 								 *
 								 *   http://www.apache.org/licenses/LICENSE-2.0
 								 *
 								 * Unless required by applicable law or agreed to in writing, software
 								 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 								 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 								 * License for the specific language governing permissions and limitations
 								 * under the License.
 								 */
 								package io.netty.buffer;
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								import io.netty.util.internal.LongCounter;
-												Replace and merge DetectionUtil and DirectByteBufUtil into PlatformDependent and PlatformDependent0

PlatformDependent delegates the operations requires sun.misc.* to PlatformDependent0 to avoid runtime errors due to missing sun.misc.* classes.

											
										
										
											2013-01-11 06:03:27 +01:00
+								import io.netty.util.internal.PlatformDependent;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								import io.netty.util.internal.StringUtil;
 								import java.nio.ByteBuffer;
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								import java.util.ArrayList;
 								import java.util.Collections;
 								import java.util.List;
-												Change arena to thread cache mapping algorithm to be closer to ideal.

Motivation:
Circular assignment of arenas to thread caches can lead to less than optimal
mappings in cases where threads are (frequently) shutdown and started.

Example Scenario:
There are a total of 2 arenas. The first two threads performing an allocation
would lead to the following mapping:

Thread 0 -> Arena 0
Thread 1 -> Arena 1

Now, assume Thread 1 is shut down and another Thread 2 is started. The current
circular assignment algorithm would lead to the following mapping:

Thread 0 -> Arena 0
Thread 2 -> Arena 0

Ideally, we want Thread 2 to use Arena 1 though.

Presumably, this is not much of an issue for most Netty applications that do all
the allocations inside the eventloop, as eventloop threads are seldomly shut down
and restarted. However, applications that only use the netty-buffer package
or implement their own threading model outside the eventloop might suffer from
increased contention. For example, gRPC Java when using the blocking stub
performs some allocations outside the eventloop and within its own thread pool
that is dynamically sized depending on system load.

Modifications:

Implement a linear scan algorithm that assigns a new thread cache to the arena
that currently backs the fewest thread caches.

Result:

Closer to ideal mappings between thread caches and arenas. In order to always
get an ideal mapping, we would have to re-balance the mapping whenever a thread
dies. However, that's difficult because of deallocation.

											
										
										
											2016-03-14 17:25:43 +01:00
+								import java.util.concurrent.atomic.AtomicInteger;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
-												Cleanup PoolChunk and PoolArena

Motivation:

To make it easier to understand PoolChunk and PoolArena we should cleanup duplicated code.

Modifications:

- Move reused code into methods
- Use Math.max(...)

Result:

Cleaner code and easier to understand.

											
										
										
											2016-04-08 11:56:32 +02:00
+								import static java.lang.Math.max;
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								abstract class PoolArena<T> implements PoolArenaMetric {
-												Add *UnsafeHeapByteBuf for improve performance on systems with sun.misc.Unsafe

Motivation:

sun.misc.Unsafe allows us to handle heap ByteBuf in a more efficient matter. We should use special ByteBuf implementation when sun.misc.Unsafe can be used to increase performance.

Modifications:

- Add PooledUnsafeHeapByteBuf and UnpooledUnsafeHeapByteBuf that are used when sun.misc.Unsafe is ready to use.
- Add UnsafeHeapSwappedByteBuf

Result:

Better performance when using heap buffers and sun.misc.Unsafe is ready to use.

											
										
										
											2015-10-09 21:03:03 +02:00
+								    static final boolean HAS_UNSAFE = PlatformDependent.hasUnsafe();
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								    enum SizeClass {
 								        Tiny,
 								        Small,
 								        Normal
 								    }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								    static final int numTinySubpagePools = 512 >>> 4;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    final PooledByteBufAllocator parent;
 								    private final int maxOrder;
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								    final int pageSize;
 								    final int pageShifts;
 								    final int chunkSize;
 								    final int subpageOverflowMask;
 								    final int numSmallSubpagePools;
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								    final int directMemoryCacheAlignment;
 								    final int directMemoryCacheAlignmentMask;
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								    private final PoolSubpage<T>[] tinySubpagePools;
 								    private final PoolSubpage<T>[] smallSubpagePools;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
 								    private final PoolChunkList<T> q050;
 								    private final PoolChunkList<T> q025;
 								    private final PoolChunkList<T> q000;
 								    private final PoolChunkList<T> qInit;
 								    private final PoolChunkList<T> q075;
 								    private final PoolChunkList<T> q100;
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    private final List<PoolChunkListMetric> chunkListMetrics;
 								    // Metrics for allocations and deallocations
 								    private long allocationsNormal;
 								    // We need to use the LongCounter here as this is not guarded via synchronized block.
-												Add final keyword which was missing in 47b598e6ce6c1f0e57d6955d10652dd5e5bc1462

Motivation:

The two fields should have final keyword.

Modifications:

Add final keyword

Result:

Cleaner code.

											
										
										
											2016-03-14 09:00:50 +01:00
+								    private final LongCounter allocationsTiny = PlatformDependent.newLongCounter();
 								    private final LongCounter allocationsSmall = PlatformDependent.newLongCounter();
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    private final LongCounter allocationsHuge = PlatformDependent.newLongCounter();
-												Allow to retrieve the number of active bytes per PoolArena.

Motivation:

To better understand how much memory is used by Netty for ByteBufs it is useful to understand how many bytes are currently active (allocated) per PoolArena.

Modifications:

- Add PoolArenaMetric.numActiveBytes()

Result:

The user is able to get better insight into the PooledByteBufAllocator.

											
										
										
											2016-04-06 15:12:37 +02:00
+								    private final LongCounter activeBytesHuge = PlatformDependent.newLongCounter();
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
 								    private long deallocationsTiny;
 								    private long deallocationsSmall;
 								    private long deallocationsNormal;
-												Add final keyword which was missing in 47b598e6ce6c1f0e57d6955d10652dd5e5bc1462

Motivation:

The two fields should have final keyword.

Modifications:

Add final keyword

Result:

Cleaner code.

											
										
										
											2016-03-14 09:00:50 +01:00
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    // We need to use the LongCounter here as this is not guarded via synchronized block.
 								    private final LongCounter deallocationsHuge = PlatformDependent.newLongCounter();
-												Change arena to thread cache mapping algorithm to be closer to ideal.

Motivation:
Circular assignment of arenas to thread caches can lead to less than optimal
mappings in cases where threads are (frequently) shutdown and started.

Example Scenario:
There are a total of 2 arenas. The first two threads performing an allocation
would lead to the following mapping:

Thread 0 -> Arena 0
Thread 1 -> Arena 1

Now, assume Thread 1 is shut down and another Thread 2 is started. The current
circular assignment algorithm would lead to the following mapping:

Thread 0 -> Arena 0
Thread 2 -> Arena 0

Ideally, we want Thread 2 to use Arena 1 though.

Presumably, this is not much of an issue for most Netty applications that do all
the allocations inside the eventloop, as eventloop threads are seldomly shut down
and restarted. However, applications that only use the netty-buffer package
or implement their own threading model outside the eventloop might suffer from
increased contention. For example, gRPC Java when using the blocking stub
performs some allocations outside the eventloop and within its own thread pool
that is dynamically sized depending on system load.

Modifications:

Implement a linear scan algorithm that assigns a new thread cache to the arena
that currently backs the fewest thread caches.

Result:

Closer to ideal mappings between thread caches and arenas. In order to always
get an ideal mapping, we would have to re-balance the mapping whenever a thread
dies. However, that's difficult because of deallocation.

											
										
										
											2016-03-14 17:25:43 +01:00
+								    // Number of thread caches backed by this arena.
 								    final AtomicInteger numThreadCaches = new AtomicInteger();
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    // TODO: Test if adding padding helps under contention
 								    //private long pad0, pad1, pad2, pad3, pad4, pad5, pad6, pad7;
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								    protected PoolArena(PooledByteBufAllocator parent, int pageSize,
 								          int maxOrder, int pageShifts, int chunkSize, int cacheAlignment) {
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        this.parent = parent;
 								        this.pageSize = pageSize;
 								        this.maxOrder = maxOrder;
 								        this.pageShifts = pageShifts;
 								        this.chunkSize = chunkSize;
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        this.directMemoryCacheAlignment = cacheAlignment;
 								        this.directMemoryCacheAlignmentMask = cacheAlignment - 1;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        subpageOverflowMask = ~(pageSize - 1);
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        tinySubpagePools = newSubpagePoolArray(numTinySubpagePools);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        for (int i = 0; i < tinySubpagePools.length; i ++) {
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								            tinySubpagePools[i] = newSubpagePoolHead(pageSize);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        numSmallSubpagePools = pageShifts - 9;
 								        smallSubpagePools = newSubpagePoolArray(numSmallSubpagePools);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        for (int i = 0; i < smallSubpagePools.length; i ++) {
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								            smallSubpagePools[i] = newSubpagePoolHead(pageSize);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												PoolChunkList.allocate(...) should return false without the need to walk all the contained PoolChunks when the requested capacity is too big.

Motivation:

PoolChunkList.allocate(...) should return false without the need to walk all the contained PoolChunks when the requested capacity is larger then the capacity that can be allocated out of the PoolChunks in respect to the minUsage() and maxUsage() of the PoolChunkList.

Modifications:

Precompute the maximal capacity that can be allocated out of the PoolChunks that are contained in the PoolChunkList and use this to fast return from the allocate(...) method if an allocation capacity larger then that is requested.

Result:

Faster detection of allocations that can not be handled by the PoolChunkList and so faster allocations in general via the PoolArena.

											
										
										
											2016-04-06 20:05:23 +02:00
+								        q100 = new PoolChunkList<T>(null, 100, Integer.MAX_VALUE, chunkSize);
 								        q075 = new PoolChunkList<T>(q100, 75, 100, chunkSize);
 								        q050 = new PoolChunkList<T>(q075, 50, 100, chunkSize);
 								        q025 = new PoolChunkList<T>(q050, 25, 75, chunkSize);
 								        q000 = new PoolChunkList<T>(q025, 1, 50, chunkSize);
 								        qInit = new PoolChunkList<T>(q000, Integer.MIN_VALUE, 25, chunkSize);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								        q100.prevList(q075);
 								        q075.prevList(q050);
 								        q050.prevList(q025);
 								        q025.prevList(q000);
 								        q000.prevList(null);
 								        qInit.prevList(qInit);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
 								        List<PoolChunkListMetric> metrics = new ArrayList<PoolChunkListMetric>(6);
 								        metrics.add(qInit);
 								        metrics.add(q000);
 								        metrics.add(q025);
 								        metrics.add(q050);
 								        metrics.add(q075);
 								        metrics.add(q100);
 								        chunkListMetrics = Collections.unmodifiableList(metrics);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    }
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								    private PoolSubpage<T> newSubpagePoolHead(int pageSize) {
 								        PoolSubpage<T> head = new PoolSubpage<T>(pageSize);
 								        head.prev = head;
 								        head.next = head;
 								        return head;
 								    }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    @SuppressWarnings("unchecked")
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								    private PoolSubpage<T>[] newSubpagePoolArray(int size) {
 								        return new PoolSubpage[size];
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								    abstract boolean isDirect();
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								    PooledByteBuf<T> allocate(PoolThreadCache cache, int reqCapacity, int maxCapacity) {
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        PooledByteBuf<T> buf = newByteBuf(maxCapacity);
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								        allocate(cache, buf, reqCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        return buf;
 								    }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								    static int tinyIdx(int normCapacity) {
 								        return normCapacity >>> 4;
 								    }
 								    static int smallIdx(int normCapacity) {
 								        int tableIdx = 0;
 								        int i = normCapacity >>> 10;
 								        while (i != 0) {
 								            i >>>= 1;
 								            tableIdx ++;
 								        }
 								        return tableIdx;
 								    }
 								    // capacity < pageSize
 								    boolean isTinyOrSmall(int normCapacity) {
 								        return (normCapacity & subpageOverflowMask) == 0;
 								    }
 								    // normCapacity < 512
 								    static boolean isTiny(int normCapacity) {
 								        return (normCapacity & 0xFFFFFE00) == 0;
 								    }
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								    private void allocate(PoolThreadCache cache, PooledByteBuf<T> buf, final int reqCapacity) {
 								        final int normCapacity = normalizeCapacity(reqCapacity);
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        if (isTinyOrSmall(normCapacity)) { // capacity < pageSize
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								            int tableIdx;
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								            PoolSubpage<T>[] table;
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								            boolean tiny = isTiny(normCapacity);
 								            if (tiny) { // < 512
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								                if (cache.allocateTiny(this, buf, reqCapacity, normCapacity)) {
 								                    // was able to allocate out of the cache so move on
 								                    return;
 								                }
 								                tableIdx = tinyIdx(normCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								                table = tinySubpagePools;
 								            } else {
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								                if (cache.allocateSmall(this, buf, reqCapacity, normCapacity)) {
 								                    // was able to allocate out of the cache so move on
 								                    return;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								                }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								                tableIdx = smallIdx(normCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								                table = smallSubpagePools;
 								            }
-												No need to release lock and acquire again when allocate normal size.

Motiviation:

When tried to allocate tiny and small sized and failed to serve these out of the PoolSubPage we exit the synchronization
block just to enter it again when call allocateNormal(...).

Modification:

Not exit the synchronized block until allocateNormal(...) is done.

Result:

Better performance.

											
										
										
											2015-05-13 13:50:22 +02:00
+								            final PoolSubpage<T> head = table[tableIdx];
-												[#3654] Synchronize on PoolSubpage head when allocate / free PoolSubpages

Motivation:

Currently we hold a lock on the PoolArena when we allocate / free PoolSubpages, which is wasteful as this also affects "normal" allocations. The same is true vice-verse.

Modifications:

Ensure we synchronize on the head of the PoolSubPages pool. This is done per size and so it is possible to concurrently allocate / deallocate PoolSubPages with different sizes, and also normal allocations.

Result:

Less condition and so faster allocation/deallocation.

Before this commit:
xxx:~/wrk $ ./wrk -H 'Connection: keep-alive' -d 120 -c 256 -t 16 -s scripts/pipeline-many.lua  http://xxx:8080/plaintext
Running 2m test @ http://xxx:8080/plaintext
  16 threads and 256 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    17.61ms   29.52ms 689.73ms   97.27%
    Req/Sec   278.93k    41.97k  351.04k    84.83%
  530527460 requests in 2.00m, 71.64GB read
Requests/sec: 4422226.13
Transfer/sec:    611.52MB

After this commit:
xxx:~/wrk $ ./wrk -H 'Connection: keep-alive' -d 120 -c 256 -t 16 -s scripts/pipeline-many.lua  http://xxx:8080/plaintext
Running 2m test @ http://xxx:8080/plaintext
  16 threads and 256 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    15.85ms   24.50ms 681.61ms   97.42%
    Req/Sec   287.14k    38.39k  360.33k    85.88%
  547902773 requests in 2.00m, 73.99GB read
Requests/sec: 4567066.11
Transfer/sec:    631.55MB

This is reproducable every time.

											
										
										
											2015-05-20 07:27:55 +02:00
 								            /**
-												[#4198] Fix race-condition when allocate from multiple-thread.

Motivation:

Fix a race condition that was introduced by f18990a8a507d52fc40416d169db340105b10ec0 that could lead to a NPE when allocate from the PooledByteBufAllocator concurrently by many threads.

Modifications:

Correctly synchronize on the PoolSubPage head.

Result:

No more race.

											
										
										
											2015-10-22 22:09:41 +02:00
+								             * Synchronize on the head. This is needed as {@link PoolChunk#allocateSubpage(int)} and
 								             * {@link PoolChunk#free(long)} may modify the doubly linked list as well.
-												[#3654] Synchronize on PoolSubpage head when allocate / free PoolSubpages

Motivation:

Currently we hold a lock on the PoolArena when we allocate / free PoolSubpages, which is wasteful as this also affects "normal" allocations. The same is true vice-verse.

Modifications:

Ensure we synchronize on the head of the PoolSubPages pool. This is done per size and so it is possible to concurrently allocate / deallocate PoolSubPages with different sizes, and also normal allocations.

Result:

Less condition and so faster allocation/deallocation.

Before this commit:
xxx:~/wrk $ ./wrk -H 'Connection: keep-alive' -d 120 -c 256 -t 16 -s scripts/pipeline-many.lua  http://xxx:8080/plaintext
Running 2m test @ http://xxx:8080/plaintext
  16 threads and 256 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    17.61ms   29.52ms 689.73ms   97.27%
    Req/Sec   278.93k    41.97k  351.04k    84.83%
  530527460 requests in 2.00m, 71.64GB read
Requests/sec: 4422226.13
Transfer/sec:    611.52MB

After this commit:
xxx:~/wrk $ ./wrk -H 'Connection: keep-alive' -d 120 -c 256 -t 16 -s scripts/pipeline-many.lua  http://xxx:8080/plaintext
Running 2m test @ http://xxx:8080/plaintext
  16 threads and 256 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    15.85ms   24.50ms 681.61ms   97.42%
    Req/Sec   287.14k    38.39k  360.33k    85.88%
  547902773 requests in 2.00m, 73.99GB read
Requests/sec: 4567066.11
Transfer/sec:    631.55MB

This is reproducable every time.

											
										
										
											2015-05-20 07:27:55 +02:00
+								             */
 								            synchronized (head) {
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								                final PoolSubpage<T> s = head.next;
 								                if (s != head) {
 								                    assert s.doNotDestroy && s.elemSize == normCapacity;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								                    long handle = s.allocate();
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								                    assert handle >= 0;
 								                    s.chunk.initBufWithSubpage(buf, handle, reqCapacity);
-												Calculate correct count for tiny/small/normal allocation

Motivation:

Disable ThreadLocal Cache, then allocate Pooled ByteBuf and release all these buffers, PoolArena's tiny/small/normal allocation count is incorrect.

Modifications:

- Calculate PoolArena's tiny/small/normal allocation one time
- Add testAllocationCounter TestCase

Result:

Fixes #6282 .

											
										
										
											2017-01-27 09:28:19 +01:00
+								                    incTinySmallAllocation(tiny);
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								                    return;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								                }
 								            }
-												Calculate correct count for tiny/small/normal allocation

Motivation:

Disable ThreadLocal Cache, then allocate Pooled ByteBuf and release all these buffers, PoolArena's tiny/small/normal allocation count is incorrect.

Modifications:

- Calculate PoolArena's tiny/small/normal allocation one time
- Add testAllocationCounter TestCase

Result:

Fixes #6282 .

											
										
										
											2017-01-27 09:28:19 +01:00
+								            synchronized (this) {
 								                allocateNormal(buf, reqCapacity, normCapacity);
 								            }
 								            incTinySmallAllocation(tiny);
-												[#3654] Synchronize on PoolSubpage head when allocate / free PoolSubpages

Motivation:

Currently we hold a lock on the PoolArena when we allocate / free PoolSubpages, which is wasteful as this also affects "normal" allocations. The same is true vice-verse.

Modifications:

Ensure we synchronize on the head of the PoolSubPages pool. This is done per size and so it is possible to concurrently allocate / deallocate PoolSubPages with different sizes, and also normal allocations.

Result:

Less condition and so faster allocation/deallocation.

Before this commit:
xxx:~/wrk $ ./wrk -H 'Connection: keep-alive' -d 120 -c 256 -t 16 -s scripts/pipeline-many.lua  http://xxx:8080/plaintext
Running 2m test @ http://xxx:8080/plaintext
  16 threads and 256 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    17.61ms   29.52ms 689.73ms   97.27%
    Req/Sec   278.93k    41.97k  351.04k    84.83%
  530527460 requests in 2.00m, 71.64GB read
Requests/sec: 4422226.13
Transfer/sec:    611.52MB

After this commit:
xxx:~/wrk $ ./wrk -H 'Connection: keep-alive' -d 120 -c 256 -t 16 -s scripts/pipeline-many.lua  http://xxx:8080/plaintext
Running 2m test @ http://xxx:8080/plaintext
  16 threads and 256 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency    15.85ms   24.50ms 681.61ms   97.42%
    Req/Sec   287.14k    38.39k  360.33k    85.88%
  547902773 requests in 2.00m, 73.99GB read
Requests/sec: 4567066.11
Transfer/sec:    631.55MB

This is reproducable every time.

											
										
										
											2015-05-20 07:27:55 +02:00
+								            return;
-												No need to release lock and acquire again when allocate normal size.

Motiviation:

When tried to allocate tiny and small sized and failed to serve these out of the PoolSubPage we exit the synchronization
block just to enter it again when call allocateNormal(...).

Modification:

Not exit the synchronized block until allocateNormal(...) is done.

Result:

Better performance.

											
										
										
											2015-05-13 13:50:22 +02:00
+								        }
 								        if (normCapacity <= chunkSize) {
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								            if (cache.allocateNormal(this, buf, reqCapacity, normCapacity)) {
 								                // was able to allocate out of the cache so move on
 								                return;
 								            }
-												Calculate correct count for tiny/small/normal allocation

Motivation:

Disable ThreadLocal Cache, then allocate Pooled ByteBuf and release all these buffers, PoolArena's tiny/small/normal allocation count is incorrect.

Modifications:

- Calculate PoolArena's tiny/small/normal allocation one time
- Add testAllocationCounter TestCase

Result:

Fixes #6282 .

											
										
										
											2017-01-27 09:28:19 +01:00
+								            synchronized (this) {
 								                allocateNormal(buf, reqCapacity, normCapacity);
 								                ++allocationsNormal;
 								            }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        } else {
 								            // Huge allocations are never served via the cache so just call allocateHuge
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								            allocateHuge(buf, reqCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								    }
-												Calculate correct count for tiny/small/normal allocation

Motivation:

Disable ThreadLocal Cache, then allocate Pooled ByteBuf and release all these buffers, PoolArena's tiny/small/normal allocation count is incorrect.

Modifications:

- Calculate PoolArena's tiny/small/normal allocation one time
- Add testAllocationCounter TestCase

Result:

Fixes #6282 .

											
										
										
											2017-01-27 09:28:19 +01:00
+								    // Method must be called insided synchronized(this) { ... } block
 								    private void allocateNormal(PooledByteBuf<T> buf, int reqCapacity, int normCapacity) {
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								        if (q050.allocate(buf, reqCapacity, normCapacity) || q025.allocate(buf, reqCapacity, normCapacity) ||
 								            q000.allocate(buf, reqCapacity, normCapacity) || qInit.allocate(buf, reqCapacity, normCapacity) ||
-												Not try to allocate out of the PoolChunkList that contains only full PoolChunks

Motivation:

When doing a normal allocation in PoolArena we also tried to allocate out of the PoolChunkList that only contains completely full PoolChunks. This makes no sense as these are full anyway so an allocation will never work here and just gives a perf hit as we need to walk the whole list of PoolChunks in the list.

Modifications:

Not try to allocate from PoolChunkList that only contains full PoolChunks

Result:

Faster allocation times when a new PoolChunk must be created.

											
										
										
											2016-04-06 11:05:59 +02:00
+								            q075.allocate(buf, reqCapacity, normCapacity)) {
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								            return;
 								        }
 								        // Add a new chunk.
 								        PoolChunk<T> c = newChunk(pageSize, maxOrder, pageShifts, chunkSize);
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								        long handle = c.allocate(normCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        assert handle > 0;
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								        c.initBuf(buf, handle, reqCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        qInit.add(c);
 								    }
-												Calculate correct count for tiny/small/normal allocation

Motivation:

Disable ThreadLocal Cache, then allocate Pooled ByteBuf and release all these buffers, PoolArena's tiny/small/normal allocation count is incorrect.

Modifications:

- Calculate PoolArena's tiny/small/normal allocation one time
- Add testAllocationCounter TestCase

Result:

Fixes #6282 .

											
										
										
											2017-01-27 09:28:19 +01:00
+								    private void incTinySmallAllocation(boolean tiny) {
 								        if (tiny) {
 								            allocationsTiny.increment();
 								        } else {
 								            allocationsSmall.increment();
 								        }
 								    }
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								    private void allocateHuge(PooledByteBuf<T> buf, int reqCapacity) {
-												Allow to retrieve the number of active bytes per PoolArena.

Motivation:

To better understand how much memory is used by Netty for ByteBufs it is useful to understand how many bytes are currently active (allocated) per PoolArena.

Modifications:

- Add PoolArenaMetric.numActiveBytes()

Result:

The user is able to get better insight into the PooledByteBufAllocator.

											
										
										
											2016-04-06 15:12:37 +02:00
+								        PoolChunk<T> chunk = newUnpooledChunk(reqCapacity);
 								        activeBytesHuge.add(chunk.chunkSize());
 								        buf.initUnpooled(chunk, reqCapacity);
-												Only increment metric for huge / normal allocations after the allocation was really done.

Motivation:

We should only increment the metric for the huge / normal allocation after it is done. Also we should only decrement once deallocate.

Modifications:

- Move increment after the allocation.
- Fix deallocation metric and move it after deallocation

Result:

More correct metrics.

											
										
										
											2016-04-05 11:46:25 +02:00
+								        allocationsHuge.increment();
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								    }
-												Let PoolThreadCache work even if allocation and deallocation Thread are different

Motivation:

PoolThreadCache did only cache allocations if the allocation and deallocation Thread were the same. This is not optimal as often people write from differen thread then the actual EventLoop thread.

Modification:

- Add MpscArrayQueue which was forked from jctools and lightly modified.
- Use MpscArrayQueue for caches and always add buffer back to the cache that belongs to the allocation thread.

Result:

ThreadPoolCache is now also usable and so gives performance improvements when allocation and deallocation thread are different.

Performance when using same thread for allocation and deallocation is noticable worse then before.

											
										
										
											2015-05-19 15:03:29 +02:00
+								    void free(PoolChunk<T> chunk, long handle, int normCapacity, PoolThreadCache cache) {
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        if (chunk.unpooled) {
-												Allow to retrieve the number of active bytes per PoolArena.

Motivation:

To better understand how much memory is used by Netty for ByteBufs it is useful to understand how many bytes are currently active (allocated) per PoolArena.

Modifications:

- Add PoolArenaMetric.numActiveBytes()

Result:

The user is able to get better insight into the PooledByteBufAllocator.

											
										
										
											2016-04-06 15:12:37 +02:00
+								            int size = chunk.chunkSize();
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								            destroyChunk(chunk);
-												Allow to retrieve the number of active bytes per PoolArena.

Motivation:

To better understand how much memory is used by Netty for ByteBufs it is useful to understand how many bytes are currently active (allocated) per PoolArena.

Modifications:

- Add PoolArenaMetric.numActiveBytes()

Result:

The user is able to get better insight into the PooledByteBufAllocator.

											
										
										
											2016-04-06 15:12:37 +02:00
+								            activeBytesHuge.add(-size);
-												[#5216] Correctly increment deallocationsHuge when call PoolArena.free

Motivation:

We called deallocationsHuge.decrement() but it needs to be increment()

Modifications:

Replace decrement() with increment()

Result:

Correct metrics.

											
										
										
											2016-05-05 20:54:08 +02:00
+								            deallocationsHuge.increment();
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        } else {
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								            SizeClass sizeClass = sizeClass(normCapacity);
-												Let PoolThreadCache work even if allocation and deallocation Thread are different

Motivation:

PoolThreadCache did only cache allocations if the allocation and deallocation Thread were the same. This is not optimal as often people write from differen thread then the actual EventLoop thread.

Modification:

- Add MpscArrayQueue which was forked from jctools and lightly modified.
- Use MpscArrayQueue for caches and always add buffer back to the cache that belongs to the allocation thread.

Result:

ThreadPoolCache is now also usable and so gives performance improvements when allocation and deallocation thread are different.

Performance when using same thread for allocation and deallocation is noticable worse then before.

											
										
										
											2015-05-19 15:03:29 +02:00
+								            if (cache != null && cache.add(this, chunk, handle, normCapacity, sizeClass)) {
 								                // cached so not free it.
 								                return;
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								            }
-												Let PoolThreadCache work even if allocation and deallocation Thread are different

Motivation:

PoolThreadCache did only cache allocations if the allocation and deallocation Thread were the same. This is not optimal as often people write from differen thread then the actual EventLoop thread.

Modification:

- Add MpscArrayQueue which was forked from jctools and lightly modified.
- Use MpscArrayQueue for caches and always add buffer back to the cache that belongs to the allocation thread.

Result:

ThreadPoolCache is now also usable and so gives performance improvements when allocation and deallocation thread are different.

Performance when using same thread for allocation and deallocation is noticable worse then before.

											
										
										
											2015-05-19 15:03:29 +02:00
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								            freeChunk(chunk, handle, sizeClass);
 								        }
 								    }
-												Disable caching of PooledByteBuf for different threads.

Motivation:

We introduced a PoolThreadCache which is used in our PooledByteBufAllocator to reduce the synchronization overhead on PoolArenas when allocate / deallocate PooledByteBuf instances. This cache is used for both the allocation path and deallocation path by:
  - Look for cached memory in the PoolThreadCache for the Thread that tries to allocate a new PooledByteBuf and if one is found return it.
  - Add the memory that is used by a PooledByteBuf to the PoolThreadCache of the Thread that release the PooledByteBuf

This works out very well when all allocation / deallocation is done in the EventLoop as the EventLoop will be used for read and write. On the otherside this can lead to surprising side-effects if the user allocate from outside the EventLoop and and pass the ByteBuf over for writing. The problem here is that the memory will be added to the PoolThreadCache that did the actual write on the underlying transport and not on the Thread that previously allocated the buffer.

Modifications:

Don't cache if different Threads are used for allocating/deallocating

Result:

Less confusing behavior for users that allocate PooledByteBufs from outside the EventLoop.

											
										
										
											2014-09-02 07:15:58 +02:00
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								    private SizeClass sizeClass(int normCapacity) {
 								        if (!isTinyOrSmall(normCapacity)) {
 								            return SizeClass.Normal;
 								        }
 								        return isTiny(normCapacity) ? SizeClass.Tiny : SizeClass.Small;
 								    }
 								    void freeChunk(PoolChunk<T> chunk, long handle, SizeClass sizeClass) {
 								        final boolean destroyChunk;
 								        synchronized (this) {
 								            switch (sizeClass) {
 								            case Normal:
 								                ++deallocationsNormal;
 								                break;
 								            case Small:
 								                ++deallocationsSmall;
 								                break;
 								            case Tiny:
 								                ++deallocationsTiny;
 								                break;
 								            default:
 								                throw new Error();
-												[#2021] No need to synchronize for unpooled chunks

											
										
										
											2013-11-30 19:48:40 +01:00
+								            }
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								            destroyChunk = !chunk.parent.free(chunk, handle);
 								        }
 								        if (destroyChunk) {
 								            // destroyChunk not need to be called while holding the synchronized lock.
 								            destroyChunk(chunk);
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    }
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								    PoolSubpage<T> findSubpagePoolHead(int elemSize) {
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        int tableIdx;
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								        PoolSubpage<T>[] table;
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        if (isTiny(elemSize)) { // < 512
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								            tableIdx = elemSize >>> 4;
 								            table = tinySubpagePools;
 								        } else {
 								            tableIdx = 0;
 								            elemSize >>>= 10;
 								            while (elemSize != 0) {
 								                elemSize >>>= 1;
 								                tableIdx ++;
 								            }
 								            table = smallSubpagePools;
 								        }
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								        return table[tableIdx];
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								    int normalizeCapacity(int reqCapacity) {
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        if (reqCapacity < 0) {
 								            throw new IllegalArgumentException("capacity: " + reqCapacity + " (expected: 0+)");
 								        }
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        if (reqCapacity >= chunkSize) {
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								            return directMemoryCacheAlignment == 0 ? reqCapacity : alignCapacity(reqCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        if (!isTiny(reqCapacity)) { // >= 512
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								            // Doubled
-												Make AdaptiveRecvByteBufAllocator's lookup table simpler / Optimize buffer size normalization

- No need to have fine-grained lookup table because the buffer pool has
  much more coarse capacities available
- No need to use a loop to normalize a buffer capacity

											
										
										
											2013-06-20 11:38:11 +02:00
 								            int normalizedCapacity = reqCapacity;
-												Fix error that causes (up to) double memory usage

Motivation:

PoolArena's 'normalizeCapacity' function was micro-optimized some
time ago to remove a while loop. However, there was a change of
behavior in the function as a result. Capacities passed into it
that are already powers of 2 (and >= 512) are doubled in size. So
if I ask for a buffer with a capacity of 1024, I will get back one
that actually uses 2048 bytes (stored in maxLength).

Aligning to powers of two for book keeping ease is reasonable,
and if someone tries to expand a buffer, you might as well use some
of the previously wasted space. However, since this distinction
between 'easily expanded' and 'costly to expand' space is not
supported at all by the APIs, I cannot imagine this change to
doubling is desirable or intentional.

This is especially costly when using composite buffers. They
frequently allocate components with a capacity that is a power of
2, and they never attempt to expand components themselves. The end
result is that heavy use of pool-backed composite buffers wastes
almost half of the memory pool (the smaller / initial components are
<512 and so are not affected by the off-by-one bug).

Modifications:

Although I find it difficult to believe that such an optimization
is really helpful, I left it in and fixed the off-by-one issue by
decrementing the value at the start.

I also added a simple test to both attempt to verify that the
decrement fixes the issue without introducing any other change, and
to make it easy for a reviewer to test the existing behavior. PoolArena
does not seem to have much testing or testability support though so
the test is kind of a hack and will break for unrelated changes. I
suggest either removing it or factoring out the single non-static
portion of normalizeCapacity so that the fragile dummy PoolArena is
not required.

Result:

Pooled allocators will allocate less resources to the highly
inefficient and undocumented buffer section between length and
maxLength.

Composite buffers of non-trivial size that are backed by pooled
allocators will use about half as much memory.

											
										
										
											2014-04-14 22:57:24 +02:00
+								            normalizedCapacity --;
-												Make AdaptiveRecvByteBufAllocator's lookup table simpler / Optimize buffer size normalization

- No need to have fine-grained lookup table because the buffer pool has
  much more coarse capacities available
- No need to use a loop to normalize a buffer capacity

											
										
										
											2013-06-20 11:38:11 +02:00
+								            normalizedCapacity |= normalizedCapacity >>>  1;
 								            normalizedCapacity |= normalizedCapacity >>>  2;
 								            normalizedCapacity |= normalizedCapacity >>>  4;
 								            normalizedCapacity |= normalizedCapacity >>>  8;
 								            normalizedCapacity |= normalizedCapacity >>> 16;
 								            normalizedCapacity ++;
 								            if (normalizedCapacity < 0) {
 								                normalizedCapacity >>>= 1;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								            }
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								            assert directMemoryCacheAlignment == 0 || (normalizedCapacity & directMemoryCacheAlignmentMask) == 0;
-												Make AdaptiveRecvByteBufAllocator's lookup table simpler / Optimize buffer size normalization

- No need to have fine-grained lookup table because the buffer pool has
  much more coarse capacities available
- No need to use a loop to normalize a buffer capacity

											
										
										
											2013-06-20 11:38:11 +02:00
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								            return normalizedCapacity;
 								        }
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        if (directMemoryCacheAlignment > 0) {
 								            return alignCapacity(reqCapacity);
 								        }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        // Quantum-spaced
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								        if ((reqCapacity & 15) == 0) {
 								            return reqCapacity;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Return the new buffer's capacity is same with the requested capacity

- Rename capacity variables to reqCapacity or normCapacity to distinguish if its the request capacity or the normalized capacity
- Do not reallocate on ByteBuf.capacity(int) if reallocation is unnecessary; just update the index range.
- Revert the workaround in DefaultChannelHandlerContext

											
										
										
											2012-12-19 08:48:53 +01:00
+								        return (reqCapacity & ~15) + 16;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    }
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								    int alignCapacity(int reqCapacity) {
 								        int delta = reqCapacity & directMemoryCacheAlignmentMask;
 								        return delta == 0 ? reqCapacity : reqCapacity + directMemoryCacheAlignment - delta;
 								    }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    void reallocate(PooledByteBuf<T> buf, int newCapacity, boolean freeOldMemory) {
 								        if (newCapacity < 0 || newCapacity > buf.maxCapacity()) {
 								            throw new IllegalArgumentException("newCapacity: " + newCapacity);
 								        }
 								        int oldCapacity = buf.length;
 								        if (oldCapacity == newCapacity) {
 								            return;
 								        }
 								        PoolChunk<T> oldChunk = buf.chunk;
 								        long oldHandle = buf.handle;
 								        T oldMemory = buf.memory;
 								        int oldOffset = buf.offset;
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        int oldMaxLength = buf.maxLength;
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        int readerIndex = buf.readerIndex();
 								        int writerIndex = buf.writerIndex();
-												[#3654] No need to hold lock while destroy a chunk

Motiviation:

At the moment we sometimes hold the lock on the PoolArena during destroy a PoolChunk. This is not needed.

Modification:

- Ensure we not hold the lock during destroy a PoolChunk
- Move all synchronized usage in PoolArena
- Cleanup

Result:

Less condition.

											
										
										
											2015-05-25 21:00:24 +02:00
+								        allocate(parent.threadCache(), buf, newCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        if (newCapacity > oldCapacity) {
 								            memoryCopy(
-												[#1800] [#1802] Correctly expand capacity of ByteBuf while preserve content

											
										
										
											2013-11-04 15:18:21 +01:00
+								                    oldMemory, oldOffset,
 								                    buf.memory, buf.offset, oldCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        } else if (newCapacity < oldCapacity) {
 								            if (readerIndex < newCapacity) {
 								                if (writerIndex > newCapacity) {
 								                    writerIndex = newCapacity;
 								                }
 								                memoryCopy(
 								                        oldMemory, oldOffset + readerIndex,
 								                        buf.memory, buf.offset + readerIndex, writerIndex - readerIndex);
 								            } else {
 								                readerIndex = writerIndex = newCapacity;
 								            }
 								        }
 								        buf.setIndex(readerIndex, writerIndex);
 								        if (freeOldMemory) {
-												Let PoolThreadCache work even if allocation and deallocation Thread are different

Motivation:

PoolThreadCache did only cache allocations if the allocation and deallocation Thread were the same. This is not optimal as often people write from differen thread then the actual EventLoop thread.

Modification:

- Add MpscArrayQueue which was forked from jctools and lightly modified.
- Use MpscArrayQueue for caches and always add buffer back to the cache that belongs to the allocation thread.

Result:

ThreadPoolCache is now also usable and so gives performance improvements when allocation and deallocation thread are different.

Performance when using same thread for allocation and deallocation is noticable worse then before.

											
										
										
											2015-05-19 15:03:29 +02:00
+								            free(oldChunk, oldHandle, oldMaxLength, buf.cache);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								    }
-												Change arena to thread cache mapping algorithm to be closer to ideal.

Motivation:
Circular assignment of arenas to thread caches can lead to less than optimal
mappings in cases where threads are (frequently) shutdown and started.

Example Scenario:
There are a total of 2 arenas. The first two threads performing an allocation
would lead to the following mapping:

Thread 0 -> Arena 0
Thread 1 -> Arena 1

Now, assume Thread 1 is shut down and another Thread 2 is started. The current
circular assignment algorithm would lead to the following mapping:

Thread 0 -> Arena 0
Thread 2 -> Arena 0

Ideally, we want Thread 2 to use Arena 1 though.

Presumably, this is not much of an issue for most Netty applications that do all
the allocations inside the eventloop, as eventloop threads are seldomly shut down
and restarted. However, applications that only use the netty-buffer package
or implement their own threading model outside the eventloop might suffer from
increased contention. For example, gRPC Java when using the blocking stub
performs some allocations outside the eventloop and within its own thread pool
that is dynamically sized depending on system load.

Modifications:

Implement a linear scan algorithm that assigns a new thread cache to the arena
that currently backs the fewest thread caches.

Result:

Closer to ideal mappings between thread caches and arenas. In order to always
get an ideal mapping, we would have to re-balance the mapping whenever a thread
dies. However, that's difficult because of deallocation.

											
										
										
											2016-03-14 17:25:43 +01:00
+								    @Override
 								    public int numThreadCaches() {
 								        return numThreadCaches.get();
 								    }
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    @Override
 								    public int numTinySubpages() {
 								        return tinySubpagePools.length;
 								    }
 								    @Override
 								    public int numSmallSubpages() {
 								        return smallSubpagePools.length;
 								    }
 								    @Override
 								    public int numChunkLists() {
 								        return chunkListMetrics.size();
 								    }
 								    @Override
 								    public List<PoolSubpageMetric> tinySubpages() {
 								        return subPageMetricList(tinySubpagePools);
 								    }
 								    @Override
 								    public List<PoolSubpageMetric> smallSubpages() {
 								        return subPageMetricList(smallSubpagePools);
 								    }
 								    @Override
 								    public List<PoolChunkListMetric> chunkLists() {
 								        return chunkListMetrics;
 								    }
 								    private static List<PoolSubpageMetric> subPageMetricList(PoolSubpage<?>[] pages) {
 								        List<PoolSubpageMetric> metrics = new ArrayList<PoolSubpageMetric>();
-												Cleanup : for loops for arrays to make code easier to read and removed unnecessary toLowerCase()

											
										
										
											2017-02-04 00:08:46 +01:00
+								        for (PoolSubpage<?> head : pages) {
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								            if (head.next == head) {
 								                continue;
 								            }
 								            PoolSubpage<?> s = head.next;
 								            for (;;) {
 								                metrics.add(s);
 								                s = s.next;
 								                if (s == head) {
 								                    break;
 								                }
 								            }
 								        }
 								        return metrics;
 								    }
 								    @Override
 								    public long numAllocations() {
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								        final long allocsNormal;
 								        synchronized (this) {
 								            allocsNormal = allocationsNormal;
 								        }
 								        return allocationsTiny.value() + allocationsSmall.value() + allocsNormal + allocationsHuge.value();
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
 								    public long numTinyAllocations() {
-												Fix race in PoolArena.allocate. Fixes #4829

Motivation:

The statistic counters PoolArena.(allocationsTiny|allocationsSmall) are
not protected by a per arena lock, but by a per size class lock. Thus,
two concurrent allocations of different size (class) could lead to a
race and ultimately to wrong statistics.

Modifications:

Use a thread-safe LongCounter instead of a plain long data type.

Result:

Fewer data races.

											
										
										
											2016-03-13 22:27:56 +01:00
+								        return allocationsTiny.value();
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
 								    public long numSmallAllocations() {
-												Fix race in PoolArena.allocate. Fixes #4829

Motivation:

The statistic counters PoolArena.(allocationsTiny|allocationsSmall) are
not protected by a per arena lock, but by a per size class lock. Thus,
two concurrent allocations of different size (class) could lead to a
race and ultimately to wrong statistics.

Modifications:

Use a thread-safe LongCounter instead of a plain long data type.

Result:

Fewer data races.

											
										
										
											2016-03-13 22:27:56 +01:00
+								        return allocationsSmall.value();
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								    public synchronized long numNormalAllocations() {
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								        return allocationsNormal;
 								    }
 								    @Override
 								    public long numDeallocations() {
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								        final long deallocs;
 								        synchronized (this) {
-												Fix calculation of PoolArena metrics after introducing a regression in 89da788fd23f3e745c799abb575a98696ebcdc2c

											
										
										
											2016-03-17 10:06:37 +01:00
+								            deallocs = deallocationsTiny + deallocationsSmall + deallocationsNormal;
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								        }
 								        return deallocs + deallocationsHuge.value();
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								    public synchronized long numTinyDeallocations() {
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								        return deallocationsTiny;
 								    }
 								    @Override
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								    public synchronized long numSmallDeallocations() {
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								        return deallocationsSmall;
 								    }
 								    @Override
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								    public synchronized long numNormalDeallocations() {
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								        return deallocationsNormal;
 								    }
 								    @Override
 								    public long numHugeAllocations() {
 								        return allocationsHuge.value();
 								    }
 								    @Override
 								    public long numHugeDeallocations() {
 								        return deallocationsHuge.value();
 								    }
 								    @Override
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								    public  long numActiveAllocations() {
-												Fix calculation of PoolArena metrics after introducing a regression in 89da788fd23f3e745c799abb575a98696ebcdc2c

											
										
										
											2016-03-17 10:06:37 +01:00
+								        long val = allocationsTiny.value() + allocationsSmall.value() + allocationsHuge.value()
 								                - deallocationsHuge.value();
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								        synchronized (this) {
-												Fix calculation of PoolArena metrics after introducing a regression in 89da788fd23f3e745c799abb575a98696ebcdc2c

											
										
										
											2016-03-17 10:06:37 +01:00
+								            val += allocationsNormal - (deallocationsTiny + deallocationsSmall + deallocationsNormal);
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								        }
-												Cleanup PoolChunk and PoolArena

Motivation:

To make it easier to understand PoolChunk and PoolArena we should cleanup duplicated code.

Modifications:

- Move reused code into methods
- Use Math.max(...)

Result:

Cleaner code and easier to understand.

											
										
										
											2016-04-08 11:56:32 +02:00
+								        return max(val, 0);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
 								    public long numActiveTinyAllocations() {
-												Cleanup PoolChunk and PoolArena

Motivation:

To make it easier to understand PoolChunk and PoolArena we should cleanup duplicated code.

Modifications:

- Move reused code into methods
- Use Math.max(...)

Result:

Cleaner code and easier to understand.

											
										
										
											2016-04-08 11:56:32 +02:00
+								        return max(numTinyAllocations() - numTinyDeallocations(), 0);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
 								    public long numActiveSmallAllocations() {
-												Cleanup PoolChunk and PoolArena

Motivation:

To make it easier to understand PoolChunk and PoolArena we should cleanup duplicated code.

Modifications:

- Move reused code into methods
- Use Math.max(...)

Result:

Cleaner code and easier to understand.

											
										
										
											2016-04-08 11:56:32 +02:00
+								        return max(numSmallAllocations() - numSmallDeallocations(), 0);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
 								    public long numActiveNormalAllocations() {
-												Add proper synchronization when access metrics.

Motivation:

We also need to add synchronization when access fields to ensure we see the latest updates.

Modifications:

Add synchronization when read fields that are written concurrently.

Result:

Ensure correct visibility of updated.

											
										
										
											2016-03-14 09:09:24 +01:00
+								        final long val;
 								        synchronized (this) {
 								            val = allocationsNormal - deallocationsNormal;
 								        }
-												Cleanup PoolChunk and PoolArena

Motivation:

To make it easier to understand PoolChunk and PoolArena we should cleanup duplicated code.

Modifications:

- Move reused code into methods
- Use Math.max(...)

Result:

Cleaner code and easier to understand.

											
										
										
											2016-04-08 11:56:32 +02:00
+								        return max(val, 0);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
 								    @Override
 								    public long numActiveHugeAllocations() {
-												Cleanup PoolChunk and PoolArena

Motivation:

To make it easier to understand PoolChunk and PoolArena we should cleanup duplicated code.

Modifications:

- Move reused code into methods
- Use Math.max(...)

Result:

Cleaner code and easier to understand.

											
										
										
											2016-04-08 11:56:32 +02:00
+								        return max(numHugeAllocations() - numHugeDeallocations(), 0);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    }
-												Allow to retrieve the number of active bytes per PoolArena.

Motivation:

To better understand how much memory is used by Netty for ByteBufs it is useful to understand how many bytes are currently active (allocated) per PoolArena.

Modifications:

- Add PoolArenaMetric.numActiveBytes()

Result:

The user is able to get better insight into the PooledByteBufAllocator.

											
										
										
											2016-04-06 15:12:37 +02:00
+								    @Override
 								    public long numActiveBytes() {
 								        long val = activeBytesHuge.value();
 								        synchronized (this) {
 								            for (int i = 0; i < chunkListMetrics.size(); i++) {
 								                for (PoolChunkMetric m: chunkListMetrics.get(i)) {
 								                    val += m.chunkSize();
 								                }
 								            }
 								        }
 								        return max(0, val);
 								    }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    protected abstract PoolChunk<T> newChunk(int pageSize, int maxOrder, int pageShifts, int chunkSize);
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								    protected abstract PoolChunk<T> newUnpooledChunk(int capacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    protected abstract PooledByteBuf<T> newByteBuf(int maxCapacity);
 								    protected abstract void memoryCopy(T src, int srcOffset, T dst, int dstOffset, int length);
 								    protected abstract void destroyChunk(PoolChunk<T> chunk);
-												Expose metrics for PooledByteBufAllocator

Motivation:

The PooledByteBufAllocator is more or less a black-box atm. We need to expose some metrics to allow the user to get a better idea how to tune it.

Modifications:

- Expose different metrics via PooledByteBufAllocator
- Add *Metrics interfaces

Result:

It is now easy to gather metrics and detail about the PooledByteBufAllocator and so get a better understanding about resource-usage etc.

											
										
										
											2015-05-13 17:15:06 +02:00
+								    @Override
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    public synchronized String toString() {
-												Small performance improvements

Motivation:

Found performance issues via FindBugs and PMD.

Modifications:

- Removed unnecessary boxing/unboxing operations in DefaultTextHeaders.convertToInt(CharSequence) and DefaultTextHeaders.convertToLong(CharSequence). A boxed primitive is created from a string, just to extract the unboxed primitive value.
- Added a static modifier for DefaultHttp2Connection.ParentChangedEvent class. This class is an inner class, but does not use its embedded reference to the object which created it. This reference makes the instances of the class larger, and may keep the reference to the creator object alive longer than necessary.
- Added a static compiled Pattern to avoid compile it each time it is used when we need to replace some part of authority.
- Improved using of StringBuilders.

Result:

Performance improvements.

											
										
										
											2014-11-08 23:46:30 +01:00
+								        StringBuilder buf = new StringBuilder()
 								            .append("Chunk(s) at 0~25%:")
 								            .append(StringUtil.NEWLINE)
 								            .append(qInit)
 								            .append(StringUtil.NEWLINE)
 								            .append("Chunk(s) at 0~50%:")
 								            .append(StringUtil.NEWLINE)
 								            .append(q000)
 								            .append(StringUtil.NEWLINE)
 								            .append("Chunk(s) at 25~75%:")
 								            .append(StringUtil.NEWLINE)
 								            .append(q025)
 								            .append(StringUtil.NEWLINE)
 								            .append("Chunk(s) at 50~100%:")
 								            .append(StringUtil.NEWLINE)
 								            .append(q050)
 								            .append(StringUtil.NEWLINE)
 								            .append("Chunk(s) at 75~100%:")
 								            .append(StringUtil.NEWLINE)
 								            .append(q075)
 								            .append(StringUtil.NEWLINE)
 								            .append("Chunk(s) at 100%:")
 								            .append(StringUtil.NEWLINE)
 								            .append(q100)
 								            .append(StringUtil.NEWLINE)
 								            .append("tiny subpages:");
-												[#5520] Correctly include all PoolSubpage metrics

Motivation:

Because of a bug we missed to include the first PoolSubpage when collection metrics.

Modifications:

- Correctly include all subpages
- Add unit test

Result:

Correctly include all subpages

											
										
										
											2016-07-10 12:54:30 +02:00
+								        appendPoolSubPages(buf, tinySubpagePools);
-												Small performance improvements

Motivation:

Found performance issues via FindBugs and PMD.

Modifications:

- Removed unnecessary boxing/unboxing operations in DefaultTextHeaders.convertToInt(CharSequence) and DefaultTextHeaders.convertToLong(CharSequence). A boxed primitive is created from a string, just to extract the unboxed primitive value.
- Added a static modifier for DefaultHttp2Connection.ParentChangedEvent class. This class is an inner class, but does not use its embedded reference to the object which created it. This reference makes the instances of the class larger, and may keep the reference to the creator object alive longer than necessary.
- Added a static compiled Pattern to avoid compile it each time it is used when we need to replace some part of authority.
- Improved using of StringBuilders.

Result:

Performance improvements.

											
										
										
											2014-11-08 23:46:30 +01:00
+								        buf.append(StringUtil.NEWLINE)
 								           .append("small subpages:");
-												[#5520] Correctly include all PoolSubpage metrics

Motivation:

Because of a bug we missed to include the first PoolSubpage when collection metrics.

Modifications:

- Correctly include all subpages
- Add unit test

Result:

Correctly include all subpages

											
										
										
											2016-07-10 12:54:30 +02:00
+								        appendPoolSubPages(buf, smallSubpagePools);
 								        buf.append(StringUtil.NEWLINE);
 								        return buf.toString();
 								    }
 								    private static void appendPoolSubPages(StringBuilder buf, PoolSubpage<?>[] subpages) {
 								        for (int i = 0; i < subpages.length; i ++) {
 								            PoolSubpage<?> head = subpages[i];
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								            if (head.next == head) {
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								                continue;
 								            }
-												Small performance improvements

Motivation:

Found performance issues via FindBugs and PMD.

Modifications:

- Removed unnecessary boxing/unboxing operations in DefaultTextHeaders.convertToInt(CharSequence) and DefaultTextHeaders.convertToLong(CharSequence). A boxed primitive is created from a string, just to extract the unboxed primitive value.
- Added a static modifier for DefaultHttp2Connection.ParentChangedEvent class. This class is an inner class, but does not use its embedded reference to the object which created it. This reference makes the instances of the class larger, and may keep the reference to the creator object alive longer than necessary.
- Added a static compiled Pattern to avoid compile it each time it is used when we need to replace some part of authority.
- Improved using of StringBuilders.

Result:

Performance improvements.

											
										
										
											2014-11-08 23:46:30 +01:00
+								            buf.append(StringUtil.NEWLINE)
-												[#5520] Correctly include all PoolSubpage metrics

Motivation:

Because of a bug we missed to include the first PoolSubpage when collection metrics.

Modifications:

- Correctly include all subpages
- Add unit test

Result:

Correctly include all subpages

											
										
										
											2016-07-10 12:54:30 +02:00
+								                    .append(i)
 								                    .append(": ");
 								            PoolSubpage<?> s = head.next;
-												Fix a bug in PoolArena and PoolSubpage where subpage pools are not updated correctly

- Make PoolSubpage a linked list node in the pool
- Now that a subpage is added to and removed from the pool correctly, allocating a subpage from the pool became vastly simpler.

											
										
										
											2013-03-05 15:55:41 +01:00
+								            for (;;) {
 								                buf.append(s);
 								                s = s.next;
 								                if (s == head) {
 								                    break;
 								                }
 								            }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								    }
-												[#5833] Ensure direct memory is released when DirectPoolArena is collected

Motivation:

We need to ensure we release all direct memory once the DirectPoolArena is collected. Otherwise we may never reclaim the memory and so leak memory.

Modifications:

Ensure we destroy all PoolChunk memory when DirectPoolArena is collected.

Result:

Free up unreleased memory when DirectPoolArena is collected.

											
										
										
											2016-09-17 04:43:51 +02:00
+								    @Override
 								    protected final void finalize() throws Throwable {
 								        try {
 								            super.finalize();
 								        } finally {
 								            destroyPoolSubPages(smallSubpagePools);
 								            destroyPoolSubPages(tinySubpagePools);
 								            destroyPoolChunkLists(qInit, q000, q025, q050, q075, q100);
 								        }
 								    }
 								    private static void destroyPoolSubPages(PoolSubpage<?>[] pages) {
 								        for (PoolSubpage<?> page : pages) {
 								            page.destroy();
 								        }
 								    }
 								    private void destroyPoolChunkLists(PoolChunkList<T>... chunkLists) {
 								        for (PoolChunkList<T> chunkList: chunkLists) {
 								            chunkList.destroy(this);
 								        }
 								    }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								    static final class HeapArena extends PoolArena<byte[]> {
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        HeapArena(PooledByteBufAllocator parent, int pageSize, int maxOrder,
 								                int pageShifts, int chunkSize, int directMemoryCacheAlignment) {
 								            super(parent, pageSize, maxOrder, pageShifts, chunkSize,
 								                    directMemoryCacheAlignment);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        @Override
 								        boolean isDirect() {
 								            return false;
 								        }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        @Override
 								        protected PoolChunk<byte[]> newChunk(int pageSize, int maxOrder, int pageShifts, int chunkSize) {
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								            return new PoolChunk<byte[]>(this, new byte[chunkSize], pageSize, maxOrder, pageShifts, chunkSize, 0);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        @Override
 								        protected PoolChunk<byte[]> newUnpooledChunk(int capacity) {
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								            return new PoolChunk<byte[]>(this, new byte[capacity], capacity, 0);
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        @Override
 								        protected void destroyChunk(PoolChunk<byte[]> chunk) {
 								            // Rely on GC.
 								        }
 								        @Override
 								        protected PooledByteBuf<byte[]> newByteBuf(int maxCapacity) {
-												Add *UnsafeHeapByteBuf for improve performance on systems with sun.misc.Unsafe

Motivation:

sun.misc.Unsafe allows us to handle heap ByteBuf in a more efficient matter. We should use special ByteBuf implementation when sun.misc.Unsafe can be used to increase performance.

Modifications:

- Add PooledUnsafeHeapByteBuf and UnpooledUnsafeHeapByteBuf that are used when sun.misc.Unsafe is ready to use.
- Add UnsafeHeapSwappedByteBuf

Result:

Better performance when using heap buffers and sun.misc.Unsafe is ready to use.

											
										
										
											2015-10-09 21:03:03 +02:00
+								            return HAS_UNSAFE ? PooledUnsafeHeapByteBuf.newUnsafeInstance(maxCapacity)
 								                    : PooledHeapByteBuf.newInstance(maxCapacity);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								        @Override
 								        protected void memoryCopy(byte[] src, int srcOffset, byte[] dst, int dstOffset, int length) {
 								            if (length == 0) {
 								                return;
 								            }
 								            System.arraycopy(src, srcOffset, dst, dstOffset, length);
 								        }
 								    }
 								    static final class DirectArena extends PoolArena<ByteBuffer> {
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        DirectArena(PooledByteBufAllocator parent, int pageSize, int maxOrder,
 								                int pageShifts, int chunkSize, int directMemoryCacheAlignment) {
 								            super(parent, pageSize, maxOrder, pageShifts, chunkSize,
 								                    directMemoryCacheAlignment);
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Implement Thread caches for pooled buffers to minimize conditions. This fixes [#2264] and [#808].

Motivation:
Remove the synchronization bottleneck in PoolArena and so speed up things

Modifications:

This implementation uses kind of the same technics as outlined in the jemalloc paper and jemalloc
blogpost https://www.facebook.com/notes/facebook-engineering/scalable-memory-allocation-using-jemalloc/480222803919.

At the moment we only cache for "known" Threads (that powers EventExecutors) and not for others to keep the overhead
minimal when need to free up unused buffers in the cache and free up cached buffers once the Thread completes. Here
we use multi-level caches for tiny, small and normal allocations. Huge allocations are not cached at all to keep the
memory usage at a sane level. All the different cache configurations can be adjusted via system properties or the constructor
directly where it makes sense.

Result:
Less conditions as most allocations can be served by the cache itself

											
										
										
											2014-03-01 15:47:03 +01:00
+								        @Override
 								        boolean isDirect() {
 								            return true;
 								        }
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        private int offsetCacheLine(ByteBuffer memory) {
-												Only try to calculate direct memory offset when sun.misc.Unsafe is present

Motivation:

We should only try to calculate the direct memory offset when sun.misc.Unsafe is present as otherwise it will fail with an NPE as PlatformDependent.directBufferAddress(...) will throw it.
This problem was introduced by 66b9be3a469a2cdcc5d18a8b94c679940ce002a9.

Modifications:

Use offset of 0 if no sun.misc.Unsafe is present.

Result:

PooledByteBufAllocator also works again when no sun.misc.Unsafe is present.

											
										
										
											2017-02-13 07:42:22 +01:00
+								            // We can only calculate the offset if Unsafe is present as otherwise directBufferAddress(...) will
 								            // throw an NPE.
 								            return HAS_UNSAFE ?
 								                    (int) (PlatformDependent.directBufferAddress(memory) & directMemoryCacheAlignmentMask) : 0;
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        @Override
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								        protected PoolChunk<ByteBuffer> newChunk(int pageSize, int maxOrder,
 								                int pageShifts, int chunkSize) {
 								            if (directMemoryCacheAlignment == 0) {
 								                return new PoolChunk<ByteBuffer>(this,
 								                        allocateDirect(chunkSize), pageSize, maxOrder,
 								                        pageShifts, chunkSize, 0);
 								            }
 								            final ByteBuffer memory = allocateDirect(chunkSize
 								                    + directMemoryCacheAlignment);
 								            return new PoolChunk<ByteBuffer>(this, memory, pageSize,
 								                    maxOrder, pageShifts, chunkSize,
 								                    offsetCacheLine(memory));
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        @Override
 								        protected PoolChunk<ByteBuffer> newUnpooledChunk(int capacity) {
-												Allow to allign allocated Buffers

Motivation:

64-byte alignment is recommended by the Intel performance guide (https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors) for data-structures over 64 bytes.
Requiring padding to a multiple of 64 bytes allows for using SIMD instructions consistently in loops without additional conditional checks. This should allow for simpler and more efficient code.

Modification:

At the moment cache alignment must be setup manually. But probably it might be taken from the system. The original code was introduced by @normanmaurer https://github.com/netty/netty/pull/4726/files

Result:

Buffer alignment works better than miss-align cache.

											
										
										
											2017-01-29 22:26:40 +01:00
+								            if (directMemoryCacheAlignment == 0) {
 								                return new PoolChunk<ByteBuffer>(this,
 								                        allocateDirect(capacity), capacity, 0);
 								            }
 								            final ByteBuffer memory = allocateDirect(capacity
 								                    + directMemoryCacheAlignment);
 								            return new PoolChunk<ByteBuffer>(this, memory, capacity,
 								                    offsetCacheLine(memory));
-												Allow to create Unsafe ByteBuf implementations that not use a Cleaner to clean the native memory.

Motivation:

Using the Cleaner to release the native memory has a few drawbacks:

- Cleaner.clean() uses static synchronized internally which means it can be a performance bottleneck
- It put more load on the GC

Modifications:

Add new buffer implementations that can be enabled with a system flag as optimizations. In this case no Cleaner is used at all and the user must ensure everything is always released.

Result:

Less performance impact by direct buffers when need to be allocated and released.

											
										
										
											2016-05-23 11:59:55 +02:00
+								        }
 								        private static ByteBuffer allocateDirect(int capacity) {
 								            return PlatformDependent.useDirectBufferNoCleaner() ?
 								                    PlatformDependent.allocateDirectNoCleaner(capacity) : ByteBuffer.allocateDirect(capacity);
-												Remove the notion of ByteBufAllocator.bufferMaxCapacity()

- Allocate the unpooled memory if the requested capacity is greater then the chunkSize
- Fixes #834

											
										
										
											2012-12-19 09:35:32 +01:00
+								        }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        @Override
 								        protected void destroyChunk(PoolChunk<ByteBuffer> chunk) {
-												Allow to create Unsafe ByteBuf implementations that not use a Cleaner to clean the native memory.

Motivation:

Using the Cleaner to release the native memory has a few drawbacks:

- Cleaner.clean() uses static synchronized internally which means it can be a performance bottleneck
- It put more load on the GC

Modifications:

Add new buffer implementations that can be enabled with a system flag as optimizations. In this case no Cleaner is used at all and the user must ensure everything is always released.

Result:

Less performance impact by direct buffers when need to be allocated and released.

											
										
										
											2016-05-23 11:59:55 +02:00
+								            if (PlatformDependent.useDirectBufferNoCleaner()) {
 								                PlatformDependent.freeDirectNoCleaner(chunk.memory);
 								            } else {
 								                PlatformDependent.freeDirectBuffer(chunk.memory);
 								            }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								        @Override
 								        protected PooledByteBuf<ByteBuffer> newByteBuf(int maxCapacity) {
-												Make hasUnsafe() return true only when all necessary low level operations are available for reliable direct buffer access

											
										
										
											2013-03-05 07:25:25 +01:00
+								            if (HAS_UNSAFE) {
-												Recycle PooledByteBuf partially

- Related issue: #1397
- Resource leak detection should be turned off and the maxCapacity has to be Integer.MAX_VALUE
- It's technically possible to pool PooledByteBufs with different maxCapacity, which will be addressed in another commit.

											
										
										
											2013-06-10 12:52:56 +02:00
+								                return PooledUnsafeDirectByteBuf.newInstance(maxCapacity);
-												Use sun.misc.Unsafe to access a direct buffer if possible

											
										
										
											2013-01-10 10:27:16 +01:00
+								            } else {
-												Recycle PooledByteBuf partially

- Related issue: #1397
- Resource leak detection should be turned off and the maxCapacity has to be Integer.MAX_VALUE
- It's technically possible to pool PooledByteBufs with different maxCapacity, which will be addressed in another commit.

											
										
										
											2013-06-10 12:52:56 +02:00
+								                return PooledDirectByteBuf.newInstance(maxCapacity);
-												Use sun.misc.Unsafe to access a direct buffer if possible

											
										
										
											2013-01-10 10:27:16 +01:00
+								            }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								        @Override
 								        protected void memoryCopy(ByteBuffer src, int srcOffset, ByteBuffer dst, int dstOffset, int length) {
 								            if (length == 0) {
 								                return;
 								            }
-												Make hasUnsafe() return true only when all necessary low level operations are available for reliable direct buffer access

											
										
										
											2013-03-05 07:25:25 +01:00
+								            if (HAS_UNSAFE) {
-												Use sun.misc.Unsafe to access a direct ByteBuffer

- Add PooledUnsafeDirectByteBuf, a variant of PooledDirectByteBuf, which
  accesses its underlying direct ByteBuffer using sun.misc.Unsafe.
- To decouple Netty from sun.misc.*, sun.misc.Unsafe is accessed via
  PlatformDependent.
- This change solely introduces about 8+% improvement in direct memory
  access according to the tests conducted as described in #918

											
										
										
											2013-01-10 10:27:16 +01:00
+								                PlatformDependent.copyMemory(
 								                        PlatformDependent.directBufferAddress(src) + srcOffset,
 								                        PlatformDependent.directBufferAddress(dst) + dstOffset, length);
 								            } else {
 								                // We must duplicate the NIO buffers because they may be accessed by other Netty buffers.
 								                src = src.duplicate();
 								                dst = dst.duplicate();
 								                src.position(srcOffset).limit(srcOffset + length);
 								                dst.position(dstOffset);
 								                dst.put(src);
 								            }
-												Add PooledByteBufAllocator + microbenchmark module

This pull request introduces the new default ByteBufAllocator implementation based on jemalloc, with a some differences:

* Minimum possible buffer capacity is 16 (jemalloc: 2)
* Uses binary heap with random branching (jemalloc: red-black tree)
* No thread-local cache yet (jemalloc has thread-local cache)
* Default page size is 8 KiB (jemalloc: 4 KiB)
* Default chunk size is 16 MiB (jemalloc: 2 MiB)
* Cannot allocate a buffer bigger than the chunk size (jemalloc: possible) because we don't have control over memory layout in Java. A user can work around this issue by creating a composite buffer, but it's not always a feasible option. Although 16 MiB is a pretty big default, a user's handler might need to deal with the bounded buffers when the user wants to deal with a large message.

Also, to ensure the new allocator performs good enough, I wrote a microbenchmark for it and made it a dedicated Maven module. It uses Google's Caliper framework to run and publish the test result (example)

Miscellaneous changes:

* Made some ByteBuf implementations public so that those who implements a new allocator can make use of them.
* Added ByteBufAllocator.compositeBuffer() and its variants.
* ByteBufAllocator.ioBuffer() creates a buffer with 0 capacity.

											
										
										
											2012-12-05 22:11:48 +01:00
+								        }
 								    }
 								}