From 35161ad174937273694e7de921f9c234a708ab55 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 28 Feb 2019 11:40:42 -0800 Subject: [PATCH] Further reduce ensureAccessible() overhead (#8895) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivation: This PR fixes some non-negligible overhead discovered in the ByteBuf accessibility (non-zero refcount) checking. The cause turned out to be mostly twofold: - Unnecessary operations used to calculate the refcount from the "raw" encoded int field value - Call stack depths exceeding the default limit for inlining, in some places (CompositeByteBuf in particular) It's a follow-on from #8882 which uses the maxCapacity field for a simpler non-negative check. The performance gap between these two variants appears to be _mostly_ closed, but there's one exception which may warrant further analysis. Modifications: - Replace ABB.internalRefCount() with ByteBuf.isAccessible(), the default still checks for non-zero refCnt() - Just test for parity of raw refCnt instead of converting to "real", with fast-path for specific small values - Make sure isAccessible() is delegated by derived/wrapper ByteBufs - Use existing freed flag in CompositeByteBuf for faster isAccessible() - Manually inline some calls in methods like CompositeByteBuf.setLong() and AbstractReferenceCountedByteBuf.isAccessible() to reduce stack depths (to ensure default inlining limit isn't hit) - Add ByteBufAccessBenchmark which is an extension of UnsafeByteBufBenchmark (maybe latter could now be removed) Results: Before: Benchmark (bufferType) (checkAccessible) (checkBounds) Mode Cnt Score Error Units readBatch UNSAFE true true thrpt 30 84524972.863 ± 518338.811 ops/s readBatch UNSAFE_SLICE true true thrpt 30 38608795.037 ± 298176.974 ops/s readBatch HEAP true true thrpt 30 80003697.649 ± 974674.119 ops/s readBatch COMPOSITE true true thrpt 30 18495554.788 ± 108075.023 ops/s setGetLong UNSAFE true true thrpt 30 247069881.578 ± 10839162.593 ops/s setGetLong UNSAFE_SLICE true true thrpt 30 196355905.206 ± 1802420.990 ops/s setGetLong HEAP true true thrpt 30 245686644.713 ± 11769311.527 ops/s setGetLong COMPOSITE true true thrpt 30 83170940.687 ± 657524.123 ops/s setLong UNSAFE true true thrpt 30 278940253.918 ± 1807265.259 ops/s setLong UNSAFE_SLICE true true thrpt 30 202556738.764 ± 11887973.563 ops/s setLong HEAP true true thrpt 30 280045958.053 ± 2719583.400 ops/s setLong COMPOSITE true true thrpt 30 121299806.002 ± 2155084.707 ops/s After: Benchmark (bufferType) (checkAccessible) (checkBounds) Mode Cnt Score Error Units readBatch UNSAFE true true thrpt 30 101641801.035 ± 3950050.059 ops/s readBatch UNSAFE_SLICE true true thrpt 30 84395902.846 ± 4339579.057 ops/s readBatch HEAP true true thrpt 30 100179060.207 ± 3222487.287 ops/s readBatch COMPOSITE true true thrpt 30 42288494.472 ± 294919.633 ops/s setGetLong UNSAFE true true thrpt 30 304530755.027 ± 6574163.899 ops/s setGetLong UNSAFE_SLICE true true thrpt 30 212028547.645 ± 14277828.768 ops/s setGetLong HEAP true true thrpt 30 309335422.609 ± 2272150.415 ops/s setGetLong COMPOSITE true true thrpt 30 160383609.236 ± 966484.033 ops/s setLong UNSAFE true true thrpt 30 298055969.747 ± 7437449.627 ops/s setLong UNSAFE_SLICE true true thrpt 30 223784178.650 ± 9869750.095 ops/s setLong HEAP true true thrpt 30 302543263.328 ± 8140104.706 ops/s setLong COMPOSITE true true thrpt 30 157083673.285 ± 3528779.522 ops/s There's also a similar knock-on improvement to other benchmarks (e.g. HPACK encoding/decoding) as shown in #8882. For sanity I did a final comparison of the "fast path" tweak using one of the HPACK benchmarks: (rawCnt & 1) == 0: Benchmark (limitToAscii) (sensitive) (size) Mode Cnt Score Error Units HpackDecoderBenchmark.decode true true MEDIUM thrpt 30 50914.479 ± 940.114 ops/s rawCnt == 2 || rawCnt == 4 || rawCnt == 6 || rawCnt == 8 || (rawCnt & 1) == 0: Benchmark (limitToAscii) (sensitive) (size) Mode Cnt Score Error Units HpackDecoderBenchmark.decode true true MEDIUM thrpt 30 60036.425 ± 1478.196 ops/s --- .../java/io/netty/buffer/AbstractByteBuf.java | 10 +- .../netty/buffer/AbstractDerivedByteBuf.java | 5 + .../AbstractReferenceCountedByteBuf.java | 13 +- .../main/java/io/netty/buffer/ByteBuf.java | 8 + .../io/netty/buffer/CompositeByteBuf.java | 22 ++- .../java/io/netty/buffer/SwappedByteBuf.java | 5 + .../java/io/netty/buffer/WrappedByteBuf.java | 5 + .../netty/buffer/WrappedCompositeByteBuf.java | 4 +- .../netty/buffer/ByteBufAccessBenchmark.java | 157 ++++++++++++++++++ 9 files changed, 209 insertions(+), 20 deletions(-) create mode 100644 microbench/src/main/java/io/netty/buffer/ByteBufAccessBenchmark.java diff --git a/buffer/src/main/java/io/netty/buffer/AbstractByteBuf.java b/buffer/src/main/java/io/netty/buffer/AbstractByteBuf.java index 68258fcbe4..eceef5072e 100644 --- a/buffer/src/main/java/io/netty/buffer/AbstractByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/AbstractByteBuf.java @@ -1383,19 +1383,11 @@ public abstract class AbstractByteBuf extends ByteBuf { * if the buffer was released before. */ protected final void ensureAccessible() { - if (checkAccessible && internalRefCnt() == 0) { + if (checkAccessible && !isAccessible()) { throw new IllegalReferenceCountException(0); } } - /** - * Returns the reference count that is used internally by {@link #ensureAccessible()} to try to guard - * against using the buffer after it was released (best-effort). - */ - int internalRefCnt() { - return refCnt(); - } - final void setIndex0(int readerIndex, int writerIndex) { this.readerIndex = readerIndex; this.writerIndex = writerIndex; diff --git a/buffer/src/main/java/io/netty/buffer/AbstractDerivedByteBuf.java b/buffer/src/main/java/io/netty/buffer/AbstractDerivedByteBuf.java index 58f1d907a1..40844020dd 100644 --- a/buffer/src/main/java/io/netty/buffer/AbstractDerivedByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/AbstractDerivedByteBuf.java @@ -31,6 +31,11 @@ public abstract class AbstractDerivedByteBuf extends AbstractByteBuf { super(maxCapacity); } + @Override + final boolean isAccessible() { + return unwrap().isAccessible(); + } + @Override public final int refCnt() { return refCnt0(); diff --git a/buffer/src/main/java/io/netty/buffer/AbstractReferenceCountedByteBuf.java b/buffer/src/main/java/io/netty/buffer/AbstractReferenceCountedByteBuf.java index ba26b1921a..548bdba5e4 100644 --- a/buffer/src/main/java/io/netty/buffer/AbstractReferenceCountedByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/AbstractReferenceCountedByteBuf.java @@ -64,10 +64,19 @@ public abstract class AbstractReferenceCountedByteBuf extends AbstractByteBuf { } @Override - int internalRefCnt() { + boolean isAccessible() { // Try to do non-volatile read for performance as the ensureAccessible() is racy anyway and only provide // a best-effort guard. - return realRefCnt(nonVolatileRawCnt()); + + // This is copied explicitly from the nonVolatileRawCnt() method above to reduce call stack depth, + // to avoid hitting the default limit for inlining (9) + final int rawCnt = REFCNT_FIELD_OFFSET != -1 ? PlatformDependent.getInt(this, REFCNT_FIELD_OFFSET) + : refCntUpdater.get(this); + + // The "real" ref count is > 0 if the rawCnt is even. + // (x & y) appears to be surprisingly expensive relative to (x == y). Thus the expression below provides + // a fast path for most common cases where the ref count is 1, 2, 3 or 4. + return rawCnt == 2 || rawCnt == 4 || rawCnt == 6 || rawCnt == 8 || (rawCnt & 1) == 0; } @Override diff --git a/buffer/src/main/java/io/netty/buffer/ByteBuf.java b/buffer/src/main/java/io/netty/buffer/ByteBuf.java index 7889c516d0..993761f999 100644 --- a/buffer/src/main/java/io/netty/buffer/ByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/ByteBuf.java @@ -2400,4 +2400,12 @@ public abstract class ByteBuf implements ReferenceCounted, Comparable { @Override public abstract ByteBuf touch(Object hint); + + /** + * Used internally by {@link AbstractByteBuf#ensureAccessible()} to try to guard + * against using the buffer after it was released (best-effort). + */ + boolean isAccessible() { + return refCnt() != 0; + } } diff --git a/buffer/src/main/java/io/netty/buffer/CompositeByteBuf.java b/buffer/src/main/java/io/netty/buffer/CompositeByteBuf.java index c49cb404fb..d7a1c6acd8 100644 --- a/buffer/src/main/java/io/netty/buffer/CompositeByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/CompositeByteBuf.java @@ -299,7 +299,7 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements @SuppressWarnings("deprecation") private Component newComponent(ByteBuf buf, int offset) { - if (checkAccessible && buf.refCnt() == 0) { + if (checkAccessible && !buf.isAccessible()) { throw new IllegalReferenceCountException(0); } int srcIndex = buf.readerIndex(), len = buf.readableBytes(); @@ -1073,7 +1073,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements @Override public CompositeByteBuf setShort(int index, int value) { - super.setShort(index, value); + checkIndex(index, 2); + _setShort(index, value); return this; } @@ -1107,7 +1108,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements @Override public CompositeByteBuf setMedium(int index, int value) { - super.setMedium(index, value); + checkIndex(index, 3); + _setMedium(index, value); return this; } @@ -1141,7 +1143,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements @Override public CompositeByteBuf setInt(int index, int value) { - super.setInt(index, value); + checkIndex(index, 4); + _setInt(index, value); return this; } @@ -1175,7 +1178,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements @Override public CompositeByteBuf setLong(int index, long value) { - super.setLong(index, value); + checkIndex(index, 8); + _setLong(index, value); return this; } @@ -1283,7 +1287,6 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements int i = toComponentIndex0(index); int readBytes = 0; - do { Component c = components[i]; int localLength = Math.min(length, c.endOffset - index); @@ -2023,7 +2026,7 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements @Override public CompositeByteBuf writeBytes(byte[] src) { - writeBytes(src, 0, src.length); + super.writeBytes(src, 0, src.length); return this; } @@ -2091,6 +2094,11 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements } } + @Override + boolean isAccessible() { + return !freed; + } + @Override public ByteBuf unwrap() { return null; diff --git a/buffer/src/main/java/io/netty/buffer/SwappedByteBuf.java b/buffer/src/main/java/io/netty/buffer/SwappedByteBuf.java index df740075a0..1da1f12055 100644 --- a/buffer/src/main/java/io/netty/buffer/SwappedByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/SwappedByteBuf.java @@ -971,6 +971,11 @@ public class SwappedByteBuf extends ByteBuf { return buf.refCnt(); } + @Override + final boolean isAccessible() { + return buf.isAccessible(); + } + @Override public ByteBuf retain() { buf.retain(); diff --git a/buffer/src/main/java/io/netty/buffer/WrappedByteBuf.java b/buffer/src/main/java/io/netty/buffer/WrappedByteBuf.java index 6efe49dfe3..29d5919b66 100644 --- a/buffer/src/main/java/io/netty/buffer/WrappedByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/WrappedByteBuf.java @@ -1009,4 +1009,9 @@ class WrappedByteBuf extends ByteBuf { public boolean release(int decrement) { return buf.release(decrement); } + + @Override + final boolean isAccessible() { + return buf.isAccessible(); + } } diff --git a/buffer/src/main/java/io/netty/buffer/WrappedCompositeByteBuf.java b/buffer/src/main/java/io/netty/buffer/WrappedCompositeByteBuf.java index 62b0368760..9550030c48 100644 --- a/buffer/src/main/java/io/netty/buffer/WrappedCompositeByteBuf.java +++ b/buffer/src/main/java/io/netty/buffer/WrappedCompositeByteBuf.java @@ -424,8 +424,8 @@ class WrappedCompositeByteBuf extends CompositeByteBuf { } @Override - int internalRefCnt() { - return wrapped.internalRefCnt(); + final boolean isAccessible() { + return wrapped.isAccessible(); } @Override diff --git a/microbench/src/main/java/io/netty/buffer/ByteBufAccessBenchmark.java b/microbench/src/main/java/io/netty/buffer/ByteBufAccessBenchmark.java new file mode 100644 index 0000000000..c7cad7ba07 --- /dev/null +++ b/microbench/src/main/java/io/netty/buffer/ByteBufAccessBenchmark.java @@ -0,0 +1,157 @@ +/* +* Copyright 2019 The Netty Project +* +* The Netty Project licenses this file to you under the Apache License, +* version 2.0 (the "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at: +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. +*/ +package io.netty.buffer; + +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Warmup; + +import io.netty.microbench.util.AbstractMicrobenchmark; +import io.netty.util.internal.PlatformDependent; + +@Warmup(iterations = 5, time = 1500, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 10, time = 1500, timeUnit = TimeUnit.MILLISECONDS) +@Fork(3) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +public class ByteBufAccessBenchmark extends AbstractMicrobenchmark { + + static final class NioFacade extends WrappedByteBuf { + private final ByteBuffer byteBuffer; + NioFacade(ByteBuffer byteBuffer) { + super(Unpooled.EMPTY_BUFFER); + this.byteBuffer = byteBuffer; + } + @Override + public ByteBuf setLong(int index, long value) { + byteBuffer.putLong(index, value); + return this; + } + @Override + public long getLong(int index) { + return byteBuffer.getLong(index); + } + @Override + public byte readByte() { + return byteBuffer.get(); + } + @Override + public ByteBuf touch() { + // hack since WrappedByteBuf.readerIndex(int) is final + byteBuffer.position(0); + return this; + } + @Override + public boolean release() { + PlatformDependent.freeDirectBuffer(byteBuffer); + return true; + } + } + + public enum ByteBufType { + UNSAFE { + @Override + ByteBuf newBuffer() { + return new UnpooledUnsafeDirectByteBuf( + UnpooledByteBufAllocator.DEFAULT, 64, 64).setIndex(0, 64); + } + }, + UNSAFE_SLICE { + @Override + ByteBuf newBuffer() { + return UNSAFE.newBuffer().slice(16, 48); + } + }, + HEAP { + @Override + ByteBuf newBuffer() { + return new UnpooledUnsafeHeapByteBuf( + UnpooledByteBufAllocator.DEFAULT, 64, 64).setIndex(0, 64); + } + }, + COMPOSITE { + @Override + ByteBuf newBuffer() { + return Unpooled.wrappedBuffer(UNSAFE.newBuffer(), HEAP.newBuffer()); + } + }, + NIO { + @Override + ByteBuf newBuffer() { + return new NioFacade(ByteBuffer.allocateDirect(64)); + } + }; + abstract ByteBuf newBuffer(); + } + + @Param + public ByteBufType bufferType; + + @Param({ "true", "false" }) + public String checkAccessible; + + @Param({ "true", "false" }) + public String checkBounds; + + @Param({ "8" }) + public int batchSize; // applies only to readBatch benchmark + + @Setup + public void setup() { + System.setProperty("io.netty.buffer.checkAccessible", checkAccessible); + System.setProperty("io.netty.buffer.checkBounds", checkBounds); + buffer = bufferType.newBuffer(); + } + + private ByteBuf buffer; + + @TearDown + public void tearDown() { + buffer.release(); + System.clearProperty("io.netty.buffer.checkAccessible"); + System.clearProperty("io.netty.buffer.checkBounds"); + } + + @Benchmark + public long setGetLong() { + return buffer.setLong(0, 1).getLong(0); + } + + @Benchmark + public ByteBuf setLong() { + return buffer.setLong(0, 1); + } + + @Benchmark + public int readBatch() { + buffer.readerIndex(0).touch(); + int result = 0; + for (int i = 0, size = batchSize; i < size; i++) { + result += buffer.readByte(); + } + return result; + } +}