Further reduce ensureAccessible() overhead (#8895)
Motivation: This PR fixes some non-negligible overhead discovered in the ByteBuf accessibility (non-zero refcount) checking. The cause turned out to be mostly twofold: - Unnecessary operations used to calculate the refcount from the "raw" encoded int field value - Call stack depths exceeding the default limit for inlining, in some places (CompositeByteBuf in particular) It's a follow-on from #8882 which uses the maxCapacity field for a simpler non-negative check. The performance gap between these two variants appears to be _mostly_ closed, but there's one exception which may warrant further analysis. Modifications: - Replace ABB.internalRefCount() with ByteBuf.isAccessible(), the default still checks for non-zero refCnt() - Just test for parity of raw refCnt instead of converting to "real", with fast-path for specific small values - Make sure isAccessible() is delegated by derived/wrapper ByteBufs - Use existing freed flag in CompositeByteBuf for faster isAccessible() - Manually inline some calls in methods like CompositeByteBuf.setLong() and AbstractReferenceCountedByteBuf.isAccessible() to reduce stack depths (to ensure default inlining limit isn't hit) - Add ByteBufAccessBenchmark which is an extension of UnsafeByteBufBenchmark (maybe latter could now be removed) Results: Before: Benchmark (bufferType) (checkAccessible) (checkBounds) Mode Cnt Score Error Units readBatch UNSAFE true true thrpt 30 84524972.863 ± 518338.811 ops/s readBatch UNSAFE_SLICE true true thrpt 30 38608795.037 ± 298176.974 ops/s readBatch HEAP true true thrpt 30 80003697.649 ± 974674.119 ops/s readBatch COMPOSITE true true thrpt 30 18495554.788 ± 108075.023 ops/s setGetLong UNSAFE true true thrpt 30 247069881.578 ± 10839162.593 ops/s setGetLong UNSAFE_SLICE true true thrpt 30 196355905.206 ± 1802420.990 ops/s setGetLong HEAP true true thrpt 30 245686644.713 ± 11769311.527 ops/s setGetLong COMPOSITE true true thrpt 30 83170940.687 ± 657524.123 ops/s setLong UNSAFE true true thrpt 30 278940253.918 ± 1807265.259 ops/s setLong UNSAFE_SLICE true true thrpt 30 202556738.764 ± 11887973.563 ops/s setLong HEAP true true thrpt 30 280045958.053 ± 2719583.400 ops/s setLong COMPOSITE true true thrpt 30 121299806.002 ± 2155084.707 ops/s After: Benchmark (bufferType) (checkAccessible) (checkBounds) Mode Cnt Score Error Units readBatch UNSAFE true true thrpt 30 101641801.035 ± 3950050.059 ops/s readBatch UNSAFE_SLICE true true thrpt 30 84395902.846 ± 4339579.057 ops/s readBatch HEAP true true thrpt 30 100179060.207 ± 3222487.287 ops/s readBatch COMPOSITE true true thrpt 30 42288494.472 ± 294919.633 ops/s setGetLong UNSAFE true true thrpt 30 304530755.027 ± 6574163.899 ops/s setGetLong UNSAFE_SLICE true true thrpt 30 212028547.645 ± 14277828.768 ops/s setGetLong HEAP true true thrpt 30 309335422.609 ± 2272150.415 ops/s setGetLong COMPOSITE true true thrpt 30 160383609.236 ± 966484.033 ops/s setLong UNSAFE true true thrpt 30 298055969.747 ± 7437449.627 ops/s setLong UNSAFE_SLICE true true thrpt 30 223784178.650 ± 9869750.095 ops/s setLong HEAP true true thrpt 30 302543263.328 ± 8140104.706 ops/s setLong COMPOSITE true true thrpt 30 157083673.285 ± 3528779.522 ops/s There's also a similar knock-on improvement to other benchmarks (e.g. HPACK encoding/decoding) as shown in #8882. For sanity I did a final comparison of the "fast path" tweak using one of the HPACK benchmarks: (rawCnt & 1) == 0: Benchmark (limitToAscii) (sensitive) (size) Mode Cnt Score Error Units HpackDecoderBenchmark.decode true true MEDIUM thrpt 30 50914.479 ± 940.114 ops/s rawCnt == 2 || rawCnt == 4 || rawCnt == 6 || rawCnt == 8 || (rawCnt & 1) == 0: Benchmark (limitToAscii) (sensitive) (size) Mode Cnt Score Error Units HpackDecoderBenchmark.decode true true MEDIUM thrpt 30 60036.425 ± 1478.196 ops/s
This commit is contained in:
parent
625c4e8286
commit
0811409ca3
@ -1437,19 +1437,11 @@ public abstract class AbstractByteBuf extends ByteBuf {
|
|||||||
* if the buffer was released before.
|
* if the buffer was released before.
|
||||||
*/
|
*/
|
||||||
protected final void ensureAccessible() {
|
protected final void ensureAccessible() {
|
||||||
if (checkAccessible && internalRefCnt() == 0) {
|
if (checkAccessible && !isAccessible()) {
|
||||||
throw new IllegalReferenceCountException(0);
|
throw new IllegalReferenceCountException(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the reference count that is used internally by {@link #ensureAccessible()} to try to guard
|
|
||||||
* against using the buffer after it was released (best-effort).
|
|
||||||
*/
|
|
||||||
int internalRefCnt() {
|
|
||||||
return refCnt();
|
|
||||||
}
|
|
||||||
|
|
||||||
final void setIndex0(int readerIndex, int writerIndex) {
|
final void setIndex0(int readerIndex, int writerIndex) {
|
||||||
this.readerIndex = readerIndex;
|
this.readerIndex = readerIndex;
|
||||||
this.writerIndex = writerIndex;
|
this.writerIndex = writerIndex;
|
||||||
|
@ -31,6 +31,11 @@ public abstract class AbstractDerivedByteBuf extends AbstractByteBuf {
|
|||||||
super(maxCapacity);
|
super(maxCapacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
final boolean isAccessible() {
|
||||||
|
return unwrap().isAccessible();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final int refCnt() {
|
public final int refCnt() {
|
||||||
return refCnt0();
|
return refCnt0();
|
||||||
|
@ -64,10 +64,19 @@ public abstract class AbstractReferenceCountedByteBuf extends AbstractByteBuf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
int internalRefCnt() {
|
boolean isAccessible() {
|
||||||
// Try to do non-volatile read for performance as the ensureAccessible() is racy anyway and only provide
|
// Try to do non-volatile read for performance as the ensureAccessible() is racy anyway and only provide
|
||||||
// a best-effort guard.
|
// a best-effort guard.
|
||||||
return realRefCnt(nonVolatileRawCnt());
|
|
||||||
|
// This is copied explicitly from the nonVolatileRawCnt() method above to reduce call stack depth,
|
||||||
|
// to avoid hitting the default limit for inlining (9)
|
||||||
|
final int rawCnt = REFCNT_FIELD_OFFSET != -1 ? PlatformDependent.getInt(this, REFCNT_FIELD_OFFSET)
|
||||||
|
: refCntUpdater.get(this);
|
||||||
|
|
||||||
|
// The "real" ref count is > 0 if the rawCnt is even.
|
||||||
|
// (x & y) appears to be surprisingly expensive relative to (x == y). Thus the expression below provides
|
||||||
|
// a fast path for most common cases where the ref count is 1, 2, 3 or 4.
|
||||||
|
return rawCnt == 2 || rawCnt == 4 || rawCnt == 6 || rawCnt == 8 || (rawCnt & 1) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -2445,4 +2445,12 @@ public abstract class ByteBuf implements ReferenceCounted, Comparable<ByteBuf> {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public abstract ByteBuf touch(Object hint);
|
public abstract ByteBuf touch(Object hint);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used internally by {@link AbstractByteBuf#ensureAccessible()} to try to guard
|
||||||
|
* against using the buffer after it was released (best-effort).
|
||||||
|
*/
|
||||||
|
boolean isAccessible() {
|
||||||
|
return refCnt() != 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -300,7 +300,7 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
@SuppressWarnings("deprecation")
|
@SuppressWarnings("deprecation")
|
||||||
private Component newComponent(ByteBuf buf, int offset) {
|
private Component newComponent(ByteBuf buf, int offset) {
|
||||||
if (checkAccessible && buf.refCnt() == 0) {
|
if (checkAccessible && !buf.isAccessible()) {
|
||||||
throw new IllegalReferenceCountException(0);
|
throw new IllegalReferenceCountException(0);
|
||||||
}
|
}
|
||||||
int srcIndex = buf.readerIndex(), len = buf.readableBytes();
|
int srcIndex = buf.readerIndex(), len = buf.readableBytes();
|
||||||
@ -1074,7 +1074,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompositeByteBuf setShort(int index, int value) {
|
public CompositeByteBuf setShort(int index, int value) {
|
||||||
super.setShort(index, value);
|
checkIndex(index, 2);
|
||||||
|
_setShort(index, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1108,7 +1109,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompositeByteBuf setMedium(int index, int value) {
|
public CompositeByteBuf setMedium(int index, int value) {
|
||||||
super.setMedium(index, value);
|
checkIndex(index, 3);
|
||||||
|
_setMedium(index, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1142,7 +1144,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompositeByteBuf setInt(int index, int value) {
|
public CompositeByteBuf setInt(int index, int value) {
|
||||||
super.setInt(index, value);
|
checkIndex(index, 4);
|
||||||
|
_setInt(index, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1176,7 +1179,8 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompositeByteBuf setLong(int index, long value) {
|
public CompositeByteBuf setLong(int index, long value) {
|
||||||
super.setLong(index, value);
|
checkIndex(index, 8);
|
||||||
|
_setLong(index, value);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1284,7 +1288,6 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
int i = toComponentIndex0(index);
|
int i = toComponentIndex0(index);
|
||||||
int readBytes = 0;
|
int readBytes = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
Component c = components[i];
|
Component c = components[i];
|
||||||
int localLength = Math.min(length, c.endOffset - index);
|
int localLength = Math.min(length, c.endOffset - index);
|
||||||
@ -2052,7 +2055,7 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompositeByteBuf writeBytes(byte[] src) {
|
public CompositeByteBuf writeBytes(byte[] src) {
|
||||||
writeBytes(src, 0, src.length);
|
super.writeBytes(src, 0, src.length);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2120,6 +2123,11 @@ public class CompositeByteBuf extends AbstractReferenceCountedByteBuf implements
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
boolean isAccessible() {
|
||||||
|
return !freed;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ByteBuf unwrap() {
|
public ByteBuf unwrap() {
|
||||||
return null;
|
return null;
|
||||||
|
@ -997,6 +997,11 @@ public class SwappedByteBuf extends ByteBuf {
|
|||||||
return buf.refCnt();
|
return buf.refCnt();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
final boolean isAccessible() {
|
||||||
|
return buf.isAccessible();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ByteBuf retain() {
|
public ByteBuf retain() {
|
||||||
buf.retain();
|
buf.retain();
|
||||||
|
@ -1033,4 +1033,9 @@ class WrappedByteBuf extends ByteBuf {
|
|||||||
public boolean release(int decrement) {
|
public boolean release(int decrement) {
|
||||||
return buf.release(decrement);
|
return buf.release(decrement);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
final boolean isAccessible() {
|
||||||
|
return buf.isAccessible();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -424,8 +424,8 @@ class WrappedCompositeByteBuf extends CompositeByteBuf {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
int internalRefCnt() {
|
final boolean isAccessible() {
|
||||||
return wrapped.internalRefCnt();
|
return wrapped.isAccessible();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -0,0 +1,157 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2019 The Netty Project
|
||||||
|
*
|
||||||
|
* The Netty Project licenses this file to you under the Apache License,
|
||||||
|
* version 2.0 (the "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at:
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||||
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||||
|
* License for the specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package io.netty.buffer;
|
||||||
|
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import org.openjdk.jmh.annotations.Benchmark;
|
||||||
|
import org.openjdk.jmh.annotations.BenchmarkMode;
|
||||||
|
import org.openjdk.jmh.annotations.Fork;
|
||||||
|
import org.openjdk.jmh.annotations.Measurement;
|
||||||
|
import org.openjdk.jmh.annotations.Mode;
|
||||||
|
import org.openjdk.jmh.annotations.OutputTimeUnit;
|
||||||
|
import org.openjdk.jmh.annotations.Param;
|
||||||
|
import org.openjdk.jmh.annotations.Setup;
|
||||||
|
import org.openjdk.jmh.annotations.TearDown;
|
||||||
|
import org.openjdk.jmh.annotations.Warmup;
|
||||||
|
|
||||||
|
import io.netty.microbench.util.AbstractMicrobenchmark;
|
||||||
|
import io.netty.util.internal.PlatformDependent;
|
||||||
|
|
||||||
|
@Warmup(iterations = 5, time = 1500, timeUnit = TimeUnit.MILLISECONDS)
|
||||||
|
@Measurement(iterations = 10, time = 1500, timeUnit = TimeUnit.MILLISECONDS)
|
||||||
|
@Fork(3)
|
||||||
|
@BenchmarkMode(Mode.AverageTime)
|
||||||
|
@OutputTimeUnit(TimeUnit.NANOSECONDS)
|
||||||
|
public class ByteBufAccessBenchmark extends AbstractMicrobenchmark {
|
||||||
|
|
||||||
|
static final class NioFacade extends WrappedByteBuf {
|
||||||
|
private final ByteBuffer byteBuffer;
|
||||||
|
NioFacade(ByteBuffer byteBuffer) {
|
||||||
|
super(Unpooled.EMPTY_BUFFER);
|
||||||
|
this.byteBuffer = byteBuffer;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public ByteBuf setLong(int index, long value) {
|
||||||
|
byteBuffer.putLong(index, value);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public long getLong(int index) {
|
||||||
|
return byteBuffer.getLong(index);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public byte readByte() {
|
||||||
|
return byteBuffer.get();
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public ByteBuf touch() {
|
||||||
|
// hack since WrappedByteBuf.readerIndex(int) is final
|
||||||
|
byteBuffer.position(0);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public boolean release() {
|
||||||
|
PlatformDependent.freeDirectBuffer(byteBuffer);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public enum ByteBufType {
|
||||||
|
UNSAFE {
|
||||||
|
@Override
|
||||||
|
ByteBuf newBuffer() {
|
||||||
|
return new UnpooledUnsafeDirectByteBuf(
|
||||||
|
UnpooledByteBufAllocator.DEFAULT, 64, 64).setIndex(0, 64);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
UNSAFE_SLICE {
|
||||||
|
@Override
|
||||||
|
ByteBuf newBuffer() {
|
||||||
|
return UNSAFE.newBuffer().slice(16, 48);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
HEAP {
|
||||||
|
@Override
|
||||||
|
ByteBuf newBuffer() {
|
||||||
|
return new UnpooledUnsafeHeapByteBuf(
|
||||||
|
UnpooledByteBufAllocator.DEFAULT, 64, 64).setIndex(0, 64);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
COMPOSITE {
|
||||||
|
@Override
|
||||||
|
ByteBuf newBuffer() {
|
||||||
|
return Unpooled.wrappedBuffer(UNSAFE.newBuffer(), HEAP.newBuffer());
|
||||||
|
}
|
||||||
|
},
|
||||||
|
NIO {
|
||||||
|
@Override
|
||||||
|
ByteBuf newBuffer() {
|
||||||
|
return new NioFacade(ByteBuffer.allocateDirect(64));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
abstract ByteBuf newBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Param
|
||||||
|
public ByteBufType bufferType;
|
||||||
|
|
||||||
|
@Param({ "true", "false" })
|
||||||
|
public String checkAccessible;
|
||||||
|
|
||||||
|
@Param({ "true", "false" })
|
||||||
|
public String checkBounds;
|
||||||
|
|
||||||
|
@Param({ "8" })
|
||||||
|
public int batchSize; // applies only to readBatch benchmark
|
||||||
|
|
||||||
|
@Setup
|
||||||
|
public void setup() {
|
||||||
|
System.setProperty("io.netty.buffer.checkAccessible", checkAccessible);
|
||||||
|
System.setProperty("io.netty.buffer.checkBounds", checkBounds);
|
||||||
|
buffer = bufferType.newBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
private ByteBuf buffer;
|
||||||
|
|
||||||
|
@TearDown
|
||||||
|
public void tearDown() {
|
||||||
|
buffer.release();
|
||||||
|
System.clearProperty("io.netty.buffer.checkAccessible");
|
||||||
|
System.clearProperty("io.netty.buffer.checkBounds");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public long setGetLong() {
|
||||||
|
return buffer.setLong(0, 1).getLong(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public ByteBuf setLong() {
|
||||||
|
return buffer.setLong(0, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Benchmark
|
||||||
|
public int readBatch() {
|
||||||
|
buffer.readerIndex(0).touch();
|
||||||
|
int result = 0;
|
||||||
|
for (int i = 0, size = batchSize; i < size; i++) {
|
||||||
|
result += buffer.readByte();
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user