Reduce overhead of ByteBufUtil.decodeString(...), which is used by `AbstractByteBuf.toString(...)` and `AbstractByteBuf.getCharSequence(...)` (#8388)

Motivation:

The current implementation used for toString(Charset) operations on AbstractByteBuf is quite slow, as it performs a lot of unnecessary memory copies. We should just use new String(...), which already contains a lot of optimizations for exactly these cases.

Modifications:

Rewrite ByteBufUtil.decodeString(...) to use new String(...)

Result:

Less overhead for toString(Charset) operations.
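
For reference, both ByteBuf.toString(Charset) and ByteBuf.getCharSequence(...) end up in ByteBufUtil.decodeString(...), so those are the call sites that benefit. A minimal usage sketch (buffer contents and charset are illustrative only):

    ByteBuf buf = Unpooled.copiedBuffer("hello world", CharsetUtil.UTF_8);
    try {
        // Both calls decode via ByteBufUtil.decodeString(...) internally.
        String str = buf.toString(CharsetUtil.UTF_8);
        CharSequence seq = buf.getCharSequence(buf.readerIndex(), buf.readableBytes(), CharsetUtil.UTF_8);
    } finally {
        buf.release();
    }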

Benchmark                                         (charsetName)  (direct)  (size)   Mode  Cnt         Score         Error  Units
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII     false       8  thrpt   20  22401645.093 ± 4671452.479  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII     false      64  thrpt   20  23678483.384 ± 3749164.446  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII      true       8  thrpt   20  15731142.651 ± 3782931.591  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII      true      64  thrpt   20  16244232.229 ± 1886259.658  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8     false       8  thrpt   20  25983680.959 ± 5045782.289  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8     false      64  thrpt   20  26235589.339 ± 2867004.950  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8      true       8  thrpt   20  18499027.808 ± 4784684.268  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8      true      64  thrpt   20  16825286.141 ± 1008712.342  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16     false       8  thrpt   20   5789879.092 ± 1201786.359  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16     false      64  thrpt   20   2173243.225 ±  417809.341  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16      true       8  thrpt   20   5035583.011 ± 1001978.854  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16      true      64  thrpt   20   2162345.301 ±  402410.408  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1     false       8  thrpt   20  30039052.376 ± 6539111.622  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1     false      64  thrpt   20  31414163.515 ± 2096710.526  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1      true       8  thrpt   20  19538587.855 ± 4639115.572  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1      true      64  thrpt   20  19467839.722 ± 1672687.213  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII     false       8  thrpt   20  10787326.745 ± 1034197.864  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII     false      64  thrpt   20   7129801.930 ± 1363019.209  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII      true       8  thrpt   20   9002529.605 ± 2017642.445  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII      true      64  thrpt   20   3860192.352 ±  826218.738  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8     false       8  thrpt   20  10532838.027 ± 2151743.968  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8     false      64  thrpt   20   7185554.597 ± 1387685.785  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8      true       8  thrpt   20   7352253.316 ± 1333823.850  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8      true      64  thrpt   20   2825578.707 ±  349701.156  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16     false       8  thrpt   20   7277446.665 ± 1447034.346  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16     false      64  thrpt   20   2445929.579 ±  562816.641  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16      true       8  thrpt   20   6201174.401 ± 1236137.786  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16      true      64  thrpt   20   2310674.973 ±  525587.959  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1     false       8  thrpt   20  11142625.392 ± 1680556.468  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1     false      64  thrpt   20   8127116.405 ± 1128513.860  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1      true       8  thrpt   20   9405751.952 ± 2193324.806  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1      true      64  thrpt   20   3943282.076 ±  737798.070  ops/s

Norman Maurer 2018-10-19 14:00:13 +02:00 committed by GitHub
parent 69545aedc4
commit 87ec2f882a
2 changed files with 133 additions and 42 deletions

io/netty/buffer/ByteBufUtil.java

@@ -53,10 +53,10 @@ import static io.netty.util.internal.StringUtil.isSurrogate;
 public final class ByteBufUtil {
 
     private static final InternalLogger logger = InternalLoggerFactory.getInstance(ByteBufUtil.class);
 
-    private static final FastThreadLocal<CharBuffer> CHAR_BUFFERS = new FastThreadLocal<CharBuffer>() {
+    private static final FastThreadLocal<byte[]> BYTE_ARRAYS = new FastThreadLocal<byte[]>() {
         @Override
-        protected CharBuffer initialValue() throws Exception {
-            return CharBuffer.allocate(1024);
+        protected byte[] initialValue() throws Exception {
+            return PlatformDependent.allocateUninitializedArray(1024);
         }
     };
@@ -756,52 +756,31 @@ public final class ByteBufUtil {
         }
     }
 
+    @SuppressWarnings("deprecation")
     static String decodeString(ByteBuf src, int readerIndex, int len, Charset charset) {
         if (len == 0) {
             return StringUtil.EMPTY_STRING;
         }
-        final CharsetDecoder decoder = CharsetUtil.decoder(charset);
-        final int maxLength = (int) ((double) len * decoder.maxCharsPerByte());
-        CharBuffer dst = CHAR_BUFFERS.get();
-        if (dst.length() < maxLength) {
-            dst = CharBuffer.allocate(maxLength);
-            if (maxLength <= MAX_CHAR_BUFFER_SIZE) {
-                CHAR_BUFFERS.set(dst);
-            }
-        } else {
-            dst.clear();
-        }
-        if (src.nioBufferCount() == 1) {
-            decodeString(decoder, src.nioBuffer(readerIndex, len), dst);
-        } else {
-            // We use a heap buffer as CharsetDecoder is most likely able to use a fast-path if src and dst buffers
-            // are both backed by a byte array.
-            ByteBuf buffer = src.alloc().heapBuffer(len);
-            try {
-                buffer.writeBytes(src, readerIndex, len);
-                // Use internalNioBuffer(...) to reduce object creation.
-                decodeString(decoder, buffer.internalNioBuffer(buffer.readerIndex(), len), dst);
-            } finally {
-                // Release the temporary buffer again.
-                buffer.release();
-            }
-        }
-        return dst.flip().toString();
-    }
+        final byte[] array;
+        final int offset;
 
-    private static void decodeString(CharsetDecoder decoder, ByteBuffer src, CharBuffer dst) {
-        try {
-            CoderResult cr = decoder.decode(src, dst, true);
-            if (!cr.isUnderflow()) {
-                cr.throwException();
+        if (src.hasArray()) {
+            array = src.array();
+            offset = src.arrayOffset() + readerIndex;
+        } else {
+            if (len <= 1024) {
+                array = BYTE_ARRAYS.get();
+            } else {
+                array = PlatformDependent.allocateUninitializedArray(len);
             }
-            cr = decoder.flush(dst);
-            if (!cr.isUnderflow()) {
-                cr.throwException();
-            }
-        } catch (CharacterCodingException x) {
-            throw new IllegalStateException(x);
+            offset = 0;
+            src.getBytes(readerIndex, array, 0, len);
         }
+        if (CharsetUtil.US_ASCII.equals(charset)) {
+            // Fast-path for US-ASCII which is used frequently.
+            return new String(array, 0, offset, len);
+        }
+        return new String(array, offset, len, charset);
     }
 
     /**
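
A note on the US-ASCII fast path above: new String(array, 0, offset, len) invokes the deprecated String(byte[] ascii, int hibyte, int offset, int count) constructor, which is why the method carries @SuppressWarnings("deprecation"). With hibyte == 0 that constructor simply widens each byte to a char, skipping the CharsetDecoder machinery entirely, which is valid for pure ASCII input. Roughly equivalent, as a sketch:

    // Approximation of String(byte[] ascii, 0, offset, count) with hibyte == 0:
    char[] chars = new char[len];
    for (int i = 0; i < len; i++) {
        chars[i] = (char) (array[offset + i] & 0xff);
    }
    String ascii = new String(chars);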

io/netty/buffer/ByteBufUtilDecodeStringBenchmark.java (new file, microbench module)

@@ -0,0 +1,112 @@
/*
* Copyright 2018 The Netty Project
*
* The Netty Project licenses this file to you under the Apache License,
* version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package io.netty.buffer;
import io.netty.microbench.util.AbstractMicrobenchmark;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.concurrent.TimeUnit;
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
public class ByteBufUtilDecodeStringBenchmark extends AbstractMicrobenchmark {

    public enum ByteBufType {
        DIRECT {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                ByteBuf buffer = Unpooled.directBuffer(length);
                buffer.writeBytes(bytes, 0, length);
                return buffer;
            }
        },
        HEAP_OFFSET {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                return Unpooled.wrappedBuffer(bytes, 1, length);
            }
        },
        HEAP {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                return Unpooled.wrappedBuffer(bytes, 0, length);
            }
        },
        COMPOSITE {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                CompositeByteBuf buffer = Unpooled.compositeBuffer();
                int offset = 0;
                // 8 buffers per composite.
                int capacity = length / 8;

                while (length > 0) {
                    buffer.addComponent(true, Unpooled.wrappedBuffer(bytes, offset, Math.min(length, capacity)));
                    length -= capacity;
                    offset += capacity;
                }
                return buffer;
            }
        };

        abstract ByteBuf newBuffer(byte[] bytes, int length);
    }

    @Param({ "8", "64", "1024", "10240", "1073741824" })
    public int size;

    @Param({ "US-ASCII", "UTF-8" })
    public String charsetName;

    @Param
    public ByteBufType bufferType;

    private ByteBuf buffer;
    private Charset charset;

    @Override
    protected String[] jvmArgs() {
        // Ensure we minimize the GC overhead by sizing the heap big enough.
        return new String[] { "-XX:MaxDirectMemorySize=2g", "-Xmx8g", "-Xms8g", "-Xmn6g" };
    }

    @Setup
    public void setup() {
        byte[] bytes = new byte[size + 2];
        Arrays.fill(bytes, (byte) 'a');

        // Use an offset to not allow any optimizations because we use the exact passed in byte[] for heap buffers.
        buffer = bufferType.newBuffer(bytes, size);
        charset = Charset.forName(charsetName);
    }

    @TearDown
    public void teardown() {
        buffer.release();
    }

    @Benchmark
    public String decodeString() {
        return ByteBufUtil.decodeString(buffer, buffer.readerIndex(), size, charset);
    }
}