Reduce overhead of ByteBufUtil.decodeString(...), which is used by `AbstractByteBuf.toString(...)` and `AbstractByteBuf.getCharSequence(...)` (#8388)

Motivation:

The current implementation used for toString(Charset) operations on AbstractByteBuf is quite slow, as it performs a lot of unnecessary memory copies. We should just use new String(...), which already contains a lot of optimizations for exactly these cases.

Modifications:

Rewrite ByteBufUtil.decodeString(...) to use new String(...)

Result:

Less overhead for toString(Charset) operations.
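
For reference, both ByteBuf.toString(Charset) and ByteBuf.getCharSequence(...) end up in ByteBufUtil.decodeString(...), so those are the call sites that benefit. A minimal usage sketch (buffer contents and charset are illustrative only):

    ByteBuf buf = Unpooled.copiedBuffer("hello world", CharsetUtil.UTF_8);
    try {
        // Both calls decode via ByteBufUtil.decodeString(...) internally.
        String str = buf.toString(CharsetUtil.UTF_8);
        CharSequence seq = buf.getCharSequence(buf.readerIndex(), buf.readableBytes(), CharsetUtil.UTF_8);
    } finally {
        buf.release();
    }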

Benchmark                                         (charsetName)  (direct)  (size)   Mode  Cnt         Score         Error  Units
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII     false       8  thrpt   20  22401645.093 ± 4671452.479  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII     false      64  thrpt   20  23678483.384 ± 3749164.446  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII      true       8  thrpt   20  15731142.651 ± 3782931.591  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString          US-ASCII      true      64  thrpt   20  16244232.229 ± 1886259.658  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8     false       8  thrpt   20  25983680.959 ± 5045782.289  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8     false      64  thrpt   20  26235589.339 ± 2867004.950  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8      true       8  thrpt   20  18499027.808 ± 4784684.268  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString             UTF-8      true      64  thrpt   20  16825286.141 ± 1008712.342  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16     false       8  thrpt   20   5789879.092 ± 1201786.359  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16     false      64  thrpt   20   2173243.225 ±  417809.341  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16      true       8  thrpt   20   5035583.011 ± 1001978.854  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString            UTF-16      true      64  thrpt   20   2162345.301 ±  402410.408  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1     false       8  thrpt   20  30039052.376 ± 6539111.622  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1     false      64  thrpt   20  31414163.515 ± 2096710.526  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1      true       8  thrpt   20  19538587.855 ± 4639115.572  ops/s
ByteBufUtilDecodeStringBenchmark.decodeString        ISO-8859-1      true      64  thrpt   20  19467839.722 ± 1672687.213  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII     false       8  thrpt   20  10787326.745 ± 1034197.864  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII     false      64  thrpt   20   7129801.930 ± 1363019.209  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII      true       8  thrpt   20   9002529.605 ± 2017642.445  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld       US-ASCII      true      64  thrpt   20   3860192.352 ±  826218.738  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8     false       8  thrpt   20  10532838.027 ± 2151743.968  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8     false      64  thrpt   20   7185554.597 ± 1387685.785  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8      true       8  thrpt   20   7352253.316 ± 1333823.850  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld          UTF-8      true      64  thrpt   20   2825578.707 ±  349701.156  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16     false       8  thrpt   20   7277446.665 ± 1447034.346  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16     false      64  thrpt   20   2445929.579 ±  562816.641  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16      true       8  thrpt   20   6201174.401 ± 1236137.786  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld         UTF-16      true      64  thrpt   20   2310674.973 ±  525587.959  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1     false       8  thrpt   20  11142625.392 ± 1680556.468  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1     false      64  thrpt   20   8127116.405 ± 1128513.860  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1      true       8  thrpt   20   9405751.952 ± 2193324.806  ops/s
ByteBufUtilDecodeStringBenchmark.decodeStringOld     ISO-8859-1      true      64  thrpt   20   3943282.076 ±  737798.070  ops/s

Norman Maurer 2018-10-19 14:00:13 +02:00 committed by GitHub
parent 69545aedc4
commit 87ec2f882a
2 changed files with 133 additions and 42 deletions

io/netty/buffer/ByteBufUtil.java

@@ -53,10 +53,10 @@ import static io.netty.util.internal.StringUtil.isSurrogate;
 public final class ByteBufUtil {
 
     private static final InternalLogger logger = InternalLoggerFactory.getInstance(ByteBufUtil.class);
 
-    private static final FastThreadLocal<CharBuffer> CHAR_BUFFERS = new FastThreadLocal<CharBuffer>() {
+    private static final FastThreadLocal<byte[]> BYTE_ARRAYS = new FastThreadLocal<byte[]>() {
         @Override
-        protected CharBuffer initialValue() throws Exception {
-            return CharBuffer.allocate(1024);
+        protected byte[] initialValue() throws Exception {
+            return PlatformDependent.allocateUninitializedArray(1024);
         }
     };
@@ -756,52 +756,31 @@ public final class ByteBufUtil {
         }
     }
 
+    @SuppressWarnings("deprecation")
     static String decodeString(ByteBuf src, int readerIndex, int len, Charset charset) {
         if (len == 0) {
             return StringUtil.EMPTY_STRING;
         }
-        final CharsetDecoder decoder = CharsetUtil.decoder(charset);
-        final int maxLength = (int) ((double) len * decoder.maxCharsPerByte());
-        CharBuffer dst = CHAR_BUFFERS.get();
-        if (dst.length() < maxLength) {
-            dst = CharBuffer.allocate(maxLength);
-            if (maxLength <= MAX_CHAR_BUFFER_SIZE) {
-                CHAR_BUFFERS.set(dst);
-            }
-        } else {
-            dst.clear();
-        }
-        if (src.nioBufferCount() == 1) {
-            decodeString(decoder, src.nioBuffer(readerIndex, len), dst);
-        } else {
-            // We use a heap buffer as CharsetDecoder is most likely able to use a fast-path if src and dst buffers
-            // are both backed by a byte array.
-            ByteBuf buffer = src.alloc().heapBuffer(len);
-            try {
-                buffer.writeBytes(src, readerIndex, len);
-                // Use internalNioBuffer(...) to reduce object creation.
-                decodeString(decoder, buffer.internalNioBuffer(buffer.readerIndex(), len), dst);
-            } finally {
-                // Release the temporary buffer again.
-                buffer.release();
-            }
-        }
-        return dst.flip().toString();
-    }
+        final byte[] array;
+        final int offset;
 
-    private static void decodeString(CharsetDecoder decoder, ByteBuffer src, CharBuffer dst) {
-        try {
-            CoderResult cr = decoder.decode(src, dst, true);
-            if (!cr.isUnderflow()) {
-                cr.throwException();
+        if (src.hasArray()) {
+            array = src.array();
+            offset = src.arrayOffset() + readerIndex;
+        } else {
+            if (len <= 1024) {
+                array = BYTE_ARRAYS.get();
+            } else {
+                array = PlatformDependent.allocateUninitializedArray(len);
             }
-            cr = decoder.flush(dst);
-            if (!cr.isUnderflow()) {
-                cr.throwException();
-            }
-        } catch (CharacterCodingException x) {
-            throw new IllegalStateException(x);
+            offset = 0;
+            src.getBytes(readerIndex, array, 0, len);
         }
+        if (CharsetUtil.US_ASCII.equals(charset)) {
+            // Fast-path for US-ASCII which is used frequently.
+            return new String(array, 0, offset, len);
+        }
+        return new String(array, offset, len, charset);
     }
 
     /**
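
A note on the US-ASCII fast path above: new String(array, 0, offset, len) invokes the deprecated String(byte[] ascii, int hibyte, int offset, int count) constructor, which is why the method carries @SuppressWarnings("deprecation"). With hibyte == 0 that constructor simply widens each byte to a char, skipping the CharsetDecoder machinery entirely, which is valid for pure ASCII input. Roughly equivalent, as a sketch:

    // Approximation of String(byte[] ascii, 0, offset, count) with hibyte == 0:
    char[] chars = new char[len];
    for (int i = 0; i < len; i++) {
        chars[i] = (char) (array[offset + i] & 0xff);
    }
    String ascii = new String(chars);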

io/netty/buffer/ByteBufUtilDecodeStringBenchmark.java (new file, microbench module)

@@ -0,0 +1,112 @@
/*
* Copyright 2018 The Netty Project
*
* The Netty Project licenses this file to you under the Apache License,
* version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package io.netty.buffer;
import io.netty.microbench.util.AbstractMicrobenchmark;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.concurrent.TimeUnit;
@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
public class ByteBufUtilDecodeStringBenchmark extends AbstractMicrobenchmark {

    public enum ByteBufType {
        DIRECT {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                ByteBuf buffer = Unpooled.directBuffer(length);
                buffer.writeBytes(bytes, 0, length);
                return buffer;
            }
        },
        HEAP_OFFSET {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                return Unpooled.wrappedBuffer(bytes, 1, length);
            }
        },
        HEAP {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                return Unpooled.wrappedBuffer(bytes, 0, length);
            }
        },
        COMPOSITE {
            @Override
            ByteBuf newBuffer(byte[] bytes, int length) {
                CompositeByteBuf buffer = Unpooled.compositeBuffer();
                int offset = 0;
                // 8 buffers per composite.
                int capacity = length / 8;

                while (length > 0) {
                    buffer.addComponent(true, Unpooled.wrappedBuffer(bytes, offset, Math.min(length, capacity)));
                    length -= capacity;
                    offset += capacity;
                }
                return buffer;
            }
        };

        abstract ByteBuf newBuffer(byte[] bytes, int length);
    }

    @Param({ "8", "64", "1024", "10240", "1073741824" })
    public int size;

    @Param({ "US-ASCII", "UTF-8" })
    public String charsetName;

    @Param
    public ByteBufType bufferType;

    private ByteBuf buffer;
    private Charset charset;

    @Override
    protected String[] jvmArgs() {
        // Ensure we minimize the GC overhead by sizing the heap big enough.
        return new String[] { "-XX:MaxDirectMemorySize=2g", "-Xmx8g", "-Xms8g", "-Xmn6g" };
    }

    @Setup
    public void setup() {
        byte[] bytes = new byte[size + 2];
        Arrays.fill(bytes, (byte) 'a');

        // Use an offset to not allow any optimizations because we use the exact passed in byte[] for heap buffers.
        buffer = bufferType.newBuffer(bytes, size);
        charset = Charset.forName(charsetName);
    }

    @TearDown
    public void teardown() {
        buffer.release();
    }

    @Benchmark
    public String decodeString() {
        return ByteBufUtil.decodeString(buffer, buffer.readerIndex(), size, charset);
    }
}