From 2af769f6dc76e3aad88a52c345689d739a9130a2 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Fri, 21 Jun 2019 05:05:35 -0700
Subject: [PATCH] Subsequence versions of ByteBufUtil#writeUtf8(...) methods
 (#9224)

Motivation

It would be useful to be able to write a UTF-8 encoded subsequence of a
CharSequence to a ByteBuf without needing to create a temporary object
via CharSequence#subSequence().

Modification

Add overloads of ByteBufUtil writeUtf8, reserveAndWriteUtf8 and
utf8Bytes methods which take explicit subsequence bounds.

Result

More efficient writing of substrings to byte buffers is possible.
---
 .../java/io/netty/buffer/ByteBufUtil.java     | 75 ++++++++++++---
 .../java/io/netty/buffer/ByteBufUtilTest.java | 91 +++++++++++++++++++
 2 files changed, 154 insertions(+), 12 deletions(-)
diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
index b1e86afce6..ae8d9ed3ea 100644
--- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
+++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@@ -21,6 +21,7 @@ import io.netty.util.CharsetUtil;
 import io.netty.util.Recycler;
 import io.netty.util.Recycler.Handle;
 import io.netty.util.concurrent.FastThreadLocal;
+import io.netty.util.internal.MathUtil;
 import io.netty.util.internal.PlatformDependent;
 import io.netty.util.internal.StringUtil;
 import io.netty.util.internal.SystemPropertyUtil;
@@ -472,6 +473,14 @@ public final class ByteBufUtil {
         return buffer.forEachByteDesc(toIndex, fromIndex - toIndex, new ByteProcessor.IndexOfProcessor(value));
     }
 
+    private static CharSequence checkCharSequenceBounds(CharSequence seq, int start, int end) {
+        if (MathUtil.isOutOfBounds(start, end - start, seq.length())) {
+            throw new IndexOutOfBoundsException("expected: 0 <= start(" + start + ") <= end (" + end
+                    + ") <= seq.length(" + seq.length() + ')');
+        }
+        return seq;
+    }
+
     /**
      * Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
      * it to a {@link ByteBuf} allocated with {@code alloc}.
@@ -496,7 +505,17 @@ public final class ByteBufUtil {
      * This method returns the actual number of bytes written.
      */
     public static int writeUtf8(ByteBuf buf, CharSequence seq) {
-        return reserveAndWriteUtf8(buf, seq, utf8MaxBytes(seq));
+        int seqLength = seq.length();
+        return reserveAndWriteUtf8Seq(buf, seq, 0, seqLength, utf8MaxBytes(seqLength));
+    }
+
+    /**
+     * Equivalent to <code>{@link #writeUtf8(ByteBuf, CharSequence) writeUtf8(buf, seq.subSequence(start, end))}</code>
+     * but avoids subsequence object allocation if possible.
+     */
+    public static int writeUtf8(ByteBuf buf, CharSequence seq, int start, int end) {
+        checkCharSequenceBounds(seq, start, end);
+        return reserveAndWriteUtf8Seq(buf, seq, start, end, utf8MaxBytes(end - start));
     }
 
     /**
@@ -509,6 +528,21 @@ public final class ByteBufUtil {
      * This method returns the actual number of bytes written.
      */
     public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
+        return reserveAndWriteUtf8Seq(buf, seq, 0, seq.length(), reserveBytes);
+    }
+
+    /**
+     * Equivalent to <code>{@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)
+     * reserveAndWriteUtf8(buf, seq.subSequence(start, end), reserveBytes)}</code> but avoids
+     * subsequence object allocation if possible.
+     *
+     * @return actual number of bytes written
+     */
+    public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int start, int end, int reserveBytes) {
+        return reserveAndWriteUtf8Seq(buf, checkCharSequenceBounds(seq, start, end), start, end, reserveBytes);
+    }
+
+    private static int reserveAndWriteUtf8Seq(ByteBuf buf, CharSequence seq, int start, int end, int reserveBytes) {
         for (;;) {
             if (buf instanceof WrappedCompositeByteBuf) {
                 // WrappedCompositeByteBuf is a sub-class of AbstractByteBuf so it needs special handling.
@@ -516,27 +550,31 @@ public final class ByteBufUtil {
             } else if (buf instanceof AbstractByteBuf) {
                 AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
                 byteBuf.ensureWritable0(reserveBytes);
-                int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
+                int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, start, end);
                 byteBuf.writerIndex += written;
                 return written;
             } else if (buf instanceof WrappedByteBuf) {
                 // Unwrap as the wrapped buffer may be an AbstractByteBuf and so we can use fast-path.
                 buf = buf.unwrap();
             } else {
-                byte[] bytes = seq.toString().getBytes(CharsetUtil.UTF_8);
+                byte[] bytes = seq.subSequence(start, end).toString().getBytes(CharsetUtil.UTF_8);
                 buf.writeBytes(bytes);
                 return bytes.length;
             }
         }
     }
 
-    // Fast-Path implementation
     static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq, int len) {
+        return writeUtf8(buffer, writerIndex, seq, 0, len);
+    }
+
+    // Fast-Path implementation
+    static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq, int start, int end) {
         int oldWriterIndex = writerIndex;
 
         // We can use the _set methods as these not need to do any index checks and reference checks.
         // This is possible as we called ensureWritable(...) before.
-        for (int i = 0; i < len; i++) {
+        for (int i = start; i < end; i++) {
             char c = seq.charAt(i);
             if (c < 0x80) {
                 buffer._setByte(writerIndex++, (byte) c);
@@ -606,22 +644,35 @@ public final class ByteBufUtil {
      * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence)}.
      */
     public static int utf8Bytes(final CharSequence seq) {
+        return utf8ByteCount(seq, 0, seq.length());
+    }
+
+    /**
+     * Equivalent to <code>{@link #utf8Bytes(CharSequence) utf8Bytes(seq.subSequence(start, end))}</code>
+     * but avoids subsequence object allocation.
+     * <p>
+     * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence, int, int)}.
+     */
+    public static int utf8Bytes(final CharSequence seq, int start, int end) {
+        return utf8ByteCount(checkCharSequenceBounds(seq, start, end), start, end);
+    }
+
+    private static int utf8ByteCount(final CharSequence seq, int start, int end) {
         if (seq instanceof AsciiString) {
-            return seq.length();
+            return end - start;
         }
-        int seqLength = seq.length();
-        int i = 0;
+        int i = start;
         // ASCII fast path
-        while (i < seqLength && seq.charAt(i) < 0x80) {
+        while (i < end && seq.charAt(i) < 0x80) {
             ++i;
         }
         // !ASCII is packed in a separate method to let the ASCII case be smaller
-        return i < seqLength ? i + utf8Bytes(seq, i, seqLength) : i;
+        return i < end ? (i - start) + utf8BytesNonAscii(seq, i, end) : i - start;
     }
 
-    private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
+    private static int utf8BytesNonAscii(final CharSequence seq, final int start, final int end) {
         int encodedLength = 0;
-        for (int i = start; i < length; i++) {
+        for (int i = start; i < end; i++) {
             final char c = seq.charAt(i);
             // making it 100% branchless isn't rewarding due to the many bit operations necessary!
             if (c < 0x800) {
diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
index da9b344f40..92913ce4f9 100644
--- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
+++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@@ -510,6 +510,97 @@ public class ByteBufUtilTest {
         assertTrue(buf instanceof WrappedByteBuf);
     }
 
+    @Test
+    public void testWriteUtf8Subsequence() {
+        String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
+        ByteBuf buf = Unpooled.buffer(16);
+        buf.writeBytes(usAscii.substring(5, 18).getBytes(CharsetUtil.UTF_8));
+        ByteBuf buf2 = Unpooled.buffer(16);
+        ByteBufUtil.writeUtf8(buf2, usAscii, 5, 18);
+
+        assertEquals(buf, buf2);
+
+        buf.release();
+        buf2.release();
+    }
+
+    @Test
+    public void testReserveAndWriteUtf8Subsequence() {
+        String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
+        ByteBuf buf = Unpooled.buffer(16);
+        buf.writeBytes(usAscii.substring(5, 18).getBytes(CharsetUtil.UTF_8));
+        ByteBuf buf2 = Unpooled.buffer(16);
+        int count = ByteBufUtil.reserveAndWriteUtf8(buf2, usAscii, 5, 18, 16);
+
+        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), count);
+
+        buf.release();
+        buf2.release();
+    }
+
+    @Test
+    public void testUtf8BytesSubsequence() {
+        String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
+        assertEquals(usAscii.substring(5, 18).getBytes(CharsetUtil.UTF_8).length,
+                ByteBufUtil.utf8Bytes(usAscii, 5, 18));
+    }
+
+    private static int[][] INVALID_RANGES = new int[][] {
+        { -1, 5 }, { 5, 30 }, { 10, 5 }
+    };
+
+    interface TestMethod {
+        int invoke(Object... args);
+    }
+
+    private void testInvalidSubsequences(TestMethod method) {
+        for (int [] range : INVALID_RANGES) {
+            ByteBuf buf = Unpooled.buffer(16);
+            try {
+                method.invoke(buf, "Some UTF-8 like äÄ∏ŒŒ", range[0], range[1]);
+                fail("Did not throw IndexOutOfBoundsException for range (" + range[0] + ", " + range[1] + ")");
+            } catch (IndexOutOfBoundsException iiobe) {
+                // expected
+            } finally {
+                assertFalse(buf.isReadable());
+                buf.release();
+            }
+        }
+    }
+
+    @Test
+    public void testWriteUtf8InvalidSubsequences() {
+        testInvalidSubsequences(new TestMethod() {
+            @Override
+            public int invoke(Object... args) {
+                return ByteBufUtil.writeUtf8((ByteBuf) args[0], (String) args[1],
+                        (Integer) args[2], (Integer) args[3]);
+            }
+        });
+    }
+
+    @Test
+    public void testReserveAndWriteUtf8InvalidSubsequences() {
+        testInvalidSubsequences(new TestMethod() {
+            @Override
+            public int invoke(Object... args) {
+                return ByteBufUtil.reserveAndWriteUtf8((ByteBuf) args[0], (String) args[1],
+                        (Integer) args[2], (Integer) args[3], 32);
+            }
+        });
+    }
+
+    @Test
+    public void testUtf8BytesInvalidSubsequences() {
+        testInvalidSubsequences(new TestMethod() {
+            @Override
+            public int invoke(Object... args) {
+                return ByteBufUtil.utf8Bytes((String) args[1], (Integer) args[2], (Integer) args[3]);
+            }
+        });
+    }
+
     @Test
     public void testDecodeUsAscii() {
         testDecodeString("This is a test", CharsetUtil.US_ASCII);