Added exact UTF-8 length estimator and exposed writeUtf8 with custom space reservation on the destination buffer

Motivation: To avoid eager allocation of the destination buffer and to perform length-prefixed encoding of UTF-8 strings with a forward-only access pattern. Modifications: The original writeUtf8 is modified to allow customization of the number of bytes reserved on the destination buffer, and an exact UTF-8 length estimator is introduced. Result: It is now possible to perform length-first UTF-8 encoding of well-formed char sequences following a forward-only write access pattern on the destination buffer.
2017-12-22 09:07:03 +01:00 · 2017-12-22 09:07:03 +01:00 · bc8e022601
commit bc8e022601
parent dc3036a202
2 changed files with 129 additions and 5 deletions
--- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
+++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@ -475,14 +475,29 @@ public final class ByteBufUtil {
    /**
     * Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
     * it to a {@link ByteBuf}.
-     *
+     * <p>
+     * It behaves like {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)} with {@code reserveBytes}
+     * computed by {@link #utf8MaxBytes(CharSequence)}.<br>
     * This method returns the actual number of bytes written.
     */
    public static int writeUtf8(ByteBuf buf, CharSequence seq) {
+        return reserveAndWriteUtf8(buf, seq, utf8MaxBytes(seq));
+    }
+
+    /**
+     * Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
+     * it into {@code reserveBytes} of a {@link ByteBuf}.
+     * <p>
+     * The {@code reserveBytes} must be computed (e.g. eagerly using {@link #utf8MaxBytes(CharSequence)}
+     * or exactly with {@link #utf8Bytes(CharSequence)}) to ensure this method does not fail: for performance
+     * reasons the index checks will be performed using just {@code reserveBytes}.<br>
+     * This method returns the actual number of bytes written.
+     */
+    public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
        for (;;) {
            if (buf instanceof AbstractByteBuf) {
                AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
-                byteBuf.ensureWritable0(utf8MaxBytes(seq));
+                byteBuf.ensureWritable0(reserveBytes);
                int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
                byteBuf.writerIndex += written;
                return written;
@ -521,7 +536,7 @@ public final class ByteBufUtil {
                    // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
                    // re-throw a more informative exception describing the problem.
                    c2 = seq.charAt(++i);
-                } catch (IndexOutOfBoundsException e) {
+                } catch (IndexOutOfBoundsException ignored) {
                    buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
                    break;
                }
@ -545,11 +560,77 @@ public final class ByteBufUtil {
        return writerIndex - oldWriterIndex;
    }

+    /**
+     * Returns max bytes length of UTF8 character sequence of the given length.
+     */
+    public static int utf8MaxBytes(final int seqLength) {
+        return seqLength * MAX_BYTES_PER_CHAR_UTF8;
+    }
+
    /**
     * Returns max bytes length of UTF8 character sequence.
+     * <p>
+     * It behaves like {@link #utf8MaxBytes(int)} applied to {@code seq} {@link CharSequence#length()}.
     */
    public static int utf8MaxBytes(CharSequence seq) {
-        return seq.length() * MAX_BYTES_PER_CHAR_UTF8;
+        return utf8MaxBytes(seq.length());
+    }
+
+    /**
+     * Returns the exact bytes length of UTF8 character sequence.
+     * <p>
+     * This method produces the exact number of bytes written by {@link #writeUtf8(ByteBuf, CharSequence)}.
+     */
+    public static int utf8Bytes(final CharSequence seq) {
+        if (seq instanceof AsciiString) {
+            return seq.length();
+        }
+        int seqLength = seq.length();
+        int i = 0;
+        // ASCII fast path
+        while (i < seqLength && seq.charAt(i) < 0x80) {
+            ++i;
+        }
+        // !ASCII is packed in a separate method to let the ASCII case be smaller
+        return i < seqLength ? i + utf8Bytes(seq, i, seqLength) : i;
+    }
+
+    private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
+        int encodedLength = 0;
+        for (int i = start; i < length; i++) {
+            final char c = seq.charAt(i);
+            // making it 100% branchless isn't rewarding due to the many bit operations necessary!
+            if (c < 0x800) {
+                // branchless version of: (c <= 127 ? 0:1) + 1
+                encodedLength += ((0x7f - c) >>> 31) + 1;
+            } else if (isSurrogate(c)) {
+                if (!Character.isHighSurrogate(c)) {
+                    encodedLength++;
+                    // WRITE_UTF_UNKNOWN
+                    continue;
+                }
+                final char c2;
+                try {
+                    // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
+                    // duplicate bounds checking with charAt.
+                    c2 = seq.charAt(++i);
+                } catch (IndexOutOfBoundsException ignored) {
+                    encodedLength++;
+                    // WRITE_UTF_UNKNOWN
+                    break;
+                }
+                if (!Character.isLowSurrogate(c2)) {
+                    // WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
+                    encodedLength += 2;
+                    continue;
+                }
+                // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
+                encodedLength += 4;
+            } else {
+                encodedLength += 3;
+            }
+        }
+        return encodedLength;
    }

    /**
--- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
+++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@ -267,6 +267,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -285,6 +286,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -303,6 +305,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -322,6 +325,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -341,7 +345,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
-
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
        buf.release();
        buf2.release();
    }
@ -360,6 +364,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -376,6 +381,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -392,6 +398,7 @@ public class ByteBufUtilTest {
        ByteBufUtil.writeUtf8(buf2, surrogateString);

        assertEquals(buf, buf2);
+        assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));

        buf.release();
        buf2.release();
@ -546,6 +553,42 @@ public class ByteBufUtilTest {
        }
    }

+    @Test
+    public void testUtf8Bytes() {
+        final String s = "Some UTF-8 like äÄ∏ŒŒ";
+        checkUtf8Bytes(s);
+    }
+
+    @Test
+    public void testUtf8BytesWithSurrogates() {
+        final String s = "a\uD800\uDC00b";
+        checkUtf8Bytes(s);
+    }
+
+    @Test
+    public void testUtf8BytesWithNonSurrogates3Bytes() {
+        final String s = "a\uE000b";
+        checkUtf8Bytes(s);
+    }
+
+    @Test
+    public void testUtf8BytesWithNonSurrogatesNonAscii() {
+        final char nonAscii = (char) 0x81;
+        final String s = "a" + nonAscii + "b";
+        checkUtf8Bytes(s);
+    }
+
+    private static void checkUtf8Bytes(final CharSequence charSequence) {
+        final ByteBuf buf = Unpooled.buffer(ByteBufUtil.utf8MaxBytes(charSequence));
+        try {
+            final int writtenBytes = ByteBufUtil.writeUtf8(buf, charSequence);
+            final int utf8Bytes = ByteBufUtil.utf8Bytes(charSequence);
+            assertEquals(writtenBytes, utf8Bytes);
+        } finally {
+            buf.release();
+        }
+    }
+
    private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
        ByteBuf buffer = Unpooled.buffer();
        try {