ByteBufUtil.writeUtf8 Surrogate Support

Motivation: A single 16-bit UTF-16 code unit cannot represent the full range of Unicode characters, so UTF-16 has the concept of a Surrogate Pair (http://unicode.org/glossary/#surrogate_pair), where two 16-bit code units together represent the remaining characters. ByteBufUtil.writeUtf8 currently does not support this and is thus incomplete. Modifications: - Add support for surrogate pairs in ByteBufUtil.writeUtf8 Result: ByteBufUtil.writeUtf8 now supports surrogate pairs and correctly converts them to UTF-8.
2015-12-18 10:53:54 -08:00 · 2015-12-18 10:53:54 -08:00 · c5dec770b8
commit c5dec770b8
parent 53040fd399
4 changed files with 69 additions and 2 deletions
--- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
+++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@ -35,9 +35,10 @@ import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
 import java.util.Locale;

-import static io.netty.util.internal.StringUtil.NEWLINE;
-import static io.netty.util.internal.ObjectUtil.checkNotNull;
 import static io.netty.util.internal.MathUtil.isOutOfBounds;
+import static io.netty.util.internal.ObjectUtil.checkNotNull;
+import static io.netty.util.internal.StringUtil.NEWLINE;
+import static io.netty.util.internal.StringUtil.isSurrogate;

 /**
 * A collection of utility methods that is related with handling {@link ByteBuf},
@ -379,6 +380,31 @@ public final class ByteBufUtil {
            } else if (c < 0x800) {
                buffer._setByte(writerIndex++, (byte) (0xc0 | (c >> 6)));
                buffer._setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
+            } else if (isSurrogate(c)) {
+                if (!Character.isHighSurrogate(c)) {
+                    throw new IllegalArgumentException("Invalid encoding. " +
+                            "Expected high (leading) surrogate at index " + i + " but got " + c);
+                }
+                final char c2;
+                try {
+                    // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
+                    // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
+                    // re-throw a more informative exception describing the problem.
+                    c2 = seq.charAt(++i);
+                } catch (IndexOutOfBoundsException e) {
+                    throw new IllegalArgumentException("Underflow. " +
+                            "Expected low (trailing) surrogate at index " + i + " but no more characters found.", e);
+                }
+                if (!Character.isLowSurrogate(c2)) {
+                    throw new IllegalArgumentException("Invalid encoding. " +
+                            "Expected low (trailing) surrogate at index " + i + " but got " + c2);
+                }
+                int codePoint = Character.toCodePoint(c, c2);
+                // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
+                buffer._setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18)));
+                buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f)));
+                buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f)));
+                buffer._setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f)));
            } else {
                buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
                buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
--- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
+++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@ -59,6 +59,21 @@ public class ByteBufUtilTest {
        Assert.assertEquals(buf, buf2);
    }

+    @Test
+    public void testWriteUtf8Surrogates() {
+        // A well-formed pair: leading surrogate then trailing surrogate; String.getBytes gives the expected bytes.
+        String surrogateString = new StringBuilder(2)
+                                .append('\uD800')
+                                .append('\uDC00')
+                                .toString();
+        ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
+        buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
+        ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
+        ByteBufUtil.writeUtf8(buf2, surrogateString);
+
+        Assert.assertEquals(buf, buf2);
+    }
+
    @Test
    public void testWriteUtf8Wrapped() {
        String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
--- a/common/src/main/java/io/netty/util/internal/MathUtil.java
+++ b/common/src/main/java/io/netty/util/internal/MathUtil.java
@ -48,4 +48,19 @@ public final class MathUtil {
    public static boolean isOutOfBounds(int index, int length, int capacity) {
        return (index | length | (index + length) | (capacity - (index + length))) < 0;
    }
+
+    /**
+     * Compare two {@code long} values.
+     * @param x the first {@code long} to compare.
+     * @param y the second {@code long} to compare.
+     * @return
+     * <ul>
+     * <li>0 if {@code x == y}</li>
+     * <li>{@code > 0} if {@code x > y}</li>
+     * <li>{@code < 0} if {@code x < y}</li>
+     * </ul>
+     */
+    public static int compare(long x, long y) {
+        return (x < y) ? -1 : (x > y) ? 1 : 0;
+    }
 }
--- a/common/src/main/java/io/netty/util/internal/StringUtil.java
+++ b/common/src/main/java/io/netty/util/internal/StringUtil.java
@ -387,6 +387,17 @@ public final class StringUtil {
        return s == null || s.isEmpty();
    }

+    /**
+     * Determine if {@code c} lies within the range of values (U+D800 to U+DFFF inclusive) defined for
+     * <a href="http://unicode.org/glossary/#surrogate_code_point">Surrogate Code Point</a>.
+     * @param c the character to check.
+     * @return {@code true} if {@code c} lies within the range of values defined for
+     * <a href="http://unicode.org/glossary/#surrogate_code_point">Surrogate Code Point</a>. {@code false} otherwise.
+     */
+    public static boolean isSurrogate(char c) {
+        return c >= '\uD800' && c <= '\uDFFF';
+    }
+
    private static boolean isDoubleQuote(char c) {
        return c == DOUBLE_QUOTE;
    }