Added isText to validate if a ByteBuf is compliant with the specified charset.

Motivation: See #82. Modifications: - Added `isText` to validate if the given ByteBuf is compliant with the specified charset. - Optimized for UTF-8 and ASCII. For other cases, `CharsetUtil.decoder` is used. Result: Users can validate a ByteBuf against a given charset.
2016-06-16 22:17:35 -07:00 · 2016-06-16 22:17:35 -07:00 · 8ae0b4530b
commit 8ae0b4530b
parent 26e67171a3
2 changed files with 295 additions and 0 deletions
--- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
+++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@ -33,6 +33,7 @@ import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
 import java.util.Locale;

 import static io.netty.util.internal.MathUtil.isOutOfBounds;
@ -893,5 +894,202 @@ public final class ByteBufUtil {
        }
    }

+    /**
+     * Checks whether the readable bytes of the given {@link ByteBuf} form valid text in the given
+     * {@link Charset}.
+     *
+     * @param buf The given {@link ByteBuf}.
+     * @param charset The specified {@link Charset}.
+     * @return {@code true} if the readable bytes are valid text in {@code charset}, otherwise {@code false}.
+     */
+    public static boolean isText(ByteBuf buf, Charset charset) {
+        final int start = buf.readerIndex();
+        final int readable = buf.readableBytes();
+        return isText(buf, start, readable, charset);
+    }
+
+    /**
+     * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
+     * text using the given {@link Charset}, otherwise return {@code false}.
+     * <p>
+     * UTF-8 and US-ASCII are validated by dedicated byte scans. Any other charset is validated by running a
+     * {@link CharsetDecoder} that is configured to report malformed input and unmappable characters.
+     *
+     * @param buf The given {@link ByteBuf}.
+     * @param index The start index of the specified buffer.
+     * @param length The length of the specified buffer.
+     * @param charset The specified {@link Charset}.
+     *
+     * @throws IndexOutOfBoundsException if {@code index} or {@code length} is negative, or if
+     * {@code index + length} is greater than {@code buf.readerIndex() + buf.readableBytes()}
+     */
+    public static boolean isText(ByteBuf buf, int index, int length, Charset charset) {
+        checkNotNull(buf, "buf");
+        checkNotNull(charset, "charset");
+        final int maxIndex = buf.readerIndex() + buf.readableBytes();
+        // Compare against maxIndex - length instead of computing index + length, which could overflow.
+        if (index < 0 || length < 0 || index > maxIndex - length) {
+            throw new IndexOutOfBoundsException("index: " + index + " length: " + length);
+        }
+        if (charset.equals(CharsetUtil.UTF_8)) {
+            return isUtf8(buf, index, length);
+        } else if (charset.equals(CharsetUtil.US_ASCII)) {
+            return isAscii(buf, index, length);
+        } else {
+            CharsetDecoder decoder = CharsetUtil.decoder(charset, CodingErrorAction.REPORT, CodingErrorAction.REPORT);
+            try {
+                if (buf.nioBufferCount() == 1) {
+                    // Use nioBuffer(..) rather than internalNioBuffer(..) on the caller's buffer: the internal
+                    // view is shared state of the ByteBuf, so a read-only validation should not touch it.
+                    decoder.decode(buf.nioBuffer(index, length));
+                } else {
+                    // Zero or several backing NIO buffers: copy the region into one contiguous heap buffer.
+                    ByteBuf heapBuffer =  buf.alloc().heapBuffer(length);
+                    try {
+                        heapBuffer.writeBytes(buf, index, length);
+                        // The copy is private to this call, so using its internal NIO view is harmless.
+                        decoder.decode(heapBuffer.internalNioBuffer(0, length));
+                    } finally {
+                        heapBuffer.release();
+                    }
+                }
+                return true;
+            } catch (CharacterCodingException ignore) {
+                // With CodingErrorAction.REPORT a decoding failure means the bytes are not valid text.
+                return false;
+            }
+        }
+    }
+
+    /**
+     * Accepts every 7-bit ASCII byte and aborts on the first byte that is not a valid ASCII character.
+     */
+    private static final ByteBufProcessor FIND_NON_ASCII = new ByteBufProcessor() {
+        @Override
+        public boolean process(byte value) {
+            // A byte is ASCII exactly when its most significant bit is clear (0x00..0x7F).
+            return (value & 0x80) == 0;
+        }
+    };
+
+    /**
+     * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
+     * ASCII text, otherwise return {@code false}.
+     *
+     * @param buf    The given {@link ByteBuf}.
+     * @param index  The start index of the specified buffer.
+     * @param length The length of the specified buffer.
+     */
+    private static boolean isAscii(ByteBuf buf, int index, int length) {
+        return buf.forEachByte(index, length, FIND_NON_ASCII) == -1;
+    }
+
+    /**
+     * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
+     * UTF8 text, otherwise return {@code false}.
+     * <p>
+     * The region is scanned one character at a time. Truncated sequences, bad continuation bytes and second bytes
+     * outside the ranges listed in the syntax below are all rejected.
+     *
+     * <pre>
+     * 1. Bytes format of UTF-8
+     *
+     * The table below summarizes the format of these different octet types.
+     * The letter x indicates bits available for encoding bits of the character number.
+     *
+     * Char. number range  |        UTF-8 octet sequence
+     *    (hexadecimal)    |              (binary)
+     * --------------------+---------------------------------------------
+     * 0000 0000-0000 007F | 0xxxxxxx
+     * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+     * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+     * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+     * </pre>
+     *
+     * <pre>
+     * 2. Syntax of UTF-8 Byte Sequences
+     *
+     * UTF8-octets = *( UTF8-char )
+     * UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+     * UTF8-1      = %x00-7F
+     * UTF8-2      = %xC2-DF UTF8-tail
+     * UTF8-3      = %xE0 %xA0-BF UTF8-tail /
+     *               %xE1-EC 2( UTF8-tail ) /
+     *               %xED %x80-9F UTF8-tail /
+     *               %xEE-EF 2( UTF8-tail )
+     * UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
+     *               %xF1-F3 3( UTF8-tail ) /
+     *               %xF4 %x80-8F 2( UTF8-tail )
+     * UTF8-tail   = %x80-BF
+     * </pre>
+     *
+     * @param buf The given {@link ByteBuf}.
+     * @param index The start index of the specified buffer.
+     * @param length The length of the specified buffer.
+     * @return {@code true} if every byte sequence in the region is well-formed UTF-8 (an empty region is valid),
+     *         otherwise {@code false}.
+     *
+     * @see <a href="http://www.ietf.org/rfc/rfc3629.txt">UTF-8 Definition</a>
+     */
+    private static boolean isUtf8(ByteBuf buf, int index, int length) {
+        final int endIndex = index + length;
+        while (index < endIndex) {
+            // b1 is the lead byte; its high bits select how many tail bytes must follow.
+            byte b1 = buf.getByte(index++);
+            byte b2, b3, b4;
+            if ((b1 & 0x80) == 0) {
+                // 1 byte
+                continue;
+            }
+            if ((b1 & 0xE0) == 0xC0) {
+                // 2 bytes
+                //
+                // Bit/Byte pattern
+                // 110xxxxx    10xxxxxx
+                // C2..DF      80..BF
+                if (index >= endIndex) { // not enough bytes
+                    return false;
+                }
+                b2 = buf.getByte(index++);
+                if ((b2 & 0xC0) != 0x80) { // 2nd byte does not start with 10
+                    return false;
+                }
+                if ((b1 & 0xFF) < 0xC2) { // out of lower bound: C0 and C1 would be overlong encodings
+                    return false;
+                }
+            } else if ((b1 & 0xF0) == 0xE0) {
+                // 3 bytes
+                //
+                // Bit/Byte pattern
+                // 1110xxxx    10xxxxxx    10xxxxxx
+                // E0          A0..BF      80..BF
+                // E1..EC      80..BF      80..BF
+                // ED          80..9F      80..BF
+                // EE..EF      80..BF      80..BF
+                if (index > endIndex - 2) { // not enough bytes
+                    return false;
+                }
+                b2 = buf.getByte(index++);
+                b3 = buf.getByte(index++);
+                if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80) { // 2nd or 3rd byte does not start with 10
+                    return false;
+                }
+                if ((b1 & 0x0F) == 0x00 && (b2 & 0xFF) < 0xA0) { // out of lower bound: E0 needs b2 in A0..BF
+                    return false;
+                }
+                if ((b1 & 0x0F) == 0x0D && (b2 & 0xFF) > 0x9F) { // out of upper bound: ED needs b2 in 80..9F
+                    return false;
+                }
+            } else if ((b1 & 0xF8) == 0xF0) {
+                // 4 bytes
+                //
+                // Bit/Byte pattern
+                // 11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
+                // F0          90..BF      80..BF      80..BF
+                // F1..F3      80..BF      80..BF      80..BF
+                // F4          80..8F      80..BF      80..BF
+                if (index > endIndex - 3) { // not enough bytes
+                    return false;
+                }
+                b2 = buf.getByte(index++);
+                b3 = buf.getByte(index++);
+                b4 = buf.getByte(index++);
+                if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 || (b4 & 0xC0) != 0x80) {
+                    // 2nd, 3rd or 4th byte does not start with 10
+                    return false;
+                }
+                if ((b1 & 0xFF) > 0xF4 // b1 invalid
+                        || (b1 & 0xFF) == 0xF0 && (b2 & 0xFF) < 0x90    // b2 out of lower bound
+                        || (b1 & 0xFF) == 0xF4 && (b2 & 0xFF) > 0x8F) { // b2 out of upper bound
+                    return false;
+                }
+            } else {
+                // b1 matches none of the four lead-byte patterns (e.g. a stray 10xxxxxx tail or 11111xxx)
+                return false;
+            }
+        }
+        return true;
+    }
+
    private ByteBufUtil() { }
 }
--- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
+++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@ -24,6 +24,7 @@ import static io.netty.buffer.Unpooled.unreleasableBuffer;
 import static io.netty.util.ReferenceCountUtil.releaseLater;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;

 public class ByteBufUtilTest {

@ -228,4 +229,100 @@ public class ByteBufUtilTest {
            buffer.release();
        }
    }
+
+    /**
+     * Exercises the UTF-8 fast path of {@code ByteBufUtil.isText}: well-formed sequences of every length
+     * (including the boundary values of each range) must be accepted, and truncated, malformed, overlong
+     * and out-of-range sequences must be rejected.
+     */
+    @Test
+    public void testIsTextWithUtf8() {
+        byte[][] validUtf8Bytes = new byte[][]{
+                new byte[0],                                                     // empty input
+                "netty".getBytes(CharsetUtil.UTF_8),
+                new byte[]{(byte) 0x24},
+                new byte[]{(byte) 0x7F},                                         // largest 1 byte sequence
+                new byte[]{(byte) 0xC2, (byte) 0xA2},
+                new byte[]{(byte) 0xC2, (byte) 0x80},                            // smallest 2 bytes sequence
+                new byte[]{(byte) 0xDF, (byte) 0xBF},                            // largest 2 bytes sequence
+                new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xAC},
+                new byte[]{(byte) 0xE0, (byte) 0xA0, (byte) 0x80},               // smallest 3 bytes sequence
+                new byte[]{(byte) 0xED, (byte) 0x9F, (byte) 0xBF},               // ED with largest allowed b2
+                new byte[]{(byte) 0xEE, (byte) 0x80, (byte) 0x80},               // first lead byte after ED
+                new byte[]{(byte) 0xEF, (byte) 0xBF, (byte) 0xBF},               // largest 3 bytes sequence
+                new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88},
+                new byte[]{(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF},  // largest 4 bytes sequence
+                new byte[]{(byte) 0x24,
+                        (byte) 0xC2, (byte) 0xA2,
+                        (byte) 0xE2, (byte) 0x82, (byte) 0xAC,
+                        (byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88} // multiple characters
+        };
+        for (byte[] bytes : validUtf8Bytes) {
+            assertIsText(bytes, true, CharsetUtil.UTF_8);
+        }
+        byte[][] invalidUtf8Bytes = new byte[][]{
+                new byte[]{(byte) 0x80},
+                new byte[]{(byte) 0x24, (byte) 0x80},                           // stray tail after a valid char
+                new byte[]{(byte) 0xF0, (byte) 0x82, (byte) 0x82, (byte) 0xAC}, // Overlong encodings
+                new byte[]{(byte) 0xC2},                                        // not enough bytes
+                new byte[]{(byte) 0xE2, (byte) 0x82},                           // not enough bytes
+                new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D},              // not enough bytes
+                new byte[]{(byte) 0xC2, (byte) 0xC0},                           // not correct bytes
+                new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xC0},              // not correct bytes
+                new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0xC0}, // not correct bytes
+                new byte[]{(byte) 0xC0, (byte) 0x80},                           // out of lower bound
+                new byte[]{(byte) 0xC1, (byte) 0x80},                           // out of lower bound
+                new byte[]{(byte) 0xE0, (byte) 0x80, (byte) 0x80},              // out of lower bound
+                new byte[]{(byte) 0xED, (byte) 0xA0, (byte) 0x80},              // out of upper bound
+                new byte[]{(byte) 0xED, (byte) 0xAF, (byte) 0x80},              // out of upper bound
+                new byte[]{(byte) 0xF4, (byte) 0x90, (byte) 0x80, (byte) 0x80}, // out of upper bound
+                new byte[]{(byte) 0xF5, (byte) 0x80, (byte) 0x80, (byte) 0x80}, // invalid first byte
+                new byte[]{(byte) 0xF8, (byte) 0x88, (byte) 0x80, (byte) 0x80, (byte) 0x80} // invalid first byte
+        };
+        for (byte[] bytes : invalidUtf8Bytes) {
+            assertIsText(bytes, false, CharsetUtil.UTF_8);
+        }
+    }
+
+    /**
+     * Checks {@code isText} with UTF-16LE, a charset other than the UTF-8 and US-ASCII ones used by the
+     * sibling tests.
+     */
+    @Test
+    public void testIsTextWithoutOptimization() {
+        // 0xD801 0xDC37 in little-endian byte order: a complete surrogate pair.
+        final byte[] surrogatePair = {(byte) 0x01, (byte) 0xD8, (byte) 0x37, (byte) 0xDC};
+        // 0xD801 alone: a high surrogate with no low surrogate following it.
+        final byte[] loneHighSurrogate = {(byte) 0x01, (byte) 0xD8};
+
+        assertIsText(surrogatePair, true, CharsetUtil.UTF_16LE);
+        assertIsText(loneHighSurrogate, false, CharsetUtil.UTF_16LE);
+    }
+
+    /**
+     * Checks {@code isText} with US-ASCII: bytes in 0x00..0x7F are accepted, bytes with the high bit set are not.
+     */
+    @Test
+    public void testIsTextWithAscii() {
+        // Lowest and highest ASCII values plus two in between.
+        final byte[] sevenBitOnly = {(byte) 0x00, (byte) 0x01, (byte) 0x37, (byte) 0x7F};
+        // Both bytes have the most significant bit set.
+        final byte[] highBitSet = {(byte) 0x80, (byte) 0xFF};
+
+        assertIsText(sevenBitOnly, true, CharsetUtil.US_ASCII);
+        assertIsText(highBitSet, false, CharsetUtil.US_ASCII);
+    }
+
+    /**
+     * Checks the bounds validation of {@code isText(buf, index, length, charset)} against a buffer holding
+     * four readable bytes: in-range regions are accepted, anything else raises
+     * {@link IndexOutOfBoundsException}.
+     */
+    @Test
+    public void testIsTextWithInvalidIndexAndLength() {
+        // Every row is {index, length}.
+        final int[][] inBounds = {
+                {4, 0},
+                {0, 4},
+                {1, 3},
+        };
+        final int[][] outOfBounds = {
+                {4, 1},
+                {-1, 2},
+                {3, -1},
+                {3, -2},
+                {5, 0},
+                {1, 5},
+        };
+        final ByteBuf buffer = Unpooled.buffer();
+        try {
+            buffer.writeBytes(new byte[4]);
+            for (int i = 0; i < inBounds.length; i++) {
+                final int index = inBounds[i][0];
+                final int length = inBounds[i][1];
+                assertTrue(ByteBufUtil.isText(buffer, index, length, CharsetUtil.US_ASCII));
+            }
+            for (int i = 0; i < outOfBounds.length; i++) {
+                final int index = outOfBounds[i][0];
+                final int length = outOfBounds[i][1];
+                try {
+                    ByteBufUtil.isText(buffer, index, length, CharsetUtil.US_ASCII);
+                    fail("Expected IndexOutOfBoundsException");
+                } catch (IndexOutOfBoundsException e) {
+                    // expected
+                }
+            }
+        } finally {
+            buffer.release();
+        }
+    }
+
+    private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
+        ByteBuf buffer = Unpooled.buffer();
+        try {
+            buffer.writeBytes(bytes);
+            assertEquals(expected, ByteBufUtil.isText(buffer, charset));
+        } finally {
+            buffer.release();
+        }
+    }
 }