From 9602535b7d0bc349a8912286d3812629175b79e4 Mon Sep 17 00:00:00 2001 From: Xiaoyan Lin Date: Thu, 16 Jun 2016 22:17:35 -0700 Subject: [PATCH] Added `isText` to validate if a ByteBuf is compliant with the specified charset. Motivation: See #82. Modifications: - Added `isText` to validate if the given ByteBuf is compliant with the specified charset. - Optimized for UTF-8 and ASCII. For other cases, `CharsetDecoder.decoder` is used. Result: Users can validate ByteBuf with given charset. --- .../java/io/netty/buffer/ByteBufUtil.java | 198 ++++++++++++++++++ .../java/io/netty/buffer/ByteBufUtilTest.java | 97 +++++++++ 2 files changed, 295 insertions(+) diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java index 60a9c3b5a6..591bc7db91 100644 --- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java +++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java @@ -35,6 +35,7 @@ import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; import java.util.Arrays; import java.util.Locale; @@ -999,5 +1000,202 @@ public final class ByteBufUtil { } } + /** + * Returns {@code true} if the given {@link ByteBuf} is valid text using the given {@link Charset}, + * otherwise returns {@code false}. + * + * @param buf The {@link ByteBuf} whose readable bytes are validated. + * @param charset The {@link Charset} to validate against. + */ + public static boolean isText(ByteBuf buf, Charset charset) { + return isText(buf, buf.readerIndex(), buf.readableBytes(), charset); + } + + /** + * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid + * text using the given {@link Charset}, otherwise returns {@code false}. + * + * @param buf The given {@link ByteBuf}. + * @param index The start index of the specified buffer. + * @param length The length of the specified buffer. 
+ * @param charset The specified {@link Charset}. + * + * @throws IndexOutOfBoundsException if {@code index} + {@code length} exceeds {@code readerIndex + readableBytes} + */ + public static boolean isText(ByteBuf buf, int index, int length, Charset charset) { + checkNotNull(buf, "buf"); + checkNotNull(charset, "charset"); + final int maxIndex = buf.readerIndex() + buf.readableBytes(); + if (index < 0 || length < 0 || index > maxIndex - length) { + throw new IndexOutOfBoundsException("index: " + index + " length: " + length); + } + if (charset.equals(CharsetUtil.UTF_8)) { + return isUtf8(buf, index, length); + } else if (charset.equals(CharsetUtil.US_ASCII)) { + return isAscii(buf, index, length); + } else { + CharsetDecoder decoder = CharsetUtil.decoder(charset, CodingErrorAction.REPORT, CodingErrorAction.REPORT); + try { + if (buf.nioBufferCount() == 1) { + decoder.decode(buf.internalNioBuffer(index, length)); + } else { + ByteBuf heapBuffer = buf.alloc().heapBuffer(length); + try { + heapBuffer.writeBytes(buf, index, length); + decoder.decode(heapBuffer.internalNioBuffer(0, length)); + } finally { + heapBuffer.release(); + } + } + return true; + } catch (CharacterCodingException ignore) { + return false; + } + } + } + + /** + * Aborts on the first byte outside the 7-bit ASCII range, i.e. a negative {@code byte} value. + */ + private static final ByteProcessor FIND_NON_ASCII = new ByteProcessor() { + @Override + public boolean process(byte value) { + return value >= 0; + } + }; + + /** + * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid + * ASCII text, otherwise returns {@code false}. + * + * @param buf The given {@link ByteBuf}. + * @param index The start index of the specified buffer. + * @param length The length of the specified buffer. 
+ */ + private static boolean isAscii(ByteBuf buf, int index, int length) { + return buf.forEachByte(index, length, FIND_NON_ASCII) == -1; + } + + /** + * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid + * UTF-8 text, otherwise returns {@code false}. + * + * @param buf The given {@link ByteBuf}. + * @param index The start index of the specified buffer. + * @param length The length of the specified buffer. + * + * @see + * <a href="https://www.rfc-editor.org/rfc/rfc3629">UTF-8 Definition</a> + * + * <pre>
+     * 1. Bytes format of UTF-8
+     *
+     * The table below summarizes the format of these different octet types.
+     * The letter x indicates bits available for encoding bits of the character number.
+     *
+     * Char. number range  |        UTF-8 octet sequence
+     *    (hexadecimal)    |              (binary)
+     * --------------------+---------------------------------------------
+     * 0000 0000-0000 007F | 0xxxxxxx
+     * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+     * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+     * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+     * </pre>
+ * + * <pre>
+     * 2. Syntax of UTF-8 Byte Sequences
+     *
+     * UTF8-octets = *( UTF8-char )
+     * UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+     * UTF8-1      = %x00-7F
+     * UTF8-2      = %xC2-DF UTF8-tail
+     * UTF8-3      = %xE0 %xA0-BF UTF8-tail /
+     *               %xE1-EC 2( UTF8-tail ) /
+     *               %xED %x80-9F UTF8-tail /
+     *               %xEE-EF 2( UTF8-tail )
+     * UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
+     *               %xF1-F3 3( UTF8-tail ) /
+     *               %xF4 %x80-8F 2( UTF8-tail )
+     * UTF8-tail   = %x80-BF
+     * </pre>
+ */ + private static boolean isUtf8(ByteBuf buf, int index, int length) { + final int endIndex = index + length; + while (index < endIndex) { + byte b1 = buf.getByte(index++); + byte b2, b3, b4; + if ((b1 & 0x80) == 0) { + // 1 byte + continue; + } + if ((b1 & 0xE0) == 0xC0) { + // 2 bytes + // + // Bit/Byte pattern + // 110xxxxx 10xxxxxx + // C2..DF 80..BF + if (index >= endIndex) { // not enough bytes + return false; + } + b2 = buf.getByte(index++); + if ((b2 & 0xC0) != 0x80) { // 2nd byte does not start with 10 + return false; + } + if ((b1 & 0xFF) < 0xC2) { // overlong encoding (C0/C1 lead byte) + return false; + } + } else if ((b1 & 0xF0) == 0xE0) { + // 3 bytes + // + // Bit/Byte pattern + // 1110xxxx 10xxxxxx 10xxxxxx + // E0 A0..BF 80..BF + // E1..EC 80..BF 80..BF + // ED 80..9F 80..BF + // EE..EF 80..BF 80..BF + if (index > endIndex - 2) { // not enough bytes + return false; + } + b2 = buf.getByte(index++); + b3 = buf.getByte(index++); + if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80) { // 2nd or 3rd byte does not start with 10 + return false; + } + if ((b1 & 0x0F) == 0x00 && (b2 & 0xFF) < 0xA0) { // overlong encoding (E0 needs A0..BF) + return false; + } + if ((b1 & 0x0F) == 0x0D && (b2 & 0xFF) > 0x9F) { // surrogate range (ED needs 80..9F) + return false; + } + } else if ((b1 & 0xF8) == 0xF0) { + // 4 bytes + // + // Bit/Byte pattern + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // F0 90..BF 80..BF 80..BF + // F1..F3 80..BF 80..BF 80..BF + // F4 80..8F 80..BF 80..BF + if (index > endIndex - 3) { // not enough bytes + return false; + } + b2 = buf.getByte(index++); + b3 = buf.getByte(index++); + b4 = buf.getByte(index++); + if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 || (b4 & 0xC0) != 0x80) { + // 2nd, 3rd or 4th byte does not start with 10 + return false; + } + if ((b1 & 0xFF) > 0xF4 // lead byte above F4 + || (b1 & 0xFF) == 0xF0 && (b2 & 0xFF) < 0x90 // overlong encoding (F0 needs 90..BF) + || (b1 & 0xFF) == 0xF4 && (b2 & 0xFF) > 0x8F) { // beyond U+10FFFF (F4 needs 80..8F) + return false; + } + } else { // stray continuation byte (80..BF) or invalid lead byte (F8..FF) + return false; + } + } + return true; + } + 
private ByteBufUtil() { } } diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java index 3095b111f1..d516555877 100644 --- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java +++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java @@ -27,6 +27,7 @@ import static io.netty.util.ReferenceCountUtil.releaseLater; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; public class ByteBufUtilTest { @Test @@ -307,4 +308,100 @@ public class ByteBufUtilTest { buffer.release(); } } + + @Test + public void testIsTextWithUtf8() { + byte[][] validUtf8Bytes = new byte[][]{ + "netty".getBytes(CharsetUtil.UTF_8), + new byte[]{(byte) 0x24}, + new byte[]{(byte) 0xC2, (byte) 0xA2}, + new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xAC}, + new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88}, + new byte[]{(byte) 0x24, + (byte) 0xC2, (byte) 0xA2, + (byte) 0xE2, (byte) 0x82, (byte) 0xAC, + (byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88} // multiple characters + }; + for (byte[] bytes : validUtf8Bytes) { + assertIsText(bytes, true, CharsetUtil.UTF_8); + } + byte[][] invalidUtf8Bytes = new byte[][]{ + new byte[]{(byte) 0x80}, + new byte[]{(byte) 0xF0, (byte) 0x82, (byte) 0x82, (byte) 0xAC}, // overlong encoding + new byte[]{(byte) 0xC2}, // not enough bytes + new byte[]{(byte) 0xE2, (byte) 0x82}, // not enough bytes + new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D}, // not enough bytes + new byte[]{(byte) 0xC2, (byte) 0xC0}, // invalid continuation byte + new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xC0}, // invalid continuation byte + new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0xC0}, // invalid continuation byte + new byte[]{(byte) 0xC1, (byte) 0x80}, // out of lower bound + new byte[]{(byte) 0xE0, (byte) 0x80, (byte) 0x80}, // out of lower bound + new byte[]{(byte) 0xED, (byte) 
0xAF, (byte) 0x80} // out of upper bound + }; + for (byte[] bytes : invalidUtf8Bytes) { + assertIsText(bytes, false, CharsetUtil.UTF_8); + } + } + + @Test + public void testIsTextWithoutOptimization() { + byte[] validBytes = new byte[]{(byte) 0x01, (byte) 0xD8, (byte) 0x37, (byte) 0xDC}; + byte[] invalidBytes = new byte[]{(byte) 0x01, (byte) 0xD8}; + + assertIsText(validBytes, true, CharsetUtil.UTF_16LE); + assertIsText(invalidBytes, false, CharsetUtil.UTF_16LE); + } + + @Test + public void testIsTextWithAscii() { + byte[] validBytes = new byte[]{(byte) 0x00, (byte) 0x01, (byte) 0x37, (byte) 0x7F}; + byte[] invalidBytes = new byte[]{(byte) 0x80, (byte) 0xFF}; + + assertIsText(validBytes, true, CharsetUtil.US_ASCII); + assertIsText(invalidBytes, false, CharsetUtil.US_ASCII); + } + + @Test + public void testIsTextWithInvalidIndexAndLength() { + ByteBuf buffer = Unpooled.buffer(); + try { + buffer.writeBytes(new byte[4]); + int[][] validIndexLengthPairs = new int[][] { + new int[]{4, 0}, + new int[]{0, 4}, + new int[]{1, 3}, + }; + for (int[] pair : validIndexLengthPairs) { + assertTrue(ByteBufUtil.isText(buffer, pair[0], pair[1], CharsetUtil.US_ASCII)); + } + int[][] invalidIndexLengthPairs = new int[][]{ + new int[]{4, 1}, + new int[]{-1, 2}, + new int[]{3, -1}, + new int[]{3, -2}, + new int[]{5, 0}, + new int[]{1, 5}, + }; + for (int[] pair : invalidIndexLengthPairs) { + try { + ByteBufUtil.isText(buffer, pair[0], pair[1], CharsetUtil.US_ASCII); + fail("Expected IndexOutOfBoundsException"); + } catch (IndexOutOfBoundsException e) { + // expected + } + } + } finally { + buffer.release(); + } + } + + private static void assertIsText(byte[] bytes, boolean expected, Charset charset) { + ByteBuf buffer = Unpooled.buffer(); + try { + buffer.writeBytes(bytes); + assertEquals(expected, ByteBufUtil.isText(buffer, charset)); + } finally { + buffer.release(); + } + } }