Added isText to validate if a ByteBuf is compliant with the specified charset.

Motivation:

See #82.

Modifications:

- Added `isText` to validate if the given ByteBuf is compliant with the specified charset.
- Optimized for UTF-8 and ASCII. For all other charsets, a `CharsetDecoder` obtained from `CharsetUtil.decoder` is used to decode the bytes.

Result:

Users can validate a ByteBuf against a given charset.
This commit is contained in:
Xiaoyan Lin 2016-06-16 22:17:35 -07:00 committed by Norman Maurer
parent c7a0a0f325
commit 9602535b7d
2 changed files with 295 additions and 0 deletions

View File

@ -35,6 +35,7 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.Locale;
@ -999,5 +1000,202 @@ public final class ByteBufUtil {
}
}
/**
 * Checks whether the readable region of the given {@link ByteBuf} is valid text in the given {@link Charset}.
 * <p>
 * This is a convenience overload that validates the region starting at {@code buf.readerIndex()} and spanning
 * {@code buf.readableBytes()} bytes.
 *
 * @param buf The given {@link ByteBuf}.
 * @param charset The specified {@link Charset}.
 *
 * @return {@code true} if the readable bytes are valid text for {@code charset}, otherwise {@code false}.
 */
public static boolean isText(ByteBuf buf, Charset charset) {
    final int start = buf.readerIndex();
    final int readable = buf.readableBytes();
    return isText(buf, start, readable, charset);
}
/**
 * Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
 * text using the given {@link Charset}, otherwise return {@code false}.
 * <p>
 * UTF-8 and US-ASCII are validated by dedicated byte scanners. Every other charset is validated by running a
 * {@link CharsetDecoder} that is configured to report (rather than replace) malformed input and unmappable
 * characters.
 *
 * @param buf The given {@link ByteBuf}.
 * @param index The start index of the specified buffer.
 * @param length The length of the specified buffer.
 * @param charset The specified {@link Charset}.
 *
 * @return {@code true} if the region is valid text for {@code charset}, otherwise {@code false}.
 *
 * @throws IndexOutOfBoundsException if {@code index} or {@code length} is negative, or if
 *         {@code index} + {@code length} is greater than {@code buf.readerIndex()} + {@code buf.readableBytes()}
 */
public static boolean isText(ByteBuf buf, int index, int length, Charset charset) {
    checkNotNull(buf, "buf");
    checkNotNull(charset, "charset");
    final int maxIndex = buf.readerIndex() + buf.readableBytes();
    // Compare via subtraction so that a large index + length cannot overflow and slip past the check.
    if (index < 0 || length < 0 || index > maxIndex - length) {
        throw new IndexOutOfBoundsException("index: " + index + " length: " + length);
    }
    if (charset.equals(CharsetUtil.UTF_8)) {
        return isUtf8(buf, index, length);
    } else if (charset.equals(CharsetUtil.US_ASCII)) {
        return isAscii(buf, index, length);
    } else {
        CharsetDecoder decoder = CharsetUtil.decoder(charset, CodingErrorAction.REPORT, CodingErrorAction.REPORT);
        try {
            if (buf.nioBufferCount() == 1) {
                // Use nioBuffer(...) instead of internalNioBuffer(...): the internal NIO view is shared state
                // of the caller's buffer, and decoding advances the position of whatever ByteBuffer it is
                // handed. A validity check must not touch that shared view, otherwise concurrent readers of
                // the same buffer can interfere with each other.
                decoder.decode(buf.nioBuffer(index, length));
            } else {
                // The buffer is not exposed as exactly one NIO buffer, so copy the region into a temporary
                // heap buffer and decode from there.
                ByteBuf heapBuffer = buf.alloc().heapBuffer(length);
                try {
                    heapBuffer.writeBytes(buf, index, length);
                    // heapBuffer is private to this call, so using its internal NIO view is safe here.
                    decoder.decode(heapBuffer.internalNioBuffer(0, length));
                } finally {
                    heapBuffer.release();
                }
            }
            return true;
        } catch (CharacterCodingException ignore) {
            // The decoder reported malformed or unmappable input: the bytes are not valid text.
            return false;
        }
    }
}
/**
 * {@link ByteProcessor} that accepts bytes whose high bit is clear (the 7-bit ASCII range) and
 * aborts on the first byte whose high bit is set.
 */
private static final ByteProcessor FIND_NON_ASCII = new ByteProcessor() {
    @Override
    public boolean process(byte value) {
        // A byte is non-negative exactly when its high bit is clear.
        return (value & 0x80) == 0;
    }
};
/**
 * Checks whether the {@code length} bytes of the given {@link ByteBuf} starting at {@code index} are all
 * ASCII.
 * <p>
 * The region is scanned with {@link #FIND_NON_ASCII}; it is considered ASCII when that scan yields {@code -1}.
 *
 * @param buf The given {@link ByteBuf}.
 * @param index The start index of the specified buffer.
 * @param length The length of the specified buffer.
 *
 * @return {@code true} if the region is valid ASCII text, otherwise {@code false}.
 */
private static boolean isAscii(ByteBuf buf, int index, int length) {
    final int firstNonAscii = buf.forEachByte(index, length, FIND_NON_ASCII);
    return firstNonAscii == -1;
}
/**
 * Checks whether the {@code length} bytes of the given {@link ByteBuf} starting at {@code index} form
 * well-formed UTF-8, following the byte-sequence syntax of RFC 3629.
 *
 * <pre>
 * 1. Bytes format of UTF-8
 *
 * The table below summarizes the format of these different octet types.
 * The letter x indicates bits available for encoding bits of the character number.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 * --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * </pre>
 *
 * <pre>
 * 2. Syntax of UTF-8 Byte Sequences
 *
 * UTF8-octets = *( UTF8-char )
 * UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
 * UTF8-1      = %x00-7F
 * UTF8-2      = %xC2-DF UTF8-tail
 * UTF8-3      = %xE0 %xA0-BF UTF8-tail /
 *               %xE1-EC 2( UTF8-tail ) /
 *               %xED %x80-9F UTF8-tail /
 *               %xEE-EF 2( UTF8-tail )
 * UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
 *               %xF1-F3 3( UTF8-tail ) /
 *               %xF4 %x80-8F 2( UTF8-tail )
 * UTF8-tail   = %x80-BF
 * </pre>
 *
 * @param buf The given {@link ByteBuf}.
 * @param index The start index of the specified buffer.
 * @param length The length of the specified buffer.
 *
 * @return {@code true} if the region is valid UTF-8 text, otherwise {@code false}.
 *
 * @see <a href="http://www.ietf.org/rfc/rfc3629.txt">UTF-8 Definition</a>
 */
private static boolean isUtf8(ByteBuf buf, int index, int length) {
    final int limit = index + length;
    int pos = index;
    while (pos < limit) {
        // Work on unsigned values so the ranges below read exactly like the RFC tables.
        final int lead = buf.getByte(pos++) & 0xFF;
        if (lead < 0x80) {
            // 0xxxxxxx: a single-octet character, nothing further to verify.
            continue;
        }
        if (lead >= 0xC0 && lead <= 0xDF) {
            // Two octets:
            //      110xxxxx  10xxxxxx
            //      C2..DF    80..BF
            if (pos >= limit) {
                // sequence is truncated
                return false;
            }
            final int second = buf.getByte(pos++) & 0xFF;
            if (second < 0x80 || second > 0xBF) {
                // second octet is not a tail octet (10xxxxxx)
                return false;
            }
            if (lead < 0xC2) {
                // C0 and C1 lie below the permitted lead range
                return false;
            }
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            // Three octets:
            //      1110xxxx  10xxxxxx  10xxxxxx
            //      E0        A0..BF    80..BF
            //      E1..EC    80..BF    80..BF
            //      ED        80..9F    80..BF
            //      EE..EF    80..BF    80..BF
            if (pos > limit - 2) {
                // sequence is truncated
                return false;
            }
            final int second = buf.getByte(pos++) & 0xFF;
            final int third = buf.getByte(pos++) & 0xFF;
            if (second < 0x80 || second > 0xBF || third < 0x80 || third > 0xBF) {
                // second or third octet is not a tail octet (10xxxxxx)
                return false;
            }
            if (lead == 0xE0 && second < 0xA0) {
                // below the lower bound for an E0 lead
                return false;
            }
            if (lead == 0xED && second > 0x9F) {
                // above the upper bound for an ED lead
                return false;
            }
        } else if (lead >= 0xF0 && lead <= 0xF7) {
            // Four octets:
            //      11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
            //      F0        90..BF    80..BF    80..BF
            //      F1..F3    80..BF    80..BF    80..BF
            //      F4        80..8F    80..BF    80..BF
            if (pos > limit - 3) {
                // sequence is truncated
                return false;
            }
            final int second = buf.getByte(pos++) & 0xFF;
            final int third = buf.getByte(pos++) & 0xFF;
            final int fourth = buf.getByte(pos++) & 0xFF;
            if (second < 0x80 || second > 0xBF
                    || third < 0x80 || third > 0xBF
                    || fourth < 0x80 || fourth > 0xBF) {
                // second, third or fourth octet is not a tail octet (10xxxxxx)
                return false;
            }
            if (lead > 0xF4) {
                // F5..F7 are not permitted lead octets
                return false;
            }
            if (lead == 0xF0 && second < 0x90) {
                // below the lower bound for an F0 lead
                return false;
            }
            if (lead == 0xF4 && second > 0x8F) {
                // above the upper bound for an F4 lead
                return false;
            }
        } else {
            // 80..BF (a tail octet in lead position) or F8..FF: never a valid lead octet.
            return false;
        }
    }
    return true;
}
private ByteBufUtil() {
    // Private constructor: prevents instantiation from outside this class.
}
}

View File

@ -27,6 +27,7 @@ import static io.netty.util.ReferenceCountUtil.releaseLater;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
public class ByteBufUtilTest {
@Test
@ -307,4 +308,100 @@ public class ByteBufUtilTest {
buffer.release();
}
}
/**
 * Verifies {@code ByteBufUtil.isText} with UTF-8 against well-formed sequences of every length and against
 * malformed input: truncated sequences, bad tail octets, overlong forms, surrogates and out-of-range leads.
 * Boundary values of each lead/second-octet range from RFC 3629 are covered explicitly.
 */
@Test
public void testIsTextWithUtf8() {
    byte[][] validUtf8Bytes = new byte[][]{
        "netty".getBytes(CharsetUtil.UTF_8),
        new byte[0], // empty input contains no invalid sequence
        new byte[]{(byte) 0x24},
        new byte[]{(byte) 0xC2, (byte) 0xA2},
        new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xAC},
        new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88},
        new byte[]{(byte) 0x24,
                (byte) 0xC2, (byte) 0xA2,
                (byte) 0xE2, (byte) 0x82, (byte) 0xAC,
                (byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88}, // multiple characters
        new byte[]{(byte) 0xC2, (byte) 0x80}, // lowest 2-byte sequence
        new byte[]{(byte) 0xDF, (byte) 0xBF}, // highest 2-byte sequence
        new byte[]{(byte) 0xE0, (byte) 0xA0, (byte) 0x80}, // lowest 3-byte sequence
        new byte[]{(byte) 0xED, (byte) 0x9F, (byte) 0xBF}, // highest sequence with an ED lead
        new byte[]{(byte) 0xEF, (byte) 0xBF, (byte) 0xBF}, // highest 3-byte sequence
        new byte[]{(byte) 0xF4, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF} // highest 4-byte sequence
    };
    for (byte[] bytes : validUtf8Bytes) {
        assertIsText(bytes, true, CharsetUtil.UTF_8);
    }
    byte[][] invalidUtf8Bytes = new byte[][]{
        new byte[]{(byte) 0x80},
        new byte[]{(byte) 0xF0, (byte) 0x82, (byte) 0x82, (byte) 0xAC}, // Overlong encodings
        new byte[]{(byte) 0xC2}, // not enough bytes
        new byte[]{(byte) 0xE2, (byte) 0x82}, // not enough bytes
        new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D}, // not enough bytes
        new byte[]{(byte) 0xC2, (byte) 0xC0}, // not correct bytes
        new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xC0}, // not correct bytes
        new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0xC0}, // not correct bytes
        new byte[]{(byte) 0xC1, (byte) 0x80}, // out of lower bound
        new byte[]{(byte) 0xE0, (byte) 0x80, (byte) 0x80}, // out of lower bound
        new byte[]{(byte) 0xED, (byte) 0xAF, (byte) 0x80}, // out of upper bound
        new byte[]{(byte) 0xF0, (byte) 0x8F, (byte) 0xBF, (byte) 0xBF}, // out of lower bound (F0 lead)
        new byte[]{(byte) 0xF4, (byte) 0x90, (byte) 0x80, (byte) 0x80}, // out of upper bound (F4 lead)
        new byte[]{(byte) 0xF5, (byte) 0x80, (byte) 0x80, (byte) 0x80}, // lead byte above F4
        new byte[]{(byte) 0xF8, (byte) 0x88, (byte) 0x80, (byte) 0x80, (byte) 0x80}, // 5-byte form
        new byte[]{(byte) 0x24, (byte) 0x80} // stray tail byte after valid text
    };
    for (byte[] bytes : invalidUtf8Bytes) {
        assertIsText(bytes, false, CharsetUtil.UTF_8);
    }
}
/**
 * Exercises {@code ByteBufUtil.isText} with UTF-16LE, a charset other than UTF-8 and US-ASCII.
 */
@Test
public void testIsTextWithoutOptimization() {
    // 0xD801 followed by 0xDC37 in little-endian order: a complete surrogate pair.
    final byte[] surrogatePair = {(byte) 0x01, (byte) 0xD8, (byte) 0x37, (byte) 0xDC};
    assertIsText(surrogatePair, true, CharsetUtil.UTF_16LE);

    // 0xD801 on its own: a high surrogate with no partner.
    final byte[] loneSurrogate = {(byte) 0x01, (byte) 0xD8};
    assertIsText(loneSurrogate, false, CharsetUtil.UTF_16LE);
}
/**
 * Exercises {@code ByteBufUtil.isText} with US-ASCII: bytes up to 0x7F are accepted, bytes from 0x80 are not.
 */
@Test
public void testIsTextWithAscii() {
    final byte[] sevenBitOnly = {(byte) 0x00, (byte) 0x01, (byte) 0x37, (byte) 0x7F};
    assertIsText(sevenBitOnly, true, CharsetUtil.US_ASCII);

    final byte[] highBitSet = {(byte) 0x80, (byte) 0xFF};
    assertIsText(highBitSet, false, CharsetUtil.US_ASCII);
}
/**
 * Checks the bounds validation of {@code ByteBufUtil.isText(buf, index, length, charset)} on a buffer that
 * holds four zero bytes: regions inside the written bytes are accepted, everything else must raise
 * {@link IndexOutOfBoundsException}.
 */
@Test
public void testIsTextWithInvalidIndexAndLength() {
    final ByteBuf buf = Unpooled.buffer();
    try {
        buf.writeBytes(new byte[4]);

        // Each entry is {index, length}.
        final int[][] accepted = {
            {4, 0},
            {0, 4},
            {1, 3},
        };
        for (int i = 0; i < accepted.length; i++) {
            final int index = accepted[i][0];
            final int length = accepted[i][1];
            assertTrue(ByteBufUtil.isText(buf, index, length, CharsetUtil.US_ASCII));
        }

        // Each entry is {index, length}.
        final int[][] rejected = {
            {4, 1},
            {-1, 2},
            {3, -1},
            {3, -2},
            {5, 0},
            {1, 5},
        };
        for (int i = 0; i < rejected.length; i++) {
            final int index = rejected[i][0];
            final int length = rejected[i][1];
            try {
                ByteBufUtil.isText(buf, index, length, CharsetUtil.US_ASCII);
                fail("Expected IndexOutOfBoundsException");
            } catch (IndexOutOfBoundsException e) {
                // expected
            }
        }
    } finally {
        buf.release();
    }
}
/**
 * Writes {@code bytes} into a fresh buffer and asserts that {@code ByteBufUtil.isText(buffer, charset)}
 * yields {@code expected}. The buffer is released whether or not the assertion passes.
 */
private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
    final ByteBuf buf = Unpooled.buffer();
    try {
        buf.writeBytes(bytes);
        final boolean actual = ByteBufUtil.isText(buf, charset);
        assertEquals(expected, actual);
    } finally {
        buf.release();
    }
}
}