Added isText
to validate if a ByteBuf is compliant with the specified charset.
Motivation: See #82. Modifications: - Added `isText` to validate whether the given ByteBuf is compliant with the specified charset. - Optimized for UTF-8 and US-ASCII; for all other charsets, a `CharsetDecoder` obtained via `CharsetUtil.decoder` is used to decode the bytes. Result: Users can validate a ByteBuf against a given charset.
This commit is contained in:
parent
26e67171a3
commit
8ae0b4530b
@ -33,6 +33,7 @@ import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.CoderResult;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.util.Locale;
|
||||
|
||||
import static io.netty.util.internal.MathUtil.isOutOfBounds;
|
||||
@ -893,5 +894,202 @@ public final class ByteBufUtil {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns {@code true} if the given {@link ByteBuf} is valid text using the given {@link Charset},
|
||||
* otherwise return {@code false}.
|
||||
*
|
||||
* @param buf The given {@link ByteBuf}.
|
||||
* @param charset The specified {@link Charset}.
|
||||
*/
|
||||
public static boolean isText(ByteBuf buf, Charset charset) {
|
||||
return isText(buf, buf.readerIndex(), buf.readableBytes(), charset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
|
||||
* text using the given {@link Charset}, otherwise return {@code false}.
|
||||
*
|
||||
* @param buf The given {@link ByteBuf}.
|
||||
* @param index The start index of the specified buffer.
|
||||
* @param length The length of the specified buffer.
|
||||
* @param charset The specified {@link Charset}.
|
||||
*
|
||||
* @throws IndexOutOfBoundsException if {@code index} + {@code length} is greater than {@code buf.readableBytes}
|
||||
*/
|
||||
public static boolean isText(ByteBuf buf, int index, int length, Charset charset) {
|
||||
checkNotNull(buf, "buf");
|
||||
checkNotNull(charset, "charset");
|
||||
final int maxIndex = buf.readerIndex() + buf.readableBytes();
|
||||
if (index < 0 || length < 0 || index > maxIndex - length) {
|
||||
throw new IndexOutOfBoundsException("index: " + index + " length: " + length);
|
||||
}
|
||||
if (charset.equals(CharsetUtil.UTF_8)) {
|
||||
return isUtf8(buf, index, length);
|
||||
} else if (charset.equals(CharsetUtil.US_ASCII)) {
|
||||
return isAscii(buf, index, length);
|
||||
} else {
|
||||
CharsetDecoder decoder = CharsetUtil.decoder(charset, CodingErrorAction.REPORT, CodingErrorAction.REPORT);
|
||||
try {
|
||||
if (buf.nioBufferCount() == 1) {
|
||||
decoder.decode(buf.internalNioBuffer(index, length));
|
||||
} else {
|
||||
ByteBuf heapBuffer = buf.alloc().heapBuffer(length);
|
||||
try {
|
||||
heapBuffer.writeBytes(buf, index, length);
|
||||
decoder.decode(heapBuffer.internalNioBuffer(0, length));
|
||||
} finally {
|
||||
heapBuffer.release();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} catch (CharacterCodingException ignore) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Aborts on a byte which is not a valid ASCII character.
|
||||
*/
|
||||
private static final ByteBufProcessor FIND_NON_ASCII = new ByteBufProcessor() {
|
||||
@Override
|
||||
public boolean process(byte value) {
|
||||
return value >= 0;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
|
||||
* ASCII text, otherwise return {@code false}.
|
||||
*
|
||||
* @param buf The given {@link ByteBuf}.
|
||||
* @param index The start index of the specified buffer.
|
||||
* @param length The length of the specified buffer.
|
||||
*/
|
||||
private static boolean isAscii(ByteBuf buf, int index, int length) {
|
||||
return buf.forEachByte(index, length, FIND_NON_ASCII) == -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns {@code true} if the specified {@link ByteBuf} starting at {@code index} with {@code length} is valid
|
||||
* UTF8 text, otherwise return {@code false}.
|
||||
*
|
||||
* @param buf The given {@link ByteBuf}.
|
||||
* @param index The start index of the specified buffer.
|
||||
* @param length The length of the specified buffer.
|
||||
*
|
||||
* @see
|
||||
* <a href=http://www.ietf.org/rfc/rfc3629.txt>UTF-8 Definition</a>
|
||||
*
|
||||
* <pre>
|
||||
* 1. Bytes format of UTF-8
|
||||
*
|
||||
* The table below summarizes the format of these different octet types.
|
||||
* The letter x indicates bits available for encoding bits of the character number.
|
||||
*
|
||||
* Char. number range | UTF-8 octet sequence
|
||||
* (hexadecimal) | (binary)
|
||||
* --------------------+---------------------------------------------
|
||||
* 0000 0000-0000 007F | 0xxxxxxx
|
||||
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
|
||||
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
* </pre>
|
||||
*
|
||||
* <pre>
|
||||
* 2. Syntax of UTF-8 Byte Sequences
|
||||
*
|
||||
* UTF8-octets = *( UTF8-char )
|
||||
* UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
|
||||
* UTF8-1 = %x00-7F
|
||||
* UTF8-2 = %xC2-DF UTF8-tail
|
||||
* UTF8-3 = %xE0 %xA0-BF UTF8-tail /
|
||||
* %xE1-EC 2( UTF8-tail ) /
|
||||
* %xED %x80-9F UTF8-tail /
|
||||
* %xEE-EF 2( UTF8-tail )
|
||||
* UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
|
||||
* %xF1-F3 3( UTF8-tail ) /
|
||||
* %xF4 %x80-8F 2( UTF8-tail )
|
||||
* UTF8-tail = %x80-BF
|
||||
* </pre>
|
||||
*/
|
||||
private static boolean isUtf8(ByteBuf buf, int index, int length) {
|
||||
final int endIndex = index + length;
|
||||
while (index < endIndex) {
|
||||
byte b1 = buf.getByte(index++);
|
||||
byte b2, b3, b4;
|
||||
if ((b1 & 0x80) == 0) {
|
||||
// 1 byte
|
||||
continue;
|
||||
}
|
||||
if ((b1 & 0xE0) == 0xC0) {
|
||||
// 2 bytes
|
||||
//
|
||||
// Bit/Byte pattern
|
||||
// 110xxxxx 10xxxxxx
|
||||
// C2..DF 80..BF
|
||||
if (index >= endIndex) { // no enough bytes
|
||||
return false;
|
||||
}
|
||||
b2 = buf.getByte(index++);
|
||||
if ((b2 & 0xC0) != 0x80) { // 2nd byte not starts with 10
|
||||
return false;
|
||||
}
|
||||
if ((b1 & 0xFF) < 0xC2) { // out of lower bound
|
||||
return false;
|
||||
}
|
||||
} else if ((b1 & 0xF0) == 0xE0) {
|
||||
// 3 bytes
|
||||
//
|
||||
// Bit/Byte pattern
|
||||
// 1110xxxx 10xxxxxx 10xxxxxx
|
||||
// E0 A0..BF 80..BF
|
||||
// E1..EC 80..BF 80..BF
|
||||
// ED 80..9F 80..BF
|
||||
// E1..EF 80..BF 80..BF
|
||||
if (index > endIndex - 2) { // no enough bytes
|
||||
return false;
|
||||
}
|
||||
b2 = buf.getByte(index++);
|
||||
b3 = buf.getByte(index++);
|
||||
if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80) { // 2nd or 3rd bytes not start with 10
|
||||
return false;
|
||||
}
|
||||
if ((b1 & 0x0F) == 0x00 && (b2 & 0xFF) < 0xA0) { // out of lower bound
|
||||
return false;
|
||||
}
|
||||
if ((b1 & 0x0F) == 0x0D && (b2 & 0xFF) > 0x9F) { // out of upper bound
|
||||
return false;
|
||||
}
|
||||
} else if ((b1 & 0xF8) == 0xF0) {
|
||||
// 4 bytes
|
||||
//
|
||||
// Bit/Byte pattern
|
||||
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
// F0 90..BF 80..BF 80..BF
|
||||
// F1..F3 80..BF 80..BF 80..BF
|
||||
// F4 80..8F 80..BF 80..BF
|
||||
if (index > endIndex - 3) { // no enough bytes
|
||||
return false;
|
||||
}
|
||||
b2 = buf.getByte(index++);
|
||||
b3 = buf.getByte(index++);
|
||||
b4 = buf.getByte(index++);
|
||||
if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 || (b4 & 0xC0) != 0x80) {
|
||||
// 2nd, 3rd or 4th bytes not start with 10
|
||||
return false;
|
||||
}
|
||||
if ((b1 & 0xFF) > 0xF4 // b1 invalid
|
||||
|| (b1 & 0xFF) == 0xF0 && (b2 & 0xFF) < 0x90 // b2 out of lower bound
|
||||
|| (b1 & 0xFF) == 0xF4 && (b2 & 0xFF) > 0x8F) { // b2 out of upper bound
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private ByteBufUtil() {
    // Private so that this class cannot be instantiated from outside.
}
|
||||
}
|
||||
|
@ -24,6 +24,7 @@ import static io.netty.buffer.Unpooled.unreleasableBuffer;
|
||||
import static io.netty.util.ReferenceCountUtil.releaseLater;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
public class ByteBufUtilTest {
|
||||
|
||||
@ -228,4 +229,100 @@ public class ByteBufUtilTest {
|
||||
buffer.release();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIsTextWithUtf8() {
|
||||
byte[][] validUtf8Bytes = new byte[][]{
|
||||
"netty".getBytes(CharsetUtil.UTF_8),
|
||||
new byte[]{(byte) 0x24},
|
||||
new byte[]{(byte) 0xC2, (byte) 0xA2},
|
||||
new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xAC},
|
||||
new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88},
|
||||
new byte[]{(byte) 0x24,
|
||||
(byte) 0xC2, (byte) 0xA2,
|
||||
(byte) 0xE2, (byte) 0x82, (byte) 0xAC,
|
||||
(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0x88} // multiple characters
|
||||
};
|
||||
for (byte[] bytes : validUtf8Bytes) {
|
||||
assertIsText(bytes, true, CharsetUtil.UTF_8);
|
||||
}
|
||||
byte[][] invalidUtf8Bytes = new byte[][]{
|
||||
new byte[]{(byte) 0x80},
|
||||
new byte[]{(byte) 0xF0, (byte) 0x82, (byte) 0x82, (byte) 0xAC}, // Overlong encodings
|
||||
new byte[]{(byte) 0xC2}, // not enough bytes
|
||||
new byte[]{(byte) 0xE2, (byte) 0x82}, // not enough bytes
|
||||
new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D}, // not enough bytes
|
||||
new byte[]{(byte) 0xC2, (byte) 0xC0}, // not correct bytes
|
||||
new byte[]{(byte) 0xE2, (byte) 0x82, (byte) 0xC0}, // not correct bytes
|
||||
new byte[]{(byte) 0xF0, (byte) 0x90, (byte) 0x8D, (byte) 0xC0}, // not correct bytes
|
||||
new byte[]{(byte) 0xC1, (byte) 0x80}, // out of lower bound
|
||||
new byte[]{(byte) 0xE0, (byte) 0x80, (byte) 0x80}, // out of lower bound
|
||||
new byte[]{(byte) 0xED, (byte) 0xAF, (byte) 0x80} // out of upper bound
|
||||
};
|
||||
for (byte[] bytes : invalidUtf8Bytes) {
|
||||
assertIsText(bytes, false, CharsetUtil.UTF_8);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIsTextWithoutOptimization() {
|
||||
byte[] validBytes = new byte[]{(byte) 0x01, (byte) 0xD8, (byte) 0x37, (byte) 0xDC};
|
||||
byte[] invalidBytes = new byte[]{(byte) 0x01, (byte) 0xD8};
|
||||
|
||||
assertIsText(validBytes, true, CharsetUtil.UTF_16LE);
|
||||
assertIsText(invalidBytes, false, CharsetUtil.UTF_16LE);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIsTextWithAscii() {
|
||||
byte[] validBytes = new byte[]{(byte) 0x00, (byte) 0x01, (byte) 0x37, (byte) 0x7F};
|
||||
byte[] invalidBytes = new byte[]{(byte) 0x80, (byte) 0xFF};
|
||||
|
||||
assertIsText(validBytes, true, CharsetUtil.US_ASCII);
|
||||
assertIsText(invalidBytes, false, CharsetUtil.US_ASCII);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIsTextWithInvalidIndexAndLength() {
|
||||
ByteBuf buffer = Unpooled.buffer();
|
||||
try {
|
||||
buffer.writeBytes(new byte[4]);
|
||||
int[][] validIndexLengthPairs = new int[][] {
|
||||
new int[]{4, 0},
|
||||
new int[]{0, 4},
|
||||
new int[]{1, 3},
|
||||
};
|
||||
for (int[] pair : validIndexLengthPairs) {
|
||||
assertTrue(ByteBufUtil.isText(buffer, pair[0], pair[1], CharsetUtil.US_ASCII));
|
||||
}
|
||||
int[][] invalidIndexLengthPairs = new int[][]{
|
||||
new int[]{4, 1},
|
||||
new int[]{-1, 2},
|
||||
new int[]{3, -1},
|
||||
new int[]{3, -2},
|
||||
new int[]{5, 0},
|
||||
new int[]{1, 5},
|
||||
};
|
||||
for (int[] pair : invalidIndexLengthPairs) {
|
||||
try {
|
||||
ByteBufUtil.isText(buffer, pair[0], pair[1], CharsetUtil.US_ASCII);
|
||||
fail("Expected IndexOutOfBoundsException");
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
buffer.release();
|
||||
}
|
||||
}
|
||||
|
||||
private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
|
||||
ByteBuf buffer = Unpooled.buffer();
|
||||
try {
|
||||
buffer.writeBytes(bytes);
|
||||
assertEquals(expected, ByteBufUtil.isText(buffer, charset));
|
||||
} finally {
|
||||
buffer.release();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user