Added exact utf8 length estimator and exposed writeUtf8 with custom space reservation on destination buffer
Motivation: To avoid eager allocation of the destination buffer and to allow length-prefixed encoding of UTF-8 strings with a forward-only access pattern. Modifications: The original writeUtf8 now allows customizing the number of bytes reserved on the destination buffer, and an exact UTF-8 length estimator is introduced. Result: It is now possible to perform length-first encoding of well-formed UTF-8 char sequences while following a forward-only write access pattern on the destination buffer.
This commit is contained in:
parent
dc3036a202
commit
bc8e022601
@ -475,14 +475,29 @@ public final class ByteBufUtil {
|
||||
/**
|
||||
* Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
|
||||
* it to a {@link ByteBuf}.
|
||||
*
|
||||
* <p>
|
||||
* It behaves like {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)} with {@code reserveBytes}
|
||||
* computed by {@link #utf8MaxBytes(CharSequence)}.<br>
|
||||
* This method returns the actual number of bytes written.
|
||||
*/
|
||||
public static int writeUtf8(ByteBuf buf, CharSequence seq) {
|
||||
return reserveAndWriteUtf8(buf, seq, utf8MaxBytes(seq));
|
||||
}
|
||||
|
||||
/**
|
||||
* Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
|
||||
* it into {@code reserveBytes} of a {@link ByteBuf}.
|
||||
* <p>
|
||||
* The {@code reserveBytes} must be computed beforehand (i.e. eagerly using {@link #utf8MaxBytes(CharSequence)}
|
||||
* or exactly using {@link #utf8Bytes(CharSequence)}) so that this method does not fail: for performance reasons
|
||||
* the index checks will be performed using just {@code reserveBytes}.<br>
|
||||
* This method returns the actual number of bytes written.
|
||||
*/
|
||||
public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
|
||||
for (;;) {
|
||||
if (buf instanceof AbstractByteBuf) {
|
||||
AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
|
||||
byteBuf.ensureWritable0(utf8MaxBytes(seq));
|
||||
byteBuf.ensureWritable0(reserveBytes);
|
||||
int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
|
||||
byteBuf.writerIndex += written;
|
||||
return written;
|
||||
@ -521,7 +536,7 @@ public final class ByteBufUtil {
|
||||
// duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
|
||||
// re-throw a more informative exception describing the problem.
|
||||
c2 = seq.charAt(++i);
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
} catch (IndexOutOfBoundsException ignored) {
|
||||
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
|
||||
break;
|
||||
}
|
||||
@ -545,11 +560,77 @@ public final class ByteBufUtil {
|
||||
return writerIndex - oldWriterIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns max bytes length of UTF8 character sequence of the given length.
|
||||
*/
|
||||
public static int utf8MaxBytes(final int seqLength) {
|
||||
return seqLength * MAX_BYTES_PER_CHAR_UTF8;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns max bytes length of UTF8 character sequence.
|
||||
* <p>
|
||||
* It behaves like {@link #utf8MaxBytes(int)} applied to {@code seq} {@link CharSequence#length()}.
|
||||
*/
|
||||
public static int utf8MaxBytes(CharSequence seq) {
|
||||
return seq.length() * MAX_BYTES_PER_CHAR_UTF8;
|
||||
return utf8MaxBytes(seq.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the exact bytes length of UTF8 character sequence.
|
||||
* <p>
|
||||
* This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence)}.
|
||||
*/
|
||||
public static int utf8Bytes(final CharSequence seq) {
|
||||
if (seq instanceof AsciiString) {
|
||||
return seq.length();
|
||||
}
|
||||
int seqLength = seq.length();
|
||||
int i = 0;
|
||||
// ASCII fast path
|
||||
while (i < seqLength && seq.charAt(i) < 0x80) {
|
||||
++i;
|
||||
}
|
||||
// !ASCII is packed in a separate method to let the ASCII case be smaller
|
||||
return i < seqLength ? i + utf8Bytes(seq, i, seqLength) : i;
|
||||
}
|
||||
|
||||
private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
|
||||
int encodedLength = 0;
|
||||
for (int i = start; i < length; i++) {
|
||||
final char c = seq.charAt(i);
|
||||
// making it 100% branchless isn't rewarding due to the many bit operations necessary!
|
||||
if (c < 0x800) {
|
||||
// branchless version of: (c <= 127 ? 0:1) + 1
|
||||
encodedLength += ((0x7f - c) >>> 31) + 1;
|
||||
} else if (isSurrogate(c)) {
|
||||
if (!Character.isHighSurrogate(c)) {
|
||||
encodedLength++;
|
||||
// WRITE_UTF_UNKNOWN
|
||||
continue;
|
||||
}
|
||||
final char c2;
|
||||
try {
|
||||
// Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
|
||||
// duplicate bounds checking with charAt.
|
||||
c2 = seq.charAt(++i);
|
||||
} catch (IndexOutOfBoundsException ignored) {
|
||||
encodedLength++;
|
||||
// WRITE_UTF_UNKNOWN
|
||||
break;
|
||||
}
|
||||
if (!Character.isLowSurrogate(c2)) {
|
||||
// WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
|
||||
encodedLength += 2;
|
||||
continue;
|
||||
}
|
||||
// See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
|
||||
encodedLength += 4;
|
||||
} else {
|
||||
encodedLength += 3;
|
||||
}
|
||||
}
|
||||
return encodedLength;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -267,6 +267,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -285,6 +286,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -303,6 +305,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -322,6 +325,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -341,7 +345,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
buf.release();
|
||||
buf2.release();
|
||||
}
|
||||
@ -360,6 +364,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -376,6 +381,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -392,6 +398,7 @@ public class ByteBufUtilTest {
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
@ -546,6 +553,42 @@ public class ByteBufUtilTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf8Bytes() {
|
||||
final String s = "Some UTF-8 like äÄ∏ŒŒ";
|
||||
checkUtf8Bytes(s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf8BytesWithSurrogates() {
|
||||
final String s = "a\uD800\uDC00b";
|
||||
checkUtf8Bytes(s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf8BytesWithNonSurrogates3Bytes() {
|
||||
final String s = "a\uE000b";
|
||||
checkUtf8Bytes(s);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUtf8BytesWithNonSurrogatesNonAscii() {
|
||||
final char nonAscii = (char) 0x81;
|
||||
final String s = "a" + nonAscii + "b";
|
||||
checkUtf8Bytes(s);
|
||||
}
|
||||
|
||||
private static void checkUtf8Bytes(final CharSequence charSequence) {
|
||||
final ByteBuf buf = Unpooled.buffer(ByteBufUtil.utf8MaxBytes(charSequence));
|
||||
try {
|
||||
final int writtenBytes = ByteBufUtil.writeUtf8(buf, charSequence);
|
||||
final int utf8Bytes = ByteBufUtil.utf8Bytes(charSequence);
|
||||
assertEquals(writtenBytes, utf8Bytes);
|
||||
} finally {
|
||||
buf.release();
|
||||
}
|
||||
}
|
||||
|
||||
private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
|
||||
ByteBuf buffer = Unpooled.buffer();
|
||||
try {
|
||||
|
Loading…
x
Reference in New Issue
Block a user