Added exact utf8 length estimator and exposed writeUtf8 with custom space reservation on destination buffer

Motivation:

To avoid eagerly over-allocating the destination buffer and to allow length-prefixed encoding of UTF-8 strings with a forward-only access pattern.

Modifications:

The original writeUtf8 is modified to allow customization of the number of bytes reserved on the destination buffer, and an exact UTF-8 length estimator is introduced.

Result:

It is now possible to perform length-first UTF-8 encoding of well-formed char sequences following a forward-only write access pattern on the destination buffer.
This commit is contained in:
Francesco Nigro 2017-12-22 09:07:03 +01:00 committed by Norman Maurer
parent dc3036a202
commit bc8e022601
2 changed files with 129 additions and 5 deletions

View File

@ -475,14 +475,29 @@ public final class ByteBufUtil {
/**
 * Encodes the given {@link CharSequence} as
 * <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and appends it to a {@link ByteBuf}.
 * <p>
 * This is a convenience form of {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)} in which
 * {@code reserveBytes} is the worst-case size returned by {@link #utf8MaxBytes(CharSequence)}.
 *
 * @param buf the destination buffer
 * @param seq the characters to encode
 * @return the actual number of bytes written
 */
public static int writeUtf8(ByteBuf buf, CharSequence seq) {
    // Reserve for the worst case up front; the callee reports how many bytes were really used.
    final int worstCaseBytes = utf8MaxBytes(seq);
    return reserveAndWriteUtf8(buf, seq, worstCaseBytes);
}
/**
* Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
* it into {@code reserveBytes} of a {@link ByteBuf}.
* <p>
* The {@code reserveBytes} must be computed (i.e. eagerly using {@link #utf8MaxBytes(CharSequence)}
* or exactly with {@link #utf8Bytes(CharSequence)}) to ensure this method does not fail: for performance reasons
* the index checks will be performed using just {@code reserveBytes}.<br>
* This method returns the actual number of bytes written.
*/
public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
for (;;) {
if (buf instanceof AbstractByteBuf) {
AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
byteBuf.ensureWritable0(utf8MaxBytes(seq));
byteBuf.ensureWritable0(reserveBytes);
int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
byteBuf.writerIndex += written;
return written;
@ -521,7 +536,7 @@ public final class ByteBufUtil {
// duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
// re-throw a more informative exception describing the problem.
c2 = seq.charAt(++i);
} catch (IndexOutOfBoundsException e) {
} catch (IndexOutOfBoundsException ignored) {
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
break;
}
@ -545,11 +560,77 @@ public final class ByteBufUtil {
return writerIndex - oldWriterIndex;
}
/**
 * Returns the maximum number of bytes a character sequence of the given length can occupy
 * when encoded as UTF-8.
 * <p>
 * The result is a plain {@code int} product of {@code seqLength} and
 * {@code MAX_BYTES_PER_CHAR_UTF8}; no overflow check is performed.
 *
 * @param seqLength the number of {@code char}s in the sequence
 * @return the upper bound of the encoded length, in bytes
 */
public static int utf8MaxBytes(final int seqLength) {
    final int worstCaseBytes = MAX_BYTES_PER_CHAR_UTF8 * seqLength;
    return worstCaseBytes;
}
/**
 * Returns the maximum number of bytes the given character sequence can occupy when encoded
 * as UTF-8.
 * <p>
 * It behaves like {@link #utf8MaxBytes(int)} applied to {@code seq} {@link CharSequence#length()}.
 *
 * @param seq the character sequence to size
 * @return the upper bound of the encoded length, in bytes
 */
public static int utf8MaxBytes(CharSequence seq) {
    // Single exit point: delegate to the int overload so both variants share one formula.
    return utf8MaxBytes(seq.length());
}
/**
 * Computes the exact number of bytes needed to encode the given character sequence as UTF-8.
 * <p>
 * The value is intended to match what {@link #writeUtf8(ByteBuf, CharSequence)} writes for
 * the same input.
 *
 * @param seq the character sequence to measure
 * @return the encoded length, in bytes
 */
public static int utf8Bytes(final CharSequence seq) {
    final int charCount = seq.length();
    if (seq instanceof AsciiString) {
        // One byte per character, so the char count is already the answer.
        return charCount;
    }
    // Skip the leading run of ASCII characters: each of them takes exactly one byte.
    int asciiPrefix = 0;
    while (asciiPrefix < charCount && seq.charAt(asciiPrefix) < 0x80) {
        asciiPrefix++;
    }
    if (asciiPrefix >= charCount) {
        return asciiPrefix;
    }
    // The non-ASCII remainder is handled in a separate method to keep this fast path small.
    return asciiPrefix + utf8Bytes(seq, asciiPrefix, charCount);
}
/**
 * Sums the UTF-8 encoded size of the characters of {@code seq} in the index range
 * {@code [start, length)}.
 * <p>
 * Characters below {@code 0x80} count 1 byte, characters below {@code 0x800} count 2 bytes,
 * non-surrogate characters from {@code 0x800} upwards count 3 bytes and a high/low surrogate
 * pair counts 4 bytes. A surrogate that is not part of a valid pair is counted as a single
 * byte (the {@code WRITE_UTF_UNKNOWN} replacement).
 */
private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
    int total = 0;
    int pos = start;
    while (pos < length) {
        final char current = seq.charAt(pos++);
        if (current < 0x800) {
            // Branchless form of (current <= 127 ? 1 : 2): the sign bit of (0x7f - current)
            // is set exactly when current is above the ASCII range.
            total += ((0x7f - current) >>> 31) + 1;
            continue;
        }
        if (!isSurrogate(current)) {
            total += 3;
            continue;
        }
        if (!Character.isHighSurrogate(current)) {
            // Lone low surrogate: counted as WRITE_UTF_UNKNOWN.
            total += 1;
            continue;
        }
        final char next;
        try {
            // A surrogate pair consumes two characters. Optimistically fetch the second one and
            // rely on charAt's own bounds check instead of duplicating it here.
            next = seq.charAt(pos++);
        } catch (IndexOutOfBoundsException ignored) {
            // High surrogate at the very end of the sequence: WRITE_UTF_UNKNOWN, then stop.
            total += 1;
            break;
        }
        // Valid pair: 4 bytes, see http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
        // Otherwise: WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(next) ? WRITE_UTF_UNKNOWN : next).
        total += Character.isLowSurrogate(next) ? 4 : 2;
    }
    return total;
}
/**

View File

@ -267,6 +267,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -285,6 +286,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -303,6 +305,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -322,6 +325,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -341,7 +345,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
}
@ -360,6 +364,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -376,6 +381,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -392,6 +398,7 @@ public class ByteBufUtilTest {
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
assertEquals(buf.readableBytes(), ByteBufUtil.utf8Bytes(surrogateString));
buf.release();
buf2.release();
@ -546,6 +553,42 @@ public class ByteBufUtilTest {
}
}
// Mixed ASCII and non-ASCII text: the computed length must match the written length.
@Test
public void testUtf8Bytes() {
    checkUtf8Bytes("Some UTF-8 like äÄ∏ŒŒ");
}
// A well-formed high/low surrogate pair surrounded by ASCII characters.
@Test
public void testUtf8BytesWithSurrogates() {
    checkUtf8Bytes("a\uD800\uDC00b");
}
// U+E000 is not a surrogate and lies above 0x800, so it takes the 3-byte branch.
@Test
public void testUtf8BytesWithNonSurrogates3Bytes() {
    checkUtf8Bytes("a\uE000b");
}
// 0x81 is above the ASCII range but below 0x800, so it takes the non-ASCII, non-3-byte branch.
@Test
public void testUtf8BytesWithNonSurrogatesNonAscii() {
    final char aboveAscii = (char) 0x81;
    checkUtf8Bytes("a" + aboveAscii + "b");
}
/**
 * Writes {@code charSequence} with {@link ByteBufUtil#writeUtf8(ByteBuf, CharSequence)} and
 * asserts that {@link ByteBufUtil#utf8Bytes(CharSequence)} reports the same number of bytes.
 * The temporary buffer is always released.
 */
private static void checkUtf8Bytes(final CharSequence charSequence) {
    final ByteBuf destination = Unpooled.buffer(ByteBufUtil.utf8MaxBytes(charSequence));
    try {
        final int expected = ByteBufUtil.writeUtf8(destination, charSequence);
        final int actual = ByteBufUtil.utf8Bytes(charSequence);
        assertEquals(expected, actual);
    } finally {
        destination.release();
    }
}
private static void assertIsText(byte[] bytes, boolean expected, Charset charset) {
ByteBuf buffer = Unpooled.buffer();
try {