Fix ByteBufUtil#writeUtf8 subsequence split surrogate edge-case bug (#9437)
Motivation: #9224 introduced overrides of ByteBufUtil#writeUtf8(...) and related methods to operate on a sub-CharSequence directly to save having to allocate substrings, but it missed an edge case where the subsequence does not extend to the end of the CharSequence and the last char in the sequence is a high surrogate. Due to the catch-IndexOutOfBoundsException optimization that avoids an additional bounds check, it would be possible to read past the specified end char index and successfully decode a surrogate pair which would otherwise result in a '?' byte being written. Modifications: - Check for end-of-subsequence before reading next char after a high surrogate is encountered in the writeUtf8(AbstractByteBuf,int,CharSequence,int,int) and utf8BytesNonAscii methods - Add unit test for this edge case Result: Bug is fixed. This removes the bounds-check-avoidance optimization but it does not appear to have a measurable impact on benchmark results, including when the char sequence contains many surrogate pairs (which should be rare in any case).
This commit is contained in:
parent
cbc8fc4428
commit
fedcc40196
@ -586,18 +586,13 @@ public final class ByteBufUtil {
|
||||
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
|
||||
continue;
|
||||
}
|
||||
final char c2;
|
||||
try {
|
||||
// Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
|
||||
// duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
|
||||
// re-throw a more informative exception describing the problem.
|
||||
c2 = seq.charAt(++i);
|
||||
} catch (IndexOutOfBoundsException ignored) {
|
||||
// Surrogate Pair consumes 2 characters.
|
||||
if (++i == end) {
|
||||
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
|
||||
break;
|
||||
}
|
||||
// Extra method to allow inlining the rest of writeUtf8 which is the most likely code path.
|
||||
writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, c2);
|
||||
writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, seq.charAt(i));
|
||||
} else {
|
||||
buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
|
||||
buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
|
||||
@ -684,17 +679,13 @@ public final class ByteBufUtil {
|
||||
// WRITE_UTF_UNKNOWN
|
||||
continue;
|
||||
}
|
||||
final char c2;
|
||||
try {
|
||||
// Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
|
||||
// duplicate bounds checking with charAt.
|
||||
c2 = seq.charAt(++i);
|
||||
} catch (IndexOutOfBoundsException ignored) {
|
||||
// Surrogate Pair consumes 2 characters.
|
||||
if (++i == end) {
|
||||
encodedLength++;
|
||||
// WRITE_UTF_UNKNOWN
|
||||
break;
|
||||
}
|
||||
if (!Character.isLowSurrogate(c2)) {
|
||||
if (!Character.isLowSurrogate(seq.charAt(i))) {
|
||||
// WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
|
||||
encodedLength += 2;
|
||||
continue;
|
||||
|
@ -524,6 +524,20 @@ public class ByteBufUtilTest {
|
||||
buf2.release();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteUtf8SubsequenceSplitSurrogate() {
|
||||
String usAscii = "\uD800\uDC00"; // surrogate pair: one code point, two chars
|
||||
ByteBuf buf = Unpooled.buffer(16);
|
||||
buf.writeBytes(usAscii.substring(0, 1).getBytes(CharsetUtil.UTF_8));
|
||||
ByteBuf buf2 = Unpooled.buffer(16);
|
||||
ByteBufUtil.writeUtf8(buf2, usAscii, 0, 1);
|
||||
|
||||
assertEquals(buf, buf2);
|
||||
|
||||
buf.release();
|
||||
buf2.release();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReserveAndWriteUtf8Subsequence() {
|
||||
String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
|
||||
|
Loading…
Reference in New Issue
Block a user