From dcd145864ab137d748dfd5c3c1298ef87049710b Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 10 Aug 2019 11:54:04 -0700 Subject: [PATCH] Fix ByteBufUtil#writeUtf8 subsequence split surrogate edge-case bug (#9437) Motivation: #9224 introduced overrides of ByteBufUtil#writeUtf8(...) and related methods to operate on a sub-CharSequence directly to save having to allocate substrings, but it missed an edge case where the subsequence does not extend to the end of the CharSequence and the last char in the sequence is a high surrogate. Due to the catch-IndexOutOfBoundsException optimization that avoids an additional bounds check, it would be possible to read past the specified end char index and successfully decode a surrogate pair which would otherwise result in a '?' byte being written. Modifications: - Check for end-of-subsequence before reading next char after a high surrogate is encountered in the writeUtf8(AbstractByteBuf,int,CharSequence,int,int) and utf8BytesNonAscii methods - Add unit test for this edge case Result: Bug is fixed. This removes the bounds-check-avoidance optimization but it does not appear to have a measurable impact on benchmark results, including when the char sequence contains many surrogate pairs (which should be rare in any case). --- .../java/io/netty/buffer/ByteBufUtil.java | 21 ++++++------------- .../java/io/netty/buffer/ByteBufUtilTest.java | 14 +++++++++++++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java index 1d5bfce1ee..2582c9399b 100644 --- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java +++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java @@ -586,18 +586,13 @@ public final class ByteBufUtil { buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN); continue; } - final char c2; - try { - // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid - // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will - // re-throw a more informative exception describing the problem. - c2 = seq.charAt(++i); - } catch (IndexOutOfBoundsException ignored) { + // Surrogate Pair consumes 2 characters. + if (++i == end) { buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN); break; } // Extra method to allow inlining the rest of writeUtf8 which is the most likely code path. - writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, c2); + writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, seq.charAt(i)); } else { buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12))); buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f))); @@ -684,17 +679,13 @@ public final class ByteBufUtil { // WRITE_UTF_UNKNOWN continue; } - final char c2; - try { - // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid - // duplicate bounds checking with charAt. - c2 = seq.charAt(++i); - } catch (IndexOutOfBoundsException ignored) { + // Surrogate Pair consumes 2 characters. + if (++i == end) { encodedLength++; // WRITE_UTF_UNKNOWN break; } - if (!Character.isLowSurrogate(c2)) { + if (!Character.isLowSurrogate(seq.charAt(i))) { // WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2) encodedLength += 2; continue; diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java index 8721be1c7a..78fa1a7344 100644 --- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java +++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java @@ -524,6 +524,20 @@ public class ByteBufUtilTest { buf2.release(); } + @Test + public void testWriteUtf8SubsequenceSplitSurrogate() { + String usAscii = "\uD800\uDC00"; // surrogate pair: one code point, two chars + ByteBuf buf = Unpooled.buffer(16); + buf.writeBytes(usAscii.substring(0, 1).getBytes(CharsetUtil.UTF_8)); + ByteBuf buf2 = Unpooled.buffer(16); + ByteBufUtil.writeUtf8(buf2, usAscii, 0, 1); + + assertEquals(buf, buf2); + + buf.release(); + buf2.release(); + } + @Test public void testReserveAndWriteUtf8Subsequence() { String usAscii = "Some UTF-8 like äÄ∏ŒŒ";