From 2af769f6dc76e3aad88a52c345689d739a9130a2 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 21 Jun 2019 05:05:35 -0700 Subject: [PATCH] Subsequence versions of ByteBufUtil#writeUtf8(...) methods (#9224) Motivation It would be useful to be able to write a UTF-8 encoded subsequence of a CharSequence's characters to a ByteBuf without needing to create a temporary object via CharSequence#subSequence(). Modification Add overloads of ByteBufUtil writeUtf8, reserveAndWriteUtf8 and utf8Bytes methods which take explicit subsequence bounds. Result More efficient writing of substrings to byte buffers is possible --- .../java/io/netty/buffer/ByteBufUtil.java | 75 ++++++++++++--- .../java/io/netty/buffer/ByteBufUtilTest.java | 91 +++++++++++++++++++ 2 files changed, 154 insertions(+), 12 deletions(-) diff --git a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java index b1e86afce6..ae8d9ed3ea 100644 --- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java +++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java @@ -21,6 +21,7 @@ import io.netty.util.CharsetUtil; import io.netty.util.Recycler; import io.netty.util.Recycler.Handle; import io.netty.util.concurrent.FastThreadLocal; +import io.netty.util.internal.MathUtil; import io.netty.util.internal.PlatformDependent; import io.netty.util.internal.StringUtil; import io.netty.util.internal.SystemPropertyUtil; @@ -472,6 +473,14 @@ public final class ByteBufUtil { return buffer.forEachByteDesc(toIndex, fromIndex - toIndex, new ByteProcessor.IndexOfProcessor(value)); } + private static CharSequence checkCharSequenceBounds(CharSequence seq, int start, int end) { + if (MathUtil.isOutOfBounds(start, end - start, seq.length())) { + throw new IndexOutOfBoundsException("expected: 0 <= start(" + start + ") <= end (" + end + + ") <= seq.length(" + seq.length() + ')'); + } + return seq; + } + /** * Encode a {@link CharSequence} in UTF-8 and write * it to a {@link ByteBuf} allocated 
with {@code alloc}. @@ -496,7 +505,17 @@ public final class ByteBufUtil { * This method returns the actual number of bytes written. */ public static int writeUtf8(ByteBuf buf, CharSequence seq) { - return reserveAndWriteUtf8(buf, seq, utf8MaxBytes(seq)); + int seqLength = seq.length(); + return reserveAndWriteUtf8Seq(buf, seq, 0, seqLength, utf8MaxBytes(seqLength)); + } + + /** + * Equivalent to {@link #writeUtf8(ByteBuf, CharSequence) writeUtf8(buf, seq.subSequence(start, end))} + * but avoids subsequence object allocation. + */ + public static int writeUtf8(ByteBuf buf, CharSequence seq, int start, int end) { + checkCharSequenceBounds(seq, start, end); + return reserveAndWriteUtf8Seq(buf, seq, start, end, utf8MaxBytes(end - start)); } /** @@ -509,6 +528,21 @@ public final class ByteBufUtil { * This method returns the actual number of bytes written. */ public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) { + return reserveAndWriteUtf8Seq(buf, seq, 0, seq.length(), reserveBytes); + } + + /** + * Equivalent to {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int) + * reserveAndWriteUtf8(buf, seq.subSequence(start, end), reserveBytes)} but avoids + * subsequence object allocation if possible. + * + * @return actual number of bytes written + */ + public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int start, int end, int reserveBytes) { + return reserveAndWriteUtf8Seq(buf, checkCharSequenceBounds(seq, start, end), start, end, reserveBytes); + } + + private static int reserveAndWriteUtf8Seq(ByteBuf buf, CharSequence seq, int start, int end, int reserveBytes) { for (;;) { if (buf instanceof WrappedCompositeByteBuf) { // WrappedCompositeByteBuf is a sub-class of AbstractByteBuf so it needs special handling. 
@@ -516,27 +550,31 @@ public final class ByteBufUtil { } else if (buf instanceof AbstractByteBuf) { AbstractByteBuf byteBuf = (AbstractByteBuf) buf; byteBuf.ensureWritable0(reserveBytes); - int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length()); + int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, start, end); byteBuf.writerIndex += written; return written; } else if (buf instanceof WrappedByteBuf) { // Unwrap as the wrapped buffer may be an AbstractByteBuf and so we can use fast-path. buf = buf.unwrap(); } else { - byte[] bytes = seq.toString().getBytes(CharsetUtil.UTF_8); + byte[] bytes = seq.subSequence(start, end).toString().getBytes(CharsetUtil.UTF_8); buf.writeBytes(bytes); return bytes.length; } } } - // Fast-Path implementation static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq, int len) { + return writeUtf8(buffer, writerIndex, seq, 0, len); + } + + // Fast-Path implementation + static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq, int start, int end) { int oldWriterIndex = writerIndex; // We can use the _set methods as these not need to do any index checks and reference checks. // This is possible as we called ensureWritable(...) before. - for (int i = 0; i < len; i++) { + for (int i = start; i < end; i++) { char c = seq.charAt(i); if (c < 0x80) { buffer._setByte(writerIndex++, (byte) c); @@ -606,22 +644,35 @@ public final class ByteBufUtil { * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence)}. */ public static int utf8Bytes(final CharSequence seq) { + return utf8ByteCount(seq, 0, seq.length()); + } + + /** + * Equivalent to {@link #utf8Bytes(CharSequence) utf8Bytes(seq.subSequence(start, end))} + * but avoids subsequence object allocation. + *

+ * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence, int, int)}. + */ + public static int utf8Bytes(final CharSequence seq, int start, int end) { + return utf8ByteCount(checkCharSequenceBounds(seq, start, end), start, end); + } + + private static int utf8ByteCount(final CharSequence seq, int start, int end) { if (seq instanceof AsciiString) { - return seq.length(); + return end - start; } - int seqLength = seq.length(); - int i = 0; + int i = start; // ASCII fast path - while (i < seqLength && seq.charAt(i) < 0x80) { + while (i < end && seq.charAt(i) < 0x80) { ++i; } // !ASCII is packed in a separate method to let the ASCII case be smaller - return i < seqLength ? i + utf8Bytes(seq, i, seqLength) : i; + return i < end ? (i - start) + utf8BytesNonAscii(seq, i, end) : i - start; } - private static int utf8Bytes(final CharSequence seq, final int start, final int length) { + private static int utf8BytesNonAscii(final CharSequence seq, final int start, final int end) { int encodedLength = 0; - for (int i = start; i < length; i++) { + for (int i = start; i < end; i++) { final char c = seq.charAt(i); // making it 100% branchless isn't rewarding due to the many bit operations necessary! 
if (c < 0x800) { diff --git a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java index da9b344f40..92913ce4f9 100644 --- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java +++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java @@ -510,6 +510,97 @@ public class ByteBufUtilTest { assertTrue(buf instanceof WrappedByteBuf); } + @Test + public void testWriteUtf8Subsequence() { + String usAscii = "Some UTF-8 like äÄ∏ŒŒ"; + ByteBuf buf = Unpooled.buffer(16); + buf.writeBytes(usAscii.substring(5, 18).getBytes(CharsetUtil.UTF_8)); + ByteBuf buf2 = Unpooled.buffer(16); + ByteBufUtil.writeUtf8(buf2, usAscii, 5, 18); + + assertEquals(buf, buf2); + + buf.release(); + buf2.release(); + } + + @Test + public void testReserveAndWriteUtf8Subsequence() { + String usAscii = "Some UTF-8 like äÄ∏ŒŒ"; + ByteBuf buf = Unpooled.buffer(16); + buf.writeBytes(usAscii.substring(5, 18).getBytes(CharsetUtil.UTF_8)); + ByteBuf buf2 = Unpooled.buffer(16); + int count = ByteBufUtil.reserveAndWriteUtf8(buf2, usAscii, 5, 18, 16); + + assertEquals(buf, buf2); + assertEquals(buf.readableBytes(), count); + + buf.release(); + buf2.release(); + } + + @Test + public void testUtf8BytesSubsequence() { + String usAscii = "Some UTF-8 like äÄ∏ŒŒ"; + assertEquals(usAscii.substring(5, 18).getBytes(CharsetUtil.UTF_8).length, + ByteBufUtil.utf8Bytes(usAscii, 5, 18)); + } + + private static int[][] INVALID_RANGES = new int[][] { + { -1, 5 }, { 5, 30 }, { 10, 5 } + }; + + interface TestMethod { + int invoke(Object... 
args); + } + + private void testInvalidSubsequences(TestMethod method) { + for (int [] range : INVALID_RANGES) { + ByteBuf buf = Unpooled.buffer(16); + try { + method.invoke(buf, "Some UTF-8 like äÄ∏ŒŒ", range[0], range[1]); + fail("Did not throw IndexOutOfBoundsException for range (" + range[0] + ", " + range[1] + ")"); + } catch (IndexOutOfBoundsException iiobe) { + // expected + } finally { + assertFalse(buf.isReadable()); + buf.release(); + } + } + } + + @Test + public void testWriteUtf8InvalidSubsequences() { + testInvalidSubsequences(new TestMethod() { + @Override + public int invoke(Object... args) { + return ByteBufUtil.writeUtf8((ByteBuf) args[0], (String) args[1], + (Integer) args[2], (Integer) args[3]); + } + }); + } + + @Test + public void testReserveAndWriteUtf8InvalidSubsequences() { + testInvalidSubsequences(new TestMethod() { + @Override + public int invoke(Object... args) { + return ByteBufUtil.reserveAndWriteUtf8((ByteBuf) args[0], (String) args[1], + (Integer) args[2], (Integer) args[3], 32); + } + }); + } + + @Test + public void testUtf8BytesInvalidSubsequences() { + testInvalidSubsequences(new TestMethod() { + @Override + public int invoke(Object... args) { + return ByteBufUtil.utf8Bytes((String) args[1], (Integer) args[2], (Integer) args[3]); + } + }); + } + @Test public void testDecodeUsAscii() { testDecodeString("This is a test", CharsetUtil.US_ASCII);