ByteBufUtil.writeUtf8 not consistent with String.getBytes(Charset)

Motivation:
f750d6e36c added support for surrogates in the writeUtf8 conversion. However exceptions are thrown if invalid input is detected, but the JDK (and slow path of writeUtf8) uses a replacement character and does not throw. We should behave the same way.

Modificiations:
- Don't throw in ByteBufUtil.writeUtf8, and instead use a replacement character consistent with the JDK

Result:
ByteBufUtil.writeUtf8 behavior is consistent with the JDK UTF_8 conversion.
This commit is contained in:
Scott Mitchell 2016-02-10 19:07:20 -08:00
parent cd56f87ca1
commit 691bc1690e
2 changed files with 137 additions and 28 deletions

View File

@ -57,6 +57,7 @@ public final class ByteBufUtil {
}
};
private static final byte WRITE_UTF_UNKNOWN = (byte) '?';
private static final int MAX_CHAR_BUFFER_SIZE;
private static final int THREAD_LOCAL_BUFFER_SIZE;
@ -400,8 +401,8 @@ public final class ByteBufUtil {
buffer._setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
} else if (isSurrogate(c)) {
if (!Character.isHighSurrogate(c)) {
throw new IllegalArgumentException("Invalid encoding. " +
"Expected high (leading) surrogate at index " + i + " but got " + c);
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
continue;
}
final char c2;
try {
@ -410,12 +411,13 @@ public final class ByteBufUtil {
// re-throw a more informative exception describing the problem.
c2 = seq.charAt(++i);
} catch (IndexOutOfBoundsException e) {
throw new IllegalArgumentException("Underflow. " +
"Expected low (trailing) surrogate at index " + i + " but no more characters found.", e);
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
break;
}
if (!Character.isLowSurrogate(c2)) {
throw new IllegalArgumentException("Invalid encoding. " +
"Expected low (trailing) surrogate at index " + i + " but got " + c2);
buffer._setByte(writerIndex++, WRITE_UTF_UNKNOWN);
buffer._setByte(writerIndex++, Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2);
continue;
}
int codePoint = Character.toCodePoint(c, c2);
// See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.

View File

@ -17,13 +17,14 @@ package io.netty.buffer;
import io.netty.util.AsciiString;
import io.netty.util.CharsetUtil;
import io.netty.util.ReferenceCountUtil;
import org.junit.Assert;
import org.junit.Test;
import java.nio.charset.Charset;
import java.util.Random;
import static io.netty.buffer.Unpooled.unreleasableBuffer;
import static io.netty.util.ReferenceCountUtil.releaseLater;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@ -97,75 +98,181 @@ public class ByteBufUtilTest {
@Test
public void testWriteUsAscii() {
String usAscii = "NettyRocks";
ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(usAscii.getBytes(CharsetUtil.US_ASCII));
ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeAscii(buf2, usAscii);
Assert.assertEquals(buf, buf2);
assertEquals(buf, buf2);
}
@Test
public void testWriteUsAsciiWrapped() {
String usAscii = "NettyRocks";
ByteBuf buf = Unpooled.unreleasableBuffer(ReferenceCountUtil.releaseLater(Unpooled.buffer(16)));
ByteBuf buf = unreleasableBuffer(releaseLater(Unpooled.buffer(16)));
assertWrapped(buf);
buf.writeBytes(usAscii.getBytes(CharsetUtil.US_ASCII));
ByteBuf buf2 = Unpooled.unreleasableBuffer(ReferenceCountUtil.releaseLater(Unpooled.buffer(16)));
ByteBuf buf2 = unreleasableBuffer(releaseLater(Unpooled.buffer(16)));
assertWrapped(buf2);
ByteBufUtil.writeAscii(buf2, usAscii);
Assert.assertEquals(buf, buf2);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8() {
String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(usAscii.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, usAscii);
Assert.assertEquals(buf, buf2);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8Surrogates() {
// leading surrogate + trailing surrogate
String surrogateString = new StringBuilder(2)
.append('a')
.append('\uD800')
.append('\uDC00')
.append('b')
.toString();
ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
Assert.assertEquals(buf, buf2);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidOnlyTrailingSurrogate() {
String surrogateString = new StringBuilder(2)
.append('a')
.append('\uDC00')
.append('b')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidOnlyLeadingSurrogate() {
String surrogateString = new StringBuilder(2)
.append('a')
.append('\uD800')
.append('b')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidSurrogatesSwitched() {
String surrogateString = new StringBuilder(2)
.append('a')
.append('\uDC00')
.append('\uD800')
.append('b')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidTwoLeadingSurrogates() {
String surrogateString = new StringBuilder(2)
.append('a')
.append('\uD800')
.append('\uD800')
.append('b')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidTwoTrailingSurrogates() {
String surrogateString = new StringBuilder(2)
.append('a')
.append('\uDC00')
.append('\uDC00')
.append('b')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidEndOnLeadingSurrogate() {
String surrogateString = new StringBuilder(2)
.append('\uD800')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8InvalidEndOnTrailingSurrogate() {
String surrogateString = new StringBuilder(2)
.append('\uDC00')
.toString();
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeUtf8(buf2, surrogateString);
assertEquals(buf, buf2);
}
@Test
public void testWriteUsAsciiString() {
AsciiString usAscii = new AsciiString("NettyRocks");
ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf = releaseLater(Unpooled.buffer(16));
buf.writeBytes(usAscii.toString().getBytes(CharsetUtil.US_ASCII));
ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
ByteBuf buf2 = releaseLater(Unpooled.buffer(16));
ByteBufUtil.writeAscii(buf2, usAscii);
Assert.assertEquals(buf, buf2);
assertEquals(buf, buf2);
}
@Test
public void testWriteUtf8Wrapped() {
String usAscii = "Some UTF-8 like äÄ∏ŒŒ";
ByteBuf buf = Unpooled.unreleasableBuffer(ReferenceCountUtil.releaseLater(Unpooled.buffer(16)));
ByteBuf buf = unreleasableBuffer(releaseLater(Unpooled.buffer(16)));
assertWrapped(buf);
buf.writeBytes(usAscii.getBytes(CharsetUtil.UTF_8));
ByteBuf buf2 = Unpooled.unreleasableBuffer(ReferenceCountUtil.releaseLater(Unpooled.buffer(16)));
ByteBuf buf2 = unreleasableBuffer(releaseLater(Unpooled.buffer(16)));
assertWrapped(buf2);
ByteBufUtil.writeUtf8(buf2, usAscii);
Assert.assertEquals(buf, buf2);
assertEquals(buf, buf2);
}
private static void assertWrapped(ByteBuf buf) {
@ -184,7 +291,7 @@ public class ByteBufUtilTest {
private static void testDecodeString(String text, Charset charset) {
ByteBuf buffer = Unpooled.copiedBuffer(text, charset);
Assert.assertEquals(text, ByteBufUtil.decodeString(buffer, 0, buffer.readableBytes(), charset));
assertEquals(text, ByteBufUtil.decodeString(buffer, 0, buffer.readableBytes(), charset));
buffer.release();
}
@ -195,7 +302,7 @@ public class ByteBufUtilTest {
byte[] bytes = "1234".getBytes(CharsetUtil.UTF_8);
buffer.addComponent(Unpooled.buffer(bytes.length).writeBytes(bytes));
buffer.addComponent(Unpooled.buffer(bytes.length).writeBytes(bytes));
Assert.assertEquals("1234", buffer.toString(bytes.length, bytes.length, CharsetUtil.UTF_8));
assertEquals("1234", buffer.toString(bytes.length, bytes.length, CharsetUtil.UTF_8));
} finally {
buffer.release();
}