ByteBufUtil.writeUtf8 Surrogate Support
Motivation: UTF-16 can not represent the full range of Unicode characters, and thus has the concept of Surrogate Pair (http://unicode.org/glossary/#surrogate_pair) where 2 16-bit code units can be used to represent the missing characters. ByteBufUtil.writeUtf8 is currently does not support this and is thus incomplete. Modifications: - Add support for surrogate pairs in ByteBufUtil.writeUtf8 Result: ByteBufUtil.writeUtf8 now supports surrogate pairs and is correctly converting to UTF-8.
This commit is contained in:
parent
693633eeff
commit
f750d6e36c
@ -38,9 +38,10 @@ import java.nio.charset.CoderResult;
|
||||
import java.util.Arrays;
|
||||
import java.util.Locale;
|
||||
|
||||
import static io.netty.util.internal.StringUtil.NEWLINE;
|
||||
import static io.netty.util.internal.ObjectUtil.checkNotNull;
|
||||
import static io.netty.util.internal.MathUtil.isOutOfBounds;
|
||||
import static io.netty.util.internal.ObjectUtil.checkNotNull;
|
||||
import static io.netty.util.internal.StringUtil.NEWLINE;
|
||||
import static io.netty.util.internal.StringUtil.isSurrogate;
|
||||
|
||||
/**
|
||||
* A collection of utility methods that is related with handling {@link ByteBuf},
|
||||
@ -397,6 +398,31 @@ public final class ByteBufUtil {
|
||||
} else if (c < 0x800) {
|
||||
buffer._setByte(writerIndex++, (byte) (0xc0 | (c >> 6)));
|
||||
buffer._setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
|
||||
} else if (isSurrogate(c)) {
|
||||
if (!Character.isHighSurrogate(c)) {
|
||||
throw new IllegalArgumentException("Invalid encoding. " +
|
||||
"Expected high (leading) surrogate at index " + i + " but got " + c);
|
||||
}
|
||||
final char c2;
|
||||
try {
|
||||
// Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid
|
||||
// duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will
|
||||
// re-throw a more informative exception describing the problem.
|
||||
c2 = seq.charAt(++i);
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
throw new IllegalArgumentException("Underflow. " +
|
||||
"Expected low (trailing) surrogate at index " + i + " but no more characters found.", e);
|
||||
}
|
||||
if (!Character.isLowSurrogate(c2)) {
|
||||
throw new IllegalArgumentException("Invalid encoding. " +
|
||||
"Expected low (trailing) surrogate at index " + i + " but got " + c2);
|
||||
}
|
||||
int codePoint = Character.toCodePoint(c, c2);
|
||||
// See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
|
||||
buffer._setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18)));
|
||||
buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f)));
|
||||
buffer._setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f)));
|
||||
buffer._setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f)));
|
||||
} else {
|
||||
buffer._setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
|
||||
buffer._setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
|
||||
|
@ -15,19 +15,17 @@
|
||||
*/
|
||||
package io.netty.buffer;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import io.netty.util.AsciiString;
|
||||
import io.netty.util.CharsetUtil;
|
||||
import io.netty.util.ReferenceCountUtil;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Random;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class ByteBufUtilTest {
|
||||
@Test
|
||||
@ -131,6 +129,21 @@ public class ByteBufUtilTest {
|
||||
Assert.assertEquals(buf, buf2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteUtf8Surrogates() {
|
||||
// leading surrogate + trailing surrogate
|
||||
String surrogateString = new StringBuilder(2)
|
||||
.append('\uD800')
|
||||
.append('\uDC00')
|
||||
.toString();
|
||||
ByteBuf buf = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
|
||||
buf.writeBytes(surrogateString.getBytes(CharsetUtil.UTF_8));
|
||||
ByteBuf buf2 = ReferenceCountUtil.releaseLater(Unpooled.buffer(16));
|
||||
ByteBufUtil.writeUtf8(buf2, surrogateString);
|
||||
|
||||
Assert.assertEquals(buf, buf2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteUsAsciiString() {
|
||||
AsciiString usAscii = new AsciiString("NettyRocks");
|
||||
|
@ -60,9 +60,6 @@ public final class MathUtil {
|
||||
* </ul>
|
||||
*/
|
||||
public static int compare(long x, long y) {
|
||||
if (PlatformDependent.javaVersion() < 7) {
|
||||
return (x < y) ? -1 : (x > y) ? 1 : 0;
|
||||
}
|
||||
return Long.compare(x, y);
|
||||
return (x < y) ? -1 : (x > y) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
@ -15,7 +15,6 @@
|
||||
*/
|
||||
package io.netty.util.internal;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Formatter;
|
||||
@ -391,6 +390,17 @@ public final class StringUtil {
|
||||
return s == null || s.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if {@code c} lies within the range of values defined for
|
||||
* <a href="http://unicode.org/glossary/#surrogate_code_point">Surrogate Code Point</a>.
|
||||
* @param c the character to check.
|
||||
* @return {@code true} if {@code c} lies within the range of values defined for
|
||||
* <a href="http://unicode.org/glossary/#surrogate_code_point">Surrogate Code Point</a>. {@code false} otherwise.
|
||||
*/
|
||||
public static boolean isSurrogate(char c) {
|
||||
return c >= '\uD800' && c <= '\uDFFF';
|
||||
}
|
||||
|
||||
private static boolean isDoubleQuote(char c) {
|
||||
return c == DOUBLE_QUOTE;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user