fix: when decoding with UTF-8 fails, create a new buffer for the retry with CESU-8.

If UTF-8 decoding fails after some leading bytes were already decoded successfully, the position of the original buffer has advanced past those bytes, so retrying on the same buffer drops them from the decoded result.
The retry now wraps the same bytes in a new buffer, so CESU-8 decoding starts again at the original offset.

issue #2546
This commit is contained in:
Comnir 2021-04-02 21:29:03 +03:00
parent 0a7b843786
commit f10060fe8f
2 changed files with 17 additions and 2 deletions

View File

@ -284,8 +284,8 @@ public class StringBlock {
@VisibleForTesting
String decodeString(int offset, int length) {
final ByteBuffer wrappedBuffer = ByteBuffer.wrap(m_strings, offset, length);
try {
final ByteBuffer wrappedBuffer = ByteBuffer.wrap(m_strings, offset, length);
return (m_isUTF8 ? UTF8_DECODER : UTF16LE_DECODER).decode(wrappedBuffer).toString();
} catch (CharacterCodingException ex) {
if (!m_isUTF8) {
@ -295,9 +295,10 @@ public class StringBlock {
}
try {
final ByteBuffer wrappedBufferRetry = ByteBuffer.wrap(m_strings, offset, length);
// in some places, Android uses 3-byte UTF-8 sequences instead of 4-bytes.
// If decoding failed, we try to use CESU-8 decoder, which is closer to what Android actually uses.
return CESU8_DECODER.decode(wrappedBuffer).toString();
return CESU8_DECODER.decode(wrappedBufferRetry).toString();
} catch (CharacterCodingException e) {
LOGGER.warning("Failed to decode a string with CESU-8 decoder.");
return null;

View File

@ -52,6 +52,20 @@ public class StringBlockWithSurrogatePairInUtf8Test {
// See: https://github.com/iBotPeaches/Apktool/issues/2299
final String actual = new StringBlock(new byte[] { (byte) 0xED, (byte) 0xA0, (byte) 0xBD, (byte) 0xED, (byte) 0xB4, (byte) 0x86}, true).decodeString(0, 6);
assertEquals("Incorrect decoding", "\uD83D\uDD06", actual);
// See: https://github.com/iBotPeaches/Apktool/issues/2546
final byte[] bytesWithCharactersBeforeSurrogatePair = {'G', 'o', 'o', 'd', ' ', 'm', 'o', 'r', 'n', 'i', 'n', 'g', '!', ' ',
(byte) 0xED, (byte) 0xA0, (byte) 0xBD, (byte) 0xED, (byte) 0xB1, (byte) 0x8B,
' ', 'S', 'u', 'n', ' ',
(byte) 0xED, (byte) 0xA0, (byte) 0xBD, (byte) 0xED, (byte) 0xBC, (byte) 0x9E
};
final String actual2 = new StringBlock(bytesWithCharactersBeforeSurrogatePair, true).decodeString(0, 31);
// D83D -> 0xED 0xA0 0xBD
// DC4B -> 0xED 0xB1 0x8B
// DF1E -> 0xED 0xBC 0x9E
assertEquals("Incorrect decoding when there are valid characters before the surrogate pair",
"Good morning! \uD83D\uDC4B Sun \uD83C\uDF1E", actual2);
}
@Test