Truncate filenames based on their utf-8 length

2025-01-26 19:57:34 +01:00 · 2014-02-01 14:43:54 -08:00 · 2014-02-01 14:43:54 -08:00 · 83e63dab7a
commit 83e63dab7a
parent 838b35e477
2 changed files with 155 additions and 13 deletions
--- a/brut.apktool.smali/util/src/main/java/org/jf/util/ClassFileNameHandler.java
+++ b/brut.apktool.smali/util/src/main/java/org/jf/util/ClassFileNameHandler.java
@ -33,7 +33,9 @@ import ds.tree.RadixTreeImpl;
 import javax.annotation.Nonnull;
 import java.io.*;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.IntBuffer;
 import java.util.regex.Pattern;
 /**
@ -87,8 +89,9 @@ public class ClassFileNameHandler {
                    packageElement += "#";
                }
-                if (packageElement.length() > MAX_FILENAME_LENGTH) {
+                int utf8Length = utf8Length(packageElement);
-                    packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH);
+                if (utf8Length > MAX_FILENAME_LENGTH) {
                    packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
                }
                packageElements[elementIndex++] = packageElement;
@ -109,8 +112,9 @@ public class ClassFileNameHandler {
            packageElement += "#";
        }
-        if ((packageElement.length() + fileExtension.length()) > MAX_FILENAME_LENGTH) {
+        int utf8Length = utf8Length(packageElement) + utf8Length(fileExtension);
-            packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH - fileExtension.length());
+        if (utf8Length > MAX_FILENAME_LENGTH) {
            packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
        }
        packageElements[elementIndex] = packageElement;
@ -118,12 +122,87 @@ public class ClassFileNameHandler {
        return top.addUniqueChild(packageElements, 0);
    }
-    @Nonnull
+    private static int utf8Length(String str) {
-    static String shortenPathComponent(@Nonnull String pathComponent, int maxLength) {
+        int utf8Length = 0;
-        int toRemove = pathComponent.length() - maxLength + 1;
+        int i=0;
        // Sum the per-code-point utf-8 length over the whole string. The string is walked by
        // code point rather than by char, so a surrogate pair is measured once, as the single
        // supplementary code point it encodes.
        while (i<str.length()) {
            int c = str.codePointAt(i);
            utf8Length += utf8Length(c);
            // advance by 2 chars when c was stored as a surrogate pair, otherwise by 1
            i += Character.charCount(c);
        }
        return utf8Length;
    }
-        int firstIndex = (pathComponent.length()/2) - (toRemove/2);
+    private static int utf8Length(int codePoint) {
-        return pathComponent.substring(0, firstIndex) + "#" + pathComponent.substring(firstIndex+toRemove);
+        if (codePoint < 0x80) {
            // below 0x80: one byte
            return 1;
        } else if (codePoint < 0x800) {
            // 0x80 through 0x7FF: two bytes
            return 2;
        } else if (codePoint < 0x10000) {
            // 0x800 through 0xFFFF: three bytes
            return 3;
        } else {
            // 0x10000 and above (supplementary code points): four bytes
            return 4;
        }
    }
    /**
     * Shortens an individual file/directory name, removing the necessary number of code points
     * from the middle of the string such that the utf-8 encoding of the string is at least
     * bytesToRemove bytes shorter than the original.
     *
     * The removed codePoints in the middle of the string will be replaced with a # character.
     *
     * Removal starts at the code point at index (length / 2) and then grows outwards, alternating
     * between the code point before the removed range and the code point after it. If the
     * component is too short to give up the requested number of bytes, every code point is
     * removed and the result is just "#"; an empty component also yields "#".
     *
     * @param pathComponent the file/directory name to shorten
     * @param bytesToRemove the minimum number of utf-8 bytes by which the result should be shorter
     *                      than pathComponent
     * @return the shortened name, with the removed range replaced by a single '#'
     */
    @Nonnull
    static String shortenPathComponent(@Nonnull String pathComponent, int bytesToRemove) {
        // We replace the removed part with a #, so we need to remove 1 extra char
        bytesToRemove++;
        // Decode to an array of code points, so that surrogate pairs are never split
        int[] codePoints;
        try {
            IntBuffer intBuffer = ByteBuffer.wrap(pathComponent.getBytes("UTF-32BE")).asIntBuffer();
            codePoints = new int[intBuffer.limit()];
            intBuffer.get(codePoints);
        } catch (UnsupportedEncodingException ex) {
            throw new RuntimeException(ex);
        }
        int midPoint = codePoints.length/2;
        int firstEnd = midPoint; // exclusive
        int secondStart = midPoint; // inclusive
        int bytesRemoved = 0;
        // Always remove the code point at midPoint. It only fails to exist for an empty
        // component, in which case there is nothing to remove at all.
        if (midPoint < codePoints.length) {
            bytesRemoved = utf8Length(codePoints[midPoint]);
            secondStart = midPoint+1;
        }
        // if we have an even number of codepoints, start by also removing the code point that
        // follows midPoint, unless just removing the first already removes enough bytes. The
        // bounds check matters for 2-codepoint components, where nothing follows midPoint.
        if (((codePoints.length % 2) == 0) && bytesRemoved < bytesToRemove
                && secondStart < codePoints.length) {
            bytesRemoved += utf8Length(codePoints[secondStart]);
            secondStart++;
        }
        // Grow the removed range outwards, one code point on the left and then one on the right,
        // until enough bytes are gone or nothing is left to remove.
        while ((bytesRemoved < bytesToRemove) &&
                (firstEnd > 0 || secondStart < codePoints.length)) {
            if (firstEnd > 0) {
                firstEnd--;
                bytesRemoved += utf8Length(codePoints[firstEnd]);
            }
            if (bytesRemoved < bytesToRemove && secondStart < codePoints.length) {
                bytesRemoved += utf8Length(codePoints[secondStart]);
                secondStart++;
            }
        }
        // Reassemble: kept prefix, the '#' placeholder, kept suffix
        StringBuilder sb = new StringBuilder();
        for (int i=0; i<firstEnd; i++) {
            sb.appendCodePoint(codePoints[i]);
        }
        sb.append('#');
        for (int i=secondStart; i<codePoints.length; i++) {
            sb.appendCodePoint(codePoints[i]);
        }
        return sb.toString();
    }
    private static boolean testForWindowsReservedFileNames(File path) {
--- a/brut.apktool.smali/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
+++ b/brut.apktool.smali/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
@ -34,16 +34,79 @@ package org.jf.util;
 import junit.framework.Assert;
 import org.junit.Test;
 import java.nio.charset.Charset;
 public class ClassFileNameHandlerTest {
    private final Charset UTF8 = Charset.forName("UTF-8");
    @Test
-    public void testShortedPathComponent() {
+    public void test1ByteEncodings() {
        StringBuilder sb = new StringBuilder();
-        for (int i=0; i<300; i++) {
+        for (int i=0; i<100; i++) {
            sb.append((char)i);
        }
-        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 255);
+        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5);
        // All 100 input characters are below 0x80, i.e. 1 byte each in utf-8. Asking for 5 bytes
        // removes 6 characters (one extra to make room for the '#') and adds back the 1-byte '#',
        // so both the byte length and the char length are 100 - 6 + 1 = 95.
        Assert.assertEquals(95, result.getBytes(UTF8).length);
        Assert.assertEquals(95, result.length());
    }
-        Assert.assertEquals(255, result.length());
+    @Test
    public void test2ByteEncodings() {
        // 100 consecutive characters starting at 0x80, each of which is 2 bytes in utf-8
        StringBuilder input = new StringBuilder();
        int count = 0;
        while (count < 100) {
            input.append((char)(0x80 + count));
            count++;
        }
        final String original = input.toString();
        // Asking for 4 bytes: three 2-byte characters come out, and the 1-byte '#' goes in
        String shortened = ClassFileNameHandler.shortenPathComponent(original, 4);
        Assert.assertEquals(200, original.getBytes(UTF8).length);
        Assert.assertEquals(195, shortened.getBytes(UTF8).length);
        Assert.assertEquals(98, shortened.length());
        // Asking for 5 bytes: still three 2-byte characters out, and the 1-byte '#' in
        shortened = ClassFileNameHandler.shortenPathComponent(original, 5);
        Assert.assertEquals(200, original.getBytes(UTF8).length);
        Assert.assertEquals(195, shortened.getBytes(UTF8).length);
        Assert.assertEquals(98, shortened.length());
    }
    @Test
    public void test3ByteEncodings() {
        // 100 consecutive characters starting at 0x800, each of which is 3 bytes in utf-8
        StringBuilder input = new StringBuilder();
        int count = 0;
        while (count < 100) {
            input.append((char)(0x800 + count));
            count++;
        }
        final String original = input.toString();
        // Asking for 6 bytes: three 3-byte characters come out, and the 1-byte '#' goes in
        String shortened = ClassFileNameHandler.shortenPathComponent(original, 6);
        Assert.assertEquals(300, original.getBytes(UTF8).length);
        Assert.assertEquals(292, shortened.getBytes(UTF8).length);
        Assert.assertEquals(98, shortened.length());
        // Asking for 7 bytes: still three 3-byte characters out, and the 1-byte '#' in
        shortened = ClassFileNameHandler.shortenPathComponent(original, 7);
        Assert.assertEquals(300, original.getBytes(UTF8).length);
        Assert.assertEquals(292, shortened.getBytes(UTF8).length);
        Assert.assertEquals(98, shortened.length());
    }
    @Test
    public void test4ByteEncodings() {
        // 100 supplementary code points starting at 0x10000. Each one is 4 bytes in utf-8 and
        // 2 chars (a surrogate pair) in the String, so the input is 400 bytes / 200 chars.
        StringBuilder sb = new StringBuilder();
        for (int i=0x10000; i<0x10000+100; i++) {
            sb.appendCodePoint(i);
        }
        // we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#'
        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 8);
        Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
        Assert.assertEquals(389, result.getBytes(UTF8).length);
        Assert.assertEquals(195, result.length());
        // we remove 2 codepoints == 4 characters == 8 bytes, and then add back in the 1-byte '#'
        result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7);
        Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
        Assert.assertEquals(393, result.getBytes(UTF8).length);
        Assert.assertEquals(197, result.length());
    }
 }