Truncate filenames based on their utf-8 length

2025-01-11 12:35:52 +01:00 · 2014-02-01 14:43:54 -08:00 · 2014-02-01 14:43:54 -08:00 · 83e63dab7a
commit 83e63dab7a
parent 838b35e477
2 changed files with 155 additions and 13 deletions
--- a/brut.apktool.smali/util/src/main/java/org/jf/util/ClassFileNameHandler.java
+++ b/brut.apktool.smali/util/src/main/java/org/jf/util/ClassFileNameHandler.java
@ -33,7 +33,9 @@ import ds.tree.RadixTreeImpl;

 import javax.annotation.Nonnull;
 import java.io.*;
+import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
+import java.nio.IntBuffer;
 import java.util.regex.Pattern;

 /**
@ -87,8 +89,9 @@ public class ClassFileNameHandler {
                    packageElement += "#";
                }

-                if (packageElement.length() > MAX_FILENAME_LENGTH) {
-                    packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH);
+                int utf8Length = utf8Length(packageElement);
+                if (utf8Length > MAX_FILENAME_LENGTH) {
+                    packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
                }

                packageElements[elementIndex++] = packageElement;
@ -109,8 +112,9 @@ public class ClassFileNameHandler {
            packageElement += "#";
        }

-        if ((packageElement.length() + fileExtension.length()) > MAX_FILENAME_LENGTH) {
-            packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH - fileExtension.length());
+        int utf8Length = utf8Length(packageElement) + utf8Length(fileExtension);
+        if (utf8Length > MAX_FILENAME_LENGTH) {
+            packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
        }

        packageElements[elementIndex] = packageElement;
@ -118,12 +122,87 @@ public class ClassFileNameHandler {
        return top.addUniqueChild(packageElements, 0);
    }

-    @Nonnull
-    static String shortenPathComponent(@Nonnull String pathComponent, int maxLength) {
-        int toRemove = pathComponent.length() - maxLength + 1;
+    /**
+     * Computes the number of bytes needed to encode the given string as utf-8.
+     *
+     * @param str the string to measure
+     * @return the sum of the utf-8 lengths of every code point in str
+     */
+    private static int utf8Length(String str) {
+        int totalBytes = 0;
+        // step through the string one code point at a time, so that a surrogate pair is
+        // measured as a single code point rather than as two separate chars
+        for (int offset = 0; offset < str.length(); ) {
+            int codePoint = str.codePointAt(offset);
+            totalBytes += utf8Length(codePoint);
+            offset += Character.charCount(codePoint);
+        }
+        return totalBytes;
+    }

-        int firstIndex = (pathComponent.length()/2) - (toRemove/2);
-        return pathComponent.substring(0, firstIndex) + "#" + pathComponent.substring(firstIndex+toRemove);
+    /**
+     * Returns the number of bytes used to encode a single code point as utf-8.
+     *
+     * @param codePoint the code point to measure
+     * @return 1, 2, 3 or 4, depending on which range the code point falls in
+     */
+    private static int utf8Length(int codePoint) {
+        // check the largest range first and work downwards
+        if (codePoint >= 0x10000) {
+            return 4;
+        }
+        if (codePoint >= 0x800) {
+            return 3;
+        }
+        if (codePoint >= 0x80) {
+            return 2;
+        }
+        return 1;
+    }
+
+    /**
+     * Shortens an individual file/directory name, removing the necessary number of code points
+     * from the middle of the string such that the utf-8 encoding of the string is at least
+     * bytesToRemove bytes shorter than the original.
+     *
+     * The removed codePoints in the middle of the string will be replaced with a # character.
+     * If every code point has been removed before the target is reached, the result is just "#".
+     *
+     * @param pathComponent the file/directory name to shorten
+     * @param bytesToRemove the minimum number of utf-8 bytes the name should shrink by
+     * @return the shortened name; an empty pathComponent is returned unchanged
+     */
+    @Nonnull
+    static String shortenPathComponent(@Nonnull String pathComponent, int bytesToRemove) {
+        // An empty name has no middle code point to cut. Without this guard the
+        // codePoints[midPoint] lookup below throws ArrayIndexOutOfBoundsException.
+        if (pathComponent.length() == 0) {
+            return pathComponent;
+        }
+
+        // We replace the removed part with a #, so we need to remove 1 extra char
+        bytesToRemove++;
+
+        // Decode the string into one int per code point, so the indexes below address whole
+        // code points rather than utf-16 chars
+        int[] codePoints;
+        try {
+            IntBuffer intBuffer = ByteBuffer.wrap(pathComponent.getBytes("UTF-32BE")).asIntBuffer();
+            codePoints = new int[intBuffer.limit()];
+            intBuffer.get(codePoints);
+        } catch (UnsupportedEncodingException ex) {
+            throw new RuntimeException(ex);
+        }
+
+        int midPoint = codePoints.length/2;
+
+        int firstEnd = midPoint; // exclusive
+        int secondStart = midPoint+1; // inclusive
+        int bytesRemoved = utf8Length(codePoints[midPoint]);
+
+        // if we have an even number of codepoints, start by removing both middle characters,
+        // unless just removing the first already removes enough bytes
+        if (((codePoints.length % 2) == 0) && bytesRemoved < bytesToRemove) {
+            bytesRemoved += utf8Length(codePoints[secondStart]);
+            secondStart++;
+        }
+
+        // widen the removed range one code point at a time, alternating between the left and
+        // right side, until enough bytes are gone or there is nothing left to remove
+        while ((bytesRemoved < bytesToRemove) &&
+                (firstEnd > 0 || secondStart < codePoints.length)) {
+            if (firstEnd > 0) {
+                firstEnd--;
+                bytesRemoved += utf8Length(codePoints[firstEnd]);
+            }
+
+            if (bytesRemoved < bytesToRemove && secondStart < codePoints.length) {
+                bytesRemoved += utf8Length(codePoints[secondStart]);
+                secondStart++;
+            }
+        }
+
+        // stitch the kept prefix and suffix back together around the '#' placeholder
+        StringBuilder sb = new StringBuilder();
+        for (int i=0; i<firstEnd; i++) {
+            sb.appendCodePoint(codePoints[i]);
+        }
+        sb.append('#');
+        for (int i=secondStart; i<codePoints.length; i++) {
+            sb.appendCodePoint(codePoints[i]);
+        }
+
+        return sb.toString();
    }

    private static boolean testForWindowsReservedFileNames(File path) {
--- a/brut.apktool.smali/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
+++ b/brut.apktool.smali/util/src/test/java/org/jf/util/ClassFileNameHandlerTest.java
@ -34,16 +34,79 @@ package org.jf.util;
 import junit.framework.Assert;
 import org.junit.Test;

+import java.nio.charset.Charset;
+
 public class ClassFileNameHandlerTest {
+    private final Charset UTF8 = Charset.forName("UTF-8");
+
    @Test
-    public void testShortedPathComponent() {
+    public void test1ByteEncodings() {
        StringBuilder sb = new StringBuilder();
-        for (int i=0; i<300; i++) {
+        for (int i=0; i<100; i++) {
            sb.append((char)i);
        }

-        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 255);
+        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5);
+        Assert.assertEquals(95, result.getBytes(UTF8).length);
+        Assert.assertEquals(95, result.length());
+    }

-        Assert.assertEquals(255, result.length());
+    @Test
+    public void test2ByteEncodings() {
+        // 100 consecutive chars starting at U+0080, each of which takes 2 bytes in utf-8
+        StringBuilder builder = new StringBuilder();
+        for (int offset = 0; offset < 100; offset++) {
+            builder.append((char)(0x80 + offset));
+        }
+        String input = builder.toString();
+        Assert.assertEquals(200, input.getBytes(UTF8).length);
+
+        // asking for 4 bytes removes 3 2-byte characters, and adds back the 1-byte '#'
+        String shortenedBy4 = ClassFileNameHandler.shortenPathComponent(input, 4);
+        Assert.assertEquals(195, shortenedBy4.getBytes(UTF8).length);
+        Assert.assertEquals(98, shortenedBy4.length());
+
+        // asking for 5 bytes also removes 3 2-byte characters, and adds back the 1-byte '#'
+        String shortenedBy5 = ClassFileNameHandler.shortenPathComponent(input, 5);
+        Assert.assertEquals(195, shortenedBy5.getBytes(UTF8).length);
+        Assert.assertEquals(98, shortenedBy5.length());
+    }
+
+    @Test
+    public void test3ByteEncodings() {
+        // 100 consecutive chars starting at U+0800, each of which takes 3 bytes in utf-8
+        StringBuilder builder = new StringBuilder();
+        for (int offset = 0; offset < 100; offset++) {
+            builder.append((char)(0x800 + offset));
+        }
+        String input = builder.toString();
+        Assert.assertEquals(300, input.getBytes(UTF8).length);
+
+        // asking for 6 bytes removes 3 3-byte characters, and adds back the 1-byte '#'
+        String shortenedBy6 = ClassFileNameHandler.shortenPathComponent(input, 6);
+        Assert.assertEquals(292, shortenedBy6.getBytes(UTF8).length);
+        Assert.assertEquals(98, shortenedBy6.length());
+
+        // asking for 7 bytes also removes 3 3-byte characters, and adds back the 1-byte '#'
+        String shortenedBy7 = ClassFileNameHandler.shortenPathComponent(input, 7);
+        Assert.assertEquals(292, shortenedBy7.getBytes(UTF8).length);
+        Assert.assertEquals(98, shortenedBy7.length());
+    }
+
+    @Test
+    public void test4ByteEncodings() {
+        // 100 supplementary code points starting at U+10000; each one is a surrogate pair
+        // (2 chars) in the String and takes 4 bytes in utf-8
+        StringBuilder sb = new StringBuilder();
+        for (int i=0x10000; i<0x10000+100; i++) {
+            sb.appendCodePoint(i);
+        }
+
+        // we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#'
+        String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 8);
+        Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
+        Assert.assertEquals(389, result.getBytes(UTF8).length);
+        // 200 chars originally, minus the 6 removed, plus the '#'
+        Assert.assertEquals(195, result.length());
+
+        // we remove 2 codepoints == 4 characters == 8 bytes, and then add back in the 1-byte '#'
+        result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7);
+        Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
+        Assert.assertEquals(393, result.getBytes(UTF8).length);
+        // 200 chars originally, minus the 4 removed, plus the '#'
+        Assert.assertEquals(197, result.length());
    }
 }