Truncate filenames based on their utf-8 length

This commit is contained in:
Ben Gruver 2014-02-01 14:43:54 -08:00 committed by Connor Tumbleson
parent 838b35e477
commit 83e63dab7a
2 changed files with 155 additions and 13 deletions

View File

@ -33,7 +33,9 @@ import ds.tree.RadixTreeImpl;
import javax.annotation.Nonnull; import javax.annotation.Nonnull;
import java.io.*; import java.io.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer; import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
@ -87,8 +89,9 @@ public class ClassFileNameHandler {
packageElement += "#"; packageElement += "#";
} }
if (packageElement.length() > MAX_FILENAME_LENGTH) { int utf8Length = utf8Length(packageElement);
packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH); if (utf8Length > MAX_FILENAME_LENGTH) {
packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
} }
packageElements[elementIndex++] = packageElement; packageElements[elementIndex++] = packageElement;
@ -109,8 +112,9 @@ public class ClassFileNameHandler {
packageElement += "#"; packageElement += "#";
} }
if ((packageElement.length() + fileExtension.length()) > MAX_FILENAME_LENGTH) { int utf8Length = utf8Length(packageElement) + utf8Length(fileExtension);
packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH - fileExtension.length()); if (utf8Length > MAX_FILENAME_LENGTH) {
packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
} }
packageElements[elementIndex] = packageElement; packageElements[elementIndex] = packageElement;
@ -118,12 +122,87 @@ public class ClassFileNameHandler {
return top.addUniqueChild(packageElements, 0); return top.addUniqueChild(packageElements, 0);
} }
@Nonnull private static int utf8Length(String str) {
static String shortenPathComponent(@Nonnull String pathComponent, int maxLength) { int utf8Length = 0;
int toRemove = pathComponent.length() - maxLength + 1; int i=0;
while (i<str.length()) {
int c = str.codePointAt(i);
utf8Length += utf8Length(c);
i += Character.charCount(c);
}
return utf8Length;
}
int firstIndex = (pathComponent.length()/2) - (toRemove/2); private static int utf8Length(int codePoint) {
return pathComponent.substring(0, firstIndex) + "#" + pathComponent.substring(firstIndex+toRemove); if (codePoint < 0x80) {
return 1;
} else if (codePoint < 0x800) {
return 2;
} else if (codePoint < 0x10000) {
return 3;
} else {
return 4;
}
}
/**
* Shortens an individual file/directory name, removing the necessary number of code points
* from the middle of the string such that the utf-8 encoding of the string is at least
* bytesToRemove bytes shorter than the original.
*
* The removed codePoints in the middle of the string will be replaced with a # character.
*/
@Nonnull
static String shortenPathComponent(@Nonnull String pathComponent, int bytesToRemove) {
// We replace the removed part with a #, so we need to remove 1 extra char
bytesToRemove++;
int[] codePoints;
try {
IntBuffer intBuffer = ByteBuffer.wrap(pathComponent.getBytes("UTF-32BE")).asIntBuffer();
codePoints = new int[intBuffer.limit()];
intBuffer.get(codePoints);
} catch (UnsupportedEncodingException ex) {
throw new RuntimeException(ex);
}
int midPoint = codePoints.length/2;
int delta = 0;
int firstEnd = midPoint; // exclusive
int secondStart = midPoint+1; // inclusive
int bytesRemoved = utf8Length(codePoints[midPoint]);
// if we have an even number of codepoints, start by removing both middle characters,
// unless just removing the first already removes enough bytes
if (((codePoints.length % 2) == 0) && bytesRemoved < bytesToRemove) {
bytesRemoved += utf8Length(codePoints[secondStart]);
secondStart++;
}
while ((bytesRemoved < bytesToRemove) &&
(firstEnd > 0 || secondStart < codePoints.length)) {
if (firstEnd > 0) {
firstEnd--;
bytesRemoved += utf8Length(codePoints[firstEnd]);
}
if (bytesRemoved < bytesToRemove && secondStart < codePoints.length) {
bytesRemoved += utf8Length(codePoints[secondStart]);
secondStart++;
}
}
StringBuilder sb = new StringBuilder();
for (int i=0; i<firstEnd; i++) {
sb.appendCodePoint(codePoints[i]);
}
sb.append('#');
for (int i=secondStart; i<codePoints.length; i++) {
sb.appendCodePoint(codePoints[i]);
}
return sb.toString();
} }
private static boolean testForWindowsReservedFileNames(File path) { private static boolean testForWindowsReservedFileNames(File path) {

View File

@ -34,16 +34,79 @@ package org.jf.util;
import junit.framework.Assert; import junit.framework.Assert;
import org.junit.Test; import org.junit.Test;
import java.nio.charset.Charset;
public class ClassFileNameHandlerTest { public class ClassFileNameHandlerTest {
private final Charset UTF8 = Charset.forName("UTF-8");
@Test @Test
public void testShortedPathComponent() { public void test1ByteEncodings() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for (int i=0; i<300; i++) { for (int i=0; i<100; i++) {
sb.append((char)i); sb.append((char)i);
} }
String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 255); String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5);
Assert.assertEquals(95, result.getBytes(UTF8).length);
Assert.assertEquals(95, result.length());
}
Assert.assertEquals(255, result.length()); @Test
public void test2ByteEncodings() {
    // Build a 100-char string from U+0080 upwards; every char needs 2 bytes in UTF-8
    StringBuilder builder = new StringBuilder();
    for (char c = 0x80; c < 0x80 + 100; c++) {
        builder.append(c);
    }
    String input = builder.toString();

    // Asking for 4 bytes: 3 2-byte characters are removed, then the 1-byte '#' is added back
    String shortened = ClassFileNameHandler.shortenPathComponent(input, 4);
    Assert.assertEquals(200, input.getBytes(UTF8).length);
    Assert.assertEquals(195, shortened.getBytes(UTF8).length);
    Assert.assertEquals(98, shortened.length());

    // Asking for 5 bytes: still 3 2-byte characters removed, then the 1-byte '#' added back
    shortened = ClassFileNameHandler.shortenPathComponent(input, 5);
    Assert.assertEquals(200, input.getBytes(UTF8).length);
    Assert.assertEquals(195, shortened.getBytes(UTF8).length);
    Assert.assertEquals(98, shortened.length());
}
@Test
public void test3ByteEncodings() {
    // Build a 100-char string from U+0800 upwards; every char needs 3 bytes in UTF-8
    StringBuilder builder = new StringBuilder();
    for (char c = 0x800; c < 0x800 + 100; c++) {
        builder.append(c);
    }
    String input = builder.toString();

    // Asking for 6 bytes: 3 3-byte characters are removed, then the 1-byte '#' is added back
    String shortened = ClassFileNameHandler.shortenPathComponent(input, 6);
    Assert.assertEquals(300, input.getBytes(UTF8).length);
    Assert.assertEquals(292, shortened.getBytes(UTF8).length);
    Assert.assertEquals(98, shortened.length());

    // Asking for 7 bytes: still 3 3-byte characters removed, then the 1-byte '#' added back
    shortened = ClassFileNameHandler.shortenPathComponent(input, 7);
    Assert.assertEquals(300, input.getBytes(UTF8).length);
    Assert.assertEquals(292, shortened.getBytes(UTF8).length);
    Assert.assertEquals(98, shortened.length());
}
public void test4ByteEncodings() {
StringBuilder sb = new StringBuilder();
for (int i=0x10000; i<0x10000+100; i++) {
sb.appendCodePoint(i);
}
// we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#'
String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 8);
Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
Assert.assertEquals(389, result.getBytes(UTF8).length);
Assert.assertEquals(98, result.length());
// we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#'
result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7);
Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
Assert.assertEquals(3892, result.getBytes(UTF8).length);
Assert.assertEquals(98, result.length());
} }
} }