Truncate filenames based on their utf-8 length

This commit is contained in:
Ben Gruver 2014-02-01 14:43:54 -08:00 committed by Connor Tumbleson
parent 838b35e477
commit 83e63dab7a
2 changed files with 155 additions and 13 deletions

View File

@ -33,7 +33,9 @@ import ds.tree.RadixTreeImpl;
import javax.annotation.Nonnull;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.util.regex.Pattern;
/**
@ -87,8 +89,9 @@ public class ClassFileNameHandler {
packageElement += "#";
}
if (packageElement.length() > MAX_FILENAME_LENGTH) {
packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH);
int utf8Length = utf8Length(packageElement);
if (utf8Length > MAX_FILENAME_LENGTH) {
packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
}
packageElements[elementIndex++] = packageElement;
@ -109,8 +112,9 @@ public class ClassFileNameHandler {
packageElement += "#";
}
if ((packageElement.length() + fileExtension.length()) > MAX_FILENAME_LENGTH) {
packageElement = shortenPathComponent(packageElement, MAX_FILENAME_LENGTH - fileExtension.length());
int utf8Length = utf8Length(packageElement) + utf8Length(fileExtension);
if (utf8Length > MAX_FILENAME_LENGTH) {
packageElement = shortenPathComponent(packageElement, utf8Length - MAX_FILENAME_LENGTH);
}
packageElements[elementIndex] = packageElement;
@ -118,12 +122,87 @@ public class ClassFileNameHandler {
return top.addUniqueChild(packageElements, 0);
}
@Nonnull
static String shortenPathComponent(@Nonnull String pathComponent, int maxLength) {
int toRemove = pathComponent.length() - maxLength + 1;
/**
 * Computes the number of bytes the given string occupies when encoded as UTF-8.
 *
 * The string is walked one code point at a time (a valid surrogate pair is read as a single
 * code point), and the per-code-point byte counts are summed.
 *
 * @param str the string to measure
 * @return the summed UTF-8 length, in bytes, of every code point in str
 */
private static int utf8Length(String str) {
    final int charCount = str.length();
    int totalBytes = 0;
    for (int offset = 0; offset < charCount; ) {
        final int codePoint = str.codePointAt(offset);
        totalBytes += utf8Length(codePoint);
        // advance by 1 or 2 chars, depending on whether this was a supplementary code point
        offset += Character.charCount(codePoint);
    }
    return totalBytes;
}
int firstIndex = (pathComponent.length()/2) - (toRemove/2);
return pathComponent.substring(0, firstIndex) + "#" + pathComponent.substring(firstIndex+toRemove);
/**
 * Returns the number of bytes needed to encode a single code point in UTF-8.
 *
 * @param codePoint the code point to measure
 * @return 1 for values below 0x80, 2 below 0x800, 3 below 0x10000, and 4 otherwise
 */
private static int utf8Length(int codePoint) {
    // Checked from the widest encoding down, so each guard only needs a lower bound.
    if (codePoint >= 0x10000) {
        return 4;
    }
    if (codePoint >= 0x800) {
        return 3;
    }
    if (codePoint >= 0x80) {
        return 2;
    }
    return 1;
}
/**
 * Shortens an individual file/directory name, removing the necessary number of code points
 * from the middle of the string such that the utf-8 encoding of the string is at least
 * bytesToRemove bytes shorter than the original.
 *
 * The removed codePoints in the middle of the string will be replaced with a # character.
 *
 * Removal starts at the centre of the string and grows outwards, alternating between the
 * left and right side. If the string does not contain enough bytes to satisfy the request,
 * every code point is removed and the result is just "#". An empty string has nothing to
 * remove and is returned unchanged.
 *
 * @param pathComponent the file or directory name to shorten
 * @param bytesToRemove the minimum number of utf-8 bytes by which the name should shrink
 * @return the shortened name, with the removed middle section replaced by a single '#'
 */
@Nonnull
static String shortenPathComponent(@Nonnull String pathComponent, int bytesToRemove) {
    // We replace the removed part with a #, so we need to remove 1 extra byte
    bytesToRemove++;

    // Decode to an array of code points so that surrogate pairs are never split
    int[] codePoints;
    try {
        IntBuffer intBuffer = ByteBuffer.wrap(pathComponent.getBytes("UTF-32BE")).asIntBuffer();
        codePoints = new int[intBuffer.limit()];
        intBuffer.get(codePoints);
    } catch (UnsupportedEncodingException ex) {
        throw new RuntimeException(ex);
    }

    // Nothing can be removed from an empty name; indexing the midpoint below would fail
    if (codePoints.length == 0) {
        return pathComponent;
    }

    int midPoint = codePoints.length/2;

    int firstEnd = midPoint; // exclusive
    int secondStart = midPoint+1; // inclusive
    int bytesRemoved = utf8Length(codePoints[midPoint]);

    // if we have an even number of codepoints, start by removing both middle characters,
    // unless just removing the first already removes enough bytes.
    // For an even length the two middle code points are midPoint-1 and midPoint, so the
    // removed window has to grow to the left here; growing to the right would be off-centre
    // and would index past the end of a 2-codepoint string.
    if (((codePoints.length % 2) == 0) && bytesRemoved < bytesToRemove) {
        firstEnd--;
        bytesRemoved += utf8Length(codePoints[firstEnd]);
    }

    // Widen the removed window one code point at a time, left side first, until enough
    // bytes are gone or there is nothing left to remove
    while ((bytesRemoved < bytesToRemove) &&
            (firstEnd > 0 || secondStart < codePoints.length)) {
        if (firstEnd > 0) {
            firstEnd--;
            bytesRemoved += utf8Length(codePoints[firstEnd]);
        }

        if (bytesRemoved < bytesToRemove && secondStart < codePoints.length) {
            bytesRemoved += utf8Length(codePoints[secondStart]);
            secondStart++;
        }
    }

    // Reassemble: kept prefix + '#' + kept suffix
    StringBuilder sb = new StringBuilder();
    for (int i=0; i<firstEnd; i++) {
        sb.appendCodePoint(codePoints[i]);
    }
    sb.append('#');
    for (int i=secondStart; i<codePoints.length; i++) {
        sb.appendCodePoint(codePoints[i]);
    }

    return sb.toString();
}
private static boolean testForWindowsReservedFileNames(File path) {

View File

@ -34,16 +34,79 @@ package org.jf.util;
import junit.framework.Assert;
import org.junit.Test;
import java.nio.charset.Charset;
public class ClassFileNameHandlerTest {
private final Charset UTF8 = Charset.forName("UTF-8");
@Test
public void testShortedPathComponent() {
public void test1ByteEncodings() {
StringBuilder sb = new StringBuilder();
for (int i=0; i<300; i++) {
for (int i=0; i<100; i++) {
sb.append((char)i);
}
String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 255);
String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 5);
Assert.assertEquals(95, result.getBytes(UTF8).length);
Assert.assertEquals(95, result.length());
}
Assert.assertEquals(255, result.length());
/**
 * Checks shortening of a name made up entirely of code points that need 2 bytes in utf-8.
 */
@Test
public void test2ByteEncodings() {
    // 100 consecutive chars starting at U+0080
    StringBuilder builder = new StringBuilder();
    for (int offset = 0; offset < 100; offset++) {
        builder.append((char) (0x80 + offset));
    }
    final String original = builder.toString();

    // Asking for 4 or for 5 bytes gives the same result: a total of 3 2-byte characters
    // are removed, and then the 1-byte '#' is added back in
    for (int bytesToRemove : new int[] {4, 5}) {
        String shortened = ClassFileNameHandler.shortenPathComponent(original, bytesToRemove);
        Assert.assertEquals(200, original.getBytes(UTF8).length);
        Assert.assertEquals(195, shortened.getBytes(UTF8).length);
        Assert.assertEquals(98, shortened.length());
    }
}
/**
 * Checks shortening of a name made up entirely of code points that need 3 bytes in utf-8.
 */
@Test
public void test3ByteEncodings() {
    // 100 consecutive chars starting at U+0800
    StringBuilder builder = new StringBuilder();
    for (int offset = 0; offset < 100; offset++) {
        builder.append((char) (0x800 + offset));
    }
    final String original = builder.toString();

    // Asking for 6 or for 7 bytes gives the same result: a total of 3 3-byte characters
    // are removed, and then the 1-byte '#' is added back in
    for (int bytesToRemove : new int[] {6, 7}) {
        String shortened = ClassFileNameHandler.shortenPathComponent(original, bytesToRemove);
        Assert.assertEquals(300, original.getBytes(UTF8).length);
        Assert.assertEquals(292, shortened.getBytes(UTF8).length);
        Assert.assertEquals(98, shortened.length());
    }
}
/**
 * Checks shortening of a name made up entirely of supplementary code points, each of which
 * needs 4 bytes in utf-8 and 2 chars (a surrogate pair) in a Java String.
 */
@Test
public void test4ByteEncodings() {
    StringBuilder sb = new StringBuilder();
    for (int i=0x10000; i<0x10000+100; i++) {
        sb.appendCodePoint(i);
    }

    // we remove 3 codepoints == 6 characters == 12 bytes, and then add back in the 1-byte '#'
    String result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 8);
    Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
    Assert.assertEquals(389, result.getBytes(UTF8).length);
    // String.length() counts UTF-16 chars: 200 - 6 removed + 1 for the '#'
    Assert.assertEquals(195, result.length());

    // we remove 2 codepoints == 4 characters == 8 bytes, and then add back in the 1-byte '#'
    result = ClassFileNameHandler.shortenPathComponent(sb.toString(), 7);
    Assert.assertEquals(400, sb.toString().getBytes(UTF8).length);
    Assert.assertEquals(393, result.getBytes(UTF8).length);
    // String.length() counts UTF-16 chars: 200 - 4 removed + 1 for the '#'
    Assert.assertEquals(197, result.length());
}
}