Add a new faster, memory efficient URL decoder.

Query string parameters need to be decoded as per RFC 3986, Section 2. The implementation of the URLDecoder in the JDK is slow because it takes long and inefficient code paths, and it generates a lot more garbage than necessary. This decoder is about 2x faster and doesn't allocate any memory in the easy case where the decoded string is unchanged. When the string needs to be changed, only one buffer is allocated for the decoded string.
2010-12-03 23:12:03 -08:00 · 2010-12-03 23:12:03 -08:00 · 1f1b8d8325
commit 1f1b8d8325
parent 3d85d56631
2 changed files with 149 additions and 6 deletions
--- a/src/main/java/org/jboss/netty/handler/codec/http/QueryStringDecoder.java
+++ b/src/main/java/org/jboss/netty/handler/codec/http/QueryStringDecoder.java
@ -17,7 +17,6 @@ package org.jboss.netty.handler.codec.http;

 import java.io.UnsupportedEncodingException;
 import java.net.URI;
-import java.net.URLDecoder;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
@ -171,15 +170,121 @@ public class QueryStringDecoder {
        return params;
    }

-    private static String decodeComponent(String s, Charset charset) {
+    /**
+     * Decodes a bit of an URL encoded by a browser.
+     * <p>
+     * This is equivalent to calling {@link decodeComponent(String, Charset)}
+     * with the UTF-8 charset (recommended to comply with RFC 3986, Section 2).
+     * @param s The string to decode (can be empty).
+     * @return The decoded string, or {@code s} if there's nothing to decode.
+     * If the string to decode is {@code null}, returns an empty string.
+     * @throws IllegalArgumentException if the string contains a malformed
+     * escape sequence.
+     */
+    public static String decodeComponent(final String s) {
+        return decodeComponent(s, HttpCodecUtil.DEFAULT_CHARSET);
+    }
+
+    /**
+     * Decodes a bit of an URL encoded by a browser.
+     * <p>
+     * The string is expected to be encoded as per RFC 3986, Section 2.
+     * This is the encoding used by JavaScript functions {@code encodeURI}
+     * and {@code encodeURIComponent}, but not {@code escape}.  For example
+     * in this encoding, &eacute; (in Unicode {@code U+00E9} or in UTF-8
+     * {@code 0xC3 0xA9}) is encoded as {@code %C3%A9} or {@code %c3%a9}.
+     * <p>
+     * This is essentially equivalent to calling
+     *   <code>{@link java.net.URLDecoder URLDecoder}.{@link
+     *   java.net.URLDecoder.decode}(s, charset.name())</code>
+     * except that it's over 2x faster and generates less garbage for the GC.
+     * Actually this function doesn't allocate any memory if there's nothing
+     * to decode, the argument itself is returned.
+     * @param s The string to decode (can be empty).
+     * @param charset The charset to use to decode the string (should really
+     * be {@link CharsetUtil.UTF_8}.
+     * @return The decoded string, or {@code s} if there's nothing to decode.
+     * If the string to decode is {@code null}, returns an empty string.
+     * @throws IllegalArgumentException if the string contains a malformed
+     * escape sequence.
+     */
+    @SuppressWarnings("fallthrough")
+    public static String decodeComponent(final String s,
+                                         final Charset charset) {
        if (s == null) {
            return "";
        }
+        final int size = s.length();
+        boolean modified = false;
+        for (int i = 0; i < size; i++) {
+            final char c = s.charAt(i);
+            switch (c) {
+                case '%':
+                    i++;  // We can skip at least one char, e.g. `%%'.
+                    // Fall through.
+                case '+':
+                    modified = true;
+                    break;
+            }
+        }
+        if (!modified) {
+            return s;
+        }
+        final byte[] buf = new byte[size];
+        int pos = 0;  // position in `buf'.
+        for (int i = 0; i < size; i++) {
+            char c = s.charAt(i);
+            switch (c) {
+                case '+':
+                    buf[pos++] = ' ';  // "+" -> " "
+                    break;
+                case '%':
+                    if (i == size - 1) {
+                        throw new IllegalArgumentException("unterminated escape"
+                                + " sequence at end of string: " + s);
+                    }
+                    c = s.charAt(++i);
+                    if (c == '%') {
+                        buf[pos++] = '%';  // "%%" -> "%"
+                        break;
+                    } else if (i == size - 1) {
+                        throw new IllegalArgumentException("partial escape"
+                                + " sequence at end of string: " + s);
+                    }
+                    c = decodeHexNibble(c);
+                    final char c2 = decodeHexNibble(s.charAt(++i));
+                    if (c == Character.MAX_VALUE || c2 == Character.MAX_VALUE) {
+                        throw new IllegalArgumentException(
+                                "invalid escape sequence `%" + s.charAt(i - 1)
+                                + s.charAt(i) + "' at index " + (i - 2)
+                                + " of: " + s);
+                    }
+                    c = (char) (c * 16 + c2);
+                    // Fall through.
+                default:
+                    buf[pos++] = (byte) c;
+                    break;
+            }
+        }
+        return new String(buf, 0, pos, charset);
+    }

-        try {
-            return URLDecoder.decode(s, charset.name());
-        } catch (UnsupportedEncodingException e) {
-            throw new UnsupportedCharsetException(charset.name());
+    /**
+     * Helper to decode half of a hexadecimal number from a string.
+     * @param c The ASCII character of the hexadecimal number to decode.
+     * Must be in the range {@code [0-9a-fA-F]}.
+     * @return The hexadecimal value represented in the ASCII character
+     * given, or {@link Character.MAX_VALUE} if the character is invalid.
+     */
+    private static char decodeHexNibble(final char c) {
+        if ('0' <= c && c <= '9') {
+            return (char) (c - '0');
+        } else if ('a' <= c && c <= 'f') {
+            return (char) (c - 'a' + 10);
+        } else if ('A' <= c && c <= 'F') {
+            return (char) (c - 'A' + 10);
+        } else {
+            return Character.MAX_VALUE;
        }
    }

--- a/src/test/java/org/jboss/netty/handler/codec/http/QueryStringDecoderTest.java
+++ b/src/test/java/org/jboss/netty/handler/codec/http/QueryStringDecoderTest.java
@ -73,6 +73,7 @@ public class QueryStringDecoderTest {
        Assert.assertEquals("1=", d.getParameters().get("a").get(0));
        Assert.assertEquals("=2", d.getParameters().get("a").get(1));
    }
+
    @Test
    public void testExotic() throws Exception {
        assertQueryString("", "");
@ -97,6 +98,43 @@ public class QueryStringDecoderTest {
        assertQueryString("/foo?a=1&a=&a=", "/foo?a=1&a&a=");
    }

+    @Test
+    public void testUrlDecoding() throws Exception {
+        final String caffe = new String(
+                // "Caffé" but instead of putting the literal E-acute in the
+                // source file, we directly use the UTF-8 encoding so as to
+                // not rely on the platform's default encoding (not portable).
+                new byte[] {'C', 'a', 'f', 'f', (byte) 0xC3, (byte) 0xA9},
+                "UTF-8");
+        final String[] tests = new String[] {
+            // Encoded   ->   Decoded or error message substring
+            "",               "",
+            "foo",            "foo",
+            "f%%b",           "f%b",
+            "f+o",            "f o",
+            "f++",            "f  ",
+            "fo%",            "unterminated escape sequence",
+            "%42",            "B",
+            "%5f",            "_",
+            "f%4",            "partial escape sequence",
+            "%x2",            "invalid escape sequence `%x2' at index 0 of: %x2",
+            "%4x",            "invalid escape sequence `%4x' at index 0 of: %4x",
+            "Caff%C3%A9",     caffe,
+        };
+        for (int i = 0; i < tests.length; i += 2) {
+            final String encoded = tests[i];
+            final String expected = tests[i + 1];
+            try {
+                final String decoded = QueryStringDecoder.decodeComponent(encoded);
+                Assert.assertEquals(expected, decoded);
+            } catch (IllegalArgumentException e) {
+                Assert.assertTrue("String \"" + e.getMessage() + "\" does"
+                                  + " not contain \"" + expected + '"',
+                                  e.getMessage().contains(expected));
+            }
+        }
+    }
+
    private static void assertQueryString(String expected, String actual) {
        QueryStringDecoder ed = new QueryStringDecoder(expected, CharsetUtil.UTF_8);
        QueryStringDecoder ad = new QueryStringDecoder(actual, CharsetUtil.UTF_8);