Optimize the QueryStringEncoder performance (#9568)

Motivation: Optimize the QueryStringEncoder for lower memory overhead and higher encoding speed. Modification: Encode the space to %20 directly (instead of first encoding it to + and replacing it afterwards), and reuse the uriBuilder rather than creating a new StringBuilder. Result: Improved performance.
2019-09-21 03:07:13 +08:00 · 2019-09-21 03:07:13 +08:00 · 07fe1a299a
commit 07fe1a299a
parent 39cafcb05c
1 changed files with 135 additions and 26 deletions
--- a/codec-http/src/main/java/io/netty/handler/codec/http/QueryStringEncoder.java
+++ b/codec-http/src/main/java/io/netty/handler/codec/http/QueryStringEncoder.java
@ -15,14 +15,15 @@
 */
 package io.netty.handler.codec.http;

+import io.netty.buffer.ByteBufUtil;
+import io.netty.util.CharsetUtil;
 import io.netty.util.internal.ObjectUtil;
+import io.netty.util.internal.StringUtil;

-import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URLEncoder;
 import java.nio.charset.Charset;
-import java.nio.charset.UnsupportedCharsetException;

 /**
 * Creates an URL-encoded URI from a path string and key-value parameter pairs.
@ -33,13 +34,16 @@ import java.nio.charset.UnsupportedCharsetException;
 * encoder.addParam("recipient", "world");
 * assert encoder.toString().equals("/hello?recipient=world");
 * </pre>
+ *
 * @see QueryStringDecoder
 */
 public class QueryStringEncoder {

-    private final String charsetName;
+    private final Charset charset;
    private final StringBuilder uriBuilder;
    private boolean hasParams;
+    private static final byte WRITE_UTF_UNKNOWN = (byte) '?';
+    private static final char[] CHAR_MAP = "0123456789ABCDEF".toCharArray();

    /**
     * Creates a new encoder that encodes a URI that starts with the specified
@ -54,8 +58,9 @@ public class QueryStringEncoder {
     * path string in the specified charset.
     */
    public QueryStringEncoder(String uri, Charset charset) {
+        ObjectUtil.checkNotNull(charset, "charset");
        uriBuilder = new StringBuilder(uri);
-        charsetName = charset.name();
+        this.charset = CharsetUtil.UTF_8.equals(charset) ? null : charset;
    }

    /**
@ -69,10 +74,19 @@ public class QueryStringEncoder {
            uriBuilder.append('?');
            hasParams = true;
        }
-        appendComponent(name, charsetName, uriBuilder);
+
+        encodeComponent(name);
        if (value != null) {
            uriBuilder.append('=');
-            appendComponent(value, charsetName, uriBuilder);
+            encodeComponent(value);
+        }
+    }
+
+    private void encodeComponent(CharSequence s) {
+        if (charset == null) {
+            encodeUtf8Component(s);
+        } else {
+            encodeNonUtf8Component(s);
        }
    }

@ -95,28 +109,123 @@ public class QueryStringEncoder {
        return uriBuilder.toString();
    }

-    private static void appendComponent(String s, String charset, StringBuilder sb) {
-        try {
-            s = URLEncoder.encode(s, charset);
-        } catch (UnsupportedEncodingException ignored) {
-            throw new UnsupportedCharsetException(charset);
+    /**
+     * Encode the String as per RFC 3986, Section 2.
+     * <p>
+     * This differs slightly from the JDK's encode method, {@link URLEncoder#encode(String, String)}:
+     * the JDK's encoder encodes a space as {@code +}, whereas this method encodes it directly as {@code %20}.
+     * Beyond that, this method reuses the {@link #uriBuilder} of this class rather than creating a new one,
+     * and thus generates less garbage for the GC.
+     *
+     * @param s The String to encode
+     */
+    private void encodeNonUtf8Component(CharSequence s) {
+        //Don't allocate memory until needed
+        char[] buf = null;
+
+        for (int i = 0, len = s.length(); i < len;) {
+            char c = s.charAt(i);
+            if (dontNeedEncoding(c)) {
+                uriBuilder.append(c);
+                i++;
+            } else {
+                int index = 0;
+                if (buf == null) {
+                    buf = new char[s.length() - i];
                }
-        // replace all '+' with "%20"
-        int idx = s.indexOf('+');
-        if (idx == -1) {
-            sb.append(s);
+
+                do {
+                    buf[index] = c;
+                    index++;
+                    i++;
+                } while (i < s.length() && !dontNeedEncoding(c = s.charAt(i)));
+
+                byte[] bytes = new String(buf, 0, index).getBytes(charset);
+
+                for (byte b : bytes) {
+                    appendEncoded(b);
+                }
+            }
+        }
+    }
+
+    /**
+     * @see ByteBufUtil#writeUtf8(io.netty.buffer.ByteBuf, CharSequence, int, int)
+     */
+    private void encodeUtf8Component(CharSequence s) {
+        for (int i = 0, len = s.length(); i < len; i++) {
+            char c = s.charAt(i);
+            if (c < 0x80) {
+                if (dontNeedEncoding(c)) {
+                    uriBuilder.append(c);
+                } else {
+                    appendEncoded(c);
+                }
+            } else if (c < 0x800) {
+                appendEncoded(0xc0 | (c >> 6));
+                appendEncoded(0x80 | (c & 0x3f));
+            } else if (StringUtil.isSurrogate(c)) {
+                if (!Character.isHighSurrogate(c)) {
+                    appendEncoded(WRITE_UTF_UNKNOWN);
+                    continue;
+                }
+                // Surrogate Pair consumes 2 characters.
+                if (++i == s.length()) {
+                    appendEncoded(WRITE_UTF_UNKNOWN);
+                    break;
+                }
+                // Extra method to allow inlining the rest of writeUtf8 which is the most likely code path.
+                writeUtf8Surrogate(c, s.charAt(i));
+            } else {
+                appendEncoded(0xe0 | (c >> 12));
+                appendEncoded(0x80 | ((c >> 6) & 0x3f));
+                appendEncoded(0x80 | (c & 0x3f));
+            }
+        }
+    }
+
+    private void writeUtf8Surrogate(char c, char c2) {
+        if (!Character.isLowSurrogate(c2)) {
+            appendEncoded(WRITE_UTF_UNKNOWN);
+            appendEncoded(Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2);
            return;
        }
-        sb.append(s, 0, idx).append("%20");
-        int size = s.length();
-        idx++;
-        for (; idx < size; idx++) {
-            char c = s.charAt(idx);
-            if (c != '+') {
-                sb.append(c);
-            } else {
-                sb.append("%20");
+        int codePoint = Character.toCodePoint(c, c2);
+        // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
+        appendEncoded(0xf0 | (codePoint >> 18));
+        appendEncoded(0x80 | ((codePoint >> 12) & 0x3f));
+        appendEncoded(0x80 | ((codePoint >> 6) & 0x3f));
+        appendEncoded(0x80 | (codePoint & 0x3f));
    }
+
+    private void appendEncoded(int b) {
+        uriBuilder.append('%').append(forDigit(b >> 4)).append(forDigit(b));
    }
+
+    /**
+     * Converts the low 4 bits of the given digit to an upper-case hexadecimal char.
+     *
+     * @param digit the number to convert to a character.
+     * @return the {@code char} representation of the specified digit
+     * in hexadecimal.
+     */
+    private static char forDigit(int digit) {
+        return CHAR_MAP[digit & 0xF];
+    }
+
+    /**
+     * Determines whether the given character is an unreserved character.
+     * <p>
+     * Per RFC 3986, unreserved characters do not need to be encoded; they are uppercase and lowercase letters,
+     * decimal digits, hyphen, period, underscore and tilde. This method accepts {@code *} but not {@code ~}.
+     * <p>
+     * unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+     *
+     * @param ch the char to check
+     * @return {@code true} if the char can be appended as-is, {@code false} if it must be percent-encoded
+     */
+    private static boolean dontNeedEncoding(char ch) {
+        return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch >= '0' && ch <= '9'
+                || ch == '-' || ch == '_' || ch == '.' || ch == '*';
    }
 }