Use Two way algorithm to optimize ByteBufUtil.indexOf() method (#11367)

Use the Two-Way algorithm to optimize the ByteBufUtil.indexOf() method. Motivation: ByteBufUtil.indexOf() can be inefficient for substring search on a ByteBuf because its time complexity is O(needle.readableBytes() * haystack.readableBytes()); the Two-Way string-matching algorithm can be used to optimize it. Modification: Use the Two-Way algorithm to implement ByteBufUtil.indexOf(). Result: ByteBufUtil.indexOf() performs better than the original implementation.
2021-06-28 17:07:17 +08:00 · 2021-06-28 17:07:17 +08:00 · d99a8f75b4
commit d99a8f75b4
parent 6c618e30af
2 changed files with 141 additions and 7 deletions
--- a/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
+++ b/buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
@ -227,20 +227,125 @@ public final class ByteBufUtil {

    /**
     * Returns the reader index of needle in haystack, or -1 if needle is not in haystack.
+     * This method uses the <a href="https://en.wikipedia.org/wiki/Two-way_string-matching_algorithm">Two-Way
+     * string matching algorithm</a>, which yields O(1) space complexity and excellent performance.
     */
    public static int indexOf(ByteBuf needle, ByteBuf haystack) {
-        // TODO: maybe use Boyer Moore for efficiency.
-        int attempts = haystack.readableBytes() - needle.readableBytes() + 1;
-        for (int i = 0; i < attempts; i++) {
-            if (equals(needle, needle.readerIndex(),
-                       haystack, haystack.readerIndex() + i,
-                       needle.readableBytes())) {
-                return haystack.readerIndex() + i;
+        if (haystack == null || needle == null) {
+            return -1;
+        }
+
+        if (needle.readableBytes() > haystack.readableBytes()) {
+            return -1;
+        }
+
+        int n = haystack.readableBytes();
+        int m = needle.readableBytes();
+        if (m == 0) {
+            return 0;
+        }
+
+        // When the needle has only one byte that can be read,
+        // the firstIndexOf method needs to be called
+        if (m == 1) {
+            return firstIndexOf((AbstractByteBuf) haystack, haystack.readerIndex(),
+                    haystack.writerIndex(), needle.getByte(needle.readerIndex()));
+        }
+
+        int i;
+        int j = 0;
+        int aStartIndex = needle.readerIndex();
+        int bStartIndex = haystack.readerIndex();
+        long suffixes =  maxSuf(needle, m, aStartIndex, true);
+        long prefixes = maxSuf(needle, m, aStartIndex, false);
+        int ell = Math.max((int) (suffixes >> 32), (int) (prefixes >> 32));
+        int per = Math.max((int) suffixes, (int) prefixes);
+        int memory;
+        int length = Math.min(m - per, ell + 1);
+
+        if (equals(needle, aStartIndex, needle, aStartIndex + per,  length)) {
+            memory = -1;
+            while (j <= n - m) {
+                i = Math.max(ell, memory) + 1;
+                while (i < m && needle.getByte(i + aStartIndex) == haystack.getByte(i + j + bStartIndex)) {
+                    ++i;
+                }
+                if (i > n) {
+                    return -1;
+                }
+                if (i >= m) {
+                    i = ell;
+                    while (i > memory && needle.getByte(i + aStartIndex) == haystack.getByte(i + j + bStartIndex)) {
+                        --i;
+                    }
+                    if (i <= memory) {
+                        return j;
+                    }
+                    j += per;
+                    memory = m - per - 1;
+                } else {
+                    j += i - ell;
+                    memory = -1;
+                }
+            }
+        } else {
+            per = Math.max(ell + 1, m - ell - 1) + 1;
+            while (j <= n - m) {
+                i = ell + 1;
+                while (i < m && needle.getByte(i + aStartIndex) == haystack.getByte(i + j + bStartIndex)) {
+                    ++i;
+                }
+                if (i > n) {
+                    return -1;
+                }
+                if (i >= m) {
+                    i = ell;
+                    while (i >= 0 && needle.getByte(i + aStartIndex) == haystack.getByte(i + j + bStartIndex)) {
+                        --i;
+                    }
+                    if (i < 0) {
+                        return j;
+                    }
+                    j += per;
+                } else {
+                    j += i - ell;
+                }
            }
        }
        return -1;
    }

+    /**
+     * Preprocessing step of the Two-Way algorithm: computes the maximal suffix of the {@code m} bytes
+     * of {@code x} that begin at absolute index {@code start}, together with the period of that suffix.
+     * Bytes are compared as signed values.
+     *
+     * @param x        the needle buffer
+     * @param m        the number of bytes of {@code x} to consider
+     * @param start    the absolute index in {@code x} of the first byte to consider
+     * @param isSuffix {@code true} to order bytes with {@code <}, {@code false} to use the reversed
+     *                 order ({@code >})
+     * @return a packed value: the upper 32 bits hold the offset, relative to {@code start}, of the last
+     *         byte before the maximal suffix ({@code -1} when the suffix is the whole region); the
+     *         lower 32 bits hold the period
+     */
+    private static long maxSuf(ByteBuf x, int m, int start, boolean isSuffix) {
+        int p = 1;
+        int ms = -1;
+        // j, k and ms are offsets relative to start, so that they can be compared against m and the
+        // result is independent of the reader index; start is added only when a byte is fetched.
+        int j = 0;
+        int k = 1;
+        while (j + k < m) {
+            byte a = x.getByte(start + j + k);
+            byte b = x.getByte(start + ms + k);
+            boolean suffix = isSuffix ? a < b : a > b;
+            if (suffix) {
+                // The candidate at ms still wins; everything scanned so far becomes one period.
+                j += k;
+                k = 1;
+                p = j - ms;
+            } else if (a == b) {
+                // Still inside a repetition of the current period.
+                if (k != p) {
+                    ++k;
+                } else {
+                    j += p;
+                    k = 1;
+                }
+            } else {
+                // A larger suffix starts after j: restart from there.
+                ms = j;
+                j = ms + 1;
+                k = p = 1;
+            }
+        }
+        return ((long) ms << 32) + p;
+    }
+
    /**
     * Returns {@code true} if and only if the two specified buffers are
     * identical to each other for {@code length} bytes starting at {@code aStartIndex}
--- a/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
+++ b/buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
@ -118,6 +118,35 @@ public class ByteBufUtilTest {
        });
    }

+    /**
+     * Checks {@link ByteBufUtil#indexOf(ByteBuf, ByteBuf)} for single-byte and multi-byte needles,
+     * for needles that are absent, and for buffers whose reader index is not zero.
+     */
+    @Test
+    public void testIndexOf() {
+        ByteBuf haystack = Unpooled.copiedBuffer("abc123", CharsetUtil.UTF_8);
+        assertEquals(0, ByteBufUtil.indexOf(Unpooled.copiedBuffer("a", CharsetUtil.UTF_8), haystack));
+        assertEquals(1, ByteBufUtil.indexOf(Unpooled.copiedBuffer("bc".getBytes(CharsetUtil.UTF_8)), haystack));
+        assertEquals(2, ByteBufUtil.indexOf(Unpooled.copiedBuffer("c".getBytes(CharsetUtil.UTF_8)), haystack));
+        assertEquals(0, ByteBufUtil.indexOf(Unpooled.copiedBuffer("abc12".getBytes(CharsetUtil.UTF_8)), haystack));
+        assertEquals(-1, ByteBufUtil.indexOf(Unpooled.copiedBuffer("abcdef".getBytes(CharsetUtil.UTF_8)), haystack));
+        assertEquals(-1, ByteBufUtil.indexOf(Unpooled.copiedBuffer("abc12x".getBytes(CharsetUtil.UTF_8)), haystack));
+        assertEquals(-1, ByteBufUtil.indexOf(Unpooled.copiedBuffer("abc123def".getBytes(CharsetUtil.UTF_8)), haystack));
+
+        // indexOf is documented to return the reader index of the needle in the haystack, i.e. an
+        // absolute index: a non-zero haystack reader index is therefore part of the expected value.
+        final ByteBuf needle = Unpooled.copiedBuffer("abc12", CharsetUtil.UTF_8);
+        // "bc12" in "bc123" (haystack readable bytes start at index 1)
+        haystack.readerIndex(1);
+        needle.readerIndex(1);
+        assertEquals(1, ByteBufUtil.indexOf(needle, haystack));
+        // "12" in "c123" (haystack readable bytes start at index 2)
+        haystack.readerIndex(2);
+        needle.readerIndex(3);
+        assertEquals(3, ByteBufUtil.indexOf(needle, haystack));
+        // "c12" in "bc123" (haystack readable bytes start at index 1)
+        haystack.readerIndex(1);
+        needle.readerIndex(2);
+        assertEquals(2, ByteBufUtil.indexOf(needle, haystack));
+        haystack.release();
+
+        haystack = Unpooled.copiedBuffer("123aab123", CharsetUtil.UTF_8);
+        assertEquals(3, ByteBufUtil.indexOf(Unpooled.copiedBuffer("aab", CharsetUtil.UTF_8), haystack));
+        haystack.release();
+        needle.release();
+    }
+
    @Test
    public void equalsBufferSubsections() {
        byte[] b1 = new byte[128];