Fix LZ4 encoder/decoder performance with (default) xxHash32 (#9249)

Motivation:

Lz4FrameEncoder and Lz4FrameDecoder in their default configuration use
an extremely inefficient way to checksum direct byte buffers. In
particular, for every byte checksummed, a single-element byte array is
allocated and a JNI call is made, which in some internal testing makes
a 25x difference in total throughput and allocates *a lot* of garbage.
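
(For illustration only, not part of this commit: the slow path described
above boils down to roughly the following, where directBuf, idx, len and
seed are placeholder names.)

    // Checksum adapter returned by StreamingXXHash32#asChecksum():
    // its update(int) allocates a fresh byte[1] and crosses JNI on every call.
    Checksum checksum = XXHashFactory.fastestInstance().newStreamingHash32(seed).asChecksum();
    for (int i = idx; i < idx + len; i++) {
        checksum.update(directBuf.getByte(i)); // one allocation + one JNI call per byte
    }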

Modifications:

Lz4XXHash32, an implementation of ByteBufChecksum specifically for use
by Lz4FrameEncoder and Lz4FrameDecoder, is introduced. It utilises the
xxHash32 block API, which provides a hash() method that accepts a
ByteBuffer as an argument. Lz4FrameEncoder and Lz4FrameDecoder are
modified to use this implementation by default.
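
(Illustration only, not from the diff: the block-API call that Lz4XXHash32
is built around looks roughly like this; nioBuffer and seed are placeholders.)

    XXHash32 xxHash32 = XXHashFactory.fastestInstance().hash32();
    // hashes the buffer's readable bytes in a single call,
    // with no per-byte allocation or JNI crossing
    int hash = xxHash32.hash(nioBuffer, seed);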

Result:

Lz4FrameEncoder and Lz4FrameDecoder perform well again when operating
on direct byte buffers with the default checksum configuration; a public
implementation is provided for those who need to override the seed.
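
(Illustrative usage only, not part of the commit: overriding the seed via
the existing constructors that accept a Checksum. The seed value and the
64 KiB block size are placeholders; encoder and decoder must of course
agree on the seed.)

    int seed = 0x12345678; // placeholder custom seed
    Lz4FrameEncoder encoder = new Lz4FrameEncoder(
            LZ4Factory.fastestInstance(), false, 1 << 16, new Lz4XXHash32(seed));
    Lz4FrameDecoder decoder = new Lz4FrameDecoder(
            LZ4Factory.fastestInstance(), new Lz4XXHash32(seed));
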
Authored by Aleksey Yeschenko on 2019-06-18 08:29:25 +01:00; committed by Norman Maurer
parent a2583d0d3c
commit 93414db1f3
5 changed files with 123 additions and 10 deletions

io/netty/handler/codec/compression/ByteBufChecksum.java

@@ -66,6 +66,9 @@ abstract class ByteBufChecksum implements Checksum {
     static ByteBufChecksum wrapChecksum(Checksum checksum) {
         ObjectUtil.checkNotNull(checksum, "checksum");
+        if (checksum instanceof ByteBufChecksum) {
+            return (ByteBufChecksum) checksum;
+        }
         if (checksum instanceof Adler32 && ADLER32_UPDATE_METHOD != null) {
             return new ReflectiveByteBufChecksum(checksum, ADLER32_UPDATE_METHOD);
         }

io/netty/handler/codec/compression/Lz4FrameDecoder.java

@@ -21,7 +21,6 @@ import io.netty.handler.codec.ByteToMessageDecoder;
 import net.jpountz.lz4.LZ4Exception;
 import net.jpountz.lz4.LZ4Factory;
 import net.jpountz.lz4.LZ4FastDecompressor;
-import net.jpountz.xxhash.XXHashFactory;

 import java.util.List;
 import java.util.zip.Checksum;
@@ -124,9 +123,7 @@ public class Lz4FrameDecoder extends ByteToMessageDecoder {
      * <a href="https://github.com/Cyan4973/xxHash">Github</a>.
      */
     public Lz4FrameDecoder(LZ4Factory factory, boolean validateChecksums) {
-        this(factory, validateChecksums ?
-                XXHashFactory.fastestInstance().newStreamingHash32(DEFAULT_SEED).asChecksum()
-                : null);
+        this(factory, validateChecksums ? new Lz4XXHash32(DEFAULT_SEED) : null);
     }

     /**

io/netty/handler/codec/compression/Lz4FrameEncoder.java

@@ -31,7 +31,6 @@ import io.netty.util.internal.ObjectUtil;
 import net.jpountz.lz4.LZ4Compressor;
 import net.jpountz.lz4.LZ4Exception;
 import net.jpountz.lz4.LZ4Factory;
-import net.jpountz.xxhash.XXHashFactory;

 import java.nio.ByteBuffer;
 import java.util.concurrent.TimeUnit;
@@ -125,8 +124,7 @@ public class Lz4FrameEncoder extends MessageToByteEncoder<ByteBuf> {
      * and is slower but compresses more efficiently
      */
     public Lz4FrameEncoder(boolean highCompressor) {
-        this(LZ4Factory.fastestInstance(), highCompressor, DEFAULT_BLOCK_SIZE,
-                XXHashFactory.fastestInstance().newStreamingHash32(DEFAULT_SEED).asChecksum());
+        this(LZ4Factory.fastestInstance(), highCompressor, DEFAULT_BLOCK_SIZE, new Lz4XXHash32(DEFAULT_SEED));
     }

     /**

io/netty/handler/codec/compression/Lz4XXHash32.java (new file)

@@ -0,0 +1,107 @@
/*
* Copyright 2019 The Netty Project
*
* The Netty Project licenses this file to you under the Apache License,
* version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package io.netty.handler.codec.compression;

import io.netty.buffer.ByteBuf;
import net.jpountz.xxhash.StreamingXXHash32;
import net.jpountz.xxhash.XXHash32;
import net.jpountz.xxhash.XXHashFactory;

import java.nio.ByteBuffer;
import java.util.zip.Checksum;

/**
* A special-purpose {@link ByteBufChecksum} implementation for use with
* {@link Lz4FrameEncoder} and {@link Lz4FrameDecoder}.
*
* {@link StreamingXXHash32#asChecksum()} has a particularly nasty implementation
* of {@link Checksum#update(int)} that allocates a single-element byte array for
* every invocation.
*
* In addition to that, it doesn't implement an overload that accepts a {@link ByteBuffer}
* as an argument.
*
* Combined, this means that we can't use {@code ReflectiveByteBufChecksum} at all,
* and can't use {@code SlowByteBufChecksum} because of its atrocious performance
* with direct byte buffers (allocating an array and making a JNI call for every byte
* checksummed might be considered sub-optimal by some).
*
* Block version of xxHash32 ({@link XXHash32}), however, does provide
* {@link XXHash32#hash(ByteBuffer, int)} method that is efficient and does exactly
* what we need, with a caveat that we can only invoke it once before having to reset.
* This, however, is fine for our purposes, given the way we use it in
* {@link Lz4FrameEncoder} and {@link Lz4FrameDecoder}:
* {@code reset()}, followed by one {@code update()}, followed by {@code getValue()}.
*/
public final class Lz4XXHash32 extends ByteBufChecksum {

    private static final XXHash32 XXHASH32 = XXHashFactory.fastestInstance().hash32();

    private final int seed;

    private boolean used;
    private int value;

    @SuppressWarnings("WeakerAccess")
    public Lz4XXHash32(int seed) {
        this.seed = seed;
    }

    @Override
    public void update(int b) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void update(byte[] b, int off, int len) {
        if (used) {
            throw new IllegalStateException();
        }
        value = XXHASH32.hash(b, off, len, seed);
        used = true;
    }

    @Override
    public void update(ByteBuf b, int off, int len) {
        if (used) {
            throw new IllegalStateException();
        }
        if (b.hasArray()) {
            value = XXHASH32.hash(b.array(), b.arrayOffset() + off, len, seed);
        } else {
            value = XXHASH32.hash(CompressionUtil.safeNioBuffer(b, off, len), seed);
        }
        used = true;
    }

    @Override
    public long getValue() {
        if (!used) {
            throw new IllegalStateException();
        }
        /*
         * If you look carefully, you'll notice that the most significant nibble
         * is being discarded; we believe this to be a bug, but this is what
         * StreamingXXHash32#asChecksum() implementation of getValue() does,
         * so we have to retain this behaviour for compatibility reasons.
         */
        return value & 0xFFFFFFFL;
    }

    @Override
    public void reset() {
        used = false;
    }
}

io/netty/handler/codec/compression/ByteBufChecksumTest.java

@@ -26,6 +26,7 @@ import java.util.zip.Adler32;
 import java.util.zip.CRC32;
 import java.util.zip.Checksum;

+import static io.netty.handler.codec.compression.Lz4Constants.DEFAULT_SEED;
 import static org.junit.Assert.*;

 public class ByteBufChecksumTest {
@@ -51,7 +52,14 @@ public class ByteBufChecksumTest {
     private static void testUpdate(ByteBuf buf) {
         try {
-            testUpdate(xxHash32(), ByteBufChecksum.wrapChecksum(xxHash32()), buf);
+            // all variations of xxHash32: slow and naive, optimised, wrapped optimised;
+            // the last two should be literally identical, but it's best to guard against
+            // an accidental regression in ByteBufChecksum#wrapChecksum(Checksum)
+            testUpdate(xxHash32(DEFAULT_SEED), ByteBufChecksum.wrapChecksum(xxHash32(DEFAULT_SEED)), buf);
+            testUpdate(xxHash32(DEFAULT_SEED), new Lz4XXHash32(DEFAULT_SEED), buf);
+            testUpdate(xxHash32(DEFAULT_SEED), ByteBufChecksum.wrapChecksum(new Lz4XXHash32(DEFAULT_SEED)), buf);
+
+            // CRC32 and Adler32, special-cased to use ReflectiveByteBufChecksum
             testUpdate(new CRC32(), ByteBufChecksum.wrapChecksum(new CRC32()), buf);
             testUpdate(new Adler32(), ByteBufChecksum.wrapChecksum(new Adler32()), buf);
         } finally {
@@ -76,7 +84,7 @@ public class ByteBufChecksumTest {
         assertEquals(checksum.getValue(), wrapped.getValue());
     }

-    private static Checksum xxHash32() {
-        return XXHashFactory.fastestInstance().newStreamingHash32(Lz4Constants.DEFAULT_SEED).asChecksum();
+    private static Checksum xxHash32(int seed) {
+        return XXHashFactory.fastestInstance().newStreamingHash32(seed).asChecksum();
     }
 }