Arm64 CRC32 parallel computation optimization for RocksDB (#5494)
Summary: Crc32c Parallel computation optimization: Algorithm comes from Intel whitepaper: [crc-iscsi-polynomial-crc32-instruction-paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf) Input data is divided into three equal-sized blocks Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes 1. crc32c_test: ``` [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. ``` 2. RocksDB benchmark: db_bench --benchmarks="crc32c" ``` Linear Arm crc32c: crc32c: 1.005 micros/op 995133 ops/sec; 3887.2 MB/s (4096 per op) ``` ``` Parallel optimization with Armv8 crypto extension: crc32c: 0.419 micros/op 2385078 ops/sec; 9316.7 MB/s (4096 per op) ``` It gets ~2.4x speedup compared to linear Arm crc32c instructions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5494 Differential Revision: D16340806 fbshipit-source-id: 95dae9a5b646fd20a8303671d82f17b2e162e945
This commit is contained in:
parent
74fb7f0ba5
commit
a3c1832e86
4
Makefile
4
Makefile
@ -144,8 +144,8 @@ HAVE_POWER8=1
|
||||
endif
|
||||
|
||||
ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
|
||||
CXXFLAGS += -march=armv8-a+crc
|
||||
CFLAGS += -march=armv8-a+crc
|
||||
CXXFLAGS += -march=armv8-a+crc+crypto
|
||||
CFLAGS += -march=armv8-a+crc+crypto
|
||||
ARMCRC_SOURCE=1
|
||||
endif
|
||||
|
||||
|
@ -19,35 +19,82 @@ uint32_t crc32c_runtime_check(void) {
|
||||
|
||||
uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
|
||||
unsigned len) {
|
||||
const uint8_t *buf1;
|
||||
const uint16_t *buf2;
|
||||
const uint32_t *buf4;
|
||||
const uint64_t *buf8;
|
||||
|
||||
int64_t length = (int64_t)len;
|
||||
|
||||
const uint8_t *buf8;
|
||||
const uint64_t *buf64 = (uint64_t *)data;
|
||||
int length = (int)len;
|
||||
crc ^= 0xffffffff;
|
||||
buf8 = (const uint64_t *)data;
|
||||
while ((length -= sizeof(uint64_t)) >= 0) {
|
||||
crc = __crc32cd(crc, *buf8++);
|
||||
|
||||
#ifdef HAVE_ARM64_CRYPTO
|
||||
/* Crc32c Parallel computation
|
||||
* Algorithm comes from Intel whitepaper:
|
||||
* crc-iscsi-polynomial-crc32-instruction-paper
|
||||
*
|
||||
* Input data is divided into three equal-sized blocks
|
||||
* Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes
|
||||
* One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
|
||||
*/
|
||||
#define BLK_LENGTH 42
|
||||
while (length >= 1024) {
|
||||
uint64_t t0, t1;
|
||||
uint32_t crc0 = 0, crc1 = 0, crc2 = 0;
|
||||
|
||||
/* Parallel Param:
|
||||
* k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
|
||||
* k1 = CRC32(x ^ (42 * 8 * 8 - 1));
|
||||
*/
|
||||
uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
|
||||
|
||||
/* First 8 bytei for better pipelining */
|
||||
crc0 = crc32c_u64(crc, *buf64++);
|
||||
|
||||
/* 3 blocks crc32c parallel computation
|
||||
*
|
||||
* 42 * 8 * 3 = 1008 (bytes)
|
||||
*/
|
||||
for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
|
||||
crc0 = crc32c_u64(crc0, *buf64);
|
||||
crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
|
||||
crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
|
||||
}
|
||||
buf64 += (BLK_LENGTH * 2);
|
||||
|
||||
/* Last 8 bytes */
|
||||
crc = crc32c_u64(crc2, *buf64++);
|
||||
|
||||
t0 = (uint64_t)vmull_p64(crc0, k0);
|
||||
t1 = (uint64_t)vmull_p64(crc1, k1);
|
||||
|
||||
/* Merge (crc0, crc1, crc2) -> crc */
|
||||
crc1 = crc32c_u64(0, t1);
|
||||
crc ^= crc1;
|
||||
crc0 = crc32c_u64(0, t0);
|
||||
crc ^= crc0;
|
||||
|
||||
length -= 1024;
|
||||
}
|
||||
#endif
|
||||
buf8 = (const uint8_t *)buf64;
|
||||
while (length >= 8) {
|
||||
crc = crc32c_u64(crc, *(const uint64_t*)buf8);
|
||||
buf8 += 8;
|
||||
length -= 8;
|
||||
}
|
||||
|
||||
/* The following is more efficient than the straight loop */
|
||||
buf4 = (const uint32_t *)buf8;
|
||||
if (length & sizeof(uint32_t)) {
|
||||
crc = __crc32cw(crc, *buf4++);
|
||||
if (length >= 4) {
|
||||
crc = crc32c_u32(crc, *(const uint32_t*)buf8);
|
||||
buf8 += 4;
|
||||
length -= 4;
|
||||
}
|
||||
|
||||
buf2 = (const uint16_t *)buf4;
|
||||
if (length & sizeof(uint16_t)) {
|
||||
crc = __crc32ch(crc, *buf2++);
|
||||
if (length >= 2) {
|
||||
crc = crc32c_u16(crc, *(const uint16_t*)buf8);
|
||||
buf8 += 2;
|
||||
length -= 2;
|
||||
}
|
||||
|
||||
buf1 = (const uint8_t *)buf2;
|
||||
if (length & sizeof(uint8_t))
|
||||
crc = __crc32cb(crc, *buf1);
|
||||
if (length >= 1)
|
||||
crc = crc32c_u8(crc, *buf8);
|
||||
|
||||
crc ^= 0xffffffff;
|
||||
return crc;
|
||||
|
@ -9,13 +9,24 @@
|
||||
#include <cinttypes>
|
||||
|
||||
#if defined(__aarch64__) || defined(__AARCH64__)
|
||||
|
||||
#ifdef __ARM_FEATURE_CRC32
|
||||
#define HAVE_ARM64_CRC
|
||||
#include <arm_acle.h>
|
||||
#define crc32c_u8(crc, v) __crc32cb(crc, v)
|
||||
#define crc32c_u16(crc, v) __crc32ch(crc, v)
|
||||
#define crc32c_u32(crc, v) __crc32cw(crc, v)
|
||||
#define crc32c_u64(crc, v) __crc32cd(crc, v)
|
||||
|
||||
extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len);
|
||||
extern uint32_t crc32c_runtime_check(void);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_CRYPTO
|
||||
#define HAVE_ARM64_CRYPTO
|
||||
#include <arm_neon.h>
|
||||
#endif // __ARM_FEATURE_CRYPTO
|
||||
#endif // __ARM_FEATURE_CRC32
|
||||
|
||||
#endif // defined(__aarch64__) || defined(__AARCH64__)
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user