crc32c_arm64 performance optimization (#5675)
Summary: CRC32C parallel-computation optimization: macro unrolling removes the "for" loop, which reduces branch misses on the arm64 microarchitecture. Each 1024-byte chunk is divided into three parts: 8 (head) + 1008 (6 * 7 * 3 * 8) + 8 (tail). The 42 loop iterations are unrolled into 6 CRC32C7X24BYTES invocations, each containing 7 CRC32C24BYTES invocations. Test results: 1, crc32c_test [==========] Running 4 tests from 1 test case. [----------] Global test environment set-up. [----------] 4 tests from CRC [ RUN ] CRC.StandardResults [ OK ] CRC.StandardResults (1 ms) [ RUN ] CRC.Values [ OK ] CRC.Values (0 ms) [ RUN ] CRC.Extend [ OK ] CRC.Extend (0 ms) [ RUN ] CRC.Mask [ OK ] CRC.Mask (0 ms) [----------] 4 tests from CRC (1 ms total) [----------] Global test environment tear-down [==========] 4 tests from 1 test case ran. (1 ms total) [ PASSED ] 4 tests. 2, db_bench --benchmarks="crc32c" crc32c : 0.218 micros/op 4595390 ops/sec; 17950.7 MB/s (4096 per op) 3, repeated the crc32c_test case 60000 times: perf stat -e branch-miss -- ./crc32c_test before optimization: 739,426,504 branch-miss; after optimization: 1,128,572 branch-miss. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5675 Differential Revision: D16989210 fbshipit-source-id: 7204e6069bb6ed066d49c2d1b3ac385065a98557
This commit is contained in:
parent
df8c307d63
commit
26293c89a6
@ -12,6 +12,26 @@
|
||||
#ifndef HWCAP_CRC32
|
||||
#define HWCAP_CRC32 (1 << 7)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ARM64_CRYPTO
|
||||
/*
 * One unrolled step covering 8 * 3 = 24 bytes: feeds the word at index ITR
 * of each of the three parallel blocks (buf64 + ITR, buf64 + BLK_LENGTH + ITR,
 * buf64 + BLK_LENGTH*2 + ITR) into crc0, crc1 and crc2 respectively.
 *
 * The macro refers to crc0, crc1, crc2, buf64 and BLK_LENGTH by name, so they
 * must be visible where it is expanded. It expands to three bare statements,
 * each ending in its own ';', and is not wrapped in do { } while (0), so it is
 * only safe where a statement sequence is valid.
 */
|
||||
#define CRC32C24BYTES(ITR) \
|
||||
crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR)));\
|
||||
crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH*2 + (ITR)));\
|
||||
crc0 = crc32c_u64(crc0, *(buf64 + (ITR)));
|
||||
|
||||
/*
 * Seven consecutive CRC32C24BYTES steps, covering 24 * 7 = 168 bytes: word
 * indices (ITR)*7+0 through (ITR)*7+6 of each of the three parallel blocks.
 *
 * The invocations below carry no trailing ';' because each CRC32C24BYTES
 * expansion already ends in one. The whole body is wrapped in
 * do { } while(0) so that "CRC32C7X24BYTES(n);" behaves as a single statement.
 */
|
||||
#define CRC32C7X24BYTES(ITR) do {\
|
||||
CRC32C24BYTES((ITR)*7+0) \
|
||||
CRC32C24BYTES((ITR)*7+1) \
|
||||
CRC32C24BYTES((ITR)*7+2) \
|
||||
CRC32C24BYTES((ITR)*7+3) \
|
||||
CRC32C24BYTES((ITR)*7+4) \
|
||||
CRC32C24BYTES((ITR)*7+5) \
|
||||
CRC32C24BYTES((ITR)*7+6) \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
uint32_t crc32c_runtime_check(void) {
|
||||
uint64_t auxv = getauxval(AT_HWCAP);
|
||||
return (auxv & HWCAP_CRC32) != 0;
|
||||
@ -48,15 +68,16 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
|
||||
crc0 = crc32c_u64(crc, *buf64++);
|
||||
|
||||
/* 3 blocks crc32c parallel computation
|
||||
*
|
||||
* 42 * 8 * 3 = 1008 (bytes)
|
||||
* Macro unfolding to compute parallelly
|
||||
* 168 * 6 = 1008 (bytes)
|
||||
*/
|
||||
for (int i = 0; i < BLK_LENGTH; i++, buf64++) {
|
||||
crc0 = crc32c_u64(crc0, *buf64);
|
||||
crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH));
|
||||
crc2 = crc32c_u64(crc2, *(buf64 + (BLK_LENGTH * 2)));
|
||||
}
|
||||
buf64 += (BLK_LENGTH * 2);
|
||||
CRC32C7X24BYTES(0);
|
||||
CRC32C7X24BYTES(1);
|
||||
CRC32C7X24BYTES(2);
|
||||
CRC32C7X24BYTES(3);
|
||||
CRC32C7X24BYTES(4);
|
||||
CRC32C7X24BYTES(5);
|
||||
buf64 += (BLK_LENGTH * 3);
|
||||
|
||||
/* Last 8 bytes */
|
||||
crc = crc32c_u64(crc2, *buf64++);
|
||||
@ -72,6 +93,9 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
|
||||
|
||||
length -= 1024;
|
||||
}
|
||||
|
||||
if (length == 0)
|
||||
return crc ^ (0xffffffffU);
|
||||
#endif
|
||||
buf8 = (const uint8_t *)buf64;
|
||||
while (length >= 8) {
|
||||
|
Loading…
Reference in New Issue
Block a user