diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc index d346c2612..79081298d 100644 --- a/util/crc32c_arm64.cc +++ b/util/crc32c_arm64.cc @@ -64,7 +64,10 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, */ uint32_t k0 = 0xe417f38a, k1 = 0x8f158014; - /* First 8 bytei for better pipelining */ + /* Prefetch 1 KiB (16 x 64 bytes) starting 1024 bytes past buf64 into L1 so the following block avoids cache misses */ + PREF1KL1((uint8_t *)buf64, 1024); + + /* Fold the first 8 bytes into the CRC up front for better pipelining */ crc0 = crc32c_u64(crc, *buf64++); /* 3 blocks crc32c parallel computation diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h index fb727ce40..2594f2470 100644 --- a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -17,6 +17,17 @@ #define crc32c_u16(crc, v) __crc32ch(crc, v) #define crc32c_u32(crc, v) __crc32cw(crc, v) #define crc32c_u64(crc, v) __crc32cd(crc, v) +#define PREF4X64L1(buffer,PREF_OFFSET, ITR) \ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64)); + +#define PREF1KL1(buffer,PREF_OFFSET) \ + PREF4X64L1(buffer,(PREF_OFFSET), 0) \ + PREF4X64L1(buffer,(PREF_OFFSET), 4) \ + PREF4X64L1(buffer,(PREF_OFFSET), 8) \ + PREF4X64L1(buffer,(PREF_OFFSET), 12) extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len); extern uint32_t crc32c_runtime_check(void);