SSE4 optimization
Summary: This speeds up CRC computation significantly on hardware that supports SSE4.2. It is enabled by compiling with -msse4. Note: the resulting binary won't be usable on older CPUs that don't support the SSE4.2 CRC32 instruction. Test Plan: crc32c_test Reviewers: dhruba Reviewed By: dhruba Differential Revision: https://reviews.facebook.net/D3201
This commit is contained in:
parent
8d41351666
commit
95af128225
@ -1,3 +1,10 @@
|
||||
* How to compile using fbcode and jemalloc
|
||||
source fbcode.sh
|
||||
make
|
||||
|
||||
* Compiling for CPUs with SSE4 support
|
||||
|
||||
make OPT='-O2 -DNDEBUG -msse4'
|
||||
|
||||
This makes CRC computation much faster, but
|
||||
binaries won't run on CPUs that don't support SSE4.
|
||||
|
@ -7,6 +7,9 @@
|
||||
|
||||
#include "util/crc32c.h"
|
||||
|
||||
#ifdef __SSE4_2__
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#include "util/coding.h"
|
||||
|
||||
@ -283,6 +286,10 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
|
||||
return DecodeFixed32(reinterpret_cast<const char*>(p));
|
||||
}
|
||||
|
||||
// Reads a 64-bit value starting at p by delegating to DecodeFixed64, after
// reinterpreting the byte pointer as const char*.
// NOTE(review): DecodeFixed64 is not defined in this excerpt (presumably it
// comes from util/coding.h and decodes little-endian, as the LE_ prefix
// suggests) -- confirm byte order and alignment requirements there.
static inline uint64_t LE_LOAD64(const uint8_t *p) {
|
||||
return DecodeFixed64(reinterpret_cast<const char*>(p));
|
||||
}
|
||||
|
||||
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
|
||||
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
|
||||
const uint8_t *e = p + size;
|
||||
@ -303,7 +310,12 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
|
||||
table1_[(c >> 16) & 0xff] ^ \
|
||||
table0_[c >> 24]; \
|
||||
} while (0)
|
||||
|
||||
// STEP8 consumes 8 bytes at p. When the compiler defines __SSE4_2__, it folds
// the 64-bit value loaded by LE_LOAD64(p) into l with one _mm_crc32_u64 call
// and advances p by 8; otherwise it expands to two consecutive STEP4s.
// NOTE(review): l and p are not declared in this excerpt; the macro relies on
// them being in scope at the expansion site. _mm_crc32_u64 takes and returns
// 64-bit values, so l is presumably 64 bits wide here -- confirm.
#ifdef __SSE4_2__
|
||||
#define STEP8 do { l = _mm_crc32_u64(l, LE_LOAD64(p)); p += 8; } while(0)
|
||||
#else
|
||||
#define STEP8 do { STEP4; STEP4; } while(0)
|
||||
#endif
|
||||
|
||||
// Point x at first 16-byte aligned byte in string. This might be
|
||||
// just past the end of the string.
|
||||
|
Loading…
Reference in New Issue
Block a user