SSE4 optimization
Summary: This speeds up CRC32C computation significantly on hardware that supports the SSE4.2 CRC32 instruction. It is enabled by compiling with -msse4. Note: a binary built this way won't be usable on older CPUs that don't support that instruction. | Test Plan: crc32c_test | Reviewers: dhruba | Reviewed By: dhruba | Differential Revision: https://reviews.facebook.net/D3201
This commit is contained in:
parent
8d41351666
commit
95af128225
@ -1,3 +1,10 @@
|
|||||||
* How to compile using fbcode and jemalloc
|
* How to compile using fbcode and jemalloc
|
||||||
source fbcode.sh
|
source fbcode.sh
|
||||||
make
|
make
|
||||||
|
|
||||||
|
* Compiling for CPUs with SSE4 support
|
||||||
|
|
||||||
|
make OPT='-O2 -DNDEBUG -msse4'
|
||||||
|
|
||||||
|
This makes CRC computation much faster, but
|
||||||
|
binaries won't run on CPUs that don't support it.
|
||||||
|
@ -7,6 +7,9 @@
|
|||||||
|
|
||||||
#include "util/crc32c.h"
|
#include "util/crc32c.h"
|
||||||
|
|
||||||
|
#ifdef __SSE4_2__
|
||||||
|
#include <nmmintrin.h>
|
||||||
|
#endif
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
|
||||||
@ -283,6 +286,10 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
|
|||||||
return DecodeFixed32(reinterpret_cast<const char*>(p));
|
return DecodeFixed32(reinterpret_cast<const char*>(p));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline uint64_t LE_LOAD64(const uint8_t *p) {
|
||||||
|
return DecodeFixed64(reinterpret_cast<const char*>(p));
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
|
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
|
||||||
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
|
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
|
||||||
const uint8_t *e = p + size;
|
const uint8_t *e = p + size;
|
||||||
@ -303,7 +310,12 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
|
|||||||
table1_[(c >> 16) & 0xff] ^ \
|
table1_[(c >> 16) & 0xff] ^ \
|
||||||
table0_[c >> 24]; \
|
table0_[c >> 24]; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#ifdef __SSE4_2__
|
||||||
|
#define STEP8 do { l = _mm_crc32_u64(l, LE_LOAD64(p)); p += 8; } while(0)
|
||||||
|
#else
|
||||||
#define STEP8 do { STEP4; STEP4; } while(0)
|
#define STEP8 do { STEP4; STEP4; } while(0)
|
||||||
|
#endif
|
||||||
|
|
||||||
// Point x at first 16-byte aligned byte in string. This might be
|
// Point x at first 16-byte aligned byte in string. This might be
|
||||||
// just past the end of the string.
|
// just past the end of the string.
|
||||||
|
Loading…
Reference in New Issue
Block a user