SSE4 optimization

Summary:
This speeds up CRC32C computation significantly on CPUs that
support the SSE4.2 CRC32 instruction. Enabled by compiling with -msse4.

Note: the binary won't be usable on older CPUs
that don't support the instruction.

Test Plan: crc32c_test

Reviewers: dhruba

Reviewed By: dhruba

Differential Revision: https://reviews.facebook.net/D3201
This commit is contained in:
Arun Sharma 2012-05-04 01:10:17 +00:00
parent 8d41351666
commit 95af128225
2 changed files with 19 additions and 0 deletions

View File

@ -1,3 +1,10 @@
* How to compile using fbcode and jemalloc
source fbcode.sh
make
* Compiling for CPUs with SSE4 support
make OPT='-O2 -DNDEBUG -msse4'
This makes CRC computation much faster, but
binaries won't run on CPUs that don't support it.

View File

@ -7,6 +7,9 @@
#include "util/crc32c.h"
#ifdef __SSE4_2__
#include <nmmintrin.h>
#endif
#include <stdint.h>
#include "util/coding.h"
@ -283,6 +286,10 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
return DecodeFixed32(reinterpret_cast<const char*>(p));
}
// Loads a 64-bit value from the eight bytes starting at p by handing them to
// DecodeFixed64. The byte order is whatever DecodeFixed64 implements
// (presumably little-endian, going by the LE_ prefix -- confirm in
// util/coding.h).
static inline uint64_t LE_LOAD64(const uint8_t *p) {
const char *bytes = reinterpret_cast<const char*>(p);
return DecodeFixed64(bytes);
}
uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
const uint8_t *e = p + size;
@ -303,7 +310,12 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
table1_[(c >> 16) & 0xff] ^ \
table0_[c >> 24]; \
} while (0)
// STEP8 folds the next eight input bytes into l and advances p by 8.
// When the compiler targets SSE4.2 (__SSE4_2__ is defined) the hardware CRC32
// instruction replaces the two table-driven STEP4 rounds.
// _mm_crc32_u64 is only provided when targeting x86-64, so a 32-bit build
// with SSE4.2 enabled consumes the same eight bytes as two consecutive
// 32-bit steps with _mm_crc32_u32 instead of failing to compile.
#if defined(__SSE4_2__) && defined(__x86_64__)
#define STEP8 do { l = _mm_crc32_u64(l, LE_LOAD64(p)); p += 8; } while(0)
#elif defined(__SSE4_2__)
#define STEP8 do { \
l = _mm_crc32_u32(l, LE_LOAD32(p)); p += 4; \
l = _mm_crc32_u32(l, LE_LOAD32(p)); p += 4; \
} while(0)
#else
#define STEP8 do { STEP4; STEP4; } while(0)
#endif
// Point x at first 16-byte aligned byte in string. This might be
// just past the end of the string.