From 95af128225a0e504670718df581e45ba1eee8520 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 4 May 2012 01:10:17 +0000 Subject: [PATCH] SSE4 optimization Summary: This speeds up CRC computation significantly on hardware that supports it. Enabled via -msse4. Note: the binary won't be usable on older CPUs that don't support the instruction. Test Plan: crc32c_test Reviewers: dhruba Reviewed By: dhruba Differential Revision: https://reviews.facebook.net/D3201 --- README.fb | 7 +++++++ util/crc32c.cc | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/README.fb b/README.fb index 691b472c8..e2581eb5f 100644 --- a/README.fb +++ b/README.fb @@ -1,3 +1,10 @@ * How to compile using fbcode and jemalloc source fbcode.sh make + +* Compiling for CPUs with SSE4 support + + make OPT='-O2 -DNDEBUG -msse4' + + This makes CRC computation much faster, but + binaries won't run on CPUs that don't support it. diff --git a/util/crc32c.cc b/util/crc32c.cc index 663cd9c9c..f91ca9f93 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -7,6 +7,9 @@ #include "util/crc32c.h" +#ifdef __SSE4_2__ +#include <nmmintrin.h> +#endif #include <stdint.h> #include "util/coding.h" @@ -283,6 +286,10 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { return DecodeFixed32(reinterpret_cast<const char*>(p)); } +static inline uint64_t LE_LOAD64(const uint8_t *p) { + return DecodeFixed64(reinterpret_cast<const char*>(p)); +} + uint32_t Extend(uint32_t crc, const char* buf, size_t size) { const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); const uint8_t *e = p + size; @@ -303,7 +310,12 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) { table1_[(c >> 16) & 0xff] ^ \ table0_[c >> 24]; \ } while (0) + +#ifdef __SSE4_2__ +#define STEP8 do { l = _mm_crc32_u64(l, LE_LOAD64(p)); p += 8; } while(0) +#else #define STEP8 do { STEP4; STEP4; } while(0) +#endif // Point x at first 16-byte aligned byte in string. This might be // just past the end of the string.