Avoid popcnt on Windows when unavailable and in portable builds (#9680)

Summary:
Fixes https://github.com/facebook/rocksdb/issues/9560. Only use the popcnt intrinsics when HAVE_SSE42 is set. Also avoid setting HAVE_SSE42 based on the compiler test in portable builds, because such a test will pass on MSVC even without the proper arch flags (ref: https://devblogs.microsoft.com/oldnewthing/20201026-00/?p=104397).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9680

Test Plan: verified that the combinations of -DPORTABLE and -DFORCE_SSE42 produce the expected compiler flags on Linux. Verified that the MSVC build using PORTABLE=1 (in CircleCI) does not set HAVE_SSE42.

Reviewed By: pdillinger

Differential Revision: D34739033

Pulled By: ajkr

fbshipit-source-id: d10456f3392945fc3e59430a1777840f7b60b276
This commit is contained in:
Andrew Kryczka 2022-03-09 21:07:31 -08:00
parent 2e9a9f04d7
commit 1a2781d48c
2 changed files with 23 additions and 13 deletions

View File

@ -321,7 +321,8 @@ if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
endif() endif()
CHECK_CXX_SOURCE_COMPILES(" if (NOT PORTABLE OR FORCE_SSE42)
CHECK_CXX_SOURCE_COMPILES("
#include <cstdint> #include <cstdint>
#include <nmmintrin.h> #include <nmmintrin.h>
#include <wmmintrin.h> #include <wmmintrin.h>
@ -333,11 +334,12 @@ int main() {
auto d = _mm_cvtsi128_si64(c); auto d = _mm_cvtsi128_si64(c);
} }
" HAVE_SSE42) " HAVE_SSE42)
if(HAVE_SSE42) if(HAVE_SSE42)
add_definitions(-DHAVE_SSE42) add_definitions(-DHAVE_SSE42)
add_definitions(-DHAVE_PCLMUL) add_definitions(-DHAVE_PCLMUL)
elseif(FORCE_SSE42) elseif(FORCE_SSE42)
message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
endif()
endif() endif()
# Check if -latomic is required or not # Check if -latomic is required or not

View File

@ -92,18 +92,25 @@ inline int CountTrailingZeroBits(T v) {
#endif #endif
} }
#if defined(_MSC_VER) && !defined(_M_X64) // Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include
// the following code at coarse granularity for simpler macros. It's important
// to exclude at least so our non-MSVC unit test coverage tool doesn't see it.
#ifdef _MSC_VER
namespace detail { namespace detail {
template <typename T> template <typename T>
int BitsSetToOneFallback(T v) { int BitsSetToOneFallback(T v) {
const int kBits = static_cast<int>(sizeof(T)) * 8; const int kBits = static_cast<int>(sizeof(T)) * 8;
static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits"); static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
// we static_cast these bit patterns in order to truncate them to the correct // we static_cast these bit patterns in order to truncate them to the correct
// size // size. Warning C4309 dislikes this technique, so disable it here.
#pragma warning(disable : 4309)
v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull))); v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull)));
v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) + v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) +
((v >> 2) & static_cast<T>(0x3333333333333333ull))); ((v >> 2) & static_cast<T>(0x3333333333333333ull)));
v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full)); v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full));
#pragma warning(default : 4309)
for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) { for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) {
v += static_cast<T>(v >> shift_bits); v += static_cast<T>(v >> shift_bits);
} }
@ -113,7 +120,8 @@ int BitsSetToOneFallback(T v) {
} }
} // namespace detail } // namespace detail
#endif
#endif // _MSC_VER
// Number of bits set to 1. Also known as "population count". // Number of bits set to 1. Also known as "population count".
template <typename T> template <typename T>
@ -126,21 +134,21 @@ inline int BitsSetToOne(T v) {
constexpr auto mm = 8 * sizeof(uint32_t) - 1; constexpr auto mm = 8 * sizeof(uint32_t) - 1;
// The bit mask is to neutralize sign extension on small signed types // The bit mask is to neutralize sign extension on small signed types
constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1; constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
#if defined(_M_X64) || defined(_M_IX86) #if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m)); return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
#else #else
return static_cast<int>(detail::BitsSetToOneFallback(v) & m); return static_cast<int>(detail::BitsSetToOneFallback(v) & m);
#endif #endif
} else if (sizeof(T) == sizeof(uint32_t)) { } else if (sizeof(T) == sizeof(uint32_t)) {
#if defined(_M_X64) || defined(_M_IX86) #if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
return static_cast<int>(__popcnt(static_cast<uint32_t>(v))); return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
#else #else
return detail::BitsSetToOneFallback(static_cast<uint32_t>(v)); return detail::BitsSetToOneFallback(static_cast<uint32_t>(v));
#endif #endif
} else { } else {
#ifdef _M_X64 #if defined(HAVE_SSE42) && defined(_M_X64)
return static_cast<int>(__popcnt64(static_cast<uint64_t>(v))); return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
#elif defined(_M_IX86) #elif defined(HAVE_SSE42) && defined(_M_IX86)
return static_cast<int>( return static_cast<int>(
__popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32) + __popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32) +
__popcnt(static_cast<uint32_t>(v)))); __popcnt(static_cast<uint32_t>(v))));