Fix undefined behavior in Hash

Summary:
Instead of ignoring UBSan checks, fix the negative shifts in
Hash(). Also add test to make sure the hash values are stable over
time. The values were computed before this change, so the test also
verifies the correctness of the change.
Closes https://github.com/facebook/rocksdb/pull/2546

Differential Revision: D5386369

Pulled By: yiwu-arbug

fbshipit-source-id: 6de4b44461a544d6222cc5d72d8cda2c0373d17e
This commit is contained in:
Giuseppe Ottaviano 2017-07-10 12:22:26 -07:00 committed by Facebook Github Bot
parent 643b787c7b
commit 8f927e5f75
5 changed files with 92 additions and 23 deletions

View File

@ -697,6 +697,7 @@ set(TESTS
util/event_logger_test.cc util/event_logger_test.cc
util/file_reader_writer_test.cc util/file_reader_writer_test.cc
util/filelock_test.cc util/filelock_test.cc
util/hash_test.cc
util/heap_test.cc util/heap_test.cc
util/rate_limiter_test.cc util/rate_limiter_test.cc
util/slice_transform_test.cc util/slice_transform_test.cc

View File

@ -336,6 +336,7 @@ TESTS = \
inlineskiplist_test \ inlineskiplist_test \
env_basic_test \ env_basic_test \
env_test \ env_test \
hash_test \
thread_local_test \ thread_local_test \
rate_limiter_test \ rate_limiter_test \
perf_context_test \ perf_context_test \
@ -990,6 +991,9 @@ cache_test: cache/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK) $(AM_LINK)
hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK) $(AM_LINK)

View File

@ -11,7 +11,7 @@ rocksdb_compiler_flags = [
"-DROCKSDB_MALLOC_USABLE_SIZE", "-DROCKSDB_MALLOC_USABLE_SIZE",
"-DROCKSDB_RANGESYNC_PRESENT", "-DROCKSDB_RANGESYNC_PRESENT",
"-DROCKSDB_SCHED_GETCPU_PRESENT", "-DROCKSDB_SCHED_GETCPU_PRESENT",
"-DROCKSDB_SUPPORT_THREAD_LOCAL", "-DROCKSDB_SUPPORT_THREAD_LOCAL",
"-DHAVE_SSE42", "-DHAVE_SSE42",
"-DOS_LINUX", "-DOS_LINUX",
# Flags to enable libs we include # Flags to enable libs we include
@ -407,6 +407,7 @@ ROCKS_TESTS = [['arena_test', 'util/arena_test.cc', 'serial'],
['hash_table_test', ['hash_table_test',
'utilities/persistent_cache/hash_table_test.cc', 'utilities/persistent_cache/hash_table_test.cc',
'serial'], 'serial'],
['hash_test', 'util/hash_test.cc', 'serial'],
['heap_test', 'util/heap_test.cc', 'serial'], ['heap_test', 'util/heap_test.cc', 'serial'],
['histogram_test', 'monitoring/histogram_test.cc', 'serial'], ['histogram_test', 'monitoring/histogram_test.cc', 'serial'],
['inlineskiplist_test', 'memtable/inlineskiplist_test.cc', 'parallel'], ['inlineskiplist_test', 'memtable/inlineskiplist_test.cc', 'parallel'],

View File

@ -15,13 +15,6 @@
namespace rocksdb { namespace rocksdb {
#ifdef ROCKSDB_UBSAN_RUN
#if defined(__clang__)
__attribute__((__no_sanitize__("shift")))
#elif defined(__GNUC__)
__attribute__((__no_sanitize_undefined__))
#endif
#endif
uint32_t Hash(const char* data, size_t n, uint32_t seed) { uint32_t Hash(const char* data, size_t n, uint32_t seed) {
// Similar to murmur hash // Similar to murmur hash
const uint32_t m = 0xc6a4a793; const uint32_t m = 0xc6a4a793;
@ -40,26 +33,21 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
// Pick up remaining bytes // Pick up remaining bytes
switch (limit - data) { switch (limit - data) {
// Note: It would be better if this was cast to unsigned char, but that // Note: The original hash implementation used data[i] << shift, which
// would be a disk format change since we previously didn't have any cast // promotes the char to int and then performs the shift. If the char is
// at all (so gcc used signed char). // negative, the shift is undefined behavior in C++. The hash algorithm is
// To understand the difference between shifting unsigned and signed chars, // part of the format definition, so we cannot change it; to obtain the same
// let's use 250 as an example. unsigned char will be 250, while signed char // behavior in a legal way we just cast to uint32_t, which will do
// will be -6. Bit-wise, they are equivalent: 11111010. However, when // sign-extension. To guarantee compatibility with architectures where chars
// converting negative number (signed char) to int, it will be converted // are unsigned we first cast the char to int8_t.
// into negative int (of equivalent value, which is -6), while converting
// positive number (unsigned char) will be converted to 250. Bitwise,
// this looks like this:
// signed char 11111010 -> int 11111111111111111111111111111010
// unsigned char 11111010 -> int 00000000000000000000000011111010
case 3: case 3:
h += static_cast<uint32_t>(static_cast<signed char>(data[2]) << 16); h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
// fall through // fall through
case 2: case 2:
h += static_cast<uint32_t>(static_cast<signed char>(data[1]) << 8); h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
// fall through // fall through
case 1: case 1:
h += static_cast<uint32_t>(static_cast<signed char>(data[0])); h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
h *= m; h *= m;
h ^= (h >> r); h ^= (h >> r);
break; break;

75
util/hash_test.cc Normal file
View File

@ -0,0 +1,75 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// This source code is also licensed under the GPLv2 license found in the
// COPYING file in the root directory of this source tree.
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <vector>
#include "util/hash.h"
#include "util/testharness.h"
// The hash algorithm is part of the file format, for example for the Bloom
// filters. Test that the hash values are stable for a set of random strings of
// varying lengths.
TEST(HashTest, Values) {
using rocksdb::Hash;
constexpr uint32_t kSeed = 0xbc9f1d34; // Same as BloomHash.
EXPECT_EQ(Hash("", 0, kSeed), 3164544308);
EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524);
EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998);
EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349);
EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383);
EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956);
EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105);
EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776);
EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603);
EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797);
EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265);
EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661);
EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222);
EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265);
EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611);
EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572);
EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355);
EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370);
EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402);
EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743);
EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500);
EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008);
EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482);
EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422);
EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029);
EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748);
EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698);
EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227);
EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252);
EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402);
EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698);
EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809);
EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120);
EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed), 2706087434);
EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed), 1534654151);
EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed), 2355554696);
EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed), 1400800912);
EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed),
3420325137);
EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed),
3427803584);
EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed),
1152407945);
EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed),
3382479516);
}
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}