Upgrade xxhash, add Hash128 (#8634)

Summary:
With expected use for a 128-bit hash, the xxhash library is
upgraded to the current dev version (2c611a76f914828bed675f0f342d6c4199ffee1e)
as of Aug 6 so that we can use the production version of XXH3_128bits
as the new Hash128 function (added in hash128.h).

To make this work, however, we have to carve out the "preview" version
of XXH3 that is used in new SST Bloom and Ribbon filters, since that
will not get maintenance in xxhash releases. I have consolidated all the
relevant code into xxph3.h and made it "inline only" (no .cc file). The
working name for this hash function is changed from XXH3p to XXPH3
(XX Preview Hash) because the latter is easier to get working with no
symbol name conflicts between the headers.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8634

Test Plan:
No expected change in existing functionality. For Hash128,
added some unit tests based on those for Hash64 to ensure some basic
properties and that the values do not change accidentally.

Reviewed By: zhichao-cao

Differential Revision: D30173490

Pulled By: pdillinger

fbshipit-source-id: 06aa542a7a28b353bc2c865b9b2f8bdfe44158e4
This commit is contained in:
Peter Dillinger 2021-08-20 18:40:53 -07:00 committed by Facebook GitHub Bot
parent 2a383f21f4
commit 22161b7547
11 changed files with 7102 additions and 2939 deletions

View File

@ -44,13 +44,13 @@ Slice FinishAlwaysFalse(std::unique_ptr<const char[]>* /*buf*/) {
// Base class for filter builders using the XXH3 preview hash,
// also known as Hash64 or GetSliceHash64.
class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder {
class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
public:
explicit XXH3pFilterBitsBuilder(
explicit XXPH3FilterBitsBuilder(
std::atomic<int64_t>* aggregate_rounding_balance)
: aggregate_rounding_balance_(aggregate_rounding_balance) {}
~XXH3pFilterBitsBuilder() override {}
~XXPH3FilterBitsBuilder() override {}
virtual void AddKey(const Slice& key) override {
uint64_t hash = GetSliceHash64(key);
@ -70,8 +70,8 @@ class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder {
protected:
static constexpr uint32_t kMetadataLen = 5;
// For delegating between XXH3pFilterBitsBuilders
void SwapEntriesWith(XXH3pFilterBitsBuilder* other) {
// For delegating between XXPH3FilterBitsBuilders
void SwapEntriesWith(XXPH3FilterBitsBuilder* other) {
std::swap(hash_entries_, other->hash_entries_);
}
@ -188,13 +188,13 @@ class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder {
// ############## also known as format_version=5 Bloom filter ########## //
// See description in FastLocalBloomImpl
class FastLocalBloomBitsBuilder : public XXH3pFilterBitsBuilder {
class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder {
public:
// Non-null aggregate_rounding_balance implies optimize_filters_for_memory
explicit FastLocalBloomBitsBuilder(
const int millibits_per_key,
std::atomic<int64_t>* aggregate_rounding_balance)
: XXH3pFilterBitsBuilder(aggregate_rounding_balance),
: XXPH3FilterBitsBuilder(aggregate_rounding_balance),
millibits_per_key_(millibits_per_key) {
assert(millibits_per_key >= 1000);
}
@ -421,12 +421,12 @@ struct Standard128RibbonRehasherTypesAndSettings {
using Standard128RibbonTypesAndSettings =
ribbon::StandardRehasherAdapter<Standard128RibbonRehasherTypesAndSettings>;
class Standard128RibbonBitsBuilder : public XXH3pFilterBitsBuilder {
class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
public:
explicit Standard128RibbonBitsBuilder(
double desired_one_in_fp_rate, int bloom_millibits_per_key,
std::atomic<int64_t>* aggregate_rounding_balance, Logger* info_log)
: XXH3pFilterBitsBuilder(aggregate_rounding_balance),
: XXPH3FilterBitsBuilder(aggregate_rounding_balance),
desired_one_in_fp_rate_(desired_one_in_fp_rate),
info_log_(info_log),
bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) {

View File

@ -8,10 +8,15 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/hash.h"
#include <string.h>
#include "port/lang.h"
#include "util/coding.h"
#include "util/hash128.h"
#include "util/math128.h"
#include "util/xxhash.h"
#include "util/xxph3.h"
namespace ROCKSDB_NAMESPACE {
@ -74,12 +79,12 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) {
// bundling hash functions specialized for particular lengths with
// the prefix extractors.
uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
return XXH3p_64bits_withSeed(data, n, seed);
return XXPH3_64bits_withSeed(data, n, seed);
}
uint64_t Hash64(const char* data, size_t n) {
// Same as seed = 0
return XXH3p_64bits(data, n);
return XXPH3_64bits(data, n);
}
uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
@ -97,4 +102,15 @@ uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
return NPHash64(concat_data.data(), concat_len, seed);
}
// Seeded 128-bit hash. Delegates to XXH3_128bits_withSeed and packs the
// result's high64 field into the upper 64 bits and its low64 field into the
// lower 64 bits of the returned Unsigned128.
Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
  const auto parts = XXH3_128bits_withSeed(data, n, seed);
  const Unsigned128 upper_half = Unsigned128{parts.high64} << 64;
  return upper_half | (parts.low64);
}
// Unseeded 128-bit hash (same as seed = 0). Delegates to XXH3_128bits and
// packs the result's high64 field into the upper 64 bits and its low64 field
// into the lower 64 bits of the returned Unsigned128.
Unsigned128 Hash128(const char* data, size_t n) {
  const auto parts = XXH3_128bits(data, n);
  const Unsigned128 upper_half = Unsigned128{parts.high64} << 64;
  return upper_half | (parts.low64);
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -10,7 +10,7 @@
// Common hash functions with convenient interfaces. If hashing a
// statically-sized input in a performance-critical context, consider
// calling a specific hash implementation directly, such as
// XXH3p_64bits from xxhash.h.
// XXH3_64bits from xxhash.h.
//
// Since this is a very common header, implementation details are kept
// out-of-line. Out-of-lining also aids in tracking the time spent in

26
util/hash128.h Normal file
View File

@ -0,0 +1,26 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
// 128-bit hash gets its own header so that the more popular hash.h doesn't
// depend on math128.h
#include "rocksdb/slice.h"
#include "util/math128.h"
namespace ROCKSDB_NAMESPACE {
// Stable/persistent 128-bit hash for non-cryptographic applications.
Unsigned128 Hash128(const char* data, size_t n, uint64_t seed);
// Specific optimization without seed (same as seed = 0)
Unsigned128 Hash128(const char* data, size_t n);
// Convenience wrapper: applies the unseeded Hash128 to the bytes that `key`
// refers to (key.data(), key.size()).
inline Unsigned128 GetSliceHash128(const Slice& key) {
  const auto* bytes = key.data();
  const auto length = key.size();
  return Hash128(bytes, length);
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -14,15 +14,20 @@
#include "test_util/testharness.h"
#include "util/coding.h"
#include "util/hash128.h"
#include "util/math128.h"
using ROCKSDB_NAMESPACE::EncodeFixed32;
using ROCKSDB_NAMESPACE::GetSliceHash64;
using ROCKSDB_NAMESPACE::Hash;
using ROCKSDB_NAMESPACE::Hash128;
using ROCKSDB_NAMESPACE::Hash64;
using ROCKSDB_NAMESPACE::Lower32of64;
using ROCKSDB_NAMESPACE::Lower64of128;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Unsigned128;
using ROCKSDB_NAMESPACE::Upper32of64;
using ROCKSDB_NAMESPACE::Upper64of128;
// The hash algorithm is part of the file format, for example for the Bloom
// filters. Test that the hash values are stable for a set of random strings of
@ -93,7 +98,8 @@ TEST(HashTest, Hash64Misc) {
for (size_t size = 0; size <= max_size; ++size) {
uint64_t here = Hash64(str.data(), size, kSeed);
// Must be same as GetSliceHash64
// Must be same as unseeded Hash64 and GetSliceHash64
EXPECT_EQ(here, Hash64(str.data(), size));
EXPECT_EQ(here, GetSliceHash64(Slice(str.data(), size)));
// Upper and Lower must reconstruct hash
@ -234,7 +240,7 @@ std::string Hash64TestDescriptor(const char *repeat, size_t limit) {
return rv;
}
// XXH3p changes its algorithm for various sizes up through 250 bytes, so
// XXPH3 changes its algorithm for various sizes up through 250 bytes, so
// we need to check the stability of larger sizes also.
TEST(HashTest, Hash64LargeValueSchema) {
// Each of these derives a "descriptor" from the hash values for all
@ -267,6 +273,117 @@ TEST(HashTest, Hash64LargeValueSchema) {
"eMFlxCIYUpTCsal2qsmnGOWa8WCcefrohMjDj1fjzSvSaQwlpyR1GZHF2uPOoQagiCpHpm");
}
// Basic consistency properties of Hash128, checked on inputs consisting of a
// single repeated byte at every length from 0 through 1000:
//  * the seeded call with seed 0 agrees with the unseeded call and with
//    GetSliceHash128;
//  * the upper and lower 64-bit halves recombine into the full value;
//  * changing the seed or shortening the input changes the hash (with high
//    probability).
TEST(HashTest, Hash128Misc) {
  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128

  for (char fill : {'\0', 'a', '1', '\xff'}) {
    const size_t max_size = 1000;
    const std::string str(max_size, fill);
    const char* const bytes = str.data();

    for (size_t size = 0; size <= max_size; ++size) {
      Unsigned128 here = Hash128(bytes, size, kSeed);

      // Must be same as unseeded Hash128 and GetSliceHash128
      EXPECT_EQ(here, Hash128(bytes, size));
      EXPECT_EQ(here, GetSliceHash128(Slice(bytes, size)));

      // Upper and Lower must reconstruct hash. The halves occupy disjoint
      // bits, so OR and XOR must both rebuild the original value.
      const auto upper = Upper64of128(here);
      const auto lower = Lower64of128(here);
      EXPECT_EQ(here, (Unsigned128{upper} << 64) | lower);
      EXPECT_EQ(here, (Unsigned128{upper} << 64) ^ lower);

      // Seed changes hash value (with high probability): try every seed
      // that has exactly one of its 64 bits set.
      for (int bit = 0; bit < 64; ++bit) {
        const uint64_t var_seed = uint64_t{1} << bit;
        EXPECT_NE(here, Hash128(bytes, size, var_seed));
      }

      // Size changes hash value (with high probability): compare against
      // each of the (at most 30) next-shorter prefixes, longest first.
      const size_t max_smaller_by = size < size_t{30} ? size : size_t{30};
      const size_t shortest = size - max_smaller_by;
      for (size_t shorter = size; shorter > shortest;) {
        --shorter;
        EXPECT_NE(here, Hash128(bytes, shorter, kSeed));
      }
    }
  }
}
// Test that hash values are "non-trivial" for "trivial" inputs: neither
// 64-bit half of Hash128 may be zero for the empty string under a range of
// seeds, nor for any short input under the standard seed.
TEST(HashTest, Hash128Trivial) {
  // Thorough test too slow for regression testing
  constexpr bool kThorough = false;
  constexpr uint64_t kSeedLimit = kThorough ? 0x1000000 : 0x10000;
  constexpr int kMaxLen = kThorough ? 3 : 2;
  constexpr uint32_t kSeed = 0;  // Same as GetSliceHash128

  // Shared check: both halves of a hash value must be non-zero.
  const auto expect_nonzero_halves = [](const Unsigned128& value) {
    EXPECT_NE(Lower64of128(value), 0u);
    EXPECT_NE(Upper64of128(value), 0u);
  };

  // For various seeds, make sure the hash of the empty string is not zero.
  uint64_t seed = 0;
  while (seed < kSeedLimit) {
    expect_nonzero_halves(Hash128("", 0, seed));
    ++seed;
  }

  // For the standard seed, make sure hashes of small strings are not zero.
  // Every value that fits in `len` bytes is encoded with EncodeFixed32 and
  // its first `len` bytes are hashed.
  char input[4];
  for (int len = 1; len <= kMaxLen; ++len) {
    const uint32_t value_count = uint32_t{1} << (len * 8);
    for (uint32_t value = 0; value < value_count; ++value) {
      EncodeFixed32(input, value);
      expect_nonzero_halves(Hash128(input, len, kSeed));
    }
  }
}
// Builds a compact "descriptor" string summarizing GetSliceHash128 over
// growing prefixes of a repeated pattern.
//
// `repeat` is appended to a buffer until the buffer holds at least `limit`
// bytes. Then, for each prefix length in [0, limit), the upper and lower
// 64-bit halves of the prefix's hash are added together and the sum modulo 61
// selects one character from a fixed 61-character alphabet. The returned
// string therefore has exactly `limit` characters.
std::string Hash128TestDescriptor(const char *repeat, size_t limit) {
  static const char kMod61Encode[] =
      "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";

  // Source bytes: enough copies of `repeat` to cover `limit` bytes.
  std::string input;
  for (; input.size() < limit;) {
    input += repeat;
  }

  std::string descriptor;
  for (size_t prefix_len = 0; prefix_len != limit; ++prefix_len) {
    const auto h = GetSliceHash128(Slice(input.data(), prefix_len));
    const uint64_t folded = Upper64of128(h) + Lower64of128(h);
    const size_t slot = static_cast<size_t>(folded % 61);
    descriptor.push_back(kMod61Encode[slot]);
  }
  return descriptor;
}
// XXH3 changes its algorithm for various sizes up through 250 bytes, so
// we need to check the stability of larger sizes also.
//
// Golden-value test: compares the output of Hash128TestDescriptor for three
// repeated patterns against hard-coded expected strings, so that any change
// in the Hash128 values for these inputs makes the test fail.
TEST(HashTest, Hash128ValueSchema) {
// Each of these derives a "descriptor" from the hash values for all
// prefix lengths from 0 through 429 (430 values, one character each).
// Note that "b" is common for the zero-length string.
EXPECT_EQ(
Hash128TestDescriptor("foo", 430),
"bUMA3As8n9I4vNGhThXlEevxZlyMcbb6TYAlIKJ2f5ponsv99q962rYclQ7u3gfnRdCDQ5JI"
"2LrGUaCycbXrvLFe4SjgRb9RQwCfrnmNQ7VSEwSKMnkGCK3bDbXSrnIh5qLXdtvIZklbJpGH"
"Dqr93BlqF9ubTnOSYkSdx89XvQqflMIW8bjfQp9BPjQejWOeEQspnN1D3sfgVdFhpaQdHYA5"
"pI2XcPlCMFPxvrFuRr7joaDvjNe9IUZaunLPMewuXmC3EL95h52Ju3D7y9RNKhgYxMTrA84B"
"yJrMvyjdm3vlBxet4EN7v2GEyjbGuaZW9UL6lrX6PghJDg7ACfLGdxNbH3qXM4zaiG2RKnL5"
"S3WXKR78RBB5fRFQ8KDIEQjHFvSNsc3GrAEi6W8P2lv8JMTzjBODO2uN4wadVQFT9wpGfV");
// Note that "35D2v" is common for "Rocks": the two patterns below share
// their first five bytes, so their descriptors agree on those positions.
EXPECT_EQ(
Hash128TestDescriptor("Rocks", 430),
"b35D2vzvklFVDqJmyLRXyApwGGO3EAT3swhe8XJAN3mY2UVPglzdmydxcba6JI2tSvwO6zSu"
"ANpjSM7tc9G5iMhsa7R8GfyCXRO1TnLg7HvdWNdgGGBirxZR68BgT7TQsYJt6zyEyISeXI1n"
"MXA48Xo7dWfJeYN6Z4KWlqZY7TgFXGbks9AX4ehZNSGtIhdO5i58qlgVX1bEejeOVaCcjC79"
"67DrMfOKds7rUQzjBa77sMPcoPW1vu6ljGJPZH3XkRyDMZ1twxXKkNxN3tE8nR7JHwyqBAxE"
"fTcjbOWrLZ1irWxRSombD8sGDEmclgF11IxqEhe3Rt7gyofO3nExGckKkS9KfRqsCHbiUyva"
"JGkJwUHRXaZnh58b4i1Ei9aQKZjXlvIVDixoZrjcNaH5XJIJlRZce9Z9t82wYapTpckYSg");
EXPECT_EQ(
Hash128TestDescriptor("RocksDB", 430),
"b35D2vFUst3XDZCRlSrhmYYakmqImV97LbBsV6EZlOEQpUPH1d1sD3xMKAPlA5UErHehg5O7"
"n966fZqhAf3hRc24kGCLfNAWjyUa7vSNOx3IcPoTyVRFZeFlcCtfl7t1QJumHOCpS33EBmBF"
"hvK13QjBbDWYWeHQhJhgV9Mqbx17TIcvUkEnYZxb8IzWNmjVsJG44Z7v52DjGj1ZzS62S2Vv"
"qWcDO7apvH5VHg68E9Wl6nXP21vlmUqEH9GeWRehfWVvY7mUpsAg5drHHQyDSdiMceiUuUxJ"
"XJqHFcDdzbbPk7xDvbLgWCKvH8k3MpQNWOmbSSRDdAP6nGlDjoTToYkcqVREHJzztSWAAq5h"
"GHSUNJ6OxsMHhf8EhXfHtKyUzRmPtjYyeckQcGmrQfFFLidc6cjMDKCdBG6c6HVBrS7H2R");
}
TEST(FastRange32Test, Values) {
using ROCKSDB_NAMESPACE::FastRange32;
// Zero range

View File

@ -148,7 +148,7 @@ struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
// they are provided to TypesAndSettings::HashFn in case that function does
// not provide sufficiently independent hashes when iterating merely
// sequentially on seeds. (This for example works around a problem with the
// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXH3p or Hash64, and
// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXPH3 or Hash64, and
// MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step
// translates "ordinal seeds," which we iterate sequentially to find a
// solution, into "raw seeds," with many more bits changing for each

View File

@ -204,7 +204,7 @@ struct DefaultTypesAndSettings {
static constexpr bool kUseSmash = false;
static constexpr bool kAllowZeroStarts = false;
static Hash HashFn(const Key& key, uint64_t raw_seed) {
// This version 0.7.2 preview of XXH3 (a.k.a. XXH3p) function does
// This version 0.7.2 preview of XXH3 (a.k.a. XXPH3) function does
// not pass SmallKeyGen tests below without some seed premixing from
// StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1761
util/xxph3.h Normal file

File diff suppressed because it is too large Load Diff