Ribbon: major re-work of hashing, seeds, and more (#7635)
Summary: * Fully optimized StandardHasher, in terms of efficiently generating Start, CoeffRow, and ResultRow from a stock hash value, with sufficient independence between them to have no measurably degraded behavior. (Degraded behavior would be an FP rate higher than explainable by 2^-b and, if using a 32-bit stock hash function, expected stock hash collisions.) Details in code comments. * Our standard 64-bit and 32-bit hash functions do not exhibit sufficient independence on sequential seeds (for one Ribbon construction attempt to have independent probability from the next). I have worked around this in the Ribbon code by "pre-mixing" "ordinal seeds," sequentially tried and appropriate for storage in persisted metadata, into "raw seeds," ready for application and appropriate for in-memory storage. This way the pre-mixing step (though fast) is only applied on loading or configuring the structure, not on each query or banding add. * Fix a subtle flaw in which backtracking not clearing ResultRow data could lead to elevated FP rate on keys that were backtracked on and should (for generality) exhibit the same FP rate as novel keys. * Added a basic test for PhsfQuery and construction algorithms (map or "retrieval structure" rather than set or filter), and made a few trivial related fixes. * Better random configuration generation in unit tests * Some other minor cleanup / clarification / etc. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7635 Test Plan: unit tests included Reviewed By: jay-zhuang Differential Revision: D24738978 Pulled By: pdillinger fbshipit-source-id: f9d03599d9e2ca3e30e9d3e7d81cd936b56f76f0
This commit is contained in:
parent
1e40696dd1
commit
8b8a2e9f05
@ -29,6 +29,8 @@ namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
// Stable/persistent 64-bit hash. Higher quality and generally faster than
|
||||
// Hash(), especially for inputs > 24 bytes.
|
||||
// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
|
||||
// results from previous seed. Recommend incrementing by a large odd number.
|
||||
extern uint64_t Hash64(const char* data, size_t n, uint64_t seed);
|
||||
|
||||
// Specific optimization without seed (same as seed = 0)
|
||||
@ -37,6 +39,8 @@ extern uint64_t Hash64(const char* data, size_t n);
|
||||
// Non-persistent hash. Must only be used for in-memory data structures.
|
||||
// The hash results are thus subject to change. (Thus, it rarely makes
|
||||
// sense to specify a seed for this function.)
|
||||
// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
|
||||
// results from previous seed. Recommend incrementing by a large odd number.
|
||||
inline uint64_t NPHash64(const char* data, size_t n, uint32_t seed) {
|
||||
// Currently same as Hash64
|
||||
return Hash64(data, n, seed);
|
||||
@ -51,6 +55,8 @@ inline uint64_t NPHash64(const char* data, size_t n) {
|
||||
// Stable/persistent 32-bit hash. Moderate quality and high speed on
|
||||
// small inputs.
|
||||
// TODO: consider rename to Hash32
|
||||
// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent
|
||||
// results from previous seed. Recommend pseudorandom or hashed seeds.
|
||||
extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
|
||||
|
||||
// TODO: consider rename to LegacyBloomHash32
|
||||
|
@ -405,7 +405,10 @@ namespace ribbon {
|
||||
// // big enough for the largest number of columns allowed.
|
||||
// typename ResultRow;
|
||||
// // An unsigned integer type sufficient for representing the number of
|
||||
// // rows in the solution structure. (TODO: verify any extra needed?)
|
||||
// // rows in the solution structure, and at least the arithmetic
|
||||
// // promotion size (usually 32 bits). uint32_t recommended because a
|
||||
// // single Ribbon construction doesn't really scale to billions of
|
||||
// // entries.
|
||||
// typename Index;
|
||||
// };
|
||||
|
||||
@ -554,11 +557,10 @@ bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
|
||||
int tz = CountTrailingZeroBits(cr);
|
||||
i += static_cast<Index>(tz);
|
||||
cr >>= tz;
|
||||
} else {
|
||||
assert((cr & 1) == 1);
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
assert((cr & 1) == 1);
|
||||
CoeffRow other = *(bs->CoeffRowPtr(i));
|
||||
if (other == 0) {
|
||||
*(bs->CoeffRowPtr(i)) = cr;
|
||||
@ -568,16 +570,19 @@ bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
|
||||
return true;
|
||||
}
|
||||
assert((other & 1) == 1);
|
||||
// Gaussian row reduction
|
||||
cr ^= other;
|
||||
rr ^= *(bs->ResultRowPtr(i));
|
||||
if (cr == 0) {
|
||||
// Inconsistency or (less likely) redundancy
|
||||
break;
|
||||
}
|
||||
// Find relative offset of next non-zero coefficient.
|
||||
int tz = CountTrailingZeroBits(cr);
|
||||
i += static_cast<Index>(tz);
|
||||
cr >>= tz;
|
||||
}
|
||||
|
||||
// Failed, unless result row == 0 because e.g. a duplicate input or a
|
||||
// stock hash collision, with same result row. (For filter, stock hash
|
||||
// collision implies same result row.) Or we could have a full equation
|
||||
@ -674,7 +679,11 @@ bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
|
||||
--backtrack_pos;
|
||||
Index i = bts->BacktrackGet(backtrack_pos);
|
||||
*(bs->CoeffRowPtr(i)) = 0;
|
||||
// Not required: *(bs->ResultRowPtr(i)) = 0;
|
||||
// Not strictly required, but is required for good FP rate on
|
||||
// inputs that might have been backtracked out. (We don't want
|
||||
// anything we've backtracked on to leak into final result, as
|
||||
// that might not be "harmless".)
|
||||
*(bs->ResultRowPtr(i)) = 0;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
@ -1088,8 +1097,8 @@ typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
|
||||
const Hash hash = hasher.GetHash(key);
|
||||
const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
|
||||
|
||||
const Index upper_start_block = iss->GetUpperStartBlock();
|
||||
Index num_columns = iss->GetUpperNumColumns();
|
||||
const Index upper_start_block = iss.GetUpperStartBlock();
|
||||
Index num_columns = iss.GetUpperNumColumns();
|
||||
Index start_block_num = start_slot / kCoeffBits;
|
||||
Index segment = start_block_num * num_columns -
|
||||
std::min(start_block_num, upper_start_block);
|
||||
@ -1103,14 +1112,14 @@ typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
|
||||
ResultRow sr = 0;
|
||||
const CoeffRow cr_left = cr << start_bit;
|
||||
for (Index i = 0; i < num_columns; ++i) {
|
||||
sr ^= BitParity(iss->LoadSegment(segment + i) & cr_left) << i;
|
||||
sr ^= BitParity(iss.LoadSegment(segment + i) & cr_left) << i;
|
||||
}
|
||||
|
||||
if (start_bit > 0) {
|
||||
segment += num_columns;
|
||||
const CoeffRow cr_right = cr >> (kCoeffBits - start_bit);
|
||||
for (Index i = 0; i < num_columns; ++i) {
|
||||
sr ^= BitParity(iss->LoadSegment(segment + i) & cr_right) << i;
|
||||
sr ^= BitParity(iss.LoadSegment(segment + i) & cr_right) << i;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1158,6 +1167,9 @@ bool InterleavedFilterQuery(const typename FilterQueryHasher::Key &key,
|
||||
|
||||
const ResultRow expected = hasher.GetResultRowFromHash(hash);
|
||||
|
||||
// TODO: consider optimizations such as
|
||||
// * mask fetched values and shift cr, rather than shifting fetched values
|
||||
// * get rid of start_bit == 0 condition with careful fetching & shifting
|
||||
if (start_bit == 0) {
|
||||
for (Index i = 0; i < num_columns; ++i) {
|
||||
if (BitParity(iss.LoadSegment(segment + i) & cr) !=
|
||||
|
@ -39,7 +39,8 @@ namespace ribbon {
|
||||
// static constexpr bool kFirstCoeffAlwaysOne;
|
||||
//
|
||||
// // An unsigned integer type for identifying a hash seed, typically
|
||||
// // uint32_t or uint64_t.
|
||||
// // uint32_t or uint64_t. Importantly, this is the amount of data
|
||||
// // stored in memory for identifying a raw seed. See StandardHasher.
|
||||
// typename Seed;
|
||||
//
|
||||
// // When true, the PHSF implements a static filter, expecting just
|
||||
@ -65,12 +66,7 @@ namespace ribbon {
|
||||
// // A seedable stock hash function on Keys. All bits of Hash must
|
||||
// // be reasonably high quality. XXH functions recommended, but
|
||||
// // Murmur, City, Farm, etc. also work.
|
||||
// //
|
||||
// // If sequential seeds are not sufficiently independent for your
|
||||
// // stock hash function, consider multiplying by a large odd constant.
|
||||
// // If seed 0 is still undesirable, consider adding 1 before the
|
||||
// // multiplication.
|
||||
// static Hash HashFn(const Key &, Seed);
|
||||
// static Hash HashFn(const Key &, Seed raw_seed);
|
||||
// };
|
||||
|
||||
// A bit of a hack to automatically construct the type for
|
||||
@ -114,6 +110,12 @@ struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
|
||||
0, \
|
||||
"avoid unused warnings, semicolon expected after macro call")
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4309) // cast truncating constant
|
||||
#pragma warning(disable : 4307) // arithmetic constant overflow
|
||||
#endif
|
||||
|
||||
// StandardHasher: A standard implementation of concepts RibbonTypes,
|
||||
// PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h.
|
||||
//
|
||||
@ -126,15 +128,31 @@ struct AddInputSelector<Key, ResultRow, true /*IsFilter*/> {
|
||||
// can do" with available hash information in terms of FP rate and
|
||||
// compactness. (64 bits recommended and sufficient for PHSF practical
|
||||
// purposes.)
|
||||
//
|
||||
// Another feature of this hasher is a minimal "premixing" of seeds before
|
||||
// they are provided to TypesAndSettings::HashFn in case that function does
|
||||
// not provide sufficiently independent hashes when iterating merely
|
||||
// sequentially on seeds. (This for example works around a problem with the
|
||||
// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXH3p or Hash64, and
|
||||
// MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step
|
||||
// translates "ordinal seeds," which we iterate sequentially to find a
|
||||
// solution, into "raw seeds," with many more bits changing for each
|
||||
// iteration. The translation is an easily reversible lightweight mixing,
|
||||
// not suitable for hashing on its own. An advantage of this approach is that
|
||||
// StandardHasher can store just the raw seed (e.g. 64 bits) for fast query
|
||||
// times, while from the application perspective, we can limit to a small
|
||||
// number of ordinal seeds (e.g. 64 in 6 bits) for saving in metadata.
|
||||
//
|
||||
// The default constructor initializes the seed to ordinal seed zero, which
|
||||
// is equal to raw seed zero.
|
||||
//
|
||||
template <class TypesAndSettings>
|
||||
class StandardHasher {
|
||||
public:
|
||||
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings);
|
||||
|
||||
StandardHasher(Seed seed = 0) : seed_(seed) {}
|
||||
|
||||
inline Hash GetHash(const Key& key) const {
|
||||
return TypesAndSettings::HashFn(key, seed_);
|
||||
return TypesAndSettings::HashFn(key, raw_seed_);
|
||||
};
|
||||
// For when AddInput == pair<Key, ResultRow> (kIsFilter == false)
|
||||
inline Hash GetHash(const std::pair<Key, ResultRow>& bi) const {
|
||||
@ -180,18 +198,59 @@ class StandardHasher {
|
||||
}
|
||||
}
|
||||
inline CoeffRow GetCoeffRow(Hash h) const {
|
||||
// This is a reasonably cheap but empirically effective remix/expansion
|
||||
// of the hash data to fill CoeffRow. (Large primes)
|
||||
// This is not so much "critical path" code because it can be done in
|
||||
// parallel (instruction level) with memory lookup.
|
||||
Unsigned128 a = Multiply64to128(h, 0x85EBCA77C2B2AE63U);
|
||||
Unsigned128 b = Multiply64to128(h, 0x27D4EB2F165667C5U);
|
||||
auto cr = static_cast<CoeffRow>(b ^ (a << 64) ^ (a >> 64));
|
||||
//
|
||||
// We do not need exhaustive remixing for CoeffRow, but just enough that
|
||||
// (a) every bit is reasonably independent from Start.
|
||||
// (b) every Hash-length bit subsequence of the CoeffRow has full or
|
||||
// nearly full entropy from h.
|
||||
// (c) if nontrivial bit subsequences within are correlated, it needs to
|
||||
// be more complicated than exact copy or bitwise not (at least without
|
||||
// kFirstCoeffAlwaysOne), or else there seems to be a kind of
|
||||
// correlated clustering effect.
|
||||
// (d) the CoeffRow is not zero, so that no one input on its own can
|
||||
// doom construction success. (Preferably a mix of 1's and 0's if
|
||||
// satisfying above.)
|
||||
|
||||
// First, establish sufficient bitwise independence from Start, with
|
||||
// multiplication by a large random prime.
|
||||
// Note that we cast to Hash because if we use product bits beyond
|
||||
// original input size, that's going to correlate with Start (FastRange)
|
||||
// even with a (likely) different multiplier here.
|
||||
Hash a = h * kCoeffAndResultFactor;
|
||||
|
||||
// If that's big enough, we're done. If not, we have to expand it,
|
||||
// maybe up to 4x size.
|
||||
uint64_t b = a;
|
||||
static_assert(
|
||||
sizeof(Hash) == sizeof(uint64_t) || sizeof(Hash) == sizeof(uint32_t),
|
||||
"Supported sizes");
|
||||
if (sizeof(Hash) < sizeof(uint64_t)) {
|
||||
// Almost-trivial hash expansion (OK - see above), favoring roughly
|
||||
// equal number of 1's and 0's in result
|
||||
b = (b << 32) ^ b ^ kCoeffXor32;
|
||||
}
|
||||
Unsigned128 c = b;
|
||||
static_assert(sizeof(CoeffRow) == sizeof(uint64_t) ||
|
||||
sizeof(CoeffRow) == sizeof(Unsigned128),
|
||||
"Supported sizes");
|
||||
if (sizeof(uint64_t) < sizeof(CoeffRow)) {
|
||||
// Almost-trivial hash expansion (OK - see above), favoring roughly
|
||||
// equal number of 1's and 0's in result
|
||||
c = (c << 64) ^ c ^ kCoeffXor64;
|
||||
}
|
||||
auto cr = static_cast<CoeffRow>(c);
|
||||
|
||||
// Now ensure the value is non-zero
|
||||
if (kFirstCoeffAlwaysOne) {
|
||||
cr |= 1;
|
||||
} else if (sizeof(CoeffRow) == sizeof(Hash)) {
|
||||
// Still have to ensure some bit is non-zero
|
||||
cr |= (cr == 0) ? 1 : 0;
|
||||
} else {
|
||||
// Still have to ensure non-zero
|
||||
cr |= static_cast<unsigned>(cr == 0);
|
||||
// (We did trivial expansion with constant xor, which ensures some
|
||||
// bits are non-zero.)
|
||||
}
|
||||
return cr;
|
||||
}
|
||||
@ -203,11 +262,19 @@ class StandardHasher {
|
||||
}
|
||||
inline ResultRow GetResultRowFromHash(Hash h) const {
|
||||
if (TypesAndSettings::kIsFilter) {
|
||||
// In contrast to GetStart, here we draw primarily from lower bits,
|
||||
// but not literally, which seemed to cause FP rate hit in some cases.
|
||||
// This is not so much "critical path" code because it can be done in
|
||||
// parallel (instruction level) with memory lookup.
|
||||
auto rr = static_cast<ResultRow>(h ^ (h >> 13) ^ (h >> 26));
|
||||
//
|
||||
// There is no evidence that ResultRow needs to be independent from
|
||||
// CoeffRow, so we draw from the same bits computed for CoeffRow,
|
||||
// which are reasonably independent from Start. (Inlining and common
|
||||
// subexpression elimination with GetCoeffRow should make this
|
||||
// a single shared multiplication in generated code.)
|
||||
Hash a = h * kCoeffAndResultFactor;
|
||||
// The bits here that are *most* independent of Start are the highest
|
||||
// order bits (as in Knuth multiplicative hash). To make those the
|
||||
// most preferred for use in the result row, we do a bswap here.
|
||||
auto rr = static_cast<ResultRow>(EndianSwapValue(a));
|
||||
return rr & GetResultRowMask();
|
||||
} else {
|
||||
// Must be zero
|
||||
@ -226,33 +293,80 @@ class StandardHasher {
|
||||
return bi.second;
|
||||
}
|
||||
|
||||
bool NextSeed(Seed max_seed) {
|
||||
if (seed_ >= max_seed) {
|
||||
return false;
|
||||
} else {
|
||||
++seed_;
|
||||
return true;
|
||||
}
|
||||
// Seed tracking APIs - see class comment
|
||||
// Stores `seed` verbatim as the raw seed; no pre-mixing is applied here.
void SetRawSeed(Seed seed) {
  raw_seed_ = seed;
}
|
||||
// Returns the raw seed exactly as stored (the value GetHash() passes to
// TypesAndSettings::HashFn). Declared const because it only reads state,
// so it can be called on a const hasher.
Seed GetRawSeed() const { return raw_seed_; }
|
||||
// Translates ordinal seed `count` into a raw seed and stores it.
//
// The translation is a simple, reversible mixing of any size (whole bytes)
// up to 64 bits, so the raw seed can be cast to any smaller size used for
// ordinal seeds without risk of duplicate raw seeds for unique ordinal
// seeds.
void SetOrdinalSeed(Seed count) {
  // Seed type might be smaller than numerical promotion size, but Hash
  // should be at least that size, so Hash is the intermediate type.
  static_assert(sizeof(Seed) <= sizeof(Hash),
                "Hash must be at least size of Seed");
  static_assert((kSeedMixMask & (kSeedMixMask >> kSeedMixShift)) == 0,
                "Illegal mask+shift");

  // Step 1: multiply by a large random prime (one-to-one for any prefix
  // of bits).
  const Hash multiplied = count * kToRawSeedFactor;

  // Step 2: within-byte one-to-one mixing.
  const Hash mixed =
      multiplied ^ ((multiplied & kSeedMixMask) >> kSeedMixShift);

  raw_seed_ = static_cast<Seed>(mixed);

  // Dynamic verification that the mixing round-trips.
  assert(GetOrdinalSeed() == count);
}
|
||||
// Recovers the ordinal seed from the stored raw seed by undoing the two
// mixing steps of SetOrdinalSeed in reverse order. Declared const because
// it only reads raw_seed_, so it can be called on a const hasher.
Seed GetOrdinalSeed() const {
  Hash tmp = raw_seed_;
  // Within-byte one-to-one mixing (its own inverse)
  tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift;
  // Multiply by the multiplicative inverse of kToRawSeedFactor (verified
  // at compile time below) to undo the multiplication step.
  static_assert(kToRawSeedFactor * kFromRawSeedFactor == Hash{1},
                "Must be inverses");
  return static_cast<Seed>(tmp * kFromRawSeedFactor);
}
|
||||
Seed GetSeed() const { return seed_; }
|
||||
void ResetSeed(Seed seed = 0) { seed_ = seed; }
|
||||
|
||||
protected:
|
||||
Seed seed_;
|
||||
// For expanding hash:
|
||||
// large random prime
|
||||
static constexpr Hash kCoeffAndResultFactor =
|
||||
static_cast<Hash>(0xc28f82822b650bedULL);
|
||||
// random-ish data
|
||||
static constexpr uint32_t kCoeffXor32 = 0xa6293635U;
|
||||
static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU;
|
||||
|
||||
// For pre-mixing seeds
|
||||
static constexpr Hash kSeedMixMask = static_cast<Hash>(0xf0f0f0f0f0f0f0f0ULL);
|
||||
static constexpr unsigned kSeedMixShift = 4U;
|
||||
static constexpr Hash kToRawSeedFactor =
|
||||
static_cast<Hash>(0xc78219a23eeadd03ULL);
|
||||
static constexpr Hash kFromRawSeedFactor =
|
||||
static_cast<Hash>(0xfe1a137d14b475abULL);
|
||||
|
||||
// See class description
|
||||
Seed raw_seed_ = 0;
|
||||
};
|
||||
|
||||
// StandardRehasher (and StandardRehasherAdapter): A variant of
|
||||
// StandardHasher that uses the same type for keys as for hashes.
|
||||
// This is primarily intended for building a Ribbon filter/PHSF
|
||||
// from existing hashes without going back to original inputs in order
|
||||
// to apply a different seed. This hasher seeds a 1-to-1 mixing
|
||||
// transformation to apply a seed to an existing hash (or hash-sized key).
|
||||
// This is primarily intended for building a Ribbon filter
|
||||
// from existing hashes without going back to original inputs in
|
||||
// order to apply a different seed. This hasher seeds a 1-to-1 mixing
|
||||
// transformation to apply a seed to an existing hash. (Untested for
|
||||
// hash-sized keys that are not already uniformly distributed.) This
|
||||
// transformation builds on the seed pre-mixing done in StandardHasher.
|
||||
//
|
||||
// Testing suggests essentially no degradation of solution success rate
|
||||
// vs. going back to original inputs when changing hash seeds. For example:
|
||||
// Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys
|
||||
// is about 1.10 for both StandardHasher and StandardRehasher.
|
||||
//
|
||||
// StandardRehasher is not really recommended for general PHSFs (not
|
||||
// filters) because a collision in the original hash could prevent
|
||||
// construction despite re-seeding the Rehasher. (Such collisions
|
||||
// do not interfere with filter construction.)
|
||||
//
|
||||
// concept RehasherTypesAndSettings: like TypesAndSettings but
|
||||
// does not require Key or HashFn.
|
||||
template <class RehasherTypesAndSettings>
|
||||
@ -262,28 +376,20 @@ class StandardRehasherAdapter : public RehasherTypesAndSettings {
|
||||
using Key = Hash;
|
||||
using Seed = typename RehasherTypesAndSettings::Seed;
|
||||
|
||||
static Hash HashFn(const Hash& input, Seed seed) {
|
||||
static_assert(sizeof(Hash) <= 8, "Hash too big");
|
||||
if (sizeof(Hash) > 4) {
|
||||
// XXH3_avalanche / XXH3p_avalanche (64-bit), modified for seed
|
||||
uint64_t h = input;
|
||||
h ^= h >> 37;
|
||||
h ^= seed * uint64_t{0xC2B2AE3D27D4EB4F};
|
||||
h *= uint64_t{0x165667B19E3779F9};
|
||||
h ^= h >> 32;
|
||||
return static_cast<Hash>(h);
|
||||
} else {
|
||||
// XXH32_avalanche (32-bit), modified for seed
|
||||
uint32_t h32 = static_cast<uint32_t>(input);
|
||||
h32 ^= h32 >> 15;
|
||||
h32 ^= seed * uint32_t{0x27D4EB4F};
|
||||
h32 *= uint32_t{0x85EBCA77};
|
||||
h32 ^= h32 >> 13;
|
||||
h32 *= uint32_t{0xC2B2AE3D};
|
||||
h32 ^= h32 >> 16;
|
||||
return static_cast<Hash>(h32);
|
||||
}
|
||||
static Hash HashFn(const Hash& input, Seed raw_seed) {
|
||||
// Note: raw_seed is already lightly pre-mixed, and this multiplication
|
||||
// by a large prime is sufficient mixing (low-to-high bits) on top of
|
||||
// that for good FastRange results, which depends primarily on highest
|
||||
// bits. (The hashed CoeffRow and ResultRow are less sensitive to
|
||||
// mixing than Start.)
|
||||
// Also note: did consider adding ^ (input >> some) before the
|
||||
// multiplication, but doesn't appear to be necessary.
|
||||
return (input ^ raw_seed) * kRehashFactor;
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr Hash kRehashFactor =
|
||||
static_cast<Hash>(0x6193d459236a3a0dULL);
|
||||
};
|
||||
|
||||
// See comment on StandardRehasherAdapter
|
||||
@ -291,6 +397,10 @@ template <class RehasherTypesAndSettings>
|
||||
using StandardRehasher =
|
||||
StandardHasher<StandardRehasherAdapter<RehasherTypesAndSettings>>;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
// Especially with smaller hashes (e.g. 32 bit), there can be noticeable
|
||||
// false positives due to collisions in the Hash returned by GetHash.
|
||||
// This function returns the expected FP rate due to those collisions,
|
||||
@ -442,9 +552,17 @@ class StandardBanding : public StandardHasher<TypesAndSettings> {
|
||||
|
||||
// Iteratively (a) resets the structure for `num_slots`, (b) attempts
|
||||
// to add the range of inputs, and (c) if unsuccessful, chooses next
|
||||
// hash seed, until either successful or unsuccessful with max_seed
|
||||
// (minimum one seed attempted). Returns true if successful. In that
|
||||
// case, use GetSeed() to get the successful seed.
|
||||
// hash seed, until either successful or unsuccessful with all the
|
||||
// allowed seeds. Returns true if successful. In that case, use
|
||||
// GetOrdinalSeed() or GetRawSeed() to get the successful seed.
|
||||
//
|
||||
// The allowed sequence of hash seeds is determined by
|
||||
// `starting_ordinal_seed`, the first ordinal seed to be attempted
|
||||
// (see StandardHasher), and `ordinal_seed_mask`, a bit mask (power of
|
||||
// two minus one) for the range of ordinal seeds to consider. The
|
||||
// max number of seeds considered will be ordinal_seed_mask + 1.
|
||||
// For filters we suggest `starting_ordinal_seed` be chosen randomly
|
||||
// or round-robin, to minimize false positive correlations between keys.
|
||||
//
|
||||
// If unsuccessful, how best to continue is going to be application
|
||||
// specific. It should be possible to choose parameters such that
|
||||
@ -459,16 +577,27 @@ class StandardBanding : public StandardHasher<TypesAndSettings> {
|
||||
// significant correlation in success, rather than independence.)
|
||||
template <typename InputIterator>
|
||||
bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin,
|
||||
InputIterator end, Seed max_seed) {
|
||||
StandardHasher<TypesAndSettings>::ResetSeed();
|
||||
InputIterator end,
|
||||
Seed starting_ordinal_seed = 0U,
|
||||
Seed ordinal_seed_mask = 63U) {
|
||||
// power of 2 minus 1
|
||||
assert((ordinal_seed_mask & (ordinal_seed_mask + 1)) == 0);
|
||||
// starting seed is within mask
|
||||
assert((starting_ordinal_seed & ordinal_seed_mask) ==
|
||||
starting_ordinal_seed);
|
||||
starting_ordinal_seed &= ordinal_seed_mask; // if not debug
|
||||
|
||||
Seed cur_ordinal_seed = starting_ordinal_seed;
|
||||
do {
|
||||
StandardHasher<TypesAndSettings>::SetOrdinalSeed(cur_ordinal_seed);
|
||||
Reset(num_slots);
|
||||
bool success = AddRange(begin, end);
|
||||
if (success) {
|
||||
return true;
|
||||
}
|
||||
} while (StandardHasher<TypesAndSettings>::NextSeed(max_seed));
|
||||
// No seed through max_seed worked.
|
||||
cur_ordinal_seed = (cur_ordinal_seed + 1) & ordinal_seed_mask;
|
||||
} while (cur_ordinal_seed != starting_ordinal_seed);
|
||||
// Reached limit by circling around
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -27,22 +27,119 @@ class RibbonTypeParamTest : public ::testing::Test {};
|
||||
|
||||
class RibbonTest : public ::testing::Test {};
|
||||
|
||||
namespace {
|
||||
|
||||
// Different ways of generating keys for testing
|
||||
|
||||
// Generate semi-sequential keys
|
||||
struct StandardKeyGen {
|
||||
StandardKeyGen(const std::string& prefix, uint64_t id)
|
||||
: id_(id), str_(prefix) {
|
||||
ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0);
|
||||
}
|
||||
|
||||
// Prefix (only one required)
|
||||
StandardKeyGen& operator++() {
|
||||
++id_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& operator*() {
|
||||
// Use multiplication to mix things up a little in the key
|
||||
ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
|
||||
id_ * uint64_t{0x1500000001});
|
||||
return str_;
|
||||
}
|
||||
|
||||
bool operator==(const StandardKeyGen& other) {
|
||||
// Same prefix is assumed
|
||||
return id_ == other.id_;
|
||||
}
|
||||
bool operator!=(const StandardKeyGen& other) {
|
||||
// Same prefix is assumed
|
||||
return id_ != other.id_;
|
||||
}
|
||||
|
||||
uint64_t id_;
|
||||
std::string str_;
|
||||
};
|
||||
|
||||
// Generate small sequential keys, that can misbehave with sequential seeds
|
||||
// as in https://github.com/Cyan4973/xxHash/issues/469.
|
||||
// These keys are only heuristically unique, but that's OK with 64 bits,
|
||||
// for testing purposes.
|
||||
struct SmallKeyGen {
|
||||
SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) {
|
||||
// Hash the prefix for a heuristically unique offset
|
||||
id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix);
|
||||
ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
|
||||
}
|
||||
|
||||
// Prefix (only one required)
|
||||
SmallKeyGen& operator++() {
|
||||
++id_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& operator*() {
|
||||
ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_);
|
||||
return str_;
|
||||
}
|
||||
|
||||
bool operator==(const SmallKeyGen& other) { return id_ == other.id_; }
|
||||
bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; }
|
||||
|
||||
uint64_t id_;
|
||||
std::string str_;
|
||||
};
|
||||
|
||||
template <typename KeyGen>
|
||||
struct Hash32KeyGenWrapper : public KeyGen {
|
||||
Hash32KeyGenWrapper(const std::string& prefix, uint64_t id)
|
||||
: KeyGen(prefix, id) {}
|
||||
uint32_t operator*() {
|
||||
auto& key = *static_cast<KeyGen&>(*this);
|
||||
// unseeded
|
||||
return ROCKSDB_NAMESPACE::GetSliceHash(key);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename KeyGen>
|
||||
struct Hash64KeyGenWrapper : public KeyGen {
|
||||
Hash64KeyGenWrapper(const std::string& prefix, uint64_t id)
|
||||
: KeyGen(prefix, id) {}
|
||||
uint64_t operator*() {
|
||||
auto& key = *static_cast<KeyGen&>(*this);
|
||||
// unseeded
|
||||
return ROCKSDB_NAMESPACE::GetSliceHash64(key);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate;
|
||||
using ROCKSDB_NAMESPACE::ribbon::StandardHasher;
|
||||
using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter;
|
||||
|
||||
struct DefaultTypesAndSettings {
|
||||
using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
|
||||
using ResultRow = uint8_t;
|
||||
using Index = uint32_t;
|
||||
using Hash = uint64_t;
|
||||
using Key = ROCKSDB_NAMESPACE::Slice;
|
||||
using Seed = uint32_t;
|
||||
using Key = ROCKSDB_NAMESPACE::Slice;
|
||||
static constexpr bool kIsFilter = true;
|
||||
static constexpr bool kFirstCoeffAlwaysOne = true;
|
||||
static constexpr bool kUseSmash = false;
|
||||
static constexpr bool kAllowZeroStarts = false;
|
||||
static Hash HashFn(const Key& key, Seed seed) {
|
||||
// TODO/FIXME: is there sufficient independence with sequential keys and
|
||||
// sequential seeds?
|
||||
return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed);
|
||||
static Hash HashFn(const Key& key, uint64_t raw_seed) {
|
||||
// This version 0.7.2 preview of XXH3 (a.k.a. XXH3p) function does
|
||||
// not pass SmallKeyGen tests below without some seed premixing from
|
||||
// StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
|
||||
return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
|
||||
}
|
||||
// For testing
|
||||
using KeyGen = StandardKeyGen;
|
||||
};
|
||||
|
||||
using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
|
||||
@ -62,16 +159,19 @@ struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash1 {
|
||||
struct TypesAndSettings_Result16 : public DefaultTypesAndSettings {
|
||||
using ResultRow = uint16_t;
|
||||
};
|
||||
struct TypesAndSettings_Result32 : public DefaultTypesAndSettings {
|
||||
using ResultRow = uint32_t;
|
||||
};
|
||||
struct TypesAndSettings_IndexSizeT : public DefaultTypesAndSettings {
|
||||
using Index = size_t;
|
||||
};
|
||||
struct TypesAndSettings_Hash32 : public DefaultTypesAndSettings {
|
||||
using Hash = uint32_t;
|
||||
static Hash HashFn(const Key& key, Seed seed) {
|
||||
// NOTE: Using RocksDB 32-bit Hash() here fails test below because of
|
||||
// insufficient mixing of seed (or generally insufficient mixing)
|
||||
return ROCKSDB_NAMESPACE::Upper32of64(
|
||||
ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), seed));
|
||||
static Hash HashFn(const Key& key, Hash raw_seed) {
|
||||
// This MurmurHash1 function does not pass tests below without the
|
||||
// seed premixing from StandardHasher. In fact, it needs more than
|
||||
// just a multiplication mixer on the ordinal seed.
|
||||
return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
|
||||
}
|
||||
};
|
||||
struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
|
||||
@ -81,6 +181,9 @@ struct TypesAndSettings_KeyString : public DefaultTypesAndSettings {
|
||||
using Key = std::string;
|
||||
};
|
||||
struct TypesAndSettings_Seed8 : public DefaultTypesAndSettings {
|
||||
// This is not a generally recommended configuration. With the configured
|
||||
// hash function, it would fail with SmallKeyGen due to insufficient
|
||||
// independence among the seeds.
|
||||
using Seed = uint8_t;
|
||||
};
|
||||
struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
|
||||
@ -89,78 +192,58 @@ struct TypesAndSettings_NoAlwaysOne : public DefaultTypesAndSettings {
|
||||
struct TypesAndSettings_AllowZeroStarts : public DefaultTypesAndSettings {
|
||||
static constexpr bool kAllowZeroStarts = true;
|
||||
};
|
||||
struct TypesAndSettings_RehasherWrapped : public DefaultTypesAndSettings {
|
||||
// This doesn't directly use StandardRehasher as a whole, but simulates
|
||||
// its behavior with unseeded hash of key, then seeded hash-to-hash
|
||||
// transform.
|
||||
static Hash HashFn(const Key& key, Seed seed) {
|
||||
Hash unseeded = DefaultTypesAndSettings::HashFn(key, /*seed*/ 0);
|
||||
using Rehasher = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
|
||||
DefaultTypesAndSettings>;
|
||||
return Rehasher::HashFn(unseeded, seed);
|
||||
}
|
||||
struct TypesAndSettings_Seed64 : public DefaultTypesAndSettings {
|
||||
using Seed = uint64_t;
|
||||
};
|
||||
struct TypesAndSettings_RehasherWrapped_Result16
|
||||
: public TypesAndSettings_RehasherWrapped {
|
||||
struct TypesAndSettings_Rehasher
|
||||
: public StandardRehasherAdapter<DefaultTypesAndSettings> {
|
||||
using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
|
||||
};
|
||||
struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
|
||||
using ResultRow = uint16_t;
|
||||
};
|
||||
struct TypesAndSettings_Rehasher32Wrapped : public TypesAndSettings_Hash32 {
|
||||
// This doesn't directly use StandardRehasher as a whole, but simulates
|
||||
// its behavior with unseeded hash of key, then seeded hash-to-hash
|
||||
// transform.
|
||||
static Hash HashFn(const Key& key, Seed seed) {
|
||||
Hash unseeded = TypesAndSettings_Hash32::HashFn(key, /*seed*/ 0);
|
||||
using Rehasher = ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter<
|
||||
TypesAndSettings_Hash32>;
|
||||
return Rehasher::HashFn(unseeded, seed);
|
||||
}
|
||||
struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
|
||||
using ResultRow = uint32_t;
|
||||
};
|
||||
struct TypesAndSettings_Rehasher_Seed64
|
||||
: public StandardRehasherAdapter<TypesAndSettings_Seed64> {
|
||||
using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
|
||||
// Note: 64-bit seed with Rehasher gives slightly better average reseeds
|
||||
};
|
||||
struct TypesAndSettings_Rehasher32
|
||||
: public StandardRehasherAdapter<TypesAndSettings_Hash32> {
|
||||
using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
|
||||
};
|
||||
struct TypesAndSettings_Rehasher32_Coeff64
|
||||
: public TypesAndSettings_Rehasher32 {
|
||||
using CoeffRow = uint64_t;
|
||||
};
|
||||
struct TypesAndSettings_SmallKeyGen : public DefaultTypesAndSettings {
|
||||
// SmallKeyGen stresses the independence of different hash seeds
|
||||
using KeyGen = SmallKeyGen;
|
||||
};
|
||||
struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
|
||||
// SmallKeyGen stresses the independence of different hash seeds
|
||||
using KeyGen = SmallKeyGen;
|
||||
};
|
||||
|
||||
using TestTypesAndSettings = ::testing::Types<
|
||||
TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
|
||||
TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash0,
|
||||
TypesAndSettings_Coeff64Smash1, TypesAndSettings_Result16,
|
||||
TypesAndSettings_IndexSizeT, TypesAndSettings_Hash32,
|
||||
TypesAndSettings_Hash32_Result16, TypesAndSettings_KeyString,
|
||||
TypesAndSettings_Seed8, TypesAndSettings_NoAlwaysOne,
|
||||
TypesAndSettings_AllowZeroStarts, TypesAndSettings_RehasherWrapped,
|
||||
TypesAndSettings_RehasherWrapped_Result16,
|
||||
TypesAndSettings_Rehasher32Wrapped>;
|
||||
TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
|
||||
TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
|
||||
TypesAndSettings_KeyString, TypesAndSettings_Seed8,
|
||||
TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
|
||||
TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
|
||||
TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
|
||||
TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
|
||||
TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
|
||||
TypesAndSettings_Hash32_SmallKeyGen>;
|
||||
TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);
|
||||
|
||||
namespace {
|
||||
|
||||
struct KeyGen {
|
||||
KeyGen(const std::string& prefix, uint64_t id) : id_(id), str_(prefix) {
|
||||
ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
|
||||
}
|
||||
|
||||
// Prefix (only one required)
|
||||
KeyGen& operator++() {
|
||||
++id_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::string& operator*() {
|
||||
// Use multiplication to mix things up a little in the key
|
||||
ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
|
||||
id_ * uint64_t{0x1500000001});
|
||||
return str_;
|
||||
}
|
||||
|
||||
bool operator==(const KeyGen& other) {
|
||||
// Same prefix is assumed
|
||||
return id_ == other.id_;
|
||||
}
|
||||
bool operator!=(const KeyGen& other) {
|
||||
// Same prefix is assumed
|
||||
return id_ != other.id_;
|
||||
}
|
||||
|
||||
uint64_t id_;
|
||||
std::string str_;
|
||||
};
|
||||
|
||||
// For testing Poisson-distributed (or similar) statistics, get value for
|
||||
// `stddevs_allowed` standard deviations above expected mean
|
||||
// `expected_count`.
|
||||
@ -199,14 +282,13 @@ uint64_t InfrequentPoissonLowerBound(double expected_count) {
|
||||
TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
|
||||
IMPORT_RIBBON_IMPL_TYPES(TypeParam);
|
||||
using KeyGen = typename TypeParam::KeyGen;
|
||||
|
||||
// For testing FP rate etc.
|
||||
constexpr Index kNumToCheck = 100000;
|
||||
|
||||
const auto log2_thoroughness =
|
||||
static_cast<Seed>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
|
||||
// FIXME: This upper bound seems excessive
|
||||
const Seed max_seed = 12 + log2_thoroughness;
|
||||
static_cast<Hash>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));
|
||||
|
||||
// With overhead of just 2%, expect ~50% encoding success per
|
||||
// seed with ~5k keys on 64-bit ribbon, or ~150k keys on 128-bit ribbon.
|
||||
@ -224,12 +306,15 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
uint64_t isoln_query_nanos = 0;
|
||||
uint64_t isoln_query_count = 0;
|
||||
|
||||
// Take different samples if you change thoroughness
|
||||
ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);
|
||||
|
||||
for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
|
||||
Index num_to_add =
|
||||
uint32_t num_to_add =
|
||||
sizeof(CoeffRow) == 16 ? 130000 : TypeParam::kUseSmash ? 5500 : 2500;
|
||||
|
||||
// Use different values between that number and 50% of that number
|
||||
num_to_add -= (i * /* misc prime */ 15485863) % (num_to_add / 2);
|
||||
num_to_add -= rnd.Uniformish(num_to_add / 2);
|
||||
|
||||
total_added += num_to_add;
|
||||
|
||||
@ -243,19 +328,21 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
// Round to nearest multiple of kCoeffBits
|
||||
num_slots = ((num_slots + kCoeffBits / 2) / kCoeffBits) * kCoeffBits;
|
||||
// Re-adjust num_to_add to get as close as possible to kFactor
|
||||
num_to_add = static_cast<Index>(num_slots / kFactor);
|
||||
num_to_add = static_cast<uint32_t>(num_slots / kFactor);
|
||||
}
|
||||
|
||||
std::string prefix;
|
||||
// Take different samples if you change thoroughness
|
||||
ROCKSDB_NAMESPACE::PutFixed32(&prefix,
|
||||
i + (FLAGS_thoroughness * 123456789U));
|
||||
ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());
|
||||
|
||||
// Batch that must be added
|
||||
std::string added_str = prefix + "added";
|
||||
KeyGen keys_begin(added_str, 0);
|
||||
KeyGen keys_end(added_str, num_to_add);
|
||||
|
||||
// A couple more that will probably be added
|
||||
KeyGen one_more(prefix + "more", 1);
|
||||
KeyGen two_more(prefix + "more", 2);
|
||||
|
||||
// Batch that may or may not be added
|
||||
const Index kBatchSize =
|
||||
sizeof(CoeffRow) == 16 ? 300 : TypeParam::kUseSmash ? 20 : 10;
|
||||
@ -268,11 +355,19 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
KeyGen other_keys_begin(not_str, 0);
|
||||
KeyGen other_keys_end(not_str, kNumToCheck);
|
||||
|
||||
// Vary bytes uniformly for InterleavedSoln to use number of solution
|
||||
// columns varying from 0 to max allowed by ResultRow type (and used by
|
||||
// SimpleSoln).
|
||||
size_t ibytes =
|
||||
(i * /* misc odd */ 67896789) % (sizeof(ResultRow) * num_to_add + 1);
|
||||
// Vary bytes for InterleavedSoln to use number of solution columns
|
||||
// from 0 to max allowed by ResultRow type (and used by SimpleSoln).
|
||||
// Specifically include 0 and max, and otherwise skew toward max.
|
||||
uint32_t max_ibytes = static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
|
||||
size_t ibytes;
|
||||
if (i == 0) {
|
||||
ibytes = 0;
|
||||
} else if (i == 1) {
|
||||
ibytes = max_ibytes;
|
||||
} else {
|
||||
// Skewed
|
||||
ibytes = std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
|
||||
}
|
||||
std::unique_ptr<char[]> idata(new char[ibytes]);
|
||||
InterleavedSoln isoln(idata.get(), ibytes);
|
||||
|
||||
@ -284,20 +379,23 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
{
|
||||
Banding banding;
|
||||
// Traditional solve for a fixed set.
|
||||
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, keys_begin,
|
||||
keys_end, max_seed));
|
||||
ASSERT_TRUE(
|
||||
banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));
|
||||
|
||||
// Now to test backtracking, starting with guaranteed fail
|
||||
// Now to test backtracking, starting with guaranteed fail. By using
|
||||
// the keys that will be used to test FP rate, we are then doing an
|
||||
// extra check that after backtracking there are no remnants (e.g. in
|
||||
// result side of banding) of these entries.
|
||||
Index occupied_count = banding.GetOccupiedCount();
|
||||
banding.EnsureBacktrackSize(kNumToCheck);
|
||||
ASSERT_FALSE(
|
||||
EXPECT_FALSE(
|
||||
banding.AddRangeOrRollBack(other_keys_begin, other_keys_end));
|
||||
ASSERT_EQ(occupied_count, banding.GetOccupiedCount());
|
||||
EXPECT_EQ(occupied_count, banding.GetOccupiedCount());
|
||||
|
||||
// Check that we still have a good chance of adding a couple more
|
||||
// individually
|
||||
first_single = banding.Add("one_more");
|
||||
second_single = banding.Add("two_more");
|
||||
first_single = banding.Add(*one_more);
|
||||
second_single = banding.Add(*two_more);
|
||||
Index more_added = (first_single ? 1 : 0) + (second_single ? 1 : 0);
|
||||
total_single_failures += 2U - more_added;
|
||||
|
||||
@ -307,12 +405,12 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
more_added += kBatchSize;
|
||||
++total_batch_successes;
|
||||
}
|
||||
ASSERT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
|
||||
EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
|
||||
|
||||
// Also verify that redundant adds are OK (no effect)
|
||||
ASSERT_TRUE(
|
||||
banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
|
||||
ASSERT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
|
||||
EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
|
||||
|
||||
// Now back-substitution
|
||||
soln.BackSubstFrom(banding);
|
||||
@ -320,39 +418,42 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
isoln.BackSubstFrom(banding);
|
||||
}
|
||||
|
||||
Seed seed = banding.GetSeed();
|
||||
total_reseeds += seed;
|
||||
if (seed > log2_thoroughness + 1) {
|
||||
fprintf(stderr, "%s high reseeds at %u, %u/%u: %u\n",
|
||||
seed > log2_thoroughness + 8 ? "FIXME Extremely" : "Somewhat",
|
||||
static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
|
||||
static_cast<unsigned>(num_slots), static_cast<unsigned>(seed));
|
||||
Seed reseeds = banding.GetOrdinalSeed();
|
||||
total_reseeds += reseeds;
|
||||
|
||||
EXPECT_LE(reseeds, 8 + log2_thoroughness);
|
||||
if (reseeds > log2_thoroughness + 1) {
|
||||
fprintf(
|
||||
stderr, "%s high reseeds at %u, %u/%u: %u\n",
|
||||
reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
|
||||
static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
|
||||
static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
|
||||
}
|
||||
hasher.ResetSeed(seed);
|
||||
hasher.SetOrdinalSeed(reseeds);
|
||||
}
|
||||
// soln and hasher now independent of Banding object
|
||||
|
||||
// Verify keys added
|
||||
KeyGen cur = keys_begin;
|
||||
while (cur != keys_end) {
|
||||
EXPECT_TRUE(soln.FilterQuery(*cur, hasher));
|
||||
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
|
||||
ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
|
||||
ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
|
||||
++cur;
|
||||
}
|
||||
// We (maybe) snuck these in!
|
||||
if (first_single) {
|
||||
EXPECT_TRUE(soln.FilterQuery("one_more", hasher));
|
||||
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery("one_more", hasher));
|
||||
ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
|
||||
ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
|
||||
}
|
||||
if (second_single) {
|
||||
EXPECT_TRUE(soln.FilterQuery("two_more", hasher));
|
||||
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery("two_more", hasher));
|
||||
ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
|
||||
ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
|
||||
}
|
||||
if (batch_success) {
|
||||
cur = batch_begin;
|
||||
while (cur != batch_end) {
|
||||
EXPECT_TRUE(soln.FilterQuery(*cur, hasher));
|
||||
EXPECT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
|
||||
ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
|
||||
ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
|
||||
++cur;
|
||||
}
|
||||
}
|
||||
@ -364,7 +465,8 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
ROCKSDB_NAMESPACE::StopWatchNano timer(ROCKSDB_NAMESPACE::Env::Default(),
|
||||
true);
|
||||
while (cur != other_keys_end) {
|
||||
fp_count += soln.FilterQuery(*cur, hasher) ? 1 : 0;
|
||||
bool fp = soln.FilterQuery(*cur, hasher);
|
||||
fp_count += fp ? 1 : 0;
|
||||
++cur;
|
||||
}
|
||||
soln_query_nanos += timer.ElapsedNanos();
|
||||
@ -375,8 +477,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
// For expected FP rate, also include false positives due to collisions
|
||||
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
|
||||
double correction =
|
||||
kNumToCheck * ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate(
|
||||
hasher, num_to_add);
|
||||
kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
|
||||
EXPECT_LE(fp_count,
|
||||
FrequentPoissonUpperBound(expected_fp_count + correction));
|
||||
EXPECT_GE(fp_count,
|
||||
@ -401,8 +502,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
// For expected FP rate, also include false positives due to collisions
|
||||
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
|
||||
double correction =
|
||||
kNumToCheck * ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate(
|
||||
hasher, num_to_add);
|
||||
kNumToCheck * ExpectedCollisionFpRate(hasher, num_to_add);
|
||||
EXPECT_LE(ifp_count,
|
||||
FrequentPoissonUpperBound(expected_fp_count + correction));
|
||||
EXPECT_GE(ifp_count,
|
||||
@ -448,12 +548,17 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
|
||||
fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
|
||||
// Values above were chosen to target around 50% chance of encoding success
|
||||
// rate (average of 1.0 re-seeds) or slightly better. But 1.1 is also close
|
||||
// rate (average of 1.0 re-seeds) or slightly better. But 1.15 is also close
|
||||
// enough.
|
||||
EXPECT_LE(total_reseeds,
|
||||
InfrequentPoissonUpperBound(1.1 * FLAGS_thoroughness));
|
||||
InfrequentPoissonUpperBound(1.15 * FLAGS_thoroughness));
|
||||
// Would use 0.85 here instead of 0.75, but
|
||||
// TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
|
||||
// sequential keys with a small, cheap hash function. We accept that
|
||||
// there are surely inputs that are somewhat bad for this setup, but
|
||||
// these somewhat good inputs are probably more likely.
|
||||
EXPECT_GE(total_reseeds,
|
||||
InfrequentPoissonLowerBound(0.9 * FLAGS_thoroughness));
|
||||
InfrequentPoissonLowerBound(0.75 * FLAGS_thoroughness));
|
||||
}
|
||||
|
||||
{
|
||||
@ -489,8 +594,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
// in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
|
||||
double average_added = 1.0 * total_added / FLAGS_thoroughness;
|
||||
expected_total_fp_count +=
|
||||
total_checked * ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate(
|
||||
Hasher(), average_added);
|
||||
total_checked * ExpectedCollisionFpRate(Hasher(), average_added);
|
||||
|
||||
uint64_t upper_bound = InfrequentPoissonUpperBound(expected_total_fp_count);
|
||||
uint64_t lower_bound = InfrequentPoissonLowerBound(expected_total_fp_count);
|
||||
@ -499,10 +603,6 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
expected_total_fp_count / total_checked,
|
||||
1.0 * upper_bound / total_checked,
|
||||
1.0 * lower_bound / total_checked);
|
||||
// FIXME: this can fail for Result16, e.g. --thoroughness=300
|
||||
// Seems due to inexpensive hashing in StandardHasher::GetCoeffRow and
|
||||
// GetResultRowFromHash as replacing those with different Hash64 instances
|
||||
// fixes it, at least mostly.
|
||||
EXPECT_LE(total_fp_count, upper_bound);
|
||||
EXPECT_GE(total_fp_count, lower_bound);
|
||||
}
|
||||
@ -511,6 +611,7 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
|
||||
TYPED_TEST(RibbonTypeParamTest, Extremes) {
|
||||
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
|
||||
IMPORT_RIBBON_IMPL_TYPES(TypeParam);
|
||||
using KeyGen = typename TypeParam::KeyGen;
|
||||
|
||||
size_t bytes = 128 * 1024;
|
||||
std::unique_ptr<char[]> buf(new char[bytes]);
|
||||
@ -523,7 +624,8 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
|
||||
// Add zero keys to minimal number of slots
|
||||
KeyGen begin_and_end("foo", 123);
|
||||
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
|
||||
/*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*max_seed*/ 0));
|
||||
/*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
|
||||
/* seed mask*/ 0));
|
||||
|
||||
soln.BackSubstFrom(banding);
|
||||
isoln.BackSubstFrom(banding);
|
||||
@ -547,9 +649,10 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
|
||||
// Solutions are equivalent
|
||||
ASSERT_EQ(isoln_query_result, soln_query_result);
|
||||
// And in fact we only expect an FP when ResultRow is 0
|
||||
ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
|
||||
hasher.GetHash(*cur)) == ResultRow{0});
|
||||
|
||||
// CHANGE: no longer true because of filling some unused slots
|
||||
// with pseudorandom values.
|
||||
// ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
|
||||
// hasher.GetHash(*cur)) == ResultRow{0});
|
||||
fp_count += soln_query_result ? 1 : 0;
|
||||
++cur;
|
||||
}
|
||||
@ -567,7 +670,8 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
|
||||
KeyGen key_begin("added", 0);
|
||||
KeyGen key_end("added", 1);
|
||||
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
|
||||
/*slots*/ kCoeffBits, key_begin, key_end, /*max_seed*/ 0));
|
||||
/*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
|
||||
/* seed mask*/ 0));
|
||||
|
||||
InterleavedSoln isoln2(nullptr, /*bytes*/ 0);
|
||||
|
||||
@ -584,6 +688,7 @@ TYPED_TEST(RibbonTypeParamTest, Extremes) {
|
||||
TEST(RibbonTest, AllowZeroStarts) {
|
||||
IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
|
||||
IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
|
||||
using KeyGen = StandardKeyGen;
|
||||
|
||||
InterleavedSoln isoln(nullptr, /*bytes*/ 0);
|
||||
SimpleSoln soln;
|
||||
@ -593,17 +698,16 @@ TEST(RibbonTest, AllowZeroStarts) {
|
||||
KeyGen begin("foo", 0);
|
||||
KeyGen end("foo", 1);
|
||||
// Can't add 1 entry
|
||||
ASSERT_FALSE(
|
||||
banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end, /*max_seed*/ 5));
|
||||
ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));
|
||||
|
||||
KeyGen begin_and_end("foo", 123);
|
||||
// Can add 0 entries
|
||||
ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
|
||||
begin_and_end, /*max_seed*/ 5));
|
||||
begin_and_end));
|
||||
|
||||
Seed seed = banding.GetSeed();
|
||||
ASSERT_EQ(seed, 0U);
|
||||
hasher.ResetSeed(seed);
|
||||
Seed reseeds = banding.GetOrdinalSeed();
|
||||
ASSERT_EQ(reseeds, 0U);
|
||||
hasher.SetOrdinalSeed(reseeds);
|
||||
|
||||
// Can construct 0-slot solutions
|
||||
isoln.BackSubstFrom(banding);
|
||||
@ -618,6 +722,123 @@ TEST(RibbonTest, AllowZeroStarts) {
|
||||
ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
|
||||
}
|
||||
|
||||
// Checks how "ordinal" seeds (given to SetOrdinalSeed) translate to "raw"
// seeds (read back with GetRawSeed) across StandardHasher instantiations
// that differ in their settings: ordinal 0 maps to raw 0, hashers agree on
// the low bits of the raw seed, and the translation does not repeat a low-bit
// prefix within the tested ranges.
TEST(RibbonTest, RawAndOrdinalSeeds) {
  StandardHasher<TypesAndSettings_Seed64> wide_seed_hasher;
  StandardHasher<DefaultTypesAndSettings> default_hasher;
  StandardHasher<TypesAndSettings_Hash32> hash32_hasher;
  StandardHasher<TypesAndSettings_Seed8> narrow_seed_hasher;

  for (uint32_t mask : {0xffU, 0xffffU}) {
    // One flag per possible value of (raw seed & mask)
    std::vector<bool> prefix_used(mask + 1);
    for (uint32_t ordinal = 0; ordinal < mask; ++ordinal) {
      wide_seed_hasher.SetOrdinalSeed(ordinal);
      auto raw_wide = wide_seed_hasher.GetRawSeed();
      hash32_hasher.SetOrdinalSeed(ordinal);
      auto raw_mid = hash32_hasher.GetRawSeed();
      narrow_seed_hasher.SetOrdinalSeed(static_cast<uint8_t>(ordinal));
      auto raw_narrow = narrow_seed_hasher.GetRawSeed();

      default_hasher.SetOrdinalSeed(ordinal);
      auto raw_default = default_hasher.GetRawSeed();
      ASSERT_EQ(raw_default, raw_mid);  // Same size seed

      if (ordinal != 0) {
        // Extremely likely that upper bits are set
        ASSERT_GT(raw_wide, raw_mid);
        ASSERT_GT(raw_mid, raw_narrow);
      } else {
        // Documented that ordinal seed 0 == raw seed 0
        ASSERT_EQ(raw_wide, 0U);
        ASSERT_EQ(raw_mid, 0U);
        ASSERT_EQ(raw_narrow, 0U);
      }

      // Hashers agree on lower bits
      ASSERT_EQ(static_cast<uint32_t>(raw_wide), raw_mid);
      ASSERT_EQ(static_cast<uint8_t>(raw_mid), raw_narrow);

      // The translation is one-to-one for this size prefix
      uint32_t low_bits = static_cast<uint32_t>(raw_mid & mask);
      ASSERT_EQ(raw_wide & mask, low_bits);
      ASSERT_FALSE(prefix_used[low_bits]);
      prefix_used[low_bits] = true;
    }
  }
}
|
||||
|
||||
namespace {
|
||||
|
||||
struct PhsfInputGen {
|
||||
PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) {
|
||||
val_.first = prefix;
|
||||
ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0);
|
||||
}
|
||||
|
||||
// Prefix (only one required)
|
||||
PhsfInputGen& operator++() {
|
||||
++id_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const std::pair<std::string, uint8_t>& operator*() {
|
||||
// Use multiplication to mix things up a little in the key
|
||||
ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8],
|
||||
id_ * uint64_t{0x1500000001});
|
||||
// Occasionally repeat values etc.
|
||||
val_.second = static_cast<uint8_t>(id_ * 7 / 8);
|
||||
return val_;
|
||||
}
|
||||
|
||||
const std::pair<std::string, uint8_t>* operator->() { return &**this; }
|
||||
|
||||
bool operator==(const PhsfInputGen& other) {
|
||||
// Same prefix is assumed
|
||||
return id_ == other.id_;
|
||||
}
|
||||
bool operator!=(const PhsfInputGen& other) {
|
||||
// Same prefix is assumed
|
||||
return id_ != other.id_;
|
||||
}
|
||||
|
||||
uint64_t id_;
|
||||
std::pair<std::string, uint8_t> val_;
|
||||
};
|
||||
|
||||
// Default settings with kIsFilter turned off; used by RibbonTest.PhsfBasic,
// which stores and retrieves a value per key via PhsfQuery rather than
// testing set membership.
struct PhsfTypesAndSettings : DefaultTypesAndSettings {
  static constexpr bool kIsFilter = false;
};
|
||||
} // namespace
|
||||
|
||||
// Basic construction-and-lookup check for the non-filter configuration:
// band a range of (key, value) inputs, back-substitute into both solution
// layouts, then confirm PhsfQuery returns the stored value for every key.
TEST(RibbonTest, PhsfBasic) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings);
  IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings);

  Index num_slots = 12800;
  // Leave 2% more slots than entries
  Index num_to_add = static_cast<Index>(num_slots / 1.02);

  PhsfInputGen begin("in", 0);
  PhsfInputGen end("in", num_to_add);

  // Interleaved storage is sized at one byte per slot
  std::unique_ptr<char[]> interleaved_buf(new char[/*bytes*/ num_slots]);
  InterleavedSoln isoln(interleaved_buf.get(), /*bytes*/ num_slots);
  SimpleSoln soln;
  Hasher hasher;

  {
    Banding banding;
    ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end));

    soln.BackSubstFrom(banding);
    isoln.BackSubstFrom(banding);

    // Carry over the seed that worked so queries hash the same way
    hasher.SetOrdinalSeed(banding.GetOrdinalSeed());
  }

  // Both solutions must reproduce the value of every input that was added
  PhsfInputGen cur = begin;
  while (cur != end) {
    ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher));
    ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher));
    ++cur;
  }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
#ifdef GFLAGS
|
||||
|
Loading…
Reference in New Issue
Block a user