Pick samples for compression dictionary using prime number (#7987)

Summary:
The sample selection technique introduced in https://github.com/facebook/rocksdb/issues/7970 was problematic
because it had two code paths for sample selection depending on the
number of data blocks, and one of those code paths involved an
allocation. Using a prime number, we can consolidate into one code path
without allocation. The downside is that some values of N (the number of
data blocks buffered) suffer from poor spread in the selected samples.
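
For intuition, here is a minimal standalone sketch (not the RocksDB code itself; N = 10 is a hypothetical number of buffered blocks) showing why stepping by the prime's remainder modulo N visits every buffered block exactly once before wrapping around. The real code additionally stops as soon as `kSampleBytes` worth of samples has been collected:

```cpp
// Standalone sketch, not the RocksDB implementation: N = 10 is a made-up
// number of buffered blocks; the prime is the one chosen in this patch.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t kPrimeGenerator = 545055921143ull;
  const size_t kNumBlocksBuffered = 10;  // hypothetical N
  // The prime exceeds any realistic block count, so it is coprime with N and
  // (prime % N) generates the additive group of integers modulo N.
  const size_t kStride =
      static_cast<size_t>(kPrimeGenerator % kNumBlocksBuffered);

  size_t idx = kNumBlocksBuffered / 2;  // start in the middle, like the patch
  std::vector<bool> visited(kNumBlocksBuffered, false);
  for (size_t i = 0; i < kNumBlocksBuffered; ++i) {
    std::printf("%zu ", idx);  // prints: 5 8 1 4 7 0 3 6 9 2
    visited[idx] = true;
    idx += kStride;
    if (idx >= kNumBlocksBuffered) {
      idx -= kNumBlocksBuffered;  // cheaper than a modulo per step
    }
  }
  std::printf("\n");
  for (bool v : visited) {
    assert(v);  // coprimeness guarantees every block index was visited once
  }
  return 0;
}
```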

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7987

Test Plan: `make check -j48`

Reviewed By: pdillinger

Differential Revision: D26586147

Pulled By: ajkr

fbshipit-source-id: 62028e54336fadb6e2c7a7fe6747daa05a263d32
Author: Andrew Kryczka, 2021-02-22 17:41:11 -08:00 (committed by Facebook GitHub Bot)
Parent: 59d91796d2
Commit: daca92c17a

@@ -1644,44 +1644,41 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
   const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
                                   ? r->compression_opts.zstd_max_train_bytes
                                   : r->compression_opts.max_dict_bytes;
-  // If buffer size is reasonable, we pre-generate a permutation to enforce
-  // uniqueness. This prevents wasting samples on duplicates, which is
-  // particularly likely when not many blocks were buffered.
-  std::vector<uint16_t> data_block_order;
-  size_t data_block_order_idx = 0;
-  if (r->data_block_and_keys_buffers.size() <= ((1 << 16) - 1)) {
-    data_block_order.resize(r->data_block_and_keys_buffers.size());
-    std::iota(data_block_order.begin(), data_block_order.end(),
-              static_cast<uint16_t>(0));
-    // We could be smarter and interleave the shuffling and sample appending
-    // logic. Then we could terminate as soon as `kSampleBytes` is reached,
-    // saving some shuffling computation.
-    RandomShuffle(data_block_order.begin(), data_block_order.end(),
-                  static_cast<uint32_t>(r->creation_time));
-  }
-
-  Random64 generator{r->creation_time};
+  const size_t kNumBlocksBuffered = r->data_block_and_keys_buffers.size();
+
+  // Abstract algebra teaches us that a finite cyclic group (such as the
+  // additive group of integers modulo N) can be generated by a number that is
+  // coprime with N. Since N is variable (number of buffered data blocks), we
+  // must then pick a prime number in order to guarantee coprimeness with any N.
+  //
+  // One downside of this approach is the spread will be poor when
+  // `kPrimeGeneratorRemainder` is close to zero or close to
+  // `kNumBlocksBuffered`.
+  //
+  // Picked a random number between one and one trillion and then chose the
+  // next prime number greater than or equal to it.
+  const uint64_t kPrimeGenerator = 545055921143ull;
+  // Can avoid repeated division by just adding the remainder repeatedly.
+  const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
+      kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
+  const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
+
   std::string compression_dict_samples;
   std::vector<size_t> compression_dict_sample_lens;
-  if (!r->data_block_and_keys_buffers.empty()) {
-    while ((data_block_order.empty() ||
-            data_block_order_idx < data_block_order.size()) &&
-           compression_dict_samples.size() < kSampleBytes) {
-      size_t rand_idx;
-      if (data_block_order.empty()) {
-        rand_idx = static_cast<size_t>(
-            generator.Uniform(r->data_block_and_keys_buffers.size()));
-      } else {
-        rand_idx = data_block_order[data_block_order_idx];
-        ++data_block_order_idx;
-      }
-      size_t copy_len =
-          std::min(kSampleBytes - compression_dict_samples.size(),
-                   r->data_block_and_keys_buffers[rand_idx].first.size());
-      compression_dict_samples.append(
-          r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
-      compression_dict_sample_lens.emplace_back(copy_len);
-    }
-  }
+  size_t buffer_idx = kInitSampleIdx;
+  for (size_t i = 0;
+       i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+       ++i) {
+    size_t copy_len =
+        std::min(kSampleBytes - compression_dict_samples.size(),
+                 r->data_block_and_keys_buffers[buffer_idx].first.size());
+    compression_dict_samples.append(
+        r->data_block_and_keys_buffers[buffer_idx].first, 0, copy_len);
+    compression_dict_sample_lens.emplace_back(copy_len);
+
+    buffer_idx += kPrimeGeneratorRemainder;
+    if (buffer_idx >= kNumBlocksBuffered) {
+      buffer_idx -= kNumBlocksBuffered;
+    }
+  }
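
For a concrete view of the caveat in the new comment block, here is a small sketch with a hypothetical block count of 72, contrived so that the remainder lands at N - 1. If only a few samples fit in `kSampleBytes`, they then all come from a narrow band of adjacent blocks around the middle instead of spreading across the file:

```cpp
// Sketch of the poor-spread case; kNumBlocksBuffered = 72 is contrived so
// that 545055921143 % 72 == 71, i.e. the stride is N - 1.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kPrimeGenerator = 545055921143ull;
  const size_t kNumBlocksBuffered = 72;  // hypothetical, chosen for effect
  const size_t kStride =
      static_cast<size_t>(kPrimeGenerator % kNumBlocksBuffered);  // == 71

  size_t idx = kNumBlocksBuffered / 2;
  // Suppose kSampleBytes is exhausted after 8 blocks: every sample comes from
  // the middle of the file, because stepping by N - 1 is stepping back by one.
  for (size_t i = 0; i < 8; ++i) {
    std::printf("%zu ", idx);  // prints: 36 35 34 33 32 31 30 29
    idx += kStride;
    if (idx >= kNumBlocksBuffered) {
      idx -= kNumBlocksBuffered;
    }
  }
  std::printf("\n");
  return 0;
}
```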