62f70f6d14
Summary: Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio. So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include: - The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called. - After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up. - Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952 Differential Revision: D13967980 Pulled By: ajkr fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
244 lines
6.9 KiB
C++
244 lines
6.9 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include <stdint.h>
|
|
#include "rocksdb/sst_dump_tool.h"
|
|
|
|
#include "rocksdb/filter_policy.h"
|
|
#include "table/block_based_table_factory.h"
|
|
#include "table/table_builder.h"
|
|
#include "util/file_reader_writer.h"
|
|
#include "util/testharness.h"
|
|
#include "util/testutil.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
// Size in bytes of each command-line argument buffer handed to the tool.
constexpr uint32_t optLength = 100;
|
|
|
|
namespace {
|
|
static std::string MakeKey(int i) {
|
|
char buf[100];
|
|
snprintf(buf, sizeof(buf), "k_%04d", i);
|
|
InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
|
|
return key.Encode().ToString();
|
|
}
|
|
|
|
static std::string MakeValue(int i) {
|
|
char buf[100];
|
|
snprintf(buf, sizeof(buf), "v_%04d", i);
|
|
InternalKey key(std::string(buf), 0, ValueType::kTypeValue);
|
|
return key.Encode().ToString();
|
|
}
|
|
|
|
void createSST(const Options& opts, const std::string& file_name) {
|
|
Env* env = opts.env;
|
|
EnvOptions env_options(opts);
|
|
ReadOptions read_options;
|
|
const ImmutableCFOptions imoptions(opts);
|
|
const MutableCFOptions moptions(opts);
|
|
rocksdb::InternalKeyComparator ikc(opts.comparator);
|
|
std::unique_ptr<TableBuilder> tb;
|
|
|
|
std::unique_ptr<WritableFile> file;
|
|
ASSERT_OK(env->NewWritableFile(file_name, &file, env_options));
|
|
|
|
std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
|
|
int_tbl_prop_collector_factories;
|
|
std::unique_ptr<WritableFileWriter> file_writer(
|
|
new WritableFileWriter(std::move(file), file_name, EnvOptions()));
|
|
std::string column_family_name;
|
|
int unknown_level = -1;
|
|
tb.reset(opts.table_factory->NewTableBuilder(
|
|
TableBuilderOptions(
|
|
imoptions, moptions, ikc, &int_tbl_prop_collector_factories,
|
|
CompressionType::kNoCompression, CompressionOptions(),
|
|
false /* skip_filters */, column_family_name, unknown_level),
|
|
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
|
|
file_writer.get()));
|
|
|
|
// Populate slightly more than 1K keys
|
|
uint32_t num_keys = 1024;
|
|
for (uint32_t i = 0; i < num_keys; i++) {
|
|
tb->Add(MakeKey(i), MakeValue(i));
|
|
}
|
|
tb->Finish();
|
|
file_writer->Close();
|
|
}
|
|
|
|
void cleanup(const Options& opts, const std::string& file_name) {
|
|
Env* env = opts.env;
|
|
env->DeleteFile(file_name);
|
|
std::string outfile_name = file_name.substr(0, file_name.length() - 4);
|
|
outfile_name.append("_dump.txt");
|
|
env->DeleteFile(outfile_name);
|
|
}
|
|
} // namespace
|
|
|
|
// Test for sst dump tool "raw" mode
|
|
class SSTDumpToolTest : public testing::Test {
|
|
std::string testDir_;
|
|
|
|
public:
|
|
SSTDumpToolTest() { testDir_ = test::TmpDir(); }
|
|
|
|
~SSTDumpToolTest() {}
|
|
|
|
std::string MakeFilePath(const std::string& file_name) const {
|
|
std::string path(testDir_);
|
|
path.append("/").append(file_name);
|
|
return path;
|
|
}
|
|
|
|
template <std::size_t N>
|
|
void PopulateCommandArgs(const std::string& file_path, const char* command,
|
|
char* (&usage)[N]) const {
|
|
for (int i = 0; i < static_cast<int>(N); ++i) {
|
|
usage[i] = new char[optLength];
|
|
}
|
|
snprintf(usage[0], optLength, "./sst_dump");
|
|
snprintf(usage[1], optLength, "%s", command);
|
|
snprintf(usage[2], optLength, "--file=%s", file_path.c_str());
|
|
}
|
|
};
|
|
|
|
// Runs sst_dump with "--command=raw" on an SST built from default-constructed
// Options and expects Run() to return zero.
TEST_F(SSTDumpToolTest, EmptyFilter) {
  Options opts;
  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
  createSST(opts, file_path);

  char* usage[3];
  PopulateCommandArgs(file_path, "--command=raw", usage);

  rocksdb::SSTDumpTool tool;
  // Record the outcome before asserting: ASSERT_TRUE returns from the test
  // on failure, which would otherwise skip the file cleanup and leak the
  // argument buffers.
  const bool tool_succeeded = !tool.Run(3, usage, opts);

  cleanup(opts, file_path);
  for (int i = 0; i < 3; i++) {
    delete[] usage[i];
  }
  ASSERT_TRUE(tool_succeeded);
}
|
|
|
|
// Runs sst_dump with "--command=raw" on an SST built by a
// BlockBasedTableFactory whose filter policy is NewBloomFilterPolicy(10, true)
// and expects Run() to return zero.
// NOTE(review): the `true` argument presumably selects the block-based filter
// builder -- confirm against the NewBloomFilterPolicy declaration.
TEST_F(SSTDumpToolTest, FilterBlock) {
  Options opts;
  BlockBasedTableOptions table_opts;
  table_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true));
  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
  createSST(opts, file_path);

  char* usage[3];
  PopulateCommandArgs(file_path, "--command=raw", usage);

  rocksdb::SSTDumpTool tool;
  // Record the outcome before asserting: ASSERT_TRUE returns from the test
  // on failure, which would otherwise skip the file cleanup and leak the
  // argument buffers.
  const bool tool_succeeded = !tool.Run(3, usage, opts);

  cleanup(opts, file_path);
  for (int i = 0; i < 3; i++) {
    delete[] usage[i];
  }
  ASSERT_TRUE(tool_succeeded);
}
|
|
|
|
// Runs sst_dump with "--command=raw" on an SST built by a
// BlockBasedTableFactory whose filter policy is
// NewBloomFilterPolicy(10, false) and expects Run() to return zero.
// NOTE(review): the `false` argument presumably selects the full-filter
// builder -- confirm against the NewBloomFilterPolicy declaration.
TEST_F(SSTDumpToolTest, FullFilterBlock) {
  Options opts;
  BlockBasedTableOptions table_opts;
  table_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
  createSST(opts, file_path);

  char* usage[3];
  PopulateCommandArgs(file_path, "--command=raw", usage);

  rocksdb::SSTDumpTool tool;
  // Record the outcome before asserting: ASSERT_TRUE returns from the test
  // on failure, which would otherwise skip the file cleanup and leak the
  // argument buffers.
  const bool tool_succeeded = !tool.Run(3, usage, opts);

  cleanup(opts, file_path);
  for (int i = 0; i < 3; i++) {
    delete[] usage[i];
  }
  ASSERT_TRUE(tool_succeeded);
}
|
|
|
|
// Runs sst_dump with "--show_properties" on an SST built by a
// BlockBasedTableFactory using NewBloomFilterPolicy(10, false) and expects
// Run() to return zero.
TEST_F(SSTDumpToolTest, GetProperties) {
  Options opts;
  BlockBasedTableOptions table_opts;
  table_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
  createSST(opts, file_path);

  char* usage[3];
  PopulateCommandArgs(file_path, "--show_properties", usage);

  rocksdb::SSTDumpTool tool;
  // Record the outcome before asserting: ASSERT_TRUE returns from the test
  // on failure, which would otherwise skip the file cleanup and leak the
  // argument buffers.
  const bool tool_succeeded = !tool.Run(3, usage, opts);

  cleanup(opts, file_path);
  for (int i = 0; i < 3; i++) {
    delete[] usage[i];
  }
  ASSERT_TRUE(tool_succeeded);
}
|
|
|
|
// Runs sst_dump with "--command=recompress" on an SST built by a
// BlockBasedTableFactory using NewBloomFilterPolicy(10, false) and expects
// Run() to return zero.
TEST_F(SSTDumpToolTest, CompressedSizes) {
  Options opts;
  BlockBasedTableOptions table_opts;
  table_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
  createSST(opts, file_path);

  char* usage[3];
  PopulateCommandArgs(file_path, "--command=recompress", usage);

  rocksdb::SSTDumpTool tool;
  // Record the outcome before asserting: ASSERT_TRUE returns from the test
  // on failure, which would otherwise skip the file cleanup and leak the
  // argument buffers.
  const bool tool_succeeded = !tool.Run(3, usage, opts);

  cleanup(opts, file_path);
  for (int i = 0; i < 3; i++) {
    delete[] usage[i];
  }
  ASSERT_TRUE(tool_succeeded);
}
|
|
|
|
// Runs sst_dump with "--command=verify_checksum" on an SST that was written
// through an Env created by NewMemEnv(Env::Default()) and expects Run() to
// return zero. The same Env is passed to the tool via `opts.env`.
TEST_F(SSTDumpToolTest, MemEnv) {
  // `env` must outlive `opts`' use of the raw pointer, i.e. the whole test.
  std::unique_ptr<Env> env(NewMemEnv(Env::Default()));
  Options opts;
  opts.env = env.get();
  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
  createSST(opts, file_path);

  char* usage[3];
  PopulateCommandArgs(file_path, "--command=verify_checksum", usage);

  rocksdb::SSTDumpTool tool;
  // Record the outcome before asserting: ASSERT_TRUE returns from the test
  // on failure, which would otherwise skip the file cleanup and leak the
  // argument buffers.
  const bool tool_succeeded = !tool.Run(3, usage, opts);

  cleanup(opts, file_path);
  for (int i = 0; i < 3; i++) {
    delete[] usage[i];
  }
  ASSERT_TRUE(tool_succeeded);
}
|
|
|
|
} // namespace rocksdb
|
|
|
|
int main(int argc, char** argv) {
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
return RUN_ALL_TESTS();
|
|
}
|
|
|
|
#else
|
|
#include <stdio.h>
|
|
|
|
// Stub entry point for ROCKSDB_LITE builds: prints a notice to stderr that
// the test is skipped and exits with status 0.
int main(int /*argc*/, char** /*argv*/) {
  fputs("SKIPPED as SSTDumpTool is not supported in ROCKSDB_LITE\n", stderr);
  return 0;
}
|
|
|
|
#endif  // !ROCKSDB_LITE
|