Support ingest_behind for IngestExternalFile
Summary: First cut for early review; there are few conceptual points to answer and some code structure issues. For conceptual points - - restriction-wise, we're going to disallow ingest_behind if (use_seqno_zero_out=true || disable_auto_compaction=false), the user is responsible to properly open and close DB with required params - we wanted to ingest into reserved bottom most level. Should we fail fast if bottom level isn't empty, or should we attempt to ingest if file fits there key-ranges-wise? - Modifying AssignLevelForIngestedFile seems the place we we'd handle that. On code structure - going to refactor GenerateAndAddExternalFile call in the test class to allow passing instance of IngestionOptions, that's just going to incur lots of changes at callsites. Closes https://github.com/facebook/rocksdb/pull/2144 Differential Revision: D4873732 Pulled By: lightmark fbshipit-source-id: 81cb698106b68ef8797f564453651d50900e153a
This commit is contained in:
parent
01ab7b528c
commit
ba685a472a
@ -172,6 +172,12 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
|
|||||||
result.num_levels < 2) {
|
result.num_levels < 2) {
|
||||||
result.num_levels = 2;
|
result.num_levels = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (result.compaction_style == kCompactionStyleUniversal &&
|
||||||
|
db_options.allow_ingest_behind && result.num_levels < 3) {
|
||||||
|
result.num_levels = 3;
|
||||||
|
}
|
||||||
|
|
||||||
if (result.max_write_buffer_number < 2) {
|
if (result.max_write_buffer_number < 2) {
|
||||||
result.max_write_buffer_number = 2;
|
result.max_write_buffer_number = 2;
|
||||||
}
|
}
|
||||||
|
@ -548,7 +548,8 @@ void CompactionIterator::PrepareOutput() {
|
|||||||
|
|
||||||
// This is safe for TransactionDB write-conflict checking since transactions
|
// This is safe for TransactionDB write-conflict checking since transactions
|
||||||
// only care about sequence number larger than any active snapshots.
|
// only care about sequence number larger than any active snapshots.
|
||||||
if (bottommost_level_ && valid_ && ikey_.sequence <= earliest_snapshot_ &&
|
if ((compaction_ != nullptr && !compaction_->allow_ingest_behind()) &&
|
||||||
|
bottommost_level_ && valid_ && ikey_.sequence <= earliest_snapshot_ &&
|
||||||
ikey_.type != kTypeMerge &&
|
ikey_.type != kTypeMerge &&
|
||||||
!cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) {
|
!cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) {
|
||||||
assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
|
assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
#include "db/merge_helper.h"
|
#include "db/merge_helper.h"
|
||||||
#include "db/pinned_iterators_manager.h"
|
#include "db/pinned_iterators_manager.h"
|
||||||
#include "db/range_del_aggregator.h"
|
#include "db/range_del_aggregator.h"
|
||||||
|
#include "options/cf_options.h"
|
||||||
#include "rocksdb/compaction_filter.h"
|
#include "rocksdb/compaction_filter.h"
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
@ -48,6 +49,9 @@ class CompactionIterator {
|
|||||||
virtual Slice GetLargestUserKey() const {
|
virtual Slice GetLargestUserKey() const {
|
||||||
return compaction_->GetLargestUserKey();
|
return compaction_->GetLargestUserKey();
|
||||||
}
|
}
|
||||||
|
virtual bool allow_ingest_behind() const {
|
||||||
|
return compaction_->immutable_cf_options()->allow_ingest_behind;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
CompactionProxy() = default;
|
CompactionProxy() = default;
|
||||||
|
@ -156,6 +156,7 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
|
|||||||
virtual Slice GetLargestUserKey() const {
|
virtual Slice GetLargestUserKey() const {
|
||||||
return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
|
return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
|
||||||
}
|
}
|
||||||
|
virtual bool allow_ingest_behind() const { return false; }
|
||||||
|
|
||||||
bool key_not_exists_beyond_output_level = false;
|
bool key_not_exists_beyond_output_level = false;
|
||||||
};
|
};
|
||||||
|
@ -529,7 +529,11 @@ Compaction* CompactionPicker::CompactRange(
|
|||||||
// files together to the last level.
|
// files together to the last level.
|
||||||
assert(vstorage->num_levels() > 1);
|
assert(vstorage->num_levels() > 1);
|
||||||
// DBImpl::CompactRange() set output level to be the last level
|
// DBImpl::CompactRange() set output level to be the last level
|
||||||
assert(output_level == vstorage->num_levels() - 1);
|
if (ioptions_.allow_ingest_behind) {
|
||||||
|
assert(output_level == vstorage->num_levels() - 2);
|
||||||
|
} else {
|
||||||
|
assert(output_level == vstorage->num_levels() - 1);
|
||||||
|
}
|
||||||
// DBImpl::RunManualCompaction will make full range for universal compaction
|
// DBImpl::RunManualCompaction will make full range for universal compaction
|
||||||
assert(begin == nullptr);
|
assert(begin == nullptr);
|
||||||
assert(end == nullptr);
|
assert(end == nullptr);
|
||||||
|
@ -402,6 +402,35 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
|
|||||||
vstorage_->CompactionScore(0) >= 1);
|
vstorage_->CompactionScore(0) >= 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
|
||||||
|
const uint64_t kFileSize = 100000;
|
||||||
|
NewVersionStorage(1, kCompactionStyleUniversal);
|
||||||
|
ioptions_.allow_ingest_behind = true;
|
||||||
|
ioptions_.num_levels = 3;
|
||||||
|
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
|
||||||
|
// must return false when there's no files.
|
||||||
|
ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
|
||||||
|
false);
|
||||||
|
|
||||||
|
NewVersionStorage(3, kCompactionStyleUniversal);
|
||||||
|
|
||||||
|
Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
|
||||||
|
Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
|
||||||
|
Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
|
||||||
|
Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
|
||||||
|
Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
|
||||||
|
Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
|
||||||
|
|
||||||
|
UpdateVersionStorageInfo();
|
||||||
|
|
||||||
|
std::unique_ptr<Compaction> compaction(
|
||||||
|
universal_compaction_picker.PickCompaction(
|
||||||
|
cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
|
||||||
|
|
||||||
|
// output level should be the one above the bottom-most
|
||||||
|
ASSERT_EQ(1, compaction->output_level());
|
||||||
|
}
|
||||||
// Tests if the files can be trivially moved in multi level
|
// Tests if the files can be trivially moved in multi level
|
||||||
// universal compaction when allow_trivial_move option is set
|
// universal compaction when allow_trivial_move option is set
|
||||||
// In this test as the input files overlaps, they cannot
|
// In this test as the input files overlaps, they cannot
|
||||||
|
@ -568,6 +568,13 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns(
|
|||||||
output_level = sorted_runs[first_index_after].level - 1;
|
output_level = sorted_runs[first_index_after].level - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// last level is reserved for the files ingested behind
|
||||||
|
if (ioptions_.allow_ingest_behind &&
|
||||||
|
(output_level == vstorage->num_levels() - 1)) {
|
||||||
|
assert(output_level > 1);
|
||||||
|
output_level--;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<CompactionInputFiles> inputs(vstorage->num_levels());
|
std::vector<CompactionInputFiles> inputs(vstorage->num_levels());
|
||||||
for (size_t i = 0; i < inputs.size(); ++i) {
|
for (size_t i = 0; i < inputs.size(); ++i) {
|
||||||
inputs[i].level = start_level + static_cast<int>(i);
|
inputs[i].level = start_level + static_cast<int>(i);
|
||||||
@ -719,13 +726,20 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp(
|
|||||||
cf_name.c_str(), file_num_buf);
|
cf_name.c_str(), file_num_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// output files at the bottom most level, unless it's reserved
|
||||||
|
int output_level = vstorage->num_levels() - 1;
|
||||||
|
// last level is reserved for the files ingested behind
|
||||||
|
if (ioptions_.allow_ingest_behind) {
|
||||||
|
assert(output_level > 1);
|
||||||
|
output_level--;
|
||||||
|
}
|
||||||
|
|
||||||
return new Compaction(
|
return new Compaction(
|
||||||
vstorage, ioptions_, mutable_cf_options, std::move(inputs),
|
vstorage, ioptions_, mutable_cf_options, std::move(inputs),
|
||||||
vstorage->num_levels() - 1,
|
output_level, mutable_cf_options.MaxFileSizeForLevel(output_level),
|
||||||
mutable_cf_options.MaxFileSizeForLevel(vstorage->num_levels() - 1),
|
|
||||||
/* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
|
/* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
|
||||||
GetCompressionType(ioptions_, vstorage, mutable_cf_options,
|
GetCompressionType(ioptions_, vstorage, mutable_cf_options,
|
||||||
vstorage->num_levels() - 1, 1),
|
output_level, 1),
|
||||||
/* grandparents */ {}, /* is manual */ false, score,
|
/* grandparents */ {}, /* is manual */ false, score,
|
||||||
false /* deletion_compaction */,
|
false /* deletion_compaction */,
|
||||||
CompactionReason::kUniversalSizeAmplification);
|
CompactionReason::kUniversalSizeAmplification);
|
||||||
|
@ -2614,6 +2614,15 @@ Status DBImpl::IngestExternalFile(
|
|||||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||||
auto cfd = cfh->cfd();
|
auto cfd = cfh->cfd();
|
||||||
|
|
||||||
|
// Ingest should immediately fail if ingest_behind is requested,
|
||||||
|
// but the DB doesn't support it.
|
||||||
|
if (ingestion_options.ingest_behind) {
|
||||||
|
if (!immutable_db_options_.allow_ingest_behind) {
|
||||||
|
return Status::InvalidArgument(
|
||||||
|
"Can't ingest_behind file in DB with allow_ingest_behind=false");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ExternalSstFileIngestionJob ingestion_job(env_, versions_.get(), cfd,
|
ExternalSstFileIngestionJob ingestion_job(env_, versions_.get(), cfd,
|
||||||
immutable_db_options_, env_options_,
|
immutable_db_options_, env_options_,
|
||||||
&snapshots_, ingestion_options);
|
&snapshots_, ingestion_options);
|
||||||
|
@ -284,10 +284,14 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
|
|||||||
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
|
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
|
||||||
cfd->NumberLevels() > 1) {
|
cfd->NumberLevels() > 1) {
|
||||||
// Always compact all files together.
|
// Always compact all files together.
|
||||||
s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
|
|
||||||
cfd->NumberLevels() - 1, options.target_path_id,
|
|
||||||
begin, end, exclusive);
|
|
||||||
final_output_level = cfd->NumberLevels() - 1;
|
final_output_level = cfd->NumberLevels() - 1;
|
||||||
|
// if bottom most level is reserved
|
||||||
|
if (immutable_db_options_.allow_ingest_behind) {
|
||||||
|
final_output_level--;
|
||||||
|
}
|
||||||
|
s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
|
||||||
|
final_output_level, options.target_path_id,
|
||||||
|
begin, end, exclusive);
|
||||||
} else {
|
} else {
|
||||||
for (int level = 0; level <= max_level_with_files; level++) {
|
for (int level = 0; level <= max_level_with_files; level++) {
|
||||||
int output_level;
|
int output_level;
|
||||||
|
@ -171,9 +171,13 @@ Status ExternalSstFileIngestionJob::Run() {
|
|||||||
|
|
||||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||||
SequenceNumber assigned_seqno = 0;
|
SequenceNumber assigned_seqno = 0;
|
||||||
status = AssignLevelAndSeqnoForIngestedFile(
|
if (ingestion_options_.ingest_behind) {
|
||||||
super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
|
status = CheckLevelForIngestedBehindFile(&f);
|
||||||
&f, &assigned_seqno);
|
} else {
|
||||||
|
status = AssignLevelAndSeqnoForIngestedFile(
|
||||||
|
super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
|
||||||
|
&f, &assigned_seqno);
|
||||||
|
}
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
@ -408,9 +412,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
|
|||||||
Arena arena;
|
Arena arena;
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
ro.total_order_seek = true;
|
ro.total_order_seek = true;
|
||||||
|
|
||||||
int target_level = 0;
|
int target_level = 0;
|
||||||
auto* vstorage = cfd_->current()->storage_info();
|
auto* vstorage = cfd_->current()->storage_info();
|
||||||
|
|
||||||
for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
|
for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
|
||||||
if (lvl > 0 && lvl < vstorage->base_level()) {
|
if (lvl > 0 && lvl < vstorage->base_level()) {
|
||||||
continue;
|
continue;
|
||||||
@ -418,20 +422,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
|
|||||||
|
|
||||||
if (vstorage->NumLevelFiles(lvl) > 0) {
|
if (vstorage->NumLevelFiles(lvl) > 0) {
|
||||||
bool overlap_with_level = false;
|
bool overlap_with_level = false;
|
||||||
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(),
|
status = IngestedFileOverlapWithLevel(sv, file_to_ingest, lvl,
|
||||||
&arena);
|
&overlap_with_level);
|
||||||
RangeDelAggregator range_del_agg(cfd_->internal_comparator(),
|
|
||||||
{} /* snapshots */);
|
|
||||||
sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder,
|
|
||||||
lvl, &range_del_agg);
|
|
||||||
if (!range_del_agg.IsEmpty()) {
|
|
||||||
return Status::NotSupported(
|
|
||||||
"file ingestion with range tombstones is currently unsupported");
|
|
||||||
}
|
|
||||||
ScopedArenaIterator level_iter(merge_iter_builder.Finish());
|
|
||||||
|
|
||||||
status = IngestedFileOverlapWithIteratorRange(
|
|
||||||
file_to_ingest, level_iter.get(), &overlap_with_level);
|
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
@ -478,6 +470,33 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
|
||||||
|
IngestedFileInfo* file_to_ingest) {
|
||||||
|
auto* vstorage = cfd_->current()->storage_info();
|
||||||
|
// first check if new files fit in the bottommost level
|
||||||
|
int bottom_lvl = cfd_->NumberLevels() - 1;
|
||||||
|
if(!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
|
||||||
|
return Status::InvalidArgument(
|
||||||
|
"Can't ingest_behind file as it doesn't fit "
|
||||||
|
"at the bottommost level!");
|
||||||
|
}
|
||||||
|
|
||||||
|
// second check if despite allow_ingest_behind=true we still have 0 seqnums
|
||||||
|
// at some upper level
|
||||||
|
for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
|
||||||
|
for (auto file : vstorage->LevelFiles(lvl)) {
|
||||||
|
if (file->smallest_seqno == 0) {
|
||||||
|
return Status::InvalidArgument(
|
||||||
|
"Can't ingest_behind file as despite allow_ingest_behind=true "
|
||||||
|
"there are files with 0 seqno in database at upper levels!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
file_to_ingest->picked_level = bottom_lvl;
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
|
Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
|
||||||
IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
|
IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
|
||||||
if (file_to_ingest->original_seqno == seqno) {
|
if (file_to_ingest->original_seqno == seqno) {
|
||||||
@ -564,6 +583,27 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status ExternalSstFileIngestionJob::IngestedFileOverlapWithLevel(
|
||||||
|
SuperVersion* sv, IngestedFileInfo* file_to_ingest, int lvl,
|
||||||
|
bool* overlap_with_level) {
|
||||||
|
Arena arena;
|
||||||
|
ReadOptions ro;
|
||||||
|
ro.total_order_seek = true;
|
||||||
|
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(),
|
||||||
|
&arena);
|
||||||
|
RangeDelAggregator range_del_agg(cfd_->internal_comparator(),
|
||||||
|
{} /* snapshots */);
|
||||||
|
sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder,
|
||||||
|
lvl, &range_del_agg);
|
||||||
|
if (!range_del_agg.IsEmpty()) {
|
||||||
|
return Status::NotSupported(
|
||||||
|
"file ingestion with range tombstones is currently unsupported");
|
||||||
|
}
|
||||||
|
ScopedArenaIterator level_iter(merge_iter_builder.Finish());
|
||||||
|
return IngestedFileOverlapWithIteratorRange(
|
||||||
|
file_to_ingest, level_iter.get(), overlap_with_level);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
#endif // !ROCKSDB_LITE
|
#endif // !ROCKSDB_LITE
|
||||||
|
@ -125,6 +125,12 @@ class ExternalSstFileIngestionJob {
|
|||||||
IngestedFileInfo* file_to_ingest,
|
IngestedFileInfo* file_to_ingest,
|
||||||
SequenceNumber* assigned_seqno);
|
SequenceNumber* assigned_seqno);
|
||||||
|
|
||||||
|
// File that we want to ingest behind always goes to the lowest level;
|
||||||
|
// we just check that it fits in the level, that DB allows ingest_behind,
|
||||||
|
// and that we don't have 0 seqnums at the upper levels.
|
||||||
|
// REQUIRES: Mutex held
|
||||||
|
Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
|
||||||
|
|
||||||
// Set the file global sequence number to `seqno`
|
// Set the file global sequence number to `seqno`
|
||||||
Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
|
Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
|
||||||
SequenceNumber seqno);
|
SequenceNumber seqno);
|
||||||
@ -135,6 +141,11 @@ class ExternalSstFileIngestionJob {
|
|||||||
const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
|
const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
|
||||||
bool* overlap);
|
bool* overlap);
|
||||||
|
|
||||||
|
// Check if `file_to_ingest` key range overlap with level
|
||||||
|
// REQUIRES: Mutex held
|
||||||
|
Status IngestedFileOverlapWithLevel(SuperVersion* sv,
|
||||||
|
IngestedFileInfo* file_to_ingest, int lvl, bool* overlap_with_level);
|
||||||
|
|
||||||
// Check if `file_to_ingest` can fit in level `level`
|
// Check if `file_to_ingest` can fit in level `level`
|
||||||
// REQUIRES: Mutex held
|
// REQUIRES: Mutex held
|
||||||
bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
|
bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
|
||||||
|
@ -90,6 +90,68 @@ class ExternalSSTFileTest : public DBTestBase {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status GenerateAndAddExternalFileIngestBehind(
|
||||||
|
const Options options, const IngestExternalFileOptions ifo,
|
||||||
|
std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
|
||||||
|
bool sort_data = false,
|
||||||
|
std::map<std::string, std::string>* true_data = nullptr,
|
||||||
|
ColumnFamilyHandle* cfh = nullptr) {
|
||||||
|
// Generate a file id if not provided
|
||||||
|
if (file_id == -1) {
|
||||||
|
file_id = last_file_id_ + 1;
|
||||||
|
last_file_id_++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort data if asked to do so
|
||||||
|
if (sort_data) {
|
||||||
|
std::sort(data.begin(), data.end(),
|
||||||
|
[&](const std::pair<std::string, std::string>& e1,
|
||||||
|
const std::pair<std::string, std::string>& e2) {
|
||||||
|
return options.comparator->Compare(e1.first, e2.first) < 0;
|
||||||
|
});
|
||||||
|
auto uniq_iter = std::unique(
|
||||||
|
data.begin(), data.end(),
|
||||||
|
[&](const std::pair<std::string, std::string>& e1,
|
||||||
|
const std::pair<std::string, std::string>& e2) {
|
||||||
|
return options.comparator->Compare(e1.first, e2.first) == 0;
|
||||||
|
});
|
||||||
|
data.resize(uniq_iter - data.begin());
|
||||||
|
}
|
||||||
|
std::string file_path = sst_files_dir_ + ToString(file_id);
|
||||||
|
SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
|
||||||
|
|
||||||
|
Status s = sst_file_writer.Open(file_path);
|
||||||
|
if (!s.ok()) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
for (auto& entry : data) {
|
||||||
|
s = sst_file_writer.Add(entry.first, entry.second);
|
||||||
|
if (!s.ok()) {
|
||||||
|
sst_file_writer.Finish();
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s = sst_file_writer.Finish();
|
||||||
|
|
||||||
|
if (s.ok()) {
|
||||||
|
if (cfh) {
|
||||||
|
s = db_->IngestExternalFile(cfh, {file_path}, ifo);
|
||||||
|
} else {
|
||||||
|
s = db_->IngestExternalFile({file_path}, ifo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s.ok() && true_data) {
|
||||||
|
for (auto& entry : data) {
|
||||||
|
(*true_data)[entry.first] = entry.second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Status GenerateAndAddExternalFile(
|
Status GenerateAndAddExternalFile(
|
||||||
const Options options, std::vector<std::pair<int, std::string>> data,
|
const Options options, std::vector<std::pair<int, std::string>> data,
|
||||||
int file_id = -1, bool allow_global_seqno = false, bool sort_data = false,
|
int file_id = -1, bool allow_global_seqno = false, bool sort_data = false,
|
||||||
@ -1817,6 +1879,69 @@ TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
|
|||||||
|
|
||||||
db_->ReleaseSnapshot(snap);
|
db_->ReleaseSnapshot(snap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(ExternalSSTFileTest, IngestBehind) {
|
||||||
|
Options options = CurrentOptions();
|
||||||
|
options.compaction_style = kCompactionStyleUniversal;
|
||||||
|
options.num_levels = 3;
|
||||||
|
options.disable_auto_compactions = false;
|
||||||
|
DestroyAndReopen(options);
|
||||||
|
std::vector<std::pair<std::string, std::string>> file_data;
|
||||||
|
std::map<std::string, std::string> true_data;
|
||||||
|
|
||||||
|
// Insert 100 -> 200 into the memtable
|
||||||
|
for (int i = 100; i <= 200; i++) {
|
||||||
|
ASSERT_OK(Put(Key(i), "memtable"));
|
||||||
|
true_data[Key(i)] = "memtable";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert 100 -> 200 using IngestExternalFile
|
||||||
|
file_data.clear();
|
||||||
|
for (int i = 0; i <= 20; i++) {
|
||||||
|
file_data.emplace_back(Key(i), "ingest_behind");
|
||||||
|
}
|
||||||
|
|
||||||
|
IngestExternalFileOptions ifo;
|
||||||
|
ifo.allow_global_seqno = true;
|
||||||
|
ifo.ingest_behind = true;
|
||||||
|
|
||||||
|
// Can't ingest behind since allow_ingest_behind isn't set to true
|
||||||
|
ASSERT_NOK(GenerateAndAddExternalFileIngestBehind(options, ifo,
|
||||||
|
file_data, -1, false,
|
||||||
|
&true_data));
|
||||||
|
|
||||||
|
options.allow_ingest_behind = true;
|
||||||
|
// check that we still can open the DB, as num_levels should be
|
||||||
|
// sanitized to 3
|
||||||
|
options.num_levels = 2;
|
||||||
|
DestroyAndReopen(options);
|
||||||
|
|
||||||
|
options.num_levels = 3;
|
||||||
|
DestroyAndReopen(options);
|
||||||
|
// Insert 100 -> 200 into the memtable
|
||||||
|
for (int i = 100; i <= 200; i++) {
|
||||||
|
ASSERT_OK(Put(Key(i), "memtable"));
|
||||||
|
true_data[Key(i)] = "memtable";
|
||||||
|
}
|
||||||
|
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
||||||
|
// Universal picker should go at second from the bottom level
|
||||||
|
ASSERT_EQ("0,1", FilesPerLevel());
|
||||||
|
ASSERT_OK(GenerateAndAddExternalFileIngestBehind(options, ifo,
|
||||||
|
file_data, -1, false,
|
||||||
|
&true_data));
|
||||||
|
ASSERT_EQ("0,1,1", FilesPerLevel());
|
||||||
|
// this time ingest should fail as the file doesn't fit to the bottom level
|
||||||
|
ASSERT_NOK(GenerateAndAddExternalFileIngestBehind(options, ifo,
|
||||||
|
file_data, -1, false,
|
||||||
|
&true_data));
|
||||||
|
ASSERT_EQ("0,1,1", FilesPerLevel());
|
||||||
|
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
||||||
|
// bottom level should be empty
|
||||||
|
ASSERT_EQ("0,1", FilesPerLevel());
|
||||||
|
|
||||||
|
size_t kcnt = 0;
|
||||||
|
VerifyDBFromMap(true_data, &kcnt, false);
|
||||||
|
}
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
@ -944,14 +944,22 @@ class DB {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// IngestExternalFile() will load a list of external SST files (1) into the DB
|
// IngestExternalFile() will load a list of external SST files (1) into the DB
|
||||||
// We will try to find the lowest possible level that the file can fit in, and
|
// Two primary modes are supported:
|
||||||
// ingest the file into this level (2). A file that have a key range that
|
// - Duplicate keys in the new files will overwrite exiting keys (default)
|
||||||
// overlap with the memtable key range will require us to Flush the memtable
|
// - Duplicate keys will be skipped (set ingest_behind=true)
|
||||||
// first before ingesting the file.
|
// In the first mode we will try to find the lowest possible level that
|
||||||
|
// the file can fit in, and ingest the file into this level (2). A file that
|
||||||
|
// have a key range that overlap with the memtable key range will require us
|
||||||
|
// to Flush the memtable first before ingesting the file.
|
||||||
|
// In the second mode we will always ingest in the bottom mode level (see
|
||||||
|
// docs to IngestExternalFileOptions::ingest_behind).
|
||||||
//
|
//
|
||||||
// (1) External SST files can be created using SstFileWriter
|
// (1) External SST files can be created using SstFileWriter
|
||||||
// (2) We will try to ingest the files to the lowest possible level
|
// (2) We will try to ingest the files to the lowest possible level
|
||||||
// even if the file compression dont match the level compression
|
// even if the file compression dont match the level compression
|
||||||
|
// (3) If IngestExternalFileOptions->ingest_behind is set to true,
|
||||||
|
// we always ingest at the bottommost level, which should be reserved
|
||||||
|
// for this purpose (see DBOPtions::allow_ingest_behind flag).
|
||||||
virtual Status IngestExternalFile(
|
virtual Status IngestExternalFile(
|
||||||
ColumnFamilyHandle* column_family,
|
ColumnFamilyHandle* column_family,
|
||||||
const std::vector<std::string>& external_files,
|
const std::vector<std::string>& external_files,
|
||||||
|
@ -847,6 +847,18 @@ struct DBOptions {
|
|||||||
//
|
//
|
||||||
// Dynamically changeable through SetDBOptions() API.
|
// Dynamically changeable through SetDBOptions() API.
|
||||||
bool avoid_flush_during_shutdown = false;
|
bool avoid_flush_during_shutdown = false;
|
||||||
|
|
||||||
|
// Set this option to true during creation of database if you want
|
||||||
|
// to be able to ingest behind (call IngestExternalFile() skipping keys
|
||||||
|
// that already exist, rather than overwriting matching keys).
|
||||||
|
// Setting this option to true will affect 2 things:
|
||||||
|
// 1) Disable some internal optimizations around SST file compression
|
||||||
|
// 2) Reserve bottom-most level for ingested files only.
|
||||||
|
// 3) Note that num_levels should be >= 3 if this option is turned on.
|
||||||
|
//
|
||||||
|
// DEFAULT: false
|
||||||
|
// Immutable.
|
||||||
|
bool allow_ingest_behind = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Options to control the behavior of a database (passed to DB::Open)
|
// Options to control the behavior of a database (passed to DB::Open)
|
||||||
@ -1126,6 +1138,14 @@ struct IngestExternalFileOptions {
|
|||||||
// If set to false and the file key range overlaps with the memtable key range
|
// If set to false and the file key range overlaps with the memtable key range
|
||||||
// (memtable flush required), IngestExternalFile will fail.
|
// (memtable flush required), IngestExternalFile will fail.
|
||||||
bool allow_blocking_flush = true;
|
bool allow_blocking_flush = true;
|
||||||
|
// Set to true if you would like duplicate keys in the file being ingested
|
||||||
|
// to be skipped rather than overwriting existing data under that key.
|
||||||
|
// Usecase: back-fill of some historical data in the database without
|
||||||
|
// over-writing existing newer version of data.
|
||||||
|
// This option could only be used if the DB has been running
|
||||||
|
// with allow_ingest_behind=true since the dawn of time.
|
||||||
|
// All files will be ingested at the bottommost level with seqno=0.
|
||||||
|
bool ingest_behind = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
@ -71,6 +71,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options,
|
|||||||
num_levels(cf_options.num_levels),
|
num_levels(cf_options.num_levels),
|
||||||
optimize_filters_for_hits(cf_options.optimize_filters_for_hits),
|
optimize_filters_for_hits(cf_options.optimize_filters_for_hits),
|
||||||
force_consistency_checks(cf_options.force_consistency_checks),
|
force_consistency_checks(cf_options.force_consistency_checks),
|
||||||
|
allow_ingest_behind(db_options.allow_ingest_behind),
|
||||||
listeners(db_options.listeners),
|
listeners(db_options.listeners),
|
||||||
row_cache(db_options.row_cache),
|
row_cache(db_options.row_cache),
|
||||||
max_subcompactions(db_options.max_subcompactions),
|
max_subcompactions(db_options.max_subcompactions),
|
||||||
|
@ -108,6 +108,8 @@ struct ImmutableCFOptions {
|
|||||||
|
|
||||||
bool force_consistency_checks;
|
bool force_consistency_checks;
|
||||||
|
|
||||||
|
bool allow_ingest_behind;
|
||||||
|
|
||||||
// A vector of EventListeners which call-back functions will be called
|
// A vector of EventListeners which call-back functions will be called
|
||||||
// when specific RocksDB event happens.
|
// when specific RocksDB event happens.
|
||||||
std::vector<std::shared_ptr<EventListener>> listeners;
|
std::vector<std::shared_ptr<EventListener>> listeners;
|
||||||
|
@ -84,7 +84,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
|
|||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
fail_if_options_file_error(options.fail_if_options_file_error),
|
fail_if_options_file_error(options.fail_if_options_file_error),
|
||||||
dump_malloc_stats(options.dump_malloc_stats),
|
dump_malloc_stats(options.dump_malloc_stats),
|
||||||
avoid_flush_during_recovery(options.avoid_flush_during_recovery) {
|
avoid_flush_during_recovery(options.avoid_flush_during_recovery),
|
||||||
|
allow_ingest_behind(options.allow_ingest_behind) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ImmutableDBOptions::Dump(Logger* log) const {
|
void ImmutableDBOptions::Dump(Logger* log) const {
|
||||||
@ -210,8 +211,11 @@ void ImmutableDBOptions::Dump(Logger* log) const {
|
|||||||
ROCKS_LOG_HEADER(log, " Options.wal_filter: %s",
|
ROCKS_LOG_HEADER(log, " Options.wal_filter: %s",
|
||||||
wal_filter ? wal_filter->Name() : "None");
|
wal_filter ? wal_filter->Name() : "None");
|
||||||
#endif // ROCKDB_LITE
|
#endif // ROCKDB_LITE
|
||||||
|
|
||||||
ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_recovery: %d",
|
ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_recovery: %d",
|
||||||
avoid_flush_during_recovery);
|
avoid_flush_during_recovery);
|
||||||
|
ROCKS_LOG_HEADER(log, " Options.allow_ingest_behind: %d",
|
||||||
|
allow_ingest_behind);
|
||||||
}
|
}
|
||||||
|
|
||||||
MutableDBOptions::MutableDBOptions()
|
MutableDBOptions::MutableDBOptions()
|
||||||
|
@ -77,6 +77,7 @@ struct ImmutableDBOptions {
|
|||||||
bool fail_if_options_file_error;
|
bool fail_if_options_file_error;
|
||||||
bool dump_malloc_stats;
|
bool dump_malloc_stats;
|
||||||
bool avoid_flush_during_recovery;
|
bool avoid_flush_during_recovery;
|
||||||
|
bool allow_ingest_behind;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct MutableDBOptions {
|
struct MutableDBOptions {
|
||||||
|
@ -190,7 +190,8 @@ DBOptions::DBOptions(const Options& options)
|
|||||||
fail_if_options_file_error(options.fail_if_options_file_error),
|
fail_if_options_file_error(options.fail_if_options_file_error),
|
||||||
dump_malloc_stats(options.dump_malloc_stats),
|
dump_malloc_stats(options.dump_malloc_stats),
|
||||||
avoid_flush_during_recovery(options.avoid_flush_during_recovery),
|
avoid_flush_during_recovery(options.avoid_flush_during_recovery),
|
||||||
avoid_flush_during_shutdown(options.avoid_flush_during_shutdown) {
|
avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
|
||||||
|
allow_ingest_behind(options.allow_ingest_behind) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void DBOptions::Dump(Logger* log) const {
|
void DBOptions::Dump(Logger* log) const {
|
||||||
|
@ -119,6 +119,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
|
|||||||
immutable_db_options.avoid_flush_during_recovery;
|
immutable_db_options.avoid_flush_during_recovery;
|
||||||
options.avoid_flush_during_shutdown =
|
options.avoid_flush_during_shutdown =
|
||||||
mutable_db_options.avoid_flush_during_shutdown;
|
mutable_db_options.avoid_flush_during_shutdown;
|
||||||
|
options.allow_ingest_behind =
|
||||||
|
immutable_db_options.allow_ingest_behind;
|
||||||
|
|
||||||
return options;
|
return options;
|
||||||
}
|
}
|
||||||
|
@ -335,7 +335,11 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
|
|||||||
{"avoid_flush_during_shutdown",
|
{"avoid_flush_during_shutdown",
|
||||||
{offsetof(struct DBOptions, avoid_flush_during_shutdown),
|
{offsetof(struct DBOptions, avoid_flush_during_shutdown),
|
||||||
OptionType::kBoolean, OptionVerificationType::kNormal, true,
|
OptionType::kBoolean, OptionVerificationType::kNormal, true,
|
||||||
offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}};
|
offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}},
|
||||||
|
{"allow_ingest_behind",
|
||||||
|
{offsetof(struct DBOptions, allow_ingest_behind),
|
||||||
|
OptionType::kBoolean, OptionVerificationType::kNormal, false,
|
||||||
|
offsetof(struct ImmutableDBOptions, allow_ingest_behind)}}};
|
||||||
|
|
||||||
// offset_of is used to get the offset of a class data member
|
// offset_of is used to get the offset of a class data member
|
||||||
// ex: offset_of(&ColumnFamilyOptions::num_levels)
|
// ex: offset_of(&ColumnFamilyOptions::num_levels)
|
||||||
|
@ -291,7 +291,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
|||||||
"dump_malloc_stats=false;"
|
"dump_malloc_stats=false;"
|
||||||
"allow_2pc=false;"
|
"allow_2pc=false;"
|
||||||
"avoid_flush_during_recovery=false;"
|
"avoid_flush_during_recovery=false;"
|
||||||
"avoid_flush_during_shutdown=false;",
|
"avoid_flush_during_shutdown=false;"
|
||||||
|
"allow_ingest_behind=false;",
|
||||||
new_options));
|
new_options));
|
||||||
|
|
||||||
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
|
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user