Fix a bug in looking up duplicate keys with MultiGet (#6953)

Summary:
When MultiGet is called with duplicate keys, and the key matches the
largest key in an SST file and the value type is merge, only the first
instance of the duplicate key is returned with correct results. This is
due to the incorrect assumption that if a key in a batch is equal to the
largest key in the file, the next key cannot be present in that file.

Tests:
Add a new unit test
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6953

Reviewed By: cheng-chang

Differential Revision: D21935898

Pulled By: anand1976

fbshipit-source-id: a2cc327a15150e23fd997546ca64d1c33021cb4c
This commit is contained in:
anand76 2020-06-08 16:08:31 -07:00 committed by Facebook GitHub Bot
parent f5e649453c
commit 1fb3593f25
3 changed files with 86 additions and 11 deletions

View File

@ -14,6 +14,7 @@
* Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile(). * Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile().
* Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch). * Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch).
* Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks. * Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks.
* Fix incorrect results from batched MultiGet for duplicate keys, when the duplicate key matches the largest key of an SST file and the value type for the key in the file is a merge value.
### Public API Change ### Public API Change
* Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request. * Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request.

View File

@ -9,6 +9,7 @@
#include "db/db_test_util.h" #include "db/db_test_util.h"
#include "port/stack_trace.h" #include "port/stack_trace.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/perf_context.h" #include "rocksdb/perf_context.h"
#include "rocksdb/utilities/debug.h" #include "rocksdb/utilities/debug.h"
#include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_based_table_reader.h"
@ -17,6 +18,8 @@
#if !defined(ROCKSDB_LITE) #if !defined(ROCKSDB_LITE)
#include "test_util/sync_point.h" #include "test_util/sync_point.h"
#endif #endif
#include "utilities/merge_operators.h"
#include "utilities/merge_operators/string_append/stringappend.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -1340,6 +1343,57 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) {
Options opts = CurrentOptions();
opts.merge_operator = MergeOperators::CreateStringAppendOperator();
CreateAndReopenWithCF({"pikachu"}, opts);
SetPerfLevel(kEnableCount);
// To expand the power of this test, generate > 1 table file and
// mix with memtable
ASSERT_OK(Merge(1, "k1", "v1"));
ASSERT_OK(Merge(1, "k2", "v2"));
Flush(1);
MoveFilesToLevel(2, 1);
ASSERT_OK(Merge(1, "k3", "v3"));
ASSERT_OK(Merge(1, "k4", "v4"));
Flush(1);
MoveFilesToLevel(2, 1);
ASSERT_OK(Merge(1, "k4", "v4_2"));
ASSERT_OK(Merge(1, "k6", "v6"));
Flush(1);
MoveFilesToLevel(2, 1);
ASSERT_OK(Merge(1, "k7", "v7"));
ASSERT_OK(Merge(1, "k8", "v8"));
Flush(1);
MoveFilesToLevel(2, 1);
get_perf_context()->Reset();
std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"});
std::vector<PinnableSlice> values(keys.size());
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
std::vector<Status> s(keys.size());
db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
values.data(), s.data(), false);
ASSERT_EQ(values.size(), keys.size());
ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8");
ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8");
ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2");
ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2");
ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3");
ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes);
for (Status& status : s) {
ASSERT_OK(status);
}
SetPerfLevel(kDisable);
}
TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.disable_auto_compactions = true; options.disable_auto_compactions = true;

View File

@ -365,6 +365,7 @@ class FilePickerMultiGet {
range_(range), range_(range),
batch_iter_(range->begin()), batch_iter_(range->begin()),
batch_iter_prev_(range->begin()), batch_iter_prev_(range->begin()),
upper_key_(range->begin()),
maybe_repeat_key_(false), maybe_repeat_key_(false),
current_level_range_(*range, range->begin(), range->end()), current_level_range_(*range, range->begin(), range->end()),
current_file_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()),
@ -481,9 +482,19 @@ class FilePickerMultiGet {
} }
if (cmp_largest == 0) { if (cmp_largest == 0) {
// cmp_largest is 0, which means the next key will not be in this // cmp_largest is 0, which means the next key will not be in this
// file, so stop looking further. Also don't increment megt_iter_ // file, so stop looking further. However, its possible there are
// as we may have to look for this key in the next file if we don't // duplicates in the batch, so find the upper bound for the batch
// find it in this one // in this file (upper_key_) by skipping past the duplicates. We
// leave batch_iter_ as is since we may have to pick up from there
// for the next file, if this file has a merge value rather than
// final value
upper_key_ = batch_iter_;
++upper_key_;
while (upper_key_ != current_level_range_.end() &&
user_comparator_->Compare(batch_iter_->ukey, upper_key_->ukey) ==
0) {
++upper_key_;
}
break; break;
} else { } else {
if (curr_level_ == 0) { if (curr_level_ == 0) {
@ -503,6 +514,12 @@ class FilePickerMultiGet {
*fd = f; *fd = f;
*file_index = curr_file_index; *file_index = curr_file_index;
*is_last_key_in_file = cmp_largest == 0; *is_last_key_in_file = cmp_largest == 0;
if (!*is_last_key_in_file) {
// If the largest key in the batch overlapping the file is not the
// largest key in the file, upper_ley_ would not have been updated so
// update it here
upper_key_ = batch_iter_;
}
return file_hit; return file_hit;
} }
@ -524,7 +541,7 @@ class FilePickerMultiGet {
// file regardless for all keys not found yet // file regardless for all keys not found yet
if (current_level_range_.CheckKeyDone(batch_iter_) || if (current_level_range_.CheckKeyDone(batch_iter_) ||
curr_level_ == 0) { curr_level_ == 0) {
++batch_iter_; batch_iter_ = upper_key_;
} }
} }
// batch_iter_prev_ will become the start key for the next file // batch_iter_prev_ will become the start key for the next file
@ -544,18 +561,20 @@ class FilePickerMultiGet {
&is_last_key_in_file)) { &is_last_key_in_file)) {
search_ended_ = !PrepareNextLevel(); search_ended_ = !PrepareNextLevel();
} else { } else {
MultiGetRange::Iterator upper_key = batch_iter_;
if (is_last_key_in_file) { if (is_last_key_in_file) {
// Since cmp_largest is 0, batch_iter_ still points to the last key // Since cmp_largest is 0, batch_iter_ still points to the last key
// that falls in this file, instead of the next one. Increment // that falls in this file, instead of the next one. Increment
// upper_key so we can set the range properly for SST MultiGet // the file index for all keys between batch_iter_ and upper_key_
++upper_key; auto tmp_iter = batch_iter_;
++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level); while (tmp_iter != upper_key_) {
++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
++tmp_iter;
}
maybe_repeat_key_ = true; maybe_repeat_key_ = true;
} }
// Set the range for this file // Set the range for this file
current_file_range_ = current_file_range_ =
MultiGetRange(next_file_range, batch_iter_prev_, upper_key); MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
returned_file_level_ = curr_level_; returned_file_level_ = curr_level_;
hit_file_level_ = curr_level_; hit_file_level_ = curr_level_;
is_hit_file_last_in_level_ = is_hit_file_last_in_level_ =
@ -607,6 +626,7 @@ class FilePickerMultiGet {
// key found in the previous SST file, in order to serve as the start of // key found in the previous SST file, in order to serve as the start of
// the batch key range for the next SST file // the batch key range for the next SST file
MultiGetRange::Iterator batch_iter_prev_; MultiGetRange::Iterator batch_iter_prev_;
MultiGetRange::Iterator upper_key_;
bool maybe_repeat_key_; bool maybe_repeat_key_;
MultiGetRange current_level_range_; MultiGetRange current_level_range_;
MultiGetRange current_file_range_; MultiGetRange current_file_range_;
@ -626,7 +646,7 @@ class FilePickerMultiGet {
if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
curr_file_level_->num_files) { curr_file_level_->num_files) {
batch_iter_prev_ = current_level_range_.begin(); batch_iter_prev_ = current_level_range_.begin();
batch_iter_ = current_level_range_.begin(); upper_key_ = batch_iter_ = current_level_range_.begin();
return true; return true;
} }
} }
@ -721,7 +741,7 @@ class FilePickerMultiGet {
} }
if (level_contains_keys) { if (level_contains_keys) {
batch_iter_prev_ = current_level_range_.begin(); batch_iter_prev_ = current_level_range_.begin();
batch_iter_ = current_level_range_.begin(); upper_key_ = batch_iter_ = current_level_range_.begin();
return true; return true;
} }
curr_level_++; curr_level_++;