Batched MultiGet API for multiple column families (#5816)
Summary: Add a new API that allows a user to call MultiGet specifying multiple keys belonging to different column families. This is mainly useful for users who want to do a consistent read of keys across column families, with the added performance benefits of batching and returning values using PinnableSlice. As part of this change, the code in the original multi-column family MultiGet for acquiring the super versions has been refactored into a separate function that can be used by both, the batching and the non-batching versions of MultiGet. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5816 Test Plan: make check make asan_check asan_crash_test Differential Revision: D18408676 Pulled By: anand1976 fbshipit-source-id: 933e7bec91dd70e7b633be4ff623a1116cc28c8d
This commit is contained in:
parent
a19de78da5
commit
6c7b1a0cc7
@ -9,6 +9,7 @@
|
|||||||
### New Features
|
### New Features
|
||||||
* Universal compaction to support options.periodic_compaction_seconds. A full compaction will be triggered if any file is over the threshold.
|
* Universal compaction to support options.periodic_compaction_seconds. A full compaction will be triggered if any file is over the threshold.
|
||||||
* `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` now expose the file number of SST files as well as the oldest blob file referenced by each SST.
|
* `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` now expose the file number of SST files as well as the oldest blob file referenced by each SST.
|
||||||
|
* A batched MultiGet API (DB::MultiGet()) that supports retrieving keys from multiple column families.
|
||||||
|
|
||||||
### Performance Improvements
|
### Performance Improvements
|
||||||
* For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement.
|
* For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement.
|
||||||
|
@ -1019,15 +1019,27 @@ TEST_F(DBBasicTest, DBCloseFlushError) {
|
|||||||
Destroy(options);
|
Destroy(options);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBBasicTest, MultiGetMultiCF) {
|
class DBMultiGetTestWithParam : public DBBasicTest,
|
||||||
|
public testing::WithParamInterface<bool> {};
|
||||||
|
|
||||||
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
||||||
"alyosha", "popovich"},
|
"alyosha", "popovich"},
|
||||||
options);
|
options);
|
||||||
|
// <CF, key, value> tuples
|
||||||
|
std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
|
||||||
|
static const int num_keys = 24;
|
||||||
|
cf_kv_vec.reserve(num_keys);
|
||||||
|
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < num_keys; ++i) {
|
||||||
ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
|
int cf = i / 3;
|
||||||
"cf" + std::to_string(i) + "_val"));
|
int cf_key = 1 % 3;
|
||||||
|
cf_kv_vec.emplace_back(std::make_tuple(
|
||||||
|
cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
|
||||||
|
"cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
|
||||||
|
ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
|
||||||
|
std::get<2>(cf_kv_vec[i])));
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_sv_count = 0;
|
int get_sv_count = 0;
|
||||||
@ -1037,10 +1049,14 @@ TEST_F(DBBasicTest, MultiGetMultiCF) {
|
|||||||
if (++get_sv_count == 2) {
|
if (++get_sv_count == 2) {
|
||||||
// After MultiGet refs a couple of CFs, flush all CFs so MultiGet
|
// After MultiGet refs a couple of CFs, flush all CFs so MultiGet
|
||||||
// is forced to repeat the process
|
// is forced to repeat the process
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < num_keys; ++i) {
|
||||||
ASSERT_OK(Flush(i));
|
int cf = i / 3;
|
||||||
ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
|
int cf_key = i % 8;
|
||||||
"cf" + std::to_string(i) + "_val2"));
|
if (cf_key == 0) {
|
||||||
|
ASSERT_OK(Flush(cf));
|
||||||
|
}
|
||||||
|
ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
|
||||||
|
std::get<2>(cf_kv_vec[i]) + "_2"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (get_sv_count == 11) {
|
if (get_sv_count == 11) {
|
||||||
@ -1058,26 +1074,53 @@ TEST_F(DBBasicTest, MultiGetMultiCF) {
|
|||||||
std::vector<std::string> keys;
|
std::vector<std::string> keys;
|
||||||
std::vector<std::string> values;
|
std::vector<std::string> values;
|
||||||
|
|
||||||
for (int i = 0; i < 8; ++i) {
|
for (int i = 0; i < num_keys; ++i) {
|
||||||
cfs.push_back(i);
|
cfs.push_back(std::get<0>(cf_kv_vec[i]));
|
||||||
keys.push_back("cf" + std::to_string(i) + "_key");
|
keys.push_back(std::get<1>(cf_kv_vec[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
values = MultiGet(cfs, keys, nullptr);
|
values = MultiGet(cfs, keys, nullptr, GetParam());
|
||||||
ASSERT_EQ(values.size(), 8);
|
ASSERT_EQ(values.size(), num_keys);
|
||||||
for (unsigned int j = 0; j < values.size(); ++j) {
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
||||||
ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val2");
|
ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
|
||||||
}
|
}
|
||||||
for (int i = 0; i < 8; ++i) {
|
|
||||||
|
keys.clear();
|
||||||
|
cfs.clear();
|
||||||
|
cfs.push_back(std::get<0>(cf_kv_vec[0]));
|
||||||
|
keys.push_back(std::get<1>(cf_kv_vec[0]));
|
||||||
|
cfs.push_back(std::get<0>(cf_kv_vec[3]));
|
||||||
|
keys.push_back(std::get<1>(cf_kv_vec[3]));
|
||||||
|
cfs.push_back(std::get<0>(cf_kv_vec[4]));
|
||||||
|
keys.push_back(std::get<1>(cf_kv_vec[4]));
|
||||||
|
values = MultiGet(cfs, keys, nullptr, GetParam());
|
||||||
|
ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
|
||||||
|
ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
|
||||||
|
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
|
||||||
|
|
||||||
|
keys.clear();
|
||||||
|
cfs.clear();
|
||||||
|
cfs.push_back(std::get<0>(cf_kv_vec[7]));
|
||||||
|
keys.push_back(std::get<1>(cf_kv_vec[7]));
|
||||||
|
cfs.push_back(std::get<0>(cf_kv_vec[6]));
|
||||||
|
keys.push_back(std::get<1>(cf_kv_vec[6]));
|
||||||
|
cfs.push_back(std::get<0>(cf_kv_vec[1]));
|
||||||
|
keys.push_back(std::get<1>(cf_kv_vec[1]));
|
||||||
|
values = MultiGet(cfs, keys, nullptr, GetParam());
|
||||||
|
ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
|
||||||
|
ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
|
||||||
|
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
|
||||||
|
|
||||||
|
for (int cf = 0; cf < 8; ++cf) {
|
||||||
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
|
||||||
reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(i))
|
reinterpret_cast<DBImpl*>(db_)->GetColumnFamilyHandle(cf))
|
||||||
->cfd();
|
->cfd();
|
||||||
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
|
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
|
||||||
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
|
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBBasicTest, MultiGetMultiCFMutex) {
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
||||||
"alyosha", "popovich"},
|
"alyosha", "popovich"},
|
||||||
@ -1123,7 +1166,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFMutex) {
|
|||||||
keys.push_back("cf" + std::to_string(i) + "_key");
|
keys.push_back("cf" + std::to_string(i) + "_key");
|
||||||
}
|
}
|
||||||
|
|
||||||
values = MultiGet(cfs, keys, nullptr);
|
values = MultiGet(cfs, keys, nullptr, GetParam());
|
||||||
ASSERT_TRUE(last_try);
|
ASSERT_TRUE(last_try);
|
||||||
ASSERT_EQ(values.size(), 8);
|
ASSERT_EQ(values.size(), 8);
|
||||||
for (unsigned int j = 0; j < values.size(); ++j) {
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
||||||
@ -1138,7 +1181,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFMutex) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) {
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
||||||
"alyosha", "popovich"},
|
"alyosha", "popovich"},
|
||||||
@ -1183,7 +1226,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const Snapshot* snapshot = db_->GetSnapshot();
|
const Snapshot* snapshot = db_->GetSnapshot();
|
||||||
values = MultiGet(cfs, keys, snapshot);
|
values = MultiGet(cfs, keys, snapshot, GetParam());
|
||||||
db_->ReleaseSnapshot(snapshot);
|
db_->ReleaseSnapshot(snapshot);
|
||||||
ASSERT_EQ(values.size(), 8);
|
ASSERT_EQ(values.size(), 8);
|
||||||
for (unsigned int j = 0; j < values.size(); ++j) {
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
||||||
@ -1197,6 +1240,9 @@ TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
|
||||||
|
testing::Bool());
|
||||||
|
|
||||||
TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
|
TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
|
||||||
do {
|
do {
|
||||||
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
||||||
|
@ -1645,14 +1645,9 @@ std::vector<Status> DBImpl::MultiGet(
|
|||||||
StopWatch sw(env_, stats_, DB_MULTIGET);
|
StopWatch sw(env_, stats_, DB_MULTIGET);
|
||||||
PERF_TIMER_GUARD(get_snapshot_time);
|
PERF_TIMER_GUARD(get_snapshot_time);
|
||||||
|
|
||||||
SequenceNumber snapshot;
|
SequenceNumber consistent_seqnum;
|
||||||
|
;
|
||||||
|
|
||||||
struct MultiGetColumnFamilyData {
|
|
||||||
ColumnFamilyData* cfd;
|
|
||||||
SuperVersion* super_version;
|
|
||||||
MultiGetColumnFamilyData(ColumnFamilyData* cf, SuperVersion* sv)
|
|
||||||
: cfd(cf), super_version(sv) {}
|
|
||||||
};
|
|
||||||
std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
|
std::unordered_map<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
|
||||||
column_family.size());
|
column_family.size());
|
||||||
for (auto cf : column_family) {
|
for (auto cf : column_family) {
|
||||||
@ -1660,86 +1655,20 @@ std::vector<Status> DBImpl::MultiGet(
|
|||||||
auto cfd = cfh->cfd();
|
auto cfd = cfh->cfd();
|
||||||
if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
|
if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
|
||||||
multiget_cf_data.emplace(cfd->GetID(),
|
multiget_cf_data.emplace(cfd->GetID(),
|
||||||
MultiGetColumnFamilyData(cfd, nullptr));
|
MultiGetColumnFamilyData(cfh, nullptr));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool last_try = false;
|
std::function<MultiGetColumnFamilyData*(
|
||||||
{
|
std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&)>
|
||||||
// If we end up with the same issue of memtable geting sealed during 2
|
iter_deref_lambda =
|
||||||
// consecutive retries, it means the write rate is very high. In that case
|
[](std::unordered_map<uint32_t, MultiGetColumnFamilyData>::iterator&
|
||||||
// its probably ok to take the mutex on the 3rd try so we can succeed for
|
cf_iter) { return &cf_iter->second; };
|
||||||
// sure
|
|
||||||
static const int num_retries = 3;
|
|
||||||
for (auto i = 0; i < num_retries; ++i) {
|
|
||||||
last_try = (i == num_retries - 1);
|
|
||||||
bool retry = false;
|
|
||||||
|
|
||||||
if (i > 0) {
|
bool unref_only =
|
||||||
for (auto mgd_iter = multiget_cf_data.begin();
|
MultiCFSnapshot<std::unordered_map<uint32_t, MultiGetColumnFamilyData>>(
|
||||||
mgd_iter != multiget_cf_data.end(); ++mgd_iter) {
|
read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
|
||||||
auto super_version = mgd_iter->second.super_version;
|
&consistent_seqnum);
|
||||||
auto cfd = mgd_iter->second.cfd;
|
|
||||||
if (super_version != nullptr) {
|
|
||||||
ReturnAndCleanupSuperVersion(cfd, super_version);
|
|
||||||
}
|
|
||||||
mgd_iter->second.super_version = nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (read_options.snapshot == nullptr) {
|
|
||||||
if (last_try) {
|
|
||||||
TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
|
|
||||||
// We're close to max number of retries. For the last retry,
|
|
||||||
// acquire the lock so we're sure to succeed
|
|
||||||
mutex_.Lock();
|
|
||||||
}
|
|
||||||
snapshot = last_seq_same_as_publish_seq_
|
|
||||||
? versions_->LastSequence()
|
|
||||||
: versions_->LastPublishedSequence();
|
|
||||||
} else {
|
|
||||||
snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
|
|
||||||
->number_;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto mgd_iter = multiget_cf_data.begin();
|
|
||||||
mgd_iter != multiget_cf_data.end(); ++mgd_iter) {
|
|
||||||
if (!last_try) {
|
|
||||||
mgd_iter->second.super_version =
|
|
||||||
GetAndRefSuperVersion(mgd_iter->second.cfd);
|
|
||||||
} else {
|
|
||||||
mgd_iter->second.super_version =
|
|
||||||
mgd_iter->second.cfd->GetSuperVersion()->Ref();
|
|
||||||
}
|
|
||||||
TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
|
|
||||||
if (read_options.snapshot != nullptr || last_try) {
|
|
||||||
// If user passed a snapshot, then we don't care if a memtable is
|
|
||||||
// sealed or compaction happens because the snapshot would ensure
|
|
||||||
// that older key versions are kept around. If this is the last
|
|
||||||
// retry, then we have the lock so nothing bad can happen
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// We could get the earliest sequence number for the whole list of
|
|
||||||
// memtables, which will include immutable memtables as well, but that
|
|
||||||
// might be tricky to maintain in case we decide, in future, to do
|
|
||||||
// memtable compaction.
|
|
||||||
if (!last_try) {
|
|
||||||
auto seq =
|
|
||||||
mgd_iter->second.super_version->mem->GetEarliestSequenceNumber();
|
|
||||||
if (seq > snapshot) {
|
|
||||||
retry = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!retry) {
|
|
||||||
if (last_try) {
|
|
||||||
mutex_.Unlock();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Contain a list of merge operations if merge occurs.
|
// Contain a list of merge operations if merge occurs.
|
||||||
MergeContext merge_context;
|
MergeContext merge_context;
|
||||||
@ -1763,7 +1692,7 @@ std::vector<Status> DBImpl::MultiGet(
|
|||||||
Status& s = stat_list[i];
|
Status& s = stat_list[i];
|
||||||
std::string* value = &(*values)[i];
|
std::string* value = &(*values)[i];
|
||||||
|
|
||||||
LookupKey lkey(keys[i], snapshot);
|
LookupKey lkey(keys[i], consistent_seqnum);
|
||||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
|
||||||
SequenceNumber max_covering_tombstone_seq = 0;
|
SequenceNumber max_covering_tombstone_seq = 0;
|
||||||
auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
|
auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
|
||||||
@ -1807,7 +1736,7 @@ std::vector<Status> DBImpl::MultiGet(
|
|||||||
|
|
||||||
for (auto mgd_iter : multiget_cf_data) {
|
for (auto mgd_iter : multiget_cf_data) {
|
||||||
auto mgd = mgd_iter.second;
|
auto mgd = mgd_iter.second;
|
||||||
if (!last_try) {
|
if (!unref_only) {
|
||||||
ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
|
ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
|
||||||
} else {
|
} else {
|
||||||
mgd.cfd->GetSuperVersion()->Unref();
|
mgd.cfd->GetSuperVersion()->Unref();
|
||||||
@ -1824,78 +1753,21 @@ std::vector<Status> DBImpl::MultiGet(
|
|||||||
return stat_list;
|
return stat_list;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Order keys by CF ID, followed by key contents
|
template <class T>
|
||||||
struct CompareKeyContext {
|
bool DBImpl::MultiCFSnapshot(
|
||||||
inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
|
const ReadOptions& read_options, ReadCallback* callback,
|
||||||
const Comparator* comparator = cfd->user_comparator();
|
std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
|
||||||
int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
|
iter_deref_func,
|
||||||
if (cmp < 0) {
|
T* cf_list, SequenceNumber* snapshot) {
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const ColumnFamilyData* cfd;
|
|
||||||
};
|
|
||||||
|
|
||||||
void DBImpl::MultiGet(const ReadOptions& read_options,
|
|
||||||
ColumnFamilyHandle* column_family, const size_t num_keys,
|
|
||||||
const Slice* keys, PinnableSlice* values,
|
|
||||||
Status* statuses, const bool sorted_input) {
|
|
||||||
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
|
||||||
for (size_t i = 0; i < num_keys; ++i) {
|
|
||||||
key_context.emplace_back(keys[i], &values[i], &statuses[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
MultiGetImpl(read_options, column_family, key_context, sorted_input, nullptr,
|
|
||||||
nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void DBImpl::MultiGetImpl(
|
|
||||||
const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
|
||||||
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE>& key_context,
|
|
||||||
bool sorted_input, ReadCallback* callback, bool* is_blob_index) {
|
|
||||||
PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
|
|
||||||
StopWatch sw(env_, stats_, DB_MULTIGET);
|
|
||||||
size_t num_keys = key_context.size();
|
|
||||||
|
|
||||||
PERF_TIMER_GUARD(get_snapshot_time);
|
PERF_TIMER_GUARD(get_snapshot_time);
|
||||||
|
|
||||||
ColumnFamilyHandleImpl* cfh =
|
bool last_try = false;
|
||||||
reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
if (cf_list->size() == 1) {
|
||||||
ColumnFamilyData* cfd = cfh->cfd();
|
// Fast path for a single column family. We can simply get the thread loca
|
||||||
|
// super version
|
||||||
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
|
auto cf_iter = cf_list->begin();
|
||||||
sorted_keys.resize(num_keys);
|
auto node = iter_deref_func(cf_iter);
|
||||||
{
|
node->super_version = GetAndRefSuperVersion(node->cfd);
|
||||||
size_t index = 0;
|
|
||||||
for (KeyContext& key : key_context) {
|
|
||||||
#ifndef NDEBUG
|
|
||||||
if (index > 0 && sorted_input) {
|
|
||||||
KeyContext* lhs = &key_context[index-1];
|
|
||||||
KeyContext* rhs = &key_context[index];
|
|
||||||
const Comparator* comparator = cfd->user_comparator();
|
|
||||||
int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
|
|
||||||
assert(cmp <= 0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sorted_keys[index] = &key;
|
|
||||||
index++;
|
|
||||||
}
|
|
||||||
if (!sorted_input) {
|
|
||||||
CompareKeyContext sort_comparator;
|
|
||||||
sort_comparator.cfd = cfd;
|
|
||||||
std::sort(sorted_keys.begin(), sorted_keys.begin() + index,
|
|
||||||
sort_comparator);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Keep track of bytes that we read for statistics-recording later
|
|
||||||
PERF_TIMER_STOP(get_snapshot_time);
|
|
||||||
|
|
||||||
// Acquire SuperVersion
|
|
||||||
SuperVersion* super_version = GetAndRefSuperVersion(cfd);
|
|
||||||
SequenceNumber snapshot;
|
|
||||||
if (read_options.snapshot != nullptr) {
|
if (read_options.snapshot != nullptr) {
|
||||||
// Note: In WritePrepared txns this is not necessary but not harmful
|
// Note: In WritePrepared txns this is not necessary but not harmful
|
||||||
// either. Because prep_seq > snapshot => commit_seq > snapshot so if
|
// either. Because prep_seq > snapshot => commit_seq > snapshot so if
|
||||||
@ -1905,10 +1777,10 @@ void DBImpl::MultiGetImpl(
|
|||||||
// In WriteUnprepared, we cannot set snapshot in the lookup key because we
|
// In WriteUnprepared, we cannot set snapshot in the lookup key because we
|
||||||
// may skip uncommitted data that should be visible to the transaction for
|
// may skip uncommitted data that should be visible to the transaction for
|
||||||
// reading own writes.
|
// reading own writes.
|
||||||
snapshot =
|
*snapshot =
|
||||||
reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
|
static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
|
||||||
if (callback) {
|
if (callback) {
|
||||||
snapshot = std::max(snapshot, callback->max_visible_seq());
|
*snapshot = std::max(*snapshot, callback->max_visible_seq());
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Since we get and reference the super version before getting
|
// Since we get and reference the super version before getting
|
||||||
@ -1921,15 +1793,263 @@ void DBImpl::MultiGetImpl(
|
|||||||
// version because a flush happening in between may compact away data for
|
// version because a flush happening in between may compact away data for
|
||||||
// the snapshot, but the snapshot is earlier than the data overwriting it,
|
// the snapshot, but the snapshot is earlier than the data overwriting it,
|
||||||
// so users may see wrong results.
|
// so users may see wrong results.
|
||||||
snapshot = last_seq_same_as_publish_seq_
|
*snapshot = last_seq_same_as_publish_seq_
|
||||||
? versions_->LastSequence()
|
? versions_->LastSequence()
|
||||||
: versions_->LastPublishedSequence();
|
: versions_->LastPublishedSequence();
|
||||||
if (callback) {
|
}
|
||||||
|
} else {
|
||||||
|
// If we end up with the same issue of memtable geting sealed during 2
|
||||||
|
// consecutive retries, it means the write rate is very high. In that case
|
||||||
|
// its probably ok to take the mutex on the 3rd try so we can succeed for
|
||||||
|
// sure
|
||||||
|
static const int num_retries = 3;
|
||||||
|
for (int i = 0; i < num_retries; ++i) {
|
||||||
|
last_try = (i == num_retries - 1);
|
||||||
|
bool retry = false;
|
||||||
|
|
||||||
|
if (i > 0) {
|
||||||
|
for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
|
||||||
|
++cf_iter) {
|
||||||
|
auto node = iter_deref_func(cf_iter);
|
||||||
|
SuperVersion* super_version = node->super_version;
|
||||||
|
ColumnFamilyData* cfd = node->cfd;
|
||||||
|
if (super_version != nullptr) {
|
||||||
|
ReturnAndCleanupSuperVersion(cfd, super_version);
|
||||||
|
}
|
||||||
|
node->super_version = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (read_options.snapshot == nullptr) {
|
||||||
|
if (last_try) {
|
||||||
|
TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
|
||||||
|
// We're close to max number of retries. For the last retry,
|
||||||
|
// acquire the lock so we're sure to succeed
|
||||||
|
mutex_.Lock();
|
||||||
|
}
|
||||||
|
*snapshot = last_seq_same_as_publish_seq_
|
||||||
|
? versions_->LastSequence()
|
||||||
|
: versions_->LastPublishedSequence();
|
||||||
|
} else {
|
||||||
|
*snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
|
||||||
|
->number_;
|
||||||
|
}
|
||||||
|
for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
|
||||||
|
++cf_iter) {
|
||||||
|
auto node = iter_deref_func(cf_iter);
|
||||||
|
if (!last_try) {
|
||||||
|
node->super_version = GetAndRefSuperVersion(node->cfd);
|
||||||
|
} else {
|
||||||
|
node->super_version = node->cfd->GetSuperVersion()->Ref();
|
||||||
|
}
|
||||||
|
TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
|
||||||
|
if (read_options.snapshot != nullptr || last_try) {
|
||||||
|
// If user passed a snapshot, then we don't care if a memtable is
|
||||||
|
// sealed or compaction happens because the snapshot would ensure
|
||||||
|
// that older key versions are kept around. If this is the last
|
||||||
|
// retry, then we have the lock so nothing bad can happen
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// We could get the earliest sequence number for the whole list of
|
||||||
|
// memtables, which will include immutable memtables as well, but that
|
||||||
|
// might be tricky to maintain in case we decide, in future, to do
|
||||||
|
// memtable compaction.
|
||||||
|
if (!last_try) {
|
||||||
|
SequenceNumber seq =
|
||||||
|
node->super_version->mem->GetEarliestSequenceNumber();
|
||||||
|
if (seq > *snapshot) {
|
||||||
|
retry = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!retry) {
|
||||||
|
if (last_try) {
|
||||||
|
mutex_.Unlock();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep track of bytes that we read for statistics-recording later
|
||||||
|
PERF_TIMER_STOP(get_snapshot_time);
|
||||||
|
|
||||||
|
return last_try;
|
||||||
|
}
|
||||||
|
|
||||||
|
void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
|
||||||
|
ColumnFamilyHandle** column_families, const Slice* keys,
|
||||||
|
PinnableSlice* values, Status* statuses,
|
||||||
|
const bool sorted_input) {
|
||||||
|
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
|
||||||
|
sorted_keys.resize(num_keys);
|
||||||
|
for (size_t i = 0; i < num_keys; ++i) {
|
||||||
|
key_context.emplace_back(column_families[i], keys[i], &values[i],
|
||||||
|
&statuses[i]);
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < num_keys; ++i) {
|
||||||
|
sorted_keys[i] = &key_context[i];
|
||||||
|
}
|
||||||
|
PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
|
||||||
|
|
||||||
|
autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
|
||||||
|
multiget_cf_data;
|
||||||
|
size_t cf_start = 0;
|
||||||
|
ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
|
||||||
|
for (size_t i = 0; i < num_keys; ++i) {
|
||||||
|
KeyContext* key_ctx = sorted_keys[i];
|
||||||
|
if (key_ctx->column_family != cf) {
|
||||||
|
multiget_cf_data.emplace_back(
|
||||||
|
MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr));
|
||||||
|
cf_start = i;
|
||||||
|
cf = key_ctx->column_family;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
{
|
||||||
|
// multiget_cf_data.emplace_back(
|
||||||
|
// MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr));
|
||||||
|
multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
|
||||||
|
}
|
||||||
|
std::function<MultiGetColumnFamilyData*(
|
||||||
|
autovector<MultiGetColumnFamilyData,
|
||||||
|
MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
|
||||||
|
iter_deref_lambda =
|
||||||
|
[](autovector<MultiGetColumnFamilyData,
|
||||||
|
MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
|
||||||
|
return &(*cf_iter);
|
||||||
|
};
|
||||||
|
|
||||||
|
SequenceNumber consistent_seqnum;
|
||||||
|
bool unref_only = MultiCFSnapshot<
|
||||||
|
autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
|
||||||
|
read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
|
||||||
|
&consistent_seqnum);
|
||||||
|
|
||||||
|
for (auto cf_iter = multiget_cf_data.begin();
|
||||||
|
cf_iter != multiget_cf_data.end(); ++cf_iter) {
|
||||||
|
MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys,
|
||||||
|
cf_iter->super_version, consistent_seqnum, nullptr, nullptr);
|
||||||
|
if (!unref_only) {
|
||||||
|
ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version);
|
||||||
|
} else {
|
||||||
|
cf_iter->cfd->GetSuperVersion()->Unref();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// Order keys by CF ID, followed by key contents
|
||||||
|
struct CompareKeyContext {
|
||||||
|
inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
|
||||||
|
ColumnFamilyHandleImpl* cfh =
|
||||||
|
static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
|
||||||
|
uint32_t cfd_id1 = cfh->cfd()->GetID();
|
||||||
|
const Comparator* comparator = cfh->cfd()->user_comparator();
|
||||||
|
cfh = static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
|
||||||
|
uint32_t cfd_id2 = cfh->cfd()->GetID();
|
||||||
|
|
||||||
|
if (cfd_id1 < cfd_id2) {
|
||||||
|
return true;
|
||||||
|
} else if (cfd_id1 > cfd_id2) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Both keys are from the same column family
|
||||||
|
int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
|
||||||
|
if (cmp < 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
void DBImpl::PrepareMultiGetKeys(
|
||||||
|
size_t num_keys, bool sorted_input,
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
|
||||||
|
#ifndef NDEBUG
|
||||||
|
if (sorted_input) {
|
||||||
|
for (size_t index = 0; index < sorted_keys->size(); ++index) {
|
||||||
|
if (index > 0) {
|
||||||
|
KeyContext* lhs = (*sorted_keys)[index - 1];
|
||||||
|
KeyContext* rhs = (*sorted_keys)[index];
|
||||||
|
ColumnFamilyHandleImpl* cfh =
|
||||||
|
reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
|
||||||
|
uint32_t cfd_id1 = cfh->cfd()->GetID();
|
||||||
|
const Comparator* comparator = cfh->cfd()->user_comparator();
|
||||||
|
cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
|
||||||
|
uint32_t cfd_id2 = cfh->cfd()->GetID();
|
||||||
|
|
||||||
|
assert(cfd_id1 <= cfd_id2);
|
||||||
|
if (cfd_id1 < cfd_id2) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Both keys are from the same column family
|
||||||
|
int cmp = comparator->Compare(*(lhs->key), *(rhs->key));
|
||||||
|
assert(cmp <= 0);
|
||||||
|
}
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (!sorted_input) {
|
||||||
|
CompareKeyContext sort_comparator;
|
||||||
|
std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
|
||||||
|
sort_comparator);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void DBImpl::MultiGet(const ReadOptions& read_options,
|
||||||
|
ColumnFamilyHandle* column_family, const size_t num_keys,
|
||||||
|
const Slice* keys, PinnableSlice* values,
|
||||||
|
Status* statuses, const bool sorted_input) {
|
||||||
|
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
|
||||||
|
sorted_keys.resize(num_keys);
|
||||||
|
for (size_t i = 0; i < num_keys; ++i) {
|
||||||
|
key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < num_keys; ++i) {
|
||||||
|
sorted_keys[i] = &key_context[i];
|
||||||
|
}
|
||||||
|
PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
|
||||||
|
MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DBImpl::MultiGetWithCallback(
|
||||||
|
const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
||||||
|
ReadCallback* callback,
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
|
||||||
|
std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
|
||||||
|
multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
|
||||||
|
std::function<MultiGetColumnFamilyData*(
|
||||||
|
std::array<MultiGetColumnFamilyData, 1>::iterator&)>
|
||||||
|
iter_deref_lambda =
|
||||||
|
[](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
|
||||||
|
return &(*cf_iter);
|
||||||
|
};
|
||||||
|
|
||||||
|
size_t num_keys = sorted_keys->size();
|
||||||
|
SequenceNumber consistent_seqnum;
|
||||||
|
bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
|
||||||
|
read_options, callback, iter_deref_lambda, &multiget_cf_data,
|
||||||
|
&consistent_seqnum);
|
||||||
|
#ifndef NDEBUG
|
||||||
|
assert(!unref_only);
|
||||||
|
#else
|
||||||
|
// Silence unused variable warning
|
||||||
|
(void)unref_only;
|
||||||
|
#endif // NDEBUG
|
||||||
|
|
||||||
|
if (callback && read_options.snapshot == nullptr) {
|
||||||
// The unprep_seqs are not published for write unprepared, so it could be
|
// The unprep_seqs are not published for write unprepared, so it could be
|
||||||
// that max_visible_seq is larger. Seek to the std::max of the two.
|
// that max_visible_seq is larger. Seek to the std::max of the two.
|
||||||
// However, we still want our callback to contain the actual snapshot so
|
// However, we still want our callback to contain the actual snapshot so
|
||||||
// that it can do the correct visibility filtering.
|
// that it can do the correct visibility filtering.
|
||||||
callback->Refresh(snapshot);
|
callback->Refresh(consistent_seqnum);
|
||||||
|
|
||||||
// Internally, WriteUnpreparedTxnReadCallback::Refresh would set
|
// Internally, WriteUnpreparedTxnReadCallback::Refresh would set
|
||||||
// max_visible_seq = max(max_visible_seq, snapshot)
|
// max_visible_seq = max(max_visible_seq, snapshot)
|
||||||
@ -1940,10 +2060,24 @@ void DBImpl::MultiGetImpl(
|
|||||||
// be needed.
|
// be needed.
|
||||||
//
|
//
|
||||||
// assert(callback->max_visible_seq() >= snapshot);
|
// assert(callback->max_visible_seq() >= snapshot);
|
||||||
snapshot = callback->max_visible_seq();
|
consistent_seqnum = callback->max_visible_seq();
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MultiGetImpl(read_options, 0, num_keys, sorted_keys,
|
||||||
|
multiget_cf_data[0].super_version, consistent_seqnum, nullptr,
|
||||||
|
nullptr);
|
||||||
|
ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
|
||||||
|
multiget_cf_data[0].super_version);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DBImpl::MultiGetImpl(
|
||||||
|
const ReadOptions& read_options, size_t start_key, size_t num_keys,
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
|
||||||
|
SuperVersion* super_version, SequenceNumber snapshot,
|
||||||
|
ReadCallback* callback, bool* is_blob_index) {
|
||||||
|
PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_);
|
||||||
|
StopWatch sw(env_, stats_, DB_MULTIGET);
|
||||||
|
|
||||||
// For each of the given keys, apply the entire "get" process as follows:
|
// For each of the given keys, apply the entire "get" process as follows:
|
||||||
// First look in the memtable, then in the immutable memtable (if any).
|
// First look in the memtable, then in the immutable memtable (if any).
|
||||||
// s is both in/out. When in, s could either be OK or MergeInProgress.
|
// s is both in/out. When in, s could either be OK or MergeInProgress.
|
||||||
@ -1953,8 +2087,8 @@ void DBImpl::MultiGetImpl(
|
|||||||
size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
|
size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
|
||||||
? MultiGetContext::MAX_BATCH_SIZE
|
? MultiGetContext::MAX_BATCH_SIZE
|
||||||
: keys_left;
|
: keys_left;
|
||||||
MultiGetContext ctx(&sorted_keys[num_keys - keys_left], batch_size,
|
MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
|
||||||
snapshot);
|
batch_size, snapshot);
|
||||||
MultiGetRange range = ctx.GetMultiGetRange();
|
MultiGetRange range = ctx.GetMultiGetRange();
|
||||||
bool lookup_current = false;
|
bool lookup_current = false;
|
||||||
|
|
||||||
@ -1992,15 +2126,14 @@ void DBImpl::MultiGetImpl(
|
|||||||
PERF_TIMER_GUARD(get_post_process_time);
|
PERF_TIMER_GUARD(get_post_process_time);
|
||||||
size_t num_found = 0;
|
size_t num_found = 0;
|
||||||
uint64_t bytes_read = 0;
|
uint64_t bytes_read = 0;
|
||||||
for (KeyContext& key : key_context) {
|
for (size_t i = start_key; i < start_key + num_keys; ++i) {
|
||||||
if (key.s->ok()) {
|
KeyContext* key = (*sorted_keys)[i];
|
||||||
bytes_read += key.value->size();
|
if (key->s->ok()) {
|
||||||
|
bytes_read += key->value->size();
|
||||||
num_found++;
|
num_found++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ReturnAndCleanupSuperVersion(cfd, super_version);
|
|
||||||
|
|
||||||
RecordTick(stats_, NUMBER_MULTIGET_CALLS);
|
RecordTick(stats_, NUMBER_MULTIGET_CALLS);
|
||||||
RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
|
RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
|
||||||
RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
|
RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
|
||||||
|
@ -199,11 +199,15 @@ class DBImpl : public DB {
|
|||||||
PinnableSlice* values, Status* statuses,
|
PinnableSlice* values, Status* statuses,
|
||||||
const bool sorted_input = false) override;
|
const bool sorted_input = false) override;
|
||||||
|
|
||||||
void MultiGetImpl(
|
virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
|
||||||
|
ColumnFamilyHandle** column_families, const Slice* keys,
|
||||||
|
PinnableSlice* values, Status* statuses,
|
||||||
|
const bool sorted_input = false) override;
|
||||||
|
|
||||||
|
virtual void MultiGetWithCallback(
|
||||||
const ReadOptions& options, ColumnFamilyHandle* column_family,
|
const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||||
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE>& key_context,
|
ReadCallback* callback,
|
||||||
bool sorted_input, ReadCallback* callback = nullptr,
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
|
||||||
bool* is_blob_index = nullptr);
|
|
||||||
|
|
||||||
virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
||||||
const std::string& column_family,
|
const std::string& column_family,
|
||||||
@ -1640,6 +1644,81 @@ class DBImpl : public DB {
|
|||||||
const DBOptions& db_options,
|
const DBOptions& db_options,
|
||||||
const std::vector<ColumnFamilyDescriptor>& column_families);
|
const std::vector<ColumnFamilyDescriptor>& column_families);
|
||||||
|
|
||||||
|
// Utility function to do some debug validation and sort the given vector
|
||||||
|
// of MultiGet keys
|
||||||
|
void PrepareMultiGetKeys(
|
||||||
|
const size_t num_keys, bool sorted,
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
|
||||||
|
|
||||||
|
// A structure to hold the information required to process MultiGet of keys
|
||||||
|
// belonging to one column family. For a multi column family MultiGet, there
|
||||||
|
// will be a container of these objects.
|
||||||
|
struct MultiGetColumnFamilyData {
|
||||||
|
ColumnFamilyHandle* cf;
|
||||||
|
ColumnFamilyData* cfd;
|
||||||
|
|
||||||
|
// For the batched MultiGet which relies on sorted keys, start specifies
|
||||||
|
// the index of first key belonging to this column family in the sorted
|
||||||
|
// list.
|
||||||
|
size_t start;
|
||||||
|
|
||||||
|
// For the batched MultiGet case, num_keys specifies the number of keys
|
||||||
|
// belonging to this column family in the sorted list
|
||||||
|
size_t num_keys;
|
||||||
|
|
||||||
|
// SuperVersion for the column family obtained in a manner that ensures a
|
||||||
|
// consistent view across all column families in the DB
|
||||||
|
SuperVersion* super_version;
|
||||||
|
MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
|
||||||
|
SuperVersion* sv)
|
||||||
|
: cf(column_family),
|
||||||
|
cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
|
||||||
|
start(0),
|
||||||
|
num_keys(0),
|
||||||
|
super_version(sv) {}
|
||||||
|
|
||||||
|
MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
|
||||||
|
size_t count, SuperVersion* sv)
|
||||||
|
: cf(column_family),
|
||||||
|
cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
|
||||||
|
start(first),
|
||||||
|
num_keys(count),
|
||||||
|
super_version(sv) {}
|
||||||
|
|
||||||
|
MultiGetColumnFamilyData() = default;
|
||||||
|
};
|
||||||
|
|
||||||
|
// A common function to obtain a consistent snapshot, which can be implicit
|
||||||
|
// if the user doesn't specify a snapshot in read_options, across
|
||||||
|
// multiple column families for MultiGet. It will attempt to get an implicit
|
||||||
|
// snapshot without acquiring the db_mutes, but will give up after a few
|
||||||
|
// tries and acquire the mutex if a memtable flush happens. The template
|
||||||
|
// allows both the batched and non-batched MultiGet to call this with
|
||||||
|
// either an std::unordered_map or autovector of column families.
|
||||||
|
//
|
||||||
|
// If callback is non-null, the callback is refreshed with the snapshot
|
||||||
|
// sequence number
|
||||||
|
//
|
||||||
|
// A return value of true indicates that the SuperVersions were obtained
|
||||||
|
// from the ColumnFamilyData, whereas false indicates they are thread
|
||||||
|
// local
|
||||||
|
template <class T>
|
||||||
|
bool MultiCFSnapshot(
|
||||||
|
const ReadOptions& read_options, ReadCallback* callback,
|
||||||
|
std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
|
||||||
|
iter_deref_func,
|
||||||
|
T* cf_list, SequenceNumber* snapshot);
|
||||||
|
|
||||||
|
// The actual implementation of the batching MultiGet. The caller is expected
|
||||||
|
// to have acquired the SuperVersion and pass in a snapshot sequence number
|
||||||
|
// in order to construct the LookupKeys. The start_key and num_keys specify
|
||||||
|
// the range of keys in the sorted_keys vector for a single column family.
|
||||||
|
void MultiGetImpl(
|
||||||
|
const ReadOptions& read_options, size_t start_key, size_t num_keys,
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
|
||||||
|
SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback,
|
||||||
|
bool* is_blob_index);
|
||||||
|
|
||||||
// table_cache_ provides its own synchronization
|
// table_cache_ provides its own synchronization
|
||||||
std::shared_ptr<Cache> table_cache_;
|
std::shared_ptr<Cache> table_cache_;
|
||||||
|
|
||||||
|
@ -777,7 +777,8 @@ std::string DBTestBase::Get(int cf, const std::string& k,
|
|||||||
|
|
||||||
std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
|
std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
|
||||||
const std::vector<std::string>& k,
|
const std::vector<std::string>& k,
|
||||||
const Snapshot* snapshot) {
|
const Snapshot* snapshot,
|
||||||
|
const bool batched) {
|
||||||
ReadOptions options;
|
ReadOptions options;
|
||||||
options.verify_checksums = true;
|
options.verify_checksums = true;
|
||||||
options.snapshot = snapshot;
|
options.snapshot = snapshot;
|
||||||
@ -789,7 +790,9 @@ std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
|
|||||||
handles.push_back(handles_[cfs[i]]);
|
handles.push_back(handles_[cfs[i]]);
|
||||||
keys.push_back(k[i]);
|
keys.push_back(k[i]);
|
||||||
}
|
}
|
||||||
std::vector<Status> s = db_->MultiGet(options, handles, keys, &result);
|
std::vector<Status> s;
|
||||||
|
if (!batched) {
|
||||||
|
s = db_->MultiGet(options, handles, keys, &result);
|
||||||
for (unsigned int i = 0; i < s.size(); ++i) {
|
for (unsigned int i = 0; i < s.size(); ++i) {
|
||||||
if (s[i].IsNotFound()) {
|
if (s[i].IsNotFound()) {
|
||||||
result[i] = "NOT_FOUND";
|
result[i] = "NOT_FOUND";
|
||||||
@ -797,6 +800,22 @@ std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
|
|||||||
result[i] = s[i].ToString();
|
result[i] = s[i].ToString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
std::vector<PinnableSlice> pin_values(cfs.size());
|
||||||
|
result.resize(cfs.size());
|
||||||
|
s.resize(cfs.size());
|
||||||
|
db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
|
||||||
|
pin_values.data(), s.data());
|
||||||
|
for (unsigned int i = 0; i < s.size(); ++i) {
|
||||||
|
if (s[i].IsNotFound()) {
|
||||||
|
result[i] = "NOT_FOUND";
|
||||||
|
} else if (!s[i].ok()) {
|
||||||
|
result[i] = s[i].ToString();
|
||||||
|
} else {
|
||||||
|
result[i].assign(pin_values[i].data(), pin_values[i].size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -850,7 +850,8 @@ class DBTestBase : public testing::Test {
|
|||||||
|
|
||||||
std::vector<std::string> MultiGet(std::vector<int> cfs,
|
std::vector<std::string> MultiGet(std::vector<int> cfs,
|
||||||
const std::vector<std::string>& k,
|
const std::vector<std::string>& k,
|
||||||
const Snapshot* snapshot = nullptr);
|
const Snapshot* snapshot,
|
||||||
|
const bool batched);
|
||||||
|
|
||||||
std::vector<std::string> MultiGet(const std::vector<std::string>& k,
|
std::vector<std::string> MultiGet(const std::vector<std::string>& k,
|
||||||
const Snapshot* snapshot = nullptr);
|
const Snapshot* snapshot = nullptr);
|
||||||
|
@ -490,6 +490,47 @@ class DB {
|
|||||||
values++;
|
values++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Overloaded MultiGet API that improves performance by batching operations
|
||||||
|
// in the read path for greater efficiency. Currently, only the block based
|
||||||
|
// table format with full filters are supported. Other table formats such
|
||||||
|
// as plain table, block based table with block based filters and
|
||||||
|
// partitioned indexes will still work, but will not get any performance
|
||||||
|
// benefits.
|
||||||
|
// Parameters -
|
||||||
|
// options - ReadOptions
|
||||||
|
// column_family - ColumnFamilyHandle* that the keys belong to. All the keys
|
||||||
|
// passed to the API are restricted to a single column family
|
||||||
|
// num_keys - Number of keys to lookup
|
||||||
|
// keys - Pointer to C style array of key Slices with num_keys elements
|
||||||
|
// values - Pointer to C style array of PinnableSlices with num_keys elements
|
||||||
|
// statuses - Pointer to C style array of Status with num_keys elements
|
||||||
|
// sorted_input - If true, it means the input keys are already sorted by key
|
||||||
|
// order, so the MultiGet() API doesn't have to sort them
|
||||||
|
// again. If false, the keys will be copied and sorted
|
||||||
|
// internally by the API - the input array will not be
|
||||||
|
// modified
|
||||||
|
virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
|
||||||
|
ColumnFamilyHandle** column_families, const Slice* keys,
|
||||||
|
PinnableSlice* values, Status* statuses,
|
||||||
|
const bool /*sorted_input*/ = false) {
|
||||||
|
std::vector<ColumnFamilyHandle*> cf;
|
||||||
|
std::vector<Slice> user_keys;
|
||||||
|
std::vector<Status> status;
|
||||||
|
std::vector<std::string> vals;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < num_keys; ++i) {
|
||||||
|
cf.emplace_back(column_families[i]);
|
||||||
|
user_keys.emplace_back(keys[i]);
|
||||||
|
}
|
||||||
|
status = MultiGet(options, cf, user_keys, &vals);
|
||||||
|
std::copy(status.begin(), status.end(), statuses);
|
||||||
|
for (auto& value : vals) {
|
||||||
|
values->PinSelf(value);
|
||||||
|
values++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// If the key definitely does not exist in the database, then this method
|
// If the key definitely does not exist in the database, then this method
|
||||||
// returns false, else true. If the caller wants to obtain value when the key
|
// returns false, else true. If the caller wants to obtain value when the key
|
||||||
// is found in memory, a bool for 'value_found' must be passed. 'value_found'
|
// is found in memory, a bool for 'value_found' must be passed. 'value_found'
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <array>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "db/lookup_key.h"
|
#include "db/lookup_key.h"
|
||||||
#include "db/merge_context.h"
|
#include "db/merge_context.h"
|
||||||
@ -21,6 +22,7 @@ struct KeyContext {
|
|||||||
LookupKey* lkey;
|
LookupKey* lkey;
|
||||||
Slice ukey;
|
Slice ukey;
|
||||||
Slice ikey;
|
Slice ikey;
|
||||||
|
ColumnFamilyHandle* column_family;
|
||||||
Status* s;
|
Status* s;
|
||||||
MergeContext merge_context;
|
MergeContext merge_context;
|
||||||
SequenceNumber max_covering_tombstone_seq;
|
SequenceNumber max_covering_tombstone_seq;
|
||||||
@ -29,9 +31,11 @@ struct KeyContext {
|
|||||||
PinnableSlice* value;
|
PinnableSlice* value;
|
||||||
GetContext* get_context;
|
GetContext* get_context;
|
||||||
|
|
||||||
KeyContext(const Slice& user_key, PinnableSlice* val, Status* stat)
|
KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key,
|
||||||
|
PinnableSlice* val, Status* stat)
|
||||||
: key(&user_key),
|
: key(&user_key),
|
||||||
lkey(nullptr),
|
lkey(nullptr),
|
||||||
|
column_family(col_family),
|
||||||
s(stat),
|
s(stat),
|
||||||
max_covering_tombstone_seq(0),
|
max_covering_tombstone_seq(0),
|
||||||
key_exists(false),
|
key_exists(false),
|
||||||
@ -85,10 +89,9 @@ class MultiGetContext {
|
|||||||
// htat need to be performed
|
// htat need to be performed
|
||||||
static const int MAX_BATCH_SIZE = 32;
|
static const int MAX_BATCH_SIZE = 32;
|
||||||
|
|
||||||
MultiGetContext(KeyContext** sorted_keys, size_t num_keys,
|
MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
|
||||||
SequenceNumber snapshot)
|
size_t begin, size_t num_keys, SequenceNumber snapshot)
|
||||||
: sorted_keys_(sorted_keys),
|
: num_keys_(num_keys),
|
||||||
num_keys_(num_keys),
|
|
||||||
value_mask_(0),
|
value_mask_(0),
|
||||||
lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf)) {
|
lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf)) {
|
||||||
int index = 0;
|
int index = 0;
|
||||||
@ -100,6 +103,8 @@ class MultiGetContext {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (size_t iter = 0; iter != num_keys_; ++iter) {
|
for (size_t iter = 0; iter != num_keys_; ++iter) {
|
||||||
|
// autovector may not be contiguous storage, so make a copy
|
||||||
|
sorted_keys_[iter] = (*sorted_keys)[begin + iter];
|
||||||
sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[index])
|
sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[index])
|
||||||
LookupKey(*sorted_keys_[iter]->key, snapshot);
|
LookupKey(*sorted_keys_[iter]->key, snapshot);
|
||||||
sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key();
|
sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key();
|
||||||
@ -118,7 +123,7 @@ class MultiGetContext {
|
|||||||
static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
|
static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
|
||||||
alignas(alignof(LookupKey))
|
alignas(alignof(LookupKey))
|
||||||
char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK];
|
char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK];
|
||||||
KeyContext** sorted_keys_;
|
std::array<KeyContext*, MAX_BATCH_SIZE> sorted_keys_;
|
||||||
size_t num_keys_;
|
size_t num_keys_;
|
||||||
uint64_t value_mask_;
|
uint64_t value_mask_;
|
||||||
std::unique_ptr<char[]> lookup_key_heap_buf;
|
std::unique_ptr<char[]> lookup_key_heap_buf;
|
||||||
|
@ -963,6 +963,7 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB(
|
|||||||
->immutable_db_options();
|
->immutable_db_options();
|
||||||
|
|
||||||
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
||||||
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
|
||||||
// To hold merges from the write batch
|
// To hold merges from the write batch
|
||||||
autovector<std::pair<WriteBatchWithIndexInternal::Result, MergeContext>,
|
autovector<std::pair<WriteBatchWithIndexInternal::Result, MergeContext>,
|
||||||
MultiGetContext::MAX_BATCH_SIZE>
|
MultiGetContext::MAX_BATCH_SIZE>
|
||||||
@ -1002,14 +1003,17 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB(
|
|||||||
|
|
||||||
assert(result == WriteBatchWithIndexInternal::Result::kMergeInProgress ||
|
assert(result == WriteBatchWithIndexInternal::Result::kMergeInProgress ||
|
||||||
result == WriteBatchWithIndexInternal::Result::kNotFound);
|
result == WriteBatchWithIndexInternal::Result::kNotFound);
|
||||||
key_context.emplace_back(keys[i], &values[i], &statuses[i]);
|
key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]);
|
||||||
|
sorted_keys.emplace_back(&key_context.back());
|
||||||
merges.emplace_back(result, std::move(merge_context));
|
merges.emplace_back(result, std::move(merge_context));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Did not find key in batch OR could not resolve Merges. Try DB.
|
// Did not find key in batch OR could not resolve Merges. Try DB.
|
||||||
static_cast_with_check<DBImpl, DB>(db->GetRootDB())
|
static_cast_with_check<DBImpl, DB>(db->GetRootDB())
|
||||||
->MultiGetImpl(read_options, column_family, key_context, sorted_input,
|
->PrepareMultiGetKeys(key_context.size(), sorted_input, &sorted_keys);
|
||||||
callback);
|
static_cast_with_check<DBImpl, DB>(db->GetRootDB())
|
||||||
|
->MultiGetWithCallback(read_options, column_family, callback,
|
||||||
|
&sorted_keys);
|
||||||
|
|
||||||
ColumnFamilyHandleImpl* cfh =
|
ColumnFamilyHandleImpl* cfh =
|
||||||
reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||||
|
Loading…
Reference in New Issue
Block a user