Blob DB: avoid having a separate read of checksum
Summary: Previously on a blob db read, we are making a read of the blob value, and then make another read to get CRC checksum. I'm combining the two read into one. readrandom db_bench with 1G database with base db size of 13M, value size 1k: `./db_bench --db=/home/yiwu/tmp/db_bench --use_blob_db --value_size=1024 --num=1000000 --benchmarks=readrandom --use_existing_db --cache_size=32000000` master: throughput 234MB/s, get micros p50 5.984 p95 9.998 p99 20.817 p100 787 this PR: throughput 261MB/s, get micros p50 5.157 p95 9.928 p99 20.724 p100 190 Closes https://github.com/facebook/rocksdb/pull/3301 Differential Revision: D6615950 Pulled By: yiwu-arbug fbshipit-source-id: 052410c6d8539ec0cc305d53793bbc8f3616baa3
This commit is contained in:
parent
af9746fd22
commit
6d4ef9e839
@ -1055,58 +1055,58 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
std::shared_ptr<RandomAccessFileReader> reader =
|
||||
GetOrOpenRandomAccessReader(bfile, env_, env_options_);
|
||||
|
||||
std::string* valueptr = value->GetSelf();
|
||||
std::string value_c;
|
||||
if (bdb_options_.compression != kNoCompression) {
|
||||
valueptr = &value_c;
|
||||
}
|
||||
assert(blob_index.offset() > key.size() + sizeof(uint32_t));
|
||||
uint64_t record_offset = blob_index.offset() - key.size() - sizeof(uint32_t);
|
||||
uint64_t record_size = sizeof(uint32_t) + key.size() + blob_index.size();
|
||||
|
||||
// Allocate the buffer. This is safe in C++11
|
||||
// Note that std::string::reserved() does not work, since previous value
|
||||
// of the buffer can be larger than blob_index.size().
|
||||
valueptr->resize(blob_index.size());
|
||||
char* buffer = &(*valueptr)[0];
|
||||
std::string buffer_str(record_size, static_cast<char>(0));
|
||||
char* buffer = &buffer_str[0];
|
||||
|
||||
Slice blob_value;
|
||||
// A partial blob record contain checksum, key and value.
|
||||
Slice blob_record;
|
||||
{
|
||||
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
|
||||
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value,
|
||||
buffer);
|
||||
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size());
|
||||
s = reader->Read(record_offset, record_size, &blob_record, buffer);
|
||||
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size());
|
||||
}
|
||||
if (!s.ok() || blob_value.size() != blob_index.size()) {
|
||||
if (debug_level_ >= 2) {
|
||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||
"Failed to read blob from file: %s blob_offset: %" PRIu64
|
||||
" blob_size: %" PRIu64 " read: %d key: %s status: '%s'",
|
||||
bfile->PathName().c_str(), blob_index.offset(),
|
||||
blob_index.size(), static_cast<int>(blob_value.size()),
|
||||
key.data(), s.ToString().c_str());
|
||||
}
|
||||
return Status::NotFound("Blob Not Found as couldnt retrieve Blob");
|
||||
if (!s.ok()) {
|
||||
ROCKS_LOG_DEBUG(db_options_.info_log,
|
||||
"Failed to read blob from blob file %" PRIu64
|
||||
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
|
||||
", key_size: " PRIu64 ", read " PRIu64
|
||||
"bytes, status: '%s'",
|
||||
bfile->BlobFileNumber(), blob_index.offset(),
|
||||
blob_index.size(), key.size(), s.ToString().c_str());
|
||||
return s;
|
||||
}
|
||||
if (blob_record.size() != record_size) {
|
||||
ROCKS_LOG_DEBUG(db_options_.info_log,
|
||||
"Failed to read blob from blob file %" PRIu64
|
||||
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
|
||||
", key_size: " PRIu64 ", read " PRIu64
|
||||
"bytes, status: '%s'",
|
||||
bfile->BlobFileNumber(), blob_index.offset(),
|
||||
blob_index.size(), key.size(), s.ToString().c_str());
|
||||
|
||||
// TODO(yiwu): Add an option to skip crc checking.
|
||||
Slice crc_slice;
|
||||
return Status::Corruption("Failed to retrieve blob from blob index.");
|
||||
}
|
||||
Slice crc_slice(blob_record.data(), sizeof(uint32_t));
|
||||
Slice blob_value(blob_record.data() + sizeof(uint32_t) + key.size(),
|
||||
blob_index.size());
|
||||
uint32_t crc_exp;
|
||||
std::string crc_str;
|
||||
crc_str.resize(sizeof(uint32_t));
|
||||
char* crc_buffer = &(crc_str[0]);
|
||||
s = reader->Read(blob_index.offset() - (key.size() + sizeof(uint32_t)),
|
||||
sizeof(uint32_t), &crc_slice, crc_buffer);
|
||||
if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) {
|
||||
if (debug_level_ >= 2) {
|
||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||
"Failed to fetch blob crc file: %s blob_offset: %" PRIu64
|
||||
" blob_size: %" PRIu64 " key: %s status: '%s'",
|
||||
bfile->PathName().c_str(), blob_index.offset(),
|
||||
blob_index.size(), key.data(), s.ToString().c_str());
|
||||
}
|
||||
return Status::NotFound("Blob Not Found as couldnt retrieve CRC");
|
||||
if (!GetFixed32(&crc_slice, &crc_exp)) {
|
||||
ROCKS_LOG_DEBUG(db_options_.info_log,
|
||||
"Unable to decode CRC from blob file %" PRIu64
|
||||
", blob_offset: %" PRIu64 ", blob_size: %" PRIu64
|
||||
", key size: %" PRIu64 ", status: '%s'",
|
||||
bfile->BlobFileNumber(), blob_index.offset(),
|
||||
blob_index.size(), key.size(), s.ToString().c_str());
|
||||
return Status::Corruption("Unable to decode checksum.");
|
||||
}
|
||||
|
||||
uint32_t crc = crc32c::Value(key.data(), key.size());
|
||||
crc = crc32c::Extend(crc, blob_value.data(), blob_value.size());
|
||||
uint32_t crc = crc32c::Value(blob_record.data() + sizeof(uint32_t),
|
||||
blob_record.size() - sizeof(uint32_t));
|
||||
crc = crc32c::Mask(crc); // Adjust for storage
|
||||
if (crc != crc_exp) {
|
||||
if (debug_level_ >= 2) {
|
||||
@ -1119,7 +1119,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
return Status::Corruption("Corruption. Blob CRC mismatch");
|
||||
}
|
||||
|
||||
if (bfile->compression() != kNoCompression) {
|
||||
if (bfile->compression() == kNoCompression) {
|
||||
value->PinSelf(blob_value);
|
||||
} else {
|
||||
BlockContents contents;
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
|
||||
{
|
||||
@ -1130,11 +1132,9 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
kBlockBasedTableVersionFormat, Slice(), bfile->compression(),
|
||||
*(cfh->cfd()->ioptions()));
|
||||
}
|
||||
*(value->GetSelf()) = contents.data.ToString();
|
||||
value->PinSelf(contents.data);
|
||||
}
|
||||
|
||||
value->PinSelf();
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user